// Post-image of the ypsu.cpp hunk (978cc39..76c81e8): the VecInitMagic /
// VecAccMagic template recursion that used to select a bucket is replaced by
// a plain thread_local array of 256 bucket vectors indexed by the byte value.

/**
 * Sorts a[0..n) in ascending order with a least-significant-byte-first radix
 * sort: four passes, one per byte of the 32-bit key.
 *
 * Each pass appends every element to the bucket selected by the current byte
 * and then copies the buckets back into `a` in bucket order. Elements keep
 * their relative order inside a bucket, so each pass is stable and the array
 * is fully sorted after the most significant byte has been processed.
 *
 * @param a  array to sort in place
 * @param n  number of elements in `a`; values <= 1 leave `a` untouched
 */
void vsort(uint32_t *a, int n) {
  if (a == nullptr || n <= 1) return;  // nothing to reorder

  // One bucket per possible byte value. Being thread_local, the vectors
  // persist between calls on the same thread; they are emptied with clear()
  // after every pass rather than being rebuilt.
  thread_local std::vector<uint32_t> bts[256];

  for (int shift = 0; shift < 32; shift += 8) {
    // Scatter: distribute the elements by the byte currently being examined.
    for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]);

    // Gather: write the buckets back to `a` in increasing byte order.
    size_t k = 0;
    for (auto &bucket : bts) {
      // Skip empty buckets: indexing element 0 of an empty vector is
      // undefined, and data() may be null, which memcpy does not accept even
      // for a zero-length copy.
      if (!bucket.empty()) {
        memcpy(a + k, bucket.data(), bucket.size() * sizeof(a[0]));
        k += bucket.size();
      }
      bucket.clear();
    }
  }
}