vsort version that got slower, but is really funny template code

This commit is contained in:
Richard Thier 2021-12-15 14:48:14 +01:00
parent bff96c8f7f
commit fd35dbc51b

View File

@ -183,14 +183,59 @@
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i];
free(buf);
}
/**
 * Compile-time chain of per-digit bucket vectors.
 *
 * VecInitMagic<DIGIT> derives from VecInitMagic<DIGIT - 1>, so instantiating
 * VecInitMagic<N> yields one thread_local vector `v` for every value in 0..N.
 * Constructing an object runs the whole constructor chain and empties every
 * one of those vectors.
 */
template<int DIGIT>
struct VecInitMagic : public VecInitMagic<DIGIT - 1> {
    // Starts out empty with room for 256 elements. Note that brace-initialising
    // with `{256}` would select the initializer_list constructor and produce a
    // single element of value 256 instead of reserving storage.
    inline static thread_local std::vector<uint32_t> v = [] {
        std::vector<uint32_t> bucket;
        bucket.reserve(256);
        return bucket;
    }();

    /** Empties this level's bucket after the lower levels have been emptied. */
    inline __attribute__((always_inline)) VecInitMagic() noexcept
        : VecInitMagic<DIGIT - 1>() {
        v.clear(); // keeps the capacity, drops any leftover elements
    }
};

/** Ends the template recursion. */
template<>
struct VecInitMagic<-1> {};
/**
 * Recursive functor that maps a runtime index onto the matching
 * VecInitMagic<VI>::v bucket.
 *
 * Constructing VecAccMagic<N>(i) runs the base constructors first: the <-1>
 * terminator nulls `foundVec`, then every level from 0 up to N compares `i`
 * with its own VI, and the level that matches stores the address of its
 * bucket. The author's expectation is that the compiler generates no real
 * class for this.
 */
template<int VI>
struct VecAccMagic : public VecAccMagic<VI - 1> {
    inline __attribute__((always_inline)) VecAccMagic(int i) noexcept
        : VecAccMagic<VI - 1>(i) {
        // The mismatch is tested first on purpose: it is the common case, and
        // the original author reports a bunch of branch mispredicts otherwise.
        if (i != VI) return;
        this->foundVec = &VecInitMagic<VI>::v;
    }
};

/** Ends the template recursion and owns the lookup result. */
template<>
struct VecAccMagic<-1> {
    // NOTE(review): declared only; no definition or use is visible in this chunk.
    static thread_local std::vector<uint32_t> NotFound;

    // Bucket selected by the constructor chain; stays null when no level matched.
    std::vector<uint32_t> *foundVec;

    inline VecAccMagic(int /*i*/) noexcept : foundVec(nullptr) {}
};
/**
 * Sorts a[0..n) in ascending order with a least-significant-digit radix sort:
 * four passes, one per 8-bit digit.
 *
 * In each pass every element is appended to the bucket selected by its current
 * digit (looked up through VecAccMagic), then the buckets are copied back into
 * `a` in digit order and emptied for the next pass.
 *
 * @param a  array to sort in place
 * @param n  number of elements in `a`; nothing happens when n <= 0
 */
void vsort(uint32_t *a, int n) {
    // Being a thread_local local, this is constructed once per thread; its
    // constructor chain empties all 256 buckets. Later calls rely on the
    // clear() at the end of every pass to find the buckets empty again.
    static thread_local VecInitMagic<255> bts;
    for (int shift = 0; shift < 32; shift += 8) {
        // Scatter: distribute the elements by the digit at `shift`.
        for (int i = 0; i < n; i++) {
            VecAccMagic<255> vba(a[i] >> shift & 0xff);
            vba.foundVec->push_back(a[i]);
        }
        // Gather: concatenate the buckets back into `a`, lowest digit first.
        for (int bti = 0, k = 0; bti < 256; bti++) {
            VecAccMagic<255> vba(bti);
            std::vector<uint32_t> *bt = vba.foundVec;
            // Skip empty buckets so an empty vector is never indexed and
            // memcpy never receives a possibly-null source pointer.
            if (bt->empty()) continue;
            memcpy(a + k, bt->data(), bt->size() * sizeof(a[0]));
            k += static_cast<int>(bt->size());
            bt->clear();
        }
    }
}