From fd35dbc51b63fa97ff5a9d7a823cdfa271b99a43 Mon Sep 17 00:00:00 2001
From: Richard Thier
Date: Wed, 15 Dec 2021 14:48:14 +0100
Subject: [PATCH] vsort version that got slower, but is really funny template code

---
Note: this copy was rebuilt from a whitespace-collapsed extract. The template
argument lists (`<int VI>`, `<VI - 1>`, `<VI>`, `<uint32_t>`) and all
indentation were restored by inference, and two comment typos were corrected;
verify against the original commit before applying.

 ypsu.cpp | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 51 insertions(+), 6 deletions(-)

diff --git a/ypsu.cpp b/ypsu.cpp
index 76c81e8..978cc39 100644
--- a/ypsu.cpp
+++ b/ypsu.cpp
@@ -183,14 +183,59 @@
   for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i];
   free(buf);
 }
+
+template<int VI>
+struct VecInitMagic : public VecInitMagic<VI - 1> {
+    inline static thread_local std::vector<uint32_t> v {256}; // like a static v.reserve call because: *
+    inline __attribute__((always_inline)) VecInitMagic() noexcept
+        : VecInitMagic<VI - 1>() {
+        v.clear(); // * - but also needed for subsequent calls
+    }
+};
+
+template<>
+struct VecInitMagic<-1> {};
+
+    /** Recursive Functor: no class should be generated I think (compiler should be smart) */
+    template<int VI>
+    struct VecAccMagic : public VecAccMagic<VI - 1> {
+        inline __attribute__((always_inline)) VecAccMagic(int i) noexcept
+            : VecAccMagic<VI - 1>(i) {
+            if(i != VI) {
+                // Needed otherwise bunch
+                // of branch mispredicts can
+                // happen because this should
+                // be the common case, not the
+                // one when we find the vector!
+                return;
+            } else {
+                this->foundVec = &(VecInitMagic<VI>::v);
+            }
+        }
+    };
+    /** Ends template recursion */
+    template<>
+    struct VecAccMagic<-1> {
+        static thread_local std::vector<uint32_t> NotFound;
+        std::vector<uint32_t> *foundVec;
+        inline VecAccMagic(int i) noexcept: foundVec() {}
+    };
+
 void vsort(uint32_t *a, int n) {
-  thread_local std::vector<uint32_t> bts[256];
+  static thread_local VecInitMagic<255> bts;
   for (int shift = 0; shift < 32; shift += 8) {
-    for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]);
-    for (int bt = 0, k = 0; bt < 256; bt++) {
-      memcpy(a + k, &bts[bt][0], bts[bt].size() * sizeof(a[0]));
-      k += bts[bt].size();
-      bts[bt].clear();
+    for (int i = 0; i < n; i++) {
+      VecAccMagic<255> vba(a[i] >> shift & 0xff);
+      auto &bt = vba.foundVec;
+      bt->push_back(a[i]);
+    }
+
+    for (int bti = 0, k = 0; bti < 256; bti++) {
+      VecAccMagic<255> vba(bti);
+      auto &bt = vba.foundVec;
+      memcpy(a + k, &((*bt)[0]), bt->size() * sizeof(a[0]));
+      k += bt->size();
+      bt->clear();
     }
   }
 }