vsort version that got slower, but is really funny template code

2021-12-15 14:48:14 +01:00 · 2021-12-15 14:48:14 +01:00 · fd35dbc51b
commit fd35dbc51b
parent bff96c8f7f
1 changed files with 51 additions and 6 deletions
--- a/ypsu.cpp
+++ b/ypsu.cpp
@ -183,14 +183,59 @@
    for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i];
    free(buf);
  }
+
+/**
+ * Per-thread bucket storage, one vector per template argument.
+ *
+ * VecInitMagic<DIGIT> owns the static thread_local vector `v` for DIGIT and
+ * derives from VecInitMagic<DIGIT - 1>, so constructing one object also runs
+ * the constructors of every lower-numbered base in the chain.
+ */
+template<int DIGIT>
+struct VecInitMagic : public VecInitMagic<DIGIT - 1> {
+	// Default-constructed on purpose: a braced `{256}` initializer picks the
+	// initializer_list constructor and yields ONE element with value 256,
+	// not room for 256 elements. Capacity is requested in the constructor.
+	inline static thread_local std::vector<uint32_t> v;
+	inline __attribute__((always_inline)) VecInitMagic() noexcept
+			: VecInitMagic<DIGIT -1 >() {
+		v.clear(); // start from an empty bucket, also on any later construction
+		// Pre-size the bucket; a no-op once capacity is already >= 256.
+		// An allocation failure here terminates, because of noexcept.
+		v.reserve(256);
+	}
+};
+
+/** Explicit specialization for -1: an empty base without a vector, which ends the VecInitMagic<DIGIT - 1> inheritance recursion. */
+template<>
+struct VecInitMagic<-1> {};
+
+	/**
+	 * Recursive lookup functor mapping a runtime index to VecInitMagic<VI>::v.
+	 *
+	 * Constructing VecAccMagic<VI>(i) first constructs the VecAccMagic<VI - 1>
+	 * base with the same i; the level whose VI equals i then stores the address
+	 * of its vector in the inherited `foundVec` member.
+	 * The author expects the compiler to flatten the chain so that no real
+	 * class hierarchy remains in the generated code (not verified here).
+	 */
+	template<int VI>
+	struct VecAccMagic : public VecAccMagic<VI - 1> {
+		inline __attribute__((always_inline)) VecAccMagic(int i) noexcept
+				: VecAccMagic<VI - 1>(i) {
+			// Test for the mismatch first: it is the common case at almost
+			// every level, and the author found that treating the match as
+			// the common case caused a bunch of branch mispredicts.
+			if (i != VI) return;
+			this->foundVec = &(VecInitMagic<VI>::v);
+		}
+	};
+	/** Base case of the VecAccMagic recursion; it also holds the lookup result. */
+	template<>
+	struct VecAccMagic<-1> {
+		// Declared only; the visible code neither defines nor uses it.
+		static thread_local std::vector<uint32_t> NotFound;
+		// Set by the derived level whose index matched; null if none did.
+		std::vector<uint32_t> *foundVec;
+		inline VecAccMagic(int /* index is not needed at this level */) noexcept
+				: foundVec(nullptr) {}
+	};
+
  void vsort(uint32_t *a, int n) {
-    thread_local std::vector<uint32_t> bts[256];
+    // Sorts a[0..n) in ascending order with a least-significant-byte-first
+    // radix sort: four passes, one per byte, each scattering the values into
+    // 256 per-thread buckets and then copying the buckets back in index order.
+    //
+    // Constructing bts (once per thread) runs the VecInitMagic constructor
+    // chain; the buckets themselves are reached through VecAccMagic below.
+    static thread_local VecInitMagic<255> bts;
    for (int shift = 0; shift < 32; shift += 8) {
-      for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]);
-      for (int bt = 0, k = 0; bt < 256; bt++) {
-        memcpy(a + k, &bts[bt][0], bts[bt].size() * sizeof(a[0]));
-        k += bts[bt].size();
-        bts[bt].clear();
+      // Scatter: append each value to the bucket selected by the current byte.
+      for (int i = 0; i < n; i++) {
+            VecAccMagic<255> vba(a[i] >> shift & 0xff);
+            std::vector<uint32_t> *bt = vba.foundVec;
+            bt->push_back(a[i]);
+      }
+
+      // Gather: copy the buckets back into a in bucket order, emptying them.
+      for (int bti = 0, k = 0; bti < 256; bti++) {
+        VecAccMagic<255> vba(bti);
+        std::vector<uint32_t> *bt = vba.foundVec;
+        // Skip empty buckets: indexing element 0 of an empty vector is
+        // undefined behaviour, and there is nothing to copy anyway.
+        if (bt->empty()) continue;
+        memcpy(a + k, bt->data(), bt->size() * sizeof(a[0]));
+        k += bt->size();
+        bt->clear();
      }
    }
  }