From 08cb90bb1b982fb95cd0196ac1f77df060f4ed68 Mon Sep 17 00:00:00 2001
From: Richard Thier <magosit@outlook.hu>
Date: Tue, 30 Sep 2025 22:18:10 +0200
Subject: [PATCH] Revert "prepared for flame graph analysis"

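This reverts commit ac873f7123c0dd23ff9d73668e005c71944a8afa.

Summary of what is undone:
- Drop the FlameGraph submodule and its .gitmodules entry.
- Remove the release_ypsu_noinline_debug_sym makefile target.
- thiersort3.h: element count, bucket arrays and loop indices go back
  from uint32_t to int.
- threepass_xbit.h: the copy_radics_tpxp and count_occurences_tpxp
  helpers are folded back inline into threepass_xb, whose count and
  shift/mask constants go back to int.
- ypsu.cpp: measure_single benchmarks "threep" again instead of "thier3",
  and the early measure_single(n)/return in main is commented out again.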
---
 .gitmodules      |   3 --
 FlameGraph       |   1 -
 makefile         |   3 +-
 thiersort3.h     |  24 ++++++------
 threepass_xbit.h | 100 ++++++++++++++++++++++++++---------------------
 ypsu.cpp         |   7 ++--
 6 files changed, 72 insertions(+), 66 deletions(-)
 delete mode 160000 FlameGraph

diff --git a/.gitmodules b/.gitmodules
index e6b053a..95d1c10 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,3 @@
 [submodule "vergesort"]
 	path = vergesort
 	url = https://github.com/Morwenn/vergesort
-[submodule "FlameGraph"]
-	path = FlameGraph
-	url = https://github.com/brendangregg/FlameGraph
diff --git a/FlameGraph b/FlameGraph
deleted file mode 160000
index 41fee1f..0000000
--- a/FlameGraph
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 41fee1f99f9276008b7cd112fca19dc3ea84ac32
diff --git a/makefile b/makefile
index 2ceaada..7ac27c0 100644
--- a/makefile
+++ b/makefile
@@ -24,8 +24,7 @@ release_ypsu_assert: ypsu.cpp magyarsort.h
 
 release_ypsu_debug_sym: ypsu.cpp magyarsort.h
 	g++ ypsu.cpp -g -std=c++17 -O2 -o ypsu.out
-release_ypsu_noinline_debug_sym: ypsu.cpp magyarsort.h thiersort3.h
-	g++ ypsu.cpp -g -std=c++17 -O2 -fno-inline -fno-inline-functions -fno-inline-functions-called-once -fno-inline-functions-called-once -fno-inline-small-functions -fno-ipa-cp -fno-ipa-sra -fno-early-inlining -fno-omit-frame-pointer -fno-optimize-sibling-calls -o ypsu.out
+
 release3: test.cpp magyarsort.h
 	g++ test.cpp -DNDEBUG -std=c++17 -O3 -o test.out
 
diff --git a/thiersort3.h b/thiersort3.h
index 546f2e4..ecd039e 100644
--- a/thiersort3.h
+++ b/thiersort3.h
@@ -41,9 +41,9 @@ static inline uint32_t witch_bucket3(uint32_t key) {
  * @param n Number of elements in arr and temparr
  * @param rstate Create with sch_rand_state rstate = schwab_rand_state(junk_uint32_t);
  */
-static inline void thiersort3(uint32_t *arr, uint32_t *temparr, uint32_t n) {
-	uint32_t bucket[4096]; /* Inclusive */
-	uint32_t bucket_end[4096]; /* Not inclusive */
+static inline void thiersort3(uint32_t *arr, uint32_t *temparr, int n) {
+	int bucket[4096]; /* Inclusive */
+	int bucket_end[4096]; /* Not inclusive */
 
 	/* Check if need to sort at all - needed for invariants later */
 	if(n < 2) {
@@ -52,25 +52,25 @@ static inline void thiersort3(uint32_t *arr, uint32_t *temparr, uint32_t n) {
 
 	/* Count */
 	#pragma GCC unroll 64
-	for(uint32_t i = 0; i < 4096; ++i) {
+	for(int i = 0; i < 4096; ++i) {
 		bucket[i] = 0;
 	}
 	#pragma GCC unroll 64
-	for(uint32_t i = 0; i < n; ++i) {
+	for(int i = 0; i < n; ++i) {
 		++bucket[witch_bucket3(arr[i])];
 	}
 
 	/* Prefix sum (like in Magyarsort) */
 	uint32_t prev = 0;
 	#pragma GCC unroll 4
-	for (uint32_t i = 0; i < 4096; i++) {
+	for (int i = 0; i < 4096; i++) {
 		bucket[i] += prev;
 		prev = bucket[i];
 	}
 
 	/* Save end-offsets */
 	#pragma GCC unroll 64
-	for(uint32_t i = 0; i < 4096; ++i) {
+	for(int i = 0; i < 4096; ++i) {
 		bucket_end[i] = bucket[i];
 	}
 
@@ -78,18 +78,18 @@ static inline void thiersort3(uint32_t *arr, uint32_t *temparr, uint32_t n) {
 	/* Move to the buckets */
 	/* Rem.: This also changes bucket[i] so they will point to bucket beginnings */
 	#pragma GCC unroll 64
-	for(uint32_t i = 0; i < n; ++i) {
+	for(int i = 0; i < n; ++i) {
 		uint32_t num = arr[i];
 		uint32_t witch = witch_bucket3(num);
-		uint32_t offset = (--bucket[witch]);
+		int offset = (--bucket[witch]);
 		temparr[offset] = num;
 	}
 
 	/* temparr -> arr each bucket and sort them in-place */
 	#pragma GCC unroll 64
-	for(uint32_t b = 0; b < 4096; ++b) {
-		uint32_t begin = bucket[b];
-		uint32_t end = bucket_end[b];
+	for(int b = 0; b < 4096; ++b) {
+		int begin = bucket[b];
+		int end = bucket_end[b];
 
 		/* Ensure exists */
 		if(begin >= end) {
diff --git a/threepass_xbit.h b/threepass_xbit.h
index 3e619a8..4298664 100644
--- a/threepass_xbit.h
+++ b/threepass_xbit.h
@@ -20,34 +20,6 @@ static inline constexpr uint32_t min3u32_xb(uint32_t a, uint32_t b, uint32_t c)
 		((b <= c) ? b : c);
 }
 
-/** Copy the elements to their respective radics-place (f->t  copy) */
-static inline void copy_radics_tpxp(uint32_t *f, uint32_t *t, uint32_t *bucket, uint32_t shr, uint32_t mask, uint32_t n) {
-	// right-to-left to ensure already sorted digits order we keep for iterations
-	#pragma GCC unroll 48
-	for(uint32_t i = n; i > 0; --i) {
-		// Prefetch caches
-		//__builtin_prefetch(&a[i-8]);
-		// Get num and its new offset / location
-		auto num = f[i - 1];
-		auto bkeyni = (num >> shr) & mask;
-		auto offset = --bucket[bkeyni];
-
-		// Add to the proper target location
-		t[offset] = num;
-	}
-}
-
-/* I pulled these out only for better flame graph support */
-/** Count occurences (can count together with good ILP) */
-static inline void count_occurences_tpxp(uint32_t *bucket1, uint32_t *bucket2, uint32_t *bucket3, const uint32_t shr1, const uint32_t shr2, const uint32_t shr3, const uint32_t mask1, const uint32_t mask2, const uint32_t mask3, uint32_t *a, uint32_t n) noexcept {
-	#pragma GCC unroll 64
-	for(uint32_t i = 0; i < n; ++i) {
-		++bucket1[(a[i] >> shr1) & mask1];
-		++bucket2[(a[i] >> shr2) & mask2];
-		++bucket3[(a[i] >> shr3) & mask3];
-	}
-}
-
 /**
  * Simple three-pass (ok: 3 + 1) bottom-up internal radix sort writter for thiersort3
  *
@@ -55,17 +27,17 @@ static inline void count_occurences_tpxp(uint32_t *bucket1, uint32_t *bucket2, u
  * @param buf Result array with the same size - result will be here
  * @param n The number of elements
  */
-static inline void threepass_xb(uint32_t *a, uint32_t *buf, uint32_t n) noexcept {
+static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
 	assert(buf != NULL);
-	constexpr uint32_t shr1 = TPBX3 + TPBX2;
-	constexpr uint32_t shr2 = TPBX3;
-	constexpr uint32_t shr3 = 0;
-	constexpr uint32_t mask1 = (1 << TPBX1) - 1;
-	constexpr uint32_t mask2 = (1 << TPBX2) - 1;
-	constexpr uint32_t mask3 = (1 << TPBX3) - 1;
+	constexpr int shr1 = TPBX3 + TPBX2;
+	constexpr int shr2 = TPBX3;
+	constexpr int shr3 = 0;
+	constexpr int mask1 = (1 << TPBX1) - 1;
+	constexpr int mask2 = (1 << TPBX2) - 1;
+	constexpr int mask3 = (1 << TPBX3) - 1;
 
 	/* helper buffers. */
-	uint32_t sz = n * sizeof(a[0]);
+	int sz = n * sizeof(a[0]);
 
 	static thread_local uint32_t bucket1[1 << TPBX1];
 	memset(bucket1, 0, (1 << TPBX1) * sizeof(uint32_t));
@@ -74,7 +46,13 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, uint32_t n) noexcept
 	static thread_local uint32_t bucket3[1 << TPBX3];
 	memset(bucket3, 0, (1 << TPBX3) * sizeof(uint32_t));
 
-	count_occurences_tpxp(bucket1, bucket2, bucket3, shr1, shr2, shr3, mask1, mask2, mask3, a, n);
+	/* Count occurences (can count together with good ILP) */
+	#pragma GCC unroll 64
+	for(uint32_t i = 0; i < n; ++i) {
+		++bucket1[(a[i] >> shr1) & mask1];
+		++bucket2[(a[i] >> shr2) & mask2];
+		++bucket3[(a[i] >> shr3) & mask3];
+	}
 
 	/* Count prefix sums - try as much ILP as possible because bigger arrays than usual! */
 	uint32_t prev1 = 0;
@@ -85,7 +63,7 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, uint32_t n) noexcept
 		(1 << TPBX2),
 		(1 << TPBX3)
 	);
-	uint32_t i = 0;
+	int i = 0;
 	#pragma GCC unroll 8
 	for (; i < common; ++i) {
 		bucket1[i] += prev1;
@@ -96,29 +74,63 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, uint32_t n) noexcept
 		prev3 = bucket3[i];
 	}
 	/* Do remaining 1 */
-	for (uint32_t j = i; j < (1 << TPBX1); ++j) {
+	for (int j = i; j < (1 << TPBX1); ++j) {
 		bucket1[j] += prev1;
 		prev1 = bucket1[j];
 	}
 	/* Do remaining 2 */
-	for (uint32_t j = i; j< (1 << TPBX2); ++j) {
+	for (int j = i; j< (1 << TPBX2); ++j) {
 		bucket2[j] += prev2;
 		prev2 = bucket2[j];
 	}
 	/* Do remaining 3 */
-	for (uint32_t j = i; j < (1 << TPBX3); ++j) {
+	for (int j = i; j < (1 << TPBX3); ++j) {
 		bucket3[j] += prev3;
 		prev3 = bucket3[j];
 	}
 
 	// Bottom digit a->buf
-	copy_radics_tpxp(a, buf, bucket3, shr3, mask3, n);
+	// right-to-left to ensure already sorted digits order we keep for iterations
+	#pragma GCC unroll 48
+	for(uint32_t i = n; i > 0; --i) {
+		// Prefetch caches
+		//__builtin_prefetch(&a[i-8]);
+		// Get num and its new offset / location
+		auto num = a[i - 1];
+		auto bkeyni = (num >> shr3) & mask3;
+		auto offset = --bucket3[bkeyni];
 
+		// Add to the proper target location
+		buf[offset] = num;
+	}
 	// Mid digit buf->a
-	copy_radics_tpxp(buf, a, bucket2, shr2, mask2, n);
+	// right-to-left to ensure already sorted digits order we keep for iterations
+	#pragma GCC unroll 48
+	for(uint32_t i = n; i > 0; --i) {
+		// Prefetch caches
+		//__builtin_prefetch(&buf[i-8]);
+		// Get num and its new offset / location
+		auto num = buf[i - 1];
+		auto bkeyni = (num >> shr2) & mask2;
+		auto offset = --bucket2[bkeyni];
 
+		// Add to the proper target location
+		a[offset] = num;
+	}
 	// Top digit a->buf
-	copy_radics_tpxp(a, buf, bucket1, shr1, mask1, n);
+	// right-to-left to ensure already sorted digits order we keep for iterations
+	#pragma GCC unroll 48
+	for(uint32_t i = n; i > 0; --i) {
+		// Prefetch caches
+		// __builtin_prefetch(&a[i-16]);
+		// Get num and its new offset / location
+		auto num = a[i - 1];
+		auto bkeyni = (num >> shr1) & mask1;
+		auto offset = --bucket1[bkeyni];
+
+		// Add to the proper target location
+		buf[offset] = num;
+	}
 }
 
 #endif /* THREE_PASS_XB_H */
diff --git a/ypsu.cpp b/ypsu.cpp
index 48c6988..e3afa07 100644
--- a/ypsu.cpp
+++ b/ypsu.cpp
@@ -858,8 +858,7 @@ void measure_single(int n) {
     //measure(inputtype, "sp", [&] { spsort(&v[0], v.size()); });
     //measure(inputtype, "magyar", [&] { MagyarSort::sort<uint32_t>(&v[0], v.size()); });
     //measure(inputtype, "thier2", [&] { do_thier2(&v[0], v.size()); });
-    //measure(inputtype, "threep", [&] { do_threepass(&v[0], v.size()); });
-    measure(inputtype, "thier3", [&] { do_thier3(&v[0], v.size()); });
+    measure(inputtype, "threep", [&] { do_threepass(&v[0], v.size()); });
 
     for (auto r : results) printf("%9.3fs", r.second);
     puts("");
@@ -896,8 +895,8 @@ int main(int argc, char **argv) {
   printf("Sorting %d elements:\n\n", n);
 
   // Uncomment this for profiling and alg!
-  measure_single(n);
-  return 0;
+  //measure_single(n);
+  //return 0;
 
   for (auto inputtype : inputtypes) {
     printf("%10s", inputtype.c_str());