From ccdf991824f5f871559d7fcff9bf2675eb88b55c Mon Sep 17 00:00:00 2001
From: Richard Thier <magosit@outlook.hu>
Date: Wed, 1 Oct 2025 04:26:44 +0200
Subject: [PATCH] Revert "tpxb: 16-wide manual unroll - but it does not seem to
 be faster"

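This reverts commit 6d794612624b445c8e4dae4ea3ee3b42b6a4c92f.

The 16-wide manually unrolled passes in threepass_xb (together with the
tpxb_process_element helper and the explicit __builtin_prefetch calls) are
removed again, and the three digit passes go back to plain per-element
loops annotated with "#pragma GCC unroll 48".

Note: this change also uncomments "measure_single(n); return 0;" in
main() of ypsu.cpp, so the per-input-type loop that follows it is no
longer reached.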
---
 threepass_xbit.h | 188 ++++++-----------------------------------------
 ypsu.cpp         |   4 +-
 2 files changed, 26 insertions(+), 166 deletions(-)

diff --git a/threepass_xbit.h b/threepass_xbit.h
index 840e779..4298664 100644
--- a/threepass_xbit.h
+++ b/threepass_xbit.h
@@ -20,13 +20,6 @@ static inline constexpr uint32_t min3u32_xb(uint32_t a, uint32_t b, uint32_t c)
 		((b <= c) ? b : c);
 }
 
-static inline void tpxb_process_element(uint32_t num, uint32_t* arr, uint32_t* bucket,
-		uint32_t shr, uint32_t mask) {
-	auto bkeyni = (num >> shr) & mask;
-	auto offset = --bucket[bkeyni];
-	arr[offset] = num;
-}
-
 /**
  * Simple three-pass (ok: 3 + 1) bottom-up internal radix sort writter for thiersort3
  *
@@ -98,177 +91,44 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
 
 	// Bottom digit a->buf
 	// right-to-left to ensure already sorted digits order we keep for iterations
-	#pragma GCC unroll 3
-	for(i = n; i >= 16; i -= 16) {
-		// Prefetch the NEXT block (not current) at optimal distance
-		if (i > 17) {  // Ensure we don't prefetch out of bounds
-			__builtin_prefetch(&a[i - 17]);
-		}
-		if (i > 17*2) {  // Ensure we don't prefetch out of bounds
-			__builtin_prefetch(&a[i - 17*2]);
-		}
-		if (i > 17*3) {  // Ensure we don't prefetch out of bounds
-			__builtin_prefetch(&a[i - 17*3]);
-		}
-
-		// Process 16 elements in reverse order
-		auto num15 = a[i - 1];
-		auto num14 = a[i - 2];
-		auto num13 = a[i - 3];
-		auto num12 = a[i - 4];
-		auto num11 = a[i - 5];
-		auto num10 = a[i - 6];
-		auto num9  = a[i - 7];
-		auto num8  = a[i - 8];
-		auto num7  = a[i - 9];
-		auto num6  = a[i - 10];
-		auto num5  = a[i - 11];
-		auto num4  = a[i - 12];
-		auto num3  = a[i - 13];
-		auto num2  = a[i - 14];
-		auto num1  = a[i - 15];
-		auto num0  = a[i - 16];
-
-		// Process all 16 elements (your bucket logic here)
-		tpxb_process_element(num15, buf, bucket3, shr3, mask3);
-		tpxb_process_element(num14, buf, bucket3, shr3, mask3);
-		tpxb_process_element(num13, buf, bucket3, shr3, mask3);
-		tpxb_process_element(num12, buf, bucket3, shr3, mask3);
-		tpxb_process_element(num11, buf, bucket3, shr3, mask3);
-		tpxb_process_element(num10, buf, bucket3, shr3, mask3);
-		tpxb_process_element(num9, buf, bucket3, shr3, mask3);
-		tpxb_process_element(num8, buf, bucket3, shr3, mask3);
-		tpxb_process_element(num7, buf, bucket3, shr3, mask3);
-		tpxb_process_element(num6, buf, bucket3, shr3, mask3);
-		tpxb_process_element(num5, buf, bucket3, shr3, mask3);
-		tpxb_process_element(num4, buf, bucket3, shr3, mask3);
-		tpxb_process_element(num3, buf, bucket3, shr3, mask3);
-		tpxb_process_element(num2, buf, bucket3, shr3, mask3);
-		tpxb_process_element(num1, buf, bucket3, shr3, mask3);
-		tpxb_process_element(num0, buf, bucket3, shr3, mask3);
-	}
-	// Handle remainder (less than 16 elements)
-	for(uint32_t j = i; j > 0; --j) {
-		auto num = a[j - 1];
+	#pragma GCC unroll 48
+	for(uint32_t i = n; i > 0; --i) {
+		// Prefetch caches
+		//__builtin_prefetch(&a[i-8]);
+		// Get num and its new offset / location
+		auto num = a[i - 1];
 		auto bkeyni = (num >> shr3) & mask3;
 		auto offset = --bucket3[bkeyni];
+
+		// Add to the proper target location
 		buf[offset] = num;
 	}
-
 	// Mid digit buf->a
 	// right-to-left to ensure already sorted digits order we keep for iterations
-	#pragma GCC unroll 3
-	for(i = n; i >= 16; i -= 16) {
-		// Prefetch the NEXT block (not current) at optimal distance
-		if (i > 17) {  // Ensure we don't prefetch out of bounds
-			__builtin_prefetch(&buf[i - 17]);
-		}
-		if (i > 17*2) {  // Ensure we don't prefetch out of bounds
-			__builtin_prefetch(&buf[i - 17*2]);
-		}
-		if (i > 17*3) {  // Ensure we don't prefetch out of bounds
-			__builtin_prefetch(&buf[i - 17*3]);
-		}
-
-		// Process 16 elements in reverse order
-		auto num15 = buf[i - 1];
-		auto num14 = buf[i - 2];
-		auto num13 = buf[i - 3];
-		auto num12 = buf[i - 4];
-		auto num11 = buf[i - 5];
-		auto num10 = buf[i - 6];
-		auto num9  = buf[i - 7];
-		auto num8  = buf[i - 8];
-		auto num7  = buf[i - 9];
-		auto num6  = buf[i - 10];
-		auto num5  = buf[i - 11];
-		auto num4  = buf[i - 12];
-		auto num3  = buf[i - 13];
-		auto num2  = buf[i - 14];
-		auto num1  = buf[i - 15];
-		auto num0  = buf[i - 16];
-
-		// Process all 16 elements (your bucket logic here)
-		tpxb_process_element(num15, a, bucket2, shr2, mask2);
-		tpxb_process_element(num14, a, bucket2, shr2, mask2);
-		tpxb_process_element(num13, a, bucket2, shr2, mask2);
-		tpxb_process_element(num12, a, bucket2, shr2, mask2);
-		tpxb_process_element(num11, a, bucket2, shr2, mask2);
-		tpxb_process_element(num10, a, bucket2, shr2, mask2);
-		tpxb_process_element(num9, a, bucket2, shr2, mask2);
-		tpxb_process_element(num8, a, bucket2, shr2, mask2);
-		tpxb_process_element(num7, a, bucket2, shr2, mask2);
-		tpxb_process_element(num6, a, bucket2, shr2, mask2);
-		tpxb_process_element(num5, a, bucket2, shr2, mask2);
-		tpxb_process_element(num4, a, bucket2, shr2, mask2);
-		tpxb_process_element(num3, a, bucket2, shr2, mask2);
-		tpxb_process_element(num2, a, bucket2, shr2, mask2);
-		tpxb_process_element(num1, a, bucket2, shr2, mask2);
-		tpxb_process_element(num0, a, bucket2, shr2, mask2);
-	}
-	// Handle remainder (less than 16 elements)
-	for(uint32_t j = i; j > 0; --j) {
-		auto num = buf[j - 1];
+	#pragma GCC unroll 48
+	for(uint32_t i = n; i > 0; --i) {
+		// Prefetch caches
+		//__builtin_prefetch(&buf[i-8]);
+		// Get num and its new offset / location
+		auto num = buf[i - 1];
 		auto bkeyni = (num >> shr2) & mask2;
 		auto offset = --bucket2[bkeyni];
+
+		// Add to the proper target location
 		a[offset] = num;
 	}
 	// Top digit a->buf
 	// right-to-left to ensure already sorted digits order we keep for iterations
-	#pragma GCC unroll 3
-	for(i = n; i >= 16; i -= 16) {
-		// Prefetch the NEXT block (not current) at optimal distance
-		if (i > 17) {  // Ensure we don't prefetch out of bounds
-			__builtin_prefetch(&a[i - 17]);
-		}
-		if (i > 17*2) {  // Ensure we don't prefetch out of bounds
-			__builtin_prefetch(&a[i - 17*2]);
-		}
-		if (i > 17*3) {  // Ensure we don't prefetch out of bounds
-			__builtin_prefetch(&a[i - 17*3]);
-		}
-
-		// Process 16 elements in reverse order
-		auto num15 = a[i - 1];
-		auto num14 = a[i - 2];
-		auto num13 = a[i - 3];
-		auto num12 = a[i - 4];
-		auto num11 = a[i - 5];
-		auto num10 = a[i - 6];
-		auto num9  = a[i - 7];
-		auto num8  = a[i - 8];
-		auto num7  = a[i - 9];
-		auto num6  = a[i - 10];
-		auto num5  = a[i - 11];
-		auto num4  = a[i - 12];
-		auto num3  = a[i - 13];
-		auto num2  = a[i - 14];
-		auto num1  = a[i - 15];
-		auto num0  = a[i - 16];
-
-		// Process all 16 elements (your bucket logic here)
-		tpxb_process_element(num15, buf, bucket1, shr1, mask1);
-		tpxb_process_element(num14, buf, bucket1, shr1, mask1);
-		tpxb_process_element(num13, buf, bucket1, shr1, mask1);
-		tpxb_process_element(num12, buf, bucket1, shr1, mask1);
-		tpxb_process_element(num11, buf, bucket1, shr1, mask1);
-		tpxb_process_element(num10, buf, bucket1, shr1, mask1);
-		tpxb_process_element(num9, buf, bucket1, shr1, mask1);
-		tpxb_process_element(num8, buf, bucket1, shr1, mask1);
-		tpxb_process_element(num7, buf, bucket1, shr1, mask1);
-		tpxb_process_element(num6, buf, bucket1, shr1, mask1);
-		tpxb_process_element(num5, buf, bucket1, shr1, mask1);
-		tpxb_process_element(num4, buf, bucket1, shr1, mask1);
-		tpxb_process_element(num3, buf, bucket1, shr1, mask1);
-		tpxb_process_element(num2, buf, bucket1, shr1, mask1);
-		tpxb_process_element(num1, buf, bucket1, shr1, mask1);
-		tpxb_process_element(num0, buf, bucket1, shr1, mask1);
-	}
-	// Handle remainder (less than 16 elements)
-	for(uint32_t j = i; j > 0; --j) {
-		auto num = a[j - 1];
+	#pragma GCC unroll 48
+	for(uint32_t i = n; i > 0; --i) {
+		// Prefetch caches
+		// __builtin_prefetch(&a[i-16]);
+		// Get num and its new offset / location
+		auto num = a[i - 1];
 		auto bkeyni = (num >> shr1) & mask1;
 		auto offset = --bucket1[bkeyni];
+
+		// Add to the proper target location
 		buf[offset] = num;
 	}
 }
diff --git a/ypsu.cpp b/ypsu.cpp
index ebd776e..57388ef 100644
--- a/ypsu.cpp
+++ b/ypsu.cpp
@@ -895,8 +895,8 @@ int main(int argc, char **argv) {
   printf("Sorting %d elements:\n\n", n);
 
   // Uncomment this for profiling and alg!
-  // measure_single(n);
-  // return 0;
+  measure_single(n);
+  return 0;
 
   for (auto inputtype : inputtypes) {
     printf("%10s", inputtype.c_str());