From 23a5bb1d552fa77be72243a86b02ffbcc3eedb4d Mon Sep 17 00:00:00 2001
From: Richard Thier <magosit@outlook.hu>
Date: Thu, 11 Apr 2024 23:59:13 +0200
Subject: [PATCH] mormordsort ILP version by me - with probably lot of bugs

---
 ypsu.cpp | 189 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 125 insertions(+), 64 deletions(-)
diff --git a/ypsu.cpp b/ypsu.cpp
index 63e24aa..85d2db1 100644
--- a/ypsu.cpp
+++ b/ypsu.cpp
@@ -1,3 +1,4 @@
+#include <utility> // std::pair
 #include <algorithm>
 #include <cassert>
 #include <chrono>
@@ -144,60 +145,70 @@ void twopass(uint32_t *a, int n) {
 //     --index
 // különben
 //     ++pivot_index
+template<int j>
+static inline bool morbittop(uint32_t elem) noexcept {
+	return (elem >> (8 * j)) & 0x80; // Only top bit
+}
+template<int j>
+static inline uint32_t morgrab(uint32_t elem) noexcept {
+	return (elem >> (8 * j)) & 0x7f; // Only 7-bit!
+}
 /**
- * Divides array into two partitions by its topmost bit.
+ * Count occurences AND divides array into two partitions by its topmost bit.
  * - Similar to quicksort partitioning, but changed accordingly.
  * - MSB 0 bit values will come first partition.
- * - The return value tells partition boundary.
  *
  * @param a The array to partition and occurence count.
  * @param n The length of the array.
+ * @param radics1 A 128-sized array for occurence counting the bottom partition.
+ * @param radics2 A 128-sized array for occurence counting the top partition.
+ * @param DIGIT The digit in question (for a morgrab<DIGIT>(..) call)
+ * @returns The partition boundaries - non-inclusive inner ends partitions. Empty partitions accordingly represented!
  */
-static inline uint32_t bit_partition(uint32_t *a, uint32_t n) noexcept {
+template<int DIGIT>
+static inline std::pair<uint32_t, uint32_t> oc_bit_partition(
+		uint32_t *a, uint32_t n, uint32_t *radics1, uint32_t *radics2) noexcept {
+	// See Hoare's OG quicksort why
 	uint32_t i = -1;
 	uint32_t j = n;
 
 	while(true) {
 		// Move past well-placed ones
-		do ++i; while (!(a[i] & 0x80));
-		do --j; while (a[j] & 0x80);
+		// And occurence count them
+		// Rem.: In quicksort usually a do-while loop
+		++i; while ((i < n) && !morbittop<DIGIT>(a[i])) {
+			++radics1[morgrab<DIGIT>(a[i])];
+			++i;
+		}
+		--j; while ((0 < j) && morbittop<DIGIT>(a[j])) {
+			++radics2[morgrab<DIGIT>(a[j])];
+			--j;
+		}
 
 		// If the indices crossed, return
-		if(i >= j) return j;
+		// Rem.: Not >= to ensure occ. counts! See also: (*)
+		if(i > j) return std::make_pair(i, j);
 
 		// Swap badly placed
-		uint32_t tmp = a[i];
-		a[i] = a[j];
-		a[j] = tmp;
+		// Rem.: No need occurence count here as above loops will handle!
+		if(i < j) {
+			uint32_t tmp = a[i];
+			a[i] = a[j];
+			a[j] = tmp;
+		}
 	}
 }
 template<int j>
-static inline uint32_t morgrab(uint32_t elem) noexcept {
-	return (elem >> (8 * j)) & 0xff;
-}
-template<int j>
 static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
 	/* Preparation */
-	uint32_t radics[256] = {0};
-	uint32_t radics2[256] = {0};
+	uint32_t radics1[128] = {0};
+	uint32_t radics2[128] = {0};
 	/* [from, to) index: only where prefix sums change - usually nonfull */
-	uint32_t real_radics[256 * 2] = {0};
+	uint32_t real_radics1[128 * 2] = {0};
+	uint32_t real_radics2[128 * 2] = {0};
 
-	/* Occurence counting O(n) */
-	/* We can go both down and upwards here to increase ILP or even do SSE2 */
-	uint32_t k1 = 0;
-	uint32_t k2 = (n - 1);
-	#pragma GCC unroll 64
-	for(k1 = 0; k1 < k2; ++k1, --k2) {
-		++radics[morgrab<j>(a[k1])];
-		++radics2[morgrab<j>(a[k2])];
-	}
-	if(k1 == k2) {
-		++radics[morgrab<j>(a[k1])];
-	}
-	for(int i = 0; i < 256; ++i) {
-		radics[i] += radics2[i];
-	}
+	// Count occurences and partition by topmost bit
+	uint32_t n2 = oc_bit_partition<j>(a, n, radics1, radics2) + 1;
 
 	/* Prefix sum + real radics calc O(256) */
 	/* Radics: */
@@ -207,48 +218,95 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
 	/* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */
 	/*         0.        1.        2.        4.        5.    */
 	/*       (because radix value 3 is not found in input)   */
-	uint32_t prev = 0;
-	uint32_t reali = 0;
+	uint32_t prev1 = 0;
+	uint32_t reali1 = 0;
+	uint32_t prev2 = 0;
+	uint32_t reali2 = 0;
 	#pragma GCC unroll 16
-	for(int i = 0; i < 256; ++i) {
-		if(radics[i] != 0) {
-			radics[i] += prev;
-			real_radics[reali] = prev;
-			real_radics[reali + 1] = radics[i];
-			prev = radics[i];
-			reali += 2;
+	for(int i = 0; i < 128; ++i) {
+		// Hopefully we get more ILP out of this
+		// Also I tried branchless before adding
+		// ILP here and it slowed things, so first
+		// let us try it with branch prediction!
+		if(radics1[i] != 0) {
+			radics1[i] += prev1;
+			real_radics1[reali1] = prev1;
+			real_radics1[reali1 + 1] = radics1[i];
+			prev1 = radics1[i];
+			reali1 += 2;
 		} else {
-			radics[i] += prev;
-			prev = radics[i];
+			radics1[i] += prev1;
+			prev1 = radics1[i];
+		}
+		if(radics2[i] != 0) {
+			radics2[i] += prev2;
+			real_radics2[reali2] = prev2;
+			real_radics2[reali2 + 1] = radics2[i];
+			prev2 = radics2[i];
+			reali2 += 2;
+		} else {
+			radics2[i] += prev2;
+			prev2 = radics2[i];
 		}
 	}
 
-	// Inplace swap
-	uint32_t pivoti = 0;
-	while(pivoti < n) {
-		uint32_t radixval = morgrab<j>(a[pivoti]);
-		uint32_t targeti = radics[radixval] - 1;
-		if(targeti > pivoti) {
-			// swap
-			uint32_t tmp = a[pivoti];
-			a[pivoti] = a[targeti];
-			a[targeti] = tmp;
-			// dec index
-			--radics[radixval];
-		} else {
-			// progress pivot
-			++pivoti;
-		}
+	// Inplace swap, with added ILP / branchless opt.
+	// Without it its data dependent like crazy...
+	uint32_t pivoti1 = 0;
+	uint32_t pivoti2 = n2;
+	while((pivoti1 < n2) && (pivoti2 < n)) {
+
+		/* Pivot 1 */
+
+		uint32_t radixval1 = morgrab<j>(a[pivoti1]);
+		uint32_t targeti1 = --radics1[radixval1]; // dec index (!)
+
+		// Bitmask: true -> 11.....1; false -> 00.....0
+		uint32_t mask1 = ~((targeti1 > pivoti1) - 1);
+
+		// Branchless swap (using bitmask)
+		uint32_t delta1 = (a[pivoti1] ^ a[targeti1]) & mask1;
+		a[pivoti1] = a[pivoti1] ^ delta1;
+		a[targeti1] = a[targeti1] ^ delta1;
+
+		// "else" branch
+		pivoti1 += !mask1;
+		radics1[radixval1] += !mask1; // undec index (!)
+
+		/* Pivot 2 */
+
+		uint32_t radixval2 = morgrab<j>(a[pivoti2]);
+		uint32_t targeti2 = --radics2[radixval2]; // dec index (!)
+
+		// Bitmask: true -> 11.....1; false -> 00.....0
+		uint32_t mask2 = ~((targeti2 > pivoti2) - 1);
+
+		// Branchless swap (using bitmask)
+		uint32_t delta2 = (a[pivoti2] ^ a[targeti2]) & mask2;
+		a[pivoti2] = a[pivoti2] ^ delta2;
+		a[targeti2] = a[targeti2] ^ delta2;
+
+		// "else" branch
+		pivoti2 += !mask2;
+		radics2[radixval2] += !mask2; // undec index (!)
 	}
 
-	// Ends recursion
+	// Possible recursions
 	if constexpr (j != 0) {
-		// Recursion
-		for(int i = 0; i < reali; i += 2) {
+		/* Partition 1 recursions */
+		for(int i = 0; i < reali1; i += 2) {
 			/* inclusive */
-			uint32_t from = real_radics[i];
+			uint32_t from = real_radics1[i];
 			/* non-inclusive */
-			uint32_t to = real_radics[i + 1];
+			uint32_t to = real_radics1[i + 1];
+			mormord_sort_impl<j - 1>(&a[from], (to - (from)));
+		}
+		/* Partition 2 recursions */
+		for(int i = 0; i < reali2; i += 2) {
+			/* inclusive */
+			uint32_t from = real_radics2[i];
+			/* non-inclusive */
+			uint32_t to = real_radics2[i + 1];
 			mormord_sort_impl<j - 1>(&a[from], (to - (from)));
 		}
 	}
@@ -654,11 +712,12 @@ void measure_single(int n) {
 
 int main(void) {
   //int n = 100000000;
-  int n = 10000000;
+  //int n = 10000000;
   //int n = 1000000;
   //int n = 100000;
   //int n = 10000;
   //int n = 1000;
+  int n = 200;
   //int n = 100;
   //int n = 10;
 
@@ -686,9 +745,11 @@ int main(void) {
         w.swap(buf);
       }
     });
+    /*
     w = v;
     measure(inputtype, "magyar", [&] { MagyarSort::sort<uint32_t>(&w[0], w.size()); });
     assert(w == expected);
+    */
     w = v;
     measure(inputtype, "mormord", [&] { mormord_sort(&w[0], w.size()); });
     assert(w == expected);