From b32c7540a21f3412a427350c59abf553877d99d6 Mon Sep 17 00:00:00 2001
From: Richard Thier <magosit@outlook.hu>
Date: Fri, 9 May 2025 03:29:20 +0200
Subject: [PATCH] schwab: goto-optimized - barely slower on my machine

---
 schwab_sort.h | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/schwab_sort.h b/schwab_sort.h
index 36bd2d6..9cd2846 100644
--- a/schwab_sort.h
+++ b/schwab_sort.h
@@ -158,11 +158,24 @@ static inline int schwab_partition(
 		/* TODO: should be copy of whole element when not just uint32s! */
 		uint32_t curr = arr[b3];
 
-		/* TODO: We can do "ILP-memcpy"s here:
-		 *
-		 * Key from b2->b3, value from b2->b3, key from b1->b2, value from b1... etc
-		 * This is likely faster than calling a memcpy if we code this for not just uint32s!
-		 */
+		/* Half-branchless, half-goto: pick the slot with ternaries, then jump past the shifts it does not need */
+		int where = (curr < klo) ? 0 :
+			((curr < kmid) ? 1 : 2);
+		int target = (curr < klo) ? b0 :
+			((curr < kmid) ? b1 : b2);
+
+		arr[b3] = arr[b2];
+		if(where == 2) goto auss;
+		arr[b2] = arr[b1];
+		if(where == 1) goto auss;
+		arr[b1] = arr[b0];
+auss:
+		++b2;
+		b1 += (where < 2);
+		b0 += (where < 1);
+		arr[target] = curr;
+
+		/* Equivalent to this if/else version, kept for reference:
 		if(curr < klo) {
 			arr[b3] = arr[b2];
 			arr[b2] = arr[b1];
@@ -176,12 +189,13 @@ static inline int schwab_partition(
 			arr[b3] = arr[b2];
 			arr[b2] = arr[b1];
 			arr[b1] = curr;
-			++b1; ++b2;
+			++b1;
 		} else {
 			arr[b3] = arr[b2];
 			arr[b2] = curr;
-			++b2;
 		}
+		++b2;
+		*/
 	}
 
 	/* [*] Swap the chosen pivot to begin of last block */