swab: fixed == to >= typo - now beats std::sort and is comparison sort fully!

schwab insertion - but buggy from some previous at n=20 rand
2025-05-09 05:05:40 +02:00 · 2025-05-09 04:49:31 +02:00
1 changed files with 75 additions and 47 deletions
--- a/schwab_sort.h
+++ b/schwab_sort.h
@@ -14,6 +14,11 @@
 #define SCHWAB_DELTA_THRESHOLD 32
 #endif /* SCHWAB_DELTA_THRESHOLD */

+/** Ranges with (high - low) at or below this are sorted by insertion sort */
+#ifndef SCHWAB_INSERTION_THRESHOLD
+#define SCHWAB_INSERTION_THRESHOLD 64
+#endif /* SCHWAB_INSERTION_THRESHOLD */
+
 typedef uint32_t sch_rand_state;

 /** Create rand state for schwab_sort using a seed - can give 0 if uninterested */
@@ -41,6 +46,22 @@ static inline void schwab_swap(uint32_t *a, uint32_t *b) {
 	*b = t;
 }

+/**
+ * Simple insertion sort for small cases: sorts arr[low..high] (both inclusive), no-op if high <= low.
+ */
+static inline void sch_insertion_sort(uint32_t *arr, int low, int high) {
+    for (int i = low + 1; i <= high; ++i) {
+        uint32_t key = arr[i];
+        int j = i;
+        /* Shift arr[low..i-1] elements greater than key one slot up; never look below low */
+        while (j > low && arr[j - 1] > key) {
+            arr[j] = arr[j - 1];
+            --j;
+        }
+        arr[j] = key;
+    }
+}
+
 /**
 * 3-way partitioning, in middle all the pivot elements.
 *
@@ -166,7 +187,7 @@ static inline int schwab_partition(

 		arr[b3] = arr[b2];
 		arr[b2] = (where == 2) ? arr[b2] : arr[b1];
-		arr[b1] = (where == 1) ? arr[b1] : arr[b0];
+		arr[b1] = (where >= 1) ? arr[b1] : arr[b0];

 		++b2;
 		b1 += (where < 2);
@@ -221,57 +242,64 @@ static inline void schwab_sort(
 		sch_rand_state *state) {

 	/* Loop handles longest sub-sort-task which ensures log tree depth */
+	/* Loop also handles start condition */
 	while(low < high) {
-		int r0 = schwab_pick_pivot(state, (high + 1) - low) + low;
-		int r1 = schwab_pick_pivot(state, (high + 1) - low) + low;
-		uint32_t klo = array[r0];
-		uint32_t khi = array[r1];
-		int plo = r0;
-		int phi = r1;
-		if(klo > khi) {
-			uint32_t ktmp = klo;
-			klo = khi;
-			khi = ktmp;
+		if(high - low > SCHWAB_INSERTION_THRESHOLD) {
+			int r0 = schwab_pick_pivot(state, (high + 1) - low) + low;
+			int r1 = schwab_pick_pivot(state, (high + 1) - low) + low;
+			uint32_t klo = array[r0];
+			uint32_t khi = array[r1];
+			int plo = r0;
+			int phi = r1;
+			if(klo > khi) {
+				uint32_t ktmp = klo;
+				klo = khi;
+				khi = ktmp;

-			plo = r1;
-			phi = r0;
-		}
-
-		uint32_t kmid = klo + (khi - klo) / 2;
-
-		int pmid;
-		int needmid = schwab_partition(array, low, high, &plo, kmid, &pmid, &phi);
-
-		/* See where NOT to recurse to avoid worst case stack depth */
-		/* Rem.: These might be "not real" length but we only use them to comparisons */
-		/* REM.: The "real" lengths might be off-by-one but these are FASTER! */
-		int lolen = plo - low;
-		int hilen = high - phi;
-
-		/* Rewrite loop for worst subtask goal and recurse others! */
-		/* Let the branch predictor try to predict input data path */
-		/* Rem.: Best would be to check for biggest in all 4 block */
-		/*       But that would complicate codes above this point! */
-		/* Rem.: Order of operations try to be a cache-friendly as */
-		/*       possible, but had to put loops changes to the end */
-		if(lolen < hilen) {
-			schwab_sort(array, low, plo - 1, state);
-			if(needmid) {
-				schwab_sort(array, plo, pmid - 1, state);
-				schwab_sort(array, pmid, phi - 1, state);
+				plo = r1;
+				phi = r0;
 			}

-			low = phi;
-			/* high = high; */
+			uint32_t kmid = klo + (khi - klo) / 2;
+
+			int pmid;
+			int needmid = schwab_partition(array, low, high, &plo, kmid, &pmid, &phi);
+
+			/* See where NOT to recurse to avoid worst case stack depth */
+			/* Rem.: These might not be the "real" lengths, but we only use them for comparisons */
+			/* REM.: The "real" lengths might be off-by-one but these are FASTER! */
+			int lolen = plo - low;
+			int hilen = high - phi;
+
+			/* Rewrite loop for worst subtask goal and recurse others! */
+			/* Let the branch predictor try to predict input data path */
+			/* Rem.: Best would be to check for biggest in all 4 blocks */
+			/*       But that would complicate the code above this point! */
+			/* Rem.: Order of operations tries to be as cache-friendly */
+			/*       as possible, but loop updates had to go to the end */
+			if(lolen < hilen) {
+				schwab_sort(array, low, plo - 1, state);
+				if(needmid) {
+					schwab_sort(array, plo, pmid - 1, state);
+					schwab_sort(array, pmid, phi - 1, state);
+				}
+
+				low = phi;
+				/* high = high; */
+			} else {
+				schwab_sort(array, phi, high, state);
+				if(needmid) {
+					schwab_sort(array, pmid, phi - 1, state);
+					schwab_sort(array, plo, pmid - 1, state);
+				}
+
+				/* low = low; */
+				high = plo - 1;
+			}
 		} else {
-			schwab_sort(array, phi, high, state);
-			if(needmid) {
-				schwab_sort(array, pmid, phi - 1, state);
-				schwab_sort(array, plo, pmid - 1, state);
-			}
-
-			/* low = low; */
-			high = plo - 1;
+			/* Just do an insertion sort instead */
+			sch_insertion_sort(array, low, high);
+			return;
 		}
 	}
 }
Author	SHA1	Message	Date
Richard Thier	95c759b9e3	swab: fixed == to >= typo - now beats std::sort and is comparison sort fully!	2025-05-09 05:05:40 +02:00
Richard Thier	147ca60672	schwab insertion - but buggy from some previous at n=20 rand	2025-05-09 04:49:31 +02:00