schwab: fixed various bugs, now passes magyarsorts all tests until 20k elements but very slow on constant data for some reason - needs checking

schwab: some buggy idea that segfaults - might simplify a lot instead
schwab: fixed endless run, bug in hi not bigger then mid and lo because only indices are checked lol
2025-05-09 00:56:06 +02:00 · 2025-05-08 23:09:37 +02:00 · 2025-05-08 22:47:52 +02:00 · 2025-05-08 21:47:30 +02:00
3 changed files with 198 additions and 1 deletions
--- a/data.inc
+++ b/data.inc
@ -1,5 +1,5 @@
 int data[] = {
-	8, 7, 2, 1, 0, 9, 6,
+	8, 7, 2, 1, 0, 9, 6,1,
 	8, 7, 2, 1, 0, 9, 6,
 	8, 7, 2, 1, 0, 9, 6,
 	8, 7, 2, 1, 0, 9, 6,
--- a/qs.c
+++ b/qs.c
@ -4,6 +4,7 @@
 #include <stdio.h>
 #include "qsort.h"
 #include "zssort.h"
+#include "schwab_sort.h"

 // function to print array elements
 void printArray(int array[], int size) {
@ -102,6 +103,22 @@ void qs3_sp2() {
  printArray(data, n);
 }

+// Sort the sample data from data.inc with schwab_sort and print the result
+void schwab() {
+  #include "data.inc"
+
+  int n = sizeof(data) / sizeof(data[0]);
+
+  // Fixed seed instead of "memory junk": reading an uninitialized local
+  // is undefined behaviour and schwab_rand_state documents 0 as a valid seed
+  sch_rand_state rand = schwab_rand_state(0);
+
+  // perform sort on data
+  // data is declared as int[] while the sorter takes uint32_t keys, so the
+  // conversion is spelled out; the sample values visible here are non-negative
+  // NOTE(review): assumes int is 32 bits wide - confirm for the target platform
+  schwab_sort((uint32_t *)data, 0, n - 1, &rand);
+
+  printf("(schwab) Sorted array in ascending order: \n");
+  printArray(data, n);
+}
+
 int main() {
 	qs();
 	qsr();
@ -109,5 +126,6 @@ int main() {
 	qs3();
 	qs3_sp();
 	qs3_sp2();
+	schwab();
 	return 0;
 }
--- a/schwab_sort.h
+++ b/schwab_sort.h
@ -0,0 +1,179 @@
+#ifndef SWAB_SORT_H
+#define SWAB_SORT_H
+
+#include <stdint.h> /* uint32_t, uint64_t */
+
+/* A fast quicksort-like new alg created in Csolnok, Hungary with:
+ * - 4-way partitioning with 0..5 copies (not swaps) per elem per run
+ * - ensured O(log2(n)) worst recursion depth
+ *
+ * LICENCE: CC-BY, 2025 May 08
+ * Author: Richárd István Thier (also author of the Magyarsort)
+ */
+
+/* State word of the 32-bit LCG that schwab_sort uses for picking pivots */
+typedef uint32_t sch_rand_state;
+
+/**
+ * Build the initial random state for schwab_sort from a seed.
+ * The seed becomes the state unchanged - 0 is fine if you do not care.
+ */
+static inline sch_rand_state schwab_rand_state(uint32_t seed) {
+	sch_rand_state initial = seed;
+	return initial;
+}
+
+/**
+ * One step of a 32-bit LCG (from the author's fastrand.h).
+ * Stores the new value into *state and also returns it.
+ */
+static inline uint32_t schwab_lcg(sch_rand_state *state) {
+	const uint32_t multiplier = 1664525u;
+	const uint32_t increment = 1013904223u;
+	uint32_t next = (*state) * multiplier + increment;
+	*state = next;
+	return next;
+}
+
+/**
+ * Draw a pivot offset in [0, len-1] without a modulus (from the author's fastrand.h).
+ * Advances *state by one LCG step.
+ */
+static inline uint32_t schwab_pick_pivot(sch_rand_state *state, uint32_t len) {
+	/* Widen first so that draw * len keeps all of its 64 bits */
+	uint64_t scaled = (uint64_t)schwab_lcg(state) * len;
+	/* draw < 2^32, so the upper 32 bits of the product stay below len */
+	return (uint32_t)(scaled >> 32);
+}
+
+/**
+ * 4-way partitioning of arr[low..high] (both bounds inclusive), in place.
+ *
+ * With klo = arr[*plo] and khi = arr[*phi] as read on entry:
+ *
+ * Expects: *plo and *phi are indices in [low, high] and klo <= kmid <= khi
+ * Results: arr[low   .. *plo - 1]   <  klo
+ *          arr[*plo  .. *pmid - 1]  >= klo  and <  kmid
+ *          arr[*pmid .. *phi - 2]   >= kmid and <  khi
+ *          arr[*phi - 1]            == khi  (the high pivot itself)
+ *          arr[*phi  .. high]       >= khi
+ *
+ * Also: Everything left of *phi - 1 is smaller than khi and everything right
+ *       of it is not smaller, so the element at *phi - 1 already sits at its
+ *       final sorted position and needs no further sorting.
+ *
+ * Rem.: Elements equal to khi are never moved out of the last block, so on
+ *       constant input every element except the pivot ends up in arr[*phi..high].
+ *
+ * @param arr The array to partition
+ * @param low Inclusive smallest index.
+ * @param high Inclusive highest index.
+ * @param plo IN-OUT: input index of the low pivot, output first index whose element is not < klo.
+ * @param kmid IN: The mid splitting value (like a pivot value, but need not exist in the array)
+ * @param pmid OUT: first index whose element is not < kmid.
+ * @param phi IN-OUT: input index of the high pivot, output index right after the high pivot.
+ */
+static inline void schwab_partition(
+		uint32_t *arr,
+		int low,
+		int high,
+		int *plo,
+		uint32_t kmid,
+		int *pmid,
+		int *phi) {
+
+	/* Keys only - no element copy is made here */
+	/* Both are read before the swap below, which may move either pivot */
+	uint32_t klo = arr[*plo];
+	uint32_t khi = arr[*phi];
+
+	/* [*] Park the high pivot at arr[high]: the scan below stops before */
+	/*     that index, so the pivot is never touched while partitioning  */
+	uint32_t tmphi = arr[*phi];
+	arr[*phi] = arr[high];
+	arr[high] = tmphi;
+
+	/* Exclusive end indices of the 4 "blocks" - b0 ends the smallest vals, */
+	/* b3 is also the scan position; all blocks start out empty at low      */
+	int b0 = low, b1 = low, b2 = low, b3 = low;
+
+	while(b3 < high) {
+		/* Checked first as the hot code path for constant / smallrange data: */
+		/* the element already stands in the last block, only grow that block */
+		if(arr[b3] >= khi) {
+			++b3;
+			continue;
+		}
+
+		/* TODO: should be copy of whole element when not just uint32s! */
+		uint32_t curr = arr[b3];
+
+		/* TODO: We can do "ILP-memcpy"s here:
+		 *
+		 * Key from b2->b3, value from b2->b3, key from b1->b2, value from b1... etc
+		 * This is likely faster than calling a memcpy if we code this for not just uint32s!
+		 */
+		/* Each case below moves the first element of every block above the  */
+		/* target block to the end of that block, then drops curr into the   */
+		/* freed slot - copies instead of swaps                              */
+		if(curr < klo) {
+			/* Goes to block 0: blocks 1, 2 and 3 all shift right by one */
+			arr[b3] = arr[b2];
+			arr[b2] = arr[b1];
+			arr[b1] = arr[b0];
+			arr[b0] = curr;
+			++b0; ++b1; ++b2; ++b3;
+			continue;
+		}
+
+		if(curr < kmid) {
+			/* Goes to block 1: blocks 2 and 3 shift right by one */
+			arr[b3] = arr[b2];
+			arr[b2] = arr[b1];
+			arr[b1] = curr;
+			++b1; ++b2; ++b3;
+		} else {
+			/* Goes to block 2: only block 3 shifts right by one */
+			arr[b3] = arr[b2];
+			arr[b2] = curr;
+			++b2; ++b3;
+		}
+	}
+
+	/* [*] Swap the parked high pivot to the begin of the last block */
+	/* and step b2 over it, so the returned *phi is one bigger and   */
+	/* the pivot itself is left at index *phi - 1                    */
+	tmphi = arr[b2];
+	arr[b2] = arr[high];
+	arr[high] = tmphi;
+	++b2;
+
+	/* Handle output vars as per doc comment */
+	*plo = b0;
+	*pmid = b1;
+	*phi = b2; /* Because of: [*] */
+}
+
+/**
+ * Schwab sort - somewhat similar to quicksort, but built on 4-way partitioning.
+ *
+ * Sorts array[low..high] (both bounds inclusive) ascending, in place.
+ * A range with low >= high is left untouched.
+ *
+ * @param array The keys to sort.
+ * @param low Inclusive smallest index.
+ * @param high Inclusive highest index.
+ * @param state Random state for pivot picking - advanced twice per partitioning.
+ */
+static inline void schwab_sort(
+		uint32_t *array,
+		int low,
+		int high,
+		sch_rand_state *state) {
+
+	/* The loop continues on one of the two outer blocks, the rest is recursed */
+	while(low < high) {
+		/* Two random positions of the range give the low and high pivots */
+		int r0 = schwab_pick_pivot(state, (high + 1) - low) + low;
+		int r1 = schwab_pick_pivot(state, (high + 1) - low) + low;
+		uint32_t klo = array[r0];
+		uint32_t khi = array[r1];
+		int plo = r0;
+		int phi = r1;
+		if(klo > khi) {
+			/* Order the pivots so that klo <= khi holds for the partitioning */
+			uint32_t ktmp = klo;
+			klo = khi;
+			khi = ktmp;
+
+			plo = r1;
+			phi = r0;
+		}
+
+		/* Midpoint between the two pivot keys, written so it cannot overflow */
+		uint32_t kmid = klo + (khi - klo) / 2;
+
+		int pmid;
+		schwab_partition(array, low, high, &plo, kmid, &pmid, &phi);
+
+		/* See which outer block is the bigger one to keep it in this loop */
+		/* Rem.: lolen is the exact length of [low, plo - 1], hilen is one */
+		/*       less than the length of [phi, high] - only used to compare */
+		int lolen = plo - low;
+		int hilen = high - phi;
+
+		/* The partitioning leaves the high pivot at array[phi - 1], which is */
+		/* its final sorted position, so the third block is sorted only up to */
+		/* phi - 2. Including the pivot could hand the whole unchanged range  */
+		/* to the recursion again (e.g. {0, 1} with pivots 0 and 1); without  */
+		/* it every sub-range is strictly shorter than [low, high].           */
+		/* Rem.: Only the two outer blocks are compared - best would be to    */
+		/*       check for the biggest of all 4 blocks, a dominant middle     */
+		/*       block is still recursed into here.                           */
+		/* Rem.: Order of operations tries to be as cache-friendly as         */
+		/*       possible, but the loop variable change had to go to the end  */
+		if(lolen < hilen) {
+			schwab_sort(array, low, plo - 1, state);
+			schwab_sort(array, plo, pmid - 1, state);
+			schwab_sort(array, pmid, phi - 2, state);
+
+			low = phi;
+			/* high = high; */
+		} else {
+			schwab_sort(array, phi, high, state);
+			schwab_sort(array, pmid, phi - 2, state);
+			schwab_sort(array, plo, pmid - 1, state);
+
+			/* low = low; */
+			high = plo - 1;
+		}
+	}
+}
+
+#endif /* SWAB_SORT_H */
Author	SHA1	Message	Date
Richard Thier	c06f02bc94	schwab: fixed various bugs, now passes magyarsorts all tests until 20k elements but very slow on constant data for some reason - needs checking	2025-05-09 00:56:06 +02:00
Richard Thier	6fcf79bee3	schwab: some buggy idea that segfaults - might simplify a lot instead	2025-05-08 23:09:37 +02:00
Richard Thier	6d8802f479	schwab: fixed endless run, bug in hi not bigger then mid and lo because only indices are checked lol	2025-05-08 22:47:52 +02:00
Richard Thier	5775e6c201	schwab_sort but can endless loop	2025-05-08 21:47:30 +02:00