#ifndef SCHWAB_SORT_H
#define SCHWAB_SORT_H

#include <stdint.h> /* uint32_t, uint64_t */

/* A fast quicksort-like algorithm created in Csolnok, Hungary with:
 * - 4-way partitioning with 0..5 copies (not swaps) per element per run
 * - guaranteed O(log2(n)) worst-case recursion depth
 *
 * LICENCE: CC-BY, 2025 May 08-09
 * Author: Richárd István Thier (also author of the Magyarsort)
 */

/** 3-way optimization for small-range and constant inputs - 0 turns this off! */
#ifndef SCHWAB_DELTA_THRESHOLD
#define SCHWAB_DELTA_THRESHOLD 32
#endif /* SCHWAB_DELTA_THRESHOLD */

/** Below this many elements we do insertion sort */
#ifndef SCHWAB_INSERTION_THRESHOLD
#define SCHWAB_INSERTION_THRESHOLD 128
#endif /* SCHWAB_INSERTION_THRESHOLD */

/** Below this many elements we do selection sort */
#ifndef SCHWAB_SELECTION_THRESHOLD
#define SCHWAB_SELECTION_THRESHOLD 8
#endif /* SCHWAB_SELECTION_THRESHOLD */

typedef uint32_t sch_rand_state;

/** Create rand state for schwab_sort using a seed - can give 0 if uninterested */
static inline sch_rand_state schwab_rand_state(uint32_t seed) {
    return seed;
}

/** 32-bit LCG for fast random generation - from my fastrand.h */
static inline uint32_t schwab_lcg(sch_rand_state *state) {
    *state = *state * 1664525u + 1013904223u;
    return *state;
}
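
/* Rem.: 1664525 / 1013904223 are the classic "Numerical Recipes" LCG
 * constants. A likely reason the pivot picker below reduces the range via
 * multiply-shift (besides avoiding modulus) is that it weights the high
 * bits, which are the stronger bits of a power-of-two-modulus LCG. */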

/** Get pivot index in [0, len-1] without modulus - from my fastrand.h */
static inline uint32_t schwab_pick_pivot(sch_rand_state *state, uint32_t len) {
    uint32_t rand = schwab_lcg(state);
    /* Multiply by len, take the upper 32 bits of the 64-bit result */
    return (uint32_t)(((uint64_t)rand * len) >> 32);
}
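
/* Worked example of the multiply-shift reduction: rand = 0x80000000 (2^31,
 * halfway through the 32-bit range) and len = 10 give (2^31 * 10) >> 32 = 5,
 * halfway through [0, 10). The slight bias when len does not divide 2^32 is
 * harmless for pivot picking. */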

/** Swap operation */
static inline void schwab_swap(uint32_t *a, uint32_t *b) {
    uint32_t t = *a;
    *a = *b;
    *b = t;
}

/** Branchless conditional swap: leaves the smaller value in (a) and the
 * bigger in (b). Rem.: arguments are evaluated multiple times - do not
 * pass expressions with side effects! */
#define SCH_CSWAP(a, b) do { \
    uint32_t _x = (a), _y = (b); \
    uint32_t _gt = -(_x > _y); \
    uint32_t _tmp = (_x ^ _y) & _gt; \
    (a) = _x ^ _tmp; \
    (b) = _y ^ _tmp; \
} while (0)
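
/* How the mask works: (_x > _y) is 0 or 1, so _gt is either 0 or all-ones
 * 0xFFFFFFFF; _tmp is then either 0 or _x ^ _y, and v ^ (_x ^ _y) maps _x
 * to _y and _y to _x. E.g. _x = 7, _y = 3: _tmp = 4, so 7^4 = 3 and 3^4 = 7. */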

/**
 * Tiny unrolled register-heapsort of exactly 5 elements.
 *
 * This heap:
 *
 *        a[0]
 *       /    \
 *     a[1]  a[2]
 *     /  \
 *   a[3]  a[4]
 *
 * @param a The array
 */
static inline void sch_hsort5(uint32_t *a) {
    /* Build max heap - parent a[1] must end up >= a[3] and a[4] */
    /* Rem.: operand order matters - SCH_CSWAP(x, y) leaves the max in y! */
    SCH_CSWAP(a[3], a[1]);
    SCH_CSWAP(a[4], a[1]);

    /* Heapify(0) */
    /* Max child = (a[2] > a[1]) ? 2 : 1; */
    /* SCH_CSWAP(a[maxChild], a[0]) */
    uint32_t cmp = -(a[2] > a[1]);
    uint32_t i = ((1 ^ 2) & cmp) ^ 1;
    /* (3 & cmp) ^ 1 yields 2 when cmp is all-ones, else 1 */
    SCH_CSWAP(a[i], a[0]);

    /* Sort phase */

    /* 1st max to end */
    SCH_CSWAP(a[0], a[4]);
    /* heapify a[0..3] */
    cmp = -(a[2] > a[1]);
    i = ((1 ^ 2) & cmp) ^ 1;
    SCH_CSWAP(a[i], a[0]);

    /* 2nd max to end */
    SCH_CSWAP(a[0], a[3]);
    /* heapify a[0..2] */
    cmp = -(a[2] > a[1]);
    i = ((1 ^ 2) & cmp) ^ 1;
    SCH_CSWAP(a[i], a[0]);

    /* 3rd max to end */
    SCH_CSWAP(a[0], a[2]);

    /* Final two */
    SCH_CSWAP(a[0], a[1]);
}
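
/* Rem.: sch_insertion_sort below runs this on the first two 5-element
 * chunks of its range as a cheap pre-pass before the insertion scan. */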

/** Simple insertion sort for small cases */
static inline void sch_insertion_sort(uint32_t *arr, int low, int high) {
    /* Dual heapsort5 probably helps insertion speed. */
    /* This is sane, because insertion benefits from */
    /* data being "basically nearly sorted" as input. */
    if(high - low > 10) {
        sch_hsort5(&arr[low]);
        sch_hsort5(&arr[low + 5]);
    }

    /* The "real" insertion sort part comes here */
    for(int i = low + 1; i <= high; ++i) {
        uint32_t key = arr[i];

        /* Move elements of arr[low..i-1] that are greater than key */
        /* one position ahead of their current position */
        int j = i;
        #pragma GCC unroll 2
        while(j > low && arr[j - 1] > key) {
            arr[j] = arr[j - 1];
            --j;
        }
        arr[j] = key;
    }
}

/** Simple SELECTION sort for small cases - not necessarily better */
static inline void sch_selection_sort(uint32_t *arr, int low, int high) {
    #pragma GCC unroll 2
    for(int i = low; i < high; ++i) {
        /* Min-search the remaining array */
        int mini = i;
        #pragma GCC unroll 4
        for(int j = i + 1; j <= high; ++j) {
            if(arr[j] < arr[mini]) mini = j;
        }

        if(mini != i) {
            schwab_swap(&arr[i], &arr[mini]);
        }
    }
}

/**
 * 3-way partitioning: all the pivot-valued elements end up in the middle.
 *
 * Single-pass version 2.0, taken from qsort.h
 *
 * @param array The array to partition
 * @param low From when. (inclusive)
 * @param high Until when. (inclusive too!)
 * @param pivotval This value is used to partition the array.
 * @param plo OUT: below this index, more processing might be needed.
 * @param phi OUT: from this index on, more processing might be needed.
 */
static inline void schwab_partition3sp2(
        uint32_t *array,
        int low,
        int high,
        uint32_t pivotval,
        int *plo,
        int *phi) {

    /* Invariant for left: index up to which smaller (than pivot) elements lay */
    int il = (low - 1);
    /* Invariant for right: index from which (from the top) bigger elements lay */
    int ir = (high + 1);
    /* Indices from where we swap left and right into "is" (and sometimes swap among here too) */
    int jl = low;
    int jr = high;

    while(jl <= jr) {
        /* Handle left side and find a wrongly placed element */
        /* (bounds check first so we never read past the range) */
        while((jl <= jr) && (array[jl] <= pivotval)) {
            int isNonPivot = (array[jl] != pivotval);
            int nonSameIndex = (il + 1 != jl);
            if(isNonPivot & nonSameIndex)
                schwab_swap(&array[il + 1], &array[jl]);
            il += isNonPivot;
            ++jl;
        }

        /* Handle right side and find a wrongly placed element */
        while((jl <= jr) && (array[jr] >= pivotval)) {
            int isNonPivot = (array[jr] != pivotval);
            int nonSameIndex = (ir - 1 != jr);
            if(isNonPivot & nonSameIndex)
                schwab_swap(&array[ir - 1], &array[jr]);
            ir -= isNonPivot;
            --jr;
        }

        /* Swap the two found elements that are wrongly placed */
        if(jl < jr) schwab_swap(&array[jl], &array[jr]);
    }

    /* Output the partition points */
    *plo = il + 1; /* XXX: changed from qsort.h to +1 here! */
    *phi = ir;
}
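
/* Sketch of one run (hypothetical values): pivotval = 5 on {7, 5, 2, 5, 9, 1}
 * yields {1, 2, 5, 5, 9, 7} with *plo = 2 and *phi = 4, i.e.
 * array[low..plo-1] < pivotval, array[plo..phi-1] == pivotval and
 * array[phi..high] > pivotval - only the two outer ranges need more sorting. */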

/**
 * 4-way partitioning
 *
 * Expects: arr[plo] <= kmid <= arr[phi]
 * Results: arr[low..plo - 1] <= arr[plo..pmid - 1] <= arr[pmid..phi - 1] <= arr[phi..high]
 *
 * Also: the khi pivot lands in its final sorted position on every run, so the
 * remaining work shrinks by at least 1 element compared to the start arr.
 * This means that we ensure recursions / loops always end in the quicksort...
 *
 * @param arr The array to partition
 * @param low Inclusive smallest index.
 * @param high Inclusive highest index.
 * @param plo IN-OUT: input low pivot index, output - see "Results:"
 * @param kmid IN: the mid splitting value (like a pivot value, but can be imaginary / nonexistent)
 * @param pmid OUT: output - see "Results:"
 * @param phi IN-OUT: input high pivot index, output - see "Results:"
 * @returns 1 if there is need to process the mid two blocks! Otherwise 0.
 */
static inline int schwab_partition(
        uint32_t *arr,
        int low,
        int high,
        int *plo,
        uint32_t kmid,
        int *pmid,
        int *phi) {

    /* Keys only - no element copy is made here */
    uint32_t klo = arr[*plo];
    uint32_t khi = arr[*phi];

    /* Without this, constant and small-range inputs are very slooOOoow */
    if(khi - klo < SCHWAB_DELTA_THRESHOLD) {
        /* Use the three-way partition, which defeats small-range inputs. */
        /* The outer sort func is also optimized for the two sides: */
        /* it checks their sizes to pick which to recurse on and which not! */
        schwab_partition3sp2(arr, low, high, kmid, plo, phi);

        /* No need to process the middle two blocks - all pivot there */
        return 0;
    }

    /* [*] Swapping arr[phi]<->arr[high] ensures the stop condition later */
    uint32_t tmphi = arr[*phi];
    arr[*phi] = arr[high];
    arr[high] = tmphi;

    /* Non-inclusive end indices of the 4 "blocks" - b0 is the smallest vals */
    int b0 = low, b1 = low, b2 = low;

    for(int b3 = low; b3 < high; ++b3) {
        /* This I moved to be first to avoid an unnecessary curr copy below */
        if(arr[b3] >= khi) {
            continue;
        }

        /* TODO: should be a copy of the whole element when not just uint32s! */
        uint32_t curr = arr[b3];

        /* Fully branchless - see below for the branching alternative */
        int where = (curr < klo) ? 0 :
                    ((curr < kmid) ? 1 : 2);
        int target = (curr < klo) ? b0 :
                     ((curr < kmid) ? b1 : b2);

        arr[b3] = arr[b2];
        arr[b2] = (where == 2) ? arr[b2] : arr[b1];
        arr[b1] = (where >= 1) ? arr[b1] : arr[b0];

        ++b2;
        b1 += (where < 2);
        b0 += (where < 1);
        arr[target] = curr;

        /* Same as this would have been:
        if(curr < klo) {
            arr[b3] = arr[b2];
            arr[b2] = arr[b1];
            arr[b1] = arr[b0];
            arr[b0] = curr;
            ++b0; ++b1; ++b2;
            continue;
        }

        if(curr < kmid) {
            arr[b3] = arr[b2];
            arr[b2] = arr[b1];
            arr[b1] = curr;
            ++b1;
        } else {
            arr[b3] = arr[b2];
            arr[b2] = curr;
        }
        ++b2;
        */
    }

    /* [*] Swap the chosen pivot to the beginning of the last block. */
    /* This way we can return a bigger index, and by that */
    /* we always remove at least one element per run. */
    tmphi = arr[b2];
    arr[b2] = arr[high];
    arr[high] = tmphi;
    ++b2;

    /* Handle output vars as per the doc comment */
    *plo = b0;
    *pmid = b1;
    *phi = b2;

    /* There are mid parts to process */
    return 1;
}
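
/* Sketch of one 4-way pass (hypothetical values): with klo = 3, kmid = 5
 * and khi = 8, elements < 3 collect in block 0, [3, 5) in block 1,
 * [5, 8) in block 2 and >= 8 in the top block; the khi pivot itself ends
 * up at the last slot of block 2, which is already its final sorted spot. */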

/** Swabic-sort is somewhat similar to quicksort, but 4-way and tricky */
static inline void schwab_sort(
        uint32_t *array,
        int low,
        int high,
        sch_rand_state *state) {

    /* The loop handles the longest sub-sort-task, which ensures the log tree depth */
    /* The loop also handles the start condition */
    while(low < high) {
        if(high - low > SCHWAB_INSERTION_THRESHOLD) {
            int r0 = schwab_pick_pivot(state, (high + 1) - low) + low;
            int r1 = schwab_pick_pivot(state, (high + 1) - low) + low;
            uint32_t klo = array[r0];
            uint32_t khi = array[r1];
            int plo = r0;
            int phi = r1;
            if(klo > khi) {
                uint32_t ktmp = klo;
                klo = khi;
                khi = ktmp;

                plo = r1;
                phi = r0;
            }

            uint32_t kmid = klo + (khi - klo) / 2;

            int pmid;
            int needmid = schwab_partition(array, low, high, &plo, kmid, &pmid, &phi);

            /* See where NOT to recurse to avoid the worst case stack depth */
            /* Rem.: These might be "not real" lengths, but we only use them for comparisons */
            /* Rem.: The "real" lengths might be off-by-one, but these are FASTER! */
            int lolen = plo - low;
            int hilen = high - phi;

            /* Rewrite the loop vars for the worst subtask and recurse for the others! */
            /* Let the branch predictor try to predict the input data path. */
            /* Rem.: Best would be to check for the biggest of all 4 blocks, */
            /*       but that would complicate the code above this point! */
            /* Rem.: The order of operations tries to be as cache-friendly as */
            /*       possible, but the loop var changes had to go to the end. */
            if(lolen < hilen) {
                schwab_sort(array, low, plo - 1, state);
                if(needmid) {
                    schwab_sort(array, plo, pmid - 1, state);
                    schwab_sort(array, pmid, phi - 1, state);
                }

                low = phi;
                /* high = high; */
            } else {
                schwab_sort(array, phi, high, state);
                if(needmid) {
                    schwab_sort(array, pmid, phi - 1, state);
                    schwab_sort(array, plo, pmid - 1, state);
                }

                /* low = low; */
                high = plo - 1;
            }
        } else {
            if(high - low > SCHWAB_SELECTION_THRESHOLD) {
                /* Just do an insertion sort instead */
                sch_insertion_sort(array, low, high);
                return;
            } else {
                sch_selection_sort(array, low, high);
                return;
            }
        }
    }
}
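
/* Usage sketch (an illustrative assumption, not part of the header proper):
 * schwab_sort takes INCLUSIVE low and high indices plus a rand state.
 * The SCHWAB_SORT_DEMO guard below is hypothetical - compile a .c file that
 * includes this header with -DSCHWAB_SORT_DEMO to build this tiny self-test. */
#ifdef SCHWAB_SORT_DEMO
#include <stdio.h>

int main(void) {
    uint32_t data[] = {42u, 7u, 7u, 1000000u, 3u, 99u, 0u, 512u};
    int n = (int)(sizeof(data) / sizeof(data[0]));

    sch_rand_state rs = schwab_rand_state(0); /* 0 is a fine seed */
    schwab_sort(data, 0, n - 1, &rs);         /* high is inclusive! */

    for(int i = 0; i < n; ++i) {
        printf("%u\n", data[i]);
    }
    return 0;
}
#endif /* SCHWAB_SORT_DEMO */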
#endif /* SCHWAB_SORT_H */