#ifndef SCHWAB_SORT_H
#define SCHWAB_SORT_H

#include <stdint.h> /* uint32_t, uint64_t */

/* A fast quicksort-like algorithm created in Csolnok, Hungary with:
 * - 4-way partitioning with 0..5 copies (not swaps) per element per run
 * - guaranteed O(log2(n)) worst-case recursion depth
 *
 * LICENCE: CC-BY, 2025 May 08-09
 * Author: Richárd István Thier (also author of the Magyarsort)
 */

/** 3-way optimization for small-range and constant inputs - 0 turns this off! */
#ifndef SCHWAB_DELTA_THRESHOLD
#define SCHWAB_DELTA_THRESHOLD 32
#endif /* SCHWAB_DELTA_THRESHOLD */

/** Below this many elements we do insertion sort */
#ifndef SCHWAB_INSERTION_THRESHOLD
#define SCHWAB_INSERTION_THRESHOLD 128
#endif /* SCHWAB_INSERTION_THRESHOLD */

/** Below this many elements we do selection sort */
#ifndef SCHWAB_SELECTION_THRESHOLD
#define SCHWAB_SELECTION_THRESHOLD 8
#endif /* SCHWAB_SELECTION_THRESHOLD */

typedef uint32_t sch_rand_state;

/** Create rand state for schwab_sort using a seed - can give 0 if uninterested */
static inline sch_rand_state schwab_rand_state(uint32_t seed) {
    return seed;
}

/** 32-bit LCG for fast random generation - from my fastrand.h */
static inline uint32_t schwab_lcg(sch_rand_state *state) {
    *state = *state * 1664525u + 1013904223u;
    return *state;
}
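
/* Rem.: 1664525 / 1013904223 are the classic "Numerical Recipes" LCG
 * constants. A likely reason the pivot picker below reduces the range via
 * multiply-shift (besides avoiding modulus) is that it weights the high
 * bits, which are the stronger bits of a power-of-two-modulus LCG. */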

/** Get pivot index in [0, len-1] without modulus - from my fastrand.h */
static inline uint32_t schwab_pick_pivot(sch_rand_state *state, uint32_t len) {
    uint32_t rand = schwab_lcg(state);
    /* Multiply by len, take the upper 32 bits of the 64-bit result */
    return (uint32_t)(((uint64_t)rand * len) >> 32);
}
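
/* Worked example of the multiply-shift reduction: rand = 0x80000000 (2^31,
 * halfway through the 32-bit range) and len = 10 give (2^31 * 10) >> 32 = 5,
 * halfway through [0, 10). The slight bias when len does not divide 2^32 is
 * harmless for pivot picking. */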

/** Swap operation */
static inline void schwab_swap(uint32_t *a, uint32_t *b) {
    uint32_t t = *a;
    *a = *b;
    *b = t;
}

/** Branchless conditional swap: leaves the smaller value in (a) and the
 * bigger in (b). Rem.: arguments are evaluated multiple times - do not
 * pass expressions with side effects! */
#define SCH_CSWAP(a, b) do { \
    uint32_t _x = (a), _y = (b); \
    uint32_t _gt = -(_x > _y); \
    uint32_t _tmp = (_x ^ _y) & _gt; \
    (a) = _x ^ _tmp; \
    (b) = _y ^ _tmp; \
} while (0)
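
/* How the mask works: (_x > _y) is 0 or 1, so _gt is either 0 or all-ones
 * 0xFFFFFFFF; _tmp is then either 0 or _x ^ _y, and v ^ (_x ^ _y) maps _x
 * to _y and _y to _x. E.g. _x = 7, _y = 3: _tmp = 4, so 7^4 = 3 and 3^4 = 7. */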

/**
 * Tiny unrolled register-heapsort of exactly 5 elements.
 *
 * This heap:
 *
 *        a[0]
 *       /    \
 *     a[1]  a[2]
 *     /  \
 *   a[3]  a[4]
 *
 * @param a The array
 */
static inline void sch_hsort5(uint32_t *a) {
    /* Build max heap - parent a[1] must end up >= a[3] and a[4] */
    /* Rem.: operand order matters - SCH_CSWAP(x, y) leaves the max in y! */
    SCH_CSWAP(a[3], a[1]);
    SCH_CSWAP(a[4], a[1]);

    /* Heapify(0) */
    /* Max child = (a[2] > a[1]) ? 2 : 1; */
    /* SCH_CSWAP(a[maxChild], a[0]) */
    uint32_t cmp = -(a[2] > a[1]);
    uint32_t i = ((1 ^ 2) & cmp) ^ 1;
    /* (3 & cmp) ^ 1 yields 2 when cmp is all-ones, else 1 */
    SCH_CSWAP(a[i], a[0]);

    /* Sort phase */

    /* 1st max to end */
    SCH_CSWAP(a[0], a[4]);
    /* heapify a[0..3] */
    cmp = -(a[2] > a[1]);
    i = ((1 ^ 2) & cmp) ^ 1;
    SCH_CSWAP(a[i], a[0]);

    /* 2nd max to end */
    SCH_CSWAP(a[0], a[3]);
    /* heapify a[0..2] */
    cmp = -(a[2] > a[1]);
    i = ((1 ^ 2) & cmp) ^ 1;
    SCH_CSWAP(a[i], a[0]);

    /* 3rd max to end */
    SCH_CSWAP(a[0], a[2]);

    /* Final two */
    SCH_CSWAP(a[0], a[1]);
}
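
/* Rem.: sch_insertion_sort below runs this on the first two 5-element
 * chunks of its range as a cheap pre-pass before the insertion scan. */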

/** Simple insertion sort for small cases */
static inline void sch_insertion_sort(uint32_t *arr, int low, int high) {
    /* Dual heapsort5 probably helps insertion speed. */
    /* This is sane, because insertion benefits from */
    /* data being "basically nearly sorted" as input. */
    if(high - low > 10) {
        sch_hsort5(&arr[low]);
        sch_hsort5(&arr[low + 5]);
    }

    /* The "real" insertion sort part comes here */
    for(int i = low + 1; i <= high; ++i) {
        uint32_t key = arr[i];

        /* Move elements of arr[low..i-1] that are greater than key */
        /* one position ahead of their current position */
        int j = i;
        #pragma GCC unroll 2
        while(j > low && arr[j - 1] > key) {
            arr[j] = arr[j - 1];
            --j;
        }
        arr[j] = key;
    }
}

/** Simple SELECTION sort for small cases - not necessarily better */
static inline void sch_selection_sort(uint32_t *arr, int low, int high) {
    #pragma GCC unroll 2
    for(int i = low; i < high; ++i) {
        /* Min-search the remaining array */
        int mini = i;
        #pragma GCC unroll 4
        for(int j = i + 1; j <= high; ++j) {
            if(arr[j] < arr[mini]) mini = j;
        }

        if(mini != i) {
            schwab_swap(&arr[i], &arr[mini]);
        }
    }
}

/**
 * 3-way partitioning: all the pivot-valued elements end up in the middle.
 *
 * Single-pass version 2.0, taken from qsort.h
 *
 * @param array The array to partition
 * @param low From when. (inclusive)
 * @param high Until when. (inclusive too!)
 * @param pivotval This value is used to partition the array.
 * @param plo OUT: below this index, more processing might be needed.
 * @param phi OUT: from this index on, more processing might be needed.
 */
static inline void schwab_partition3sp2(
        uint32_t *array,
        int low,
        int high,
        uint32_t pivotval,
        int *plo,
        int *phi) {

    /* Invariant for left: index up to which smaller (than pivot) elements lay */
    int il = (low - 1);
    /* Invariant for right: index from which (from the top) bigger elements lay */
    int ir = (high + 1);
    /* Indices from where we swap left and right into "is" (and sometimes swap among here too) */
    int jl = low;
    int jr = high;

    while(jl <= jr) {
        /* Handle left side and find a wrongly placed element */
        /* (bounds check first so we never read past the range) */
        while((jl <= jr) && (array[jl] <= pivotval)) {
            int isNonPivot = (array[jl] != pivotval);
            int nonSameIndex = (il + 1 != jl);
            if(isNonPivot & nonSameIndex)
                schwab_swap(&array[il + 1], &array[jl]);
            il += isNonPivot;
            ++jl;
        }

        /* Handle right side and find a wrongly placed element */
        while((jl <= jr) && (array[jr] >= pivotval)) {
            int isNonPivot = (array[jr] != pivotval);
            int nonSameIndex = (ir - 1 != jr);
            if(isNonPivot & nonSameIndex)
                schwab_swap(&array[ir - 1], &array[jr]);
            ir -= isNonPivot;
            --jr;
        }

        /* Swap the two found elements that are wrongly placed */
        if(jl < jr) schwab_swap(&array[jl], &array[jr]);
    }

    /* Output the partition points */
    *plo = il + 1; /* XXX: changed from qsort.h to +1 here! */
    *phi = ir;
}
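
/* Sketch of one run (hypothetical values): pivotval = 5 on {7, 5, 2, 5, 9, 1}
 * yields {1, 2, 5, 5, 9, 7} with *plo = 2 and *phi = 4, i.e.
 * array[low..plo-1] < pivotval, array[plo..phi-1] == pivotval and
 * array[phi..high] > pivotval - only the two outer ranges need more sorting. */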

/**
 * 4-way partitioning
 *
 * Expects: arr[plo] <= kmid <= arr[phi]
 * Results: arr[low..plo - 1] <= arr[plo..pmid - 1] <= arr[pmid..phi - 1] <= arr[phi..high]
 *
 * Also: the khi pivot lands in its final sorted position on every run, so the
 * remaining work shrinks by at least 1 element compared to the start arr.
 * This means that we ensure recursions / loops always end in the quicksort...
 *
 * @param arr The array to partition
 * @param low Inclusive smallest index.
 * @param high Inclusive highest index.
 * @param plo IN-OUT: input low pivot index, output - see "Results:"
 * @param kmid IN: the mid splitting value (like a pivot value, but can be imaginary / nonexistent)
 * @param pmid OUT: output - see "Results:"
 * @param phi IN-OUT: input high pivot index, output - see "Results:"
 * @returns 1 if there is need to process the mid two blocks! Otherwise 0.
 */
static inline int schwab_partition(
        uint32_t *arr,
        int low,
        int high,
        int *plo,
        uint32_t kmid,
        int *pmid,
        int *phi) {

    /* Keys only - no element copy is made here */
    uint32_t klo = arr[*plo];
    uint32_t khi = arr[*phi];

    /* Without this, constant and small-range inputs are very slooOOoow */
    if(khi - klo < SCHWAB_DELTA_THRESHOLD) {
        /* Use the three-way partition, which defeats small-range inputs. */
        /* The outer sort func is also optimized for the two sides: */
        /* it checks their sizes to pick which to recurse on and which not! */
        schwab_partition3sp2(arr, low, high, kmid, plo, phi);

        /* No need to process the middle two blocks - all pivot there */
        return 0;
    }

    /* [*] Swapping arr[phi]<->arr[high] ensures the stop condition later */
    uint32_t tmphi = arr[*phi];
    arr[*phi] = arr[high];
    arr[high] = tmphi;

    /* Non-inclusive end indices of the 4 "blocks" - b0 is the smallest vals */
    int b0 = low, b1 = low, b2 = low;

    for(int b3 = low; b3 < high; ++b3) {
        /* This I moved to be first to avoid an unnecessary curr copy below */
        if(arr[b3] >= khi) {
            continue;
        }

        /* TODO: should be a copy of the whole element when not just uint32s! */
        uint32_t curr = arr[b3];

        /* Fully branchless - see below for the branching alternative */
        int where = (curr < klo) ? 0 :
                    ((curr < kmid) ? 1 : 2);
        int target = (curr < klo) ? b0 :
                     ((curr < kmid) ? b1 : b2);

        arr[b3] = arr[b2];
        arr[b2] = (where == 2) ? arr[b2] : arr[b1];
        arr[b1] = (where >= 1) ? arr[b1] : arr[b0];

        ++b2;
        b1 += (where < 2);
        b0 += (where < 1);
        arr[target] = curr;

        /* Same as this would have been:
        if(curr < klo) {
            arr[b3] = arr[b2];
            arr[b2] = arr[b1];
            arr[b1] = arr[b0];
            arr[b0] = curr;
            ++b0; ++b1; ++b2;
            continue;
        }

        if(curr < kmid) {
            arr[b3] = arr[b2];
            arr[b2] = arr[b1];
            arr[b1] = curr;
            ++b1;
        } else {
            arr[b3] = arr[b2];
            arr[b2] = curr;
        }
        ++b2;
        */
    }

    /* [*] Swap the chosen pivot to the beginning of the last block. */
    /* This way we can return a bigger index, and by that */
    /* we always remove at least one element per run. */
    tmphi = arr[b2];
    arr[b2] = arr[high];
    arr[high] = tmphi;
    ++b2;

    /* Handle output vars as per the doc comment */
    *plo = b0;
    *pmid = b1;
    *phi = b2;

    /* There are mid parts to process */
    return 1;
}
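
/* Sketch of one 4-way pass (hypothetical values): with klo = 3, kmid = 5
 * and khi = 8, elements < 3 collect in block 0, [3, 5) in block 1,
 * [5, 8) in block 2 and >= 8 in the top block; the khi pivot itself ends
 * up at the last slot of block 2, which is already its final sorted spot. */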

/** Swabic-sort is somewhat similar to quicksort, but 4-way and tricky */
static inline void schwab_sort(
        uint32_t *array,
        int low,
        int high,
        sch_rand_state *state) {

    /* The loop handles the longest sub-sort-task, which ensures the log tree depth */
    /* The loop also handles the start condition */
    while(low < high) {
        if(high - low > SCHWAB_INSERTION_THRESHOLD) {
            int r0 = schwab_pick_pivot(state, (high + 1) - low) + low;
            int r1 = schwab_pick_pivot(state, (high + 1) - low) + low;
            uint32_t klo = array[r0];
            uint32_t khi = array[r1];
            int plo = r0;
            int phi = r1;
            if(klo > khi) {
                uint32_t ktmp = klo;
                klo = khi;
                khi = ktmp;

                plo = r1;
                phi = r0;
            }

            uint32_t kmid = klo + (khi - klo) / 2;

            int pmid;
            int needmid = schwab_partition(array, low, high, &plo, kmid, &pmid, &phi);

            /* See where NOT to recurse to avoid the worst case stack depth */
            /* Rem.: These might be "not real" lengths, but we only use them for comparisons */
            /* Rem.: The "real" lengths might be off-by-one, but these are FASTER! */
            int lolen = plo - low;
            int hilen = high - phi;

            /* Rewrite the loop vars for the worst subtask and recurse for the others! */
            /* Let the branch predictor try to predict the input data path. */
            /* Rem.: Best would be to check for the biggest of all 4 blocks, */
            /*       but that would complicate the code above this point! */
            /* Rem.: The order of operations tries to be as cache-friendly as */
            /*       possible, but the loop var changes had to go to the end. */
            if(lolen < hilen) {
                schwab_sort(array, low, plo - 1, state);
                if(needmid) {
                    schwab_sort(array, plo, pmid - 1, state);
                    schwab_sort(array, pmid, phi - 1, state);
                }

                low = phi;
                /* high = high; */
            } else {
                schwab_sort(array, phi, high, state);
                if(needmid) {
                    schwab_sort(array, pmid, phi - 1, state);
                    schwab_sort(array, plo, pmid - 1, state);
                }

                /* low = low; */
                high = plo - 1;
            }
        } else {
            if(high - low > SCHWAB_SELECTION_THRESHOLD) {
                /* Just do an insertion sort instead */
                sch_insertion_sort(array, low, high);
                return;
            } else {
                sch_selection_sort(array, low, high);
                return;
            }
        }
    }
}
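
/* Usage sketch (an illustrative assumption, not part of the header proper):
 * schwab_sort takes INCLUSIVE low and high indices plus a rand state.
 * The SCHWAB_SORT_DEMO guard below is hypothetical - compile a .c file that
 * includes this header with -DSCHWAB_SORT_DEMO to build this tiny self-test. */
#ifdef SCHWAB_SORT_DEMO
#include <stdio.h>

int main(void) {
    uint32_t data[] = {42u, 7u, 7u, 1000000u, 3u, 99u, 0u, 512u};
    int n = (int)(sizeof(data) / sizeof(data[0]));

    sch_rand_state rs = schwab_rand_state(0); /* 0 is a fine seed */
    schwab_sort(data, 0, n - 1, &rs);         /* high is inclusive! */

    for(int i = 0; i < n; ++i) {
        printf("%u\n", data[i]);
    }
    return 0;
}
#endif /* SCHWAB_SORT_DEMO */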
#endif /* SCHWAB_SORT_H */