#ifndef SCHWAB_SORT_H
#define SCHWAB_SORT_H
/* A fast quicksort-like new alg created in Csolnok, Hungary with:
 * - 4-way partitioning with 0..5 copies (not swaps) per elem per run
 * - ensured O(log2(n)) worst recursion depth
 *
 * LICENCE: CC-BY, 2025 May 08-09
 * Author: Richárd István Thier (also author of the Magyarsort)
 */

/* Was missing: uint32_t / uint64_t are used throughout this header */
#include <stdint.h>

/** 3-way optimization for smallrange and const - 0 turns this off! */
#ifndef SCHWAB_DELTA_THRESHOLD
#define SCHWAB_DELTA_THRESHOLD 32
#endif /* SCHWAB_DELTA_THRESHOLD */

/** Below this many elements we do insertion sort */
#ifndef SCHWAB_INSERTION_THRESHOLD
#define SCHWAB_INSERTION_THRESHOLD 128
#endif /* SCHWAB_INSERTION_THRESHOLD */

/** Below this many elements we do selection sort */
#ifndef SCHWAB_SELECTION_THRESHOLD
#define SCHWAB_SELECTION_THRESHOLD 16
#endif /* SCHWAB_SELECTION_THRESHOLD */

/** State of the pivot-picking pseudo random generator */
typedef uint32_t sch_rand_state;

/** Create rand state for schwab_sort using a seed - can give 0 if uninterested */
static inline sch_rand_state schwab_rand_state(uint32_t seed) {
    return seed;
}

/** 32-bit LCG for fast random generations - from my fastrand.h */
static inline uint32_t schwab_lcg(sch_rand_state *state) {
    *state = *state * 1664525u + 1013904223u;
    return *state;
}

/** Get pivot index in [0, len-1] without modulus - from my fastrand.h */
static inline uint32_t schwab_pick_pivot(sch_rand_state *state, uint32_t len) {
    uint32_t rand = schwab_lcg(state);
    /* Multiply by len, take the upper 32 bits of the 64-bit result */
    return (uint32_t)(((uint64_t)rand * len) >> 32);
}

/** Swap operation */
static inline void schwab_swap(uint32_t *a, uint32_t *b) {
    uint32_t t = *a;
    *a = *b;
    *b = t;
}

/**
 * Simple insertion sort for small cases.
 *
 * Sorts arr[low..high] (both inclusive) in-place, ascending.
 * Rem.: was plain "inline" - that has no external definition in C99/C11
 * and can fail to link when not inlined, so "static inline" like the rest.
 */
static inline void sch_insertion_sort(uint32_t *arr, int low, int high) {
    for(int i = low + 1; i <= high; ++i) {
        uint32_t key = arr[i];
        /* Move elements of arr[low..i-1] that are greater than key */
        /* to one position ahead of their current position */
        /* Rem.: bound is "j > low" (was "j > 0") so a subrange sort */
        /* can never touch elements below the given low index */
        int j = i;
#pragma GCC unroll 2
        while(j > low && arr[j - 1] > key) {
            arr[j] = arr[j - 1];
            --j;
        }
        arr[j] = key;
    }
}

/** Simple insertion sort for small cases v2 - not necessarily better */
static inline void sch_insertion_sort2(uint32_t *arr, int low, int high) {
    for(int i = low + 1; i <= high; ++i) {
        uint32_t key = arr[i];
        /* Separate load and compare to expose ILP */
        /* Rem.: bound is "j > low" (was "j > 0") - see sch_insertion_sort */
        int j = i;
#pragma GCC unroll 2
        while(j > low) {
            uint32_t prev = arr[j - 1];
            if(prev <= key) break;
            arr[j] = prev;
            --j;
        }
        arr[j] = key;
    }
}

/** Simple SELECTION sort for small cases - not necessarily better */
static inline void sch_selection_sort(uint32_t *arr, int low, int high) {
#pragma GCC unroll 2
    for(int i = low; i < high; ++i) {
        /* Min-search remaining array */
        int mini = i;
#pragma GCC unroll 4
        for(int j = i + 1; j < high + 1; ++j) {
            if(arr[j] < arr[mini]) mini = j;
        }
        if(mini != i) {
            schwab_swap(&arr[i], &arr[mini]);
        }
    }
}

/**
 * 3-way partitioning, in middle all the pivot elements.
 *
 * Single-pass version 2.0, taken from qsort.h
 *
 * @param array The array to partition
 * @param low From when. (inclusive)
 * @param high Until when. (inclusive too!)
 * @param pivotval This value is used to partition the array.
 * @param plo OUT: until this, more processing might needed.
 * @param phi OUT: from this, more processing might needed.
 */
static inline void schwab_partition3sp2(
        uint32_t *array,
        int low,
        int high,
        uint32_t pivotval,
        int *plo,
        int *phi) {
    /* Invariant for left: index until smaller (than pivot) elements lay */
    int il = (low - 1);
    /* Invariant for right: index until (from top) bigger elements lay */
    int ir = (high + 1);
    /* Indices from where we swap left and right into "is" */
    /* (and sometimes swap among here too) */
    int jl = low;
    int jr = high;
    while(jl <= jr) {
        /* Handle left and find wrongly placed element */
        /* Rem.: the (jl <= jr) bound check must come FIRST: testing */
        /* array[jl] first read array[high + 1] out of bounds, e.g. */
        /* on all-equal input where every element is <= pivotval */
        while((jl <= jr) && (array[jl] <= pivotval)) {
            int isNonPivot = (array[jl] != pivotval);
            int nonSameIndex = (il + 1 != jl);
            if(isNonPivot & nonSameIndex)
                schwab_swap(&array[il + 1], &array[jl]);
            il += isNonPivot;
            ++jl;
        }
        /* Handle right and find wrongly placed element */
        /* Rem.: bound check first here too: with pivotval == 0 every */
        /* unsigned element is >= pivotval and the old order read */
        /* array[low - 1] (array[-1] for low == 0) */
        while((jl <= jr) && (array[jr] >= pivotval)) {
            int isNonPivot = (array[jr] != pivotval);
            int nonSameIndex = (ir - 1 != jr);
            if(isNonPivot & nonSameIndex)
                schwab_swap(&array[ir - 1], &array[jr]);
            ir -= isNonPivot;
            --jr;
        }
        /* Swap the two found elements that are wrongly placed */
        if(jl < jr)
            schwab_swap(&array[jl], &array[jr]);
    }
    /* Output the partition points */
    *plo = il + 1; /* XXX: changed from qsort.h to +1 here! */
    *phi = ir;
}

/**
 * 4-way partitioning
 *
 * Expects: arr[plo] <= kmid <= arr[phi]
 * Results: arr[low..plo - 1] <= arr[plo..pmid - 1] <= arr[pmid..phi - 1] <= arr[phi..high]
 *
 * Also: Adding together lengths of all results arrays shrinks by 1 compared to start arr.
 * This means that we ensure recursions / loops always end in quicksort...
 *
 * @param arr The array to partition
 * @param low Inclusive smallest index.
 * @param high Inclusive highest index.
 * @param plo IN-OUT: input low pivot, output - see "Results:"
 * @param kmid IN: The mid splitting value (like a pivot value, but can be imaginary nonexistent)
 * @param pmid OUT: output - see "Results:" (only written when returning 1!)
 * @param phi IN-OUT: input high pivot, output - see "Results:"
 * @returns 1 if there is need to process the mid two blocks! Otherwise 0.
 */
static inline int schwab_partition(
        uint32_t *arr,
        int low,
        int high,
        int *plo,
        uint32_t kmid,
        int *pmid,
        int *phi) {
    /* Keys only - no element copy is made here */
    uint32_t klo = arr[*plo];
    uint32_t khi = arr[*phi];

    /* Without this, constant and smallrange is very slooOOoow */
    if(khi - klo < SCHWAB_DELTA_THRESHOLD) {
        /* Use three-way which defeats smallrange */
        /* Outer sort func also optimized for two sides */
        /* check for size for which recurse which not! */
        schwab_partition3sp2(arr, low, high, kmid, plo, phi);
        /* No need to process the middle two blocks - all pivot there */
        return 0;
    }

    /* [*] Swapping arr[phi]<->arr[high] ensures stop condition later */
    uint32_t tmphi = arr[*phi];
    arr[*phi] = arr[high];
    arr[high] = tmphi;

    /* Aren't inclusive end indices of 4 "blocks" - b0 is smallest vals */
    int b0 = low, b1 = low, b2 = low;
    for(int b3 = low; b3 < high; ++b3) {
        /* This I moved to be first to avoid unnecessary curr copy below */
        if(arr[b3] >= khi) {
            continue;
        }
        /* TODO: should be copy of whole element when not just uint32s! */
        uint32_t curr = arr[b3];
        /* Full branchless 4-way placement: where is the block index */
        /* (0: < klo; 1: < kmid; 2: < khi), target is its write spot. */
        /* Equivalent branchy form:                                   */
        /*   if(curr < klo)  { arr[b3]=arr[b2]; arr[b2]=arr[b1];      */
        /*                     arr[b1]=arr[b0]; arr[b0]=curr;         */
        /*                     ++b0; ++b1; ++b2; continue; }          */
        /*   if(curr < kmid) { arr[b3]=arr[b2]; arr[b2]=arr[b1];      */
        /*                     arr[b1]=curr; ++b1; }                  */
        /*   else            { arr[b3]=arr[b2]; arr[b2]=curr; }       */
        /*   ++b2;                                                    */
        int where = (curr < klo) ? 0 : ((curr < kmid) ? 1 : 2);
        int target = (curr < klo) ? b0 : ((curr < kmid) ? b1 : b2);
        arr[b3] = arr[b2];
        arr[b2] = (where == 2) ? arr[b2] : arr[b1];
        arr[b1] = (where >= 1) ? arr[b1] : arr[b0];
        ++b2;
        b1 += (where < 2);
        b0 += (where < 1);
        arr[target] = curr;
    }

    /* [*] Swap the chosen pivot to begin of last block */
    /* This way we can return bigger index and by that */
    /* this always removes an element per run at least */
    tmphi = arr[b2];
    arr[b2] = arr[high];
    arr[high] = tmphi;
    ++b2;

    /* Handle output vars as per doc comment */
    *plo = b0;
    *pmid = b1;
    *phi = b2;

    /* There are mid parts to process */
    return 1;
}

/**
 * Schwabic-sort: somewhat similar to quicksort but 4-way and tricky.
 *
 * Sorts array[low..high] (both inclusive) ascending, in-place.
 *
 * @param array The array to sort.
 * @param low Inclusive smallest index.
 * @param high Inclusive highest index.
 * @param state Pivot-picking RNG state - see schwab_rand_state().
 */
static inline void schwab_sort(
        uint32_t *array,
        int low,
        int high,
        sch_rand_state *state) {
    /* Loop handles longest sub-sort-task which ensured log tree depth */
    /* Loop also handles start condition */
    while(low < high) {
        if(high - low > SCHWAB_INSERTION_THRESHOLD) {
            /* Two random pivot picks; ordered so klo <= khi below */
            int r0 = schwab_pick_pivot(state, (high + 1) - low) + low;
            int r1 = schwab_pick_pivot(state, (high + 1) - low) + low;
            uint32_t klo = array[r0];
            uint32_t khi = array[r1];
            int plo = r0;
            int phi = r1;
            if(klo > khi) {
                uint32_t ktmp = klo;
                klo = khi;
                khi = ktmp;
                plo = r1;
                phi = r0;
            }
            /* Mid split value - might not exist in the array at all */
            uint32_t kmid = klo + (khi - klo) / 2;
            int pmid = 0; /* only meaningful when needmid != 0 */
            int needmid = schwab_partition(array, low, high,
                                           &plo, kmid, &pmid, &phi);

            /* See where NOT to recurse to avoid worst case stack depth */
            /* Rem.: These might be "not real" length but we only use them to comparisons */
            /* REM.: The "real" lengths might be off-by-one but these are FASTER! */
            int lolen = plo - low;
            int hilen = high - phi;

            /* Rewrite loop for worst subtask goal and recurse others! */
            /* Let the branch predictor try to predict input data path */
            /* Rem.: Best would be to check for biggest in all 4 block */
            /* But that would complicate codes above this point! */
            /* Rem.: Order of operations try to be a cache-friendly as */
            /* possible, but had to put loops changes to the end */
            if(lolen < hilen) {
                schwab_sort(array, low, plo - 1, state);
                if(needmid) {
                    schwab_sort(array, plo, pmid - 1, state);
                    schwab_sort(array, pmid, phi - 1, state);
                }
                low = phi;
                /* high = high; */
            } else {
                schwab_sort(array, phi, high, state);
                if(needmid) {
                    schwab_sort(array, pmid, phi - 1, state);
                    schwab_sort(array, plo, pmid - 1, state);
                }
                /* low = low; */
                high = plo - 1;
            }
        } else {
            if(high - low > SCHWAB_SELECTION_THRESHOLD) {
                /* Just do an insertion sort instead */
                sch_insertion_sort(array, low, high);
                return;
            } else {
                sch_selection_sort(array, low, high);
                return;
            }
        }
    }
}

#endif /* SCHWAB_SORT_H */