/* myqsort/schwab_sort.h */
#ifndef SCHWAB_SORT_H
#define SCHWAB_SORT_H
#include <stdint.h> /* uint32_t, uint64_t used throughout */
/* A fast quicksort-like new alg created in Csolnok, Hungary with:
* - 4-way partitioning with 0..5 copies (not swaps) per elem per run
* - ensured O(log2(n)) worst recursion depth
*
* LICENCE: CC-BY, 2025 May 08-09
* Author: Richárd István Thier (also author of the Magyarsort)
*/
/** 3-way optimization for smallrange and const - 0 turns this off! */
#ifndef SCHWAB_DELTA_THRESHOLD
#define SCHWAB_DELTA_THRESHOLD 32
#endif /* SCHWAB_DELTA_THRESHOLD */
typedef uint32_t sch_rand_state;
/** Build a schwab_sort random state from a seed - any value works, 0 included. */
static inline sch_rand_state schwab_rand_state(uint32_t seed) {
    sch_rand_state state = seed;
    return state;
}
/** Step the 32-bit LCG and return the fresh value - constants as in the author's fastrand.h */
static inline uint32_t schwab_lcg(sch_rand_state *state) {
    uint32_t next = (*state * 1664525u) + 1013904223u;
    *state = next;
    return next;
}
/** Draw a pivot index in [0, len-1] via a 64-bit multiply-shift instead of a modulus - from fastrand.h */
static inline uint32_t schwab_pick_pivot(sch_rand_state *state, uint32_t len) {
    /* Scale the random draw by len and keep the top 32 bits of the product */
    uint64_t scaled = (uint64_t)schwab_lcg(state) * (uint64_t)len;
    return (uint32_t)(scaled >> 32);
}
/** Exchange the two pointed-to values (safe for a == b). */
static inline void schwab_swap(uint32_t *a, uint32_t *b) {
    uint32_t held = *b;
    *b = *a;
    *a = held;
}
/**
 * 3-way partitioning, in middle all the pivot elements:
 * [ < pivotval | == pivotval | > pivotval ] within array[low..high].
 *
 * Single-pass version 2.0, taken from qsort.h
 *
 * Rem.: BUGFIX vs the original - in both scan loops the jl <= jr bounds
 * check is now evaluated BEFORE the array access. The old order read
 * array[high + 1] (resp. array[low - 1]) when a scan ran one past the
 * other index (e.g. when every element equals pivotval): an out-of-bounds
 * read / UB, even though the read value never changed the result.
 *
 * @param array The array to partition
 * @param low From when. (inclusive)
 * @param high Until when. (inclusive too!)
 * @param pivotval This value is used to partition the array.
 * @param plo OUT: first index of the == pivotval block; below this, more processing might be needed.
 * @param phi OUT: first index of the > pivotval block; from this, more processing might be needed.
 */
static inline void schwab_partition3sp2(
    uint32_t *array,
    int low,
    int high,
    uint32_t pivotval,
    int *plo,
    int *phi) {
    /* Invariant for left: index until smaller (than pivot) elements lay */
    int il = (low - 1);
    /* Invariant for right: index until (from top) bigger elements lay */
    int ir = (high + 1);
    /* Indices from where we swap left and right into "is" (and sometimes swap among here too) */
    int jl = low;
    int jr = high;
    while (jl <= jr) {
        /* Handle left and find wrongly placed element.
         * Bounds check first - see BUGFIX note in the doc comment. */
        while ((jl <= jr) && (array[jl] <= pivotval)) {
            int isNonPivot = (array[jl] != pivotval);
            int nonSameIndex = (il + 1 != jl);
            /* Move a strictly-smaller element down to grow the left block
             * (swap inlined; branch-free index bump via isNonPivot) */
            if (isNonPivot & nonSameIndex) {
                uint32_t t = array[il + 1];
                array[il + 1] = array[jl];
                array[jl] = t;
            }
            il += isNonPivot;
            ++jl;
        }
        /* Handle right and find wrongly placed element */
        while ((jl <= jr) && (array[jr] >= pivotval)) {
            int isNonPivot = (array[jr] != pivotval);
            int nonSameIndex = (ir - 1 != jr);
            /* Mirror image: grow the right (strictly-bigger) block downwards */
            if (isNonPivot & nonSameIndex) {
                uint32_t t = array[ir - 1];
                array[ir - 1] = array[jr];
                array[jr] = t;
            }
            ir -= isNonPivot;
            --jr;
        }
        /* Swap the two found elements that are wrongly placed */
        if (jl < jr) {
            uint32_t t = array[jl];
            array[jl] = array[jr];
            array[jr] = t;
        }
    }
    /* Output the partition points */
    *plo = il + 1; /* XXX: changed from qsort.h to +1 here! */
    *phi = ir;
}
/**
 * 4-way partitioning
 *
 * Expects: arr[*plo] <= kmid <= arr[*phi] - the only caller in this header
 * (schwab_sort) orders the two pivot keys before calling, so klo <= khi.
 *
 * Results: arr[low..plo - 1] <= arr[plo..pmid - 1] <= arr[pmid..phi - 1] <= arr[phi.. high]
 *
 * Also: Adding together lengths of all results arrays shrinks by 1 compared to start arr.
 * This means that we ensure recursions / loops always end in quicksort...
 * (The high pivot element itself lands on its final position - see the [*]
 * swap at the end - so at least one element is consumed per run.)
 *
 * @param arr The array to partition
 * @param low Inclusive smallest index.
 * @param high Inclusive highest index.
 * @param plo IN-OUT: input low pivot index, output - see "Results:"
 * @param kmid IN: The mid splitting value (like a pivot value, but can be imaginary nonexistent)
 * @param pmid OUT: output - see "Results:"
 * @param phi IN-OUT: input high pivot index, output - see "Results:"
 * @returns 1 if there is need to process the mid two blocks! Otherwise 0.
 */
static inline int schwab_partition(
    uint32_t *arr,
    int low,
    int high,
    int *plo,
    uint32_t kmid,
    int *pmid,
    int *phi) {
    /* Keys only - no element copy is made here */
    uint32_t klo = arr[*plo];
    uint32_t khi = arr[*phi];
    /* Narrow key range: 4-way splitting degrades on constant / small-range
     * data, so fall back to 3-way around kmid. Without this, constant and
     * smallrange inputs are very slooOOoow.
     * NB: unsigned difference - assumes khi >= klo (see "Expects" above). */
    if (khi - klo < SCHWAB_DELTA_THRESHOLD) {
        /* Use three-way which defeats smallrange */
        /* Outer sort func also optimized for two sides */
        /* check for size for which recurse which not! */
        schwab_partition3sp2(arr, low, high, kmid, plo, phi);
        /* No need to process the middle two blocks - all pivot-equal there */
        return 0;
    }
    /* [*] Swapping arr[phi]<->arr[high] ensures stop condition later:
     * the parked element is >= khi, so the main loop leaves it in place. */
    uint32_t tmphi = arr[*phi];
    arr[*phi] = arr[high];
    arr[high] = tmphi;
    /* Exclusive ("aren't inclusive") end indices of the first 3 of the 4
     * "blocks" - b0 bounds the smallest vals, b3 is the scan cursor. */
    int b0 = low, b1 = low, b2 = low;
    for (int b3 = low; b3 < high; ++b3) {
        /* This I moved to be first to avoid unnecessary curr copy below:
         * elements >= khi already belong to the last block and stay put. */
        if (arr[b3] >= khi) {
            continue;
        }
        /* TODO: should be copy of whole element when not just uint32s! */
        uint32_t curr = arr[b3];
        /* TODO: We can do "ILP-memcpy"s here:
         *
         * Key from b2->b3, value from b2->b3, key from b1->b2, value from b1... etc
         * This is likely faster than calling a memcpy if we code this for not just uint32s!
         */
        if (curr < klo) {
            /* Rotate one element of each block boundary up, drop curr into
             * block 0: up to 4 copies (not swaps) per element, as advertised. */
            arr[b3] = arr[b2];
            arr[b2] = arr[b1];
            arr[b1] = arr[b0];
            arr[b0] = curr;
            ++b0; ++b1; ++b2;
            continue;
        }
        if (curr < kmid) {
            /* Belongs to block 1: 3 copies */
            arr[b3] = arr[b2];
            arr[b2] = arr[b1];
            arr[b1] = curr;
            ++b1; ++b2;
        } else {
            /* Belongs to block 2: 2 copies */
            arr[b3] = arr[b2];
            arr[b2] = curr;
            ++b2;
        }
    }
    /* [*] Swap the chosen pivot to begin of last block */
    /* This way we can return bigger index and by that */
    /* this always removes an element per run at least */
    tmphi = arr[b2];
    arr[b2] = arr[high];
    arr[high] = tmphi;
    ++b2;
    /* Handle output vars as per doc comment */
    *plo = b0;
    *pmid = b1;
    *phi = b2;
    /* There are mid parts to process */
    return 1;
}
/** Swabic-sort: somewhat similar to quicksort but 4-way partitioned and tricky.
 *
 * Sorts array[low..high] (both indices inclusive) into ascending order.
 *
 * @param array The array to sort.
 * @param low Inclusive smallest index of the range to sort.
 * @param high Inclusive highest index of the range to sort.
 * @param state Random state from schwab_rand_state(), used for pivot picks.
 */
static inline void schwab_sort(
    uint32_t *array,
    int low,
    int high,
    sch_rand_state *state) {
    /* Loop handles longest sub-sort-task which ensures log tree depth:
     * the smaller outer side is recursed, the bigger one is iterated. */
    while (low < high) {
        /* Two random pivot candidate indices (they may coincide). */
        int r0 = schwab_pick_pivot(state, (high + 1) - low) + low;
        int r1 = schwab_pick_pivot(state, (high + 1) - low) + low;
        uint32_t klo = array[r0];
        uint32_t khi = array[r1];
        int plo = r0;
        int phi = r1;
        /* Order the candidates so klo <= khi, as schwab_partition expects. */
        if (klo > khi) {
            uint32_t ktmp = klo;
            klo = khi;
            khi = ktmp;
            plo = r1;
            phi = r0;
        }
        /* Mid split key halfway between the pivot keys - overflow-safe form;
         * this value need not actually occur in the array. */
        uint32_t kmid = klo + (khi - klo) / 2;
        int pmid;
        /* needmid == 0 means the 3-way fallback ran: mid blocks are done. */
        int needmid = schwab_partition(array, low, high, &plo, kmid, &pmid, &phi);
        /* See where NOT to recurse to avoid worst case stack depth */
        /* Rem.: These might be "not real" length but we only use them to comparisons */
        /* REM.: The "real" lengths might be off-by-one but these are FASTER! */
        int lolen = plo - low;
        int hilen = high - phi;
        /* Rewrite loop for worst subtask goal and recurse others! */
        /* Let the branch predictor try to predict input data path */
        /* Rem.: Best would be to check for biggest in all 4 block */
        /* But that would complicate codes above this point! */
        /* Rem.: Order of operations try to be a cache-friendly as */
        /* possible, but had to put loops changes to the end */
        if (lolen < hilen) {
            /* Low side is smaller: recurse it (and the mid blocks), iterate high side */
            schwab_sort(array, low, plo - 1, state);
            if (needmid) {
                schwab_sort(array, plo, pmid - 1, state);
                schwab_sort(array, pmid, phi - 1, state);
            }
            low = phi;
            /* high = high; */
        } else {
            /* High side is smaller (or equal): recurse it, iterate low side */
            schwab_sort(array, phi, high, state);
            if (needmid) {
                schwab_sort(array, pmid, phi - 1, state);
                schwab_sort(array, plo, pmid - 1, state);
            }
            /* low = low; */
            high = plo - 1;
        }
    }
}
#endif /* SCHWAB_SORT_H */