#ifndef SWAB_SORT_H
#define SWAB_SORT_H

/* A fast quicksort-like new alg created in Csolnok, Hungary with:
 * - 4-way partitioning with 0..5 copies (not swaps) per elem per run
 * - ensured O(log2(n)) worst recursion depth
 *
 * LICENCE: CC-BY, 2025 May 08
 * Author: Richárd István Thier (also author of the Magyarsort)
 */

/* Needed for uint32_t / uint64_t so the header compiles on its own. */
#include <stdint.h>

/** State of the pivot-picking random generator (a single 32-bit word). */
typedef uint32_t sch_rand_state;

/**
 * Create rand state for schwab_sort using a seed.
 *
 * @param seed Any value; 0 is fine if the caller does not care.
 * @return The initial generator state (the seed itself).
 */
static inline sch_rand_state schwab_rand_state(uint32_t seed) {
	return seed;
}

/**
 * 32-bit LCG step: advances the state and returns the new state value.
 *
 * @param state IN-OUT: generator state, updated in place.
 * @return The updated state, used directly as the random number.
 */
static inline uint32_t schwab_lcg(sch_rand_state *state) {
	*state = *state * 1664525u + 1013904223u;
	return *state;
}

/**
 * Get a pivot index in [0, len - 1] without using a modulus.
 *
 * @param state IN-OUT: generator state, advanced by one LCG step.
 * @param len   Number of candidate indices.
 * @return Upper 32 bits of the 64-bit product rand * len, which is < len.
 */
static inline uint32_t schwab_pick_pivot(sch_rand_state *state, uint32_t len) {
	uint32_t rand = schwab_lcg(state);
	/* Multiply by len, take the upper 32 bits of the 64-bit result */
	return (uint32_t)(((uint64_t)rand * len) >> 32);
}

/**
 * 4-way partitioning of arr[low..high] around two pivot keys and a mid value.
 *
 * Expects: low <= *plo, *phi <= high and arr[*plo] <= kmid <= arr[*phi].
 *
 * With klo = arr[*plo] and khi = arr[*phi] as read on entry, the result is:
 *   arr[low   .. plo - 1]  : keys <  klo
 *   arr[plo   .. pmid - 1] : klo  <= keys < kmid
 *   arr[pmid  .. phi - 2]  : kmid <= keys < khi
 *   arr[phi - 1]           : the high pivot itself (key == khi)
 *   arr[phi   .. high]     : keys >= khi
 *
 * Because everything left of index phi - 1 is < khi and everything right of
 * it is >= khi, the high pivot sits at its final sorted position and can be
 * left out of any further sorting.
 *
 * NOTE(review): keys equal to khi stay in the last block, so on an input of
 * all-equal keys only the pivot is split off per call.
 *
 * @param arr  The array to partition.
 * @param low  Inclusive smallest index.
 * @param high Inclusive highest index.
 * @param plo  IN-OUT: in: index of the low pivot; out: start of the second block.
 * @param kmid IN: the mid splitting value (need not exist in the array).
 * @param pmid OUT: start of the third block.
 * @param phi  IN-OUT: in: index of the high pivot; out: start of the last block
 *             (one past the high pivot's final position).
 */
static inline void schwab_partition(
		uint32_t *arr,
		int low,
		int high,
		int *plo,
		uint32_t kmid,
		int *pmid,
		int *phi) {

	/* Keys only - no element copy is made here */
	const uint32_t klo = arr[*plo];
	const uint32_t khi = arr[*phi];

	/* [*] Park the high pivot at arr[high]; the scan below stops before it */
	uint32_t tmphi = arr[*phi];
	arr[*phi] = arr[high];
	arr[high] = tmphi;

	/* Exclusive end indices of the 4 "blocks" - b0 ends the smallest values */
	int b0 = low, b1 = low, b2 = low, b3 = low;

	while (b3 < high) {
		/* Checked first: hot path for constant / small-range inputs */
		if (arr[b3] >= khi) {
			++b3;
			continue;
		}

		/* TODO: should be copy of whole element when not just uint32s! */
		uint32_t curr = arr[b3];

		/* TODO: We can do "ILP-memcpy"s here:
		 *
		 * Key from b2->b3, value from b2->b3, key from b1->b2, value from b1... etc
		 * This is likely faster than calling a memcpy if we code this for not just uint32s!
		 */

		/* Each case shifts the first element of every later block to that
		 * block's end, opening a hole at the end of curr's own block. */
		if (curr < klo) {
			arr[b3] = arr[b2];
			arr[b2] = arr[b1];
			arr[b1] = arr[b0];
			arr[b0] = curr;
			++b0; ++b1; ++b2; ++b3;
			continue;
		}

		if (curr < kmid) {
			arr[b3] = arr[b2];
			arr[b2] = arr[b1];
			arr[b1] = curr;
			++b1; ++b2; ++b3;
		} else {
			arr[b3] = arr[b2];
			arr[b2] = curr;
			++b2; ++b3;
		}
	}

	/* [*] Swap the parked pivot to the begin of the last block, then step */
	/* over it so the returned *phi points past the pivot.                  */
	tmphi = arr[b2];
	arr[b2] = arr[high];
	arr[high] = tmphi;
	++b2;

	/* Handle output vars as per doc comment */
	*plo = b0;
	*pmid = b1;
	*phi = b2; /* Because of: [*] */
}

/**
 * Schwab sort: sorts array[low..high] ascending, in place.
 *
 * Somewhat similar to quicksort, but each round picks two random pivots,
 * derives a mid value between their keys and partitions 4-way. The loop
 * continues on the larger of the two outer blocks and recurses on the rest.
 *
 * NOTE(review): only the outer blocks are compared when choosing where to
 * loop; the two middle blocks are always handled by recursion.
 *
 * @param array The array to sort.
 * @param low   Inclusive smallest index (nothing happens if low >= high).
 * @param high  Inclusive highest index.
 * @param state IN-OUT: random state used for pivot picking.
 */
static inline void schwab_sort(
		uint32_t *array,
		int low,
		int high,
		sch_rand_state *state) {

	/* Loop handles one sub-task in place instead of recursing into it */
	while (low < high) {
		/* Unsigned arithmetic: no signed overflow even when high == INT_MAX */
		const uint32_t span = (uint32_t)high - (uint32_t)low + 1u;
		int r0 = low + (int)schwab_pick_pivot(state, span);
		int r1 = low + (int)schwab_pick_pivot(state, span);

		uint32_t klo = array[r0];
		uint32_t khi = array[r1];
		int plo = r0;
		int phi = r1;

		/* Order the two picks so that klo <= khi */
		if (klo > khi) {
			uint32_t ktmp = klo;
			klo = khi;
			khi = ktmp;
			plo = r1;
			phi = r0;
		}

		/* Midpoint written so that it cannot overflow */
		uint32_t kmid = klo + (khi - klo) / 2;

		int pmid;
		schwab_partition(array, low, high, &plo, kmid, &pmid, &phi);

		/* See where NOT to recurse to avoid worst case stack depth */
		/* Rem.: These might be "not real" lengths, only used for comparison */
		int lolen = plo - low;
		int hilen = high - phi;

		/* The high pivot rests at index phi - 1 in its final position, so   */
		/* the third block to sort is [pmid, phi - 2]. Leaving the pivot out */
		/* makes every sub-range strictly smaller than [low, high].          */
		if (lolen < hilen) {
			schwab_sort(array, low, plo - 1, state);
			schwab_sort(array, plo, pmid - 1, state);
			schwab_sort(array, pmid, phi - 2, state);

			low = phi;
			/* high = high; */
		} else {
			schwab_sort(array, phi, high, state);
			schwab_sort(array, pmid, phi - 2, state);
			schwab_sort(array, plo, pmid - 1, state);

			/* low = low; */
			high = plo - 1;
		}
	}
}

#endif /* SWAB_SORT_H */