schwab: hax in regz - not helping extra much

This commit is contained in:
Richard Thier 2025-05-09 06:35:50 +02:00
parent c1d152c6f9
commit 0900fece84

View File

@ -21,7 +21,7 @@
/**
 * Below this many elements we do insertion sort.
 *
 * The #ifndef guard lets a build override the value by defining
 * SCHWAB_SELECTION_THRESHOLD before this point.
 *
 * NOTE(review): two definitions with different values (16, then 8) follow
 * each other here - presumably the old and the new line of a diff. Confirm
 * which one is intended and keep only that one.
 */
#ifndef SCHWAB_SELECTION_THRESHOLD
#define SCHWAB_SELECTION_THRESHOLD 16
#define SCHWAB_SELECTION_THRESHOLD 8
#endif /* SCHWAB_SELECTION_THRESHOLD */
/**
 * Alias for a 32-bit unsigned integer.
 * NOTE(review): the name suggests it holds random-generator state; its users
 * are not visible here - confirm.
 */
typedef uint32_t sch_rand_state;
@ -51,8 +51,87 @@ static inline void schwab_swap(uint32_t *a, uint32_t *b) {
*b = t;
}
/**
 * Branchless compare-exchange of two uint32_t lvalues.
 *
 * After the macro has run, (a) holds the smaller and (b) the larger of the
 * two original values; equal values are left untouched.
 *
 * Both arguments must be modifiable lvalues without side effects: each one
 * is read once and written once.
 */
#define SCH_CSWAP(a, b) do { \
        uint32_t _x = (a), _y = (b); \
        /* all-ones mask when _x > _y, zero otherwise */ \
        uint32_t _gt = (uint32_t)0 - (uint32_t)(_x > _y); \
        uint32_t _tmp = (_x ^ _y) & _gt; \
        (a) = _x ^ _tmp; \
        (b) = _y ^ _tmp; \
    } while (0)

/**
 * Tiny unrolled heap-shaped sort of exactly 5 elements, ascending order,
 * written without if/else branches.
 *
 * This heap:
 *
 *            a[0]
 *           /    \
 *        a[1]    a[2]
 *       /    \
 *    a[3]    a[4]
 *
 * SCH_CSWAP(x, y) leaves the minimum in x and the maximum in y, so every
 * "pull the maximum up" step is written as SCH_CSWAP(child, parent) and
 * every "move the maximum to the end" step as SCH_CSWAP(a[0], a[end]).
 *
 * Each round first makes a[0] the maximum of a[0], a[1], a[2] and then
 * compare-exchanges it with the last slot of the still unsorted prefix, so
 * that slot receives the maximum of the whole prefix.
 *
 * @param a The array; a[0]..a[4] must be valid and are sorted in place
 */
void sch_hsort5(uint32_t* a) {
    uint32_t cmp;
    uint32_t i;

    /* Build max heap: a[1] becomes the maximum of a[1], a[3], a[4] */
    SCH_CSWAP(a[3], a[1]);
    SCH_CSWAP(a[4], a[1]);

    /* Heapify(0): i = (a[2] > a[1]) ? 2 : 1, computed with a bit mask */
    cmp = (uint32_t)0 - (uint32_t)(a[2] > a[1]);
    i = ((1u ^ 2u) & cmp) ^ 1u;
    SCH_CSWAP(a[i], a[0]);

    /* Sort phase */
    /* 1st max to end */
    SCH_CSWAP(a[0], a[4]);

    /* Root = max of a[0..2]; the exchange with a[3] then covers a[0..3] */
    cmp = (uint32_t)0 - (uint32_t)(a[2] > a[1]);
    i = ((1u ^ 2u) & cmp) ^ 1u;
    SCH_CSWAP(a[i], a[0]);

    /* 2nd max to end */
    SCH_CSWAP(a[0], a[3]);

    /* Heapify a[0..2] */
    cmp = (uint32_t)0 - (uint32_t)(a[2] > a[1]);
    i = ((1u ^ 2u) & cmp) ^ 1u;
    SCH_CSWAP(a[i], a[0]);

    /* 3rd max to end */
    SCH_CSWAP(a[0], a[2]);

    /* Final two */
    SCH_CSWAP(a[0], a[1]);
}
/** Simple insertion sort for small cases */
inline void sch_insertion_sort(uint32_t *arr, int low, int high) {
/* Dual heapsort5 probably helps insertion speed */
/* This is sane, because insertion benefits from */
/* data being "basically nearly sorted" as input */
if(high + 1 - low > 10) {
sch_hsort5(&arr[0]);
sch_hsort5(&arr[5]);
}
if(high + 1 - low > 20) {
sch_hsort5(&arr[10]);
sch_hsort5(&arr[15]);
}
if(high + 1 - low > 30) {
sch_hsort5(&arr[20]);
sch_hsort5(&arr[25]);
}
if(high + 1 - low > 40) {
sch_hsort5(&arr[30]);
sch_hsort5(&arr[35]);
}
/* "Real" insertion sort part comes here */
for(int i = low + 1; i <= high; ++i) {
uint32_t key = arr[i];
@ -68,24 +147,6 @@ inline void sch_insertion_sort(uint32_t *arr, int low, int high) {
}
}
/**
 * Simple insertion sort for small cases v2 - not necessarily better.
 *
 * Sorts the inclusive range arr[low..high] in ascending order, in place.
 * Elements outside that range are neither read nor written. A range with
 * fewer than two elements (high <= low) is left unchanged.
 *
 * Declared static inline: a plain C99/C11 `inline` definition provides no
 * external definition, so a call the compiler chose not to inline would
 * fail to link.
 *
 * @param arr  The array holding the range
 * @param low  Index of the first element of the range
 * @param high Index of the last element of the range (inclusive)
 */
static inline void sch_insertion_sort2(uint32_t *arr, int low, int high) {
    for (int i = low + 1; i <= high; ++i) {
        uint32_t key = arr[i];
        /* Separate load and compare to expose ILP */
        int j = i;
#pragma GCC unroll 2
        /* Stop at low (not 0) so nothing in front of the range is shifted */
        while (j > low) {
            uint32_t prev = arr[j - 1];
            if (prev <= key) break;
            arr[j] = prev;
            --j;
        }
        arr[j] = key;
    }
}
/** Simple SELECTION sort for small cases - not necessarily better */
inline void sch_selection_sort(uint32_t* arr, int low, int high) {
#pragma GCC unroll 2