schwab: hax in regz - not helping extra much
This commit is contained in:
parent
c1d152c6f9
commit
0900fece84
@ -21,7 +21,7 @@
|
||||
|
||||
/** Below this many elements we do insertion sort */
|
||||
#ifndef SCHWAB_SELECTION_THRESHOLD
|
||||
#define SCHWAB_SELECTION_THRESHOLD 16
|
||||
#define SCHWAB_SELECTION_THRESHOLD 8
|
||||
#endif /* SCHWAB_SELECTION_THRESHOLD */
|
||||
|
||||
typedef uint32_t sch_rand_state;
|
||||
@ -51,8 +51,87 @@ static inline void schwab_swap(uint32_t *a, uint32_t *b) {
|
||||
*b = t;
|
||||
}
|
||||
|
||||
/**
 * Branchless compare-and-swap.
 *
 * After the macro ran, (a) holds the smaller and (b) the larger of the two
 * values; nothing changes when they are already in that order.
 *
 * Both arguments must be uint32_t lvalues. Each one is evaluated twice
 * (one read, one write), so they must not have side effects.
 */
#define SCH_CSWAP(a, b) do { \
	uint32_t _x = (a), _y = (b); \
	uint32_t _gt = 0u - (uint32_t)(_x > _y); \
	uint32_t _tmp = (_x ^ _y) & _gt; \
	(a) = _x ^ _tmp; \
	(b) = _y ^ _tmp; \
} while (0)

/**
 * Tiny unrolled, branchless heapsort of exactly 5 elements (ascending).
 *
 * This heap:
 *
 *         a[0]
 *        /    \
 *     a[1]    a[2]
 *     /  \
 *  a[3]  a[4]
 *
 * Because SCH_CSWAP(x, y) leaves the larger value in y, every max-heap
 * step passes the child first and the parent second, while every
 * "move max to the end" step passes the root first and the end slot second.
 *
 * @param a The array; a[0]..a[4] are read and written
 */
void sch_hsort5(uint32_t* a) {
	uint32_t i;

	/* Build max heap */

	/* Heapify(1): a[1] = max(a[1], a[3], a[4]) */
	SCH_CSWAP(a[3], a[1]);
	SCH_CSWAP(a[4], a[1]);

	/* Heapify(0): i is the index of the larger child (2 if a[2] > a[1], else 1) */
	i = 1u + (a[2] > a[1]);
	SCH_CSWAP(a[i], a[0]);
	/* The old root may have sunk into a[1]; the next sift relies on */
	/* a[1] >= a[3]. a[4] needs no repair: its value moves to the root next. */
	SCH_CSWAP(a[3], a[1]);

	/* Sort phase */

	/* 1st max to end */
	SCH_CSWAP(a[0], a[4]);
	/* heapify a[0..3]: a[1] >= a[3] holds, so the larger child beats a[3] too */
	i = 1u + (a[2] > a[1]);
	SCH_CSWAP(a[i], a[0]);

	/* 2nd max to end */
	SCH_CSWAP(a[0], a[3]);
	/* heapify a[0..2] */
	i = 1u + (a[2] > a[1]);
	SCH_CSWAP(a[i], a[0]);

	/* 3rd max to end */
	SCH_CSWAP(a[0], a[2]);

	/* Final two */
	SCH_CSWAP(a[0], a[1]);
}
|
||||
|
||||
/** Simple insertion sort for small cases */
|
||||
inline void sch_insertion_sort(uint32_t *arr, int low, int high) {
|
||||
/* Dual heapsort5 probably helps insertion speed */
|
||||
/* This is sane, because insertion benefits from */
|
||||
/* data being "basically nearly sorted" as input */
|
||||
if(high + 1 - low > 10) {
|
||||
sch_hsort5(&arr[0]);
|
||||
sch_hsort5(&arr[5]);
|
||||
}
|
||||
if(high + 1 - low > 20) {
|
||||
sch_hsort5(&arr[10]);
|
||||
sch_hsort5(&arr[15]);
|
||||
}
|
||||
if(high + 1 - low > 30) {
|
||||
sch_hsort5(&arr[20]);
|
||||
sch_hsort5(&arr[25]);
|
||||
}
|
||||
if(high + 1 - low > 40) {
|
||||
sch_hsort5(&arr[30]);
|
||||
sch_hsort5(&arr[35]);
|
||||
}
|
||||
|
||||
/* "Real" insertion sort part comes here */
|
||||
for(int i = low + 1; i <= high; ++i) {
|
||||
uint32_t key = arr[i];
|
||||
|
||||
@ -68,24 +147,6 @@ inline void sch_insertion_sort(uint32_t *arr, int low, int high) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Simple insertion sort for small cases v2 - not necessarily better.
 *
 * Sorts arr[low..high] (both bounds inclusive) in ascending order. Each
 * predecessor is loaded into a local before it is compared, keeping the
 * load and the compare as separate steps.
 *
 * The backwards scan stops at `low`, so elements outside [low, high] are
 * neither read nor moved. A range with high <= low is left untouched.
 *
 * Declared `static inline` so every translation unit that includes this
 * definition gets a usable copy; a plain `inline` definition in C99/C11
 * does not provide an external definition to link against.
 *
 * @param arr  The array containing the range to sort
 * @param low  Index of the first element of the range
 * @param high Index of the last element of the range
 */
static inline void sch_insertion_sort2(uint32_t *arr, int low, int high) {
	for (int i = low + 1; i <= high; ++i) {
		uint32_t key = arr[i];

		/* Separate load and compare to expose ILP */
		int j = i;
#pragma GCC unroll 2
		while (j > low) {
			uint32_t prev = arr[j - 1];
			if (prev <= key) {
				break;
			}
			arr[j] = prev;
			--j;
		}
		arr[j] = key;
	}
}
|
||||
|
||||
/** Simple SELECTION sort for small cases - not necessarily better */
|
||||
inline void sch_selection_sort(uint32_t* arr, int low, int high) {
|
||||
#pragma GCC unroll 2
|
||||
|
Loading…
x
Reference in New Issue
Block a user