diff --git a/schwab_sort.h b/schwab_sort.h
index a11e0bb..976fcd3 100644
--- a/schwab_sort.h
+++ b/schwab_sort.h
@@ -21,7 +21,7 @@
 
 /** Below this many elements we do insertion sort */
 #ifndef SCHWAB_SELECTION_THRESHOLD
-#define SCHWAB_SELECTION_THRESHOLD 16
+#define SCHWAB_SELECTION_THRESHOLD 8
 #endif /* SCHWAB_SELECTION_THRESHOLD */
 
 typedef uint32_t sch_rand_state;
@@ -51,8 +51,69 @@ static inline void schwab_swap(uint32_t *a, uint32_t *b) {
 	*b = t;
 }
 
+/**
+ * Branchless conditional swap: afterwards (a) <= (b).
+ *
+ * Both arguments must be uint32_t lvalues without side effects, because each
+ * one is evaluated twice (one read, one unconditional write). The XOR mask is
+ * all ones when the pair is out of order and zero otherwise.
+ */
+#define SCH_CSWAP(a, b) do { \
+	uint32_t sch_x_ = (a), sch_y_ = (b); \
+	uint32_t sch_gt_ = -(uint32_t)(sch_x_ > sch_y_); \
+	uint32_t sch_tmp_ = (sch_x_ ^ sch_y_) & sch_gt_; \
+	(a) = sch_x_ ^ sch_tmp_; \
+	(b) = sch_y_ ^ sch_tmp_; \
+} while (0)
+
+/**
+ * Tiny unrolled branchless sort of exactly 5 elements, ascending.
+ *
+ * Implemented as a fixed 9-comparator sorting network on local copies, so
+ * no access uses a data-dependent index. (Despite the name this is a
+ * sorting network, not a heap sort.)
+ *
+ * @param a Pointer to 5 consecutive elements, sorted in place
+ */
+static inline void sch_hsort5(uint32_t *a) {
+	uint32_t v0 = a[0], v1 = a[1], v2 = a[2], v3 = a[3], v4 = a[4];
+
+	/* Sort the pair (v0,v1) and the triple (v2,v3,v4) */
+	SCH_CSWAP(v0, v1);
+	SCH_CSWAP(v3, v4);
+	SCH_CSWAP(v2, v4);
+	SCH_CSWAP(v2, v3);
+
+	/* Largest of all five ends up in v4 */
+	SCH_CSWAP(v1, v4);
+
+	/* Order the remaining four; v2 <= v3 already holds */
+	SCH_CSWAP(v0, v3);
+	SCH_CSWAP(v0, v2);
+	SCH_CSWAP(v1, v3);
+	SCH_CSWAP(v1, v2);
+
+	a[0] = v0;
+	a[1] = v1;
+	a[2] = v2;
+	a[3] = v3;
+	a[4] = v4;
+}
+
 /** Simple insertion sort for small cases */
 inline void sch_insertion_sort(uint32_t *arr, int low, int high) {
+	/*
+	 * Pre-sort up to four leading 10-element chunks (two blocks of 5 each).
+	 * Insertion sort benefits from nearly sorted input. Blocks are addressed
+	 * relative to `low`; a chunk is only touched when the range holds more
+	 * elements than the chunk's end, so nothing outside [low, high] is written.
+	 */
+	for(int off = 0; off < 40 && high + 1 - low > off + 10; off += 10) {
+		sch_hsort5(&arr[low + off]);
+		sch_hsort5(&arr[low + off + 5]);
+	}
+
+	/* "Real" insertion sort part comes here */
 	for(int i = low + 1; i <= high; ++i) {
 		uint32_t key = arr[i];
 
@@ -68,24 +129,6 @@ inline void sch_insertion_sort(uint32_t *arr, int low, int high) {
 	}
 }
 
-/** Simple insertion sort for small cases v2 - not necessarily better */
-inline void sch_insertion_sort2(uint32_t* arr, int low, int high) {
-	for(int i = low + 1; i <= high; ++i) {
-		uint32_t key = arr[i];
-
-		/* Separate load and compare to expose ILP */
-		int j = i;
-		#pragma GCC unroll 2
-		while(j > 0) {
-			uint32_t prev = arr[j - 1];
-			if (prev <= key) break;
-			arr[j] = prev;
-			--j;
-		}
-		arr[j] = key;
-	}
-}
-
 /** Simple SELECTION sort for small cases - not necessarily better */
 inline void sch_selection_sort(uint32_t* arr, int low, int high) {
 	#pragma GCC unroll 2