diff --git a/thiersort2.h b/thiersort2.h index 8a7b714..a09e7fd 100644 --- a/thiersort2.h +++ b/thiersort2.h @@ -41,7 +41,8 @@ static inline uint32_t witch_bucket(uint32_t key) { as.f = (float) key; uint32_t witch_base = (key <= 2) ? 0 : (as.u >> 23) - 128; // 0, [127..159] -> [0..31] - return witch_base * 8 + ((as.u >> (23 - 3)) & 7); + // return witch_base * 8 + ((as.u >> (23 - 3)) & 7); // 0..255 + return witch_base * 16 + ((as.u >> (23 - 4)) & 15); // 0..511 /* Alternative (but I measure it being worse): return (as.u >> 23); */ @@ -56,8 +57,8 @@ static inline uint32_t witch_bucket(uint32_t key) { * @param rstate Create with sch_rand_state rstate = schwab_rand_state(junk_uint32_t); */ static inline void thiersort2(uint32_t *arr, uint32_t *temparr, int n, sch_rand_state *rstate) { - int bucket[256]; /* Inclusive */ - int bucket_end[256]; /* Not inclusive */ + int bucket[512]; /* Inclusive */ + int bucket_end[512]; /* Not inclusive */ /* Check if need to sort at all - needed for invariants later */ if(n < 2) { @@ -66,7 +67,7 @@ static inline void thiersort2(uint32_t *arr, uint32_t *temparr, int n, sch_rand_ /* Count */ #pragma GCC unroll 64 - for(int i = 0; i < 256; ++i) { + for(int i = 0; i < 512; ++i) { bucket[i] = 0; } #pragma GCC unroll 64 @@ -77,14 +78,14 @@ static inline void thiersort2(uint32_t *arr, uint32_t *temparr, int n, sch_rand_ /* Prefix sum (like in Magyarsort) */ uint32_t prev = 0; #pragma GCC unroll 4 - for (int i = 0; i < 256; i++) { + for (int i = 0; i < 512; i++) { bucket[i] += prev; prev = bucket[i]; } /* Save end-offsets */ #pragma GCC unroll 64 - for(int i = 0; i < 256; ++i) { + for(int i = 0; i < 512; ++i) { bucket_end[i] = bucket[i]; } @@ -101,7 +102,7 @@ static inline void thiersort2(uint32_t *arr, uint32_t *temparr, int n, sch_rand_ /* temparr -> arr each bucket and sort them in-place */ #pragma GCC unroll 64 - for(int b = 0; b < 256; ++b) { + for(int b = 0; b < 512; ++b) { int begin = bucket[b]; int end = bucket_end[b]; @@ -118,18
+119,24 @@ static inline void thiersort2(uint32_t *arr, uint32_t *temparr, int n, sch_rand_ uint32_t pivot = temparr[i]; #pragma GCC unroll 4 for(int j = begin + 1; j < end; ++j) { - if(UNLIKELY(temparr[j] == pivot)) { + if(LIKELY(temparr[j] != pivot)) { + /* Branchless partitioning */ + /* copy to left */ + /* Rem.: Because we overwrite by copy we can simplify this line */ + /* arr[smalli] = (temparr[j] < pivot) ? temparr[j] : arr[smalli]; */ + arr[smalli] = temparr[j]; + smalli += (temparr[j] < pivot); + /* copy to right */ + /* Rem.: Because we overwrite by copy we can simplify this line */ + /* arr[biggi] = (temparr[j] > pivot) ? temparr[j] : arr[biggi]; */ + arr[biggi] = temparr[j]; + biggi -= (temparr[j] > pivot); + } else { /* swap to front partition */ ++i; uint32_t tmp = temparr[i]; temparr[i] = temparr[j]; temparr[j] = tmp; - } else if(temparr[j] < pivot) { - /* copy to left */ - arr[smalli++] = temparr[j]; - } else { - /* copy to right */ - arr[biggi--] = temparr[j]; } } /* Copy the mid elements back */ diff --git a/ypsu.cpp b/ypsu.cpp index eff4284..23df8b2 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -859,8 +859,8 @@ void measure_single(int n) { int main(void) { //int n = 100000000; - int n = 10000000; - //int n = 5000000; + //int n = 10000000; + int n = 5000000; //int n = 1000000; //int n = 100000; //int n = 20001; @@ -875,8 +875,8 @@ int main(void) { printf("Sorting %d elements:\n\n", n); // Uncomment this for profiling and alg! - measure_single(n); - return 0; + //measure_single(n); + //return 0; for (auto inputtype : inputtypes) { printf("%10s", inputtype.c_str());