diff --git a/thiersort2.h b/thiersort2.h index bcc93d7..19c13b8 100644 --- a/thiersort2.h +++ b/thiersort2.h @@ -2,7 +2,8 @@ #define THIER_SORT2_H #include #include "qsort/schwab_sort.h" -/* A non-implace tricky float-hackz based bucket sort variant. Uses schwabsort! */ +#include "threepass.h" +/* A non-implace tricky float-hackz based bucket sort variant. Uses schwabsort or threepass! */ #ifdef _MSC_VER #define KM_PREFETCH(x) @@ -147,8 +148,13 @@ static inline void thiersort2(uint32_t *arr, uint32_t *temparr, int n, sch_rand_ } /* Call schwabsort - only to [begin..smalli) and (biggie..end) */ +#ifdef USE_SCHWAB schwab_sort(arr, begin, smalli - 1, rstate); schwab_sort(arr, biggi + 1, end - 1, rstate); +#else + threepass(&arr[begin], smalli - begin); + threepass(&arr[biggi + 1], end - (biggi + 1)); +#endif /* USE_SCHWAB */ } } diff --git a/threepass.h b/threepass.h index 99b45a5..d78fc26 100644 --- a/threepass.h +++ b/threepass.h @@ -2,9 +2,9 @@ #define THREE_PASS_H /* How the 32 bits gets separated? */ -#define TPB1 11 // top +#define TPB1 12 // top #define TPB2 11 // mid -#define TPB3 10 // bottom +#define TPB3 9 // bottom static inline constexpr uint32_t min3u32(uint32_t a, uint32_t b, uint32_t c) { return (a <= b) ? @@ -33,7 +33,6 @@ static inline void threepass(uint32_t *a, int n) noexcept { uint32_t *buf = (uint32_t *)malloc(sz); assert(buf != NULL); - memset(buf, 0, n * sizeof(uint32_t)); // XXX: TODO: REMOVE /* Count occurences (can count together with good ILP) */ #pragma GCC unroll 64 @@ -80,9 +79,10 @@ static inline void threepass(uint32_t *a, int n) noexcept { // Bottom digit // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 64 + #pragma GCC unroll 16 for(uint32_t i = n; i > 0; --i) { // Prefetch caches + //__builtin_prefetch(&a[i-8]); // Get num and its new offset / location auto num = a[i - 1]; auto bkeyni = (num >> shr3) & mask3; @@ -93,9 +93,10 @@ static inline void threepass(uint32_t *a, int n) noexcept { } // Mid digit // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 64 + #pragma GCC unroll 16 for(uint32_t i = n; i > 0; --i) { // Prefetch caches + //__builtin_prefetch(&buf[i-8]); // Get num and its new offset / location auto num = buf[i - 1]; auto bkeyni = (num >> shr2) & mask2; @@ -106,9 +107,10 @@ static inline void threepass(uint32_t *a, int n) noexcept { } // Top digit // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 64 + #pragma GCC unroll 16 for(uint32_t i = n; i > 0; --i) { // Prefetch caches + // __builtin_prefetch(&a[i-16]); // Get num and its new offset / location auto num = a[i - 1]; auto bkeyni = (num >> shr1) & mask1; diff --git a/ypsu.cpp b/ypsu.cpp index 447aa2a..c9bee07 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -887,8 +887,8 @@ int main(int argc, char **argv) { printf("Sorting %d elements:\n\n", n); // Uncomment this for profiling and alg! - measure_single(n); - return 0; + //measure_single(n); + //return 0; for (auto inputtype : inputtypes) { printf("%10s", inputtype.c_str());