// Provenance: recovered from a whitespace-collapsed `git format-patch` file.
//   Commit:  a17b284c8acc7a31967e2fed6e24815dcd4e8c42
//   Author:  Richard Thier
//   Date:    Mon, 29 Sep 2025 02:24:50 +0200
//   Subject: added three-plus-one pass radix
//   Notes from the commit message: performs very well, but reaches only
//   0.8 ILP because of a lot of cache misses; worse on random input than
//   magyarsort, but better than ska_copy and with the best worst cases;
//   might be hooked into thier2.
//
// The same commit also contained (not reproduced as code here):
//   * perf_cache.sh (new, mode 100755), a /bin/sh script running
//       perf stat -e cache-references,cache-misses,cycles,instructions,branches,faults,migrations ./ypsu.out
//   * ypsu.cpp benchmark-harness changes:
//       - #include "threepass.h" added next to the other sort headers;
//       - do_threepass() adapter added after do_thier2() (kept below);
//       - measure_single(): the "thier2" measurement commented out and a
//         "threep" measurement of do_threepass() added;
//       - main(void) became main(int argc, char **argv); when argc > 1,
//         n is taken from atoi(argv[1]);
//       - the `measure_single(n); return 0;` lines were uncommented;
//       - `w = v;` added before the "magyar" measurement;
//       - the "zsr3_sp2" and "neoqs" measurements commented out (the latter
//         with the note "TODO: This is buggy! See valgrind!");
//       - a "threep" measurement plus `assert(w == expected)` added after
//         the "thier2" one.

#ifndef THREE_PASS_H
#define THREE_PASS_H

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <new>

/* How the 32 key bits are split into three radix digits. */
#define TPB1 11 // top
#define TPB2 11 // mid
#define TPB3 10 // bottom

static_assert(TPB1 + TPB2 + TPB3 == 32,
              "the three radix digits must cover all 32 key bits");

/** Returns the smallest of three unsigned 32-bit values. */
static inline constexpr uint32_t min3u32(uint32_t a, uint32_t b, uint32_t c) {
    return std::min(a, std::min(b, c));
}

/**
 * One stable scatter pass of the radix sort.
 *
 * Walks `src` from the last element to the first and stores each key at
 * `--ends[digit]`, where digit = (key >> Shift) & Mask. `ends` must hold the
 * inclusive prefix sums of the digit histogram on entry; walking backwards
 * while pre-decrementing keeps keys with equal digits in their source order.
 *
 * @param src   keys to read (`count` elements)
 * @param dst   destination (`count` elements, must not overlap `src`)
 * @param count number of keys
 * @param ends  per-digit end offsets; decremented in place
 */
template <int Shift, uint32_t Mask>
static inline void threepass_scatter(const uint32_t *src, uint32_t *dst,
                                     uint32_t count, uint32_t *ends) noexcept {
#pragma GCC unroll 64
    for (uint32_t i = count; i > 0; --i) {
        const uint32_t key = src[i - 1];
        dst[--ends[(key >> Shift) & Mask]] = key;
    }
}

/**
 * Simple three-pass (ok: 3 + 1) bottom-up radix sort for uint32_t.
 *
 * Sorts a[0..n) ascending in place. Three scatter passes (bottom, mid, top
 * digit) ping-pong between `a` and a scratch buffer of n elements; the "+1"
 * is the final copy from the scratch buffer back into `a`.
 *
 * @param a keys to sort; may be null only when n <= 1
 * @param n number of keys; values <= 1 return immediately
 *
 * If the scratch buffer cannot be allocated the keys are still sorted, using
 * an in-place std::sort instead.
 */
static inline void threepass(uint32_t *a, int n) noexcept {
    // Nothing to do for empty / single-element input. This also keeps a
    // negative n from being converted to a huge unsigned loop bound.
    if (a == nullptr || n <= 1) {
        return;
    }
    const uint32_t count = static_cast<uint32_t>(n);

    constexpr int shr1 = TPB3 + TPB2;
    constexpr int shr2 = TPB3;
    constexpr int shr3 = 0;
    constexpr uint32_t size1 = uint32_t{1} << TPB1;
    constexpr uint32_t size2 = uint32_t{1} << TPB2;
    constexpr uint32_t size3 = uint32_t{1} << TPB3;
    constexpr uint32_t mask1 = size1 - 1;
    constexpr uint32_t mask2 = size2 - 1;
    constexpr uint32_t mask3 = size3 - 1;

    // Scratch buffer. It is left uninitialised on purpose: the first scatter
    // pass writes every slot before anything reads from it.
    std::unique_ptr<uint32_t[]> scratch(new (std::nothrow) uint32_t[count]);
    if (!scratch) {
        std::sort(a, a + count);
        return;
    }
    uint32_t *const buf = scratch.get();

    /* Per-digit histograms, reset on every call. */
    static thread_local uint32_t bucket1[size1];
    static thread_local uint32_t bucket2[size2];
    static thread_local uint32_t bucket3[size3];
    std::memset(bucket1, 0, sizeof bucket1);
    std::memset(bucket2, 0, sizeof bucket2);
    std::memset(bucket3, 0, sizeof bucket3);

    /* Count occurrences (all three digits in one sweep for better ILP). */
#pragma GCC unroll 64
    for (uint32_t i = 0; i < count; ++i) {
        const uint32_t key = a[i];
        ++bucket1[(key >> shr1) & mask1];
        ++bucket2[(key >> shr2) & mask2];
        ++bucket3[(key >> shr3) & mask3];
    }

    /* Inclusive prefix sums. The three independent chains are interleaved
     * over their common length to expose ILP; the tails are finished below. */
    constexpr uint32_t common = min3u32(size1, size2, size3);
    uint32_t sum1 = 0;
    uint32_t sum2 = 0;
    uint32_t sum3 = 0;
    uint32_t i = 0;
#pragma GCC unroll 8
    for (; i < common; ++i) {
        sum1 += bucket1[i];
        bucket1[i] = sum1;
        sum2 += bucket2[i];
        bucket2[i] = sum2;
        sum3 += bucket3[i];
        bucket3[i] = sum3;
    }
    for (uint32_t j = i; j < size1; ++j) {
        sum1 += bucket1[j];
        bucket1[j] = sum1;
    }
    for (uint32_t j = i; j < size2; ++j) {
        sum2 += bucket2[j];
        bucket2[j] = sum2;
    }
    for (uint32_t j = i; j < size3; ++j) {
        sum3 += bucket3[j];
        bucket3[j] = sum3;
    }

    // Bottom, mid and top digit: a -> buf -> a -> buf.
    threepass_scatter<shr3, mask3>(a, buf, count, bucket3);
    threepass_scatter<shr2, mask2>(buf, a, count, bucket2);
    threepass_scatter<shr1, mask1>(a, buf, count, bucket1);

    // The "+1" pass: the sorted keys ended up in the scratch buffer.
    std::memcpy(a, buf, static_cast<std::size_t>(count) * sizeof(uint32_t));
}

/** 3+1 pass bottom-up radix (benchmark adapter the commit added to ypsu.cpp) */
static inline void do_threepass(uint32_t *a, int n) noexcept {
    threepass(a, n);
}

#endif /* THREE_PASS_H */