From a3643eba9b58abd5404dbf9281534c627fe1be1d Mon Sep 17 00:00:00 2001
From: Richard Thier
Date: Thu, 11 Sep 2025 20:42:04 +0200
Subject: [PATCH] added thiersort2 - better than std, somewhat similar to schwab in perf but is a bucket sort - very interestingly not huge boost in bucketing speed

---
 thiersort2.h | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++
 ypsu.cpp     |  41 ++++++++++++-----
 2 files changed, 155 insertions(+), 12 deletions(-)
 create mode 100644 thiersort2.h

diff --git a/thiersort2.h b/thiersort2.h
new file mode 100644
index 0000000..66f0f0d
--- /dev/null
+++ b/thiersort2.h
@@ -0,0 +1,126 @@
+#ifndef THIER_SORT2_H
+#define THIER_SORT2_H
+#include <stdint.h>
+#include "qsort/schwab_sort.h"
+/* A non-in-place bucket sort variant built on a float bit-pattern trick. Uses schwabsort per bucket! */
+
+/* Float and unsigned32 reinterpreter */
+union th2_fu {
+	float f;
+	uint32_t u;
+};
+typedef union th2_fu th2_fu;
+
+/** Tells from the key which of the 256 buckets it is in; a bigger key never gets a smaller bucket. */
+static inline uint32_t witch_bucket(uint32_t key) {
+	/* This trick leans HEAVILY on the int->float converter and the IEEE 754 layout for bucketing */
+	/* https://en.wikipedia.org/wiki/Single-precision_floating-point_format */
+
+	/* Old Hungarian ASM trick I know from Tomcat/Abaddon mailing list and prog.hu */
+	/* "The next very important idea is that two floating-point numbers, */
+	/* provided they are positive, can also be compared 'plainly' (as integers)" */
+	/* See: https://prog.hu/cikkek/100239/fpu-gems */
+
+	/* The exponent picks one of 32 coarse groups, the top 3 mantissa bits refine it: 32 * 8 = 256 buckets */
+	th2_fu as;
+	as.f = (float) key;
+	uint32_t witch_base = (key <= 2) ? 0 : (as.u >> 23) - 128; /* keys 0..2 -> 0; biased exponent [128..159] -> [0..31] */
+	return witch_base * 8 + ((as.u >> (23 - 3)) & 7);
+}
+
+/**
+ * Sort arr: scatter the keys into temparr by witch_bucket, then copy every bucket back into arr and schwab_sort it.
+ *
+ * @param arr The array to sort, will contain result afterwards
+ * @param temparr The temporary array with same size (its contents are overwritten)
+ * @param n Number of elements in arr and temparr (n < 2 returns immediately)
+ * @param rstate Create with sch_rand_state rstate = schwab_rand_state(junk_uint32_t);
+ */
+static inline void thiersort2(uint32_t *arr, uint32_t *temparr, int n, sch_rand_state *rstate) {
+	int bucket[256]; /* Inclusive */
+	int bucket_end[256]; /* Not inclusive */
+
+	/* Check if need to sort at all - needed for invariants later */
+	if(n < 2) {
+		return;
+	}
+
+	/* Count */
+	#pragma GCC unroll 4
+	for(int i = 0; i < 256; ++i) {
+		bucket[i] = 0;
+	}
+	#pragma GCC unroll 4
+	for(int i = 0; i < n; ++i) {
+		++bucket[witch_bucket(arr[i])];
+	}
+
+	/* Prefix sum (like in Magyarsort) - afterwards bucket[i] is the end offset of bucket i */
+	int prev = 0;
+	for (int i = 0; i < 256; i++) {
+		bucket[i] += prev;
+		prev = bucket[i];
+	}
+
+	/* Save end-offsets */
+	#pragma GCC unroll 4
+	for(int i = 0; i < 256; ++i) {
+		bucket_end[i] = bucket[i];
+	}
+
+	/* arr -> temparr */
+	/* Move to the buckets - going backwards saves a few cache misses */
+	/* Rem.: This also changes bucket[i] so they will point to bucket beginnings */
+	#pragma GCC unroll 64
+	for(int i = n; i > 0; --i) {
+		uint32_t num = arr[i - 1];
+		uint32_t witch = witch_bucket(num);
+		int offset = (--bucket[witch]);
+		temparr[offset] = num;
+	}
+
+	/* temparr -> arr each bucket and sort them in-place */
+	#pragma GCC unroll 64
+	for(int b = 0; b < 256; ++b) {
+		int begin = bucket[b];
+		int end = bucket_end[b];
+
+		/* Ensure exists */
+		if(begin >= end) {
+			continue;
+		}
+
+		/* We make a three-way FAST quicksort partitioning with first elem pivot: */
+		/* Lomuto-like: pivot-equal keys are swapped to the front of temparr, smaller/bigger keys go to the two ends of arr */
+		int smalli = begin;
+		int biggi = end - 1; /* always exists */
+		int i = begin;
+		uint32_t pivot = temparr[i];
+		#pragma GCC unroll 4
+		for(int j = begin + 1; j < end; ++j) {
+			if(temparr[j] == pivot) {
+				/* swap to front partition */
+				++i;
+				uint32_t tmp = temparr[i];
+				temparr[i] = temparr[j];
+				temparr[j] = tmp;
+			} else if(temparr[j] < pivot) {
+				/* copy to left */
+				arr[smalli++] = temparr[j];
+			} else {
+				/* copy to right */
+				arr[biggi--] = temparr[j];
+			}
+		}
+		/* Copy the mid (pivot-equal) elements back into the gap left between the two sides */
+		#pragma GCC unroll 4
+		for(int j = begin; j < i + 1; ++j) {
+			arr[smalli++] = temparr[j];
+		}
+
+		/* Call schwabsort on the whole bucket (both bounds inclusive) */
+		schwab_sort(arr, begin, end - 1, rstate);
+	}
+}
+
+#endif /* THIER_SORT2_H */
diff --git a/ypsu.cpp b/ypsu.cpp
index 880d40c..2fdd1e5 100644
--- a/ypsu.cpp
+++ b/ypsu.cpp
@@ -16,6 +16,7 @@
 #include "ska_sort.hpp"
 #include "gptsort.h"
 #include "thiersort.h"
+#include "thiersort2.h"
 #include "qsort/qsort.h"
 #include "qsort/zssort.h"
 #include "qsort/schwab_sort.h"
@@ -208,6 +209,15 @@ static inline void do_schwab(uint32_t *a, int n) noexcept {
 	schwab_sort(a, 0, n - 1, &state);
 }
 
+/** thier2: runs thiersort2 on a[0..n) with a freshly allocated scratch vector of n elements */
+static inline void do_thier2(uint32_t *a, int n) noexcept {
+	assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
+	uint32_t junk;
+	sch_rand_state state = schwab_rand_state(junk);
+	std::vector<uint32_t> tmp(n);
+	thiersort2(a, tmp.data(), n, &state);
+}
+
 // mormord — Today at 2:27 AM
 // 1 2 2 2 3
 //
@@ -849,6 +859,7 @@ void measure_single(int n) {
 int main(void) {
 	//int n = 100000000;
 	//int n = 10000000;
+	//int n = 5000000;
 	int n = 1000000;
 	//int n = 100000;
 	//int n = 20000;
@@ -857,7 +868,7 @@ int main(void) {
 	//int n = 170;
 	//int n = 100;
 	//int n = 180;
-	//int n = 10;
+	//int n = 20;
 
 	printf("Sorting %d elements:\n\n", n);
 
@@ -878,7 +889,6 @@ int main(void) {
 		/*
 		w = v;
 		measure(inputtype, "ska", [&] { ska_sort(std::begin(w), std::end(w)); });
-		*/
 		w = v;
 		measure(inputtype, "ska_copy", [&] {
 			std::vector<uint32_t> buf(w.size());
@@ -886,11 +896,10 @@ int main(void) {
 				w.swap(buf);
 			}
 		});
-		/*
+		*/
 		w = v;
 		measure(inputtype, "magyar", [&] { MagyarSort::sort(&w[0], w.size()); });
 		assert(w == expected);
-		*/
 
 		/*
 		w = v;
@@ -937,22 +946,26 @@ int main(void) {
 		measure(inputtype, "qsr3", [&] { do_qsr3(&w[0], w.size()); });
 		assert(w == expected);
 		*/
+
+		/*
+		w = v;
+		measure(inputtype, "zsrc", [&] { do_zsrc(&w[0], w.size()); });
+		assert(w == expected);
+		*/
+
+		/*
+		w = v;
+		measure(inputtype, "meanqs", [&] { do_meanqs(&w[0], w.size()); });
+		assert(w == expected);
 		w = v;
 		measure(inputtype, "zsr3", [&] { do_zsr3(&w[0], w.size()); });
 		assert(w == expected);
 		w = v;
 		measure(inputtype, "zsr3_sp", [&] { do_zsr3_sp(&w[0], w.size()); });
-		assert(w == expected);
-		w = v;
-		measure(inputtype, "zsr3_sp2", [&] { do_zsr3_sp2(&w[0], w.size()); });
-		assert(w == expected);
-		/*
-		w = v;
-		measure(inputtype, "zsrc", [&] { do_zsrc(&w[0], w.size()); });
 		assert(w == expected);
 		*/
 		w = v;
-		measure(inputtype, "meanqs", [&] { do_meanqs(&w[0], w.size()); });
+		measure(inputtype, "zsr3_sp2", [&] { do_zsr3_sp2(&w[0], w.size()); });
 		assert(w == expected);
 
 		w = v;
@@ -963,6 +976,10 @@ int main(void) {
 		measure(inputtype, "schwab", [&] { do_schwab(&w[0], w.size()); });
 		assert(w == expected);
 
+		w = v;
+		measure(inputtype, "thier2", [&] { do_thier2(&w[0], w.size()); });
+		assert(w == expected);
+
 		/*
 		w = v;
 		measure(inputtype, "magbuck", [&] { magyar_bucket_sort(&w[0], w.size()); });