From 7d407000fe6bea74600ad8e1a1760fe7c603ac06 Mon Sep 17 00:00:00 2001 From: Richard Thier Date: Wed, 1 Oct 2025 16:49:00 +0200 Subject: [PATCH] added pre-randomized sorts (not so great so far - probably too much cache misses) --- .gitmodules | 3 +++ cache_miss_flamegraph.sh | 5 ++++- fastrand | 1 + randominus.h | 47 ++++++++++++++++++++++++++++++++++++++++ ypsu.cpp | 34 +++++++++++++++++++++++++---- 5 files changed, 85 insertions(+), 5 deletions(-) create mode 160000 fastrand create mode 100644 randominus.h diff --git a/.gitmodules b/.gitmodules index e6b053a..4cb8433 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "FlameGraph"] path = FlameGraph url = https://github.com/brendangregg/FlameGraph +[submodule "fastrand"] + path = fastrand + url = ssh://gitea@magosit.hu:8122/prenex/fastrand.git diff --git a/cache_miss_flamegraph.sh b/cache_miss_flamegraph.sh index ce49587..bfb1dda 100755 --- a/cache_miss_flamegraph.sh +++ b/cache_miss_flamegraph.sh @@ -1,6 +1,9 @@ #!/bin/sh -rm perf.data; perf record -e L1-dcache-load-misses:u -c 1000 -g -- ./ypsu.out +# time based sampling +rm perf.data; perf record -e L1-dcache-load-misses:u -F 99 -g -- ./ypsu.out +# counters only - might work or not +#rm perf.data; perf record -e L1-dcache-load-misses:u -c 1000 -g -- ./ypsu.out perf script | FlameGraph/stackcollapse-perf.pl > out.perf-folded FlameGraph/flamegraph.pl out.perf-folded > perf.svg brave perf.svg diff --git a/fastrand b/fastrand new file mode 160000 index 0000000..42943f4 --- /dev/null +++ b/fastrand @@ -0,0 +1 @@ +Subproject commit 42943f467831bfcc123214659f4b5b7f5a5e2c54 diff --git a/randominus.h b/randominus.h new file mode 100644 index 0000000..67709ae --- /dev/null +++ b/randominus.h @@ -0,0 +1,47 @@ +#ifndef RANDOMINUS_H +#define RANDOMINUS_H +/* To randomize an array - hopefully as fast as possible */ + +#include "fastrand/fastrand.h" + +/** swap */ +static void inline rd_swap(uint32_t *a, uint32_t *b) { + uint32_t tmp = *a; + *a = *b; + *b = tmp; +} + +/** This is by no means "cryptographically correct" or stuff, but fast */ +static inline void randominus(uint32_t *a, int n, uint32_t seed) { + + /** Initialized ILP random generator */ + uint32_t ilp_seeds[8]; + rand_ilp_state rsi; + rand_state rs = init_rand(); + for(int i = 0; i < 8; ++i) { + uint32_t choice = rand_between(&rs, 0, n); + ilp_seeds[i] = choice; + } + + /** Go over the array and randomly swap stuff - hand unrolled with ILP random get! */ + for(int i = 0; i < (n - 8); i += 8) { + uint32_t to0 = fastmodlike(lcg_ilp(&rsi, A), n); + uint32_t to1 = fastmodlike(lcg_ilp(&rsi, B), n); + uint32_t to2 = fastmodlike(lcg_ilp(&rsi, C), n); + uint32_t to3 = fastmodlike(lcg_ilp(&rsi, D), n); + uint32_t to4 = fastmodlike(lcg_ilp(&rsi, E), n); + uint32_t to5 = fastmodlike(lcg_ilp(&rsi, F), n); + uint32_t to6 = fastmodlike(lcg_ilp(&rsi, G), n); + uint32_t to7 = fastmodlike(lcg_ilp(&rsi, H), n); + rd_swap(&a[i], &a[to0]); + rd_swap(&a[i + 1], &a[to1]); + rd_swap(&a[i + 2], &a[to2]); + rd_swap(&a[i + 3], &a[to3]); + rd_swap(&a[i + 4], &a[to4]); + rd_swap(&a[i + 5], &a[to5]); + rd_swap(&a[i + 6], &a[to6]); + rd_swap(&a[i + 7], &a[to7]); + } +} + +#endif /* RANDOMINUS_H */ diff --git a/ypsu.cpp b/ypsu.cpp index e3afa07..74b3838 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -23,6 +23,7 @@ #include "qsort/chatgpt_qs.h" #include "threepass.h" #include "thiersort3.h" +#include "randominus.h" // #define MAGYAR_SORT_DEFAULT_REUSE #include "magyarsort.h" @@ -227,6 +228,15 @@ static inline void do_thier3(uint32_t *a, int n) noexcept { thiersort3(a, &(tmp[0]), n); } +/** rthier */ +static inline void do_rthier(uint32_t *a, int n) noexcept { + assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); + uint32_t junk; + randominus(a, n, junk); + std::vector tmp(n); + thiersort3(a, &(tmp[0]), n); +} + /** 3+1 pass bottom-up radix */ static inline void do_threepass(uint32_t *a, int n) noexcept { threepass(a, n); @@ -895,8 +905,8 @@ int main(int argc, char **argv) { printf("Sorting %d elements:\n\n", n); // Uncomment this for profiling and alg! - //measure_single(n); - //return 0; + // measure_single(n); + // return 0; for (auto inputtype : inputtypes) { printf("%10s", inputtype.c_str()); @@ -921,7 +931,17 @@ int main(int argc, char **argv) { }); w = v; - measure(inputtype, "magyar", [&] { MagyarSort::sort(&w[0], w.size()); }); + measure(inputtype, "magyar", [&] { + MagyarSort::sort(&w[0], w.size()); + }); + assert(w == expected); + + w = v; + measure(inputtype, "rmagyar", [&] { + uint32_t junk; + randominus(&w[0], w.size(), junk); + MagyarSort::sort(&w[0], w.size()); + }); assert(w == expected); w = v; @@ -1011,6 +1031,10 @@ int main(int argc, char **argv) { measure(inputtype, "thier3", [&] { do_thier3(&w[0], w.size()); }); assert(w == expected); + w = v; + measure(inputtype, "rthier", [&] { do_rthier(&w[0], w.size()); }); + assert(w == expected); + w = v; measure(inputtype, "threep", [&] { do_threepass(&w[0], w.size()); }); assert(w == expected); @@ -1046,10 +1070,12 @@ int main(int argc, char **argv) { } assert(w == expected); */ - /* + w = v; measure(inputtype, "frewr", [&] { frewr(&w[0], w.size()); }); assert(w == expected); + + /* w = v; measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); }); assert(w == expected);