From ac873f7123c0dd23ff9d73668e005c71944a8afa Mon Sep 17 00:00:00 2001 From: Richard Thier Date: Tue, 30 Sep 2025 17:19:47 +0200 Subject: [PATCH] prepared for flame graph analysis --- .gitmodules | 3 ++ FlameGraph | 1 + makefile | 3 +- thiersort3.h | 24 ++++++------ threepass_xbit.h | 100 +++++++++++++++++++++-------------------------- ypsu.cpp | 7 ++-- 6 files changed, 66 insertions(+), 72 deletions(-) create mode 160000 FlameGraph diff --git a/.gitmodules b/.gitmodules index 95d1c10..e6b053a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "vergesort"] path = vergesort url = https://github.com/Morwenn/vergesort +[submodule "FlameGraph"] + path = FlameGraph + url = https://github.com/brendangregg/FlameGraph diff --git a/FlameGraph b/FlameGraph new file mode 160000 index 0000000..41fee1f --- /dev/null +++ b/FlameGraph @@ -0,0 +1 @@ +Subproject commit 41fee1f99f9276008b7cd112fca19dc3ea84ac32 diff --git a/makefile b/makefile index 7ac27c0..2ceaada 100644 --- a/makefile +++ b/makefile @@ -24,7 +24,8 @@ release_ypsu_assert: ypsu.cpp magyarsort.h release_ypsu_debug_sym: ypsu.cpp magyarsort.h g++ ypsu.cpp -g -std=c++17 -O2 -o ypsu.out - +release_ypsu_noinline_debug_sym: ypsu.cpp magyarsort.h thiersort3.h + g++ ypsu.cpp -g -std=c++17 -O2 -fno-inline -fno-inline-functions -fno-inline-functions-called-once -fno-inline-functions-called-once -fno-inline-small-functions -fno-ipa-cp -fno-ipa-sra -fno-early-inlining -fno-omit-frame-pointer -fno-optimize-sibling-calls -o ypsu.out release3: test.cpp magyarsort.h g++ test.cpp -DNDEBUG -std=c++17 -O3 -o test.out diff --git a/thiersort3.h b/thiersort3.h index ecd039e..546f2e4 100644 --- a/thiersort3.h +++ b/thiersort3.h @@ -41,9 +41,9 @@ static inline uint32_t witch_bucket3(uint32_t key) { * @param n Number of elements in arr and temparr * @param rstate Create with sch_rand_state rstate = schwab_rand_state(junk_uint32_t); */ -static inline void thiersort3(uint32_t *arr, uint32_t *temparr, int n) { - int 
bucket[4096]; /* Inclusive */ - int bucket_end[4096]; /* Not inclusive */ +static inline void thiersort3(uint32_t *arr, uint32_t *temparr, uint32_t n) { + uint32_t bucket[4096]; /* Inclusive */ + uint32_t bucket_end[4096]; /* Not inclusive */ /* Check if need to sort at all - needed for invariants later */ if(n < 2) { @@ -52,25 +52,25 @@ static inline void thiersort3(uint32_t *arr, uint32_t *temparr, int n) { /* Count */ #pragma GCC unroll 64 - for(int i = 0; i < 4096; ++i) { + for(uint32_t i = 0; i < 4096; ++i) { bucket[i] = 0; } #pragma GCC unroll 64 - for(int i = 0; i < n; ++i) { + for(uint32_t i = 0; i < n; ++i) { ++bucket[witch_bucket3(arr[i])]; } /* Prefix sum (like in Magyarsort) */ uint32_t prev = 0; #pragma GCC unroll 4 - for (int i = 0; i < 4096; i++) { + for (uint32_t i = 0; i < 4096; i++) { bucket[i] += prev; prev = bucket[i]; } /* Save end-offsets */ #pragma GCC unroll 64 - for(int i = 0; i < 4096; ++i) { + for(uint32_t i = 0; i < 4096; ++i) { bucket_end[i] = bucket[i]; } @@ -78,18 +78,18 @@ static inline void thiersort3(uint32_t *arr, uint32_t *temparr, int n) { /* Move to the buckets */ /* Rem.: This also changes bucket[i] so they will point to bucket beginnings */ #pragma GCC unroll 64 - for(int i = 0; i < n; ++i) { + for(uint32_t i = 0; i < n; ++i) { uint32_t num = arr[i]; uint32_t witch = witch_bucket3(num); - int offset = (--bucket[witch]); + uint32_t offset = (--bucket[witch]); temparr[offset] = num; } /* temparr -> arr each bucket and sort them in-place */ #pragma GCC unroll 64 - for(int b = 0; b < 4096; ++b) { - int begin = bucket[b]; - int end = bucket_end[b]; + for(uint32_t b = 0; b < 4096; ++b) { + uint32_t begin = bucket[b]; + uint32_t end = bucket_end[b]; /* Ensure exists */ if(begin >= end) { diff --git a/threepass_xbit.h b/threepass_xbit.h index 4298664..3e619a8 100644 --- a/threepass_xbit.h +++ b/threepass_xbit.h @@ -20,6 +20,34 @@ static inline constexpr uint32_t min3u32_xb(uint32_t a, uint32_t b, uint32_t c) ((b <= c) ? 
b : c); } +/** Copy each element of f into t at the slot chosen by its radix digit ((num >> shr) & mask), pre-decrementing bucket[digit] to get the target offset (f->t copy) */ +static inline void copy_radics_tpxp(uint32_t *f, uint32_t *t, uint32_t *bucket, uint32_t shr, uint32_t mask, uint32_t n) { + // right-to-left so that the order produced by the already sorted digits is kept across iterations + #pragma GCC unroll 48 + for(uint32_t i = n; i > 0; --i) { + // Prefetch caches + //__builtin_prefetch(&f[i-8]); + // Get num and its new offset / location + auto num = f[i - 1]; + auto bkeyni = (num >> shr) & mask; + auto offset = --bucket[bkeyni]; + + // Add to the proper target location + t[offset] = num; + } +} + +/* I pulled these out only for better flame graph support */ +/** Count occurrences of the three digits of every a[i] into bucket1/bucket2/bucket3 in one loop (can count together with good ILP) */ +static inline void count_occurences_tpxp(uint32_t *bucket1, uint32_t *bucket2, uint32_t *bucket3, const uint32_t shr1, const uint32_t shr2, const uint32_t shr3, const uint32_t mask1, const uint32_t mask2, const uint32_t mask3, uint32_t *a, uint32_t n) noexcept { + #pragma GCC unroll 64 + for(uint32_t i = 0; i < n; ++i) { + ++bucket1[(a[i] >> shr1) & mask1]; + ++bucket2[(a[i] >> shr2) & mask2]; + ++bucket3[(a[i] >> shr3) & mask3]; + } +} + /** * Simple three-pass (ok: 3 + 1) bottom-up internal radix sort writter for thiersort3 * @@ -27,17 +55,17 @@ static inline constexpr uint32_t min3u32_xb(uint32_t a, uint32_t b, uint32_t c) * @param buf Result array with the same size - result will be here * @param n The number of elements */ -static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { +static inline void threepass_xb(uint32_t *a, uint32_t *buf, uint32_t n) noexcept { assert(buf != NULL); - constexpr int shr1 = TPBX3 + TPBX2; - constexpr int shr2 = TPBX3; - constexpr int shr3 = 0; - constexpr int mask1 = (1 << TPBX1) - 1; - constexpr int mask2 = (1 << TPBX2) - 1; - constexpr int mask3 = (1 << TPBX3) - 1; + constexpr uint32_t shr1 = TPBX3 + TPBX2; + constexpr uint32_t shr2 = TPBX3; + constexpr uint32_t shr3 = 0; + constexpr uint32_t 
mask1 = (1 << TPBX1) - 1; + constexpr uint32_t mask2 = (1 << TPBX2) - 1; + constexpr uint32_t mask3 = (1 << TPBX3) - 1; /* helper buffers. */ - int sz = n * sizeof(a[0]); + uint32_t sz = n * sizeof(a[0]); static thread_local uint32_t bucket1[1 << TPBX1]; memset(bucket1, 0, (1 << TPBX1) * sizeof(uint32_t)); @@ -46,13 +74,7 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { static thread_local uint32_t bucket3[1 << TPBX3]; memset(bucket3, 0, (1 << TPBX3) * sizeof(uint32_t)); - /* Count occurences (can count together with good ILP) */ - #pragma GCC unroll 64 - for(uint32_t i = 0; i < n; ++i) { - ++bucket1[(a[i] >> shr1) & mask1]; - ++bucket2[(a[i] >> shr2) & mask2]; - ++bucket3[(a[i] >> shr3) & mask3]; - } + count_occurences_tpxp(bucket1, bucket2, bucket3, shr1, shr2, shr3, mask1, mask2, mask3, a, n); /* Count prefix sums - try as much ILP as possible because bigger arrays than usual! */ uint32_t prev1 = 0; @@ -63,7 +85,7 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { (1 << TPBX2), (1 << TPBX3) ); - int i = 0; + uint32_t i = 0; #pragma GCC unroll 8 for (; i < common; ++i) { bucket1[i] += prev1; @@ -74,63 +96,29 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { prev3 = bucket3[i]; } /* Do remaining 1 */ - for (int j = i; j < (1 << TPBX1); ++j) { + for (uint32_t j = i; j < (1 << TPBX1); ++j) { bucket1[j] += prev1; prev1 = bucket1[j]; } /* Do remaining 2 */ - for (int j = i; j< (1 << TPBX2); ++j) { + for (uint32_t j = i; j< (1 << TPBX2); ++j) { bucket2[j] += prev2; prev2 = bucket2[j]; } /* Do remaining 3 */ - for (int j = i; j < (1 << TPBX3); ++j) { + for (uint32_t j = i; j < (1 << TPBX3); ++j) { bucket3[j] += prev3; prev3 = bucket3[j]; } // Bottom digit a->buf - // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 48 - for(uint32_t i = n; i > 0; --i) { - // Prefetch caches - //__builtin_prefetch(&a[i-8]); - // Get num and its new 
offset / location - auto num = a[i - 1]; - auto bkeyni = (num >> shr3) & mask3; - auto offset = --bucket3[bkeyni]; + copy_radics_tpxp(a, buf, bucket3, shr3, mask3, n); - // Add to the proper target location - buf[offset] = num; - } // Mid digit buf->a - // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 48 - for(uint32_t i = n; i > 0; --i) { - // Prefetch caches - //__builtin_prefetch(&buf[i-8]); - // Get num and its new offset / location - auto num = buf[i - 1]; - auto bkeyni = (num >> shr2) & mask2; - auto offset = --bucket2[bkeyni]; + copy_radics_tpxp(buf, a, bucket2, shr2, mask2, n); - // Add to the proper target location - a[offset] = num; - } // Top digit a->buf - // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 48 - for(uint32_t i = n; i > 0; --i) { - // Prefetch caches - // __builtin_prefetch(&a[i-16]); - // Get num and its new offset / location - auto num = a[i - 1]; - auto bkeyni = (num >> shr1) & mask1; - auto offset = --bucket1[bkeyni]; - - // Add to the proper target location - buf[offset] = num; - } + copy_radics_tpxp(a, buf, bucket1, shr1, mask1, n); } #endif /* THREE_PASS_XB_H */ diff --git a/ypsu.cpp b/ypsu.cpp index e3afa07..48c6988 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -858,7 +858,8 @@ void measure_single(int n) { //measure(inputtype, "sp", [&] { spsort(&v[0], v.size()); }); //measure(inputtype, "magyar", [&] { MagyarSort::sort(&v[0], v.size()); }); //measure(inputtype, "thier2", [&] { do_thier2(&v[0], v.size()); }); - measure(inputtype, "threep", [&] { do_threepass(&v[0], v.size()); }); + //measure(inputtype, "threep", [&] { do_threepass(&v[0], v.size()); }); + measure(inputtype, "thier3", [&] { do_thier3(&v[0], v.size()); }); for (auto r : results) printf("%9.3fs", r.second); puts(""); @@ -895,8 +896,8 @@ int main(int argc, char **argv) { printf("Sorting %d elements:\n\n", n); // Uncomment this for profiling and alg! 
- //measure_single(n); - //return 0; + measure_single(n); + return 0; for (auto inputtype : inputtypes) { printf("%10s", inputtype.c_str());