From 08cb90bb1b982fb95cd0196ac1f77df060f4ed68 Mon Sep 17 00:00:00 2001 From: Richard Thier Date: Tue, 30 Sep 2025 22:18:10 +0200 Subject: [PATCH] Revert "prepared for flame graph analysis" This reverts commit ac873f7123c0dd23ff9d73668e005c71944a8afa. --- .gitmodules | 3 -- FlameGraph | 1 - makefile | 3 +- thiersort3.h | 24 ++++++------ threepass_xbit.h | 100 ++++++++++++++++++++++++++--------------------- ypsu.cpp | 7 ++-- 6 files changed, 72 insertions(+), 66 deletions(-) delete mode 160000 FlameGraph diff --git a/.gitmodules b/.gitmodules index e6b053a..95d1c10 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ [submodule "vergesort"] path = vergesort url = https://github.com/Morwenn/vergesort -[submodule "FlameGraph"] - path = FlameGraph - url = https://github.com/brendangregg/FlameGraph diff --git a/FlameGraph b/FlameGraph deleted file mode 160000 index 41fee1f..0000000 --- a/FlameGraph +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 41fee1f99f9276008b7cd112fca19dc3ea84ac32 diff --git a/makefile b/makefile index 2ceaada..7ac27c0 100644 --- a/makefile +++ b/makefile @@ -24,8 +24,7 @@ release_ypsu_assert: ypsu.cpp magyarsort.h release_ypsu_debug_sym: ypsu.cpp magyarsort.h g++ ypsu.cpp -g -std=c++17 -O2 -o ypsu.out -release_ypsu_noinline_debug_sym: ypsu.cpp magyarsort.h thiersort3.h - g++ ypsu.cpp -g -std=c++17 -O2 -fno-inline -fno-inline-functions -fno-inline-functions-called-once -fno-inline-functions-called-once -fno-inline-small-functions -fno-ipa-cp -fno-ipa-sra -fno-early-inlining -fno-omit-frame-pointer -fno-optimize-sibling-calls -o ypsu.out + release3: test.cpp magyarsort.h g++ test.cpp -DNDEBUG -std=c++17 -O3 -o test.out diff --git a/thiersort3.h b/thiersort3.h index 546f2e4..ecd039e 100644 --- a/thiersort3.h +++ b/thiersort3.h @@ -41,9 +41,9 @@ static inline uint32_t witch_bucket3(uint32_t key) { * @param n Number of elements in arr and temparr * @param rstate Create with sch_rand_state rstate = schwab_rand_state(junk_uint32_t); */ -static inline void thiersort3(uint32_t *arr, uint32_t *temparr, uint32_t n) { - uint32_t bucket[4096]; /* Inclusive */ - uint32_t bucket_end[4096]; /* Not inclusive */ +static inline void thiersort3(uint32_t *arr, uint32_t *temparr, int n) { + int bucket[4096]; /* Inclusive */ + int bucket_end[4096]; /* Not inclusive */ /* Check if need to sort at all - needed for invariants later */ if(n < 2) { @@ -52,25 +52,25 @@ static inline void thiersort3(uint32_t *arr, uint32_t *temparr, uint32_t n) { /* Count */ #pragma GCC unroll 64 - for(uint32_t i = 0; i < 4096; ++i) { + for(int i = 0; i < 4096; ++i) { bucket[i] = 0; } #pragma GCC unroll 64 - for(uint32_t i = 0; i < n; ++i) { + for(int i = 0; i < n; ++i) { ++bucket[witch_bucket3(arr[i])]; } /* Prefix sum (like in Magyarsort) */ uint32_t prev = 0; #pragma GCC unroll 4 - for (uint32_t i = 0; i < 4096; i++) { + for (int i = 0; i < 4096; i++) { bucket[i] += prev; prev = bucket[i]; } /* Save end-offsets */ #pragma GCC unroll 64 - for(uint32_t i = 0; i < 4096; ++i) { + for(int i = 0; i < 4096; ++i) { bucket_end[i] = bucket[i]; } @@ -78,18 +78,18 @@ static inline void thiersort3(uint32_t *arr, uint32_t *temparr, uint32_t n) { /* Move to the buckets */ /* Rem.: This also changes bucket[i] so they will point to bucket beginnings */ #pragma GCC unroll 64 - for(uint32_t i = 0; i < n; ++i) { + for(int i = 0; i < n; ++i) { uint32_t num = arr[i]; uint32_t witch = witch_bucket3(num); - uint32_t offset = (--bucket[witch]); + int offset = (--bucket[witch]); temparr[offset] = num; } /* temparr -> arr each bucket and sort them in-place */ #pragma GCC unroll 64 - for(uint32_t b = 0; b < 4096; ++b) { - uint32_t begin = bucket[b]; - uint32_t end = bucket_end[b]; + for(int b = 0; b < 4096; ++b) { + int begin = bucket[b]; + int end = bucket_end[b]; /* Ensure exists */ if(begin >= end) { diff --git a/threepass_xbit.h b/threepass_xbit.h index 3e619a8..4298664 100644 --- a/threepass_xbit.h +++ b/threepass_xbit.h @@ -20,34 +20,6 @@ static inline constexpr uint32_t min3u32_xb(uint32_t a, uint32_t b, uint32_t c) ((b <= c) ? b : c); } -/** Copy the elements to their respective radics-place (f->t copy) */ -static inline void copy_radics_tpxp(uint32_t *f, uint32_t *t, uint32_t *bucket, uint32_t shr, uint32_t mask, uint32_t n) { - // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 48 - for(uint32_t i = n; i > 0; --i) { - // Prefetch caches - //__builtin_prefetch(&a[i-8]); - // Get num and its new offset / location - auto num = f[i - 1]; - auto bkeyni = (num >> shr) & mask; - auto offset = --bucket[bkeyni]; - - // Add to the proper target location - t[offset] = num; - } -} - -/* I pulled these out only for better flame graph support */ -/** Count occurences (can count together with good ILP) */ -static inline void count_occurences_tpxp(uint32_t *bucket1, uint32_t *bucket2, uint32_t *bucket3, const uint32_t shr1, const uint32_t shr2, const uint32_t shr3, const uint32_t mask1, const uint32_t mask2, const uint32_t mask3, uint32_t *a, uint32_t n) noexcept { - #pragma GCC unroll 64 - for(uint32_t i = 0; i < n; ++i) { - ++bucket1[(a[i] >> shr1) & mask1]; - ++bucket2[(a[i] >> shr2) & mask2]; - ++bucket3[(a[i] >> shr3) & mask3]; - } -} - /** * Simple three-pass (ok: 3 + 1) bottom-up internal radix sort writter for thiersort3 * @@ -55,17 +27,17 @@ static inline void count_occurences_tpxp(uint32_t *bucket1, uint32_t *bucket2, u * @param buf Result array with the same size - result will be here * @param n The number of elements */ -static inline void threepass_xb(uint32_t *a, uint32_t *buf, uint32_t n) noexcept { +static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { assert(buf != NULL); - constexpr uint32_t shr1 = TPBX3 + TPBX2; - constexpr uint32_t shr2 = TPBX3; - constexpr uint32_t shr3 = 0; - constexpr uint32_t mask1 = (1 << TPBX1) - 1; - constexpr uint32_t mask2 = (1 << TPBX2) - 1; - constexpr uint32_t mask3 = (1 << TPBX3) - 1; + constexpr int shr1 = TPBX3 + TPBX2; + constexpr int shr2 = TPBX3; + constexpr int shr3 = 0; + constexpr int mask1 = (1 << TPBX1) - 1; + constexpr int mask2 = (1 << TPBX2) - 1; + constexpr int mask3 = (1 << TPBX3) - 1; /* helper buffers. */ - uint32_t sz = n * sizeof(a[0]); + int sz = n * sizeof(a[0]); static thread_local uint32_t bucket1[1 << TPBX1]; memset(bucket1, 0, (1 << TPBX1) * sizeof(uint32_t)); @@ -74,7 +46,13 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, uint32_t n) noexcept static thread_local uint32_t bucket3[1 << TPBX3]; memset(bucket3, 0, (1 << TPBX3) * sizeof(uint32_t)); - count_occurences_tpxp(bucket1, bucket2, bucket3, shr1, shr2, shr3, mask1, mask2, mask3, a, n); + /* Count occurences (can count together with good ILP) */ + #pragma GCC unroll 64 + for(uint32_t i = 0; i < n; ++i) { + ++bucket1[(a[i] >> shr1) & mask1]; + ++bucket2[(a[i] >> shr2) & mask2]; + ++bucket3[(a[i] >> shr3) & mask3]; + } /* Count prefix sums - try as much ILP as possible because bigger arrays than usual! */ uint32_t prev1 = 0; @@ -85,7 +63,7 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, uint32_t n) noexcept (1 << TPBX2), (1 << TPBX3) ); - uint32_t i = 0; + int i = 0; #pragma GCC unroll 8 for (; i < common; ++i) { bucket1[i] += prev1; @@ -96,29 +74,63 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, uint32_t n) noexcept prev3 = bucket3[i]; } /* Do remaining 1 */ - for (uint32_t j = i; j < (1 << TPBX1); ++j) { + for (int j = i; j < (1 << TPBX1); ++j) { bucket1[j] += prev1; prev1 = bucket1[j]; } /* Do remaining 2 */ - for (uint32_t j = i; j< (1 << TPBX2); ++j) { + for (int j = i; j< (1 << TPBX2); ++j) { bucket2[j] += prev2; prev2 = bucket2[j]; } /* Do remaining 3 */ - for (uint32_t j = i; j < (1 << TPBX3); ++j) { + for (int j = i; j < (1 << TPBX3); ++j) { bucket3[j] += prev3; prev3 = bucket3[j]; } // Bottom digit a->buf - copy_radics_tpxp(a, buf, bucket3, shr3, mask3, n); + // right-to-left to ensure already sorted digits order we keep for iterations + #pragma GCC unroll 48 + for(uint32_t i = n; i > 0; --i) { + // Prefetch caches + //__builtin_prefetch(&a[i-8]); + // Get num and its new offset / location + auto num = a[i - 1]; + auto bkeyni = (num >> shr3) & mask3; + auto offset = --bucket3[bkeyni]; + // Add to the proper target location + buf[offset] = num; + } // Mid digit buf->a - copy_radics_tpxp(buf, a, bucket2, shr2, mask2, n); + // right-to-left to ensure already sorted digits order we keep for iterations + #pragma GCC unroll 48 + for(uint32_t i = n; i > 0; --i) { + // Prefetch caches + //__builtin_prefetch(&buf[i-8]); + // Get num and its new offset / location + auto num = buf[i - 1]; + auto bkeyni = (num >> shr2) & mask2; + auto offset = --bucket2[bkeyni]; + // Add to the proper target location + a[offset] = num; + } // Top digit a->buf - copy_radics_tpxp(a, buf, bucket1, shr1, mask1, n); + // right-to-left to ensure already sorted digits order we keep for iterations + #pragma GCC unroll 48 + for(uint32_t i = n; i > 0; --i) { + // Prefetch caches + // __builtin_prefetch(&a[i-16]); + // Get num and its new offset / location + auto num = a[i - 1]; + auto bkeyni = (num >> shr1) & mask1; + auto offset = --bucket1[bkeyni]; + + // Add to the proper target location + buf[offset] = num; + } } #endif /* THREE_PASS_XB_H */ diff --git a/ypsu.cpp b/ypsu.cpp index 48c6988..e3afa07 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -858,8 +858,7 @@ void measure_single(int n) { //measure(inputtype, "sp", [&] { spsort(&v[0], v.size()); }); //measure(inputtype, "magyar", [&] { MagyarSort::sort(&v[0], v.size()); }); //measure(inputtype, "thier2", [&] { do_thier2(&v[0], v.size()); }); - //measure(inputtype, "threep", [&] { do_threepass(&v[0], v.size()); }); - measure(inputtype, "thier3", [&] { do_thier3(&v[0], v.size()); }); + measure(inputtype, "threep", [&] { do_threepass(&v[0], v.size()); }); for (auto r : results) printf("%9.3fs", r.second); puts(""); @@ -896,8 +895,8 @@ int main(int argc, char **argv) { printf("Sorting %d elements:\n\n", n); // Uncomment this for profiling and alg! - measure_single(n); - return 0; + //measure_single(n); + //return 0; for (auto inputtype : inputtypes) { printf("%10s", inputtype.c_str());