From 66376651a3f27407623d9eee8514ed5a6dea5ccf Mon Sep 17 00:00:00 2001 From: Richard Thier Date: Thu, 2 Oct 2025 08:09:57 +0200 Subject: [PATCH] Revert "thier3: tricky rotation based state storing..." This reverts commit 1d1f151c0730314ee4370eb288bf1f8c09824b02. --- threepass_xbit.h | 68 +++++++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/threepass_xbit.h b/threepass_xbit.h index 5c87d27..4298664 100644 --- a/threepass_xbit.h +++ b/threepass_xbit.h @@ -55,9 +55,9 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { } /* Count prefix sums - try as much ILP as possible because bigger arrays than usual! */ - int prev1 = -1; - int prev2 = -1; - int prev3 = -1; + uint32_t prev1 = 0; + uint32_t prev2 = 0; + uint32_t prev3 = 0; uint32_t common = min3u32_xb( (1 << TPBX1), (1 << TPBX2), @@ -89,37 +89,47 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { prev3 = bucket3[j]; } - /* Move elements (the 3 pass): This utilizes frewr trick to overwrite keys (elements) by ror-ing them */ - /* This rewrites the key into its rotated state to save some pipeline stalls. (3 rotates happen and in the end its ok) */ - /* right-to-left to ensure already sorted digits order we keep for iterations */ - - /* Bottom digit a->buf */ - #pragma GCC unroll 80 + // Bottom digit a->buf + // right-to-left to ensure already sorted digits order we keep for iterations + #pragma GCC unroll 48 for(uint32_t i = n; i > 0; --i) { - uint32_t num = a[i - 1]; - /* save rotated into the masked, bucketed loc */ - buf[bucket3[num & mask3]--] = (num >> TPBX3) | (num << (32 - TPBX3)); - //__builtin_prefetch(&buf[bucket3[num & mask3] - 2]); + // Prefetch caches + //__builtin_prefetch(&a[i-8]); + // Get num and its new offset / location + auto num = a[i - 1]; + auto bkeyni = (num >> shr3) & mask3; + auto offset = --bucket3[bkeyni]; + + // Add to the proper target location + buf[offset] = num; } - - /* Mid digit buf->a */ - #pragma GCC unroll 80 + // Mid digit buf->a + // right-to-left to ensure already sorted digits order we keep for iterations + #pragma GCC unroll 48 for(uint32_t i = n; i > 0; --i) { - uint32_t num = buf[i - 1]; - /* save rotated into the masked, bucketed loc */ - a[bucket2[num & mask2]--] = (num >> TPBX2) | (num << (32 - TPBX2)); - //__builtin_prefetch(&a[bucket2[num & mask2] - 2]); + // Prefetch caches + //__builtin_prefetch(&buf[i-8]); + // Get num and its new offset / location + auto num = buf[i - 1]; + auto bkeyni = (num >> shr2) & mask2; + auto offset = --bucket2[bkeyni]; + + // Add to the proper target location + a[offset] = num; } - - /* Top digit a->buf */ - #pragma GCC unroll 80 + // Top digit a->buf + // right-to-left to ensure already sorted digits order we keep for iterations + #pragma GCC unroll 48 for(uint32_t i = n; i > 0; --i) { - uint32_t num = a[i - 1]; - /* Reconstruct the original key in this element by where its stuff is stored */ - constexpr int rot = TPBX1 + (32 - TPBX1 - TPBX2 - TPBX3); - /* save rotated into the masked, bucketed loc */ - buf[bucket1[num & mask1]--] = (num >> rot) | (num << (32 - rot)); - //__builtin_prefetch(&buf[bucket1[num & mask1] - 2]); + // Prefetch caches + // __builtin_prefetch(&a[i-16]); + // Get num and its new offset / location + auto num = a[i - 1]; + auto bkeyni = (num >> shr1) & mask1; + auto offset = --bucket1[bkeyni]; + + // Add to the proper target location + buf[offset] = num; } }