diff --git a/threepass_xbit.h b/threepass_xbit.h index 4298664..5c87d27 100644 --- a/threepass_xbit.h +++ b/threepass_xbit.h @@ -55,9 +55,9 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { } /* Count prefix sums - try as much ILP as possible because bigger arrays than usual! */ - uint32_t prev1 = 0; - uint32_t prev2 = 0; - uint32_t prev3 = 0; + int prev1 = -1; + int prev2 = -1; + int prev3 = -1; uint32_t common = min3u32_xb( (1 << TPBX1), (1 << TPBX2), @@ -89,47 +89,37 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { prev3 = bucket3[j]; } - // Bottom digit a->buf - // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 48 - for(uint32_t i = n; i > 0; --i) { - // Prefetch caches - //__builtin_prefetch(&a[i-8]); - // Get num and its new offset / location - auto num = a[i - 1]; - auto bkeyni = (num >> shr3) & mask3; - auto offset = --bucket3[bkeyni]; + /* Move elements (the 3 pass): This utilizes frewr trick to overwrite keys (elements) by ror-ing them */ + /* This rewrites the key into its rotated state to save some pipeline stalls. (3 rotates happen and in the end its ok) */ + /* right-to-left to ensure already sorted digits order we keep for iterations */ - // Add to the proper target location - buf[offset] = num; + /* Bottom digit a->buf */ + #pragma GCC unroll 80 + for(uint32_t i = n; i > 0; --i) { + uint32_t num = a[i - 1]; + /* save rotated into the masked, bucketed loc */ + buf[bucket3[num & mask3]--] = (num >> TPBX3) | (num << (32 - TPBX3)); + //__builtin_prefetch(&buf[bucket3[num & mask3] - 2]); } - // Mid digit buf->a - // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 48 - for(uint32_t i = n; i > 0; --i) { - // Prefetch caches - //__builtin_prefetch(&buf[i-8]); - // Get num and its new offset / location - auto num = buf[i - 1]; - auto bkeyni = (num >> shr2) & mask2; - auto offset = --bucket2[bkeyni]; - // Add to the proper target location - a[offset] = num; + /* Mid digit buf->a */ + #pragma GCC unroll 80 + for(uint32_t i = n; i > 0; --i) { + uint32_t num = buf[i - 1]; + /* save rotated into the masked, bucketed loc */ + a[bucket2[num & mask2]--] = (num >> TPBX2) | (num << (32 - TPBX2)); + //__builtin_prefetch(&a[bucket2[num & mask2] - 2]); } - // Top digit a->buf - // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 48 - for(uint32_t i = n; i > 0; --i) { - // Prefetch caches - // __builtin_prefetch(&a[i-16]); - // Get num and its new offset / location - auto num = a[i - 1]; - auto bkeyni = (num >> shr1) & mask1; - auto offset = --bucket1[bkeyni]; - // Add to the proper target location - buf[offset] = num; + /* Top digit a->buf */ + #pragma GCC unroll 80 + for(uint32_t i = n; i > 0; --i) { + uint32_t num = a[i - 1]; + /* Reconstruct the original key in this element by where its stuff is stored */ + constexpr int rot = TPBX1 + (32 - TPBX1 - TPBX2 - TPBX3); + /* save rotated into the masked, bucketed loc */ + buf[bucket1[num & mask1]--] = (num >> rot) | (num << (32 - rot)); + //__builtin_prefetch(&buf[bucket1[num & mask1] - 2]); } }