Revert "thier3: tricky rotation based state storing..."

This reverts commit 1d1f151c0730314ee4370eb288bf1f8c09824b02.
This commit is contained in:
Richard Thier 2025-10-02 08:09:57 +02:00
parent 74e24486f4
commit 66376651a3

View File

@@ -55,9 +55,9 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
} }
/* Count prefix sums - try as much ILP as possible because bigger arrays than usual! */ /* Count prefix sums - try as much ILP as possible because bigger arrays than usual! */
int prev1 = -1; uint32_t prev1 = 0;
int prev2 = -1; uint32_t prev2 = 0;
int prev3 = -1; uint32_t prev3 = 0;
uint32_t common = min3u32_xb( uint32_t common = min3u32_xb(
(1 << TPBX1), (1 << TPBX1),
(1 << TPBX2), (1 << TPBX2),
@@ -89,37 +89,47 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
prev3 = bucket3[j]; prev3 = bucket3[j];
} }
/* Move elements (the 3 pass): This utilizes frewr trick to overwrite keys (elements) by ror-ing them */ // Bottom digit a->buf
/* This rewrites the key into its rotated state to save some pipeline stalls. (3 rotates happen and in the end its ok) */ // right-to-left to ensure already sorted digits order we keep for iterations
/* right-to-left to ensure already sorted digits order we keep for iterations */ #pragma GCC unroll 48
/* Bottom digit a->buf */
#pragma GCC unroll 80
for(uint32_t i = n; i > 0; --i) { for(uint32_t i = n; i > 0; --i) {
uint32_t num = a[i - 1]; // Prefetch caches
/* save rotated into the masked, bucketed loc */ //__builtin_prefetch(&a[i-8]);
buf[bucket3[num & mask3]--] = (num >> TPBX3) | (num << (32 - TPBX3)); // Get num and its new offset / location
//__builtin_prefetch(&buf[bucket3[num & mask3] - 2]); auto num = a[i - 1];
auto bkeyni = (num >> shr3) & mask3;
auto offset = --bucket3[bkeyni];
// Add to the proper target location
buf[offset] = num;
} }
// Mid digit buf->a
/* Mid digit buf->a */ // right-to-left to ensure already sorted digits order we keep for iterations
#pragma GCC unroll 80 #pragma GCC unroll 48
for(uint32_t i = n; i > 0; --i) { for(uint32_t i = n; i > 0; --i) {
uint32_t num = buf[i - 1]; // Prefetch caches
/* save rotated into the masked, bucketed loc */ //__builtin_prefetch(&buf[i-8]);
a[bucket2[num & mask2]--] = (num >> TPBX2) | (num << (32 - TPBX2)); // Get num and its new offset / location
//__builtin_prefetch(&a[bucket2[num & mask2] - 2]); auto num = buf[i - 1];
auto bkeyni = (num >> shr2) & mask2;
auto offset = --bucket2[bkeyni];
// Add to the proper target location
a[offset] = num;
} }
// Top digit a->buf
/* Top digit a->buf */ // right-to-left to ensure already sorted digits order we keep for iterations
#pragma GCC unroll 80 #pragma GCC unroll 48
for(uint32_t i = n; i > 0; --i) { for(uint32_t i = n; i > 0; --i) {
uint32_t num = a[i - 1]; // Prefetch caches
/* Reconstruct the original key in this element by where its stuff is stored */ // __builtin_prefetch(&a[i-16]);
constexpr int rot = TPBX1 + (32 - TPBX1 - TPBX2 - TPBX3); // Get num and its new offset / location
/* save rotated into the masked, bucketed loc */ auto num = a[i - 1];
buf[bucket1[num & mask1]--] = (num >> rot) | (num << (32 - rot)); auto bkeyni = (num >> shr1) & mask1;
//__builtin_prefetch(&buf[bucket1[num & mask1] - 2]); auto offset = --bucket1[bkeyni];
// Add to the proper target location
buf[offset] = num;
} }
} }