tiny (hopefully) optimization for tpxb

This commit is contained in:
Richard Thier 2025-10-02 08:09:27 +02:00
parent ce121571ca
commit 2aa7de0d40

View File

@@ -94,27 +94,27 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
/* right-to-left to ensure already sorted digits order we keep for iterations */ /* right-to-left to ensure already sorted digits order we keep for iterations */
/* Bottom digit a->buf */ /* Bottom digit a->buf */
#pragma GCC unroll 80 #pragma GCC unroll 64
for(uint32_t i = n; i > 0; --i) { for(int i = n - 1; i >= 0; --i) {
uint32_t num = a[i - 1]; uint32_t num = a[i];
/* save rotated into the masked, bucketed loc */ /* save rotated into the masked, bucketed loc */
buf[bucket3[num & mask3]--] = (num >> TPBX3) | (num << (32 - TPBX3)); buf[bucket3[num & mask3]--] = (num >> TPBX3) | (num << (32 - TPBX3));
//__builtin_prefetch(&buf[bucket3[num & mask3] - 2]); //__builtin_prefetch(&buf[bucket3[num & mask3] - 2]);
} }
/* Mid digit buf->a */ /* Mid digit buf->a */
#pragma GCC unroll 80 #pragma GCC unroll 64
for(uint32_t i = n; i > 0; --i) { for(int i = n - 1; i >= 0; --i) {
uint32_t num = buf[i - 1]; uint32_t num = buf[i];
/* save rotated into the masked, bucketed loc */ /* save rotated into the masked, bucketed loc */
a[bucket2[num & mask2]--] = (num >> TPBX2) | (num << (32 - TPBX2)); a[bucket2[num & mask2]--] = (num >> TPBX2) | (num << (32 - TPBX2));
//__builtin_prefetch(&a[bucket2[num & mask2] - 2]); //__builtin_prefetch(&a[bucket2[num & mask2] - 2]);
} }
/* Top digit a->buf */ /* Top digit a->buf */
#pragma GCC unroll 80 #pragma GCC unroll 64
for(uint32_t i = n; i > 0; --i) { for(int i = n - 1; i >= 0; --i) {
uint32_t num = a[i - 1]; uint32_t num = a[i];
/* Reconstruct the original key in this element by where its stuff is stored */ /* Reconstruct the original key in this element by where its stuff is stored */
constexpr int rot = TPBX1 + (32 - TPBX1 - TPBX2 - TPBX3); constexpr int rot = TPBX1 + (32 - TPBX1 - TPBX2 - TPBX3);
/* save rotated into the masked, bucketed loc */ /* save rotated into the masked, bucketed loc */