From 523605e8d841733d7c398131ea50e356b35b88e3 Mon Sep 17 00:00:00 2001 From: Richard Thier Date: Mon, 29 Sep 2025 18:51:53 +0200 Subject: [PATCH] hand unrolled thiersort3 - I think its slower than gcc unrolling and surely more complex so I will revert --- threepass_xbit.h | 93 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 68 insertions(+), 25 deletions(-) diff --git a/threepass_xbit.h b/threepass_xbit.h index 4298664..65673e3 100644 --- a/threepass_xbit.h +++ b/threepass_xbit.h @@ -63,7 +63,7 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { (1 << TPBX2), (1 << TPBX3) ); - int i = 0; + uint32_t i = 0; #pragma GCC unroll 8 for (; i < common; ++i) { bucket1[i] += prev1; @@ -74,61 +74,104 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { prev3 = bucket3[i]; } /* Do remaining 1 */ - for (int j = i; j < (1 << TPBX1); ++j) { + for (uint32_t j = i; j < (1 << TPBX1); ++j) { bucket1[j] += prev1; prev1 = bucket1[j]; } /* Do remaining 2 */ - for (int j = i; j< (1 << TPBX2); ++j) { + for (uint32_t j = i; j< (1 << TPBX2); ++j) { bucket2[j] += prev2; prev2 = bucket2[j]; } /* Do remaining 3 */ - for (int j = i; j < (1 << TPBX3); ++j) { + for (uint32_t j = i; j < (1 << TPBX3); ++j) { bucket3[j] += prev3; prev3 = bucket3[j]; } // Bottom digit a->buf // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 48 - for(uint32_t i = n; i > 0; --i) { - // Prefetch caches - //__builtin_prefetch(&a[i-8]); - // Get num and its new offset / location + #pragma GCC unroll 13 + for(i = n; i >= 4; i -= 4) { + auto num0 = a[i - 1]; + auto bkeyni0 = (num0 >> shr3) & mask3; + auto offset0 = --bucket3[bkeyni0]; + buf[offset0] = num0; + auto num1 = a[i - 2]; + auto bkeyni1 = (num1 >> shr3) & mask3; + auto offset1 = --bucket3[bkeyni1]; + buf[offset1] = num1; + auto num2 = a[i - 3]; + auto bkeyni2 = (num2 >> shr3) & mask3; + auto offset2 = --bucket3[bkeyni2]; + buf[offset2] = num2; + auto num3 = a[i - 4]; + auto bkeyni3 = (num3 >> shr3) & mask3; + auto offset3 = --bucket3[bkeyni3]; + buf[offset3] = num3; + } + #pragma GCC unroll 4 + for(; i > 0; --i) { auto num = a[i - 1]; auto bkeyni = (num >> shr3) & mask3; auto offset = --bucket3[bkeyni]; - - // Add to the proper target location buf[offset] = num; } + // Mid digit buf->a // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 48 - for(uint32_t i = n; i > 0; --i) { - // Prefetch caches - //__builtin_prefetch(&buf[i-8]); - // Get num and its new offset / location + #pragma GCC unroll 13 + for(i = n; i >= 4; i -= 4) { + auto num0 = buf[i - 1]; + auto bkeyni0 = (num0 >> shr2) & mask2; + auto offset0 = --bucket2[bkeyni0]; + a[offset0] = num0; + auto num1 = buf[i - 2]; + auto bkeyni1 = (num1 >> shr2) & mask2; + auto offset1 = --bucket2[bkeyni1]; + a[offset1] = num1; + auto num2 = buf[i - 3]; + auto bkeyni2 = (num2 >> shr2) & mask2; + auto offset2 = --bucket2[bkeyni2]; + a[offset2] = num2; + auto num3 = buf[i - 4]; + auto bkeyni3 = (num3 >> shr2) & mask2; + auto offset3 = --bucket2[bkeyni3]; + a[offset3] = num3; + } + #pragma GCC unroll 4 + for(; i > 0; --i) { auto num = buf[i - 1]; auto bkeyni = (num >> shr2) & mask2; auto offset = --bucket2[bkeyni]; - - // Add to the proper target location a[offset] = num; } // Top digit a->buf // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 48 - for(uint32_t i = n; i > 0; --i) { - // Prefetch caches - // __builtin_prefetch(&a[i-16]); - // Get num and its new offset / location + #pragma GCC unroll 13 + for(i = n; i >= 4; i -= 4) { + auto num0 = a[i - 1]; + auto bkeyni0 = (num0 >> shr1) & mask1; + auto offset0 = --bucket1[bkeyni0]; + buf[offset0] = num0; + auto num1 = a[i - 2]; + auto bkeyni1 = (num1 >> shr1) & mask1; + auto offset1 = --bucket1[bkeyni1]; + buf[offset1] = num1; + auto num2 = a[i - 3]; + auto bkeyni2 = (num2 >> shr1) & mask1; + auto offset2 = --bucket1[bkeyni2]; + buf[offset2] = num2; + auto num3 = a[i - 4]; + auto bkeyni3 = (num3 >> shr1) & mask1; + auto offset3 = --bucket1[bkeyni3]; + buf[offset3] = num3; + } + #pragma GCC unroll 4 + for(; i > 0; --i) { auto num = a[i - 1]; auto bkeyni = (num >> shr1) & mask1; auto offset = --bucket1[bkeyni]; - - // Add to the proper target location buf[offset] = num; } }