diff --git a/threepass_xbit.h b/threepass_xbit.h index 65673e3..4298664 100644 --- a/threepass_xbit.h +++ b/threepass_xbit.h @@ -63,7 +63,7 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { (1 << TPBX2), (1 << TPBX3) ); - uint32_t i = 0; + int i = 0; #pragma GCC unroll 8 for (; i < common; ++i) { bucket1[i] += prev1; @@ -74,104 +74,61 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { prev3 = bucket3[i]; } /* Do remaining 1 */ - for (uint32_t j = i; j < (1 << TPBX1); ++j) { + for (int j = i; j < (1 << TPBX1); ++j) { bucket1[j] += prev1; prev1 = bucket1[j]; } /* Do remaining 2 */ - for (uint32_t j = i; j< (1 << TPBX2); ++j) { + for (int j = i; j< (1 << TPBX2); ++j) { bucket2[j] += prev2; prev2 = bucket2[j]; } /* Do remaining 3 */ - for (uint32_t j = i; j < (1 << TPBX3); ++j) { + for (int j = i; j < (1 << TPBX3); ++j) { bucket3[j] += prev3; prev3 = bucket3[j]; } // Bottom digit a->buf // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 13 - for(i = n; i >= 4; i -= 4) { - auto num0 = a[i - 1]; - auto bkeyni0 = (num0 >> shr3) & mask3; - auto offset0 = --bucket3[bkeyni0]; - buf[offset0] = num0; - auto num1 = a[i - 2]; - auto bkeyni1 = (num1 >> shr3) & mask3; - auto offset1 = --bucket3[bkeyni1]; - buf[offset1] = num1; - auto num2 = a[i - 3]; - auto bkeyni2 = (num2 >> shr3) & mask3; - auto offset2 = --bucket3[bkeyni2]; - buf[offset2] = num2; - auto num3 = a[i - 4]; - auto bkeyni3 = (num3 >> shr3) & mask3; - auto offset3 = --bucket3[bkeyni3]; - buf[offset3] = num3; - } - #pragma GCC unroll 4 - for(; i > 0; --i) { + #pragma GCC unroll 48 + for(uint32_t i = n; i > 0; --i) { + // Prefetch caches + //__builtin_prefetch(&a[i-8]); + // Get num and its new offset / location auto num = a[i - 1]; auto bkeyni = (num >> shr3) & mask3; auto offset = --bucket3[bkeyni]; + + // Add to the proper target location buf[offset] = num; } - // Mid digit buf->a // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 13 - for(i = n; i >= 4; i -= 4) { - auto num0 = buf[i - 1]; - auto bkeyni0 = (num0 >> shr2) & mask2; - auto offset0 = --bucket2[bkeyni0]; - a[offset0] = num0; - auto num1 = buf[i - 2]; - auto bkeyni1 = (num1 >> shr2) & mask2; - auto offset1 = --bucket2[bkeyni1]; - a[offset1] = num1; - auto num2 = buf[i - 3]; - auto bkeyni2 = (num2 >> shr2) & mask2; - auto offset2 = --bucket2[bkeyni2]; - a[offset2] = num2; - auto num3 = buf[i - 4]; - auto bkeyni3 = (num3 >> shr2) & mask2; - auto offset3 = --bucket2[bkeyni3]; - a[offset3] = num3; - } - #pragma GCC unroll 4 - for(; i > 0; --i) { + #pragma GCC unroll 48 + for(uint32_t i = n; i > 0; --i) { + // Prefetch caches + //__builtin_prefetch(&buf[i-8]); + // Get num and its new offset / location auto num = buf[i - 1]; auto bkeyni = (num >> shr2) & mask2; auto offset = --bucket2[bkeyni]; + + // Add to the proper target location a[offset] = num; } // Top digit a->buf // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 13 - for(i = n; i >= 4; i -= 4) { - auto num0 = a[i - 1]; - auto bkeyni0 = (num0 >> shr1) & mask1; - auto offset0 = --bucket1[bkeyni0]; - buf[offset0] = num0; - auto num1 = a[i - 2]; - auto bkeyni1 = (num1 >> shr1) & mask1; - auto offset1 = --bucket1[bkeyni1]; - buf[offset1] = num1; - auto num2 = a[i - 3]; - auto bkeyni2 = (num2 >> shr1) & mask1; - auto offset2 = --bucket1[bkeyni2]; - buf[offset2] = num2; - auto num3 = a[i - 4]; - auto bkeyni3 = (num3 >> shr1) & mask1; - auto offset3 = --bucket1[bkeyni3]; - buf[offset3] = num3; - } - #pragma GCC unroll 4 - for(; i > 0; --i) { + #pragma GCC unroll 48 + for(uint32_t i = n; i > 0; --i) { + // Prefetch caches + // __builtin_prefetch(&a[i-16]); + // Get num and its new offset / location auto num = a[i - 1]; auto bkeyni = (num >> shr1) & mask1; auto offset = --bucket1[bkeyni]; + + // Add to the proper target location buf[offset] = num; } }