diff --git a/threepass_xbit.h b/threepass_xbit.h index 7febb7f..840e779 100644 --- a/threepass_xbit.h +++ b/threepass_xbit.h @@ -98,8 +98,8 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { // Bottom digit a->buf // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 2 - for(i = n; i >= 32; i -= 32) { + #pragma GCC unroll 3 + for(i = n; i >= 16; i -= 16) { // Prefetch the NEXT block (not current) at optimal distance if (i > 17) { // Ensure we don't prefetch out of bounds __builtin_prefetch(&a[i - 17]); @@ -111,57 +111,25 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { __builtin_prefetch(&a[i - 17*3]); } - // Process 32 elements in reverse order - auto num31 = a[i - 1]; - auto num30 = a[i - 2]; - auto num29 = a[i - 3]; - auto num28 = a[i - 4]; - auto num27 = a[i - 5]; - auto num26 = a[i - 6]; - auto num25 = a[i - 7]; - auto num24 = a[i - 8]; - auto num23 = a[i - 9]; - auto num22 = a[i - 10]; - auto num21 = a[i - 11]; - auto num20 = a[i - 12]; - auto num19 = a[i - 13]; - auto num18 = a[i - 14]; - auto num17 = a[i - 15]; - auto num16 = a[i - 16]; - auto num15 = a[i - 17]; - auto num14 = a[i - 18]; - auto num13 = a[i - 19]; - auto num12 = a[i - 20]; - auto num11 = a[i - 21]; - auto num10 = a[i - 22]; - auto num9 = a[i - 23]; - auto num8 = a[i - 24]; - auto num7 = a[i - 25]; - auto num6 = a[i - 26]; - auto num5 = a[i - 27]; - auto num4 = a[i - 28]; - auto num3 = a[i - 29]; - auto num2 = a[i - 30]; - auto num1 = a[i - 31]; - auto num0 = a[i - 32]; + // Process 16 elements in reverse order + auto num15 = a[i - 1]; + auto num14 = a[i - 2]; + auto num13 = a[i - 3]; + auto num12 = a[i - 4]; + auto num11 = a[i - 5]; + auto num10 = a[i - 6]; + auto num9 = a[i - 7]; + auto num8 = a[i - 8]; + auto num7 = a[i - 9]; + auto num6 = a[i - 10]; + auto num5 = a[i - 11]; + auto num4 = a[i - 12]; + auto num3 = a[i - 13]; + auto num2 = a[i - 14]; + auto num1 = a[i - 15]; + auto num0 = a[i - 16]; - // Process all 32 elements (bucket logic here) - tpxb_process_element(num31, buf, bucket3, shr3, mask3); - tpxb_process_element(num30, buf, bucket3, shr3, mask3); - tpxb_process_element(num29, buf, bucket3, shr3, mask3); - tpxb_process_element(num28, buf, bucket3, shr3, mask3); - tpxb_process_element(num27, buf, bucket3, shr3, mask3); - tpxb_process_element(num26, buf, bucket3, shr3, mask3); - tpxb_process_element(num25, buf, bucket3, shr3, mask3); - tpxb_process_element(num24, buf, bucket3, shr3, mask3); - tpxb_process_element(num23, buf, bucket3, shr3, mask3); - tpxb_process_element(num22, buf, bucket3, shr3, mask3); - tpxb_process_element(num21, buf, bucket3, shr3, mask3); - tpxb_process_element(num20, buf, bucket3, shr3, mask3); - tpxb_process_element(num19, buf, bucket3, shr3, mask3); - tpxb_process_element(num18, buf, bucket3, shr3, mask3); - tpxb_process_element(num17, buf, bucket3, shr3, mask3); - tpxb_process_element(num16, buf, bucket3, shr3, mask3); + // Process all 16 elements (your bucket logic here) tpxb_process_element(num15, buf, bucket3, shr3, mask3); tpxb_process_element(num14, buf, bucket3, shr3, mask3); tpxb_process_element(num13, buf, bucket3, shr3, mask3); @@ -179,7 +147,7 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { tpxb_process_element(num1, buf, bucket3, shr3, mask3); tpxb_process_element(num0, buf, bucket3, shr3, mask3); } - // Handle remainder + // Handle remainder (less than 16 elements) for(uint32_t j = i; j > 0; --j) { auto num = a[j - 1]; auto bkeyni = (num >> shr3) & mask3; @@ -189,8 +157,8 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { // Mid digit buf->a // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 2 - for(i = n; i >= 32; i -= 32) { + #pragma GCC unroll 3 + for(i = n; i >= 16; i -= 16) { // Prefetch the NEXT block (not current) at optimal distance if (i > 17) { // Ensure we don't prefetch out of bounds __builtin_prefetch(&buf[i - 17]); @@ -202,57 +170,25 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { __builtin_prefetch(&buf[i - 17*3]); } - // Process 32 elements in reverse order - auto num31 = buf[i - 1]; - auto num30 = buf[i - 2]; - auto num29 = buf[i - 3]; - auto num28 = buf[i - 4]; - auto num27 = buf[i - 5]; - auto num26 = buf[i - 6]; - auto num25 = buf[i - 7]; - auto num24 = buf[i - 8]; - auto num23 = buf[i - 9]; - auto num22 = buf[i - 10]; - auto num21 = buf[i - 11]; - auto num20 = buf[i - 12]; - auto num19 = buf[i - 13]; - auto num18 = buf[i - 14]; - auto num17 = buf[i - 15]; - auto num16 = buf[i - 16]; - auto num15 = buf[i - 17]; - auto num14 = buf[i - 18]; - auto num13 = buf[i - 19]; - auto num12 = buf[i - 20]; - auto num11 = buf[i - 21]; - auto num10 = buf[i - 22]; - auto num9 = buf[i - 23]; - auto num8 = buf[i - 24]; - auto num7 = buf[i - 25]; - auto num6 = buf[i - 26]; - auto num5 = buf[i - 27]; - auto num4 = buf[i - 28]; - auto num3 = buf[i - 29]; - auto num2 = buf[i - 30]; - auto num1 = buf[i - 31]; - auto num0 = buf[i - 32]; + // Process 16 elements in reverse order + auto num15 = buf[i - 1]; + auto num14 = buf[i - 2]; + auto num13 = buf[i - 3]; + auto num12 = buf[i - 4]; + auto num11 = buf[i - 5]; + auto num10 = buf[i - 6]; + auto num9 = buf[i - 7]; + auto num8 = buf[i - 8]; + auto num7 = buf[i - 9]; + auto num6 = buf[i - 10]; + auto num5 = buf[i - 11]; + auto num4 = buf[i - 12]; + auto num3 = buf[i - 13]; + auto num2 = buf[i - 14]; + auto num1 = buf[i - 15]; + auto num0 = buf[i - 16]; - // Process all 32 elements (bucket logic here) - tpxb_process_element(num31, a, bucket2, shr2, mask2); - tpxb_process_element(num30, a, bucket2, shr2, mask2); - tpxb_process_element(num29, a, bucket2, shr2, mask2); - tpxb_process_element(num28, a, bucket2, shr2, mask2); - tpxb_process_element(num27, a, bucket2, shr2, mask2); - tpxb_process_element(num26, a, bucket2, shr2, mask2); - tpxb_process_element(num25, a, bucket2, shr2, mask2); - tpxb_process_element(num24, a, bucket2, shr2, mask2); - tpxb_process_element(num23, a, bucket2, shr2, mask2); - tpxb_process_element(num22, a, bucket2, shr2, mask2); - tpxb_process_element(num21, a, bucket2, shr2, mask2); - tpxb_process_element(num20, a, bucket2, shr2, mask2); - tpxb_process_element(num19, a, bucket2, shr2, mask2); - tpxb_process_element(num18, a, bucket2, shr2, mask2); - tpxb_process_element(num17, a, bucket2, shr2, mask2); - tpxb_process_element(num16, a, bucket2, shr2, mask2); + // Process all 16 elements (your bucket logic here) tpxb_process_element(num15, a, bucket2, shr2, mask2); tpxb_process_element(num14, a, bucket2, shr2, mask2); tpxb_process_element(num13, a, bucket2, shr2, mask2); @@ -270,7 +206,7 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { tpxb_process_element(num1, a, bucket2, shr2, mask2); tpxb_process_element(num0, a, bucket2, shr2, mask2); } - // Handle remainder + // Handle remainder (less than 16 elements) for(uint32_t j = i; j > 0; --j) { auto num = buf[j - 1]; auto bkeyni = (num >> shr2) & mask2; @@ -279,8 +215,8 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { } // Top digit a->buf // right-to-left to ensure already sorted digits order we keep for iterations - #pragma GCC unroll 2 - for(i = n; i >= 32; i -= 32) { + #pragma GCC unroll 3 + for(i = n; i >= 16; i -= 16) { // Prefetch the NEXT block (not current) at optimal distance if (i > 17) { // Ensure we don't prefetch out of bounds __builtin_prefetch(&a[i - 17]); @@ -292,57 +228,25 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { __builtin_prefetch(&a[i - 17*3]); } - // Process 32 elements in reverse order - auto num31 = a[i - 1]; - auto num30 = a[i - 2]; - auto num29 = a[i - 3]; - auto num28 = a[i - 4]; - auto num27 = a[i - 5]; - auto num26 = a[i - 6]; - auto num25 = a[i - 7]; - auto num24 = a[i - 8]; - auto num23 = a[i - 9]; - auto num22 = a[i - 10]; - auto num21 = a[i - 11]; - auto num20 = a[i - 12]; - auto num19 = a[i - 13]; - auto num18 = a[i - 14]; - auto num17 = a[i - 15]; - auto num16 = a[i - 16]; - auto num15 = a[i - 17]; - auto num14 = a[i - 18]; - auto num13 = a[i - 19]; - auto num12 = a[i - 20]; - auto num11 = a[i - 21]; - auto num10 = a[i - 22]; - auto num9 = a[i - 23]; - auto num8 = a[i - 24]; - auto num7 = a[i - 25]; - auto num6 = a[i - 26]; - auto num5 = a[i - 27]; - auto num4 = a[i - 28]; - auto num3 = a[i - 29]; - auto num2 = a[i - 30]; - auto num1 = a[i - 31]; - auto num0 = a[i - 32]; + // Process 16 elements in reverse order + auto num15 = a[i - 1]; + auto num14 = a[i - 2]; + auto num13 = a[i - 3]; + auto num12 = a[i - 4]; + auto num11 = a[i - 5]; + auto num10 = a[i - 6]; + auto num9 = a[i - 7]; + auto num8 = a[i - 8]; + auto num7 = a[i - 9]; + auto num6 = a[i - 10]; + auto num5 = a[i - 11]; + auto num4 = a[i - 12]; + auto num3 = a[i - 13]; + auto num2 = a[i - 14]; + auto num1 = a[i - 15]; + auto num0 = a[i - 16]; - // Process all 32 elements (your bucket logic here) - tpxb_process_element(num31, buf, bucket1, shr1, mask1); - tpxb_process_element(num30, buf, bucket1, shr1, mask1); - tpxb_process_element(num29, buf, bucket1, shr1, mask1); - tpxb_process_element(num28, buf, bucket1, shr1, mask1); - tpxb_process_element(num27, buf, bucket1, shr1, mask1); - tpxb_process_element(num26, buf, bucket1, shr1, mask1); - tpxb_process_element(num25, buf, bucket1, shr1, mask1); - tpxb_process_element(num24, buf, bucket1, shr1, mask1); - tpxb_process_element(num23, buf, bucket1, shr1, mask1); - tpxb_process_element(num22, buf, bucket1, shr1, mask1); - tpxb_process_element(num21, buf, bucket1, shr1, mask1); - tpxb_process_element(num20, buf, bucket1, shr1, mask1); - tpxb_process_element(num19, buf, bucket1, shr1, mask1); - tpxb_process_element(num18, buf, bucket1, shr1, mask1); - tpxb_process_element(num17, buf, bucket1, shr1, mask1); - tpxb_process_element(num16, buf, bucket1, shr1, mask1); + // Process all 16 elements (your bucket logic here) tpxb_process_element(num15, buf, bucket1, shr1, mask1); tpxb_process_element(num14, buf, bucket1, shr1, mask1); tpxb_process_element(num13, buf, bucket1, shr1, mask1); @@ -360,7 +264,7 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept { tpxb_process_element(num1, buf, bucket1, shr1, mask1); tpxb_process_element(num0, buf, bucket1, shr1, mask1); } - // Handle remainder + // Handle remainder (less than 16 elements) for(uint32_t j = i; j > 0; --j) { auto num = a[j - 1]; auto bkeyni = (num >> shr1) & mask1;