32-wide manual unroll with 2x compiler unroll (#pragma GCC unroll 2)... still not as good perf as the automatic 48x unroll
This commit is contained in:
parent
6d79461262
commit
18b734a6e7
222
threepass_xbit.h
222
threepass_xbit.h
@ -98,8 +98,8 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
|||||||
|
|
||||||
// Bottom digit a->buf
|
// Bottom digit a->buf
|
||||||
// right-to-left so the order established by already-sorted digits is kept across iterations
|
// right-to-left so the order established by already-sorted digits is kept across iterations
|
||||||
#pragma GCC unroll 3
|
#pragma GCC unroll 2
|
||||||
for(i = n; i >= 16; i -= 16) {
|
for(i = n; i >= 32; i -= 32) {
|
||||||
// Prefetch the NEXT block (not current) at optimal distance
|
// Prefetch the NEXT block (not current) at optimal distance
|
||||||
if (i > 17) { // Ensure we don't prefetch out of bounds
|
if (i > 17) { // Ensure we don't prefetch out of bounds
|
||||||
__builtin_prefetch(&a[i - 17]);
|
__builtin_prefetch(&a[i - 17]);
|
||||||
@ -111,25 +111,57 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
|||||||
__builtin_prefetch(&a[i - 17*3]);
|
__builtin_prefetch(&a[i - 17*3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process 16 elements in reverse order
|
// Process 32 elements in reverse order
|
||||||
auto num15 = a[i - 1];
|
auto num31 = a[i - 1];
|
||||||
auto num14 = a[i - 2];
|
auto num30 = a[i - 2];
|
||||||
auto num13 = a[i - 3];
|
auto num29 = a[i - 3];
|
||||||
auto num12 = a[i - 4];
|
auto num28 = a[i - 4];
|
||||||
auto num11 = a[i - 5];
|
auto num27 = a[i - 5];
|
||||||
auto num10 = a[i - 6];
|
auto num26 = a[i - 6];
|
||||||
auto num9 = a[i - 7];
|
auto num25 = a[i - 7];
|
||||||
auto num8 = a[i - 8];
|
auto num24 = a[i - 8];
|
||||||
auto num7 = a[i - 9];
|
auto num23 = a[i - 9];
|
||||||
auto num6 = a[i - 10];
|
auto num22 = a[i - 10];
|
||||||
auto num5 = a[i - 11];
|
auto num21 = a[i - 11];
|
||||||
auto num4 = a[i - 12];
|
auto num20 = a[i - 12];
|
||||||
auto num3 = a[i - 13];
|
auto num19 = a[i - 13];
|
||||||
auto num2 = a[i - 14];
|
auto num18 = a[i - 14];
|
||||||
auto num1 = a[i - 15];
|
auto num17 = a[i - 15];
|
||||||
auto num0 = a[i - 16];
|
auto num16 = a[i - 16];
|
||||||
|
auto num15 = a[i - 17];
|
||||||
|
auto num14 = a[i - 18];
|
||||||
|
auto num13 = a[i - 19];
|
||||||
|
auto num12 = a[i - 20];
|
||||||
|
auto num11 = a[i - 21];
|
||||||
|
auto num10 = a[i - 22];
|
||||||
|
auto num9 = a[i - 23];
|
||||||
|
auto num8 = a[i - 24];
|
||||||
|
auto num7 = a[i - 25];
|
||||||
|
auto num6 = a[i - 26];
|
||||||
|
auto num5 = a[i - 27];
|
||||||
|
auto num4 = a[i - 28];
|
||||||
|
auto num3 = a[i - 29];
|
||||||
|
auto num2 = a[i - 30];
|
||||||
|
auto num1 = a[i - 31];
|
||||||
|
auto num0 = a[i - 32];
|
||||||
|
|
||||||
// Process all 16 elements (your bucket logic here)
|
// Process all 32 elements (bucket logic here)
|
||||||
|
tpxb_process_element(num31, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num30, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num29, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num28, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num27, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num26, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num25, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num24, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num23, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num22, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num21, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num20, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num19, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num18, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num17, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num16, buf, bucket3, shr3, mask3);
|
||||||
tpxb_process_element(num15, buf, bucket3, shr3, mask3);
|
tpxb_process_element(num15, buf, bucket3, shr3, mask3);
|
||||||
tpxb_process_element(num14, buf, bucket3, shr3, mask3);
|
tpxb_process_element(num14, buf, bucket3, shr3, mask3);
|
||||||
tpxb_process_element(num13, buf, bucket3, shr3, mask3);
|
tpxb_process_element(num13, buf, bucket3, shr3, mask3);
|
||||||
@ -147,7 +179,7 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
|||||||
tpxb_process_element(num1, buf, bucket3, shr3, mask3);
|
tpxb_process_element(num1, buf, bucket3, shr3, mask3);
|
||||||
tpxb_process_element(num0, buf, bucket3, shr3, mask3);
|
tpxb_process_element(num0, buf, bucket3, shr3, mask3);
|
||||||
}
|
}
|
||||||
// Handle remainder (less than 16 elements)
|
// Handle remainder
|
||||||
for(uint32_t j = i; j > 0; --j) {
|
for(uint32_t j = i; j > 0; --j) {
|
||||||
auto num = a[j - 1];
|
auto num = a[j - 1];
|
||||||
auto bkeyni = (num >> shr3) & mask3;
|
auto bkeyni = (num >> shr3) & mask3;
|
||||||
@ -157,8 +189,8 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
|||||||
|
|
||||||
// Mid digit buf->a
|
// Mid digit buf->a
|
||||||
// right-to-left so the order established by already-sorted digits is kept across iterations
|
// right-to-left so the order established by already-sorted digits is kept across iterations
|
||||||
#pragma GCC unroll 3
|
#pragma GCC unroll 2
|
||||||
for(i = n; i >= 16; i -= 16) {
|
for(i = n; i >= 32; i -= 32) {
|
||||||
// Prefetch the NEXT block (not current) at optimal distance
|
// Prefetch the NEXT block (not current) at optimal distance
|
||||||
if (i > 17) { // Ensure we don't prefetch out of bounds
|
if (i > 17) { // Ensure we don't prefetch out of bounds
|
||||||
__builtin_prefetch(&buf[i - 17]);
|
__builtin_prefetch(&buf[i - 17]);
|
||||||
@ -170,25 +202,57 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
|||||||
__builtin_prefetch(&buf[i - 17*3]);
|
__builtin_prefetch(&buf[i - 17*3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process 16 elements in reverse order
|
// Process 32 elements in reverse order
|
||||||
auto num15 = buf[i - 1];
|
auto num31 = buf[i - 1];
|
||||||
auto num14 = buf[i - 2];
|
auto num30 = buf[i - 2];
|
||||||
auto num13 = buf[i - 3];
|
auto num29 = buf[i - 3];
|
||||||
auto num12 = buf[i - 4];
|
auto num28 = buf[i - 4];
|
||||||
auto num11 = buf[i - 5];
|
auto num27 = buf[i - 5];
|
||||||
auto num10 = buf[i - 6];
|
auto num26 = buf[i - 6];
|
||||||
auto num9 = buf[i - 7];
|
auto num25 = buf[i - 7];
|
||||||
auto num8 = buf[i - 8];
|
auto num24 = buf[i - 8];
|
||||||
auto num7 = buf[i - 9];
|
auto num23 = buf[i - 9];
|
||||||
auto num6 = buf[i - 10];
|
auto num22 = buf[i - 10];
|
||||||
auto num5 = buf[i - 11];
|
auto num21 = buf[i - 11];
|
||||||
auto num4 = buf[i - 12];
|
auto num20 = buf[i - 12];
|
||||||
auto num3 = buf[i - 13];
|
auto num19 = buf[i - 13];
|
||||||
auto num2 = buf[i - 14];
|
auto num18 = buf[i - 14];
|
||||||
auto num1 = buf[i - 15];
|
auto num17 = buf[i - 15];
|
||||||
auto num0 = buf[i - 16];
|
auto num16 = buf[i - 16];
|
||||||
|
auto num15 = buf[i - 17];
|
||||||
|
auto num14 = buf[i - 18];
|
||||||
|
auto num13 = buf[i - 19];
|
||||||
|
auto num12 = buf[i - 20];
|
||||||
|
auto num11 = buf[i - 21];
|
||||||
|
auto num10 = buf[i - 22];
|
||||||
|
auto num9 = buf[i - 23];
|
||||||
|
auto num8 = buf[i - 24];
|
||||||
|
auto num7 = buf[i - 25];
|
||||||
|
auto num6 = buf[i - 26];
|
||||||
|
auto num5 = buf[i - 27];
|
||||||
|
auto num4 = buf[i - 28];
|
||||||
|
auto num3 = buf[i - 29];
|
||||||
|
auto num2 = buf[i - 30];
|
||||||
|
auto num1 = buf[i - 31];
|
||||||
|
auto num0 = buf[i - 32];
|
||||||
|
|
||||||
// Process all 16 elements (your bucket logic here)
|
// Process all 32 elements (bucket logic here)
|
||||||
|
tpxb_process_element(num31, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num30, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num29, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num28, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num27, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num26, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num25, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num24, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num23, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num22, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num21, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num20, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num19, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num18, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num17, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num16, a, bucket2, shr2, mask2);
|
||||||
tpxb_process_element(num15, a, bucket2, shr2, mask2);
|
tpxb_process_element(num15, a, bucket2, shr2, mask2);
|
||||||
tpxb_process_element(num14, a, bucket2, shr2, mask2);
|
tpxb_process_element(num14, a, bucket2, shr2, mask2);
|
||||||
tpxb_process_element(num13, a, bucket2, shr2, mask2);
|
tpxb_process_element(num13, a, bucket2, shr2, mask2);
|
||||||
@ -206,7 +270,7 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
|||||||
tpxb_process_element(num1, a, bucket2, shr2, mask2);
|
tpxb_process_element(num1, a, bucket2, shr2, mask2);
|
||||||
tpxb_process_element(num0, a, bucket2, shr2, mask2);
|
tpxb_process_element(num0, a, bucket2, shr2, mask2);
|
||||||
}
|
}
|
||||||
// Handle remainder (less than 16 elements)
|
// Handle remainder
|
||||||
for(uint32_t j = i; j > 0; --j) {
|
for(uint32_t j = i; j > 0; --j) {
|
||||||
auto num = buf[j - 1];
|
auto num = buf[j - 1];
|
||||||
auto bkeyni = (num >> shr2) & mask2;
|
auto bkeyni = (num >> shr2) & mask2;
|
||||||
@ -215,8 +279,8 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
|||||||
}
|
}
|
||||||
// Top digit a->buf
|
// Top digit a->buf
|
||||||
// right-to-left so the order established by already-sorted digits is kept across iterations
|
// right-to-left so the order established by already-sorted digits is kept across iterations
|
||||||
#pragma GCC unroll 3
|
#pragma GCC unroll 2
|
||||||
for(i = n; i >= 16; i -= 16) {
|
for(i = n; i >= 32; i -= 32) {
|
||||||
// Prefetch the NEXT block (not current) at optimal distance
|
// Prefetch the NEXT block (not current) at optimal distance
|
||||||
if (i > 17) { // Ensure we don't prefetch out of bounds
|
if (i > 17) { // Ensure we don't prefetch out of bounds
|
||||||
__builtin_prefetch(&a[i - 17]);
|
__builtin_prefetch(&a[i - 17]);
|
||||||
@ -228,25 +292,57 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
|||||||
__builtin_prefetch(&a[i - 17*3]);
|
__builtin_prefetch(&a[i - 17*3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process 16 elements in reverse order
|
// Process 32 elements in reverse order
|
||||||
auto num15 = a[i - 1];
|
auto num31 = a[i - 1];
|
||||||
auto num14 = a[i - 2];
|
auto num30 = a[i - 2];
|
||||||
auto num13 = a[i - 3];
|
auto num29 = a[i - 3];
|
||||||
auto num12 = a[i - 4];
|
auto num28 = a[i - 4];
|
||||||
auto num11 = a[i - 5];
|
auto num27 = a[i - 5];
|
||||||
auto num10 = a[i - 6];
|
auto num26 = a[i - 6];
|
||||||
auto num9 = a[i - 7];
|
auto num25 = a[i - 7];
|
||||||
auto num8 = a[i - 8];
|
auto num24 = a[i - 8];
|
||||||
auto num7 = a[i - 9];
|
auto num23 = a[i - 9];
|
||||||
auto num6 = a[i - 10];
|
auto num22 = a[i - 10];
|
||||||
auto num5 = a[i - 11];
|
auto num21 = a[i - 11];
|
||||||
auto num4 = a[i - 12];
|
auto num20 = a[i - 12];
|
||||||
auto num3 = a[i - 13];
|
auto num19 = a[i - 13];
|
||||||
auto num2 = a[i - 14];
|
auto num18 = a[i - 14];
|
||||||
auto num1 = a[i - 15];
|
auto num17 = a[i - 15];
|
||||||
auto num0 = a[i - 16];
|
auto num16 = a[i - 16];
|
||||||
|
auto num15 = a[i - 17];
|
||||||
|
auto num14 = a[i - 18];
|
||||||
|
auto num13 = a[i - 19];
|
||||||
|
auto num12 = a[i - 20];
|
||||||
|
auto num11 = a[i - 21];
|
||||||
|
auto num10 = a[i - 22];
|
||||||
|
auto num9 = a[i - 23];
|
||||||
|
auto num8 = a[i - 24];
|
||||||
|
auto num7 = a[i - 25];
|
||||||
|
auto num6 = a[i - 26];
|
||||||
|
auto num5 = a[i - 27];
|
||||||
|
auto num4 = a[i - 28];
|
||||||
|
auto num3 = a[i - 29];
|
||||||
|
auto num2 = a[i - 30];
|
||||||
|
auto num1 = a[i - 31];
|
||||||
|
auto num0 = a[i - 32];
|
||||||
|
|
||||||
// Process all 16 elements (your bucket logic here)
|
// Process all 32 elements (bucket logic here)
|
||||||
|
tpxb_process_element(num31, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num30, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num29, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num28, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num27, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num26, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num25, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num24, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num23, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num22, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num21, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num20, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num19, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num18, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num17, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num16, buf, bucket1, shr1, mask1);
|
||||||
tpxb_process_element(num15, buf, bucket1, shr1, mask1);
|
tpxb_process_element(num15, buf, bucket1, shr1, mask1);
|
||||||
tpxb_process_element(num14, buf, bucket1, shr1, mask1);
|
tpxb_process_element(num14, buf, bucket1, shr1, mask1);
|
||||||
tpxb_process_element(num13, buf, bucket1, shr1, mask1);
|
tpxb_process_element(num13, buf, bucket1, shr1, mask1);
|
||||||
@ -264,7 +360,7 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
|||||||
tpxb_process_element(num1, buf, bucket1, shr1, mask1);
|
tpxb_process_element(num1, buf, bucket1, shr1, mask1);
|
||||||
tpxb_process_element(num0, buf, bucket1, shr1, mask1);
|
tpxb_process_element(num0, buf, bucket1, shr1, mask1);
|
||||||
}
|
}
|
||||||
// Handle remainder (less than 16 elements)
|
// Handle remainder
|
||||||
for(uint32_t j = i; j > 0; --j) {
|
for(uint32_t j = i; j > 0; --j) {
|
||||||
auto num = a[j - 1];
|
auto num = a[j - 1];
|
||||||
auto bkeyni = (num >> shr1) & mask1;
|
auto bkeyni = (num >> shr1) & mask1;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user