Hand-unrolled thiersort3 - I think it's slower than GCC unrolling and surely more complex, so I will revert

This commit is contained in:
Richard Thier 2025-09-29 18:51:53 +02:00
parent a5cb0995e3
commit 523605e8d8

View File

@ -63,7 +63,7 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
(1 << TPBX2),
(1 << TPBX3)
);
int i = 0;
uint32_t i = 0;
#pragma GCC unroll 8
for (; i < common; ++i) {
bucket1[i] += prev1;
@ -74,61 +74,104 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
prev3 = bucket3[i];
}
/* Do remaining 1 */
for (int j = i; j < (1 << TPBX1); ++j) {
for (uint32_t j = i; j < (1 << TPBX1); ++j) {
bucket1[j] += prev1;
prev1 = bucket1[j];
}
/* Do remaining 2 */
for (int j = i; j< (1 << TPBX2); ++j) {
for (uint32_t j = i; j< (1 << TPBX2); ++j) {
bucket2[j] += prev2;
prev2 = bucket2[j];
}
/* Do remaining 3 */
for (int j = i; j < (1 << TPBX3); ++j) {
for (uint32_t j = i; j < (1 << TPBX3); ++j) {
bucket3[j] += prev3;
prev3 = bucket3[j];
}
// Bottom digit a->buf
// Iterate right-to-left so that the order established by already-sorted digits is preserved across passes
#pragma GCC unroll 48
for(uint32_t i = n; i > 0; --i) {
// Prefetch caches
//__builtin_prefetch(&a[i-8]);
// Get num and its new offset / location
#pragma GCC unroll 13
for(i = n; i >= 4; i -= 4) {
auto num0 = a[i - 1];
auto bkeyni0 = (num0 >> shr3) & mask3;
auto offset0 = --bucket3[bkeyni0];
buf[offset0] = num0;
auto num1 = a[i - 2];
auto bkeyni1 = (num1 >> shr3) & mask3;
auto offset1 = --bucket3[bkeyni1];
buf[offset1] = num1;
auto num2 = a[i - 3];
auto bkeyni2 = (num2 >> shr3) & mask3;
auto offset2 = --bucket3[bkeyni2];
buf[offset2] = num2;
auto num3 = a[i - 4];
auto bkeyni3 = (num3 >> shr3) & mask3;
auto offset3 = --bucket3[bkeyni3];
buf[offset3] = num3;
}
#pragma GCC unroll 4
for(; i > 0; --i) {
auto num = a[i - 1];
auto bkeyni = (num >> shr3) & mask3;
auto offset = --bucket3[bkeyni];
// Add to the proper target location
buf[offset] = num;
}
// Mid digit buf->a
// Iterate right-to-left so that the order established by already-sorted digits is preserved across passes
#pragma GCC unroll 48
for(uint32_t i = n; i > 0; --i) {
// Prefetch caches
//__builtin_prefetch(&buf[i-8]);
// Get num and its new offset / location
#pragma GCC unroll 13
for(i = n; i >= 4; i -= 4) {
auto num0 = buf[i - 1];
auto bkeyni0 = (num0 >> shr2) & mask2;
auto offset0 = --bucket2[bkeyni0];
a[offset0] = num0;
auto num1 = buf[i - 2];
auto bkeyni1 = (num1 >> shr2) & mask2;
auto offset1 = --bucket2[bkeyni1];
a[offset1] = num1;
auto num2 = buf[i - 3];
auto bkeyni2 = (num2 >> shr2) & mask2;
auto offset2 = --bucket2[bkeyni2];
a[offset2] = num2;
auto num3 = buf[i - 4];
auto bkeyni3 = (num3 >> shr2) & mask2;
auto offset3 = --bucket2[bkeyni3];
a[offset3] = num3;
}
#pragma GCC unroll 4
for(; i > 0; --i) {
auto num = buf[i - 1];
auto bkeyni = (num >> shr2) & mask2;
auto offset = --bucket2[bkeyni];
// Add to the proper target location
a[offset] = num;
}
// Top digit a->buf
// Iterate right-to-left so that the order established by already-sorted digits is preserved across passes
#pragma GCC unroll 48
for(uint32_t i = n; i > 0; --i) {
// Prefetch caches
// __builtin_prefetch(&a[i-16]);
// Get num and its new offset / location
#pragma GCC unroll 13
for(i = n; i >= 4; i -= 4) {
auto num0 = a[i - 1];
auto bkeyni0 = (num0 >> shr1) & mask1;
auto offset0 = --bucket1[bkeyni0];
buf[offset0] = num0;
auto num1 = a[i - 2];
auto bkeyni1 = (num1 >> shr1) & mask1;
auto offset1 = --bucket1[bkeyni1];
buf[offset1] = num1;
auto num2 = a[i - 3];
auto bkeyni2 = (num2 >> shr1) & mask1;
auto offset2 = --bucket1[bkeyni2];
buf[offset2] = num2;
auto num3 = a[i - 4];
auto bkeyni3 = (num3 >> shr1) & mask1;
auto offset3 = --bucket1[bkeyni3];
buf[offset3] = num3;
}
#pragma GCC unroll 4
for(; i > 0; --i) {
auto num = a[i - 1];
auto bkeyni = (num >> shr1) & mask1;
auto offset = --bucket1[bkeyni];
// Add to the proper target location
buf[offset] = num;
}
}