Hand-unrolled thiersort3 - I think it's slower than GCC's own unrolling and surely more complex, so I will revert

This commit is contained in:
Richard Thier 2025-09-29 18:51:53 +02:00
parent a5cb0995e3
commit 523605e8d8

View File

@ -63,7 +63,7 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
(1 << TPBX2), (1 << TPBX2),
(1 << TPBX3) (1 << TPBX3)
); );
int i = 0; uint32_t i = 0;
#pragma GCC unroll 8 #pragma GCC unroll 8
for (; i < common; ++i) { for (; i < common; ++i) {
bucket1[i] += prev1; bucket1[i] += prev1;
@ -74,61 +74,104 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
prev3 = bucket3[i]; prev3 = bucket3[i];
} }
/* Do remaining 1 */ /* Do remaining 1 */
for (int j = i; j < (1 << TPBX1); ++j) { for (uint32_t j = i; j < (1 << TPBX1); ++j) {
bucket1[j] += prev1; bucket1[j] += prev1;
prev1 = bucket1[j]; prev1 = bucket1[j];
} }
/* Do remaining 2 */ /* Do remaining 2 */
for (int j = i; j< (1 << TPBX2); ++j) { for (uint32_t j = i; j< (1 << TPBX2); ++j) {
bucket2[j] += prev2; bucket2[j] += prev2;
prev2 = bucket2[j]; prev2 = bucket2[j];
} }
/* Do remaining 3 */ /* Do remaining 3 */
for (int j = i; j < (1 << TPBX3); ++j) { for (uint32_t j = i; j < (1 << TPBX3); ++j) {
bucket3[j] += prev3; bucket3[j] += prev3;
prev3 = bucket3[j]; prev3 = bucket3[j];
} }
// Bottom digit a->buf // Bottom digit a->buf
// right-to-left to ensure already sorted digits order we keep for iterations // right-to-left to ensure already sorted digits order we keep for iterations
#pragma GCC unroll 48 #pragma GCC unroll 13
for(uint32_t i = n; i > 0; --i) { for(i = n; i >= 4; i -= 4) {
// Prefetch caches auto num0 = a[i - 1];
//__builtin_prefetch(&a[i-8]); auto bkeyni0 = (num0 >> shr3) & mask3;
// Get num and its new offset / location auto offset0 = --bucket3[bkeyni0];
buf[offset0] = num0;
auto num1 = a[i - 2];
auto bkeyni1 = (num1 >> shr3) & mask3;
auto offset1 = --bucket3[bkeyni1];
buf[offset1] = num1;
auto num2 = a[i - 3];
auto bkeyni2 = (num2 >> shr3) & mask3;
auto offset2 = --bucket3[bkeyni2];
buf[offset2] = num2;
auto num3 = a[i - 4];
auto bkeyni3 = (num3 >> shr3) & mask3;
auto offset3 = --bucket3[bkeyni3];
buf[offset3] = num3;
}
#pragma GCC unroll 4
for(; i > 0; --i) {
auto num = a[i - 1]; auto num = a[i - 1];
auto bkeyni = (num >> shr3) & mask3; auto bkeyni = (num >> shr3) & mask3;
auto offset = --bucket3[bkeyni]; auto offset = --bucket3[bkeyni];
// Add to the proper target location
buf[offset] = num; buf[offset] = num;
} }
// Mid digit buf->a // Mid digit buf->a
// right-to-left to ensure already sorted digits order we keep for iterations // right-to-left to ensure already sorted digits order we keep for iterations
#pragma GCC unroll 48 #pragma GCC unroll 13
for(uint32_t i = n; i > 0; --i) { for(i = n; i >= 4; i -= 4) {
// Prefetch caches auto num0 = buf[i - 1];
//__builtin_prefetch(&buf[i-8]); auto bkeyni0 = (num0 >> shr2) & mask2;
// Get num and its new offset / location auto offset0 = --bucket2[bkeyni0];
a[offset0] = num0;
auto num1 = buf[i - 2];
auto bkeyni1 = (num1 >> shr2) & mask2;
auto offset1 = --bucket2[bkeyni1];
a[offset1] = num1;
auto num2 = buf[i - 3];
auto bkeyni2 = (num2 >> shr2) & mask2;
auto offset2 = --bucket2[bkeyni2];
a[offset2] = num2;
auto num3 = buf[i - 4];
auto bkeyni3 = (num3 >> shr2) & mask2;
auto offset3 = --bucket2[bkeyni3];
a[offset3] = num3;
}
#pragma GCC unroll 4
for(; i > 0; --i) {
auto num = buf[i - 1]; auto num = buf[i - 1];
auto bkeyni = (num >> shr2) & mask2; auto bkeyni = (num >> shr2) & mask2;
auto offset = --bucket2[bkeyni]; auto offset = --bucket2[bkeyni];
// Add to the proper target location
a[offset] = num; a[offset] = num;
} }
// Top digit a->buf // Top digit a->buf
// right-to-left to ensure already sorted digits order we keep for iterations // right-to-left to ensure already sorted digits order we keep for iterations
#pragma GCC unroll 48 #pragma GCC unroll 13
for(uint32_t i = n; i > 0; --i) { for(i = n; i >= 4; i -= 4) {
// Prefetch caches auto num0 = a[i - 1];
// __builtin_prefetch(&a[i-16]); auto bkeyni0 = (num0 >> shr1) & mask1;
// Get num and its new offset / location auto offset0 = --bucket1[bkeyni0];
buf[offset0] = num0;
auto num1 = a[i - 2];
auto bkeyni1 = (num1 >> shr1) & mask1;
auto offset1 = --bucket1[bkeyni1];
buf[offset1] = num1;
auto num2 = a[i - 3];
auto bkeyni2 = (num2 >> shr1) & mask1;
auto offset2 = --bucket1[bkeyni2];
buf[offset2] = num2;
auto num3 = a[i - 4];
auto bkeyni3 = (num3 >> shr1) & mask1;
auto offset3 = --bucket1[bkeyni3];
buf[offset3] = num3;
}
#pragma GCC unroll 4
for(; i > 0; --i) {
auto num = a[i - 1]; auto num = a[i - 1];
auto bkeyni = (num >> shr1) & mask1; auto bkeyni = (num >> shr1) & mask1;
auto offset = --bucket1[bkeyni]; auto offset = --bucket1[bkeyni];
// Add to the proper target location
buf[offset] = num; buf[offset] = num;
} }
} }