Revert "hand-unrolled thiersort3 - I think it's slower than gcc unrolling and surely more complex so I will revert"
This reverts commit 523605e8d841733d7c398131ea50e356b35b88e3.
This commit is contained in:
parent
523605e8d8
commit
0a199b9d72
@ -63,7 +63,7 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
||||
(1 << TPBX2),
|
||||
(1 << TPBX3)
|
||||
);
|
||||
uint32_t i = 0;
|
||||
int i = 0;
|
||||
#pragma GCC unroll 8
|
||||
for (; i < common; ++i) {
|
||||
bucket1[i] += prev1;
|
||||
@ -74,104 +74,61 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
||||
prev3 = bucket3[i];
|
||||
}
|
||||
/* Do remaining 1 */
|
||||
for (uint32_t j = i; j < (1 << TPBX1); ++j) {
|
||||
for (int j = i; j < (1 << TPBX1); ++j) {
|
||||
bucket1[j] += prev1;
|
||||
prev1 = bucket1[j];
|
||||
}
|
||||
/* Do remaining 2 */
|
||||
for (uint32_t j = i; j< (1 << TPBX2); ++j) {
|
||||
for (int j = i; j< (1 << TPBX2); ++j) {
|
||||
bucket2[j] += prev2;
|
||||
prev2 = bucket2[j];
|
||||
}
|
||||
/* Do remaining 3 */
|
||||
for (uint32_t j = i; j < (1 << TPBX3); ++j) {
|
||||
for (int j = i; j < (1 << TPBX3); ++j) {
|
||||
bucket3[j] += prev3;
|
||||
prev3 = bucket3[j];
|
||||
}
|
||||
|
||||
// Bottom digit a->buf
|
||||
// right-to-left to ensure already sorted digits order we keep for iterations
|
||||
#pragma GCC unroll 13
|
||||
for(i = n; i >= 4; i -= 4) {
|
||||
auto num0 = a[i - 1];
|
||||
auto bkeyni0 = (num0 >> shr3) & mask3;
|
||||
auto offset0 = --bucket3[bkeyni0];
|
||||
buf[offset0] = num0;
|
||||
auto num1 = a[i - 2];
|
||||
auto bkeyni1 = (num1 >> shr3) & mask3;
|
||||
auto offset1 = --bucket3[bkeyni1];
|
||||
buf[offset1] = num1;
|
||||
auto num2 = a[i - 3];
|
||||
auto bkeyni2 = (num2 >> shr3) & mask3;
|
||||
auto offset2 = --bucket3[bkeyni2];
|
||||
buf[offset2] = num2;
|
||||
auto num3 = a[i - 4];
|
||||
auto bkeyni3 = (num3 >> shr3) & mask3;
|
||||
auto offset3 = --bucket3[bkeyni3];
|
||||
buf[offset3] = num3;
|
||||
}
|
||||
#pragma GCC unroll 4
|
||||
for(; i > 0; --i) {
|
||||
#pragma GCC unroll 48
|
||||
for(uint32_t i = n; i > 0; --i) {
|
||||
// Prefetch caches
|
||||
//__builtin_prefetch(&a[i-8]);
|
||||
// Get num and its new offset / location
|
||||
auto num = a[i - 1];
|
||||
auto bkeyni = (num >> shr3) & mask3;
|
||||
auto offset = --bucket3[bkeyni];
|
||||
|
||||
// Add to the proper target location
|
||||
buf[offset] = num;
|
||||
}
|
||||
|
||||
// Mid digit buf->a
|
||||
// right-to-left to ensure already sorted digits order we keep for iterations
|
||||
#pragma GCC unroll 13
|
||||
for(i = n; i >= 4; i -= 4) {
|
||||
auto num0 = buf[i - 1];
|
||||
auto bkeyni0 = (num0 >> shr2) & mask2;
|
||||
auto offset0 = --bucket2[bkeyni0];
|
||||
a[offset0] = num0;
|
||||
auto num1 = buf[i - 2];
|
||||
auto bkeyni1 = (num1 >> shr2) & mask2;
|
||||
auto offset1 = --bucket2[bkeyni1];
|
||||
a[offset1] = num1;
|
||||
auto num2 = buf[i - 3];
|
||||
auto bkeyni2 = (num2 >> shr2) & mask2;
|
||||
auto offset2 = --bucket2[bkeyni2];
|
||||
a[offset2] = num2;
|
||||
auto num3 = buf[i - 4];
|
||||
auto bkeyni3 = (num3 >> shr2) & mask2;
|
||||
auto offset3 = --bucket2[bkeyni3];
|
||||
a[offset3] = num3;
|
||||
}
|
||||
#pragma GCC unroll 4
|
||||
for(; i > 0; --i) {
|
||||
#pragma GCC unroll 48
|
||||
for(uint32_t i = n; i > 0; --i) {
|
||||
// Prefetch caches
|
||||
//__builtin_prefetch(&buf[i-8]);
|
||||
// Get num and its new offset / location
|
||||
auto num = buf[i - 1];
|
||||
auto bkeyni = (num >> shr2) & mask2;
|
||||
auto offset = --bucket2[bkeyni];
|
||||
|
||||
// Add to the proper target location
|
||||
a[offset] = num;
|
||||
}
|
||||
// Top digit a->buf
|
||||
// right-to-left to ensure already sorted digits order we keep for iterations
|
||||
#pragma GCC unroll 13
|
||||
for(i = n; i >= 4; i -= 4) {
|
||||
auto num0 = a[i - 1];
|
||||
auto bkeyni0 = (num0 >> shr1) & mask1;
|
||||
auto offset0 = --bucket1[bkeyni0];
|
||||
buf[offset0] = num0;
|
||||
auto num1 = a[i - 2];
|
||||
auto bkeyni1 = (num1 >> shr1) & mask1;
|
||||
auto offset1 = --bucket1[bkeyni1];
|
||||
buf[offset1] = num1;
|
||||
auto num2 = a[i - 3];
|
||||
auto bkeyni2 = (num2 >> shr1) & mask1;
|
||||
auto offset2 = --bucket1[bkeyni2];
|
||||
buf[offset2] = num2;
|
||||
auto num3 = a[i - 4];
|
||||
auto bkeyni3 = (num3 >> shr1) & mask1;
|
||||
auto offset3 = --bucket1[bkeyni3];
|
||||
buf[offset3] = num3;
|
||||
}
|
||||
#pragma GCC unroll 4
|
||||
for(; i > 0; --i) {
|
||||
#pragma GCC unroll 48
|
||||
for(uint32_t i = n; i > 0; --i) {
|
||||
// Prefetch caches
|
||||
// __builtin_prefetch(&a[i-16]);
|
||||
// Get num and its new offset / location
|
||||
auto num = a[i - 1];
|
||||
auto bkeyni = (num >> shr1) & mask1;
|
||||
auto offset = --bucket1[bkeyni];
|
||||
|
||||
// Add to the proper target location
|
||||
buf[offset] = num;
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user