Hand-unrolled thiersort3 - I think it's slower than gcc unrolling and surely more complex, so I will revert
This commit is contained in:
parent
a5cb0995e3
commit
523605e8d8
@ -63,7 +63,7 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
|||||||
(1 << TPBX2),
|
(1 << TPBX2),
|
||||||
(1 << TPBX3)
|
(1 << TPBX3)
|
||||||
);
|
);
|
||||||
int i = 0;
|
uint32_t i = 0;
|
||||||
#pragma GCC unroll 8
|
#pragma GCC unroll 8
|
||||||
for (; i < common; ++i) {
|
for (; i < common; ++i) {
|
||||||
bucket1[i] += prev1;
|
bucket1[i] += prev1;
|
||||||
@ -74,61 +74,104 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
|||||||
prev3 = bucket3[i];
|
prev3 = bucket3[i];
|
||||||
}
|
}
|
||||||
/* Do remaining 1 */
|
/* Do remaining 1 */
|
||||||
for (int j = i; j < (1 << TPBX1); ++j) {
|
for (uint32_t j = i; j < (1 << TPBX1); ++j) {
|
||||||
bucket1[j] += prev1;
|
bucket1[j] += prev1;
|
||||||
prev1 = bucket1[j];
|
prev1 = bucket1[j];
|
||||||
}
|
}
|
||||||
/* Do remaining 2 */
|
/* Do remaining 2 */
|
||||||
for (int j = i; j< (1 << TPBX2); ++j) {
|
for (uint32_t j = i; j< (1 << TPBX2); ++j) {
|
||||||
bucket2[j] += prev2;
|
bucket2[j] += prev2;
|
||||||
prev2 = bucket2[j];
|
prev2 = bucket2[j];
|
||||||
}
|
}
|
||||||
/* Do remaining 3 */
|
/* Do remaining 3 */
|
||||||
for (int j = i; j < (1 << TPBX3); ++j) {
|
for (uint32_t j = i; j < (1 << TPBX3); ++j) {
|
||||||
bucket3[j] += prev3;
|
bucket3[j] += prev3;
|
||||||
prev3 = bucket3[j];
|
prev3 = bucket3[j];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Bottom digit a->buf
|
// Bottom digit a->buf
|
||||||
// right-to-left to ensure already sorted digits order we keep for iterations
|
// right-to-left to ensure already sorted digits order we keep for iterations
|
||||||
#pragma GCC unroll 48
|
#pragma GCC unroll 13
|
||||||
for(uint32_t i = n; i > 0; --i) {
|
for(i = n; i >= 4; i -= 4) {
|
||||||
// Prefetch caches
|
auto num0 = a[i - 1];
|
||||||
//__builtin_prefetch(&a[i-8]);
|
auto bkeyni0 = (num0 >> shr3) & mask3;
|
||||||
// Get num and its new offset / location
|
auto offset0 = --bucket3[bkeyni0];
|
||||||
|
buf[offset0] = num0;
|
||||||
|
auto num1 = a[i - 2];
|
||||||
|
auto bkeyni1 = (num1 >> shr3) & mask3;
|
||||||
|
auto offset1 = --bucket3[bkeyni1];
|
||||||
|
buf[offset1] = num1;
|
||||||
|
auto num2 = a[i - 3];
|
||||||
|
auto bkeyni2 = (num2 >> shr3) & mask3;
|
||||||
|
auto offset2 = --bucket3[bkeyni2];
|
||||||
|
buf[offset2] = num2;
|
||||||
|
auto num3 = a[i - 4];
|
||||||
|
auto bkeyni3 = (num3 >> shr3) & mask3;
|
||||||
|
auto offset3 = --bucket3[bkeyni3];
|
||||||
|
buf[offset3] = num3;
|
||||||
|
}
|
||||||
|
#pragma GCC unroll 4
|
||||||
|
for(; i > 0; --i) {
|
||||||
auto num = a[i - 1];
|
auto num = a[i - 1];
|
||||||
auto bkeyni = (num >> shr3) & mask3;
|
auto bkeyni = (num >> shr3) & mask3;
|
||||||
auto offset = --bucket3[bkeyni];
|
auto offset = --bucket3[bkeyni];
|
||||||
|
|
||||||
// Add to the proper target location
|
|
||||||
buf[offset] = num;
|
buf[offset] = num;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Mid digit buf->a
|
// Mid digit buf->a
|
||||||
// right-to-left to ensure already sorted digits order we keep for iterations
|
// right-to-left to ensure already sorted digits order we keep for iterations
|
||||||
#pragma GCC unroll 48
|
#pragma GCC unroll 13
|
||||||
for(uint32_t i = n; i > 0; --i) {
|
for(i = n; i >= 4; i -= 4) {
|
||||||
// Prefetch caches
|
auto num0 = buf[i - 1];
|
||||||
//__builtin_prefetch(&buf[i-8]);
|
auto bkeyni0 = (num0 >> shr2) & mask2;
|
||||||
// Get num and its new offset / location
|
auto offset0 = --bucket2[bkeyni0];
|
||||||
|
a[offset0] = num0;
|
||||||
|
auto num1 = buf[i - 2];
|
||||||
|
auto bkeyni1 = (num1 >> shr2) & mask2;
|
||||||
|
auto offset1 = --bucket2[bkeyni1];
|
||||||
|
a[offset1] = num1;
|
||||||
|
auto num2 = buf[i - 3];
|
||||||
|
auto bkeyni2 = (num2 >> shr2) & mask2;
|
||||||
|
auto offset2 = --bucket2[bkeyni2];
|
||||||
|
a[offset2] = num2;
|
||||||
|
auto num3 = buf[i - 4];
|
||||||
|
auto bkeyni3 = (num3 >> shr2) & mask2;
|
||||||
|
auto offset3 = --bucket2[bkeyni3];
|
||||||
|
a[offset3] = num3;
|
||||||
|
}
|
||||||
|
#pragma GCC unroll 4
|
||||||
|
for(; i > 0; --i) {
|
||||||
auto num = buf[i - 1];
|
auto num = buf[i - 1];
|
||||||
auto bkeyni = (num >> shr2) & mask2;
|
auto bkeyni = (num >> shr2) & mask2;
|
||||||
auto offset = --bucket2[bkeyni];
|
auto offset = --bucket2[bkeyni];
|
||||||
|
|
||||||
// Add to the proper target location
|
|
||||||
a[offset] = num;
|
a[offset] = num;
|
||||||
}
|
}
|
||||||
// Top digit a->buf
|
// Top digit a->buf
|
||||||
// right-to-left to ensure already sorted digits order we keep for iterations
|
// right-to-left to ensure already sorted digits order we keep for iterations
|
||||||
#pragma GCC unroll 48
|
#pragma GCC unroll 13
|
||||||
for(uint32_t i = n; i > 0; --i) {
|
for(i = n; i >= 4; i -= 4) {
|
||||||
// Prefetch caches
|
auto num0 = a[i - 1];
|
||||||
// __builtin_prefetch(&a[i-16]);
|
auto bkeyni0 = (num0 >> shr1) & mask1;
|
||||||
// Get num and its new offset / location
|
auto offset0 = --bucket1[bkeyni0];
|
||||||
|
buf[offset0] = num0;
|
||||||
|
auto num1 = a[i - 2];
|
||||||
|
auto bkeyni1 = (num1 >> shr1) & mask1;
|
||||||
|
auto offset1 = --bucket1[bkeyni1];
|
||||||
|
buf[offset1] = num1;
|
||||||
|
auto num2 = a[i - 3];
|
||||||
|
auto bkeyni2 = (num2 >> shr1) & mask1;
|
||||||
|
auto offset2 = --bucket1[bkeyni2];
|
||||||
|
buf[offset2] = num2;
|
||||||
|
auto num3 = a[i - 4];
|
||||||
|
auto bkeyni3 = (num3 >> shr1) & mask1;
|
||||||
|
auto offset3 = --bucket1[bkeyni3];
|
||||||
|
buf[offset3] = num3;
|
||||||
|
}
|
||||||
|
#pragma GCC unroll 4
|
||||||
|
for(; i > 0; --i) {
|
||||||
auto num = a[i - 1];
|
auto num = a[i - 1];
|
||||||
auto bkeyni = (num >> shr1) & mask1;
|
auto bkeyni = (num >> shr1) & mask1;
|
||||||
auto offset = --bucket1[bkeyni];
|
auto offset = --bucket1[bkeyni];
|
||||||
|
|
||||||
// Add to the proper target location
|
|
||||||
buf[offset] = num;
|
buf[offset] = num;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user