tpxb: 16-wide manual unroll - but it does not seem to be faster
This commit is contained in:
parent
036725611b
commit
6d79461262
188
threepass_xbit.h
188
threepass_xbit.h
@ -20,6 +20,13 @@ static inline constexpr uint32_t min3u32_xb(uint32_t a, uint32_t b, uint32_t c)
|
|||||||
((b <= c) ? b : c);
|
((b <= c) ? b : c);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Places a single element into its bucket slot.
 *
 * Computes the bucket key as (num >> shr) & mask, pre-decrements the
 * bucket entry for that key, and stores num in arr at the decremented index.
 *
 * NOTE(review): presumably bucket[] holds one-past-the-end offsets per key,
 * since the entry is decremented before use - confirm against the caller.
 *
 * @param num    value to place
 * @param arr    destination array; written at the decremented bucket offset
 * @param bucket per-key offset table; the entry for num's key is decremented
 * @param shr    right shift applied to num before masking
 * @param mask   bit mask applied after the shift to obtain the bucket key
 */
static inline void tpxb_process_element(uint32_t num, uint32_t* arr, uint32_t* bucket,
|
||||||
|
uint32_t shr, uint32_t mask) {
|
||||||
|
auto bkeyni = (num >> shr) & mask;
|
||||||
|
auto offset = --bucket[bkeyni];
|
||||||
|
arr[offset] = num;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Simple three-pass (ok: 3 + 1) bottom-up internal radix sort written for thiersort3
|
* Simple three-pass (ok: 3 + 1) bottom-up internal radix sort written for thiersort3
|
||||||
*
|
*
|
||||||
@ -91,44 +98,177 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
|||||||
|
|
||||||
// Bottom digit a->buf
|
// Bottom digit a->buf
|
||||||
// right-to-left to ensure already sorted digits order we keep for iterations
|
// right-to-left to ensure already sorted digits order we keep for iterations
|
||||||
#pragma GCC unroll 48
|
#pragma GCC unroll 3
|
||||||
for(uint32_t i = n; i > 0; --i) {
|
for(i = n; i >= 16; i -= 16) {
|
||||||
// Prefetch caches
|
// Prefetch the NEXT block (not current) at optimal distance
|
||||||
//__builtin_prefetch(&a[i-8]);
|
if (i > 17) { // Ensure we don't prefetch out of bounds
|
||||||
// Get num and its new offset / location
|
__builtin_prefetch(&a[i - 17]);
|
||||||
auto num = a[i - 1];
|
}
|
||||||
|
if (i > 17*2) { // Ensure we don't prefetch out of bounds
|
||||||
|
__builtin_prefetch(&a[i - 17*2]);
|
||||||
|
}
|
||||||
|
if (i > 17*3) { // Ensure we don't prefetch out of bounds
|
||||||
|
__builtin_prefetch(&a[i - 17*3]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process 16 elements in reverse order
|
||||||
|
auto num15 = a[i - 1];
|
||||||
|
auto num14 = a[i - 2];
|
||||||
|
auto num13 = a[i - 3];
|
||||||
|
auto num12 = a[i - 4];
|
||||||
|
auto num11 = a[i - 5];
|
||||||
|
auto num10 = a[i - 6];
|
||||||
|
auto num9 = a[i - 7];
|
||||||
|
auto num8 = a[i - 8];
|
||||||
|
auto num7 = a[i - 9];
|
||||||
|
auto num6 = a[i - 10];
|
||||||
|
auto num5 = a[i - 11];
|
||||||
|
auto num4 = a[i - 12];
|
||||||
|
auto num3 = a[i - 13];
|
||||||
|
auto num2 = a[i - 14];
|
||||||
|
auto num1 = a[i - 15];
|
||||||
|
auto num0 = a[i - 16];
|
||||||
|
|
||||||
|
// Process all 16 elements (your bucket logic here)
|
||||||
|
tpxb_process_element(num15, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num14, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num13, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num12, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num11, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num10, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num9, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num8, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num7, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num6, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num5, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num4, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num3, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num2, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num1, buf, bucket3, shr3, mask3);
|
||||||
|
tpxb_process_element(num0, buf, bucket3, shr3, mask3);
|
||||||
|
}
|
||||||
|
// Handle remainder (less than 16 elements)
|
||||||
|
for(uint32_t j = i; j > 0; --j) {
|
||||||
|
auto num = a[j - 1];
|
||||||
auto bkeyni = (num >> shr3) & mask3;
|
auto bkeyni = (num >> shr3) & mask3;
|
||||||
auto offset = --bucket3[bkeyni];
|
auto offset = --bucket3[bkeyni];
|
||||||
|
|
||||||
// Add to the proper target location
|
|
||||||
buf[offset] = num;
|
buf[offset] = num;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Mid digit buf->a
|
// Mid digit buf->a
|
||||||
// right-to-left to ensure already sorted digits order we keep for iterations
|
// right-to-left to ensure already sorted digits order we keep for iterations
|
||||||
#pragma GCC unroll 48
|
#pragma GCC unroll 3
|
||||||
for(uint32_t i = n; i > 0; --i) {
|
for(i = n; i >= 16; i -= 16) {
|
||||||
// Prefetch caches
|
// Prefetch the NEXT block (not current) at optimal distance
|
||||||
//__builtin_prefetch(&buf[i-8]);
|
if (i > 17) { // Ensure we don't prefetch out of bounds
|
||||||
// Get num and its new offset / location
|
__builtin_prefetch(&buf[i - 17]);
|
||||||
auto num = buf[i - 1];
|
}
|
||||||
|
if (i > 17*2) { // Ensure we don't prefetch out of bounds
|
||||||
|
__builtin_prefetch(&buf[i - 17*2]);
|
||||||
|
}
|
||||||
|
if (i > 17*3) { // Ensure we don't prefetch out of bounds
|
||||||
|
__builtin_prefetch(&buf[i - 17*3]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process 16 elements in reverse order
|
||||||
|
auto num15 = buf[i - 1];
|
||||||
|
auto num14 = buf[i - 2];
|
||||||
|
auto num13 = buf[i - 3];
|
||||||
|
auto num12 = buf[i - 4];
|
||||||
|
auto num11 = buf[i - 5];
|
||||||
|
auto num10 = buf[i - 6];
|
||||||
|
auto num9 = buf[i - 7];
|
||||||
|
auto num8 = buf[i - 8];
|
||||||
|
auto num7 = buf[i - 9];
|
||||||
|
auto num6 = buf[i - 10];
|
||||||
|
auto num5 = buf[i - 11];
|
||||||
|
auto num4 = buf[i - 12];
|
||||||
|
auto num3 = buf[i - 13];
|
||||||
|
auto num2 = buf[i - 14];
|
||||||
|
auto num1 = buf[i - 15];
|
||||||
|
auto num0 = buf[i - 16];
|
||||||
|
|
||||||
|
// Process all 16 elements (your bucket logic here)
|
||||||
|
tpxb_process_element(num15, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num14, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num13, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num12, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num11, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num10, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num9, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num8, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num7, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num6, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num5, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num4, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num3, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num2, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num1, a, bucket2, shr2, mask2);
|
||||||
|
tpxb_process_element(num0, a, bucket2, shr2, mask2);
|
||||||
|
}
|
||||||
|
// Handle remainder (less than 16 elements)
|
||||||
|
for(uint32_t j = i; j > 0; --j) {
|
||||||
|
auto num = buf[j - 1];
|
||||||
auto bkeyni = (num >> shr2) & mask2;
|
auto bkeyni = (num >> shr2) & mask2;
|
||||||
auto offset = --bucket2[bkeyni];
|
auto offset = --bucket2[bkeyni];
|
||||||
|
|
||||||
// Add to the proper target location
|
|
||||||
a[offset] = num;
|
a[offset] = num;
|
||||||
}
|
}
|
||||||
// Top digit a->buf
|
// Top digit a->buf
|
||||||
// right-to-left to ensure already sorted digits order we keep for iterations
|
// right-to-left to ensure already sorted digits order we keep for iterations
|
||||||
#pragma GCC unroll 48
|
#pragma GCC unroll 3
|
||||||
for(uint32_t i = n; i > 0; --i) {
|
for(i = n; i >= 16; i -= 16) {
|
||||||
// Prefetch caches
|
// Prefetch the NEXT block (not current) at optimal distance
|
||||||
// __builtin_prefetch(&a[i-16]);
|
if (i > 17) { // Ensure we don't prefetch out of bounds
|
||||||
// Get num and its new offset / location
|
__builtin_prefetch(&a[i - 17]);
|
||||||
auto num = a[i - 1];
|
}
|
||||||
|
if (i > 17*2) { // Ensure we don't prefetch out of bounds
|
||||||
|
__builtin_prefetch(&a[i - 17*2]);
|
||||||
|
}
|
||||||
|
if (i > 17*3) { // Ensure we don't prefetch out of bounds
|
||||||
|
__builtin_prefetch(&a[i - 17*3]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process 16 elements in reverse order
|
||||||
|
auto num15 = a[i - 1];
|
||||||
|
auto num14 = a[i - 2];
|
||||||
|
auto num13 = a[i - 3];
|
||||||
|
auto num12 = a[i - 4];
|
||||||
|
auto num11 = a[i - 5];
|
||||||
|
auto num10 = a[i - 6];
|
||||||
|
auto num9 = a[i - 7];
|
||||||
|
auto num8 = a[i - 8];
|
||||||
|
auto num7 = a[i - 9];
|
||||||
|
auto num6 = a[i - 10];
|
||||||
|
auto num5 = a[i - 11];
|
||||||
|
auto num4 = a[i - 12];
|
||||||
|
auto num3 = a[i - 13];
|
||||||
|
auto num2 = a[i - 14];
|
||||||
|
auto num1 = a[i - 15];
|
||||||
|
auto num0 = a[i - 16];
|
||||||
|
|
||||||
|
// Process all 16 elements (your bucket logic here)
|
||||||
|
tpxb_process_element(num15, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num14, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num13, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num12, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num11, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num10, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num9, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num8, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num7, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num6, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num5, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num4, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num3, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num2, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num1, buf, bucket1, shr1, mask1);
|
||||||
|
tpxb_process_element(num0, buf, bucket1, shr1, mask1);
|
||||||
|
}
|
||||||
|
// Handle remainder (less than 16 elements)
|
||||||
|
for(uint32_t j = i; j > 0; --j) {
|
||||||
|
auto num = a[j - 1];
|
||||||
auto bkeyni = (num >> shr1) & mask1;
|
auto bkeyni = (num >> shr1) & mask1;
|
||||||
auto offset = --bucket1[bkeyni];
|
auto offset = --bucket1[bkeyni];
|
||||||
|
|
||||||
// Add to the proper target location
|
|
||||||
buf[offset] = num;
|
buf[offset] = num;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
4
ypsu.cpp
4
ypsu.cpp
@ -895,8 +895,8 @@ int main(int argc, char **argv) {
|
|||||||
printf("Sorting %d elements:\n\n", n);
|
printf("Sorting %d elements:\n\n", n);
|
||||||
|
|
||||||
// Uncomment this for profiling and alg!
|
// Uncomment this for profiling and alg!
|
||||||
measure_single(n);
|
// measure_single(n);
|
||||||
return 0;
|
// return 0;
|
||||||
|
|
||||||
for (auto inputtype : inputtypes) {
|
for (auto inputtype : inputtypes) {
|
||||||
printf("%10s", inputtype.c_str());
|
printf("%10s", inputtype.c_str());
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user