mormord_sort: switched to template recursion for a 33% speedup (I think it still has maybe 2x left)

This commit is contained in:
Richard Thier 2024-04-11 19:00:52 +02:00
parent ae2cd09452
commit d16505a297

View File

@ -144,10 +144,12 @@ void twopass(uint32_t *a, int n) {
// --index
// otherwise
// ++pivot_index
static inline uint32_t morgrab(uint32_t elem, uint32_t j) noexcept {
template<int j>
static inline uint32_t morgrab(uint32_t elem) noexcept {
return (elem >> (8 * j)) & 0xff;
}
static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept {
template<int j>
static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
/* Preparation */
uint32_t radics[256] = {0};
uint32_t radics2[256] = {0};
@ -158,12 +160,13 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept {
/* We can go both down and upwards here to increase ILP or even do SSE2 */
uint32_t k1 = 0;
uint32_t k2 = (n - 1);
#pragma GCC unroll 64
for(k1 = 0; k1 < k2; ++k1, --k2) {
++radics[morgrab(a[k1], j)];
++radics2[morgrab(a[k2], j)];
++radics[morgrab<j>(a[k1])];
++radics2[morgrab<j>(a[k2])];
}
if(k1 == k2) {
++radics[morgrab(a[k1], j)];
++radics[morgrab<j>(a[k1])];
}
for(int i = 0; i < 256; ++i) {
radics[i] += radics2[i];
@ -179,6 +182,7 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept {
/* (because radix value 3 is not found in input) */
uint32_t prev = 0;
uint32_t reali = 0;
#pragma GCC unroll 16
for(int i = 0; i < 256; ++i) {
if(radics[i] != 0) {
radics[i] += prev;
@ -195,7 +199,7 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept {
// Inplace swap
uint32_t pivoti = 0;
while(pivoti < n) {
uint32_t radixval = morgrab(a[pivoti], j);
uint32_t radixval = morgrab<j>(a[pivoti]);
uint32_t targeti = radics[radixval] - 1;
if(targeti > pivoti) {
// swap
@ -211,20 +215,20 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept {
}
// Ends recursion
if(j == 0) return;
// Recursion
for(int i = 0; i < reali; i += 2) {
/* inclusive */
uint32_t from = real_radics[i];
/* non-inclusive */
uint32_t to = real_radics[i + 1];
mormord_sort_impl(&a[from], (to - (from)), j - 1);
if constexpr (j != 0) {
// Recursion
for(int i = 0; i < reali; i += 2) {
/* inclusive */
uint32_t from = real_radics[i];
/* non-inclusive */
uint32_t to = real_radics[i + 1];
mormord_sort_impl<j - 1>(&a[from], (to - (from)));
}
}
}
static inline void mormord_sort(uint32_t *a, int n) noexcept {
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
mormord_sort_impl(a, n, 3);
mormord_sort_impl<3>(a, n);
}
void fourpass(uint32_t *a, int n) {