mormord_sort now uses template recursion for a 33% speedup (I think there may still be another 2x left in it)

This commit is contained in:
Richard Thier 2024-04-11 19:00:52 +02:00
parent ae2cd09452
commit d16505a297

View File

@ -144,10 +144,12 @@ void twopass(uint32_t *a, int n) {
// --index // --index
// otherwise // otherwise
// ++pivot_index // ++pivot_index
static inline uint32_t morgrab(uint32_t elem, uint32_t j) noexcept { template<int j>
static inline uint32_t morgrab(uint32_t elem) noexcept {
return (elem >> (8 * j)) & 0xff; return (elem >> (8 * j)) & 0xff;
} }
static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept { template<int j>
static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
/* Preparation */ /* Preparation */
uint32_t radics[256] = {0}; uint32_t radics[256] = {0};
uint32_t radics2[256] = {0}; uint32_t radics2[256] = {0};
@ -158,12 +160,13 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept {
/* We can go both down and upwards here to increase ILP or even do SSE2 */ /* We can go both down and upwards here to increase ILP or even do SSE2 */
uint32_t k1 = 0; uint32_t k1 = 0;
uint32_t k2 = (n - 1); uint32_t k2 = (n - 1);
#pragma GCC unroll 64
for(k1 = 0; k1 < k2; ++k1, --k2) { for(k1 = 0; k1 < k2; ++k1, --k2) {
++radics[morgrab(a[k1], j)]; ++radics[morgrab<j>(a[k1])];
++radics2[morgrab(a[k2], j)]; ++radics2[morgrab<j>(a[k2])];
} }
if(k1 == k2) { if(k1 == k2) {
++radics[morgrab(a[k1], j)]; ++radics[morgrab<j>(a[k1])];
} }
for(int i = 0; i < 256; ++i) { for(int i = 0; i < 256; ++i) {
radics[i] += radics2[i]; radics[i] += radics2[i];
@ -179,6 +182,7 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept {
/* (because radix value 3 is not found in input) */ /* (because radix value 3 is not found in input) */
uint32_t prev = 0; uint32_t prev = 0;
uint32_t reali = 0; uint32_t reali = 0;
#pragma GCC unroll 16
for(int i = 0; i < 256; ++i) { for(int i = 0; i < 256; ++i) {
if(radics[i] != 0) { if(radics[i] != 0) {
radics[i] += prev; radics[i] += prev;
@ -195,7 +199,7 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept {
// Inplace swap // Inplace swap
uint32_t pivoti = 0; uint32_t pivoti = 0;
while(pivoti < n) { while(pivoti < n) {
uint32_t radixval = morgrab(a[pivoti], j); uint32_t radixval = morgrab<j>(a[pivoti]);
uint32_t targeti = radics[radixval] - 1; uint32_t targeti = radics[radixval] - 1;
if(targeti > pivoti) { if(targeti > pivoti) {
// swap // swap
@ -211,20 +215,20 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept {
} }
// Ends recursion // Ends recursion
if(j == 0) return; if constexpr (j != 0) {
// Recursion
// Recursion for(int i = 0; i < reali; i += 2) {
for(int i = 0; i < reali; i += 2) { /* inclusive */
/* inclusive */ uint32_t from = real_radics[i];
uint32_t from = real_radics[i]; /* non-inclusive */
/* non-inclusive */ uint32_t to = real_radics[i + 1];
uint32_t to = real_radics[i + 1]; mormord_sort_impl<j - 1>(&a[from], (to - (from)));
mormord_sort_impl(&a[from], (to - (from)), j - 1); }
} }
} }
static inline void mormord_sort(uint32_t *a, int n) noexcept { static inline void mormord_sort(uint32_t *a, int n) noexcept {
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
mormord_sort_impl(a, n, 3); mormord_sort_impl<3>(a, n);
} }
void fourpass(uint32_t *a, int n) { void fourpass(uint32_t *a, int n) {