From d16505a297c25d0fc1e38955b5d62ab13a27334b Mon Sep 17 00:00:00 2001 From: Richard Thier Date: Thu, 11 Apr 2024 19:00:52 +0200 Subject: [PATCH] mormordsort got template recursion for 33% speedup (I think it still has 2x maybe) --- ypsu.cpp | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/ypsu.cpp b/ypsu.cpp index abc6112..81d579f 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -144,10 +144,12 @@ void twopass(uint32_t *a, int n) { // --index // különben // ++pivot_index -static inline uint32_t morgrab(uint32_t elem, uint32_t j) noexcept { +template +static inline uint32_t morgrab(uint32_t elem) noexcept { return (elem >> (8 * j)) & 0xff; } -static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept { +template +static inline void mormord_sort_impl(uint32_t *a, int n) noexcept { /* Preparation */ uint32_t radics[256] = {0}; uint32_t radics2[256] = {0}; @@ -158,12 +160,13 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept { /* We can go both down and upwards here to increase ILP or even do SSE2 */ uint32_t k1 = 0; uint32_t k2 = (n - 1); + #pragma GCC unroll 64 for(k1 = 0; k1 < k2; ++k1, --k2) { - ++radics[morgrab(a[k1], j)]; - ++radics2[morgrab(a[k2], j)]; + ++radics[morgrab(a[k1])]; + ++radics2[morgrab(a[k2])]; } if(k1 == k2) { - ++radics[morgrab(a[k1], j)]; + ++radics[morgrab(a[k1])]; } for(int i = 0; i < 256; ++i) { radics[i] += radics2[i]; @@ -179,6 +182,7 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept { /* (because radix value 3 is not found in input) */ uint32_t prev = 0; uint32_t reali = 0; + #pragma GCC unroll 16 for(int i = 0; i < 256; ++i) { if(radics[i] != 0) { radics[i] += prev; @@ -195,7 +199,7 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept { // Inplace swap uint32_t pivoti = 0; while(pivoti < n) { - uint32_t radixval = morgrab(a[pivoti], j); + uint32_t radixval = morgrab(a[pivoti]); uint32_t targeti = radics[radixval] - 1; if(targeti > pivoti) { // swap @@ -211,20 +215,20 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept { } // Ends recursion - if(j == 0) return; - - // Recursion - for(int i = 0; i < reali; i += 2) { - /* inclusive */ - uint32_t from = real_radics[i]; - /* non-inclusive */ - uint32_t to = real_radics[i + 1]; - mormord_sort_impl(&a[from], (to - (from)), j - 1); + if constexpr (j != 0) { + // Recursion + for(int i = 0; i < reali; i += 2) { + /* inclusive */ + uint32_t from = real_radics[i]; + /* non-inclusive */ + uint32_t to = real_radics[i + 1]; + mormord_sort_impl(&a[from], (to - (from))); + } } } static inline void mormord_sort(uint32_t *a, int n) noexcept { assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); - mormord_sort_impl(a, n, 3); + mormord_sort_impl<3>(a, n); } void fourpass(uint32_t *a, int n) {