diff --git a/ypsu.cpp b/ypsu.cpp index d5e0e71..8a72d70 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -16,7 +16,7 @@ #include "gptsort.h" #include "thiersort.h" -#define MAGYAR_SORT_DEFAULT_REUSE +// #define MAGYAR_SORT_DEFAULT_REUSE #include "magyarsort.h" #include "space_partitioning_sort/spsort.h" @@ -150,13 +150,23 @@ static inline uint32_t morgrab(uint32_t elem, uint32_t j) noexcept { static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept { /* Preparation */ uint32_t radics[256] = {0}; + uint32_t radics2[256] = {0}; /* [from, to) index: only where prefix sums change - usually nonfull */ uint32_t real_radics[256 * 2] = {0}; /* Occurence counting O(n) */ - /* TODO: We can go both down and upwards here to increase ILP or even do SSE2 */ - for(uint32_t i = 0; i < n; ++i) { - ++radics[morgrab(a[i], j)]; + /* We can go both down and upwards here to increase ILP or even do SSE2 */ + uint32_t k1 = 0; + uint32_t k2 = (n - 1); + for(k1 = 0; k1 < k2; ++k1, --k2) { + ++radics[morgrab(a[k1], j)]; + ++radics2[morgrab(a[k2], j)]; + } + if(k1 == k2) { + ++radics[morgrab(a[k1], j)]; + } + for(int i = 0; i < 256; ++i) { + radics[i] += radics2[i]; } /* Prefix sum + real radics calc O(256) */