mormord_sort: switch to template-based recursion for a ~33% speedup (I think there is still maybe a 2x gap left)
This commit is contained in:
parent
ae2cd09452
commit
d16505a297
36
ypsu.cpp
36
ypsu.cpp
@ -144,10 +144,12 @@ void twopass(uint32_t *a, int n) {
|
||||
// --index
|
||||
// otherwise
|
||||
// ++pivot_index
|
||||
static inline uint32_t morgrab(uint32_t elem, uint32_t j) noexcept {
|
||||
template<int j>
|
||||
static inline uint32_t morgrab(uint32_t elem) noexcept {
|
||||
return (elem >> (8 * j)) & 0xff;
|
||||
}
|
||||
static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept {
|
||||
template<int j>
|
||||
static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
|
||||
/* Preparation */
|
||||
uint32_t radics[256] = {0};
|
||||
uint32_t radics2[256] = {0};
|
||||
@ -158,12 +160,13 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept {
|
||||
/* We can walk both downwards and upwards here to increase ILP, or even use SSE2 */
|
||||
uint32_t k1 = 0;
|
||||
uint32_t k2 = (n - 1);
|
||||
#pragma GCC unroll 64
|
||||
for(k1 = 0; k1 < k2; ++k1, --k2) {
|
||||
++radics[morgrab(a[k1], j)];
|
||||
++radics2[morgrab(a[k2], j)];
|
||||
++radics[morgrab<j>(a[k1])];
|
||||
++radics2[morgrab<j>(a[k2])];
|
||||
}
|
||||
if(k1 == k2) {
|
||||
++radics[morgrab(a[k1], j)];
|
||||
++radics[morgrab<j>(a[k1])];
|
||||
}
|
||||
for(int i = 0; i < 256; ++i) {
|
||||
radics[i] += radics2[i];
|
||||
@ -179,6 +182,7 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept {
|
||||
/* (because radix value 3 is not found in input) */
|
||||
uint32_t prev = 0;
|
||||
uint32_t reali = 0;
|
||||
#pragma GCC unroll 16
|
||||
for(int i = 0; i < 256; ++i) {
|
||||
if(radics[i] != 0) {
|
||||
radics[i] += prev;
|
||||
@ -195,7 +199,7 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept {
|
||||
// In-place swap
|
||||
uint32_t pivoti = 0;
|
||||
while(pivoti < n) {
|
||||
uint32_t radixval = morgrab(a[pivoti], j);
|
||||
uint32_t radixval = morgrab<j>(a[pivoti]);
|
||||
uint32_t targeti = radics[radixval] - 1;
|
||||
if(targeti > pivoti) {
|
||||
// swap
|
||||
@ -211,20 +215,20 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept {
|
||||
}
|
||||
|
||||
// Ends recursion
|
||||
if(j == 0) return;
|
||||
|
||||
// Recursion
|
||||
for(int i = 0; i < reali; i += 2) {
|
||||
/* inclusive */
|
||||
uint32_t from = real_radics[i];
|
||||
/* non-inclusive */
|
||||
uint32_t to = real_radics[i + 1];
|
||||
mormord_sort_impl(&a[from], (to - (from)), j - 1);
|
||||
if constexpr (j != 0) {
|
||||
// Recursion
|
||||
for(int i = 0; i < reali; i += 2) {
|
||||
/* inclusive */
|
||||
uint32_t from = real_radics[i];
|
||||
/* non-inclusive */
|
||||
uint32_t to = real_radics[i + 1];
|
||||
mormord_sort_impl<j - 1>(&a[from], (to - (from)));
|
||||
}
|
||||
}
|
||||
}
|
||||
static inline void mormord_sort(uint32_t *a, int n) noexcept {
|
||||
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
||||
mormord_sort_impl(a, n, 3);
|
||||
mormord_sort_impl<3>(a, n);
|
||||
}
|
||||
|
||||
void fourpass(uint32_t *a, int n) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user