Revert "mormord sort more branchless plus extra edge-case handling for empty sized calls" - speed was not great...

This reverts commit 2d2cad2c5a4fbae0d2f008b4164ffb1a49ba3a88.
This commit is contained in:
Richard Thier 2024-04-11 20:02:23 +02:00
parent 9894f6c6d4
commit 3f0ae7ae77

View File

@ -150,9 +150,6 @@ static inline uint32_t morgrab(uint32_t elem) noexcept {
} }
template<int j> template<int j>
static inline void mormord_sort_impl(uint32_t *a, int n) noexcept { static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
/* Edge-case */
if(n == 0) return;
/* Preparation */ /* Preparation */
uint32_t radics[256] = {0}; uint32_t radics[256] = {0};
uint32_t radics2[256] = {0}; uint32_t radics2[256] = {0};
@ -187,41 +184,34 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
uint32_t reali = 0; uint32_t reali = 0;
#pragma GCC unroll 16 #pragma GCC unroll 16
for(int i = 0; i < 256; ++i) { for(int i = 0; i < 256; ++i) {
radics[i] += prev;
if(radics[i] != 0) { if(radics[i] != 0) {
radics[i] += prev;
real_radics[reali] = prev; real_radics[reali] = prev;
real_radics[reali + 1] = radics[i]; real_radics[reali + 1] = radics[i];
prev = radics[i];
reali += 2; reali += 2;
} } else {
radics[i] += prev;
prev = radics[i]; prev = radics[i];
} }
}
// Inplace swap
// Inplace swap - own ideas + some ideas based-on "famous" ct-swap (for branchless / more ILP):
// void ct-swap(bool secret, uint64_t a[], uint64_t b[], size_t len) {
// uint64_t mask = ~((uint64_t)secret - 1); // 1->111....111; 0->000....000
// for (size_t i = 0; i < len; i++) {
// uint64_t delta = (a[i] ^ b[i]) & mask;
// a[i] = a[i] ^ delta; // becomes b[i] if secret is set, otherwise stays a[i]
// b[i] = b[i] ^ delta; // the reverse: becomes a[i] if secret is set, otherwise stays b[i]
// }
// }
uint32_t pivoti = 0; uint32_t pivoti = 0;
while(pivoti < n) { while(pivoti < n) {
uint32_t radixval = morgrab<j>(a[pivoti]); uint32_t radixval = morgrab<j>(a[pivoti]);
uint32_t targeti = --radics[radixval]; // dec index (!) uint32_t targeti = radics[radixval] - 1;
if(targeti > pivoti) {
// Bitmask: true -> 11.....1; false -> 00.....0 // swap
uint32_t mask = ~((targeti > pivoti) - 1); uint32_t tmp = a[pivoti];
a[pivoti] = a[targeti];
// Branchless swap (using bitmask) a[targeti] = tmp;
uint32_t delta = (a[pivoti] ^ a[targeti]) & mask; // dec index
a[pivoti] = a[pivoti] ^ delta; --radics[radixval];
a[targeti] = a[targeti] ^ delta; } else {
// progress pivot
// "else" branch ++pivoti;
pivoti += !mask; }
radics[radixval] += !mask; // undec index (!)
} }
// Ends recursion // Ends recursion