diff --git a/ypsu.cpp b/ypsu.cpp index 8f6a25d..62c4bcf 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -265,7 +265,7 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept { // Without it its data dependent like crazy... uint32_t pivoti1 = 0; uint32_t pivoti2 = boundz.second; - while((pivoti1 < boundz.first) && (pivoti2 < n)) { // FIXME: needs two more "finisher-loops" behind this!!! + while((pivoti1 < boundz.first) && (pivoti2 < n)) { /* Pivot 1 */ @@ -301,6 +301,46 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept { pivoti2 += !mask2; radics2[radixval2] += !mask2; // undec index (!) } + // Finish pivot1 if there are still elements.. + while(pivoti1 < boundz.first) { + + /* Pivot 1+ */ + + uint32_t radixval1 = morgrab(a[pivoti1]); + uint32_t targeti1 = --radics1[radixval1]; // dec index (!) + + // Bitmask: true -> 11.....1; false -> 00.....0 + uint32_t mask1 = ~((targeti1 > pivoti1) - 1); + + // Branchless swap (using bitmask) + uint32_t delta1 = (a[pivoti1] ^ a[targeti1]) & mask1; + a[pivoti1] = a[pivoti1] ^ delta1; + a[targeti1] = a[targeti1] ^ delta1; + + // "else" branch + pivoti1 += !mask1; + radics1[radixval1] += !mask1; // undec index (!) + } + // Finish pivot2 if there are still elements.. + while(pivoti2 < n) { + + /* Pivot 2+ */ + + uint32_t radixval2 = morgrab(a[pivoti2]); + uint32_t targeti2 = --radics2[radixval2]; // dec index (!) + + // Bitmask: true -> 11.....1; false -> 00.....0 + uint32_t mask2 = ~((targeti2 > pivoti2) - 1); + + // Branchless swap (using bitmask) + uint32_t delta2 = (a[pivoti2] ^ a[targeti2]) & mask2; + a[pivoti2] = a[pivoti2] ^ delta2; + a[targeti2] = a[targeti2] ^ delta2; + + // "else" branch + pivoti2 += !mask2; + radics2[radixval2] += !mask2; // undec index (!) + } // Possible recursions if constexpr (j != 0) {