mormord ILP-variant: "nearly sorting properly", but some values are buggy

This commit is contained in:
Richard Thier 2024-04-12 01:09:59 +02:00
parent b2d66b7fd0
commit b2c4e7082b

View File

@ -265,7 +265,7 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
// Without it its data dependent like crazy... // Without it its data dependent like crazy...
uint32_t pivoti1 = 0; uint32_t pivoti1 = 0;
uint32_t pivoti2 = boundz.second; uint32_t pivoti2 = boundz.second;
while((pivoti1 < boundz.first) && (pivoti2 < n)) { // FIXME: needs two more "finisher-loops" behind this!!! while((pivoti1 < boundz.first) && (pivoti2 < n)) {
/* Pivot 1 */ /* Pivot 1 */
@ -301,6 +301,46 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
pivoti2 += !mask2; pivoti2 += !mask2;
radics2[radixval2] += !mask2; // undec index (!) radics2[radixval2] += !mask2; // undec index (!)
} }
// Finish pivot1 if there are still elements..
while(pivoti1 < boundz.first) {
/* Pivot 1+ */
uint32_t radixval1 = morgrab<j>(a[pivoti1]);
uint32_t targeti1 = --radics1[radixval1]; // dec index (!)
// Bitmask: true -> 11.....1; false -> 00.....0
uint32_t mask1 = ~((targeti1 > pivoti1) - 1);
// Branchless swap (using bitmask)
uint32_t delta1 = (a[pivoti1] ^ a[targeti1]) & mask1;
a[pivoti1] = a[pivoti1] ^ delta1;
a[targeti1] = a[targeti1] ^ delta1;
// "else" branch
pivoti1 += !mask1;
radics1[radixval1] += !mask1; // undec index (!)
}
// Finish pivot2 if there are still elements..
while(pivoti2 < n) {
/* Pivot 2+ */
uint32_t radixval2 = morgrab<j>(a[pivoti2]);
uint32_t targeti2 = --radics2[radixval2]; // dec index (!)
// Bitmask: true -> 11.....1; false -> 00.....0
uint32_t mask2 = ~((targeti2 > pivoti2) - 1);
// Branchless swap (using bitmask)
uint32_t delta2 = (a[pivoti2] ^ a[targeti2]) & mask2;
a[pivoti2] = a[pivoti2] ^ delta2;
a[targeti2] = a[targeti2] ^ delta2;
// "else" branch
pivoti2 += !mask2;
radics2[radixval2] += !mask2; // undec index (!)
}
// Possible recursions // Possible recursions
if constexpr (j != 0) { if constexpr (j != 0) {