mormord ILP-variant "nearly sorting properly", but some values are still buggy
This commit is contained in:
parent
b2d66b7fd0
commit
b2c4e7082b
42
ypsu.cpp
42
ypsu.cpp
@ -265,7 +265,7 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
|
||||
// Without it, it's data-dependent like crazy...
|
||||
uint32_t pivoti1 = 0;
|
||||
uint32_t pivoti2 = boundz.second;
|
||||
while((pivoti1 < boundz.first) && (pivoti2 < n)) { // FIXME: needs two more "finisher-loops" behind this!!!
|
||||
while((pivoti1 < boundz.first) && (pivoti2 < n)) {
|
||||
|
||||
/* Pivot 1 */
|
||||
|
||||
@ -301,6 +301,46 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
|
||||
pivoti2 += !mask2;
|
||||
radics2[radixval2] += !mask2; // undec index (!)
|
||||
}
|
||||
// Finish pivot1 if there are still elements left.
|
||||
while(pivoti1 < boundz.first) {
|
||||
|
||||
/* Pivot 1+ */
|
||||
|
||||
uint32_t radixval1 = morgrab<j>(a[pivoti1]);
|
||||
uint32_t targeti1 = --radics1[radixval1]; // dec index (!)
|
||||
|
||||
// Bitmask: true -> 11.....1; false -> 00.....0
|
||||
uint32_t mask1 = ~((targeti1 > pivoti1) - 1);
|
||||
|
||||
// Branchless swap (using bitmask)
|
||||
uint32_t delta1 = (a[pivoti1] ^ a[targeti1]) & mask1;
|
||||
a[pivoti1] = a[pivoti1] ^ delta1;
|
||||
a[targeti1] = a[targeti1] ^ delta1;
|
||||
|
||||
// "else" branch
|
||||
pivoti1 += !mask1;
|
||||
radics1[radixval1] += !mask1; // undec index (!)
|
||||
}
|
||||
// Finish pivot2 if there are still elements left.
|
||||
while(pivoti2 < n) {
|
||||
|
||||
/* Pivot 2+ */
|
||||
|
||||
uint32_t radixval2 = morgrab<j>(a[pivoti2]);
|
||||
uint32_t targeti2 = --radics2[radixval2]; // dec index (!)
|
||||
|
||||
// Bitmask: true -> 11.....1; false -> 00.....0
|
||||
uint32_t mask2 = ~((targeti2 > pivoti2) - 1);
|
||||
|
||||
// Branchless swap (using bitmask)
|
||||
uint32_t delta2 = (a[pivoti2] ^ a[targeti2]) & mask2;
|
||||
a[pivoti2] = a[pivoti2] ^ delta2;
|
||||
a[targeti2] = a[targeti2] ^ delta2;
|
||||
|
||||
// "else" branch
|
||||
pivoti2 += !mask2;
|
||||
radics2[radixval2] += !mask2; // undec index (!)
|
||||
}
|
||||
|
||||
// Possible recursions
|
||||
if constexpr (j != 0) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user