some fixes for mormord-ilp-richi

This commit is contained in:
Richard Thier 2024-04-12 00:37:50 +02:00
parent 23a5bb1d55
commit b2d66b7fd0

View File

@ -163,38 +163,49 @@ static inline uint32_t morgrab(uint32_t elem) noexcept {
* @param radics1 A 128-sized array for occurrence-counting the bottom partition.
* @param radics2 A 128-sized array for occurrence-counting the top partition.
* @param DIGIT The digit in question (for a morgrab<DIGIT>(..) call)
* @returns The partition boundaries - non-inclusive inner ends partitions. Empty partitions accordingly represented!
* @returns The partition bounds are: [0..first) and [second..n) with logical means to mark empty partitions.
*/
template<int DIGIT>
static inline std::pair<uint32_t, uint32_t> oc_bit_partition(
uint32_t *a, uint32_t n, uint32_t *radics1, uint32_t *radics2) noexcept {
// Purpose: partition a[0..n) in place by morbittop<DIGIT>(elem) (false-elements first,
// true-elements last) while bumping radics1[morgrab<DIGIT>(elem)] for every element that
// ends up in the bottom part and radics2[morgrab<DIGIT>(elem)] for every one in the top part.
//
// NOTE(review): this listing is a commit diff rendered without +/- markers, so several
// statements appear twice in a row (two declarations of i and j, two loop headers per scan,
// two return statements). Presumably the first of each pair is the pre-commit line and the
// second is the line this commit introduces - TODO confirm against the repository.
//
// See Hoare's OG quicksort why
uint32_t i = -1;
uint32_t j = n;
// Signed 64-bit variant: lets j step below zero without wrapping around.
// NOTE(review): n is uint32_t, so on usual platforms `n - 1` is evaluated in unsigned
// arithmetic; for n == 0 it wraps to 4294967295 before being widened to int64_t, so j
// would be huge instead of -1 and the scans below would read a[] out of bounds.
// Widening first (static_cast<int64_t>(n) - 1) would avoid that - TODO confirm whether
// callers can pass n == 0.
int64_t i = 0;
int64_t j = n - 1;
while(true) {
// Move past well-placed ones
// And occurrence count them
// Rem.: In quicksort usually a do-while loop
++i; while ((i < n) && !morbittop<DIGIT>(a[i])) {
// Variant bounded by (i < j): stops at the latest when the two indices meet,
// so the element at i == j is NOT counted by this loop.
while ((i < j) && !morbittop<DIGIT>(a[i])) {
++radics1[morgrab<DIGIT>(a[i])];
++i;
}
--j; while ((0 < j) && morbittop<DIGIT>(a[j])) {
// Mirror image of the scan above: walks j downward over elements for which
// morbittop<DIGIT> is true and counts them into radics2.
while ((i < j) && morbittop<DIGIT>(a[j])) {
++radics2[morgrab<DIGIT>(a[j])];
--j;
}
// If the indices crossed, return
// Rem.: Not >= to ensure occ. counts! See also: (*)
if(i > j) return std::make_pair(i, j);
// Variant returning (i, j + 1): with the (i < j)-bounded scans, i > j is only reached
// through the i == j branch below, which leaves i == j + 1, so both pair members are
// then the same index.
if(i > j) return std::make_pair(i, j + 1);
// Swap badly placed
// Rem.: No need occurrence count here as above loops will handle!
// Check for swap
if(i < j) {
// Swap
// No need occurrence count here as above loops will handle!
uint32_t tmp = a[i];
a[i] = a[j];
a[j] = tmp;
} else {
// i == j case: count occurrence properly for the one.
// Neither (i < j)-bounded scan has counted this last element, so it is counted
// here and the matching index is stepped to make i > j hold on the next pass.
if(!morbittop<DIGIT>(a[j])) {
++radics1[morgrab<DIGIT>(a[i])];
++i;
} else {
++radics2[morgrab<DIGIT>(a[j])];
--j;
}
}
}
}
@ -208,7 +219,7 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
uint32_t real_radics2[128 * 2] = {0};
// Count occurrences and partition by topmost bit
uint32_t n2 = oc_bit_partition<j>(a, n, radics1, radics2) + 1;
std::pair<uint32_t, uint32_t> boundz = oc_bit_partition<j>(a, n, radics1, radics2);
/* Prefix sum + real radics calc O(256) */
/* Radics: */
@ -253,8 +264,8 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
// Inplace swap, with added ILP / branchless opt.
// Without it, it's data dependent like crazy...
uint32_t pivoti1 = 0;
uint32_t pivoti2 = n2;
while((pivoti1 < n2) && (pivoti2 < n)) {
uint32_t pivoti2 = boundz.second;
while((pivoti1 < boundz.first) && (pivoti2 < n)) { // FIXME: needs two more "finisher-loops" behind this!!!
/* Pivot 1 */