diff --git a/ypsu.cpp b/ypsu.cpp index 85d2db1..8f6a25d 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -163,38 +163,49 @@ static inline uint32_t morgrab(uint32_t elem) noexcept { * @param radics1 A 128-sized array for occurence counting the bottom partition. * @param radics2 A 128-sized array for occurence counting the top partition. * @param DIGIT The digit in question (for a morgrab(..) call) - * @returns The partition boundaries - non-inclusive inner ends partitions. Empty partitions accordingly represented! + * @returns The partition bounds are: [0..first) and [second..n) with logical means to mark empty partitions. */ template static inline std::pair oc_bit_partition( uint32_t *a, uint32_t n, uint32_t *radics1, uint32_t *radics2) noexcept { // See Hoare's OG quicksort why - uint32_t i = -1; - uint32_t j = n; + int64_t i = 0; + int64_t j = n - 1; while(true) { // Move past well-placed ones // And occurence count them // Rem.: In quicksort usually a do-while loop - ++i; while ((i < n) && !morbittop(a[i])) { + while ((i < j) && !morbittop(a[i])) { ++radics1[morgrab(a[i])]; ++i; } - --j; while ((0 < j) && morbittop(a[j])) { + while ((i < j) && morbittop(a[j])) { ++radics2[morgrab(a[j])]; --j; } // If the indices crossed, return // Rem.: Not >= to ensure occ. counts! See also: (*) - if(i > j) return std::make_pair(i, j); + if(i > j) return std::make_pair(i, j + 1); - // Swap badly placed - // Rem.: No need occurence count here as above loops will handle! + // Check for swap if(i < j) { + // Swap + // No need occurence count here as above loops will handle! uint32_t tmp = a[i]; a[i] = a[j]; a[j] = tmp; + } else { + // i == j case: count occurence properly for the one. + if(!morbittop(a[j])) { + ++radics1[morgrab(a[i])]; + ++i; + } else { + ++radics2[morgrab(a[j])]; + --j; + } + } } } @@ -208,7 +219,7 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept { uint32_t real_radics2[128 * 2] = {0}; // Count occurences and partition by topmost bit - uint32_t n2 = oc_bit_partition(a, n, radics1, radics2) + 1; + std::pair boundz = oc_bit_partition(a, n, radics1, radics2); /* Prefix sum + real radics calc O(256) */ /* Radics: */ @@ -253,8 +264,8 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept { // Inplace swap, with added ILP / branchless opt. // Without it its data dependent like crazy... uint32_t pivoti1 = 0; - uint32_t pivoti2 = n2; - while((pivoti1 < n2) && (pivoti2 < n)) { + uint32_t pivoti2 = boundz.second; + while((pivoti1 < boundz.first) && (pivoti2 < n)) { // FIXME: needs two more "finisher-loops" behind this!!! /* Pivot 1 */