From 23a5bb1d552fa77be72243a86b02ffbcc3eedb4d Mon Sep 17 00:00:00 2001 From: Richard Thier Date: Thu, 11 Apr 2024 23:59:13 +0200 Subject: [PATCH] mormordsort ILP version by me - with probably lot of bugs --- ypsu.cpp | 189 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 125 insertions(+), 64 deletions(-) diff --git a/ypsu.cpp b/ypsu.cpp index 63e24aa..85d2db1 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -1,3 +1,4 @@ +#include // std::pair #include #include #include @@ -144,60 +145,70 @@ void twopass(uint32_t *a, int n) { // --index // különben // ++pivot_index +template +static inline bool morbittop(uint32_t elem) noexcept { + return (elem >> (8 * j)) & 0x80; // Only top bit +} +template +static inline uint32_t morgrab(uint32_t elem) noexcept { + return (elem >> (8 * j)) & 0x7f; // Only 7-bit! +} /** - * Divides array into two partitions by its topmost bit. + * Count occurences AND divides array into two partitions by its topmost bit. * - Similar to quicksort partitioning, but changed accordingly. * - MSB 0 bit values will come first partition. - * - The return value tells partition boundary. * * @param a The array to partition and occurence count. * @param n The length of the array. + * @param radics1 A 128-sized array for occurence counting the bottom partition. + * @param radics2 A 128-sized array for occurence counting the top partition. + * @param DIGIT The digit in question (for a morgrab(..) call) + * @returns The partition boundaries - non-inclusive inner ends partitions. Empty partitions accordingly represented! */ -static inline uint32_t bit_partition(uint32_t *a, uint32_t n) noexcept { +template +static inline std::pair oc_bit_partition( + uint32_t *a, uint32_t n, uint32_t *radics1, uint32_t *radics2) noexcept { + // See Hoare's OG quicksort why uint32_t i = -1; uint32_t j = n; while(true) { // Move past well-placed ones - do ++i; while (!(a[i] & 0x80)); - do --j; while (a[j] & 0x80); + // And occurence count them + // Rem.: In quicksort usually a do-while loop + ++i; while ((i < n) && !morbittop(a[i])) { + ++radics1[morgrab(a[i])]; + ++i; + } + --j; while ((0 < j) && morbittop(a[j])) { + ++radics2[morgrab(a[j])]; + --j; + } // If the indices crossed, return - if(i >= j) return j; + // Rem.: Not >= to ensure occ. counts! See also: (*) + if(i > j) return std::make_pair(i, j); // Swap badly placed - uint32_t tmp = a[i]; - a[i] = a[j]; - a[j] = tmp; + // Rem.: No need occurence count here as above loops will handle! + if(i < j) { + uint32_t tmp = a[i]; + a[i] = a[j]; + a[j] = tmp; + } } } template -static inline uint32_t morgrab(uint32_t elem) noexcept { - return (elem >> (8 * j)) & 0xff; -} -template static inline void mormord_sort_impl(uint32_t *a, int n) noexcept { /* Preparation */ - uint32_t radics[256] = {0}; - uint32_t radics2[256] = {0}; + uint32_t radics1[128] = {0}; + uint32_t radics2[128] = {0}; /* [from, to) index: only where prefix sums change - usually nonfull */ - uint32_t real_radics[256 * 2] = {0}; + uint32_t real_radics1[128 * 2] = {0}; + uint32_t real_radics2[128 * 2] = {0}; - /* Occurence counting O(n) */ - /* We can go both down and upwards here to increase ILP or even do SSE2 */ - uint32_t k1 = 0; - uint32_t k2 = (n - 1); - #pragma GCC unroll 64 - for(k1 = 0; k1 < k2; ++k1, --k2) { - ++radics[morgrab(a[k1])]; - ++radics2[morgrab(a[k2])]; - } - if(k1 == k2) { - ++radics[morgrab(a[k1])]; - } - for(int i = 0; i < 256; ++i) { - radics[i] += radics2[i]; - } + // Count occurences and partition by topmost bit + uint32_t n2 = oc_bit_partition(a, n, radics1, radics2) + 1; /* Prefix sum + real radics calc O(256) */ /* Radics: */ @@ -207,48 +218,95 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept { /* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */ /* 0. 1. 2. 4. 5. */ /* (because radix value 3 is not found in input) */ - uint32_t prev = 0; - uint32_t reali = 0; + uint32_t prev1 = 0; + uint32_t reali1 = 0; + uint32_t prev2 = 0; + uint32_t reali2 = 0; #pragma GCC unroll 16 - for(int i = 0; i < 256; ++i) { - if(radics[i] != 0) { - radics[i] += prev; - real_radics[reali] = prev; - real_radics[reali + 1] = radics[i]; - prev = radics[i]; - reali += 2; + for(int i = 0; i < 128; ++i) { + // Hopefully we get more ILP out of this + // Also I tried branchless before adding + // ILP here and it slowed things, so first + // let us try it with branch prediction! + if(radics1[i] != 0) { + radics1[i] += prev1; + real_radics1[reali1] = prev1; + real_radics1[reali1 + 1] = radics1[i]; + prev1 = radics1[i]; + reali1 += 2; } else { - radics[i] += prev; - prev = radics[i]; + radics1[i] += prev1; + prev1 = radics1[i]; + } + if(radics2[i] != 0) { + radics2[i] += prev2; + real_radics2[reali2] = prev2; + real_radics2[reali2 + 1] = radics2[i]; + prev2 = radics2[i]; + reali2 += 2; + } else { + radics2[i] += prev2; + prev2 = radics2[i]; } } - // Inplace swap - uint32_t pivoti = 0; - while(pivoti < n) { - uint32_t radixval = morgrab(a[pivoti]); - uint32_t targeti = radics[radixval] - 1; - if(targeti > pivoti) { - // swap - uint32_t tmp = a[pivoti]; - a[pivoti] = a[targeti]; - a[targeti] = tmp; - // dec index - --radics[radixval]; - } else { - // progress pivot - ++pivoti; - } + // Inplace swap, with added ILP / branchless opt. + // Without it its data dependent like crazy... + uint32_t pivoti1 = 0; + uint32_t pivoti2 = n2; + while((pivoti1 < n2) && (pivoti2 < n)) { + + /* Pivot 1 */ + + uint32_t radixval1 = morgrab(a[pivoti1]); + uint32_t targeti1 = --radics1[radixval1]; // dec index (!) + + // Bitmask: true -> 11.....1; false -> 00.....0 + uint32_t mask1 = ~((targeti1 > pivoti1) - 1); + + // Branchless swap (using bitmask) + uint32_t delta1 = (a[pivoti1] ^ a[targeti1]) & mask1; + a[pivoti1] = a[pivoti1] ^ delta1; + a[targeti1] = a[targeti1] ^ delta1; + + // "else" branch + pivoti1 += !mask1; + radics1[radixval1] += !mask1; // undec index (!) + + /* Pivot 2 */ + + uint32_t radixval2 = morgrab(a[pivoti2]); + uint32_t targeti2 = --radics2[radixval2]; // dec index (!) + + // Bitmask: true -> 11.....1; false -> 00.....0 + uint32_t mask2 = ~((targeti2 > pivoti2) - 1); + + // Branchless swap (using bitmask) + uint32_t delta2 = (a[pivoti2] ^ a[targeti2]) & mask2; + a[pivoti2] = a[pivoti2] ^ delta2; + a[targeti2] = a[targeti2] ^ delta2; + + // "else" branch + pivoti2 += !mask2; + radics2[radixval2] += !mask2; // undec index (!) } - // Ends recursion + // Possible recursions if constexpr (j != 0) { - // Recursion - for(int i = 0; i < reali; i += 2) { + /* Partition 1 recursions */ + for(int i = 0; i < reali1; i += 2) { /* inclusive */ - uint32_t from = real_radics[i]; + uint32_t from = real_radics1[i]; /* non-inclusive */ - uint32_t to = real_radics[i + 1]; + uint32_t to = real_radics1[i + 1]; + mormord_sort_impl(&a[from], (to - (from))); + } + /* Partition 2 recursions */ + for(int i = 0; i < reali2; i += 2) { + /* inclusive */ + uint32_t from = real_radics2[i]; + /* non-inclusive */ + uint32_t to = real_radics2[i + 1]; mormord_sort_impl(&a[from], (to - (from))); } } @@ -654,11 +712,12 @@ void measure_single(int n) { int main(void) { //int n = 100000000; - int n = 10000000; + //int n = 10000000; //int n = 1000000; //int n = 100000; //int n = 10000; //int n = 1000; + int n = 200; //int n = 100; //int n = 10; @@ -686,9 +745,11 @@ int main(void) { w.swap(buf); } }); + /* w = v; measure(inputtype, "magyar", [&] { MagyarSort::sort(&w[0], w.size()); }); assert(w == expected); + */ w = v; measure(inputtype, "mormord", [&] { mormord_sort(&w[0], w.size()); }); assert(w == expected);