mormordsort ILP version by me - with probably lot of bugs

This commit is contained in:
Richard Thier 2024-04-11 23:59:13 +02:00
parent 0f716e912c
commit 23a5bb1d55

189
ypsu.cpp
View File

@ -1,3 +1,4 @@
#include <utility> // std::pair
#include <algorithm>
#include <cassert>
#include <chrono>
@ -144,60 +145,70 @@ void twopass(uint32_t *a, int n) {
// --index
// különben
// ++pivot_index
template<int j>
static inline bool morbittop(uint32_t elem) noexcept {
return (elem >> (8 * j)) & 0x80; // Only top bit
}
template<int j>
static inline uint32_t morgrab(uint32_t elem) noexcept {
return (elem >> (8 * j)) & 0x7f; // Only 7-bit!
}
/**
* Divides array into two partitions by its topmost bit.
* Count occurences AND divides array into two partitions by its topmost bit.
* - Similar to quicksort partitioning, but changed accordingly.
* - MSB 0 bit values will come first partition.
* - The return value tells partition boundary.
*
* @param a The array to partition and occurence count.
* @param n The length of the array.
* @param radics1 A 128-sized array for occurence counting the bottom partition.
* @param radics2 A 128-sized array for occurence counting the top partition.
* @param DIGIT The digit in question (for a morgrab<DIGIT>(..) call)
* @returns The partition boundaries - non-inclusive inner ends partitions. Empty partitions accordingly represented!
*/
static inline uint32_t bit_partition(uint32_t *a, uint32_t n) noexcept {
template<int DIGIT>
static inline std::pair<uint32_t, uint32_t> oc_bit_partition(
uint32_t *a, uint32_t n, uint32_t *radics1, uint32_t *radics2) noexcept {
// See Hoare's OG quicksort why
uint32_t i = -1;
uint32_t j = n;
while(true) {
// Move past well-placed ones
do ++i; while (!(a[i] & 0x80));
do --j; while (a[j] & 0x80);
// And occurence count them
// Rem.: In quicksort usually a do-while loop
++i; while ((i < n) && !morbittop<DIGIT>(a[i])) {
++radics1[morgrab<DIGIT>(a[i])];
++i;
}
--j; while ((0 < j) && morbittop<DIGIT>(a[j])) {
++radics2[morgrab<DIGIT>(a[j])];
--j;
}
// If the indices crossed, return
if(i >= j) return j;
// Rem.: Not >= to ensure occ. counts! See also: (*)
if(i > j) return std::make_pair(i, j);
// Swap badly placed
uint32_t tmp = a[i];
a[i] = a[j];
a[j] = tmp;
// Rem.: No need occurence count here as above loops will handle!
if(i < j) {
uint32_t tmp = a[i];
a[i] = a[j];
a[j] = tmp;
}
}
}
template<int j>
static inline uint32_t morgrab(uint32_t elem) noexcept {
return (elem >> (8 * j)) & 0xff;
}
template<int j>
static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
/* Preparation */
uint32_t radics[256] = {0};
uint32_t radics2[256] = {0};
uint32_t radics1[128] = {0};
uint32_t radics2[128] = {0};
/* [from, to) index: only where prefix sums change - usually nonfull */
uint32_t real_radics[256 * 2] = {0};
uint32_t real_radics1[128 * 2] = {0};
uint32_t real_radics2[128 * 2] = {0};
/* Occurence counting O(n) */
/* We can go both down and upwards here to increase ILP or even do SSE2 */
uint32_t k1 = 0;
uint32_t k2 = (n - 1);
#pragma GCC unroll 64
for(k1 = 0; k1 < k2; ++k1, --k2) {
++radics[morgrab<j>(a[k1])];
++radics2[morgrab<j>(a[k2])];
}
if(k1 == k2) {
++radics[morgrab<j>(a[k1])];
}
for(int i = 0; i < 256; ++i) {
radics[i] += radics2[i];
}
// Count occurences and partition by topmost bit
uint32_t n2 = oc_bit_partition<j>(a, n, radics1, radics2) + 1;
/* Prefix sum + real radics calc O(256) */
/* Radics: */
@ -207,48 +218,95 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
/* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */
/* 0. 1. 2. 4. 5. */
/* (because radix value 3 is not found in input) */
uint32_t prev = 0;
uint32_t reali = 0;
uint32_t prev1 = 0;
uint32_t reali1 = 0;
uint32_t prev2 = 0;
uint32_t reali2 = 0;
#pragma GCC unroll 16
for(int i = 0; i < 256; ++i) {
if(radics[i] != 0) {
radics[i] += prev;
real_radics[reali] = prev;
real_radics[reali + 1] = radics[i];
prev = radics[i];
reali += 2;
for(int i = 0; i < 128; ++i) {
// Hopefully we get more ILP out of this
// Also I tried branchless before adding
// ILP here and it slowed things, so first
// let us try it with branch prediction!
if(radics1[i] != 0) {
radics1[i] += prev1;
real_radics1[reali1] = prev1;
real_radics1[reali1 + 1] = radics1[i];
prev1 = radics1[i];
reali1 += 2;
} else {
radics[i] += prev;
prev = radics[i];
radics1[i] += prev1;
prev1 = radics1[i];
}
if(radics2[i] != 0) {
radics2[i] += prev2;
real_radics2[reali2] = prev2;
real_radics2[reali2 + 1] = radics2[i];
prev2 = radics2[i];
reali2 += 2;
} else {
radics2[i] += prev2;
prev2 = radics2[i];
}
}
// Inplace swap
uint32_t pivoti = 0;
while(pivoti < n) {
uint32_t radixval = morgrab<j>(a[pivoti]);
uint32_t targeti = radics[radixval] - 1;
if(targeti > pivoti) {
// swap
uint32_t tmp = a[pivoti];
a[pivoti] = a[targeti];
a[targeti] = tmp;
// dec index
--radics[radixval];
} else {
// progress pivot
++pivoti;
}
// Inplace swap, with added ILP / branchless opt.
// Without it its data dependent like crazy...
uint32_t pivoti1 = 0;
uint32_t pivoti2 = n2;
while((pivoti1 < n2) && (pivoti2 < n)) {
/* Pivot 1 */
uint32_t radixval1 = morgrab<j>(a[pivoti1]);
uint32_t targeti1 = --radics1[radixval1]; // dec index (!)
// Bitmask: true -> 11.....1; false -> 00.....0
uint32_t mask1 = ~((targeti1 > pivoti1) - 1);
// Branchless swap (using bitmask)
uint32_t delta1 = (a[pivoti1] ^ a[targeti1]) & mask1;
a[pivoti1] = a[pivoti1] ^ delta1;
a[targeti1] = a[targeti1] ^ delta1;
// "else" branch
pivoti1 += !mask1;
radics1[radixval1] += !mask1; // undec index (!)
/* Pivot 2 */
uint32_t radixval2 = morgrab<j>(a[pivoti2]);
uint32_t targeti2 = --radics2[radixval2]; // dec index (!)
// Bitmask: true -> 11.....1; false -> 00.....0
uint32_t mask2 = ~((targeti2 > pivoti2) - 1);
// Branchless swap (using bitmask)
uint32_t delta2 = (a[pivoti2] ^ a[targeti2]) & mask2;
a[pivoti2] = a[pivoti2] ^ delta2;
a[targeti2] = a[targeti2] ^ delta2;
// "else" branch
pivoti2 += !mask2;
radics2[radixval2] += !mask2; // undec index (!)
}
// Ends recursion
// Possible recursions
if constexpr (j != 0) {
// Recursion
for(int i = 0; i < reali; i += 2) {
/* Partition 1 recursions */
for(int i = 0; i < reali1; i += 2) {
/* inclusive */
uint32_t from = real_radics[i];
uint32_t from = real_radics1[i];
/* non-inclusive */
uint32_t to = real_radics[i + 1];
uint32_t to = real_radics1[i + 1];
mormord_sort_impl<j - 1>(&a[from], (to - (from)));
}
/* Partition 2 recursions */
for(int i = 0; i < reali2; i += 2) {
/* inclusive */
uint32_t from = real_radics2[i];
/* non-inclusive */
uint32_t to = real_radics2[i + 1];
mormord_sort_impl<j - 1>(&a[from], (to - (from)));
}
}
@ -654,11 +712,12 @@ void measure_single(int n) {
int main(void) {
//int n = 100000000;
int n = 10000000;
//int n = 10000000;
//int n = 1000000;
//int n = 100000;
//int n = 10000;
//int n = 1000;
int n = 200;
//int n = 100;
//int n = 10;
@ -686,9 +745,11 @@ int main(void) {
w.swap(buf);
}
});
/*
w = v;
measure(inputtype, "magyar", [&] { MagyarSort::sort<uint32_t>(&w[0], w.size()); });
assert(w == expected);
*/
w = v;
measure(inputtype, "mormord", [&] { mormord_sort(&w[0], w.size()); });
assert(w == expected);