mormordsort ILP version by me - with probably lot of bugs
This commit is contained in:
parent
0f716e912c
commit
23a5bb1d55
189
ypsu.cpp
189
ypsu.cpp
@ -1,3 +1,4 @@
|
||||
#include <utility> // std::pair
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <chrono>
|
||||
@ -144,60 +145,70 @@ void twopass(uint32_t *a, int n) {
|
||||
// --index
|
||||
// különben
|
||||
// ++pivot_index
|
||||
template<int j>
|
||||
static inline bool morbittop(uint32_t elem) noexcept {
|
||||
return (elem >> (8 * j)) & 0x80; // Only top bit
|
||||
}
|
||||
template<int j>
|
||||
static inline uint32_t morgrab(uint32_t elem) noexcept {
|
||||
return (elem >> (8 * j)) & 0x7f; // Only 7-bit!
|
||||
}
|
||||
/**
|
||||
* Divides array into two partitions by its topmost bit.
|
||||
* Count occurences AND divides array into two partitions by its topmost bit.
|
||||
* - Similar to quicksort partitioning, but changed accordingly.
|
||||
* - MSB 0 bit values will come first partition.
|
||||
* - The return value tells partition boundary.
|
||||
*
|
||||
* @param a The array to partition and occurence count.
|
||||
* @param n The length of the array.
|
||||
* @param radics1 A 128-sized array for occurence counting the bottom partition.
|
||||
* @param radics2 A 128-sized array for occurence counting the top partition.
|
||||
* @param DIGIT The digit in question (for a morgrab<DIGIT>(..) call)
|
||||
* @returns The partition boundaries - non-inclusive inner ends partitions. Empty partitions accordingly represented!
|
||||
*/
|
||||
static inline uint32_t bit_partition(uint32_t *a, uint32_t n) noexcept {
|
||||
template<int DIGIT>
|
||||
static inline std::pair<uint32_t, uint32_t> oc_bit_partition(
|
||||
uint32_t *a, uint32_t n, uint32_t *radics1, uint32_t *radics2) noexcept {
|
||||
// See Hoare's OG quicksort why
|
||||
uint32_t i = -1;
|
||||
uint32_t j = n;
|
||||
|
||||
while(true) {
|
||||
// Move past well-placed ones
|
||||
do ++i; while (!(a[i] & 0x80));
|
||||
do --j; while (a[j] & 0x80);
|
||||
// And occurence count them
|
||||
// Rem.: In quicksort usually a do-while loop
|
||||
++i; while ((i < n) && !morbittop<DIGIT>(a[i])) {
|
||||
++radics1[morgrab<DIGIT>(a[i])];
|
||||
++i;
|
||||
}
|
||||
--j; while ((0 < j) && morbittop<DIGIT>(a[j])) {
|
||||
++radics2[morgrab<DIGIT>(a[j])];
|
||||
--j;
|
||||
}
|
||||
|
||||
// If the indices crossed, return
|
||||
if(i >= j) return j;
|
||||
// Rem.: Not >= to ensure occ. counts! See also: (*)
|
||||
if(i > j) return std::make_pair(i, j);
|
||||
|
||||
// Swap badly placed
|
||||
uint32_t tmp = a[i];
|
||||
a[i] = a[j];
|
||||
a[j] = tmp;
|
||||
// Rem.: No need occurence count here as above loops will handle!
|
||||
if(i < j) {
|
||||
uint32_t tmp = a[i];
|
||||
a[i] = a[j];
|
||||
a[j] = tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
template<int j>
|
||||
static inline uint32_t morgrab(uint32_t elem) noexcept {
|
||||
return (elem >> (8 * j)) & 0xff;
|
||||
}
|
||||
template<int j>
|
||||
static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
|
||||
/* Preparation */
|
||||
uint32_t radics[256] = {0};
|
||||
uint32_t radics2[256] = {0};
|
||||
uint32_t radics1[128] = {0};
|
||||
uint32_t radics2[128] = {0};
|
||||
/* [from, to) index: only where prefix sums change - usually nonfull */
|
||||
uint32_t real_radics[256 * 2] = {0};
|
||||
uint32_t real_radics1[128 * 2] = {0};
|
||||
uint32_t real_radics2[128 * 2] = {0};
|
||||
|
||||
/* Occurence counting O(n) */
|
||||
/* We can go both down and upwards here to increase ILP or even do SSE2 */
|
||||
uint32_t k1 = 0;
|
||||
uint32_t k2 = (n - 1);
|
||||
#pragma GCC unroll 64
|
||||
for(k1 = 0; k1 < k2; ++k1, --k2) {
|
||||
++radics[morgrab<j>(a[k1])];
|
||||
++radics2[morgrab<j>(a[k2])];
|
||||
}
|
||||
if(k1 == k2) {
|
||||
++radics[morgrab<j>(a[k1])];
|
||||
}
|
||||
for(int i = 0; i < 256; ++i) {
|
||||
radics[i] += radics2[i];
|
||||
}
|
||||
// Count occurences and partition by topmost bit
|
||||
uint32_t n2 = oc_bit_partition<j>(a, n, radics1, radics2) + 1;
|
||||
|
||||
/* Prefix sum + real radics calc O(256) */
|
||||
/* Radics: */
|
||||
@ -207,48 +218,95 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
|
||||
/* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */
|
||||
/* 0. 1. 2. 4. 5. */
|
||||
/* (because radix value 3 is not found in input) */
|
||||
uint32_t prev = 0;
|
||||
uint32_t reali = 0;
|
||||
uint32_t prev1 = 0;
|
||||
uint32_t reali1 = 0;
|
||||
uint32_t prev2 = 0;
|
||||
uint32_t reali2 = 0;
|
||||
#pragma GCC unroll 16
|
||||
for(int i = 0; i < 256; ++i) {
|
||||
if(radics[i] != 0) {
|
||||
radics[i] += prev;
|
||||
real_radics[reali] = prev;
|
||||
real_radics[reali + 1] = radics[i];
|
||||
prev = radics[i];
|
||||
reali += 2;
|
||||
for(int i = 0; i < 128; ++i) {
|
||||
// Hopefully we get more ILP out of this
|
||||
// Also I tried branchless before adding
|
||||
// ILP here and it slowed things, so first
|
||||
// let us try it with branch prediction!
|
||||
if(radics1[i] != 0) {
|
||||
radics1[i] += prev1;
|
||||
real_radics1[reali1] = prev1;
|
||||
real_radics1[reali1 + 1] = radics1[i];
|
||||
prev1 = radics1[i];
|
||||
reali1 += 2;
|
||||
} else {
|
||||
radics[i] += prev;
|
||||
prev = radics[i];
|
||||
radics1[i] += prev1;
|
||||
prev1 = radics1[i];
|
||||
}
|
||||
if(radics2[i] != 0) {
|
||||
radics2[i] += prev2;
|
||||
real_radics2[reali2] = prev2;
|
||||
real_radics2[reali2 + 1] = radics2[i];
|
||||
prev2 = radics2[i];
|
||||
reali2 += 2;
|
||||
} else {
|
||||
radics2[i] += prev2;
|
||||
prev2 = radics2[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Inplace swap
|
||||
uint32_t pivoti = 0;
|
||||
while(pivoti < n) {
|
||||
uint32_t radixval = morgrab<j>(a[pivoti]);
|
||||
uint32_t targeti = radics[radixval] - 1;
|
||||
if(targeti > pivoti) {
|
||||
// swap
|
||||
uint32_t tmp = a[pivoti];
|
||||
a[pivoti] = a[targeti];
|
||||
a[targeti] = tmp;
|
||||
// dec index
|
||||
--radics[radixval];
|
||||
} else {
|
||||
// progress pivot
|
||||
++pivoti;
|
||||
}
|
||||
// Inplace swap, with added ILP / branchless opt.
|
||||
// Without it its data dependent like crazy...
|
||||
uint32_t pivoti1 = 0;
|
||||
uint32_t pivoti2 = n2;
|
||||
while((pivoti1 < n2) && (pivoti2 < n)) {
|
||||
|
||||
/* Pivot 1 */
|
||||
|
||||
uint32_t radixval1 = morgrab<j>(a[pivoti1]);
|
||||
uint32_t targeti1 = --radics1[radixval1]; // dec index (!)
|
||||
|
||||
// Bitmask: true -> 11.....1; false -> 00.....0
|
||||
uint32_t mask1 = ~((targeti1 > pivoti1) - 1);
|
||||
|
||||
// Branchless swap (using bitmask)
|
||||
uint32_t delta1 = (a[pivoti1] ^ a[targeti1]) & mask1;
|
||||
a[pivoti1] = a[pivoti1] ^ delta1;
|
||||
a[targeti1] = a[targeti1] ^ delta1;
|
||||
|
||||
// "else" branch
|
||||
pivoti1 += !mask1;
|
||||
radics1[radixval1] += !mask1; // undec index (!)
|
||||
|
||||
/* Pivot 2 */
|
||||
|
||||
uint32_t radixval2 = morgrab<j>(a[pivoti2]);
|
||||
uint32_t targeti2 = --radics2[radixval2]; // dec index (!)
|
||||
|
||||
// Bitmask: true -> 11.....1; false -> 00.....0
|
||||
uint32_t mask2 = ~((targeti2 > pivoti2) - 1);
|
||||
|
||||
// Branchless swap (using bitmask)
|
||||
uint32_t delta2 = (a[pivoti2] ^ a[targeti2]) & mask2;
|
||||
a[pivoti2] = a[pivoti2] ^ delta2;
|
||||
a[targeti2] = a[targeti2] ^ delta2;
|
||||
|
||||
// "else" branch
|
||||
pivoti2 += !mask2;
|
||||
radics2[radixval2] += !mask2; // undec index (!)
|
||||
}
|
||||
|
||||
// Ends recursion
|
||||
// Possible recursions
|
||||
if constexpr (j != 0) {
|
||||
// Recursion
|
||||
for(int i = 0; i < reali; i += 2) {
|
||||
/* Partition 1 recursions */
|
||||
for(int i = 0; i < reali1; i += 2) {
|
||||
/* inclusive */
|
||||
uint32_t from = real_radics[i];
|
||||
uint32_t from = real_radics1[i];
|
||||
/* non-inclusive */
|
||||
uint32_t to = real_radics[i + 1];
|
||||
uint32_t to = real_radics1[i + 1];
|
||||
mormord_sort_impl<j - 1>(&a[from], (to - (from)));
|
||||
}
|
||||
/* Partition 2 recursions */
|
||||
for(int i = 0; i < reali2; i += 2) {
|
||||
/* inclusive */
|
||||
uint32_t from = real_radics2[i];
|
||||
/* non-inclusive */
|
||||
uint32_t to = real_radics2[i + 1];
|
||||
mormord_sort_impl<j - 1>(&a[from], (to - (from)));
|
||||
}
|
||||
}
|
||||
@ -654,11 +712,12 @@ void measure_single(int n) {
|
||||
|
||||
int main(void) {
|
||||
//int n = 100000000;
|
||||
int n = 10000000;
|
||||
//int n = 10000000;
|
||||
//int n = 1000000;
|
||||
//int n = 100000;
|
||||
//int n = 10000;
|
||||
//int n = 1000;
|
||||
int n = 200;
|
||||
//int n = 100;
|
||||
//int n = 10;
|
||||
|
||||
@ -686,9 +745,11 @@ int main(void) {
|
||||
w.swap(buf);
|
||||
}
|
||||
});
|
||||
/*
|
||||
w = v;
|
||||
measure(inputtype, "magyar", [&] { MagyarSort::sort<uint32_t>(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
*/
|
||||
w = v;
|
||||
measure(inputtype, "mormord", [&] { mormord_sort(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user