mormordsort ILP version by me - probably with a lot of bugs
This commit is contained in:
parent
0f716e912c
commit
23a5bb1d55
189
ypsu.cpp
189
ypsu.cpp
@ -1,3 +1,4 @@
|
|||||||
|
#include <utility> // std::pair
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
@ -144,60 +145,70 @@ void twopass(uint32_t *a, int n) {
|
|||||||
// --index
|
// --index
|
||||||
// otherwise
|
// otherwise
|
||||||
// ++pivot_index
|
// ++pivot_index
|
||||||
|
template<int j>
|
||||||
|
static inline bool morbittop(uint32_t elem) noexcept {
|
||||||
|
return (elem >> (8 * j)) & 0x80; // Only top bit
|
||||||
|
}
|
||||||
|
template<int j>
|
||||||
|
static inline uint32_t morgrab(uint32_t elem) noexcept {
|
||||||
|
return (elem >> (8 * j)) & 0x7f; // Only 7-bit!
|
||||||
|
}
|
||||||
/**
|
/**
|
||||||
* Divides array into two partitions by its topmost bit.
|
* Counts occurrences AND divides the array into two partitions by its topmost bit.
|
||||||
* - Similar to quicksort partitioning, but changed accordingly.
|
* - Similar to quicksort partitioning, but changed accordingly.
|
||||||
* - Values whose MSB is 0 go into the first partition.
|
* - Values whose MSB is 0 go into the first partition.
|
||||||
* - The return value tells partition boundary.
|
|
||||||
*
|
*
|
||||||
* @param a The array to partition and occurrence-count.
|
* @param a The array to partition and occurrence-count.
|
||||||
* @param n The length of the array.
|
* @param n The length of the array.
|
||||||
|
* @param radics1 A 128-sized array for occurrence counting in the bottom partition.
|
||||||
|
* @param radics2 A 128-sized array for occurrence counting in the top partition.
|
||||||
|
* @param DIGIT The digit in question (for a morgrab<DIGIT>(..) call)
|
||||||
|
* @returns The partition boundaries: the non-inclusive inner ends of the two partitions. Empty partitions are represented accordingly!
|
||||||
*/
|
*/
|
||||||
static inline uint32_t bit_partition(uint32_t *a, uint32_t n) noexcept {
|
template<int DIGIT>
|
||||||
|
static inline std::pair<uint32_t, uint32_t> oc_bit_partition(
|
||||||
|
uint32_t *a, uint32_t n, uint32_t *radics1, uint32_t *radics2) noexcept {
|
||||||
|
// See Hoare's original quicksort for why
|
||||||
uint32_t i = -1;
|
uint32_t i = -1;
|
||||||
uint32_t j = n;
|
uint32_t j = n;
|
||||||
|
|
||||||
while(true) {
|
while(true) {
|
||||||
// Move past well-placed ones
|
// Move past well-placed ones
|
||||||
do ++i; while (!(a[i] & 0x80));
|
// And occurrence-count them
|
||||||
do --j; while (a[j] & 0x80);
|
// Rem.: In quicksort usually a do-while loop
|
||||||
|
++i; while ((i < n) && !morbittop<DIGIT>(a[i])) {
|
||||||
|
++radics1[morgrab<DIGIT>(a[i])];
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
--j; while ((0 < j) && morbittop<DIGIT>(a[j])) {
|
||||||
|
++radics2[morgrab<DIGIT>(a[j])];
|
||||||
|
--j;
|
||||||
|
}
|
||||||
|
|
||||||
// If the indices crossed, return
|
// If the indices crossed, return
|
||||||
if(i >= j) return j;
|
// Rem.: Not >= to ensure occ. counts! See also: (*)
|
||||||
|
if(i > j) return std::make_pair(i, j);
|
||||||
|
|
||||||
// Swap badly placed
|
// Swap badly placed
|
||||||
uint32_t tmp = a[i];
|
// Rem.: No need to occurrence-count here, as the loops above will handle it!
|
||||||
a[i] = a[j];
|
if(i < j) {
|
||||||
a[j] = tmp;
|
uint32_t tmp = a[i];
|
||||||
|
a[i] = a[j];
|
||||||
|
a[j] = tmp;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<int j>
|
template<int j>
|
||||||
static inline uint32_t morgrab(uint32_t elem) noexcept {
|
|
||||||
return (elem >> (8 * j)) & 0xff;
|
|
||||||
}
|
|
||||||
template<int j>
|
|
||||||
static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
|
static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
|
||||||
/* Preparation */
|
/* Preparation */
|
||||||
uint32_t radics[256] = {0};
|
uint32_t radics1[128] = {0};
|
||||||
uint32_t radics2[256] = {0};
|
uint32_t radics2[128] = {0};
|
||||||
/* [from, to) index: only where prefix sums change - usually nonfull */
|
/* [from, to) index: only where prefix sums change - usually nonfull */
|
||||||
uint32_t real_radics[256 * 2] = {0};
|
uint32_t real_radics1[128 * 2] = {0};
|
||||||
|
uint32_t real_radics2[128 * 2] = {0};
|
||||||
|
|
||||||
/* Occurence counting O(n) */
|
// Count occurrences and partition by topmost bit
|
||||||
/* We can go both down and upwards here to increase ILP or even do SSE2 */
|
uint32_t n2 = oc_bit_partition<j>(a, n, radics1, radics2) + 1;
|
||||||
uint32_t k1 = 0;
|
|
||||||
uint32_t k2 = (n - 1);
|
|
||||||
#pragma GCC unroll 64
|
|
||||||
for(k1 = 0; k1 < k2; ++k1, --k2) {
|
|
||||||
++radics[morgrab<j>(a[k1])];
|
|
||||||
++radics2[morgrab<j>(a[k2])];
|
|
||||||
}
|
|
||||||
if(k1 == k2) {
|
|
||||||
++radics[morgrab<j>(a[k1])];
|
|
||||||
}
|
|
||||||
for(int i = 0; i < 256; ++i) {
|
|
||||||
radics[i] += radics2[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Prefix sum + real radics calc O(256) */
|
/* Prefix sum + real radics calc O(256) */
|
||||||
/* Radics: */
|
/* Radics: */
|
||||||
@ -207,48 +218,95 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
|
|||||||
/* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */
|
/* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */
|
||||||
/* 0. 1. 2. 4. 5. */
|
/* 0. 1. 2. 4. 5. */
|
||||||
/* (because radix value 3 is not found in input) */
|
/* (because radix value 3 is not found in input) */
|
||||||
uint32_t prev = 0;
|
uint32_t prev1 = 0;
|
||||||
uint32_t reali = 0;
|
uint32_t reali1 = 0;
|
||||||
|
uint32_t prev2 = 0;
|
||||||
|
uint32_t reali2 = 0;
|
||||||
#pragma GCC unroll 16
|
#pragma GCC unroll 16
|
||||||
for(int i = 0; i < 256; ++i) {
|
for(int i = 0; i < 128; ++i) {
|
||||||
if(radics[i] != 0) {
|
// Hopefully we get more ILP out of this
|
||||||
radics[i] += prev;
|
// Also I tried branchless before adding
|
||||||
real_radics[reali] = prev;
|
// ILP here and it slowed things, so first
|
||||||
real_radics[reali + 1] = radics[i];
|
// let us try it with branch prediction!
|
||||||
prev = radics[i];
|
if(radics1[i] != 0) {
|
||||||
reali += 2;
|
radics1[i] += prev1;
|
||||||
|
real_radics1[reali1] = prev1;
|
||||||
|
real_radics1[reali1 + 1] = radics1[i];
|
||||||
|
prev1 = radics1[i];
|
||||||
|
reali1 += 2;
|
||||||
} else {
|
} else {
|
||||||
radics[i] += prev;
|
radics1[i] += prev1;
|
||||||
prev = radics[i];
|
prev1 = radics1[i];
|
||||||
|
}
|
||||||
|
if(radics2[i] != 0) {
|
||||||
|
radics2[i] += prev2;
|
||||||
|
real_radics2[reali2] = prev2;
|
||||||
|
real_radics2[reali2 + 1] = radics2[i];
|
||||||
|
prev2 = radics2[i];
|
||||||
|
reali2 += 2;
|
||||||
|
} else {
|
||||||
|
radics2[i] += prev2;
|
||||||
|
prev2 = radics2[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Inplace swap
|
// Inplace swap, with added ILP / branchless opt.
|
||||||
uint32_t pivoti = 0;
|
// Without it, it's data-dependent like crazy...
|
||||||
while(pivoti < n) {
|
uint32_t pivoti1 = 0;
|
||||||
uint32_t radixval = morgrab<j>(a[pivoti]);
|
uint32_t pivoti2 = n2;
|
||||||
uint32_t targeti = radics[radixval] - 1;
|
while((pivoti1 < n2) && (pivoti2 < n)) {
|
||||||
if(targeti > pivoti) {
|
|
||||||
// swap
|
/* Pivot 1 */
|
||||||
uint32_t tmp = a[pivoti];
|
|
||||||
a[pivoti] = a[targeti];
|
uint32_t radixval1 = morgrab<j>(a[pivoti1]);
|
||||||
a[targeti] = tmp;
|
uint32_t targeti1 = --radics1[radixval1]; // dec index (!)
|
||||||
// dec index
|
|
||||||
--radics[radixval];
|
// Bitmask: true -> 11.....1; false -> 00.....0
|
||||||
} else {
|
uint32_t mask1 = ~((targeti1 > pivoti1) - 1);
|
||||||
// progress pivot
|
|
||||||
++pivoti;
|
// Branchless swap (using bitmask)
|
||||||
}
|
uint32_t delta1 = (a[pivoti1] ^ a[targeti1]) & mask1;
|
||||||
|
a[pivoti1] = a[pivoti1] ^ delta1;
|
||||||
|
a[targeti1] = a[targeti1] ^ delta1;
|
||||||
|
|
||||||
|
// "else" branch
|
||||||
|
pivoti1 += !mask1;
|
||||||
|
radics1[radixval1] += !mask1; // undec index (!)
|
||||||
|
|
||||||
|
/* Pivot 2 */
|
||||||
|
|
||||||
|
uint32_t radixval2 = morgrab<j>(a[pivoti2]);
|
||||||
|
uint32_t targeti2 = --radics2[radixval2]; // dec index (!)
|
||||||
|
|
||||||
|
// Bitmask: true -> 11.....1; false -> 00.....0
|
||||||
|
uint32_t mask2 = ~((targeti2 > pivoti2) - 1);
|
||||||
|
|
||||||
|
// Branchless swap (using bitmask)
|
||||||
|
uint32_t delta2 = (a[pivoti2] ^ a[targeti2]) & mask2;
|
||||||
|
a[pivoti2] = a[pivoti2] ^ delta2;
|
||||||
|
a[targeti2] = a[targeti2] ^ delta2;
|
||||||
|
|
||||||
|
// "else" branch
|
||||||
|
pivoti2 += !mask2;
|
||||||
|
radics2[radixval2] += !mask2; // undec index (!)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ends recursion
|
// Possible recursions
|
||||||
if constexpr (j != 0) {
|
if constexpr (j != 0) {
|
||||||
// Recursion
|
/* Partition 1 recursions */
|
||||||
for(int i = 0; i < reali; i += 2) {
|
for(int i = 0; i < reali1; i += 2) {
|
||||||
/* inclusive */
|
/* inclusive */
|
||||||
uint32_t from = real_radics[i];
|
uint32_t from = real_radics1[i];
|
||||||
/* non-inclusive */
|
/* non-inclusive */
|
||||||
uint32_t to = real_radics[i + 1];
|
uint32_t to = real_radics1[i + 1];
|
||||||
|
mormord_sort_impl<j - 1>(&a[from], (to - (from)));
|
||||||
|
}
|
||||||
|
/* Partition 2 recursions */
|
||||||
|
for(int i = 0; i < reali2; i += 2) {
|
||||||
|
/* inclusive */
|
||||||
|
uint32_t from = real_radics2[i];
|
||||||
|
/* non-inclusive */
|
||||||
|
uint32_t to = real_radics2[i + 1];
|
||||||
mormord_sort_impl<j - 1>(&a[from], (to - (from)));
|
mormord_sort_impl<j - 1>(&a[from], (to - (from)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -654,11 +712,12 @@ void measure_single(int n) {
|
|||||||
|
|
||||||
int main(void) {
|
int main(void) {
|
||||||
//int n = 100000000;
|
//int n = 100000000;
|
||||||
int n = 10000000;
|
//int n = 10000000;
|
||||||
//int n = 1000000;
|
//int n = 1000000;
|
||||||
//int n = 100000;
|
//int n = 100000;
|
||||||
//int n = 10000;
|
//int n = 10000;
|
||||||
//int n = 1000;
|
//int n = 1000;
|
||||||
|
int n = 200;
|
||||||
//int n = 100;
|
//int n = 100;
|
||||||
//int n = 10;
|
//int n = 10;
|
||||||
|
|
||||||
@ -686,9 +745,11 @@ int main(void) {
|
|||||||
w.swap(buf);
|
w.swap(buf);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
/*
|
||||||
w = v;
|
w = v;
|
||||||
measure(inputtype, "magyar", [&] { MagyarSort::sort<uint32_t>(&w[0], w.size()); });
|
measure(inputtype, "magyar", [&] { MagyarSort::sort<uint32_t>(&w[0], w.size()); });
|
||||||
assert(w == expected);
|
assert(w == expected);
|
||||||
|
*/
|
||||||
w = v;
|
w = v;
|
||||||
measure(inputtype, "mormord", [&] { mormord_sort(&w[0], w.size()); });
|
measure(inputtype, "mormord", [&] { mormord_sort(&w[0], w.size()); });
|
||||||
assert(w == expected);
|
assert(w == expected);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user