Merge branch 'tmp' into ilp-radix-1
This commit is contained in:
commit
d858f39708
42
magyarsort.h
42
magyarsort.h
@ -23,6 +23,10 @@
|
||||
#include <vector>
|
||||
#include <algorithm> // std::swap
|
||||
|
||||
#ifndef NO_MLOCK
|
||||
#include <sys/mman.h> // mlock & munlock
|
||||
#endif // !NO_MLOCK
|
||||
|
||||
namespace MagyarSort {
|
||||
/* CONFIG */
|
||||
|
||||
@ -111,7 +115,7 @@ namespace MagyarSort {
|
||||
//#pragma GCC unroll 4
|
||||
for(; i < size - 64; i += 64) {
|
||||
// Prefetch for read level-1 cache
|
||||
//__builtin_prefetch(&arr[i + (1 * 16)], 0/*r*/, 2/*L2 or L3 cache likely*/);
|
||||
//__builtin_prefetch(&arr[i + (1 * 16)], 0, 2); // r, L2 or L3 cache
|
||||
__builtin_prefetch(&arr[i + (1 * 16)]);
|
||||
// Creates no object, struct is empty
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i, radicsOut);
|
||||
@ -290,16 +294,16 @@ namespace MagyarSort {
|
||||
*
|
||||
* Rem.: If you use the VectorGiverWithReuse, please remember to Gc() it from time to time!
|
||||
*
|
||||
* Beware: GC needs to happen on all threads that use us!
|
||||
* Beware: GC needs to happen on all threads that use us if you want to GC!
|
||||
*
|
||||
* @param arr The array to sort. Result will be in the same array - as sorted.
|
||||
* @param size The length of the array - should fit in the COUNTER_TYP.
|
||||
* @param COUNTER_TYP OPTIONAL: When set this type will be the counter type.
|
||||
* @param COUNTER_TYP OPTIONAL: When set this type will be the counter type. For most cases uint32_t is enough.
|
||||
* @param REUSE OPTIONAL: When true, we reuse the array instead of always gettin' and releasin' from da heap.
|
||||
* @param GC OPTIONAL: When true, we garbage collect memory from previous sorts if REUSE is true.
|
||||
* @param GC_WITHOUT_SORT OPTIONAL: When true, we "just GC" but do not sort in case of GC is true.
|
||||
*/
|
||||
template<typename COUNTER_TYP = size_t, bool REUSE = false, bool GC = false, bool GC_WITHOUT_SORT = false>
|
||||
template<typename COUNTER_TYP = uint32_t, bool REUSE = false, bool GC = false, bool GC_WITHOUT_SORT = false>
|
||||
inline void __attribute__((always_inline)) sort_impl(uint32_t arr[], COUNTER_TYP size) noexcept {
|
||||
// Most funny optimization is this multiply here :-)
|
||||
//
|
||||
@ -308,6 +312,10 @@ namespace MagyarSort {
|
||||
// optimize the first call for sort when we REUSE the array so size is fine!
|
||||
static thread_local std::vector<uint32_t> arc(size * REUSE);
|
||||
|
||||
#ifndef NO_MLOCK
|
||||
mlock(arr, size * sizeof(uint32_t));
|
||||
#endif // !NO_MLOCK
|
||||
|
||||
// "Garbage-collection"
|
||||
if(GC) {
|
||||
arc = std::vector<uint32_t>();
|
||||
@ -321,6 +329,9 @@ namespace MagyarSort {
|
||||
// Holds "digit" occurrences, prefix sums, whatevers
|
||||
// First "DIGIT_RANGE" elem is for MSB "DIGITS", last is for LSB
|
||||
static thread_local COUNTER_TYP radics[DIGITS * DIGIT_RANGE];
|
||||
#ifndef NO_MLOCK
|
||||
mlock(radics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP));
|
||||
#endif // !NO_MLOCK
|
||||
// Write prefetchin'
|
||||
//__builtin_prefetch(&radicsOut[..], 1);
|
||||
PrefetchMagic<DIGITS * DIGIT_RANGE, (64/sizeof(COUNTER_TYP)), COUNTER_TYP, 1/*w*/> pm(radics);
|
||||
@ -346,9 +357,6 @@ namespace MagyarSort {
|
||||
// Regular radix sort needs a copy, see: https://www.youtube.com/watch?v=ujb2CIWE8zY
|
||||
// But instead of the below, we do a trickery...
|
||||
//
|
||||
//std::vector<uint32_t> arc(size);
|
||||
//auto arc = VectorGiver::Give(size); // "auto" is needed for this to perform well with some givers!
|
||||
//
|
||||
// Rem.: The branch is optimized out in compile time!
|
||||
if(REUSE) {
|
||||
arc.resize(size);
|
||||
@ -357,6 +365,9 @@ namespace MagyarSort {
|
||||
// We must regain memory of previous!
|
||||
arc = std::move(std::vector<uint32_t>(size));
|
||||
}
|
||||
#ifndef NO_MLOCK
|
||||
mlock(&arc[0], size * sizeof(uint32_t));
|
||||
#endif // !NO_MLOCK
|
||||
|
||||
uint32_t *from = arr;
|
||||
uint32_t *to = &arc[0];
|
||||
@ -371,6 +382,11 @@ namespace MagyarSort {
|
||||
if(swapped) { // <- in reality this is what we want because of last swap happened anyways!
|
||||
memcpy(arr, to, size);
|
||||
}
|
||||
#ifndef NO_MLOCK
|
||||
munlock(radics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP));
|
||||
munlock(&arc[0], size * sizeof(uint32_t));
|
||||
munlock(arr, size * sizeof(uint32_t));
|
||||
#endif // !NO_MLOCK
|
||||
}
|
||||
|
||||
/**
|
||||
@ -425,10 +441,10 @@ namespace MagyarSort {
|
||||
*
|
||||
* @param arr The array to sort. Result will be in the same array - as sorted.
|
||||
* @param size The length of the array.
|
||||
* @param COUNTER_TYP OPTIONAL: It is best kepts as size_t, but for smaller arrays, you can use uint32_t.
|
||||
* @param COUNTER_TYP: Should be size_t for HUGE arrays; for regular arrays you can use uint32_t. Deduced from the type of `size` when not given explicitly.
|
||||
*/
|
||||
template<typename COUNTER_TYP = size_t>
|
||||
inline void __attribute__((always_inline)) sort_no_reuse(uint32_t arr[], size_t size) noexcept {
|
||||
template<typename COUNTER_TYP>
|
||||
inline void __attribute__((always_inline)) sort_no_reuse(uint32_t arr[], COUNTER_TYP size) noexcept {
|
||||
// We use the heap once per every call...
|
||||
// This is safer and we do not need garbage collecting
|
||||
MagyarSort::sort_impl<COUNTER_TYP>(arr, size);
|
||||
@ -446,10 +462,10 @@ namespace MagyarSort {
|
||||
*
|
||||
* @param arr The array to sort. Result will be in the same array - as sorted.
|
||||
* @param size The length of the array.
|
||||
* @param COUNTER_TYP OPTIONAL: It is best kepts as size_t, but for smaller arrays, you can use uint32_t.
|
||||
* @param COUNTER_TYP: Should be size_t for HUGE arrays; for regular arrays you can use uint32_t. Deduced from the type of `size` when not given explicitly.
|
||||
*/
|
||||
template<typename COUNTER_TYP = size_t>
|
||||
inline void sort(uint32_t arr[], size_t size) noexcept {
|
||||
template<typename COUNTER_TYP>
|
||||
inline void sort(uint32_t arr[], COUNTER_TYP size) noexcept {
|
||||
#ifdef MAGYAR_SORT_DEFAULT_REUSE
|
||||
MagyarSort::sort_reuse<COUNTER_TYP>(arr, size);
|
||||
#else
|
||||
|
||||
49
ypsu.cpp
49
ypsu.cpp
@ -11,6 +11,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <numeric>
|
||||
#include <sys/mman.h> // mlock & munlock
|
||||
#include "ska_sort.hpp"
|
||||
|
||||
|
||||
@ -260,6 +261,47 @@
|
||||
free(buf);
|
||||
}
|
||||
|
||||
// frewr - "four rewrites": least-significant-byte-first radix sort for
// 32-bit keys.
//
// Sorts arr[0..n) into ascending order. The data is rewritten four times
// (once per key byte), alternating between arr and a malloc'ed scratch
// buffer of n elements; after the fourth rewrite the result is back in arr.
//
// arr: keys to sort; overwritten with the sorted keys.
// n:   number of keys; values below 2 leave arr untouched.
//
// Aborts the process (after perror) when the scratch buffer cannot be
// allocated.
void frewr(uint32_t *arr, int n) {
  // Zero or one key is already sorted. Returning here also keeps a negative
  // n from being converted into a huge allocation size below.
  if (n < 2) return;

  // Compute the byte count in size_t: `n * 4` evaluated in int overflows
  // once n reaches 2^29.
  const size_t nbytes = static_cast<size_t>(n) * sizeof(uint32_t);
  uint32_t *tmpbuf = static_cast<uint32_t *>(malloc(nbytes));
  if (tmpbuf == nullptr) {
    // Without the scratch buffer the scatter passes would write through a
    // null pointer; fail loudly instead.
    perror("frewr: malloc");
    abort();
  }
  // Best effort: the return value is deliberately ignored, the sort works
  // the same whether or not the pages could be locked.
  (void)mlock(tmpbuf, nbytes);

  // One 256-entry histogram per key byte. Row 3 counts the lowest byte,
  // row 0 the highest, so the rows are consumed in pass order 3, 2, 1, 0.
  int btoffsets[4][256] = {};
  #pragma GCC unroll 64
  for (int i = n - 1; i >= 0; i--) {
    const uint32_t a = arr[i];
    btoffsets[3][a & 0xff]++;
    btoffsets[2][a >> 8 & 0xff]++;
    btoffsets[1][a >> 16 & 0xff]++;
    btoffsets[0][a >> 24 & 0xff]++;
  }

  // Convert each count into the index of the LAST slot of its bucket,
  // walking from bucket 255 (which ends at n - 1) down to bucket 0.
  int btend[4] = {n - 1, n - 1, n - 1, n - 1};
  #pragma GCC unroll 16
  for (int i = 255; i >= 0; i--) {
    #pragma GCC unroll 4
    for (int pass = 3; pass >= 0; pass--) {
      const int nbtend = btend[pass] - btoffsets[pass][i];
      btoffsets[pass][i] = btend[pass];
      btend[pass] = nbtend;
    }
  }

  uint32_t *src = arr, *dst = tmpbuf;
  #pragma GCC unroll 4
  for (int pass = 3; pass >= 0; pass--) {
    int *off = btoffsets[pass];
    // Walk backwards and fill every bucket from its end, which keeps keys
    // with equal digits in their previous relative order.
    #pragma GCC unroll 64
    for (int i = n - 1; i >= 0; i--) {
      const uint32_t v = src[i];
      const uint32_t digit = v & 0xff;
      // Store the key rotated right by 8 bits so the next pass again finds
      // its digit in the low byte; four rotations restore the original key.
      dst[off[digit]--] = v >> 8 | v << 24;
      // Hint only: the hinted slot may lie outside the bucket or the buffer.
      __builtin_prefetch(&dst[off[digit] - 2]);
    }
    uint32_t *tmp = src;
    src = dst;
    dst = tmp;
  }

  munlock(tmpbuf, nbytes);
  free(tmpbuf);
}
|
||||
|
||||
void vsort(uint32_t *a, int n) {
|
||||
thread_local std::vector<uint32_t> bts[256];
|
||||
#pragma GCC unroll 4
|
||||
@ -349,8 +391,8 @@
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
//int n = 100000000;
|
||||
int n = 40000000;
|
||||
int n = 100000000;
|
||||
//int n = 10000000;
|
||||
for (auto inputtype : inputtypes) {
|
||||
printf("%10s", inputtype.c_str());
|
||||
fflush(stdout);
|
||||
@ -390,6 +432,9 @@
|
||||
w = v;
|
||||
measure(inputtype, "4rot", [&] { fourrots(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
w = v;
|
||||
measure(inputtype, "frewr", [&] { frewr(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
/*
|
||||
w = v;
|
||||
measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); });
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user