Merge branch 'tmp' into ilp-radix-1
This commit is contained in:
commit
d858f39708
42
magyarsort.h
42
magyarsort.h
@ -23,6 +23,10 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <algorithm> // std::swap
|
#include <algorithm> // std::swap
|
||||||
|
|
||||||
|
#ifndef NO_MLOCK
|
||||||
|
#include <sys/mman.h> // mlock & munlock
|
||||||
|
#endif // !NO_MLOCK
|
||||||
|
|
||||||
namespace MagyarSort {
|
namespace MagyarSort {
|
||||||
/* CONFIG */
|
/* CONFIG */
|
||||||
|
|
||||||
@ -111,7 +115,7 @@ namespace MagyarSort {
|
|||||||
//#pragma GCC unroll 4
|
//#pragma GCC unroll 4
|
||||||
for(; i < size - 64; i += 64) {
|
for(; i < size - 64; i += 64) {
|
||||||
// Prefetch for read level-1 cache
|
// Prefetch for read level-1 cache
|
||||||
//__builtin_prefetch(&arr[i + (1 * 16)], 0/*r*/, 2/*L2 or L3 cache likely*/);
|
//__builtin_prefetch(&arr[i + (1 * 16)], 0, 2); // r, L2 or L3 cache
|
||||||
__builtin_prefetch(&arr[i + (1 * 16)]);
|
__builtin_prefetch(&arr[i + (1 * 16)]);
|
||||||
// Creates no object, struct is empty
|
// Creates no object, struct is empty
|
||||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i, radicsOut);
|
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i, radicsOut);
|
||||||
@ -290,16 +294,16 @@ namespace MagyarSort {
|
|||||||
*
|
*
|
||||||
* Rem.: If you use the VectorGiverWithReuse please remind yourself to Gc() it time-to-time!
|
* Rem.: If you use the VectorGiverWithReuse please remind yourself to Gc() it time-to-time!
|
||||||
*
|
*
|
||||||
* Beware: GC needs to happen on all threads that use us!
|
* Beware: GC needs to happen on all threads that use us if you want to GC!
|
||||||
*
|
*
|
||||||
* @param arr The array to sort. Result will be in the same array - as sorted.
|
* @param arr The array to sort. Result will be in the same array - as sorted.
|
||||||
* @param size The length of the array - should fit in the COUNTER_TYP.
|
* @param size The length of the array - should fit in the COUNTER_TYP.
|
||||||
* @param COUNTER_TYP OPTIONAL: When set this type will be the counter type.
|
* @param COUNTER_TYP OPTIONAL: When set this type will be the counter type. For most cases uint32_t is enough.
|
||||||
* @param REUSE OPTIONAL: When true, we reuse the array instead of always gettin' and releasin' from da heap.
|
* @param REUSE OPTIONAL: When true, we reuse the array instead of always gettin' and releasin' from da heap.
|
||||||
* @param GC OPTIONAL: When true, we garbage collect memory from previous sorts if REUSE is true.
|
* @param GC OPTIONAL: When true, we garbage collect memory from previous sorts if REUSE is true.
|
||||||
* @param GC_WITHOUT_SORT OPTIONAL: When true, we "just GC" but do not sort in case of GC is true.
|
* @param GC_WITHOUT_SORT OPTIONAL: When true, we "just GC" but do not sort in case of GC is true.
|
||||||
*/
|
*/
|
||||||
template<typename COUNTER_TYP = size_t, bool REUSE = false, bool GC = false, bool GC_WITHOUT_SORT = false>
|
template<typename COUNTER_TYP = uint32_t, bool REUSE = false, bool GC = false, bool GC_WITHOUT_SORT = false>
|
||||||
inline void __attribute__((always_inline)) sort_impl(uint32_t arr[], COUNTER_TYP size) noexcept {
|
inline void __attribute__((always_inline)) sort_impl(uint32_t arr[], COUNTER_TYP size) noexcept {
|
||||||
// Most funny optimization is this multiply here :-)
|
// Most funny optimization is this multiply here :-)
|
||||||
//
|
//
|
||||||
@ -308,6 +312,10 @@ namespace MagyarSort {
|
|||||||
// optimize the first call for sort when we REUSE the array so size is fine!
|
// optimize the first call for sort when we REUSE the array so size is fine!
|
||||||
static thread_local std::vector<uint32_t> arc(size * REUSE);
|
static thread_local std::vector<uint32_t> arc(size * REUSE);
|
||||||
|
|
||||||
|
#ifndef NO_MLOCK
|
||||||
|
mlock(arr, size * sizeof(uint32_t));
|
||||||
|
#endif // !NO_MLOCK
|
||||||
|
|
||||||
// "Garbage-collection"
|
// "Garbage-collection"
|
||||||
if(GC) {
|
if(GC) {
|
||||||
arc = std::vector<uint32_t>();
|
arc = std::vector<uint32_t>();
|
||||||
@ -321,6 +329,9 @@ namespace MagyarSort {
|
|||||||
// Holds "digit" occurences, prefix sums, whatevers
|
// Holds "digit" occurences, prefix sums, whatevers
|
||||||
// First "DIGIT_RANGE" elem is for MSB "DIGITS", last is for LSB
|
// First "DIGIT_RANGE" elem is for MSB "DIGITS", last is for LSB
|
||||||
static thread_local COUNTER_TYP radics[DIGITS * DIGIT_RANGE];
|
static thread_local COUNTER_TYP radics[DIGITS * DIGIT_RANGE];
|
||||||
|
#ifndef NO_MLOCK
|
||||||
|
mlock(radics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP));
|
||||||
|
#endif // !NO_MLOCK
|
||||||
// Write prefetchin'
|
// Write prefetchin'
|
||||||
//__builtin_prefetch(&radicsOut[..], 1);
|
//__builtin_prefetch(&radicsOut[..], 1);
|
||||||
PrefetchMagic<DIGITS * DIGIT_RANGE, (64/sizeof(COUNTER_TYP)), COUNTER_TYP, 1/*w*/> pm(radics);
|
PrefetchMagic<DIGITS * DIGIT_RANGE, (64/sizeof(COUNTER_TYP)), COUNTER_TYP, 1/*w*/> pm(radics);
|
||||||
@ -346,9 +357,6 @@ namespace MagyarSort {
|
|||||||
// Regular radix sort needs a copy, see: https://www.youtube.com/watch?v=ujb2CIWE8zY
|
// Regular radix sort needs a copy, see: https://www.youtube.com/watch?v=ujb2CIWE8zY
|
||||||
// But instead of the below, we do a trickery...
|
// But instead of the below, we do a trickery...
|
||||||
//
|
//
|
||||||
//std::vector<uint32_t> arc(size);
|
|
||||||
//auto arc = VectorGiver::Give(size); // "auto" is needed for this to perform well with some givers!
|
|
||||||
//
|
|
||||||
// Rem.: The branch is optimized out in compile time!
|
// Rem.: The branch is optimized out in compile time!
|
||||||
if(REUSE) {
|
if(REUSE) {
|
||||||
arc.resize(size);
|
arc.resize(size);
|
||||||
@ -357,6 +365,9 @@ namespace MagyarSort {
|
|||||||
// We must regain memory of previous!
|
// We must regain memory of previous!
|
||||||
arc = std::move(std::vector<uint32_t>(size));
|
arc = std::move(std::vector<uint32_t>(size));
|
||||||
}
|
}
|
||||||
|
#ifndef NO_MLOCK
|
||||||
|
mlock(&arc[0], size * sizeof(uint32_t));
|
||||||
|
#endif // !NO_MLOCK
|
||||||
|
|
||||||
uint32_t *from = arr;
|
uint32_t *from = arr;
|
||||||
uint32_t *to = &arc[0];
|
uint32_t *to = &arc[0];
|
||||||
@ -371,6 +382,11 @@ namespace MagyarSort {
|
|||||||
if(swapped) { // <- in reality this is what we want because of last swap happened anyways!
|
if(swapped) { // <- in reality this is what we want because of last swap happened anyways!
|
||||||
memcpy(arr, to, size);
|
memcpy(arr, to, size);
|
||||||
}
|
}
|
||||||
|
#ifndef NO_MLOCK
|
||||||
|
munlock(radics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP));
|
||||||
|
munlock(&arc[0], size * sizeof(uint32_t));
|
||||||
|
munlock(arr, size * sizeof(uint32_t));
|
||||||
|
#endif // !NO_MLOCK
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -425,10 +441,10 @@ namespace MagyarSort {
|
|||||||
*
|
*
|
||||||
* @param arr The array to sort. Result will be in the same array - as sorted.
|
* @param arr The array to sort. Result will be in the same array - as sorted.
|
||||||
* @param size The length of the array.
|
* @param size The length of the array.
|
||||||
* @param COUNTER_TYP OPTIONAL: It is best kepts as size_t, but for smaller arrays, you can use uint32_t.
|
* @param COUNTER_TYP: Use size_t for HUGE arrays; for regular arrays uint32_t is enough. It should get deduced automatically (from the type of size).
|
||||||
*/
|
*/
|
||||||
template<typename COUNTER_TYP = size_t>
|
template<typename COUNTER_TYP>
|
||||||
inline void __attribute__((always_inline)) sort_no_reuse(uint32_t arr[], size_t size) noexcept {
|
inline void __attribute__((always_inline)) sort_no_reuse(uint32_t arr[], COUNTER_TYP size) noexcept {
|
||||||
// We use the heap once per every call...
|
// We use the heap once per every call...
|
||||||
// This is safer and we do not need garbage collecting
|
// This is safer and we do not need garbage collecting
|
||||||
MagyarSort::sort_impl<COUNTER_TYP>(arr, size);
|
MagyarSort::sort_impl<COUNTER_TYP>(arr, size);
|
||||||
@ -446,10 +462,10 @@ namespace MagyarSort {
|
|||||||
*
|
*
|
||||||
* @param arr The array to sort. Result will be in the same array - as sorted.
|
* @param arr The array to sort. Result will be in the same array - as sorted.
|
||||||
* @param size The length of the array.
|
* @param size The length of the array.
|
||||||
* @param COUNTER_TYP OPTIONAL: It is best kepts as size_t, but for smaller arrays, you can use uint32_t.
|
* @param COUNTER_TYP: Use size_t for HUGE arrays; for regular arrays uint32_t is enough. It should get deduced automatically (from the type of size).
|
||||||
*/
|
*/
|
||||||
template<typename COUNTER_TYP = size_t>
|
template<typename COUNTER_TYP>
|
||||||
inline void sort(uint32_t arr[], size_t size) noexcept {
|
inline void sort(uint32_t arr[], COUNTER_TYP size) noexcept {
|
||||||
#ifdef MAGYAR_SORT_DEFAULT_REUSE
|
#ifdef MAGYAR_SORT_DEFAULT_REUSE
|
||||||
MagyarSort::sort_reuse<COUNTER_TYP>(arr, size);
|
MagyarSort::sort_reuse<COUNTER_TYP>(arr, size);
|
||||||
#else
|
#else
|
||||||
|
|||||||
49
ypsu.cpp
49
ypsu.cpp
@ -11,6 +11,7 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <numeric>
|
#include <numeric>
|
||||||
|
#include <sys/mman.h> // mlock & munlock
|
||||||
#include "ska_sort.hpp"
|
#include "ska_sort.hpp"
|
||||||
|
|
||||||
|
|
||||||
@ -260,6 +261,47 @@
|
|||||||
free(buf);
|
free(buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// frewr - four rewrites.
//
// Sorts arr[0..n) ascending, in place, with a byte-wise LSD radix sort that
// rewrites the whole array four times (once per byte of a uint32_t).
//
// How it works:
//  1. One backwards sweep over the input builds all four 256-bucket
//     histograms at once (btoffsets[3] = lowest byte ... btoffsets[0] =
//     highest byte).
//  2. The histograms are converted in place into "last slot of the bucket"
//     indexes, so elements can be scattered back-to-front.
//  3. Four scatter passes ping-pong between arr and a temporary buffer. Every
//     element is stored rotated right by 8 bits, so the byte needed by the
//     next pass always sits in the low 8 bits; after four passes the rotation
//     adds up to 32 bits and each value is back to its original form.
//     Because the number of passes is even, the final result lands in arr.
//
// @param arr The array to sort; receives the sorted result.
// @param n   Number of elements in arr. Values <= 0 make the call a no-op.
//
// The temporary buffer is mlock()-ed on a best-effort basis (the return value
// is deliberately ignored). If the buffer cannot be allocated the process is
// aborted with a diagnostic instead of dereferencing a null pointer.
void frewr(uint32_t *arr, int n) {
  // Nothing to sort; also keeps malloc(0) (which may legitimately return
  // NULL) away from the allocation-failure check below.
  if (n <= 0) {
    return;
  }

  // Compute the byte count in size_t: "n * 4" in int overflows for n > 2^29.
  const size_t bytes = static_cast<size_t>(n) * sizeof(uint32_t);
  uint32_t *tmpbuf = static_cast<uint32_t *>(malloc(bytes));
  if (tmpbuf == nullptr) {
    perror("frewr: malloc");
    abort();
  }
  mlock(tmpbuf, bytes);  // best effort, failure is not an error here

  // Step 1: per-byte occurrence counts, all four digits in a single sweep.
  int btoffsets[4][256] = {};
#pragma GCC unroll 64
  for (int i = n - 1; i >= 0; i--) {
    uint32_t a = arr[i];
    btoffsets[3][a & 0xff]++;
    btoffsets[2][a >> 8 & 0xff]++;
    btoffsets[1][a >> 16 & 0xff]++;
    btoffsets[0][a >> 24 & 0xff]++;
  }

  // Step 2: turn counts into the index of the LAST slot of each bucket,
  // walking the buckets from the highest value down.
  int btend[4] = {n - 1, n - 1, n - 1, n - 1};
#pragma GCC unroll 16
  for (int i = 255; i >= 0; i--) {
#pragma GCC unroll 4
    for (int pass = 3; pass >= 0; pass--) {
      int nbtend = btend[pass] - btoffsets[pass][i];
      btoffsets[pass][i] = btend[pass];
      btend[pass] = nbtend;
    }
  }

  // Step 3: four stable scatter passes, lowest byte first. Reading the source
  // backwards while filling each bucket backwards keeps the sort stable.
  uint32_t *src = arr, *dst = tmpbuf;
#pragma GCC unroll 4
  for (int pass = 3; pass >= 0; pass--) {
    int *off = btoffsets[pass];
#pragma GCC unroll 64
    for (int i = n - 1; i >= 0; i--) {
      uint32_t v = src[i];
      // Store rotated right by one byte so the next pass finds its digit in
      // the low 8 bits.
      dst[off[v & 0xff]--] = v >> 8 | v << 24;
      // Hint the cache about where this bucket will be written soon.
      __builtin_prefetch(&dst[off[v & 0xff] - 2]);
    }
    uint32_t *tmp = src;
    src = dst;
    dst = tmp;
  }

  munlock(tmpbuf, bytes);
  free(tmpbuf);
}
|
||||||
|
|
||||||
void vsort(uint32_t *a, int n) {
|
void vsort(uint32_t *a, int n) {
|
||||||
thread_local std::vector<uint32_t> bts[256];
|
thread_local std::vector<uint32_t> bts[256];
|
||||||
#pragma GCC unroll 4
|
#pragma GCC unroll 4
|
||||||
@ -349,8 +391,8 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
int main(void) {
|
int main(void) {
|
||||||
//int n = 100000000;
|
int n = 100000000;
|
||||||
int n = 40000000;
|
//int n = 10000000;
|
||||||
for (auto inputtype : inputtypes) {
|
for (auto inputtype : inputtypes) {
|
||||||
printf("%10s", inputtype.c_str());
|
printf("%10s", inputtype.c_str());
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
@ -390,6 +432,9 @@
|
|||||||
w = v;
|
w = v;
|
||||||
measure(inputtype, "4rot", [&] { fourrots(&w[0], w.size()); });
|
measure(inputtype, "4rot", [&] { fourrots(&w[0], w.size()); });
|
||||||
assert(w == expected);
|
assert(w == expected);
|
||||||
|
w = v;
|
||||||
|
measure(inputtype, "frewr", [&] { frewr(&w[0], w.size()); });
|
||||||
|
assert(w == expected);
|
||||||
/*
|
/*
|
||||||
w = v;
|
w = v;
|
||||||
measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); });
|
measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); });
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user