mlocks and frewr algorithm both added

This commit is contained in:
Richard Thier 2021-12-19 21:55:48 +01:00
parent a4d50c3309
commit c77e592a84
2 changed files with 76 additions and 15 deletions

View File

@ -23,6 +23,10 @@
#include <vector>
#include <algorithm> // std::swap
#ifndef NO_MLOCK
#include <sys/mman.h> // mlock & munlock
#endif // !NO_MLOCK
namespace MagyarSort {
/* CONFIG */
@ -111,7 +115,7 @@ namespace MagyarSort {
//#pragma GCC unroll 4
for(; i < size - 64; i += 64) {
// Prefetch for read level-1 cache
//__builtin_prefetch(&arr[i + (1 * 16)], 0/*r*/, 2/*L2 or L3 cache likely*/);
//__builtin_prefetch(&arr[i + (1 * 16)], 0, 2); // r, L2 or L3 cache
__builtin_prefetch(&arr[i + (1 * 16)]);
// Creates no object, struct is empty
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i, radicsOut);
@ -290,16 +294,16 @@ namespace MagyarSort {
*
* Rem.: If you use the VectorGiverWithReuse please remind yourself to Gc() it time-to-time!
*
* Beware: GC needs to happen on all threads that use us!
* Beware: GC needs to happen on all threads that use us if you want to GC!
*
* @param arr The array to sort. Result will be in the same array - as sorted.
* @param size The length of the array - should fit in the COUNTER_TYP.
* @param COUNTER_TYP OPTIONAL: When set this type will be the counter type.
* @param COUNTER_TYP OPTIONAL: When set this type will be the counter type. For most cases uint32_t is enough.
* @param REUSE OPTIONAL: When true, we reuse the array instead of always gettin' and releasin' from da heap.
* @param GC OPTIONAL: When true, we garbage collect memory from previous sorts if REUSE is true.
* @param GC_WITHOUT_SORT OPTIONAL: When true, we "just GC" but do not sort in case of GC is true.
*/
template<typename COUNTER_TYP = size_t, bool REUSE = false, bool GC = false, bool GC_WITHOUT_SORT = false>
template<typename COUNTER_TYP = uint32_t, bool REUSE = false, bool GC = false, bool GC_WITHOUT_SORT = false>
inline void __attribute__((always_inline)) sort_impl(uint32_t arr[], COUNTER_TYP size) noexcept {
// Most funny optimization is this multiply here :-)
//
@ -308,6 +312,10 @@ namespace MagyarSort {
// optimize the first call for sort when we REUSE the array so size is fine!
static thread_local std::vector<uint32_t> arc(size * REUSE);
#ifndef NO_MLOCK
mlock(arr, size * sizeof(uint32_t));
#endif // !NO_MLOCK
// "Garbage-collection"
if(GC) {
arc = std::vector<uint32_t>();
@ -321,6 +329,9 @@ namespace MagyarSort {
// Holds "digit" occurences, prefix sums, whatevers
// First "DIGIT_RANGE" elem is for MSB "DIGITS", last is for LSB
static thread_local COUNTER_TYP radics[DIGITS * DIGIT_RANGE];
#ifndef NO_MLOCK
mlock(radics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP));
#endif // !NO_MLOCK
// Write prefetchin'
//__builtin_prefetch(&radicsOut[..], 1);
PrefetchMagic<DIGITS * DIGIT_RANGE, (64/sizeof(COUNTER_TYP)), COUNTER_TYP, 1/*w*/> pm(radics);
@ -346,9 +357,6 @@ namespace MagyarSort {
// Regular radix sort needs a copy, see: https://www.youtube.com/watch?v=ujb2CIWE8zY
// But instead of the below, we do a trickery...
//
//std::vector<uint32_t> arc(size);
//auto arc = VectorGiver::Give(size); // "auto" is needed for this to perform well with some givers!
//
// Rem.: The branch is optimized out in compile time!
if(REUSE) {
arc.resize(size);
@ -357,6 +365,9 @@ namespace MagyarSort {
// We must regain memory of previous!
arc = std::move(std::vector<uint32_t>(size));
}
#ifndef NO_MLOCK
mlock(&arc[0], size * sizeof(uint32_t));
#endif // !NO_MLOCK
uint32_t *from = arr;
uint32_t *to = &arc[0];
@ -371,6 +382,11 @@ namespace MagyarSort {
if(swapped) { // <- in reality this is what we want because of last swap happened anyways!
memcpy(arr, to, size);
}
#ifndef NO_MLOCK
munlock(radics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP));
munlock(&arc[0], size * sizeof(uint32_t));
munlock(arr, size * sizeof(uint32_t));
#endif // !NO_MLOCK
}
/**
@ -425,10 +441,10 @@ namespace MagyarSort {
*
* @param arr The array to sort. Result will be in the same array - as sorted.
* @param size The length of the array.
* @param COUNTER_TYP OPTIONAL: It is best kepts as size_t, but for smaller arrays, you can use uint32_t.
* @param COUNTER_TYP: Should be size_t for HUGE arrays, but for regular arrays you can use uint32_t. Should be deduced automatically from the type of size.
*/
template<typename COUNTER_TYP = size_t>
inline void __attribute__((always_inline)) sort_no_reuse(uint32_t arr[], size_t size) noexcept {
template<typename COUNTER_TYP>
inline void __attribute__((always_inline)) sort_no_reuse(uint32_t arr[], COUNTER_TYP size) noexcept {
// We use the heap once per every call...
// This is safer and we do not need garbage collecting
MagyarSort::sort_impl<COUNTER_TYP>(arr, size);
@ -446,10 +462,10 @@ namespace MagyarSort {
*
* @param arr The array to sort. Result will be in the same array - as sorted.
* @param size The length of the array.
* @param COUNTER_TYP OPTIONAL: It is best kepts as size_t, but for smaller arrays, you can use uint32_t.
* @param COUNTER_TYP: Should be size_t for HUGE arrays, but for regular arrays you can use uint32_t. Should be deduced automatically from the type of size.
*/
template<typename COUNTER_TYP = size_t>
inline void sort(uint32_t arr[], size_t size) noexcept {
template<typename COUNTER_TYP>
inline void sort(uint32_t arr[], COUNTER_TYP size) noexcept {
#ifdef MAGYAR_SORT_DEFAULT_REUSE
MagyarSort::sort_reuse<COUNTER_TYP>(arr, size);
#else

View File

@ -11,6 +11,7 @@
#include <string>
#include <vector>
#include <numeric>
#include <sys/mman.h> // mlock & munlock
#include "ska_sort.hpp"
@ -260,6 +261,47 @@
free(buf);
}
// frewr - four rewrites.
//
// Sorts `arr[0..n)` ascending, in place, with a least-significant-digit radix
// sort that uses 8-bit digits (four scatter passes over the data).
//
// Instead of shifting by a different amount in every pass, each pass writes
// the key rotated right by 8 bits, so the next digit to sort on is always the
// low byte. After four passes every key has been rotated by 32 bits (i.e. it
// is back to its original value), and because the number of passes is even
// the final result lands in `arr` rather than in the scratch buffer.
//
// @param arr  Keys to sort; holds the sorted keys on return.
// @param n    Number of keys. Values <= 1 (including negative ones) leave
//             `arr` untouched.
//
// Needs a scratch buffer of `n` keys from the heap. If that allocation fails
// the function prints a message to stderr and aborts, because it has no way
// to report the error to the caller.
void frewr(uint32_t *arr, int n) {
	// Zero or one key is already sorted. Bailing out here also keeps a
	// negative n from being converted into a gigantic allocation size.
	if (n <= 1) {
		return;
	}

	// Do the size arithmetic in size_t: `n * 4` evaluated in int overflows
	// (undefined behaviour) once n reaches 2^29.
	const size_t bytes = static_cast<size_t>(n) * sizeof(uint32_t);
	uint32_t *tmpbuf = static_cast<uint32_t *>(malloc(bytes));
	if (tmpbuf == nullptr) {
		fprintf(stderr, "frewr: cannot allocate %zu bytes of scratch space\n", bytes);
		abort();
	}
	// Best effort only: the result of mlock is deliberately ignored, the
	// sort works the same whether or not the pages could be locked.
	(void)mlock(tmpbuf, bytes);

	// btoffsets[p][d]: first the number of keys whose digit for pass p is d,
	// later the index of the LAST free slot of that digit's bucket.
	// Row 3 belongs to the least significant byte, row 0 to the most
	// significant one.
	int btoffsets[4][256] = {};
#pragma GCC unroll 64
	for (int i = n - 1; i >= 0; i--) {
		uint32_t a = arr[i];
		btoffsets[3][a & 0xff]++;
		btoffsets[2][a >> 8 & 0xff]++;
		btoffsets[1][a >> 16 & 0xff]++;
		btoffsets[0][a >> 24 & 0xff]++;
	}

	// Turn the counts into bucket end positions, walking the digits from
	// 255 down to 0 so that larger digits get the higher index ranges.
	int btend[4] = {n - 1, n - 1, n - 1, n - 1};
#pragma GCC unroll 16
	for (int i = 255; i >= 0; i--) {
#pragma GCC unroll 4
		for (int pass = 3; pass >= 0; pass--) {
			int nbtend = btend[pass] - btoffsets[pass][i];
			btoffsets[pass][i] = btend[pass];
			btend[pass] = nbtend;
		}
	}

	uint32_t *src = arr, *dst = tmpbuf;
#pragma GCC unroll 4
	for (int pass = 3; pass >= 0; pass--) {
		int *off = btoffsets[pass];
		// Read back-to-front and fill every bucket from its end downwards:
		// keys with equal digits keep their relative order (stable pass).
#pragma GCC unroll 64
		for (int i = n - 1; i >= 0; i--) {
			uint32_t v = src[i];
			// Store the key rotated right by one byte so that the next pass
			// finds its digit in the low byte again.
			dst[off[v & 0xff]--] = v >> 8 | v << 24;
			// Hint for upcoming writes into the same bucket. The address is
			// only computed, never dereferenced here.
			// NOTE(review): it can lie just before the start of dst.
			__builtin_prefetch(&dst[off[v & 0xff] - 2]);
		}
		uint32_t *tmp = src;
		src = dst;
		dst = tmp;
	}

	(void)munlock(tmpbuf, bytes);
	free(tmpbuf);
}
void vsort(uint32_t *a, int n) {
thread_local std::vector<uint32_t> bts[256];
#pragma GCC unroll 4
@ -349,8 +391,8 @@
}
int main(void) {
//int n = 100000000;
int n = 10000000;
int n = 100000000;
//int n = 10000000;
for (auto inputtype : inputtypes) {
printf("%10s", inputtype.c_str());
fflush(stdout);
@ -390,6 +432,9 @@
w = v;
measure(inputtype, "4rot", [&] { fourrots(&w[0], w.size()); });
assert(w == expected);
w = v;
measure(inputtype, "frewr", [&] { frewr(&w[0], w.size()); });
assert(w == expected);
/*
w = v;
measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); });