diff --git a/magyarsort.h b/magyarsort.h index 8394fe9..0967990 100644 --- a/magyarsort.h +++ b/magyarsort.h @@ -23,6 +23,10 @@ #include #include // std::swap +#ifndef NO_MLOCK +#include // mlock & munlock +#endif // !NO_MLOCK + namespace MagyarSort { /* CONFIG */ @@ -111,7 +115,7 @@ namespace MagyarSort { //#pragma GCC unroll 4 for(; i < size - 64; i += 64) { // Prefetch for read level-1 cache - //__builtin_prefetch(&arr[i + (1 * 16)], 0/*r*/, 2/*L2 or L3 cache likely*/); + //__builtin_prefetch(&arr[i + (1 * 16)], 0, 2); // r, L2 or L3 cache __builtin_prefetch(&arr[i + (1 * 16)]); // Creates no object, struct is empty OccurenceMagic(arr, i, radicsOut); @@ -290,16 +294,16 @@ namespace MagyarSort { * * Rem.: If you use the VectorGiverWithReuse please remind yourself to Gc() it time-to-time! * - * Beware: GC needs to happen on all threads that use us! + * Beware: GC needs to happen on all threads that use us if you want to GC! * * @param arr The array to sort. Result will be in the same array - as sorted. * @param size The lenght of the array - should fit in the COUNTER_TYP. - * @param COUNTER_TYP OPTIONAL: When set this type will be the counter type. + * @param COUNTER_TYP OPTIONAL: When set this type will be the counter type. For most cases uint32_t is enough. * @param REUSE OPTIONAL: When true, we reuse the array instead of always gettin' and releasin' from da heap. * @param GC OPTIONAL: When true, we garbage collect memory from previous sorts if REUSE is true. * @param GC_WITHOUT_SORT OPTIONAL: When true, we "just GC" but do not sort in case of GC is true. */ - template + template inline void __attribute__((always_inline)) sort_impl(uint32_t arr[], COUNTER_TYP size) noexcept { // Most funny optimization is this multiply here :-) // @@ -308,6 +312,10 @@ namespace MagyarSort { // optimize the first call for sort when we REUSE the array so size is fine! 
static thread_local std::vector arc(size * REUSE); +#ifndef NO_MLOCK + mlock(arr, size * sizeof(uint32_t)); +#endif // !NO_MLOCK + // "Garbage-collection" if(GC) { arc = std::vector(); @@ -321,6 +329,9 @@ namespace MagyarSort { // Holds "digit" occurences, prefix sums, whatevers // First "DIGIT_RANGE" elem is for MSB "DIGITS", last is for LSB static thread_local COUNTER_TYP radics[DIGITS * DIGIT_RANGE]; +#ifndef NO_MLOCK + mlock(radics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP)); +#endif // !NO_MLOCK // Write prefetchin' //__builtin_prefetch(&radicsOut[..], 1); PrefetchMagic pm(radics); @@ -346,9 +357,6 @@ namespace MagyarSort { // Regular radix sort needs a copy, see: https://www.youtube.com/watch?v=ujb2CIWE8zY // But instead of the below, we do a trickery... // - //std::vector arc(size); - //auto arc = VectorGiver::Give(size); // "auto" is needed for this to perform well with some givers! - // // Rem.: The branch is optimized out in compile time! if(REUSE) { arc.resize(size); @@ -357,6 +365,9 @@ namespace MagyarSort { // We must regain memory of previous! arc = std::move(std::vector(size)); } +#ifndef NO_MLOCK + mlock(&arc[0], size * sizeof(uint32_t)); +#endif // !NO_MLOCK uint32_t *from = arr; uint32_t *to = &arc[0]; @@ -371,6 +382,11 @@ namespace MagyarSort { if(swapped) { // <- in reality this is what we want because of last swap happened anyways! memcpy(arr, to, size); } +#ifndef NO_MLOCK + munlock(radics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP)); + munlock(&arc[0], size * sizeof(uint32_t)); + munlock(arr, size * sizeof(uint32_t)); +#endif // !NO_MLOCK } /** @@ -425,10 +441,10 @@ namespace MagyarSort { * * @param arr The array to sort. Result will be in the same array - as sorted. * @param size The lenght of the array. - * @param COUNTER_TYP OPTIONAL: It is best kepts as size_t, but for smaller arrays, you can use uint32_t. + * @param COUNTER_TYP: Should be size_t for HUGE arrays, but regular arrays, you can use uint32_t. 
Should be auto found-out */ - template - inline void __attribute__((always_inline)) sort_no_reuse(uint32_t arr[], size_t size) noexcept { + template + inline void __attribute__((always_inline)) sort_no_reuse(uint32_t arr[], COUNTER_TYP size) noexcept { // We use the heap once per every call... // This is safer and we do not need garbage collecting MagyarSort::sort_impl(arr, size); @@ -446,10 +462,10 @@ namespace MagyarSort { * * @param arr The array to sort. Result will be in the same array - as sorted. * @param size The lenght of the array. - * @param COUNTER_TYP OPTIONAL: It is best kepts as size_t, but for smaller arrays, you can use uint32_t. + * @param COUNTER_TYP: Should be size_t for HUGE arrays, but regular arrays, you can use uint32_t. Should be auto found-out */ - template - inline void sort(uint32_t arr[], size_t size) noexcept { + template + inline void sort(uint32_t arr[], COUNTER_TYP size) noexcept { #ifdef MAGYAR_SORT_DEFAULT_REUSE MagyarSort::sort_reuse(arr, size); #else diff --git a/ypsu.cpp b/ypsu.cpp index a6fc8f6..e832f0f 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -11,6 +11,7 @@ #include #include #include + #include // mlock & munlock #include "ska_sort.hpp" @@ -260,6 +261,47 @@ free(buf); } + // frewr - four rewrites. 
// frewr - "four rewrites".
//
// Sorts arr[0..n) in ascending order, in place, with a least-significant-byte
// first radix sort that makes four scatter passes (one per byte of the key).
//
// Each pass distributes the elements by their low byte and stores every value
// rotated right by 8 bits, so the next pass again finds its digit in the low
// byte. After four passes each value has been rotated by 32 bits in total
// (i.e. it is back to its original bit pattern) and the source/destination
// buffers have been exchanged four times, so the sorted data ends up in arr.
//
// @param arr  Keys to sort; overwritten with the sorted sequence.
// @param n    Number of keys. Values <= 1 return immediately.
//
// A scratch buffer of n keys is taken from malloc() for the duration of the
// call; the process is aborted with a diagnostic if that allocation fails.
// Pinning the scratch buffer with mlock() is best-effort and can be compiled
// out with NO_MLOCK, like the locking in magyarsort.h.
void frewr(uint32_t *arr, int n) {
  // Zero or one key is already sorted. This guard also keeps a negative n
  // away from the size computation below.
  if (n <= 1) {
    return;
  }

  // Compute the byte count in size_t: the int expression `n * 4` overflows
  // as soon as n exceeds 2^29.
  const size_t bytes = static_cast<size_t>(n) * sizeof(uint32_t);
  uint32_t *tmpbuf = static_cast<uint32_t *>(malloc(bytes));
  if (tmpbuf == nullptr) {
    // There is no way to report failure through the void return type and
    // continuing would dereference a null pointer in the scatter loop.
    perror("frewr: malloc");
    abort();
  }

#ifndef NO_MLOCK
  // mlock() fails when the request exceeds the locked-memory limit; the sort
  // works without it, so only remember whether there is something to unlock.
  const bool locked = (mlock(tmpbuf, bytes) == 0);
#endif  // !NO_MLOCK

  // btoffsets[p] first holds the histogram of the byte consumed by pass p:
  // row 3 is the least significant byte (handled first), row 0 the most
  // significant one (handled last).
  int btoffsets[4][256] = {};
  #pragma GCC unroll 64
  for (int i = n - 1; i >= 0; i--) {
    const uint32_t a = arr[i];
    btoffsets[3][a & 0xff]++;
    btoffsets[2][a >> 8 & 0xff]++;
    btoffsets[1][a >> 16 & 0xff]++;
    btoffsets[0][a >> 24 & 0xff]++;
  }

  // Turn every histogram into "index of the last slot of this bucket",
  // walking the buckets from 255 down to 0 so bucket 255 ends at n - 1.
  int btend[4] = {n - 1, n - 1, n - 1, n - 1};
  #pragma GCC unroll 16
  for (int i = 255; i >= 0; i--) {
    #pragma GCC unroll 4
    for (int pass = 3; pass >= 0; pass--) {
      const int nbtend = btend[pass] - btoffsets[pass][i];
      btoffsets[pass][i] = btend[pass];
      btend[pass] = nbtend;
    }
  }

  uint32_t *src = arr, *dst = tmpbuf;
  #pragma GCC unroll 4
  for (int pass = 3; pass >= 0; pass--) {
    int *off = btoffsets[pass];
    // Reading the source back to front while filling each bucket back to
    // front keeps equal digits in their previous relative order, which is
    // what makes the byte-by-byte passes compose into a full sort.
    #pragma GCC unroll 64
    for (int i = n - 1; i >= 0; i--) {
      const uint32_t v = src[i];
      const uint32_t bucket = v & 0xff;
      dst[off[bucket]--] = v >> 8 | v << 24;
      // Hint for a slot this bucket is about to write; the result of the
      // prefetch is never read, it only warms the cache.
      __builtin_prefetch(&dst[off[bucket] - 2]);
    }
    uint32_t *tmp = src;
    src = dst;
    dst = tmp;
  }

#ifndef NO_MLOCK
  if (locked) {
    munlock(tmpbuf, bytes);
  }
#endif  // !NO_MLOCK
  free(tmpbuf);
}