#ifndef MAGYAR_SORT_H
#define MAGYAR_SORT_H

/**
 * Single-header lib: in-place, fast, heavily modified and optimized radix sort.
 *
 * Only unsigned ints for now, but it should be possible to modify it for int and float...
 * This is the counting variant with smart changes (not per-bit).
 *
 * LICENCE: CC3 - look it up, you need to mention me but that is all
 */

/*
 * Does not help much:
 * // #pragma GCC target ("avx2")
 * // #pragma GCC optimization ("unroll-loops")
 */

#include <cstdint>
#include <cstdio>
#include <cstring> // memset
// TODO: Only for the regular radix I guess
#include <vector>
#include <utility> // std::swap
#ifndef NO_MLOCK
#include <sys/mman.h> // mlock & munlock
#endif // !NO_MLOCK

namespace MagyarSort {

/* CONFIG */

// Only change these if you know what you are doing.
// I use these because I want to see if nibbles are
// better or something...
//
// Bytes or nibbles only:
// - DIGIT_RANGE and BITS_PER_DIGIT should correspond
// - DIGITS should also correspond with the uint32_t
// - and DIGIT_RANGE should be a 2^n value (16 or 256)
#ifdef MAGYAR_SORT_NIBBLE
// Per-nibble digit sorting
static constexpr int DIGITS = 8;         // digit positions ("place values")
static constexpr int BITS_PER_DIGIT = 4; // bits per digit
static constexpr int DIGIT_RANGE = 16;   // number of possible values per digit
#else
/*
// Per-word digit sorting
static constexpr int DIGITS = 2;             // digit positions ("place values")
static constexpr int BITS_PER_DIGIT = 16;    // bits per digit
static constexpr int DIGIT_RANGE = 65536;    // number of possible values per digit
*/
// Per-byte digit sorting
static constexpr int DIGITS = 4;         // digit positions ("place values")
static constexpr int BITS_PER_DIGIT = 8; // bits per digit
static constexpr int DIGIT_RANGE = 256;  // number of possible values per digit
#endif
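// Rem.: A minimal compile-time sanity-check sketch for the "should correspond" rules
//       above (assuming the digits are meant to exactly cover the 32 bits of the
//       uint32_t keys, which is what the uint32_t-only sort implies):
static_assert(DIGIT_RANGE == (1 << BITS_PER_DIGIT),
        "DIGIT_RANGE and BITS_PER_DIGIT should correspond");
static_assert(DIGITS * BITS_PER_DIGIT == 32,
        "DIGITS should correspond with the uint32_t key width");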
/* DEBUG */

inline void debugArr(uint32_t *arr, size_t size) {
    for(size_t i = 0; i < size; ++i) {
        printf("%x, ", arr[i]);
    }
    printf("\n");
}

template<typename COUNTER_TYP>
void debugRadics(COUNTER_TYP *magics) {
    for(size_t j = 0; j < DIGITS; ++j) {
        printf("d%zu: ", j);
        for(int i = 0; i < DIGIT_RANGE; ++i) {
            printf("%zu,", (size_t) magics[i + DIGIT_RANGE * j]);
        }
        printf("\n\n");
    }
}

/* HELPERS */

template<int DIGIT_CHOICE>
static inline __attribute__((always_inline)) uint32_t getDigit(uint32_t num) noexcept {
    static constexpr int SHIFT = DIGIT_CHOICE * BITS_PER_DIGIT;
    uint32_t shifted = num >> SHIFT;
    return shifted & (DIGIT_RANGE - 1);
}

/** Recursive Functor: no class should be generated I think (compiler should be smart) */
template<int DIGIT, typename COUNTER_TYP>
struct OccurenceMagic : public OccurenceMagic<DIGIT - 1, COUNTER_TYP> {
    inline __attribute__((always_inline))
    OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *magicsOut) noexcept
            : OccurenceMagic<DIGIT - 1, COUNTER_TYP>(arr, i, magicsOut) {
        // Parents run first so template recursion runs DIGIT=0 first...
        ++magicsOut[getDigit<DIGIT>(arr[i]) + DIGIT_RANGE * DIGIT];
    }
};

/** Ends template recursion */
template<typename COUNTER_TYP>
struct OccurenceMagic<-1, COUNTER_TYP> {
    inline __attribute__((always_inline))
    OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *magicsOut) noexcept {}
};

/** ARR_END must be a (STEP * k) */
template<int ARR_END, int STEP, typename ARR_T, int R_OR_W, int LOCALITY>
struct PrefetchMagic : public PrefetchMagic<(ARR_END - STEP), STEP, ARR_T, R_OR_W, LOCALITY> {
    inline __attribute__((always_inline))
    PrefetchMagic(ARR_T *arr) noexcept
            : PrefetchMagic<(ARR_END - STEP), STEP, ARR_T, R_OR_W, LOCALITY>(arr) {
        __builtin_prefetch(&arr[ARR_END - STEP], R_OR_W, LOCALITY);
    }
};

/** Ends template recursion */
template<int STEP, typename ARR_T, int R_OR_W, int LOCALITY>
struct PrefetchMagic<0, STEP, ARR_T, R_OR_W, LOCALITY> {
    inline __attribute__((always_inline))
    PrefetchMagic(ARR_T *arr) noexcept {}
};

template<typename COUNTER_TYP>
static inline void countOccurences(uint32_t arr[], COUNTER_TYP size, COUNTER_TYP *magicsOut) noexcept {
    COUNTER_TYP i = 0;
    //#pragma GCC unroll 4
    // (i + 64 <= size avoids the unsigned underflow of "size - 64" for small sizes)
    for(; i + 64 <= size; i += 64) {
        // Prefetch for read into level-1 cache
        //__builtin_prefetch(&arr[i + (1 * 16)], 0, 2); // read, L2 or L3 cache
        __builtin_prefetch(&arr[i + (1 * 16)]);
        // Only creates empty temporaries - the structs have no data members
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 1, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 2, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 3, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 4, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 5, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 6, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 7, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 8, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 9, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 10, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 11, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 12, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 13, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 14, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 15, magicsOut);
        // Prefetch for read into level-1 cache
        __builtin_prefetch(&arr[i + (2 * 16)]);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 16, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 17, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 18, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 19, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 20, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 21, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 22, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 23, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 24, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 25, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 26, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 27, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 28, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 29, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 30, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 31, magicsOut);
        __builtin_prefetch(&arr[i + (3 * 16)]);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 32, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 33, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 34, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 35, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 36, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 37, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 38, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 39, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 40, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 41, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 42, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 43, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 44, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 45, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 46, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 47, magicsOut);
        // __builtin_prefetch(&arr[i + (4 * 16)]); // Only needed for longer than 64 unrolls
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 48, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 49, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 50, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 51, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 52, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 53, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 54, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 55, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 56, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 57, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 58, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 59, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 60, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 61, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 62, magicsOut);
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 63, magicsOut);
    }
    #pragma GCC unroll 4
    for(; i < size; ++i) {
        OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i, magicsOut);
    }
}
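/*
 * Rem.: For reference, the template-unrolled counting above computes the same thing as this
 * plain (slower, less ILP-friendly) sketch - kept as a comment only, so nothing extra is compiled:
 *
 *   for(COUNTER_TYP i = 0; i < size; ++i) {
 *       for(int d = 0; d < DIGITS; ++d) {
 *           ++magicsOut[((arr[i] >> (d * BITS_PER_DIGIT)) & (DIGIT_RANGE - 1)) + DIGIT_RANGE * d];
 *       }
 *   }
 */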
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
template<int DIGIT, typename COUNTER_TYP>
struct PrefixMagic : public PrefixMagic<DIGIT - 1, COUNTER_TYP> {
    inline __attribute__((always_inline))
    PrefixMagic(COUNTER_TYP *magics, COUNTER_TYP *prev, int i) noexcept
            : PrefixMagic<DIGIT - 1, COUNTER_TYP>(magics, prev, i) {
        static constexpr int DSTART = (DIGIT * DIGIT_RANGE);
        magics[DSTART + i] += prev[DIGIT];
        prev[DIGIT] = magics[DSTART + i];
    }
};

/** Ends template recursion */
template<typename COUNTER_TYP>
struct PrefixMagic<-1, COUNTER_TYP> {
    inline PrefixMagic(COUNTER_TYP *magics, COUNTER_TYP *prev, int i) noexcept {}
};

/** Gets REFERENCE to the given digit from the radix-array that has more than one digit */
template<int DIGIT, typename COUNTER_TYP>
static inline __attribute__((always_inline)) COUNTER_TYP &rGet(COUNTER_TYP *magics, int i) noexcept {
    static constexpr int DSTART = (DIGIT * DIGIT_RANGE);
    return magics[DSTART + i];
}

/** Helper for calcPrefixSums */
template<int DIGIT, typename COUNTER_TYP>
struct PMagic2 : public PMagic2<DIGIT - 1, COUNTER_TYP> {
    inline __attribute__((always_inline))
    PMagic2(COUNTER_TYP *magics, COUNTER_TYP *prev)
            : PMagic2<DIGIT - 1, COUNTER_TYP>(magics, prev) {
        // Again first the 0th index because of parent constructors!
        // This is a template-unrolled loop too - here DIGIT runs over the bucket index
        // (0..DIGIT_RANGE-1), while PrefixMagic unrolls over the digit positions.
        PrefixMagic<DIGITS - 1, COUNTER_TYP>(magics, prev, DIGIT);
    }
};

/** Template recursion endpoint */
template<typename COUNTER_TYP>
struct PMagic2<-1, COUNTER_TYP> {
    inline __attribute__((always_inline))
    PMagic2(COUNTER_TYP *magics, COUNTER_TYP *prev) {}
};

template<typename COUNTER_TYP>
static inline void calcPrefixSums(COUNTER_TYP *magics) noexcept {
    static thread_local COUNTER_TYP prev[DIGITS];
    memset(prev, 0, sizeof(prev));
    // This is a template-unrolled loop too
    if constexpr (DIGIT_RANGE < 1024) {
        // Extra optimization for bytes and nibbles - totally unrolled loop!
        PMagic2<DIGIT_RANGE - 1, COUNTER_TYP>(magics, prev);
    } else {
        // The above would not work for words and higher up...
        #pragma GCC unroll 16
        for(int j = 0; j < DIGITS; ++j) {
            #pragma GCC unroll 64
            for(int i = 0; i < DIGIT_RANGE; ++i) {
                int DSTART = (j * DIGIT_RANGE);
                magics[DSTART + i] += prev[j];
                prev[j] = magics[DSTART + i];
            }
        }
    }
}
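/*
 * Rem.: A tiny worked example of what calcPrefixSums produces (one digit position shown):
 * if the occurence counts of a digit are {2, 0, 3, 1, 0, ...} then after the inclusive
 * prefix sum the slots become {2, 2, 5, 6, 6, ...}. The scatter pass below walks the
 * input right-to-left and uses "--" on these values, so the last element of each bucket
 * lands at (prefixSum - 1) and earlier equal elements land right before it - this is
 * what keeps the sort stable per digit.
 */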
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
template<int DIGIT, typename COUNTER_TYP>
struct RadixMagic : public RadixMagic<DIGIT - 1, COUNTER_TYP> {
    inline __attribute__((always_inline))
    RadixMagic(bool &swapped, COUNTER_TYP *magics, uint32_t *from, uint32_t *to, COUNTER_TYP size) noexcept
            : RadixMagic<DIGIT - 1, COUNTER_TYP>(swapped, magics, from, to, size) {
        // Tricky: see (**)
        if(swapped) { // never true for DIGIT 0, see (***)
            std::swap(from, to);
        }

        // DEBUG
        //printf("%d before: ", DIGIT);
        //debugArr(from, size);

        #pragma GCC unroll 64
        for(COUNTER_TYP i = size; i > 0; --i) {
            // Right-to-left so the ordering from the already sorted (lower) digits is kept across iterations
            // Prefetch caches
            /*
            __builtin_prefetch(&from[i]); // TODO: is good?
            if(i >= 64) {
                __builtin_prefetch(&from[i - 64]);
            }
            // TODO: manually unroll?
            */

            // Get num and its new offset / location
            auto num = from[i - 1];
            auto digVal = getDigit<DIGIT>(num);
            auto offset = (--rGet<DIGIT>(magics, digVal));
            // Add to the proper target location
            to[offset] = num;
        }

        // DEBUG
        //printf("%d after: ", DIGIT);
        //debugArr(to, size);

        // (**) Only swaps pointers above in the child class constructor IF NEEDED :-)
        swapped = !swapped;
    }
};

/** Ends template recursion */
template<typename COUNTER_TYP>
struct RadixMagic<-1, COUNTER_TYP> {
    inline RadixMagic(bool swapped, COUNTER_TYP *magics, uint32_t *&from, uint32_t *&to, COUNTER_TYP size) noexcept {}
};
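/*
 * Rem.: One DIGIT level of the RadixMagic chain is just the classic counting-sort scatter
 * pass; in plain form (sketch only, not compiled) it would look like:
 *
 *   for(COUNTER_TYP i = size; i > 0; --i) {
 *       uint32_t num = from[i - 1];
 *       to[--magics[getDigit<DIGIT>(num) + DIGIT_RANGE * DIGIT]] = num;
 *   }
 *
 * The "swapped" flag only tracks the parity of the passes: even passes go arr -> arc and
 * odd passes go arc -> arr, so with an even DIGITS count the data ends up back in arr and
 * the final memcpy in sort_impl is skipped.
 */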
/* SORT */

// FIXME: I think the GC is not working, because these are separately generated functions with separate static thread_locals!!!

/*
 * Sort the given array (in-place sorting) with the given size.
 *
 * Rem.: If you use the VectorGiverWithReuse please remind yourself to Gc() it from time to time!
 *
 * Beware: GC needs to happen on all threads that use us if you want to GC!
 *
 * @param arr The array to sort. Result will be in the same array - as sorted.
 * @param size The length of the array - should fit in the COUNTER_TYP.
 * @param COUNTER_TYP OPTIONAL: When set, this type will be the counter type. For most cases uint32_t is enough.
 * @param REUSE OPTIONAL: When true, we reuse the temporary array instead of always gettin' and releasin' from da heap.
 * @param GC OPTIONAL: When true, we garbage collect memory from previous sorts if REUSE is true.
 * @param GC_WITHOUT_SORT OPTIONAL: When true, we "just GC" but do not sort in case GC is true.
 */
template<typename COUNTER_TYP = uint32_t, bool REUSE = true, bool GC = false, bool GC_WITHOUT_SORT = false>
inline void __attribute__((always_inline)) sort_impl(uint32_t arr[], COUNTER_TYP size) noexcept {
    // Most funny optimization is this multiply here :-)
    //
    // Literally.. come on.. this makes it nearly a compile-time, macro-like
    // ifdef-like thing as we avoid memory allocations of size BUT also we
    // optimize the first call for sort when we REUSE the array so size is fine!
    static thread_local std::vector<uint32_t> arc(size * REUSE);
#ifndef NO_MLOCK
    mlock(arr, size * sizeof(uint32_t));
#endif // !NO_MLOCK

    // "Garbage-collection"
    if constexpr (GC) {
        arc = std::vector<uint32_t>();
        // This must be implemented like this, because we can only access
        // the static in our function body so this is the "way".
        if constexpr (GC_WITHOUT_SORT) {
            return;
        }
    }

    // Holds "digit" occurences, prefix sums, whatevers
    // First "DIGIT_RANGE" elems belong to the LSB digit (DIGIT 0), the last ones to the MSB
    static thread_local COUNTER_TYP magics[DIGITS * DIGIT_RANGE];
#ifndef NO_MLOCK
    mlock(magics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP));
#endif // !NO_MLOCK

    // Write prefetchin' of the whole counter array, one cache line (16 counters) at a time
    //__builtin_prefetch(&magicsOut[..], 1);
    if constexpr (DIGIT_RANGE <= 1024) {
        PrefetchMagic<DIGITS * DIGIT_RANGE, 16, COUNTER_TYP, 1, 3> pm(magics);
    }
    memset(magics, 0, sizeof(magics));

    // Calculate occurences of digits
    countOccurences(arr, size, magics);
    //debugRadics(magics);

    // Calculate prefix sums
    calcPrefixSums(magics);
    //debugRadics(magics);

    /* Regular (old) radix sort with a small twist */

    // Regular radix sort - I just changed occurence counting and prefix summing to have more ILP.
    // But because my main approach does not use that, I want to keep this version in a branch as a
    // regular radix sort with better ILP, just to see how it is doing - I wrote those "Magic"
    // helpers above already anyways...
    // Regular radix sort needs a copy, see: https://www.youtube.com/watch?v=ujb2CIWE8zY
    // But instead of the below, we do a trickery...
    //
    // Rem.: The branch is optimized out at compile time!
    if constexpr (REUSE) {
        arc.resize(size);
    } else {
        // Must not be .clear() !!!
        // We must regain the memory of the previous call!
        arc = std::vector<uint32_t>(size);
    }
#ifndef NO_MLOCK
    mlock(&arc[0], size * sizeof(uint32_t));
#endif // !NO_MLOCK

    uint32_t *from = arr;
    uint32_t *to = &arc[0];
    static thread_local bool swapped;
    swapped = false; // (***) must be a separate line: the static is initialized only once, but we need false on every call
    RadixMagic<DIGITS - 1, COUNTER_TYP> r(swapped, magics, from, to, size);

    // With another API we could spare this copy if we could delete the original arr and return a ptr or something...
    // I am fine with this... this is not my main idea anyways, just a little ILP tweak to the regular radix sort.
    //if(to != arr) // <- logically, but bad: they are already swapped here!!! BEWARE
    if(swapped) { // <- in reality this is what we want, because the last swap happened anyways!
        memcpy(arr, to, size * sizeof(uint32_t));
    }

#ifndef NO_MLOCK
    munlock(magics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP));
    munlock(&arc[0], size * sizeof(uint32_t));
    munlock(arr, size * sizeof(uint32_t));
#endif // !NO_MLOCK
}

/**
 * Garbage collect reused data structures from the last call.
 *
 * This is optimized and is a NO-OP if MAGYAR_SORT_DEFAULT_REUSE is not defined!
 * - unless you use the FORCE! May it be with you if you need it.
 *
 * @param FORCE OPTIONAL: When true, the gc happens even if MAGYAR_SORT_DEFAULT_REUSE is not defined!
 */
template<bool FORCE = false>
inline void gc() noexcept {
    if constexpr (FORCE) {
        // Only GC-ing
        MagyarSort::sort_impl<uint32_t, true, true, true>(nullptr, 0);
    } else {
#ifdef MAGYAR_SORT_DEFAULT_REUSE
        // Only GC-ing
        MagyarSort::sort_impl<uint32_t, true, true, true>(nullptr, 0);
#endif
    }
}

/**
 * Sort the given array (in-place sorting) with the given size.
 *
 * Rem.: Please remind yourself to gc() from time to time!
 * Rem.: Thread-safe to use!
 *
 * Beware: MagyarSort::gc<true>(); needs to happen on all threads that use this variant, otherwise memory leaks away!
 * Please mind the "true" template parameter that forces the GC even when sort by default does not reuse...
 *
 * @param arr The array to sort. Result will be in the same array - as sorted.
 * @param size The length of the array.
 * @param COUNTER_TYP OPTIONAL: It is best kept as size_t, but for smaller arrays you can use uint32_t.
 * @param GC OPTIONAL: When true, we garbage collect before this sort - so the cached memory size will be "size" elems.
 */
template<typename COUNTER_TYP = uint32_t, bool GC = false>
inline void __attribute__((always_inline)) sort_reuse(uint32_t arr[], COUNTER_TYP size) noexcept {
    // Reuse the temporary vectors across runs.
    // This results in much fewer heap allocations and is much faster on gcc
    // and also a bit faster on clang too.
    MagyarSort::sort_impl<COUNTER_TYP, true, GC, false>(arr, size);
}

/**
 * Sort the given array (in-place sorting) with the given size.
 *
 * Rem.: Thread-safe to use!
 *
 * Beware: MagyarSort::gc(); needs to happen on all threads that use this variant, otherwise memory leaks away!
 *
 * @param arr The array to sort. Result will be in the same array - as sorted.
 * @param size The length of the array.
 * @param COUNTER_TYP: Should be size_t for HUGE arrays, but for regular arrays you can use uint32_t. Ideally this would be found out automatically.
 */
template<typename COUNTER_TYP = uint32_t>
inline void __attribute__((always_inline)) sort_no_reuse(uint32_t arr[], COUNTER_TYP size) noexcept {
    // We use the heap once per every call...
    // This is safer and we do not need garbage collecting.
    MagyarSort::sort_impl<COUNTER_TYP, false, false, false>(arr, size);
}
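/*
 * Example usage (a minimal sketch, assuming this header is saved as "magyar_sort.h" and
 * built as C++17 - the file name and the sample numbers are only for illustration):
 *
 *   #include <cstdio>
 *   #include "magyar_sort.h"
 *
 *   int main() {
 *       uint32_t nums[] = {42u, 7u, 4000000000u, 7u, 0u, 3u};
 *       MagyarSort::sort(nums, (uint32_t)6);
 *       for(uint32_t n : nums) printf("%u ", n); // 0 3 7 7 42 4000000000
 *       printf("\n");
 *       MagyarSort::gc<true>(); // ask the lib to drop its cached per-thread buffer (see the FIXME about GC above)
 *       return 0;
 *   }
 */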
/*
 * Sort the given array (in-place sorting) with the given size.
 *
 * Rem.: If you use the VectorGiverWithReuse please remind yourself to Gc() it from time to time!
 *
 * Beware: MagyarSort::gc(); should be called after "sort bursts" (consecutive fast sorts), when you need the memory back,
 * on all threads that use this variant - otherwise memory leaks away as the biggest sorted array keeps sitting in RAM!
 * This depends on whether the config #define MAGYAR_SORT_DEFAULT_REUSE is defined or not. Define it and you get reuse,
 * and if you get reuse you can call multiple sorts with reused temporary buffers that you gc() afterwards!
 *
 * @param arr The array to sort. Result will be in the same array - as sorted.
 * @param size The length of the array.
 * @param COUNTER_TYP: Should be size_t for HUGE arrays, but for regular arrays you can use uint32_t. Ideally this would be found out automatically.
 */
template<typename COUNTER_TYP = uint32_t>
inline void sort(uint32_t arr[], COUNTER_TYP size) noexcept {
#ifdef MAGYAR_SORT_DEFAULT_REUSE
    MagyarSort::sort_reuse<COUNTER_TYP>(arr, size);
#else
    MagyarSort::sort_no_reuse<COUNTER_TYP>(arr, size);
#endif
}

} // namespace MagyarSort

#endif // MAGYAR_SORT_H