#ifndef MAGYAR_SORT_H #define MAGYAR_SORT_H /** * single header lib: In-place, fast heavily modified and optimized radix sort. * * Only unsigned ints for now, but should be able to modify for int and float... * This is the counting variant with smart changes (not per-bit). * * LICENCE: CC3 - look it up, you need to mention me but that is all */ #include #include #include // memset // TODO: Only for the regular radix I guess #include #include // std::swap namespace MagyarSort { /* CONFIG */ // Only change these if you know what you are doing // I use these because I want to see if nibbles are // better or something... // // Bytes of nibbles only: // - DIGIT_RANGE and BITS_PER_DIGIT should correspond // - DIGITS should also correspond with the uint32_t // - and DIGIT_RANGE should be 2^n value (16 or 256) #ifdef MAGYAR_SORT_NIBBLE // Per-nibble digits sorting static constexpr int DIGITS = 8; // "helyiérték" static constexpr int BITS_PER_DIGIT = 4; // "bit / helyiérték" static constexpr int DIGIT_RANGE = 16; // "helyiérték állapottér" #else // Per-byte digits sorting static constexpr int DIGITS = 4; // "helyiérték" static constexpr int BITS_PER_DIGIT = 8; // "bit / helyiérték" static constexpr int DIGIT_RANGE = 256; // "helyiérték állapottér" #endif /* DEBUG */ void debugArr(uint32_t *arr, size_t size) { for(int i = 0; i < size; ++i) { printf("%x, ", arr[i]); } printf("\n"); } void debugRadics(size_t *radics) { for(size_t j = 0; j < DIGITS; ++j) { printf("d%d: ", j); for(size_t i = 0; i < DIGIT_RANGE; ++i) { printf("%d,", radics[i + DIGIT_RANGE*j]); } printf("\n\n"); } } /* HELPERS */ template static inline uint32_t getDigit(uint32_t num) noexcept { static constexpr int SHIFT = DIGIT_CHOICE * BITS_PER_DIGIT; uint32_t shifted = num >> SHIFT; return shifted & (DIGIT_RANGE - 1); } /** Recursive Functor: no class should be generated I think (compiler should be smart) */ template struct OccurenceMagic : public OccurenceMagic { inline OccurenceMagic(uint32_t arr[], size_t i, size_t *radicsOut) noexcept : OccurenceMagic(arr, i, radicsOut) { // Parents run first so template recursion runs DIGIT=0 first... ++radicsOut[getDigit(arr[i]) + DIGIT_RANGE * DIGIT]; } }; /** Ends template recursion */ template<> struct OccurenceMagic<-1> { inline OccurenceMagic(uint32_t arr[], size_t i, size_t *radicsOut) noexcept {} }; static inline void countOccurences(uint32_t arr[], size_t size, size_t *radicsOut) noexcept { for(size_t i = 0; i < size; ++i) { // Creates no object, struct is empty OccurenceMagic(arr, i, radicsOut); } } /** Recursive Functor: no class should be generated I think (compiler should be smart) */ template struct PrefixMagic : public PrefixMagic { inline PrefixMagic(size_t *radics, size_t *prev, int i) noexcept : PrefixMagic(radics, prev, i) { static constexpr int DSTART = (DIGIT * DIGIT_RANGE); radics[DSTART + i] += prev[DIGIT]; prev[DIGIT] = radics[DSTART + i]; } }; /** Ends template recursion */ template<> struct PrefixMagic<-1> { inline PrefixMagic(size_t *radics, size_t *prev, int i) noexcept {} }; /** Gets REFERENCE to the given digit from the radix-array that has more than one digits */ template static inline size_t &rGet(size_t *radics, size_t i) noexcept { static constexpr int DSTART = (DIGIT * DIGIT_RANGE); return radics[DSTART + i]; } static inline void calcPrefixSums(size_t *radics) noexcept { static thread_local size_t prev[DIGITS]; memset(prev, 0, sizeof(prev)); for(int i = 0; i < DIGIT_RANGE; ++i) { // This is a template-unrolled loop too PrefixMagic(radics, prev, i); } } /** Recursive Functor: no class should be generated I think (compiler should be smart) */ template struct RadixMagic : public RadixMagic { inline RadixMagic(size_t *radics, uint32_t *&from, uint32_t *&to, size_t size) noexcept // BEWARE: "*&" needed to swap pointers.. : RadixMagic(radics, from, to, size) { // DEBUG //printf("%d before: ", DIGIT); //debugArr(from, size); for(size_t i = size; i > 0; --i) { // right-to-left to ensure already sorted digits order we keep for iterations // Get num and its new offset / location auto num = from[i - 1]; auto digVal = getDigit(num); auto offset = (--rGet(radics, digVal)); // Add to the proper target location to[offset] = num; } // DEBUG //printf("%d after: ", DIGIT); //debugArr(to, size); // Only swaps pointers :-) std::swap(from, to); } }; /** Ends template recursion */ template<> struct RadixMagic<-1> { inline RadixMagic(size_t *radics, uint32_t *&from, uint32_t *&to, size_t size) noexcept { } }; /* SORT */ /** Sort the given array (in-place sorting) with the given size */ inline void sort(uint32_t arr[], size_t size) noexcept { // Holds "digit" occurences, prefix sums, whatevers // First "DIGIT_RANGE" elem is for MSB "DIGITS", last is for LSB static thread_local size_t radics[DIGITS * DIGIT_RANGE]; memset(radics, 0, sizeof(radics)); // Calculate occurences of digits countOccurences(arr, size, radics); //debugRadics(radics); // Calculate prefix sums calcPrefixSums(radics); //debugRadics(radics); /* Regular (old) radix sort with small twist */ // Regular radix sort - I just changed occurence couting and prefix summing to have more ILP // But because my approach does not use that, I want to keep this version in a branch for a // regular radix sort using better ILP just to see how it is doing if I wrote those "Magic" // above already anyways... // Regular radix sort needs a copy, see: https://www.youtube.com/watch?v=ujb2CIWE8zY std::vector arc(size); uint32_t *from = arr; uint32_t *to = &arc[0]; RadixMagic(radics, from, to, size); // With an other API we could spare this copy if we can delete original arr and return ptr or something... // I am fine with this... this is not my main idea anyways, just little ILP tweak to regular radix sort //if(to != arr) { // <- logically, but bad they are already swapped here!!! BEWARE if(from != arr) { // <- in reality this is what we want because of last swap happened anyways! memcpy(arr, from, size); } } }; #endif