diff --git a/magyarsort.h b/magyarsort.h index dd2ed53..27e7146 100644 --- a/magyarsort.h +++ b/magyarsort.h @@ -11,6 +11,7 @@ #include #include +#include // memset namespace MagyarSort { @@ -34,21 +35,19 @@ namespace MagyarSort { return shifted & (DIGIT_RANGE - 1); } - /** Functor: no class should be generated I think (compiler should be smart) */ + /** Recursive Functor: no class should be generated I think (compiler should be smart) */ template struct OccurenceMagic : public OccurenceMagic { inline OccurenceMagic(uint32_t arr[], size_t i, size_t *radicsOut) noexcept : OccurenceMagic(arr, i, radicsOut) { - // parents run first so template recursion runs DIGIT=0 first... + // Parents run first so template recursion runs DIGIT=0 first... ++radicsOut[getDigit(arr[i]) + DIGIT_RANGE * DIGIT]; } }; /** Ends template recursion */ template<> struct OccurenceMagic<-1> { - inline OccurenceMagic(uint32_t arr[], size_t i, size_t *radicsOut) noexcept { - /* empty */ - } + inline OccurenceMagic(uint32_t arr[], size_t i, size_t *radicsOut) noexcept {} }; static inline void countOccurences(uint32_t arr[], size_t size, size_t *radicsOut) noexcept { @@ -58,13 +57,29 @@ namespace MagyarSort { } } - template - static inline void prefixSum(size_t *radics) noexcept { - static constexpr int DSTART = DIGIT_CHOICE * DIGIT_RANGE; - size_t prev = 0; - for(int i = DSTART; i < (DSTART + DIGIT_RANGE); ++i) { - radics[i] += prev; - prev = radics[i]; + /** Recursive Functor: no class should be generated I think (compiler should be smart) */ + template + struct PrefixMagic : public PrefixMagic { + inline PrefixMagic(size_t *radics, size_t *prev, int i) noexcept + : PrefixMagic(radics, prev, i) { + static constexpr int DSTART = (DIGIT * DIGIT_RANGE); + radics[DSTART + i] += prev[DIGIT]; + prev[DIGIT] = radics[DSTART + i]; + } + }; + /** Ends template recursion */ + template<> + struct PrefixMagic<-1> { + inline PrefixMagic(size_t *radics, size_t *prev, int i) noexcept {} + }; + + static inline void calcPrefixSums(size_t *radics) noexcept { + static thread_local size_t prev[DIGITS]; + memset(prev, 0, sizeof(prev)); + + for(int i = 0; i < DIGIT_RANGE; ++i) { + // This is a template-unrolled loop too + PrefixMagic(radics, prev, i); } } @@ -73,19 +88,13 @@ namespace MagyarSort { // Holds "digit" occurences, prefix sums, whatevers // First "DIGIT_RANGE" elem is for MSB "DIGITS", last is for LSB static thread_local size_t radics[DIGITS * DIGIT_RANGE]; - for(int i = 0; i < (DIGITS * DIGIT_RANGE); ++i) { radics[i] = 0; } + memset(radics, 0, sizeof(radics)); // Calculate occurences of digits countOccurences(arr, size, radics); // Calculate prefix sums - // TODO: Maybe should use better ILP here? - // but maybe this is more cache friendly? - // TODO: manual digits! - prefixSum<0>(radics); - prefixSum<1>(radics); - prefixSum<2>(radics); - prefixSum<3>(radics); + calcPrefixSums(radics); /* // DEBUG: */