diff --git a/magyarsort.h b/magyarsort.h index 5c6fab1..65812f4 100644 --- a/magyarsort.h +++ b/magyarsort.h @@ -43,16 +43,17 @@ namespace MagyarSort { /* DEBUG */ void debugArr(uint32_t *arr, size_t size) { - for(int i = 0; i < size; ++i) { + for(size_t i = 0; i < size; ++i) { printf("%x, ", arr[i]); } printf("\n"); } - void debugRadics(size_t *radics) { + template + void debugRadics(COUNTER_TYP *radics) { for(size_t j = 0; j < DIGITS; ++j) { printf("d%zu: ", j); - for(size_t i = 0; i < DIGIT_RANGE; ++i) { + for(int i = 0; i < DIGIT_RANGE; ++i) { printf("%zu,", radics[i + DIGIT_RANGE*j]); } printf("\n\n"); @@ -70,72 +71,74 @@ namespace MagyarSort { } /** Recursive Functor: no class should be generated I think (compiler should be smart) */ - template - struct OccurenceMagic : public OccurenceMagic { - inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], size_t i, size_t *radicsOut) noexcept - : OccurenceMagic(arr, i, radicsOut) { + template + struct OccurenceMagic : public OccurenceMagic { + inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *radicsOut) noexcept + : OccurenceMagic(arr, i, radicsOut) { // Parents run first so template recursion runs DIGIT=0 first... ++radicsOut[getDigit(arr[i]) + DIGIT_RANGE * DIGIT]; } }; /** Ends template recursion */ - template<> - struct OccurenceMagic<-1> { - inline OccurenceMagic(uint32_t arr[], size_t i, size_t *radicsOut) noexcept {} + template + struct OccurenceMagic<-1, COUNTER_TYP> { + inline OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *radicsOut) noexcept {} }; - static inline void countOccurences(uint32_t arr[], size_t size, size_t *radicsOut) noexcept { + template + static inline void countOccurences(uint32_t arr[], COUNTER_TYP size, COUNTER_TYP *radicsOut) noexcept { #pragma GCC unroll 64 - for(size_t i = 0; i < size; ++i) { + for(COUNTER_TYP i = 0; i < size; ++i) { // Creates no object, struct is empty - OccurenceMagic(arr, i, radicsOut); + OccurenceMagic(arr, i, radicsOut); } } /** Recursive Functor: no class should be generated I think (compiler should be smart) */ - template - struct PrefixMagic : public PrefixMagic { - inline __attribute__((always_inline)) PrefixMagic(size_t *radics, size_t *prev, int i) noexcept - : PrefixMagic(radics, prev, i) { + template + struct PrefixMagic : public PrefixMagic { + inline __attribute__((always_inline)) PrefixMagic(COUNTER_TYP *radics, COUNTER_TYP *prev, int i) noexcept + : PrefixMagic(radics, prev, i) { static constexpr int DSTART = (DIGIT * DIGIT_RANGE); radics[DSTART + i] += prev[DIGIT]; prev[DIGIT] = radics[DSTART + i]; } }; /** Ends template recursion */ - template<> - struct PrefixMagic<-1> { - inline PrefixMagic(size_t *radics, size_t *prev, int i) noexcept {} + template + struct PrefixMagic<-1, COUNTER_TYP> { + inline PrefixMagic(COUNTER_TYP *radics, COUNTER_TYP *prev, int i) noexcept {} }; /** Gets REFERENCE to the given digit from the radix-array that has more than one digits */ - template - static inline __attribute__((always_inline)) size_t &rGet(size_t *radics, size_t i) noexcept { + template + static inline __attribute__((always_inline)) COUNTER_TYP &rGet(COUNTER_TYP *radics, int i) noexcept { static constexpr int DSTART = (DIGIT * DIGIT_RANGE); return radics[DSTART + i]; } - static inline void calcPrefixSums(size_t *radics) noexcept { - static thread_local size_t prev[DIGITS]; + template + static inline void calcPrefixSums(COUNTER_TYP *radics) noexcept { + static thread_local COUNTER_TYP prev[DIGITS]; memset(prev, 0, sizeof(prev)); for(int i = 0; i < DIGIT_RANGE; ++i) { // This is a template-unrolled loop too - PrefixMagic(radics, prev, i); + PrefixMagic(radics, prev, i); } } /** Recursive Functor: no class should be generated I think (compiler should be smart) */ - template - struct RadixMagic : public RadixMagic { - inline __attribute__((always_inline)) RadixMagic(size_t *radics, uint32_t *&from, uint32_t *&to, size_t size) noexcept // BEWARE: "*&" needed to swap pointers.. - : RadixMagic(radics, from, to, size) { + template + struct RadixMagic : public RadixMagic { + inline __attribute__((always_inline)) RadixMagic(COUNTER_TYP *radics, uint32_t *&from, uint32_t *&to, COUNTER_TYP size) noexcept // BEWARE: "*&" needed to swap pointers.. + : RadixMagic(radics, from, to, size) { // DEBUG //printf("%d before: ", DIGIT); //debugArr(from, size); #pragma GCC unroll 64 - for(size_t i = size; i > 0; --i) { // right-to-left to ensure already sorted digits order we keep for iterations + for(COUNTER_TYP i = size; i > 0; --i) { // right-to-left to ensure already sorted digits order we keep for iterations // Get num and its new offset / location auto num = from[i - 1]; auto digVal = getDigit(num); @@ -154,68 +157,13 @@ namespace MagyarSort { } }; /** Ends template recursion */ - template<> - struct RadixMagic<-1> { - inline RadixMagic(size_t *radics, uint32_t *&from, uint32_t *&to, size_t size) noexcept { } + template + struct RadixMagic<-1, COUNTER_TYP> { + inline RadixMagic(COUNTER_TYP *radics, uint32_t *&from, uint32_t *&to, COUNTER_TYP size) noexcept { } }; /* SORT */ - /** - * Example: A simple "vector-giver" which provides a static thread_local that is reused - * - * This is to be used when you will call sort many times successively! - * If you forget to garbage-collect manually, use a VectorGiverHeap. - * - * XXX - BEWARE: This give references - that is also acceptable and supported! - * - * This is thread-safe (the Heap one also). - */ - struct VectorGiverWithReuse { - /** - * Give a reference to the vector to use as temporary. - * Will be resized, is reused so "leaks" memory to be the biggest sorted array size, but you can "Gc()". - * - * @param s The given vector should have this size. - * @param gc OPTIONAL: When true, we create a new empty shared vector. This saves memory after a big sort! - * @returns A reference that never go out of scope! - */ - static inline __attribute__((always_inline)) std::vector &Give(size_t s, const bool gc = false) noexcept { - static thread_local std::vector arc(s); // saves time on first call to have size here! - if(gc) { arc = std::vector(); } // by default optimized out! - arc.resize(s); // JHP - // Safe because of static it will not go out of scope - return arc; // just a reference - no copy! - } - - /** Release memory back to zero. After this, the first sort will need memory from heap again. */ - inline __attribute__((always_inline)) void Gc() noexcept { - VectorGiverWithReuse::Give(0, true); - } - }; - - /** - * Example: A simple "vector-giver" which provides new vector from heap. - * - * This is thread-safe (the VectorGiverWithReuse one also). - */ - struct VectorGiverHeap { - /** - * Give a temporary vector which is to be created on heap and freed after sort. - * - * XXX - BEWARE: Please mind we do not return reference, but value here! - * This works because standard ENSURES return value optimization! - * - * @param s The given vector should have this size. - * @param gc OPTIONAL: When true, we create a new empty shared vector. This saves memory after a big sort! - * @returns A vector of appropriate size. - */ - inline __attribute__((always_inline)) std::vector Give(size_t s) noexcept { - return std::vector(s); // RVO ensured! - } - - }; - /* * Sort the given array (in-place sorting) with the given size. * @@ -224,13 +172,14 @@ namespace MagyarSort { * Beware: GC needs to happen on all threads that use us! * * @param arr The array to sort. Result will be in the same array - as sorted. - * @param size The lenght of the array. + * @param size The lenght of the array - should fit in the COUNTER_TYP. + * @param COUNTER_TYP OPTIONAL: When set this type will be the counter type. * @param REUSE OPTIONAL: When true, we reuse the array instead of always gettin' and releasin' from da heap. * @param GC OPTIONAL: When true, we garbage collect memory from previous sorts if REUSE is true. * @param GC_WITHOUT_SORT OPTIONAL: When true, we "just GC" but do not sort in case of GC is true. */ - template - inline void __attribute__((always_inline)) sort_impl(uint32_t arr[], size_t size) noexcept { + template + inline void __attribute__((always_inline)) sort_impl(uint32_t arr[], COUNTER_TYP size) noexcept { // Most funny optimization is this multiply here :-) // // Literally.. come on.. this makes it nearly a compile-time, macro-like @@ -250,18 +199,18 @@ namespace MagyarSort { // Holds "digit" occurences, prefix sums, whatevers // First "DIGIT_RANGE" elem is for MSB "DIGITS", last is for LSB - static thread_local size_t radics[DIGITS * DIGIT_RANGE]; + static thread_local COUNTER_TYP radics[DIGITS * DIGIT_RANGE]; memset(radics, 0, sizeof(radics)); // Calculate occurences of digits countOccurences(arr, size, radics); - //debugRadics(radics); + //debugRadics(radics); // Calculate prefix sums calcPrefixSums(radics); - //debugRadics(radics); + //debugRadics(radics); /* Regular (old) radix sort with small twist */ @@ -288,7 +237,7 @@ namespace MagyarSort { uint32_t *from = arr; uint32_t *to = &arc[0]; - RadixMagic(radics, from, to, size); + RadixMagic(radics, from, to, size); // With an other API we could spare this copy if we can delete original arr and return ptr or something... // I am fine with this... this is not my main idea anyways, just little ILP tweak to regular radix sort @@ -306,15 +255,15 @@ namespace MagyarSort { * * @param FORCE OPTIONAL: When true, the gc happens even if MAGYAR_SORT_DEFAULT_REUSE is not defined! */ - template + template inline void gc() noexcept { if(FORCE) { // Only GC-ing - MagyarSort::sort_impl(nullptr, 0); + MagyarSort::sort_impl(nullptr, 0); } else { #ifdef MAGYAR_SORT_DEFAULT_REUSE // Only GC-ing - MagyarSort::sort_impl(nullptr, 0); + MagyarSort::sort_impl(nullptr, 0); #endif } } @@ -330,14 +279,15 @@ namespace MagyarSort { * * @param arr The array to sort. Result will be in the same array - as sorted. * @param size The lenght of the array. + * @param COUNTER_TYP OPTIONAL: It is best kepts as size_t, but for smaller arrays, you can use uint32_t. * @param GC OPTIONAL: When true, we garbage collect before this sort - so cached memory size will be "size" elems. */ - template - inline void __attribute__((always_inline)) sort_reuse(uint32_t arr[], size_t size) noexcept { + template + inline void __attribute__((always_inline)) sort_reuse(uint32_t arr[], COUNTER_TYP size) noexcept { // Reuse the temporary vectors across runs // This results in much less heap allocations and much faster on gcc // and also a bit faster on clang too. - MagyarSort::sort_impl(arr, size); + MagyarSort::sort_impl(arr, size); } /** @@ -349,11 +299,13 @@ namespace MagyarSort { * * @param arr The array to sort. Result will be in the same array - as sorted. * @param size The lenght of the array. + * @param COUNTER_TYP OPTIONAL: It is best kepts as size_t, but for smaller arrays, you can use uint32_t. */ + template inline void __attribute__((always_inline)) sort_no_reuse(uint32_t arr[], size_t size) noexcept { // We use the heap once per every call... // This is safer and we do not need garbage collecting - MagyarSort::sort_impl(arr, size); + MagyarSort::sort_impl(arr, size); } /* @@ -368,12 +320,14 @@ namespace MagyarSort { * * @param arr The array to sort. Result will be in the same array - as sorted. * @param size The lenght of the array. + * @param COUNTER_TYP OPTIONAL: It is best kepts as size_t, but for smaller arrays, you can use uint32_t. */ + template inline void sort(uint32_t arr[], size_t size) noexcept { #ifdef MAGYAR_SORT_DEFAULT_REUSE - MagyarSort::sort_reuse(arr, size); + MagyarSort::sort_reuse(arr, size); #else - MagyarSort::sort_no_reuse(arr, size); + MagyarSort::sort_no_reuse(arr, size); #endif } }; diff --git a/ypsu.cpp b/ypsu.cpp index 5d2ffd0..8f221ee 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -369,7 +369,7 @@ } }); w = v; - measure(inputtype, "magyar", [&] { MagyarSort::sort(&w[0], w.size()); }); + measure(inputtype, "magyar", [&] { MagyarSort::sort(&w[0], w.size()); }); assert(w == expected); /*