diff --git a/magyarsort.h b/magyarsort.h index d7413b3..5c6fab1 100644 --- a/magyarsort.h +++ b/magyarsort.h @@ -216,17 +216,38 @@ namespace MagyarSort { }; - /** + /* * Sort the given array (in-place sorting) with the given size. * * Rem.: If you use the VectorGiverWithReuse please remind yourself to Gc() it time-to-time! * + * Beware: GC needs to happen on all threads that use us! + * * @param arr The array to sort. Result will be in the same array - as sorted. * @param size The lenght of the array. - * @param VectorGiver is either VectorGiverHeap or VectorGiverWithReuse. Have Give(size_t size, ...) returning value or ref. + * @param REUSE OPTIONAL: When true, we reuse the array instead of always gettin' and releasin' from da heap. + * @param GC OPTIONAL: When true, we garbage collect memory from previous sorts if REUSE is true. + * @param GC_WITHOUT_SORT OPTIONAL: When true, we "just GC" but do not sort in case of GC is true. */ - //template - inline void sort(uint32_t arr[], size_t size) noexcept { + template + inline void __attribute__((always_inline)) sort_impl(uint32_t arr[], size_t size) noexcept { + // Most funny optimization is this multiply here :-) + // + // Literally.. come on.. this makes it nearly a compile-time, macro-like + // ifdef-like thing as we avoid memory allocations of size BUT also we + // optimize the first call for sort when we REUSE the array so size is fine! + static thread_local std::vector arc(size * REUSE); + + // "Garbage-collection" + if(GC) { + arc = std::vector(); + // This must be implemented, because we can only access + // the static in our function body so this is the "way". + if(GC_WITHOUT_SORT) { + return; + } + } + // Holds "digit" occurences, prefix sums, whatevers // First "DIGIT_RANGE" elem is for MSB "DIGITS", last is for LSB static thread_local size_t radics[DIGITS * DIGIT_RANGE]; @@ -250,10 +271,19 @@ namespace MagyarSort { // above already anyways... 
// Regular radix sort needs a copy, see: https://www.youtube.com/watch?v=ujb2CIWE8zY + // But instead of the below, we do a trickery... + // //std::vector arc(size); //auto arc = VectorGiver::Give(size); // "auto" is needed for this to perform well with some givers! - static thread_local std::vector arc(size); - arc.resize(size); // JHP + // + // Rem.: The branch is optimized out in compile time! + if(REUSE) { + arc.resize(size); + } else { + // Must not be .clear() !!! + // We must regain memory of previous! + arc = std::move(std::vector(size)); + } uint32_t *from = arr; uint32_t *to = &arc[0]; @@ -262,11 +292,90 @@ namespace MagyarSort { // With an other API we could spare this copy if we can delete original arr and return ptr or something... // I am fine with this... this is not my main idea anyways, just little ILP tweak to regular radix sort - //if(to != arr) { // <- logically, but bad they are already swapped here!!! BEWARE + //if(to != arr) // <- logically, but bad they are already swapped here!!! BEWARE if(from != arr) { // <- in reality this is what we want because of last swap happened anyways! memcpy(arr, from, size); } } + + /** + * Garbage collect reused data structures from last call. + * + * This is optimized and is a NO-OP if MAGYAR_SORT_DEFAULT_REUSE is not defined! + * - unless you use the FORCE! May it be with you if you need it. + * + * @param FORCE OPTIONAL: When true, the gc happens even if MAGYAR_SORT_DEFAULT_REUSE is not defined! + */ + template + inline void gc() noexcept { + if(FORCE) { + // Only GC-ing + MagyarSort::sort_impl(nullptr, 0); + } else { +#ifdef MAGYAR_SORT_DEFAULT_REUSE + // Only GC-ing + MagyarSort::sort_impl(nullptr, 0); +#endif + } + } + + /** + * Sort the given array (in-place sorting) with the given size. + * + * Rem.: Please remind yourself to gc() from time-to-time! + * Rem.: Thread-safe to use! + * + * Beware: MagyarSort::gc<true>(); needs to happen on all threads that use this variant otherwise memory leaks away! 
+ * Please mind the "true" template parameter that forces the GC even when sort does not reuse by default... + * + * @param arr The array to sort. Result will be in the same array - as sorted. + * @param size The length of the array. + * @param GC OPTIONAL: When true, we garbage collect before this sort - so cached memory size will be "size" elems. + */ + template + inline void __attribute__((always_inline)) sort_reuse(uint32_t arr[], size_t size) noexcept { + // Reuse the temporary vectors across runs + // This results in far fewer heap allocations and is much faster on gcc + // and also a bit faster on clang too. + MagyarSort::sort_impl(arr, size); + } + + /** + * Sort the given array (in-place sorting) with the given size. + * + * Rem.: Thread-safe to use! + * + * Beware: MagyarSort::gc(); needs to happen on all threads that use this variant otherwise memory leaks away! + * + * @param arr The array to sort. Result will be in the same array - as sorted. + * @param size The length of the array. + */ + inline void __attribute__((always_inline)) sort_no_reuse(uint32_t arr[], size_t size) noexcept { + // We use the heap once per every call... + // This is safer and we do not need garbage collecting + MagyarSort::sort_impl(arr, size); + } + + /* + * Sort the given array (in-place sorting) with the given size. + * + * Rem.: If you use the VectorGiverWithReuse please remind yourself to Gc() it time-to-time! + * + * Beware: MagyarSort::gc(); should be called after "sort bursts" (consecutive fast sorts) or when you need memory + * on all threads that use this variant, otherwise memory leaks away as the biggest sorted array stays in RAM! + * This depends on whether the config #define MAGYAR_SORT_DEFAULT_REUSE is defined or not. Define and you get reuse + * and if you get reuse you can call multiple sorts with reused temporary buffers that you gc() afterwards! + * + * @param arr The array to sort. Result will be in the same array - as sorted. + * @param size The length of the array. 
+ */ + inline void sort(uint32_t arr[], size_t size) noexcept { +#ifdef MAGYAR_SORT_DEFAULT_REUSE + MagyarSort::sort_reuse(arr, size); +#else + MagyarSort::sort_no_reuse(arr, size); +#endif + } }; #endif diff --git a/simd-sort/speed.cpp b/simd-sort/speed.cpp index aa58089..c60c1b4 100644 --- a/simd-sort/speed.cpp +++ b/simd-sort/speed.cpp @@ -14,6 +14,7 @@ #include "quicksort-all.cpp" #include "avx2-altquicksort.h" //#include "avx2-nate-quicksort.cpp" +#define MAGYAR_SORT_DEFAULT_REUSE #include "../magyarsort.h" // mine #include "avx2-natenodutch-quicksort.h" #define USE_RDTSC // undef to get measurments in seconds diff --git a/simd-sort/speed_avx2 b/simd-sort/speed_avx2 index 6bbb97a..cb8adeb 100755 Binary files a/simd-sort/speed_avx2 and b/simd-sort/speed_avx2 differ diff --git a/test.cpp b/test.cpp index 5572222..cd946f1 100644 --- a/test.cpp +++ b/test.cpp @@ -32,6 +32,8 @@ #include #include #include // std::sort + +#define MAGYAR_SORT_DEFAULT_REUSE #include "magyarsort.h" #ifdef SKA_SORT diff --git a/ypsu.cpp b/ypsu.cpp index 1d11e51..859887c 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -11,7 +11,11 @@ #include #include #include "ska_sort.hpp" + + + #define MAGYAR_SORT_DEFAULT_REUSE #include "magyarsort.h" + std::map results; std::map worst; void measure(const std::string &inputtype, const std::string &name,