diff --git a/magyarsort.h b/magyarsort.h index 8cd9627..b3a085b 100644 --- a/magyarsort.h +++ b/magyarsort.h @@ -66,11 +66,11 @@ namespace MagyarSort { } template - void debugRadics(COUNTER_TYP *radics) { + void debugRadics(COUNTER_TYP *magics) { for(size_t j = 0; j < DIGITS; ++j) { printf("d%zu: ", j); for(int i = 0; i < DIGIT_RANGE; ++i) { - printf("%zu,", radics[i + DIGIT_RANGE*j]); + printf("%zu,", magics[i + DIGIT_RANGE*j]); } printf("\n\n"); } @@ -89,16 +89,16 @@ namespace MagyarSort { /** Recursive Functor: no class should be generated I think (compiler should be smart) */ template struct OccurenceMagic : public OccurenceMagic { - inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *radicsOut) noexcept - : OccurenceMagic(arr, i, radicsOut) { + inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *magicsOut) noexcept + : OccurenceMagic(arr, i, magicsOut) { // Parents run first so template recursion runs DIGIT=0 first... - ++radicsOut[getDigit(arr[i]) + DIGIT_RANGE * DIGIT]; + ++magicsOut[getDigit(arr[i]) + DIGIT_RANGE * DIGIT]; } }; /** Ends template recursion */ template struct OccurenceMagic<-1, COUNTER_TYP> { - inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *radicsOut) noexcept {} + inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *magicsOut) noexcept {} }; /** ARR_END must be an (STEP * k) */ @@ -116,7 +116,7 @@ namespace MagyarSort { }; template - static inline void countOccurences(uint32_t arr[], COUNTER_TYP size, COUNTER_TYP *radicsOut) noexcept { + static inline void countOccurences(uint32_t arr[], COUNTER_TYP size, COUNTER_TYP *magicsOut) noexcept { COUNTER_TYP i = 0; //#pragma GCC unroll 4 for(; i < size - 64; i += 64) { @@ -124,131 +124,131 @@ namespace MagyarSort { //__builtin_prefetch(&arr[i + (1 * 16)], 0, 2); // r, L2 or L3 cache __builtin_prefetch(&arr[i + (1 * 16)]); // Creates no object, struct is empty - OccurenceMagic(arr, i, radicsOut); - OccurenceMagic(arr, i + 1, radicsOut); - OccurenceMagic(arr, i + 2, radicsOut); - OccurenceMagic(arr, i + 3, radicsOut); - OccurenceMagic(arr, i + 4, radicsOut); - OccurenceMagic(arr, i + 5, radicsOut); - OccurenceMagic(arr, i + 6, radicsOut); - OccurenceMagic(arr, i + 7, radicsOut); - OccurenceMagic(arr, i + 8, radicsOut); - OccurenceMagic(arr, i + 9, radicsOut); - OccurenceMagic(arr, i + 10, radicsOut); - OccurenceMagic(arr, i + 11, radicsOut); - OccurenceMagic(arr, i + 12, radicsOut); - OccurenceMagic(arr, i + 13, radicsOut); - OccurenceMagic(arr, i + 14, radicsOut); - OccurenceMagic(arr, i + 15, radicsOut); + OccurenceMagic(arr, i, magicsOut); + OccurenceMagic(arr, i + 1, magicsOut); + OccurenceMagic(arr, i + 2, magicsOut); + OccurenceMagic(arr, i + 3, magicsOut); + OccurenceMagic(arr, i + 4, magicsOut); + OccurenceMagic(arr, i + 5, magicsOut); + OccurenceMagic(arr, i + 6, magicsOut); + OccurenceMagic(arr, i + 7, magicsOut); + OccurenceMagic(arr, i + 8, magicsOut); + OccurenceMagic(arr, i + 9, magicsOut); + OccurenceMagic(arr, i + 10, magicsOut); + OccurenceMagic(arr, i + 11, magicsOut); + OccurenceMagic(arr, i + 12, magicsOut); + OccurenceMagic(arr, i + 13, magicsOut); + OccurenceMagic(arr, i + 14, magicsOut); + OccurenceMagic(arr, i + 15, magicsOut); // Prefetch for read level-1 cache __builtin_prefetch(&arr[i + (2 * 16)]); - OccurenceMagic(arr, i + 16, radicsOut); - OccurenceMagic(arr, i + 17, radicsOut); - OccurenceMagic(arr, i + 18, radicsOut); - OccurenceMagic(arr, i + 19, radicsOut); - OccurenceMagic(arr, i + 20, radicsOut); - OccurenceMagic(arr, i + 21, radicsOut); - OccurenceMagic(arr, i + 22, radicsOut); - OccurenceMagic(arr, i + 23, radicsOut); - OccurenceMagic(arr, i + 24, radicsOut); - OccurenceMagic(arr, i + 25, radicsOut); - OccurenceMagic(arr, i + 26, radicsOut); - OccurenceMagic(arr, i + 27, radicsOut); - OccurenceMagic(arr, i + 28, radicsOut); - OccurenceMagic(arr, i + 29, radicsOut); - OccurenceMagic(arr, i + 30, radicsOut); - OccurenceMagic(arr, i + 31, radicsOut); + OccurenceMagic(arr, i + 16, magicsOut); + OccurenceMagic(arr, i + 17, magicsOut); + OccurenceMagic(arr, i + 18, magicsOut); + OccurenceMagic(arr, i + 19, magicsOut); + OccurenceMagic(arr, i + 20, magicsOut); + OccurenceMagic(arr, i + 21, magicsOut); + OccurenceMagic(arr, i + 22, magicsOut); + OccurenceMagic(arr, i + 23, magicsOut); + OccurenceMagic(arr, i + 24, magicsOut); + OccurenceMagic(arr, i + 25, magicsOut); + OccurenceMagic(arr, i + 26, magicsOut); + OccurenceMagic(arr, i + 27, magicsOut); + OccurenceMagic(arr, i + 28, magicsOut); + OccurenceMagic(arr, i + 29, magicsOut); + OccurenceMagic(arr, i + 30, magicsOut); + OccurenceMagic(arr, i + 31, magicsOut); __builtin_prefetch(&arr[i + (3 * 16)]); - OccurenceMagic(arr, i + 32, radicsOut); - OccurenceMagic(arr, i + 33, radicsOut); - OccurenceMagic(arr, i + 34, radicsOut); - OccurenceMagic(arr, i + 35, radicsOut); - OccurenceMagic(arr, i + 36, radicsOut); - OccurenceMagic(arr, i + 37, radicsOut); - OccurenceMagic(arr, i + 38, radicsOut); - OccurenceMagic(arr, i + 39, radicsOut); - OccurenceMagic(arr, i + 40, radicsOut); - OccurenceMagic(arr, i + 41, radicsOut); - OccurenceMagic(arr, i + 42, radicsOut); - OccurenceMagic(arr, i + 43, radicsOut); - OccurenceMagic(arr, i + 44, radicsOut); - OccurenceMagic(arr, i + 45, radicsOut); - OccurenceMagic(arr, i + 46, radicsOut); - OccurenceMagic(arr, i + 47, radicsOut); + OccurenceMagic(arr, i + 32, magicsOut); + OccurenceMagic(arr, i + 33, magicsOut); + OccurenceMagic(arr, i + 34, magicsOut); + OccurenceMagic(arr, i + 35, magicsOut); + OccurenceMagic(arr, i + 36, magicsOut); + OccurenceMagic(arr, i + 37, magicsOut); + OccurenceMagic(arr, i + 38, magicsOut); + OccurenceMagic(arr, i + 39, magicsOut); + OccurenceMagic(arr, i + 40, magicsOut); + OccurenceMagic(arr, i + 41, magicsOut); + OccurenceMagic(arr, i + 42, magicsOut); + OccurenceMagic(arr, i + 43, magicsOut); + OccurenceMagic(arr, i + 44, magicsOut); + OccurenceMagic(arr, i + 45, magicsOut); + OccurenceMagic(arr, i + 46, magicsOut); + OccurenceMagic(arr, i + 47, magicsOut); // __builtin_prefetch(&arr[i + (4 * 16)]); // Only needed for longer than 64 unrolls - OccurenceMagic(arr, i + 48, radicsOut); - OccurenceMagic(arr, i + 49, radicsOut); - OccurenceMagic(arr, i + 50, radicsOut); - OccurenceMagic(arr, i + 51, radicsOut); - OccurenceMagic(arr, i + 52, radicsOut); - OccurenceMagic(arr, i + 53, radicsOut); - OccurenceMagic(arr, i + 54, radicsOut); - OccurenceMagic(arr, i + 55, radicsOut); - OccurenceMagic(arr, i + 56, radicsOut); - OccurenceMagic(arr, i + 57, radicsOut); - OccurenceMagic(arr, i + 58, radicsOut); - OccurenceMagic(arr, i + 59, radicsOut); - OccurenceMagic(arr, i + 60, radicsOut); - OccurenceMagic(arr, i + 61, radicsOut); - OccurenceMagic(arr, i + 62, radicsOut); - OccurenceMagic(arr, i + 63, radicsOut); + OccurenceMagic(arr, i + 48, magicsOut); + OccurenceMagic(arr, i + 49, magicsOut); + OccurenceMagic(arr, i + 50, magicsOut); + OccurenceMagic(arr, i + 51, magicsOut); + OccurenceMagic(arr, i + 52, magicsOut); + OccurenceMagic(arr, i + 53, magicsOut); + OccurenceMagic(arr, i + 54, magicsOut); + OccurenceMagic(arr, i + 55, magicsOut); + OccurenceMagic(arr, i + 56, magicsOut); + OccurenceMagic(arr, i + 57, magicsOut); + OccurenceMagic(arr, i + 58, magicsOut); + OccurenceMagic(arr, i + 59, magicsOut); + OccurenceMagic(arr, i + 60, magicsOut); + OccurenceMagic(arr, i + 61, magicsOut); + OccurenceMagic(arr, i + 62, magicsOut); + OccurenceMagic(arr, i + 63, magicsOut); } #pragma GCC unroll 4 for(; i < size; ++i) { - OccurenceMagic(arr, i, radicsOut); + OccurenceMagic(arr, i, magicsOut); } } /** Recursive Functor: no class should be generated I think (compiler should be smart) */ template struct PrefixMagic : public PrefixMagic { - inline __attribute__((always_inline)) PrefixMagic(COUNTER_TYP *radics, COUNTER_TYP *prev, int i) noexcept - : PrefixMagic(radics, prev, i) { + inline __attribute__((always_inline)) PrefixMagic(COUNTER_TYP *magics, COUNTER_TYP *prev, int i) noexcept + : PrefixMagic(magics, prev, i) { static constexpr int DSTART = (DIGIT * DIGIT_RANGE); - radics[DSTART + i] += prev[DIGIT]; - prev[DIGIT] = radics[DSTART + i]; + magics[DSTART + i] += prev[DIGIT]; + prev[DIGIT] = magics[DSTART + i]; } }; /** Ends template recursion */ template struct PrefixMagic<-1, COUNTER_TYP> { - inline PrefixMagic(COUNTER_TYP *radics, COUNTER_TYP *prev, int i) noexcept {} + inline PrefixMagic(COUNTER_TYP *magics, COUNTER_TYP *prev, int i) noexcept {} }; /** Gets REFERENCE to the given digit from the radix-array that has more than one digits */ template - static inline __attribute__((always_inline)) COUNTER_TYP &rGet(COUNTER_TYP *radics, int i) noexcept { + static inline __attribute__((always_inline)) COUNTER_TYP &rGet(COUNTER_TYP *magics, int i) noexcept { static constexpr int DSTART = (DIGIT * DIGIT_RANGE); - return radics[DSTART + i]; + return magics[DSTART + i]; } /** Helper for calcPrefixSums */ template struct PMagic2 : public PMagic2 { - inline __attribute__((always_inline)) PMagic2(COUNTER_TYP *radics, COUNTER_TYP *prev) - : PMagic2(radics, prev) { + inline __attribute__((always_inline)) PMagic2(COUNTER_TYP *magics, COUNTER_TYP *prev) + : PMagic2(magics, prev) { // Again first the 0th digit because of parent constructors! // This is a template-unrolled loop too - PrefixMagic(radics, prev, DIGIT); + PrefixMagic(magics, prev, DIGIT); } }; /** Template recursion endpoint */ template struct PMagic2<-1, COUNTER_TYP> { - inline __attribute__((always_inline)) PMagic2(COUNTER_TYP *radics, COUNTER_TYP *prev) {} + inline __attribute__((always_inline)) PMagic2(COUNTER_TYP *magics, COUNTER_TYP *prev) {} }; template - static inline void calcPrefixSums(COUNTER_TYP *radics) noexcept { + static inline void calcPrefixSums(COUNTER_TYP *magics) noexcept { static thread_local COUNTER_TYP prev[DIGITS]; memset(prev, 0, sizeof(prev)); // This is a template-unrolled loop too if constexpr (DIGIT_RANGE < 1024) { // Extra optimization for bytes and nibbles - totally unrolled loop! - PMagic2(radics, prev); + PMagic2(magics, prev); } else { // The above would not work for words and higher up... #pragma GCC unroll 16 @@ -257,8 +257,8 @@ namespace MagyarSort { #pragma GCC unroll 64 for(int i = 0; i < DIGIT_RANGE; ++i) { int DSTART = (j * DIGIT_RANGE); - radics[DSTART + i] += prev[j]; - prev[j] = radics[DSTART + i]; + magics[DSTART + i] += prev[j]; + prev[j] = magics[DSTART + i]; } } } @@ -267,8 +267,8 @@ namespace MagyarSort { /** Recursive Functor: no class should be generated I think (compiler should be smart) */ template struct RadixMagic : public RadixMagic { - inline __attribute__((always_inline)) RadixMagic(bool &swapped, COUNTER_TYP *radics, uint32_t *from, uint32_t *to, COUNTER_TYP size) noexcept - : RadixMagic(swapped, radics, from, to, size) { + inline __attribute__((always_inline)) RadixMagic(bool &swapped, COUNTER_TYP *magics, uint32_t *from, uint32_t *to, COUNTER_TYP size) noexcept + : RadixMagic(swapped, magics, from, to, size) { // Tricky: see (**) if(swapped) { // never true for DIGIT 0, see (***) std::swap(from, to); @@ -288,7 +288,7 @@ namespace MagyarSort { // Get num and its new offset / location auto num = from[i - 1]; auto digVal = getDigit(num); - auto offset = (--rGet(radics, digVal)); + auto offset = (--rGet(magics, digVal)); // Add to the proper target location to[offset] = num; @@ -305,7 +305,7 @@ namespace MagyarSort { /** Ends template recursion */ template struct RadixMagic<-1, COUNTER_TYP> { - inline RadixMagic(bool swapped, COUNTER_TYP *radics, uint32_t *&from, uint32_t *&to, COUNTER_TYP size) noexcept {} + inline RadixMagic(bool swapped, COUNTER_TYP *magics, uint32_t *&from, uint32_t *&to, COUNTER_TYP size) noexcept {} }; /* SORT */ @@ -349,26 +349,26 @@ namespace MagyarSort { // Holds "digit" occurences, prefix sums, whatevers // First "DIGIT_RANGE" elem is for MSB "DIGITS", last is for LSB - static thread_local COUNTER_TYP radics[DIGITS * DIGIT_RANGE]; + static thread_local COUNTER_TYP magics[DIGITS * DIGIT_RANGE]; #ifndef NO_MLOCK - mlock(radics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP)); + mlock(magics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP)); #endif // !NO_MLOCK // Write prefetchin' - //__builtin_prefetch(&radicsOut[..], 1); + //__builtin_prefetch(&magicsOut[..], 1); if constexpr (DIGIT_RANGE <= 1024) { - PrefetchMagic pm(radics); + PrefetchMagic pm(magics); } - memset(radics, 0, sizeof(radics)); + memset(magics, 0, sizeof(magics)); // Calculate occurences of digits - countOccurences(arr, size, radics); + countOccurences(arr, size, magics); - //debugRadics(radics); + //debugRadics(magics); // Calculate prefix sums - calcPrefixSums(radics); + calcPrefixSums(magics); - //debugRadics(radics); + //debugRadics(magics); /* Regular (old) radix sort with small twist */ @@ -397,7 +397,7 @@ namespace MagyarSort { static thread_local bool swapped; swapped = false; // must be separate line - RadixMagic r(swapped, radics, from, to, size); + RadixMagic r(swapped, magics, from, to, size); // With an other API we could spare this copy if we can delete original arr and return ptr or something... // I am fine with this... this is not my main idea anyways, just little ILP tweak to regular radix sort @@ -406,7 +406,7 @@ namespace MagyarSort { memcpy(arr, to, size); } #ifndef NO_MLOCK - munlock(radics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP)); + munlock(magics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP)); munlock(&arc[0], size * sizeof(uint32_t)); munlock(arr, size * sizeof(uint32_t)); #endif // !NO_MLOCK diff --git a/test.cpp b/test.cpp index 12861f0..830c72d 100644 --- a/test.cpp +++ b/test.cpp @@ -9,8 +9,8 @@ //#define INPUT_MOD (65536*128) // Number of input elements to generate - unused when CREEL is defined! -//#define SORT_WIDTH 200000000 -#define SORT_WIDTH 40000000 +#define SORT_WIDTH 100000000 +//#define SORT_WIDTH 40000000 // Uncomment this to use nibbles as digits and not bytes - CREEL defines this anyways //#define MAGYAR_SORT_NIBBLE @@ -18,7 +18,7 @@ //#define PRINT_OUTPUT // Uncomment if you want to see how many elements are unique and duplicant in the input (debugging info) -#define COUNT_DUPLICANTS +// #define COUNT_DUPLICANTS //#define SKA_SORT diff --git a/ypsu.cpp b/ypsu.cpp index 7925f8e..7607bc0 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -33,9 +33,17 @@ void measure(const std::string &inputtype, const std::string &name, worst[name] = std::max(worst[name], seconds); } std::vector inputtypes = { - /*"constant", "asc", "desc", "ascasc", "ascdesc", - "descasc", "descdesc", "smallrange",*/ - "rand", + /* + "constant" + "asc" + "desc" + "ascasc" + "ascdesc", + "descasc" + "descdesc" + "rand", + */ + "smallrange", }; std::vector geninput(const std::string &type, int n) { std::vector v(n); @@ -113,6 +121,84 @@ void twopass(uint32_t *a, int n) { for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 16]++] = buf[i]; free(buf); } + +// TODO: zssort (quicksort jobbítás) + +// mormord — Today at 2:27 AM +// 1 2 2 2 3 +// +// 0 1 2 3 4 +// |1|2 2 3 2 +// 1|2|2 3 2 +// 1|3|2 2 2 +// 1|2|2 2 3 +// 1|2|2 2 3 +// 1 2|2|2 3 +// ^ +// Pivot +// +// állítás: pivottól balra helyükön vannak az elemek rendezettségük szerint +// +// Kezdés Indexek = Prefix összeg - 1 (utolsó helyek az elemeknek) +// +// Ha pivot új helyének meghatározot index > pivot_index (|.| helye) +// swap +// --index +// különben +// ++pivot_index +static inline uint32_t morgrab(uint32_t elem, uint32_t j) noexcept { + return (elem >> (8 * j)) & 0xff; +} +static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept { + // Occurence count + uint32_t prefix[256] = { 0 }; + uint32_t index[256] = { 0 }; + for(uint32_t i = 0; i < n; ++i) { + // ++prefix[(a[i] >> (8 * j)) && 0xff]; + ++prefix[morgrab(a[i], j)]; + } + + // Prefix sum + index[0] = prefix[0]; + for(uint32_t i = 1; i < 256; ++i) { + prefix[i] += prefix[i - 1]; + index[i] = prefix[i]; + } + + // Inplace swap + uint32_t pivoti = 0; + while(pivoti < n) { + uint32_t radixval = morgrab(a[pivoti], j); + uint32_t targeti = index[radixval] - 1; + if(targeti > pivoti) { + // swap + uint32_t tmp = a[pivoti]; + a[pivoti] = a[targeti]; + a[targeti] = tmp; + // dec index + --index[radixval]; + } else { + ++pivoti; + } + } + + // Ends recursion + if(j == 0) return; + + // Recursion + for(uint32_t i = 0; i < 256; ++i) { + uint32_t from = index[i]; + uint32_t to = prefix[i]; + if(from != to) { + mormord_sort_impl(&a[from - 1], (to - (from - 1)), j - 1); + } + } +} +static inline void mormord_sort(uint32_t *a, int n) noexcept { + assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); + mormord_sort_impl(a, n, 3); +} + void fourpass(uint32_t *a, int n) { assert(n * int64_t(sizeof(a[0])) <= INT_MAX); // alloc helper buffers. @@ -510,11 +596,11 @@ void measure_single(int n) { int main(void) { //int n = 100000000; //int n = 10000000; - int n = 1000000; + //int n = 1000000; //int n = 100000; //int n = 10000; //int n = 100; - //int n = 10; + int n = 10; printf("Sorting %d elements:\n\n", n); @@ -532,7 +618,6 @@ int main(void) { measure(inputtype, "std", [&] { std::sort(std::begin(w), std::end(w)); }); expected = w; w = v; - /* measure(inputtype, "ska", [&] { ska_sort(std::begin(w), std::end(w)); }); w = v; measure(inputtype, "ska_copy", [&] { @@ -541,9 +626,15 @@ int main(void) { w.swap(buf); } }); + /* w = v; measure(inputtype, "magyar", [&] { MagyarSort::sort(&w[0], w.size()); }); assert(w == expected); + */ + w = v; + measure(inputtype, "mormord", [&] { mormord_sort(&w[0], w.size()); }); + assert(w == expected); + /* w = v; measure(inputtype, "2pass", [&] { twopass(&w[0], w.size()); }); @@ -563,17 +654,16 @@ int main(void) { w = v; measure(inputtype, "sp", [&] { spsort(&w[0], w.size()); }); assert(w == expected);*/ - /* w = v; measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); }); assert(w == expected); + /* w = v; measure(inputtype, "magbuck", [&] { magyar_bucket_sort(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "magbuck2", [&] { magyar_bucket_sort2(&w[0], w.size()); }); assert(w == expected); - */ w = v; w = {10, 20, 20}; measure(inputtype, "qsmine", [&] { thier_quicksort(&w[0], w.size()); }); @@ -597,6 +687,7 @@ int main(void) { } } assert(w == expected); + */ /* w = v; measure(inputtype, "frewr", [&] { frewr(&w[0], w.size()); });