diff --git a/magyarsort.h b/magyarsort.h
index c2d538e..8cd9627 100644
--- a/magyarsort.h
+++ b/magyarsort.h
@@ -44,6 +44,12 @@ namespace MagyarSort {
 	static constexpr int BITS_PER_DIGIT = 4; // "bit / helyiérték"
 	static constexpr int DIGIT_RANGE = 16; // "helyiérték állapottér"
 #else
+/*
+	// Per-word digits sorting (disabled alternative: two 16-bit digits per key)
+	static constexpr int DIGITS = 2; // "helyiérték" (number of digits)
+	static constexpr int BITS_PER_DIGIT = 16; // "bit / helyiérték" (bits per digit)
+	static constexpr int DIGIT_RANGE = 65536; // "helyiérték állapottér" (digit state space)
+*/
 	// Per-byte digits sorting
 	static constexpr int DIGITS = 4; // "helyiérték"
 	static constexpr int BITS_PER_DIGIT = 8; // "bit / helyiérték"
@@ -240,7 +246,22 @@ namespace MagyarSort {
 		memset(prev, 0, sizeof(prev));
 
 		// This is a template-unrolled loop too
-		PMagic2(radics, prev);
+		if constexpr (DIGIT_RANGE < 1024) {
+			// Extra optimization for bytes and nibbles - totally unrolled loop!
+			PMagic2(radics, prev);
+		} else {
+			// The above would not work for words and higher up - use plain loops instead
+			#pragma GCC unroll 16
+			for(int j = 0; j < DIGITS; ++j) {
+				const int DSTART = (j * DIGIT_RANGE); // first counter of digit j (loop-invariant)
+				#pragma GCC unroll 64
+				for(int i = 0; i < DIGIT_RANGE; ++i) {
+					// Inclusive prefix sum over digit j's counters; prev[j] is the running total
+					radics[DSTART + i] += prev[j];
+					prev[j] = radics[DSTART + i];
+				}
+			}
+		}
 	}
 
 	/** Recursive Functor: no class should be generated I think (compiler should be smart) */
@@ -334,7 +355,9 @@ namespace MagyarSort {
 #endif // !NO_MLOCK
 		// Write prefetchin'
 		//__builtin_prefetch(&radicsOut[..], 1);
-		PrefetchMagic pm(radics);
+		if constexpr (DIGIT_RANGE <= 1024) {
+			PrefetchMagic pm(radics);
+		}
 
 		memset(radics, 0, sizeof(radics));
 		// Calculate occurences of digits
diff --git a/makefile b/makefile
index 939f35a..d2c2b12 100644
--- a/makefile
+++ b/makefile
@@ -12,6 +12,7 @@ release_debug_sym: test.cpp magyarsort.h
 
 release: test.cpp magyarsort.h
 	g++ test.cpp -DNDEBUG -std=c++17 -O2 -o test.out
+	# g++ test.cpp -DNDEBUG -std=c++17 -O2 -ftree-vectorize -fopt-info-vec-missed -o test.out
 
 release_ypsu: ypsu.cpp magyarsort.h
 	g++ ypsu.cpp -DNDEBUG -std=c++17 -O2 -o ypsu.out
diff --git a/test.cpp b/test.cpp
index bc97415..12861f0 100644
--- a/test.cpp
+++ b/test.cpp
@@ -5,15 +5,21 @@
 // Uncomment next line to follow Creel: https://www.youtube.com/watch?v=ujb2CIWE8zY
 // #define CREEL // Overwrites TEST_LEN to 16 and sets MAGYAR_SORT_NIBBLE!
 
+// Uncomment and give a value to reduce every generated input element modulo this value!
+//#define INPUT_MOD (65536*128)
+
 // Number of input elements to generate - unused when CREEL is defined!
-#define SORT_WIDTH 200000000
-//#define SORT_WIDTH 40000000
+//#define SORT_WIDTH 200000000
+#define SORT_WIDTH 40000000
 
 // Uncomment this to use nibbles as digits and not bytes - CREEL defines this anyways
 //#define MAGYAR_SORT_NIBBLE
 // Uncomment if you want to see output before / after sorts (debugging for example)
 //#define PRINT_OUTPUT
 
+// Uncomment if you want to see how many elements of the input are duplicates (debugging info)
+#define COUNT_DUPLICANTS
+
 //#define SKA_SORT
 
 // Uncomment for perf / cachegring and similar runs!
@@ -86,7 +92,11 @@ static inline std::vector GenerateInput() {
 	ret.resize(SORT_WIDTH);
 
 	for(size_t ek = 0; ek < SORT_WIDTH; ++ek) {
+#ifndef INPUT_MOD
 		ret[ek] = (uint32_t)std::rand();
+#else
+		ret[ek] = (uint32_t)std::rand() % INPUT_MOD;
+#endif
 	}
 
 	return ret;
@@ -155,9 +165,27 @@ int main() {
 
 #ifndef MEASURE_ONLY
 	bool good = true;
+#ifdef COUNT_DUPLICANTS
+	size_t dups = 0;
+	uint32_t prev = (in1.size() > 0) ? in1[0] : 0;
+#endif // COUNT_DUPLICANTS
 	for(size_t i = 0; good && (i < in1.size()); ++i) {
 		good &= (in1[i] == in2[i]);
+#ifdef COUNT_DUPLICANTS
+		if(i > 0) {
+			uint32_t curr = in1[i];
+			if(curr == prev) {
+				++dups;
+			} else {
+				prev = curr;
+			}
+		}
+#endif // COUNT_DUPLICANTS
 	}
+#ifdef COUNT_DUPLICANTS
+	printf("Duplications are %zu out of %zu, which is %f percent\n", dups, in1.size(), (float)(dups * 100) / in1.size());
+#endif // COUNT_DUPLICANTS
+
 #endif // !MEASURE_ONLY
 
 	printf("Results:\n\n");