diff --git a/ypsu.cpp b/ypsu.cpp index 7607bc0..d5e0e71 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -33,16 +33,14 @@ void measure(const std::string &inputtype, const std::string &name, worst[name] = std::max(worst[name], seconds); } std::vector inputtypes = { - /* - "constant" - "asc" - "desc" - "ascasc" + "constant", + "asc", + "desc", + "ascasc", "ascdesc", - "descasc" - "descdesc" + "descasc", + "descdesc", "rand", - */ "smallrange", }; std::vector geninput(const std::string &type, int n) { @@ -150,34 +148,54 @@ static inline uint32_t morgrab(uint32_t elem, uint32_t j) noexcept { return (elem >> (8 * j)) & 0xff; } static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept { - // Occurence count - uint32_t prefix[256] = { 0 }; - uint32_t index[256] = { 0 }; + /* Preparation */ + uint32_t radics[256] = {0}; + /* [from, to) index: only where prefix sums change - usually nonfull */ + uint32_t real_radics[256 * 2] = {0}; + + /* Occurence counting O(n) */ + /* TODO: We can go both down and upwards here to increase ILP or even do SSE2 */ for(uint32_t i = 0; i < n; ++i) { - // ++prefix[(a[i] >> (8 * j)) && 0xff]; - ++prefix[morgrab(a[i], j)]; + ++radics[morgrab(a[i], j)]; } - // Prefix sum - index[0] = prefix[0]; - for(uint32_t i = 1; i < 256; ++i) { - prefix[i] += prefix[i - 1]; - index[i] = prefix[i]; + /* Prefix sum + real radics calc O(256) */ + /* Radics: */ + /* fr: {10, 20, 10, 0, 5, 15,...} */ + /* to: {10, 30, 40, 40, 45, 60,..} */ + /* Real radics: */ + /* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */ + /* 0. 1. 2. 4. 5. */ + /* (because radix value 3 is not found in input) */ + uint32_t prev = 0; + uint32_t reali = 0; + for(int i = 0; i < 256; ++i) { + if(radics[i] != 0) { + radics[i] += prev; + real_radics[reali] = prev; + real_radics[reali + 1] = radics[i]; + prev = radics[i]; + reali += 2; + } else { + radics[i] += prev; + prev = radics[i]; + } } - + // Inplace swap uint32_t pivoti = 0; while(pivoti < n) { uint32_t radixval = morgrab(a[pivoti], j); - uint32_t targeti = index[radixval] - 1; + uint32_t targeti = radics[radixval] - 1; if(targeti > pivoti) { // swap uint32_t tmp = a[pivoti]; a[pivoti] = a[targeti]; a[targeti] = tmp; // dec index - --index[radixval]; + --radics[radixval]; } else { + // progress pivot ++pivoti; } } @@ -186,11 +204,13 @@ static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept { if(j == 0) return; // Recursion - for(uint32_t i = 0; i < 256; ++i) { - uint32_t from = index[i]; - uint32_t to = prefix[i]; - if(from != to) { - mormord_sort_impl(&a[from - 1], (to - (from - 1)), j - 1); + for(int i = 0; i < reali; i += 2) { + /* inclusive */ + uint32_t from = real_radics[i]; + /* non-inclusive */ + uint32_t to = real_radics[i + 1]; + if(from < to) { // TODO: check if this "if" is needed! + mormord_sort_impl(&a[from], (to - (from)), j - 1); } } } @@ -595,12 +615,13 @@ void measure_single(int n) { int main(void) { //int n = 100000000; - //int n = 10000000; + int n = 10000000; //int n = 1000000; //int n = 100000; //int n = 10000; + //int n = 1000; //int n = 100; - int n = 10; + //int n = 10; printf("Sorting %d elements:\n\n", n); @@ -610,7 +631,7 @@ int main(void) { for (auto inputtype : inputtypes) { printf("%10s", inputtype.c_str()); - fflush(stdout); + // fflush(stdout); // XXX: FIXME? std::vector v(n), w(n), expected(n); v = geninput(inputtype, n); measure(inputtype, "copy", [&] { w = v; }); @@ -626,11 +647,9 @@ int main(void) { w.swap(buf); } }); - /* w = v; measure(inputtype, "magyar", [&] { MagyarSort::sort(&w[0], w.size()); }); assert(w == expected); - */ w = v; measure(inputtype, "mormord", [&] { mormord_sort(&w[0], w.size()); }); assert(w == expected);