#include #include #include #include #include #include #include #include #include #include #include #include #include #include // mlock & munlock #include "ska_sort.hpp" #include "gptsort.h" #define MAGYAR_SORT_DEFAULT_REUSE #include "magyarsort.h" #include "space_partitioning_sort/spsort.h" std::map results; std::map worst; void measure(const std::string &inputtype, const std::string &name, std::function f) { auto begin = std::chrono::high_resolution_clock::now(); f(); auto dur = std::chrono::high_resolution_clock::now() - begin; double seconds = dur / std::chrono::milliseconds(1) / 1000.0; results[name] = seconds; worst[name] = std::max(worst[name], seconds); } std::vector inputtypes = { "constant", "asc", "desc", "ascasc", "ascdesc", "descasc", "descdesc", "smallrange", "rand", }; std::vector geninput(const std::string &type, int n) { std::vector v(n); if (type == "constant") { int c = rand(); for (int i = 0; i < n; i++) { v[i] = c; } } else if (type == "asc") { for (int i = 0; i < n; i++) { v[i] = i; } } else if (type == "desc") { for (int i = 0; i < n; i++) { v[i] = n - i; } } else if (type == "ascasc") { for (int i = 0; i < n / 2; i++) { v[i] = i; v[i + n / 2] = i; } } else if (type == "ascdesc") { for (int i = 0; i < n / 2; i++) { v[i] = i; v[i + n / 2] = n - i; } } else if (type == "descasc") { for (int i = 0; i < n / 2; i++) { v[i] = n - i; v[i + n / 2] = i; } } else if (type == "descdesc") { for (int i = 0; i < n / 2; i++) { v[i] = n - i; v[i + n / 2] = n - i; } } else if (type == "smallrange") { int c = rand() / 2; for (int i = 0; i < n; i++) { v[i] = c + rand() % 100; } } else if (type == "rand") { for (int i = 0; i < n; i++) { v[i] = rand(); } } return v; } void twopass(uint32_t *a, int n) { assert(n * int64_t(sizeof(a[0])) <= INT_MAX); // alloc helper buffers. int sz = n * sizeof(a[0]); std::vector bucketdata(1 << 16); uint32_t *buf = (uint32_t *)malloc(sz); assert(buf != NULL); // pass 1: sort by lower 16 bits. for (int i = 0; i < n; i++) bucketdata[a[i] & 0xffff]++; int offset = 0; for (int i = 0; i < 1 << 16; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xffff]++] = a[i]; // pass 2: sort by upper 16 bits. memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0])); for (int i = 0; i < n; i++) bucketdata[buf[i] >> 16]++; offset = 0; for (int i = 0; i < 1 << 16; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 16]++] = buf[i]; free(buf); } void fourpass(uint32_t *a, int n) { assert(n * int64_t(sizeof(a[0])) <= INT_MAX); // alloc helper buffers. int sz = n * sizeof(a[0]); std::vector bucketdata(1 << 8); uint32_t *buf = (uint32_t *)malloc(sz); assert(buf != NULL); uint32_t *src = a, *dst = buf; uintptr_t swapmask = (uintptr_t)a ^ (uintptr_t)buf; for (int shift = 0; shift < 32; shift += 8) { memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0])); for (int i = 0; i < n; i++) bucketdata[src[i] >> shift & 0xff]++; int offset = 0; for (int i = 0; i < 1 << 8; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } for (int i = 0; i < n; i++) { dst[bucketdata[src[i] >> shift & 0xff]++] = src[i]; } src = (uint32_t *)((uintptr_t)src ^ swapmask); dst = (uint32_t *)((uintptr_t)dst ^ swapmask); } free(buf); } /** Only werks für das fourpassu! */ void my_memset(int *v) { memset(v, 0, (1 << 8) * sizeof(int)); } // hand-unrolled fourpass. void fourpassu(uint32_t *a, int n) { assert(n * int64_t(sizeof(a[0])) <= INT_MAX); // alloc helper buffers. int sz = n * sizeof(a[0]); static thread_local int bucketdata[1 << 8]; my_memset(bucketdata); uint32_t *buf = (uint32_t *)malloc(sz); assert(buf != NULL); // pass 1: sort by lower 8 bits. #pragma GCC unroll 32 for (int i = 0; i < n; i++) bucketdata[a[i] & 0xff]++; int offset = 0; #pragma GCC unroll 8 for (int i = 0; i < 1 << 8; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xff]++] = a[i]; // pass 2: sort by 2nd 8 bits. my_memset(bucketdata); #pragma GCC unroll 32 for (int i = 0; i < n; i++) bucketdata[buf[i] >> 8 & 0xff]++; offset = 0; #pragma GCC unroll 8 for (int i = 0; i < 1 << 8; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } #pragma GCC unroll 64 for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 8 & 0xff]++] = buf[i]; // pass 3: sort by 3rd 8 bits. my_memset(bucketdata); #pragma GCC unroll 32 for (int i = 0; i < n; i++) bucketdata[a[i] >> 16 & 0xff]++; offset = 0; #pragma GCC unroll 8 for (int i = 0; i < 1 << 8; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } #pragma GCC unroll 64 for (int i = 0; i < n; i++) buf[bucketdata[a[i] >> 16 & 0xff]++] = a[i]; // pass 4: sort by 4th 8 bits. my_memset(bucketdata); #pragma GCC unroll 32 for (int i = 0; i < n; i++) bucketdata[buf[i] >> 24 & 0xff]++; offset = 0; #pragma GCC unroll 8 for (int i = 0; i < 1 << 8; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } #pragma GCC unroll 32 for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i]; free(buf); } static inline uint32_t byterotate(uint32_t x) { return (x >> 8) | (x << 24); } void fourrots(uint32_t *arr, int n) { assert(n * int64_t(sizeof(arr[0])) <= INT_MAX); assert(n % 4 == 0); // alloc helper buffers. int sz = n * sizeof(arr[0]); std::vector bucketdata(1 << 8); int *btd = &bucketdata[0]; uint32_t *buf = (uint32_t *)malloc(sz); assert(buf != NULL); uint32_t *src = arr, *dst = buf; uintptr_t swapmask = (uintptr_t)arr ^ (uintptr_t)buf; uint32_t a, b, c, d; uint32_t abt, bbt, cbt, dbt; for (int shift = 0; shift < 32; shift += 8) { memset(btd, 0, bucketdata.size() * sizeof(bucketdata[0])); for (int i = 0; i < n; i += 4) { a = src[i]; b = src[i + 1]; c = src[i + 2]; d = src[i + 3]; abt = a & 0xff; bbt = b & 0xff; cbt = c & 0xff; dbt = d & 0xff; btd[abt]++; btd[bbt]++; btd[cbt]++; btd[dbt]++; } int offset = 0; for (int i = 0; i < 1 << 8; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } for (int i = 0; i < n; i += 4) { a = src[i]; b = src[i + 1]; c = src[i + 2]; d = src[i + 3]; abt = a & 0xff; bbt = b & 0xff; cbt = c & 0xff; dbt = d & 0xff; dst[btd[abt]++] = byterotate(a); dst[btd[bbt]++] = byterotate(b); dst[btd[cbt]++] = byterotate(c); dst[btd[dbt]++] = byterotate(d); } src = (uint32_t *)((uintptr_t)src ^ swapmask); dst = (uint32_t *)((uintptr_t)dst ^ swapmask); } free(buf); } // frewr - four rewrites. void frewr(uint32_t *arr, int n) { uint32_t *tmpbuf = (uint32_t *)malloc(n * 4); mlock(tmpbuf, n * 4); int btoffsets[4][256] = {}; #pragma GCC unroll 64 for (int i = n - 1; i >= 0; i--) { uint32_t a = arr[i]; btoffsets[3][a & 0xff]++; btoffsets[2][a >> 8 & 0xff]++; btoffsets[1][a >> 16 & 0xff]++; btoffsets[0][a >> 24 & 0xff]++; } int btend[4] = {n - 1, n - 1, n - 1, n - 1}; #pragma GCC unroll 16 for (int i = 255; i >= 0; i--) { #pragma GCC unroll 4 for (int pass = 3; pass >= 0; pass--) { int nbtend = btend[pass] - btoffsets[pass][i]; btoffsets[pass][i] = btend[pass]; btend[pass] = nbtend; } } uint32_t *src = arr, *dst = tmpbuf; #pragma GCC unroll 4 for (int pass = 3; pass >= 0; pass--) { int *off = btoffsets[pass]; #pragma GCC unroll 64 for (int i = n - 1; i >= 0; i--) { uint32_t v = src[i]; dst[off[v & 0xff]--] = v >> 8 | v << 24; __builtin_prefetch(&dst[off[v & 0xff] - 2]); } uint32_t *tmp = src; src = dst; dst = tmp; } munlock(tmpbuf, n * 4); free(tmpbuf); } void vsort(uint32_t *a, int n) { thread_local std::vector bts[256]; #pragma GCC unroll 4 for (int shift = 0; shift < 32; shift += 8) { #pragma GCC unroll 64 for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]); #pragma GCC unroll 64 for (int bt = 0, k = 0; bt < 256; bt++) { memcpy(a + k, &bts[bt][0], bts[bt].size() * sizeof(a[0])); k += bts[bt].size(); bts[bt].clear(); } } } void pagedsort(uint32_t *a, int n) { enum { pagesize = 1024 }; int pagecount = (n + pagesize - 1) / pagesize + 512; uint32_t *pd = (uint32_t *)malloc(pagecount * pagesize * sizeof(a[0])); std::vector freelist(pagecount); std::vector next(pagecount); std::iota(std::begin(freelist), std::end(freelist), 0); struct bucket { int len; int headpage, lastpage; }; bucket bts[512]; // initial scatter. for (int bt = 0; bt < 256; bt++) { int p = freelist.back(); freelist.pop_back(); bts[bt] = {0, p, p}; } for (int i = 0; i < n; i++) { bucket *bt = &bts[a[i] & 0xff]; pd[bt->lastpage * pagesize + bt->len++ % pagesize] = a[i]; if (bt->len % pagesize == 0) { int p = freelist.back(); freelist.pop_back(); next[bt->lastpage] = p; bt->lastpage = p; } } // intermediate level scatters. int ibase = 0, obase = 256; for (int shift = 8; shift < 32; shift += 8) { for (int bt = 0; bt < 256; bt++) { int p = freelist.back(); freelist.pop_back(); bts[obase + bt] = {0, p, p}; } for (int ibti = 0; ibti < 256; ibti++) { struct bucket *ibt = &bts[ibase + ibti]; int page = ibt->headpage; for (int i = 0; i < ibt->len; i++) { uint32_t v = pd[page * pagesize + i % pagesize]; struct bucket *obt = &bts[obase + (v >> shift & 0xff)]; pd[obt->lastpage * pagesize + obt->len++ % pagesize] = v; if (obt->len % pagesize == 0) { int p = freelist.back(); freelist.pop_back(); next[obt->lastpage] = p; obt->lastpage = p; } if (i % pagesize == pagesize - 1) { freelist.push_back(page); page = next[page]; } } freelist.push_back(ibt->lastpage); } ibase = 256 - ibase; obase = 256 - obase; } // the final gather. int k = 0; for (int ibti = 0; ibti < 256; ibti++) { struct bucket *ibt = &bts[ibase + ibti]; int page = ibt->headpage; for (int i = 0; i < ibt->len; i++) { a[k++] = pd[page * pagesize + i % pagesize]; if (i % pagesize == pagesize - 1) { page = next[page]; } } } free(pd); } // to measure / profile a single variant void measure_single(int n) { for (auto inputtype : inputtypes) { printf("%10s", inputtype.c_str()); fflush(stdout); std::vector v(n); v = geninput(inputtype, n); measure(inputtype, "sp", [&] { spsort(&v[0], v.size()); }); for (auto r : results) printf("%9.3fs", r.second); puts(""); } puts(""); printf("%10s", "worst"); for (auto w : worst) printf("%9.3fs", w.second); puts(""); printf("%10s", ""); for (auto w : worst) printf("%10s", w.first.c_str()); puts(""); } int main(void) { int n = 100000000; //int n = 10000000; //int n = 100; // Uncomment this for profiling and alg! //measure_single(n); //return 0; for (auto inputtype : inputtypes) { printf("%10s", inputtype.c_str()); fflush(stdout); std::vector v(n), w(n), expected(n); v = geninput(inputtype, n); measure(inputtype, "copy", [&] { w = v; }); w = v; measure(inputtype, "std", [&] { std::sort(std::begin(w), std::end(w)); }); expected = w; w = v; measure(inputtype, "ska", [&] { ska_sort(std::begin(w), std::end(w)); }); w = v; measure(inputtype, "ska_copy", [&] { std::vector buf(w.size()); if (ska_sort_copy(std::begin(w), std::end(w), std::begin(buf))) { w.swap(buf); } }); w = v; measure(inputtype, "magyar", [&] { MagyarSort::sort(&w[0], w.size()); }); assert(w == expected); /* w = v; measure(inputtype, "2pass", [&] { twopass(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "4pass", [&] { fourpass(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "psort", [&] { pagedsort(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "4pasu", [&] { fourpassu(&w[0], w.size()); }); assert(w == expected); */ w = v; measure(inputtype, "4rot", [&] { fourrots(&w[0], w.size()); }); assert(w == expected); w = v; /*measure(inputtype, "sp", [&] { spsort(&w[0], w.size()); }); assert(w == expected); w = v;*/ measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); }); assert(w == expected); /* w = v; measure(inputtype, "frewr", [&] { frewr(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); }); assert(w == expected); */ for (auto r : results) printf("%9.3fs", r.second); puts(""); } puts(""); printf("%10s", "worst"); for (auto w : worst) printf("%9.3fs", w.second); puts(""); printf("%10s", ""); for (auto w : worst) printf("%10s", w.first.c_str()); puts(""); return 0; }