// Radix-style sorts for arrays of 32-bit unsigned keys ("ypsu variants").
//
// Provenance: reconstructed from the patch "added ypsu-variants of radix-like
// things" (Richard Thier, Wed, 15 Dec 2021, commit
// 520db7049de025f06eb05ff43f98fd3b95779012) against ypsu.cpp, which added
// twopass, fourpass, fourpassu, vsort and pagedsort plus benchmark calls in
// main().
//
// NOTE(review): the text inside angle brackets was lost in the copy this was
// rebuilt from, so the std::vector element types below (int for counters and
// page indices, uint32_t for keys) are inferred from how the vectors are used,
// and the include list is the set of headers this code needs rather than the
// patch's own include line - confirm against the upstream file.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <numeric>
#include <utility>
#include <vector>

namespace {

// Number of keys stored in one page of pagedsort's pool.
constexpr int kPageSize = 1024;
// Number of buckets per 8-bit digit.
constexpr int kRadix = 256;

// Allocates an uninitialised scratch array of `count` keys.
// Throws std::bad_alloc on failure; the array is released automatically.
std::unique_ptr<uint32_t[]> make_scratch(std::size_t count) {
  return std::unique_ptr<uint32_t[]>(new uint32_t[count]);
}

// Converts per-bucket counts into start offsets (exclusive prefix sum), in place.
void counts_to_offsets(std::vector<int> &counts) {
  int offset = 0;
  for (int &slot : counts) {
    const int count = slot;
    slot = offset;
    offset += count;
  }
}

// One stable counting-sort pass from src to dst, keyed on the Bits-wide digit
// that starts at bit Shift. `buckets` must hold 1 << Bits entries; its previous
// contents are discarded. Shift and Bits are template parameters so that the
// shift and mask are compile-time constants in every instantiation.
template <int Shift, int Bits>
void radix_pass(const uint32_t *src, uint32_t *dst, int n,
                std::vector<int> &buckets) {
  constexpr uint32_t kMask = (uint32_t{1} << Bits) - 1;
  std::fill(buckets.begin(), buckets.end(), 0);
  for (int i = 0; i < n; i++) buckets[(src[i] >> Shift) & kMask]++;
  counts_to_offsets(buckets);
  for (int i = 0; i < n; i++) dst[buckets[(src[i] >> Shift) & kMask]++] = src[i];
}

// Position inside pagedsort's pool of the element with in-bucket index `index`
// that lives on page `page`. Computed in size_t so that large pools cannot
// overflow int arithmetic.
std::size_t page_slot(int page, int index) {
  return static_cast<std::size_t>(page) * kPageSize +
         static_cast<std::size_t>(index % kPageSize);
}

}  // namespace

// Sorts a[0..n) ascending with two 16-bit counting-sort passes.
// Uses a scratch buffer of n keys and 65536 counters; n <= 1 is a no-op.
// Throws std::bad_alloc if the scratch memory cannot be allocated.
void twopass(uint32_t *a, int n) {
  if (n <= 1) return;
  std::vector<int> buckets(1 << 16);
  const auto buf = make_scratch(static_cast<std::size_t>(n));
  radix_pass<0, 16>(a, buf.get(), n, buckets);   // lower 16 bits: a -> buf
  radix_pass<16, 16>(buf.get(), a, n, buckets);  // upper 16 bits: buf -> a
}

// Sorts a[0..n) ascending with four 8-bit counting-sort passes written as a
// loop over a runtime shift. Uses a scratch buffer of n keys; n <= 1 is a
// no-op. Throws std::bad_alloc if the scratch memory cannot be allocated.
void fourpass(uint32_t *a, int n) {
  if (n <= 1) return;
  std::vector<int> buckets(kRadix);
  const auto buf = make_scratch(static_cast<std::size_t>(n));
  uint32_t *src = a;
  uint32_t *dst = buf.get();
  for (int shift = 0; shift < 32; shift += 8) {
    std::fill(buckets.begin(), buckets.end(), 0);
    for (int i = 0; i < n; i++) buckets[(src[i] >> shift) & 0xff]++;
    counts_to_offsets(buckets);
    for (int i = 0; i < n; i++) {
      dst[buckets[(src[i] >> shift) & 0xff]++] = src[i];
    }
    // Ping-pong between the two arrays. The number of passes is even, so the
    // fully sorted data ends up back in `a`.
    std::swap(src, dst);
  }
}

// Unrolled fourpass: the same four 8-bit passes, each instantiated with a
// compile-time shift. n <= 1 is a no-op.
// Throws std::bad_alloc if the scratch memory cannot be allocated.
void fourpassu(uint32_t *a, int n) {
  if (n <= 1) return;
  std::vector<int> buckets(kRadix);
  const auto buf = make_scratch(static_cast<std::size_t>(n));
  radix_pass<0, 8>(a, buf.get(), n, buckets);   // pass 1: lowest byte
  radix_pass<8, 8>(buf.get(), a, n, buckets);   // pass 2: 2nd byte
  radix_pass<16, 8>(a, buf.get(), n, buckets);  // pass 3: 3rd byte
  radix_pass<24, 8>(buf.get(), a, n, buckets);  // pass 4: highest byte
}

// Sorts a[0..n) ascending with four 8-bit passes that push the keys into 256
// growable per-thread vectors and then copy them back in bucket order.
// The vectors are thread_local and only clear()ed, so their capacity is reused
// by later calls on the same thread.
void vsort(uint32_t *a, int n) {
  static thread_local std::vector<uint32_t> bts[kRadix];
  for (int shift = 0; shift < 32; shift += 8) {
    for (int i = 0; i < n; i++) bts[(a[i] >> shift) & 0xff].push_back(a[i]);
    uint32_t *out = a;
    for (auto &bucket : bts) {
      // std::copy is well defined for empty buckets; indexing element 0 of an
      // empty vector is not.
      out = std::copy(bucket.begin(), bucket.end(), out);
      bucket.clear();
    }
  }
}

// Sorts a[0..n) ascending with four 8-bit passes over a pool of fixed-size
// pages. Every bucket is a linked list of pages (`next` holds the links), and
// pages drained from an input bucket go back to the free list so the output
// buckets of the same pass can reuse them. n <= 1 is a no-op.
// Throws std::bad_alloc if the pool cannot be allocated.
void pagedsort(uint32_t *a, int n) {
  if (n <= 1) return;
  // Peak demand is n / kPageSize full pages, one open page for each of the
  // 2 * kRadix buckets alive during a pass, and one more because an output
  // bucket can request its next page just before the input page that is being
  // drained in the same step is returned to the free list.
  const int pagecount = n / kPageSize + 2 * kRadix + 1;
  const auto pool = make_scratch(static_cast<std::size_t>(pagecount) * kPageSize);
  uint32_t *const pd = pool.get();
  std::vector<int> freelist(pagecount);
  std::vector<int> next(pagecount);
  std::iota(freelist.begin(), freelist.end(), 0);
  struct bucket {
    int len;
    int headpage, lastpage;
  };
  // Two banks of kRadix buckets: one is read while the other is filled.
  bucket bts[2 * kRadix];

  const auto take_page = [&freelist] {
    const int p = freelist.back();
    freelist.pop_back();
    return p;
  };
  // Gives every bucket of the bank starting at `base` one empty page.
  const auto open_bank = [&](int base) {
    for (int bt = 0; bt < kRadix; bt++) {
      const int p = take_page();
      bts[base + bt] = {0, p, p};
    }
  };
  // Appends v to bt, chaining a fresh page as soon as the current one fills up.
  const auto append = [&](bucket &bt, uint32_t v) {
    pd[page_slot(bt.lastpage, bt.len)] = v;
    bt.len++;
    if (bt.len % kPageSize == 0) {
      const int p = take_page();
      next[bt.lastpage] = p;
      bt.lastpage = p;
    }
  };

  // initial scatter: distribute the input by its lowest byte.
  open_bank(0);
  for (int i = 0; i < n; i++) append(bts[a[i] & 0xff], a[i]);

  // intermediate level scatters: redistribute by each higher byte in turn.
  int ibase = 0, obase = kRadix;
  for (int shift = 8; shift < 32; shift += 8) {
    open_bank(obase);
    for (int ibti = 0; ibti < kRadix; ibti++) {
      const bucket &ibt = bts[ibase + ibti];
      int page = ibt.headpage;
      for (int i = 0; i < ibt.len; i++) {
        const uint32_t v = pd[page_slot(page, i)];
        append(bts[obase + ((v >> shift) & 0xff)], v);
        if (i % kPageSize == kPageSize - 1) {
          // This input page is fully drained: recycle it and follow the chain.
          freelist.push_back(page);
          page = next[page];
        }
      }
      // The open (possibly empty) last page is not released by the loop above.
      freelist.push_back(ibt.lastpage);
    }
    std::swap(ibase, obase);
  }

  // the final gather: concatenate the buckets of the last pass back into a.
  int k = 0;
  for (int ibti = 0; ibti < kRadix; ibti++) {
    const bucket &ibt = bts[ibase + ibti];
    int page = ibt.headpage;
    for (int i = 0; i < ibt.len; i++) {
      a[k++] = pd[page_slot(page, i)];
      if (i % kPageSize == kPageSize - 1) {
        page = next[page];
      }
    }
  }
}

// Benchmark harness changes carried by the same patch. main() itself lies
// outside the visible text, so the additions are recorded here verbatim; they
// go right after the existing "magyar" measurement and before the line that
// prints the results. The first three measurements were commented out in the
// patch.
//
//   /*
//   w = v;
//   measure(inputtype, "2pass", [&] { twopass(&w[0], w.size()); });
//   assert(w == expected);
//   w = v;
//   measure(inputtype, "4pass", [&] { fourpass(&w[0], w.size()); });
//   assert(w == expected);
//   w = v;
//   measure(inputtype, "psort", [&] { pagedsort(&w[0], w.size()); });
//   assert(w == expected);
//   */
//   w = v;
//   measure(inputtype, "4pasu", [&] { fourpassu(&w[0], w.size()); });
//   assert(w == expected);
//   w = v;
//   measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); });
//   assert(w == expected);