From 1686967f1022b6de0352a9027a5df2804cea25b2 Mon Sep 17 00:00:00 2001 From: Richard Thier Date: Fri, 17 Dec 2021 19:20:58 +0100 Subject: [PATCH] minor tweaks to 4pasu and added 4rot --- ypsu.cpp | 75 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 65 insertions(+), 10 deletions(-) diff --git a/ypsu.cpp b/ypsu.cpp index 030df79..5d2ffd0 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -135,12 +135,9 @@ free(buf); } - /** Regular memset will not do with thread_local I think... */ - void my_memset(std::vector &v) { - size_t n = v.size(); - v.clear(); - v.resize(n); - std::fill(v.begin(), v.end(), 0); + /** Zeroes exactly 1 << 8 ints starting at v; the count is hard-coded, so this only works for fourpassu. */ + void my_memset(int *v) { + memset(v, 0, (1 << 8) * sizeof(int)); } // hand-unrolled fourpass. @@ -149,10 +146,7 @@ // alloc helper buffers. int sz = n * sizeof(a[0]); - //std::vector bucketdata(1 << 8); - // Below code crashes :-( - static std::vector bucketdata(1 << 8); - bucketdata.resize(1 << 8); // JHapak's "pattern" + /* one counter array per thread; cleared by the my_memset call that follows */ static thread_local int bucketdata[1 << 8]; my_memset(bucketdata); uint32_t *buf = (uint32_t *)malloc(sz); @@ -210,6 +204,62 @@ free(buf); } + /** Rotates x right by 8 bits; the low byte wraps around into the top byte. */ static inline uint32_t byterotate(uint32_t x) { return (x >> 8) | (x << 24); } + /** Radix-sorts the n values in arr by their four bytes, lowest byte first. Every pass is a counting sort keyed on the low byte; values are stored rotated right by 8 bits so the next pass finds its key in the low byte again, and after four rotations each value has its original bits back. Uses a malloc-ed scratch buffer of n elements; after the even number of passes the output is in arr. Asserts that the byte size of arr fits in INT_MAX and that n is a multiple of 4. */ void fourrots(uint32_t *arr, int n) { + assert(n * int64_t(sizeof(arr[0])) <= INT_MAX); + /* the loops below read four elements per iteration */ assert(n % 4 == 0); + // alloc helper buffers. 
+ int sz = n * sizeof(arr[0]); + /* 1 << 8 counters, one per possible byte value */ std::vector bucketdata(1 << 8); + int *btd = &bucketdata[0]; + uint32_t *buf = (uint32_t *)malloc(sz); + assert(buf != NULL); + uint32_t *src = arr, *dst = buf; + /* arr XOR buf: XOR-ing either pointer with this mask yields the other one */ uintptr_t swapmask = (uintptr_t)arr ^ (uintptr_t)buf; + uint32_t a, b, c, d; + uint32_t abt, bbt, cbt, dbt; + /* four passes; shift only counts them, the key is always the low byte */ for (int shift = 0; shift < 32; shift += 8) { + /* clear the counters for this pass */ memset(btd, 0, bucketdata.size() * sizeof(bucketdata[0])); + /* count how often each low-byte value occurs */ for (int i = 0; i < n; i += 4) { + a = src[i]; + b = src[i + 1]; + c = src[i + 2]; + d = src[i + 3]; + abt = a & 0xff; + bbt = b & 0xff; + cbt = c & 0xff; + dbt = d & 0xff; + btd[abt]++; + btd[bbt]++; + btd[cbt]++; + btd[dbt]++; + } + /* turn the counts into start offsets: each bucket gets the sum of all counts before it */ int offset = 0; + for (int i = 0; i < 1 << 8; i++) { + /* this int d shadows the outer uint32_t d inside the loop */ int d = bucketdata[i]; + bucketdata[i] = offset; + offset += d; + } + /* scatter into dst in input order, storing each value rotated by one byte */ for (int i = 0; i < n; i += 4) { + a = src[i]; + b = src[i + 1]; + c = src[i + 2]; + d = src[i + 3]; + abt = a & 0xff; + bbt = b & 0xff; + cbt = c & 0xff; + dbt = d & 0xff; + dst[btd[abt]++] = byterotate(a); + dst[btd[bbt]++] = byterotate(b); + dst[btd[cbt]++] = byterotate(c); + dst[btd[dbt]++] = byterotate(d); + } + /* exchange src and dst for the next pass */ src = (uint32_t *)((uintptr_t)src ^ swapmask); + dst = (uint32_t *)((uintptr_t)dst ^ swapmask); + } + free(buf); + } + void vsort(uint32_t *a, int n) { thread_local std::vector bts[256]; #pragma GCC unroll 4 @@ -337,8 +387,13 @@ measure(inputtype, "4pasu", [&] { fourpassu(&w[0], w.size()); }); assert(w == expected); w = v; + measure(inputtype, "4rot", [&] { fourrots(&w[0], w.size()); }); + assert(w == expected); + /* + w = v; measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); }); assert(w == expected); + */ for (auto r : results) printf("%9.3fs", r.second); puts("");