minor tweaks to 4pasu and added 4rot

This commit is contained in:
Richard Thier 2021-12-17 19:20:58 +01:00
parent a878f20100
commit 1686967f10

View File

@ -135,12 +135,9 @@
free(buf);
}
/** Regular memset will not do with thread_local I think... */
void my_memset(std::vector<int> &v) {
size_t n = v.size();
v.clear();
v.resize(n);
std::fill(v.begin(), v.end(), 0);
/**
 * Zero-fills `count` consecutive ints starting at `v`.
 *
 * @param v     first element of the int buffer to clear; must point to at
 *              least `count` writable ints
 * @param count number of ints to clear. Defaults to 1 << 8, the size of the
 *              256-entry bucket array this helper was written for, so
 *              existing single-argument calls keep their meaning.
 */
void my_memset(int *v, size_t count = 1 << 8) {
    memset(v, 0, count * sizeof(int));
}
// hand-unrolled fourpass.
@ -149,10 +146,7 @@
// alloc helper buffers.
int sz = n * sizeof(a[0]);
//std::vector<int> bucketdata(1 << 8);
// Below code crashes :-(
static std::vector<int> bucketdata(1 << 8);
bucketdata.resize(1 << 8); // JHapak's "pattern"
static thread_local int bucketdata[1 << 8];
my_memset(bucketdata);
uint32_t *buf = (uint32_t *)malloc(sz);
@ -210,6 +204,62 @@
free(buf);
}
/** Rotates x right by 8 bits: the lowest byte wraps around into the highest byte. */
static inline uint32_t byterotate(uint32_t x) {
    const uint32_t wrapped_low_byte = x << 24;
    const uint32_t upper_three_bytes = x >> 8;
    return wrapped_low_byte | upper_three_bytes;
}
/**
 * Sorts arr[0..n) in ascending order with a four-pass LSD radix sort that
 * consumes 8 bits per pass.
 *
 * Every pass buckets on the low byte only. While a key is scattered it is
 * rotated right by one byte (byterotate), so the digit for the next pass is
 * already in the low byte. After four passes each key has been rotated by a
 * full 32 bits and therefore holds its original value again.
 *
 * @param arr keys to sort in place
 * @param n   number of keys. Any non-negative count is accepted (the loops are
 *            unrolled by four and a scalar tail handles the remainder);
 *            n * sizeof(uint32_t) must fit in an int.
 */
void fourrots(uint32_t *arr, int n) {
    assert(n >= 0);
    assert(n * int64_t(sizeof(arr[0])) <= INT_MAX);
    // Nothing to sort, and malloc(0) is allowed to return NULL.
    if (n <= 0) return;

    // alloc helper buffers.
    const size_t sz = size_t(n) * sizeof(arr[0]);
    std::vector<int> bucketdata(1 << 8);
    int *btd = &bucketdata[0];
    uint32_t *buf = (uint32_t *)malloc(sz);
    assert(buf != NULL);
    // Fail loudly instead of writing through NULL when asserts are compiled out.
    if (buf == NULL) abort();

    uint32_t *src = arr, *dst = buf;
    // Largest multiple of four not exceeding n: the unrolled loops cover
    // [0, n4), the scalar tails cover [n4, n).
    const int n4 = n & ~3;
    uint32_t a, b, c, d;
    uint32_t abt, bbt, cbt, dbt;

    for (int pass = 0; pass < 4; pass++) {
        // Histogram of the low byte of every key.
        memset(btd, 0, bucketdata.size() * sizeof(bucketdata[0]));
        for (int i = 0; i < n4; i += 4) {
            a = src[i];
            b = src[i + 1];
            c = src[i + 2];
            d = src[i + 3];
            abt = a & 0xff;
            bbt = b & 0xff;
            cbt = c & 0xff;
            dbt = d & 0xff;
            btd[abt]++;
            btd[bbt]++;
            btd[cbt]++;
            btd[dbt]++;
        }
        for (int i = n4; i < n; i++) btd[src[i] & 0xff]++;

        // Exclusive prefix sum: btd[k] becomes the first output slot of bucket k.
        int offset = 0;
        for (int i = 0; i < 1 << 8; i++) {
            const int count = btd[i];
            btd[i] = offset;
            offset += count;
        }

        // Scatter in input order (keeps the pass stable), rotating each key
        // so the next digit ends up in the low byte.
        for (int i = 0; i < n4; i += 4) {
            a = src[i];
            b = src[i + 1];
            c = src[i + 2];
            d = src[i + 3];
            abt = a & 0xff;
            bbt = b & 0xff;
            cbt = c & 0xff;
            dbt = d & 0xff;
            dst[btd[abt]++] = byterotate(a);
            dst[btd[bbt]++] = byterotate(b);
            dst[btd[cbt]++] = byterotate(c);
            dst[btd[dbt]++] = byterotate(d);
        }
        for (int i = n4; i < n; i++) {
            a = src[i];
            dst[btd[a & 0xff]++] = byterotate(a);
        }

        // Ping-pong between the caller's array and the scratch buffer.
        uint32_t *swapped = src;
        src = dst;
        dst = swapped;
    }

    // The number of passes is even, so the sorted keys are back in arr.
    free(buf);
}
void vsort(uint32_t *a, int n) {
thread_local std::vector<uint32_t> bts[256];
#pragma GCC unroll 4
@ -337,8 +387,13 @@
measure(inputtype, "4pasu", [&] { fourpassu(&w[0], w.size()); });
assert(w == expected);
w = v;
measure(inputtype, "4rot", [&] { fourrots(&w[0], w.size()); });
assert(w == expected);
/*
w = v;
measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); });
assert(w == expected);
*/
for (auto r : results) printf("%9.3fs", r.second);
puts("");