minor tweaks to 4pasu and added 4rot
This commit is contained in:
parent
a878f20100
commit
1686967f10
75
ypsu.cpp
75
ypsu.cpp
@ -135,12 +135,9 @@
|
||||
free(buf);
|
||||
}
|
||||
|
||||
/** Regular memset will not do with thread_local I think... */
|
||||
void my_memset(std::vector<int> &v) {
|
||||
size_t n = v.size();
|
||||
v.clear();
|
||||
v.resize(n);
|
||||
std::fill(v.begin(), v.end(), 0);
|
||||
/**
 * Zeroes the first 1 << 8 ints starting at v.
 * The element count is hard-coded, so v must point to at least that many ints.
 */
void my_memset(int *v) {
  // Number of bytes occupied by 1 << 8 ints.
  const size_t bucket_bytes = sizeof(int) << 8;
  memset(v, 0, bucket_bytes);
}
|
||||
|
||||
// hand-unrolled fourpass.
|
||||
@ -149,10 +146,7 @@
|
||||
// alloc helper buffers.
|
||||
int sz = n * sizeof(a[0]);
|
||||
|
||||
//std::vector<int> bucketdata(1 << 8);
|
||||
// Below code crashes :-(
|
||||
static std::vector<int> bucketdata(1 << 8);
|
||||
bucketdata.resize(1 << 8); // JHapak's "pattern"
|
||||
static thread_local int bucketdata[1 << 8];
|
||||
my_memset(bucketdata);
|
||||
|
||||
uint32_t *buf = (uint32_t *)malloc(sz);
|
||||
@ -210,6 +204,62 @@
|
||||
free(buf);
|
||||
}
|
||||
|
||||
/** Rotates x right by 8 bits: the low byte moves to the top byte. */
static inline uint32_t byterotate(uint32_t x) { return (x >> 8) | (x << 24); }

/**
 * Sorts arr[0..n) in ascending order with a four-pass, byte-wise LSD radix
 * sort.
 *
 * Every pass buckets the elements by their current low byte and stores each
 * element rotated right by one byte, so the next pass again only needs to look
 * at the low byte. After four passes every value has been rotated by 32 bits
 * (i.e. is back to its original bit pattern) and, because the number of passes
 * is even, the sorted data ends up in arr.
 *
 * @param arr elements to sort in place; may be null only when n <= 0.
 * @param n   number of elements; any length is accepted (n <= 0 is a no-op).
 *
 * Allocates a temporary buffer of n elements; aborts the process if that
 * allocation fails.
 */
void fourrots(uint32_t *arr, int n) {
  // Nothing to sort. Returning early also avoids malloc(0), which may
  // legitimately return NULL.
  if (n <= 0) return;
  assert(n * int64_t(sizeof(arr[0])) <= INT_MAX);

  const size_t bytes = size_t(n) * sizeof(arr[0]);
  uint32_t *buf = (uint32_t *)malloc(bytes);
  if (buf == NULL) {
    // Checked explicitly (not via assert) so NDEBUG builds never write through
    // a null pointer.
    fputs("fourrots: out of memory\n", stderr);
    abort();
  }

  constexpr int kBuckets = 1 << 8;
  // Elements [0, unrolled) are handled four at a time, the rest one by one.
  const int unrolled = n - n % 4;

  uint32_t *src = arr;
  uint32_t *dst = buf;
  for (int pass = 0; pass < 4; pass++) {
    // Histogram of the low byte; re-zeroed on every pass.
    int btd[kBuckets] = {};
    for (int i = 0; i < unrolled; i += 4) {
      btd[src[i] & 0xff]++;
      btd[src[i + 1] & 0xff]++;
      btd[src[i + 2] & 0xff]++;
      btd[src[i + 3] & 0xff]++;
    }
    for (int i = unrolled; i < n; i++) btd[src[i] & 0xff]++;

    // Exclusive prefix sum: btd[k] becomes the first output index of bucket k.
    int offset = 0;
    for (int k = 0; k < kBuckets; k++) {
      const int count = btd[k];
      btd[k] = offset;
      offset += count;
    }

    // Scatter strictly in index order so that the pass is stable, which the
    // LSD scheme relies on.
    for (int i = 0; i < unrolled; i += 4) {
      const uint32_t a = src[i];
      const uint32_t b = src[i + 1];
      const uint32_t c = src[i + 2];
      const uint32_t d = src[i + 3];
      dst[btd[a & 0xff]++] = byterotate(a);
      dst[btd[b & 0xff]++] = byterotate(b);
      dst[btd[c & 0xff]++] = byterotate(c);
      dst[btd[d & 0xff]++] = byterotate(d);
    }
    for (int i = unrolled; i < n; i++) {
      const uint32_t x = src[i];
      dst[btd[x & 0xff]++] = byterotate(x);
    }

    // The output of this pass is the input of the next one.
    uint32_t *tmp = src;
    src = dst;
    dst = tmp;
  }
  free(buf);
}
|
||||
|
||||
void vsort(uint32_t *a, int n) {
|
||||
thread_local std::vector<uint32_t> bts[256];
|
||||
#pragma GCC unroll 4
|
||||
@ -337,8 +387,13 @@
|
||||
measure(inputtype, "4pasu", [&] { fourpassu(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
w = v;
|
||||
measure(inputtype, "4rot", [&] { fourrots(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
/*
|
||||
w = v;
|
||||
measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
*/
|
||||
|
||||
for (auto r : results) printf("%9.3fs", r.second);
|
||||
puts("");
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user