ypsu's 4passu method optimized a bit

This commit is contained in:
Richard Thier 2021-12-15 16:09:40 +01:00
parent a947cda58d
commit a878f20100

View File

@ -134,17 +134,34 @@
}
free(buf);
}
/**
 * Resets every element of `v` to zero, leaving its size untouched.
 *
 * Stand-in for a raw memset over the vector's storage.
 * NOTE(review): the original author suspected that a plain memset does not
 * work for thread_local vectors; that suspicion is unverified.
 *
 * @param v vector whose elements are overwritten with 0 in place.
 */
void my_memset(std::vector<int> &v) {
  // A single pass is sufficient: the former clear() + resize(n) sequence
  // already value-initialized every element to 0, so following it with
  // std::fill wrote the whole table twice.
  std::fill(v.begin(), v.end(), 0);
}
// hand-unrolled fourpass.
//
// Least-significant-byte-first radix sort of the `n` 32-bit values in `a`,
// written out as four counting-sort passes of 8 bits each. Every pass
//   1. counts how often each byte value occurs,
//   2. converts the counts into starting offsets with a running sum,
//   3. scatters the elements to those offsets.
// The passes alternate between `a` and a malloc'ed scratch buffer `buf`
// (a -> buf -> a -> buf -> a), so the final result ends up back in `a`.
//
// a: array that is reordered in place.
// n: number of elements in `a`; n * sizeof(a[0]) must fit in an int (asserted).
void fourpassu(uint32_t *a, int n) {
// `sz` below is an int, so the scratch buffer's byte size has to fit in one.
assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
// alloc helper buffers.
int sz = n * sizeof(a[0]);
// NOTE(review): `bucketdata` is declared twice in this listing (a plain local
// here and a function-local static a few lines below). Presumably this is the
// pre-change line shown next to its replacement -- confirm that only one
// declaration exists in the real file.
std::vector<int> bucketdata(1 << 8);
//std::vector<int> bucketdata(1 << 8);
// Below code crashes :-(
// 256-entry table, one slot per possible byte value. As a function-local
// static it is constructed once and reused by later calls.
// NOTE(review): a static local is shared by every caller, so concurrent calls
// would race on this table -- confirm callers are single-threaded.
static std::vector<int> bucketdata(1 << 8);
bucketdata.resize(1 << 8); // JHapak's "pattern"
// Zero the table before the first counting loop.
my_memset(bucketdata);
// Scratch buffer with room for all n elements; released at the end.
uint32_t *buf = (uint32_t *)malloc(sz);
// NOTE(review): assert is compiled out under NDEBUG, so an allocation failure
// goes unchecked in such builds.
assert(buf != NULL);
// pass 1: sort by lower 8 bits.
// Count occurrences of each low byte (bits 0-7) in `a`.
#pragma GCC unroll 32
for (int i = 0; i < n; i++) bucketdata[a[i] & 0xff]++;
// Turn counts into start offsets: slot i receives the number of elements
// whose byte value is smaller than i.
int offset = 0;
#pragma GCC unroll 8
for (int i = 0; i < 1 << 8; i++) {
int d = bucketdata[i];
bucketdata[i] = offset;
// NOTE(review): the line below is a diff hunk marker, not code; the loop body
// is cut off here. The same loop in passes 2-4 continues with `offset += d;`.
@ -152,41 +169,54 @@
}
// Scatter a -> buf; the post-increment advances each bucket's write position.
for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xff]++] = a[i];
// pass 2: sort by 2nd 8 bits.
// NOTE(review): the table is zeroed twice here (memset, then my_memset);
// presumably the memset is the pre-change line of the diff -- confirm.
memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
my_memset(bucketdata);
// Count occurrences of bits 8-15, reading from `buf`.
#pragma GCC unroll 32
for (int i = 0; i < n; i++) bucketdata[buf[i] >> 8 & 0xff]++;
// Counts -> start offsets.
offset = 0;
#pragma GCC unroll 8
for (int i = 0; i < 1 << 8; i++) {
int d = bucketdata[i];
bucketdata[i] = offset;
offset += d;
}
// Scatter buf -> a.
#pragma GCC unroll 64
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 8 & 0xff]++] = buf[i];
// pass 3: sort by 3rd 8 bits.
// NOTE(review): double zeroing again (memset followed by my_memset).
memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
my_memset(bucketdata);
// Count occurrences of bits 16-23, reading from `a`.
#pragma GCC unroll 32
for (int i = 0; i < n; i++) bucketdata[a[i] >> 16 & 0xff]++;
// Counts -> start offsets.
offset = 0;
#pragma GCC unroll 8
for (int i = 0; i < 1 << 8; i++) {
int d = bucketdata[i];
bucketdata[i] = offset;
offset += d;
}
// Scatter a -> buf.
#pragma GCC unroll 64
for (int i = 0; i < n; i++) buf[bucketdata[a[i] >> 16 & 0xff]++] = a[i];
// pass 4: sort by 4th 8 bits.
// NOTE(review): double zeroing again (memset followed by my_memset).
memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
my_memset(bucketdata);
// Count occurrences of bits 24-31, reading from `buf`.
#pragma GCC unroll 32
for (int i = 0; i < n; i++) bucketdata[buf[i] >> 24 & 0xff]++;
// Counts -> start offsets.
offset = 0;
#pragma GCC unroll 8
for (int i = 0; i < 1 << 8; i++) {
int d = bucketdata[i];
bucketdata[i] = offset;
offset += d;
}
// Final scatter buf -> a, leaving the result in the caller's array.
#pragma GCC unroll 32
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i];
free(buf);
}
void vsort(uint32_t *a, int n) {
thread_local std::vector<uint32_t> bts[256];
#pragma GCC unroll 4
for (int shift = 0; shift < 32; shift += 8) {
#pragma GCC unroll 64
for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]);
#pragma GCC unroll 64
for (int bt = 0, k = 0; bt < 256; bt++) {
memcpy(a + k, &bts[bt][0], bts[bt].size() * sizeof(a[0]));
k += bts[bt].size();