ypsu's 4passu method optimized a bit
This commit is contained in:
parent
a947cda58d
commit
a878f20100
38
ypsu.cpp
38
ypsu.cpp
@ -134,17 +134,34 @@
|
||||
}
|
||||
free(buf);
|
||||
}
|
||||
|
||||
/**
 * Resets every element of `v` to zero, leaving its size unchanged.
 *
 * The previous implementation did clear() + resize(n) + std::fill, which
 * zeroes the same storage twice (resize value-initializes the ints to 0
 * before fill overwrites them with 0 again). A single std::fill produces
 * the same final state: same size, same capacity, all elements 0.
 *
 * NOTE(review): the original comment suspected that a raw memset was not
 * suitable for the static/thread_local vectors this is used on; that
 * suspicion is unconfirmed. std::fill over the elements is well-defined
 * regardless of the vector's storage duration.
 *
 * @param v vector whose elements are overwritten with 0 (may be empty).
 */
void my_memset(std::vector<int> &v) {
  std::fill(v.begin(), v.end(), 0);
}
|
||||
|
||||
// hand-unrolled fourpass.
|
||||
void fourpassu(uint32_t *a, int n) {
|
||||
assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
|
||||
// alloc helper buffers.
|
||||
int sz = n * sizeof(a[0]);
|
||||
std::vector<int> bucketdata(1 << 8);
|
||||
|
||||
//std::vector<int> bucketdata(1 << 8);
|
||||
// Below code crashes :-(
|
||||
static std::vector<int> bucketdata(1 << 8);
|
||||
bucketdata.resize(1 << 8); // JHapak's "pattern"
|
||||
my_memset(bucketdata);
|
||||
|
||||
uint32_t *buf = (uint32_t *)malloc(sz);
|
||||
assert(buf != NULL);
|
||||
// pass 1: sort by lower 8 bits.
|
||||
#pragma GCC unroll 32
|
||||
for (int i = 0; i < n; i++) bucketdata[a[i] & 0xff]++;
|
||||
int offset = 0;
|
||||
#pragma GCC unroll 8
|
||||
for (int i = 0; i < 1 << 8; i++) {
|
||||
int d = bucketdata[i];
|
||||
bucketdata[i] = offset;
|
||||
@ -152,41 +169,54 @@
|
||||
}
|
||||
for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xff]++] = a[i];
|
||||
// pass 2: sort by 2nd 8 bits.
|
||||
memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
|
||||
my_memset(bucketdata);
|
||||
#pragma GCC unroll 32
|
||||
for (int i = 0; i < n; i++) bucketdata[buf[i] >> 8 & 0xff]++;
|
||||
offset = 0;
|
||||
#pragma GCC unroll 8
|
||||
for (int i = 0; i < 1 << 8; i++) {
|
||||
int d = bucketdata[i];
|
||||
bucketdata[i] = offset;
|
||||
offset += d;
|
||||
}
|
||||
#pragma GCC unroll 64
|
||||
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 8 & 0xff]++] = buf[i];
|
||||
// pass 3: sort by 3rd 8 bits.
|
||||
memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
|
||||
my_memset(bucketdata);
|
||||
#pragma GCC unroll 32
|
||||
for (int i = 0; i < n; i++) bucketdata[a[i] >> 16 & 0xff]++;
|
||||
offset = 0;
|
||||
#pragma GCC unroll 8
|
||||
for (int i = 0; i < 1 << 8; i++) {
|
||||
int d = bucketdata[i];
|
||||
bucketdata[i] = offset;
|
||||
offset += d;
|
||||
}
|
||||
#pragma GCC unroll 64
|
||||
for (int i = 0; i < n; i++) buf[bucketdata[a[i] >> 16 & 0xff]++] = a[i];
|
||||
// pass 4: sort by 4th 8 bits.
|
||||
memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
|
||||
my_memset(bucketdata);
|
||||
#pragma GCC unroll 32
|
||||
for (int i = 0; i < n; i++) bucketdata[buf[i] >> 24 & 0xff]++;
|
||||
offset = 0;
|
||||
#pragma GCC unroll 8
|
||||
for (int i = 0; i < 1 << 8; i++) {
|
||||
int d = bucketdata[i];
|
||||
bucketdata[i] = offset;
|
||||
offset += d;
|
||||
}
|
||||
#pragma GCC unroll 32
|
||||
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i];
|
||||
free(buf);
|
||||
}
|
||||
|
||||
void vsort(uint32_t *a, int n) {
|
||||
thread_local std::vector<uint32_t> bts[256];
|
||||
#pragma GCC unroll 4
|
||||
for (int shift = 0; shift < 32; shift += 8) {
|
||||
#pragma GCC unroll 64
|
||||
for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]);
|
||||
#pragma GCC unroll 64
|
||||
for (int bt = 0, k = 0; bt < 256; bt++) {
|
||||
memcpy(a + k, &bts[bt][0], bts[bt].size() * sizeof(a[0]));
|
||||
k += bts[bt].size();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user