Optimize ypsu's fourpassu method a bit
This commit is contained in:
parent
a947cda58d
commit
a878f20100
38
ypsu.cpp
38
ypsu.cpp
@ -134,17 +134,34 @@
|
|||||||
}
|
}
|
||||||
free(buf);
|
free(buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reset every element of `v` to zero while keeping its size unchanged.
 *
 * One std::fill pass over the existing elements is sufficient. Clearing and
 * re-growing the vector first would value-initialize the elements to zero and
 * then zero them again, so that extra work is deliberately avoided here. The
 * vector's storage is reused as-is; no reallocation takes place.
 *
 * NOTE(review): this helper was introduced on the suspicion that a raw memset
 * over the vector's data misbehaves with static/thread_local vectors. That
 * suspicion is unverified -- confirm before replacing this with memset.
 *
 * @param v vector whose elements are overwritten with 0; may be empty.
 */
void my_memset(std::vector<int> &v) {
  std::fill(v.begin(), v.end(), 0);
}
|
||||||
|
|
||||||
// hand-unrolled fourpass.
|
// hand-unrolled fourpass.
|
||||||
void fourpassu(uint32_t *a, int n) {
|
void fourpassu(uint32_t *a, int n) {
|
||||||
assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
|
assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
|
||||||
// alloc helper buffers.
|
// alloc helper buffers.
|
||||||
int sz = n * sizeof(a[0]);
|
int sz = n * sizeof(a[0]);
|
||||||
std::vector<int> bucketdata(1 << 8);
|
|
||||||
|
//std::vector<int> bucketdata(1 << 8);
|
||||||
|
// Below code crashes :-(
|
||||||
|
static std::vector<int> bucketdata(1 << 8);
|
||||||
|
bucketdata.resize(1 << 8); // JHapak's "pattern"
|
||||||
|
my_memset(bucketdata);
|
||||||
|
|
||||||
uint32_t *buf = (uint32_t *)malloc(sz);
|
uint32_t *buf = (uint32_t *)malloc(sz);
|
||||||
assert(buf != NULL);
|
assert(buf != NULL);
|
||||||
// pass 1: sort by lower 8 bits.
|
// pass 1: sort by lower 8 bits.
|
||||||
|
#pragma GCC unroll 32
|
||||||
for (int i = 0; i < n; i++) bucketdata[a[i] & 0xff]++;
|
for (int i = 0; i < n; i++) bucketdata[a[i] & 0xff]++;
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
|
#pragma GCC unroll 8
|
||||||
for (int i = 0; i < 1 << 8; i++) {
|
for (int i = 0; i < 1 << 8; i++) {
|
||||||
int d = bucketdata[i];
|
int d = bucketdata[i];
|
||||||
bucketdata[i] = offset;
|
bucketdata[i] = offset;
|
||||||
@ -152,41 +169,54 @@
|
|||||||
}
|
}
|
||||||
for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xff]++] = a[i];
|
for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xff]++] = a[i];
|
||||||
// pass 2: sort by 2nd 8 bits.
|
// pass 2: sort by 2nd 8 bits.
|
||||||
memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
|
my_memset(bucketdata);
|
||||||
|
#pragma GCC unroll 32
|
||||||
for (int i = 0; i < n; i++) bucketdata[buf[i] >> 8 & 0xff]++;
|
for (int i = 0; i < n; i++) bucketdata[buf[i] >> 8 & 0xff]++;
|
||||||
offset = 0;
|
offset = 0;
|
||||||
|
#pragma GCC unroll 8
|
||||||
for (int i = 0; i < 1 << 8; i++) {
|
for (int i = 0; i < 1 << 8; i++) {
|
||||||
int d = bucketdata[i];
|
int d = bucketdata[i];
|
||||||
bucketdata[i] = offset;
|
bucketdata[i] = offset;
|
||||||
offset += d;
|
offset += d;
|
||||||
}
|
}
|
||||||
|
#pragma GCC unroll 64
|
||||||
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 8 & 0xff]++] = buf[i];
|
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 8 & 0xff]++] = buf[i];
|
||||||
// pass 3: sort by 3rd 8 bits.
|
// pass 3: sort by 3rd 8 bits.
|
||||||
memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
|
my_memset(bucketdata);
|
||||||
|
#pragma GCC unroll 32
|
||||||
for (int i = 0; i < n; i++) bucketdata[a[i] >> 16 & 0xff]++;
|
for (int i = 0; i < n; i++) bucketdata[a[i] >> 16 & 0xff]++;
|
||||||
offset = 0;
|
offset = 0;
|
||||||
|
#pragma GCC unroll 8
|
||||||
for (int i = 0; i < 1 << 8; i++) {
|
for (int i = 0; i < 1 << 8; i++) {
|
||||||
int d = bucketdata[i];
|
int d = bucketdata[i];
|
||||||
bucketdata[i] = offset;
|
bucketdata[i] = offset;
|
||||||
offset += d;
|
offset += d;
|
||||||
}
|
}
|
||||||
|
#pragma GCC unroll 64
|
||||||
for (int i = 0; i < n; i++) buf[bucketdata[a[i] >> 16 & 0xff]++] = a[i];
|
for (int i = 0; i < n; i++) buf[bucketdata[a[i] >> 16 & 0xff]++] = a[i];
|
||||||
// pass 4: sort by 4th 8 bits.
|
// pass 4: sort by 4th 8 bits.
|
||||||
memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
|
my_memset(bucketdata);
|
||||||
|
#pragma GCC unroll 32
|
||||||
for (int i = 0; i < n; i++) bucketdata[buf[i] >> 24 & 0xff]++;
|
for (int i = 0; i < n; i++) bucketdata[buf[i] >> 24 & 0xff]++;
|
||||||
offset = 0;
|
offset = 0;
|
||||||
|
#pragma GCC unroll 8
|
||||||
for (int i = 0; i < 1 << 8; i++) {
|
for (int i = 0; i < 1 << 8; i++) {
|
||||||
int d = bucketdata[i];
|
int d = bucketdata[i];
|
||||||
bucketdata[i] = offset;
|
bucketdata[i] = offset;
|
||||||
offset += d;
|
offset += d;
|
||||||
}
|
}
|
||||||
|
#pragma GCC unroll 32
|
||||||
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i];
|
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i];
|
||||||
free(buf);
|
free(buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
void vsort(uint32_t *a, int n) {
|
void vsort(uint32_t *a, int n) {
|
||||||
thread_local std::vector<uint32_t> bts[256];
|
thread_local std::vector<uint32_t> bts[256];
|
||||||
|
#pragma GCC unroll 4
|
||||||
for (int shift = 0; shift < 32; shift += 8) {
|
for (int shift = 0; shift < 32; shift += 8) {
|
||||||
|
#pragma GCC unroll 64
|
||||||
for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]);
|
for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]);
|
||||||
|
#pragma GCC unroll 64
|
||||||
for (int bt = 0, k = 0; bt < 256; bt++) {
|
for (int bt = 0, k = 0; bt < 256; bt++) {
|
||||||
memcpy(a + k, &bts[bt][0], bts[bt].size() * sizeof(a[0]));
|
memcpy(a + k, &bts[bt][0], bts[bt].size() * sizeof(a[0]));
|
||||||
k += bts[bt].size();
|
k += bts[bt].size();
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user