added ypsu-variants of radix-like things
This commit is contained in:
parent
a044787846
commit
520db7049d
209
ypsu.cpp
209
ypsu.cpp
@ -10,6 +10,7 @@
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <numeric>
|
||||
#include "ska_sort.hpp"
|
||||
|
||||
|
||||
@ -78,6 +79,195 @@
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
// Sorts a[0..n) in ascending order with a least-significant-digit radix sort
// that uses two 16-bit digits.
//
// Each pass is a stable counting sort: count how often every digit value
// occurs, turn the counts into starting offsets with an exclusive prefix sum,
// then scatter the elements to their slots. Pass 1 moves a -> buf keyed on the
// low 16 bits, pass 2 moves buf -> a keyed on the high 16 bits, so the sorted
// result ends up back in `a`.
//
// a: array of n elements, sorted in place.
// n: element count; n * sizeof(uint32_t) must fit in an int (asserted).
//
// Needs a temporary buffer of n elements and 65536 int counters. Aborts if the
// temporary buffer cannot be allocated.
void twopass(uint32_t *a, int n) {
  assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
  // Zero or one element is already sorted. Returning here also keeps n == 0
  // away from malloc(), which is allowed to return NULL for a zero size.
  if (n <= 1) return;
  // alloc helper buffers.
  std::vector<int> bucketdata(1 << 16);
  uint32_t *buf = static_cast<uint32_t *>(malloc(size_t(n) * sizeof(a[0])));
  // Checked unconditionally: an assert would vanish under NDEBUG and the
  // scatter below would then write through a null pointer.
  if (buf == NULL) {
    fputs("twopass: out of memory\n", stderr);
    abort();
  }
  // pass 1: sort by lower 16 bits.
  for (int i = 0; i < n; i++) bucketdata[a[i] & 0xffff]++;
  int offset = 0;
  for (int i = 0; i < 1 << 16; i++) {
    // replace each count with the index of the bucket's first slot.
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xffff]++] = a[i];
  // pass 2: sort by upper 16 bits.
  bucketdata.assign(bucketdata.size(), 0);
  for (int i = 0; i < n; i++) bucketdata[buf[i] >> 16]++;
  offset = 0;
  for (int i = 0; i < 1 << 16; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 16]++] = buf[i];
  free(buf);
}
|
||||
// Sorts a[0..n) in ascending order with a least-significant-digit radix sort
// that uses four 8-bit digits.
//
// Every pass is a stable counting sort on one byte (count, exclusive prefix
// sum, scatter) that moves the data from `src` to `dst`; the two pointers
// trade places after each pass. Because the number of passes is even, the
// fully sorted data ends up in `a` again.
//
// a: array of n elements, sorted in place.
// n: element count; n * sizeof(uint32_t) must fit in an int (asserted).
//
// Needs a temporary buffer of n elements. Aborts if it cannot be allocated.
void fourpass(uint32_t *a, int n) {
  assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
  // Zero or one element is already sorted. Returning here also keeps n == 0
  // away from malloc(), which is allowed to return NULL for a zero size.
  if (n <= 1) return;
  // alloc helper buffers.
  std::vector<int> bucketdata(1 << 8);
  uint32_t *buf = static_cast<uint32_t *>(malloc(size_t(n) * sizeof(a[0])));
  // Checked unconditionally: an assert would vanish under NDEBUG and the
  // scatter below would then write through a null pointer.
  if (buf == NULL) {
    fputs("fourpass: out of memory\n", stderr);
    abort();
  }
  uint32_t *src = a, *dst = buf;
  for (int shift = 0; shift < 32; shift += 8) {
    bucketdata.assign(bucketdata.size(), 0);
    for (int i = 0; i < n; i++) bucketdata[src[i] >> shift & 0xff]++;
    int offset = 0;
    for (int i = 0; i < 1 << 8; i++) {
      // replace each count with the index of the bucket's first slot.
      int d = bucketdata[i];
      bucketdata[i] = offset;
      offset += d;
    }
    for (int i = 0; i < n; i++) {
      dst[bucketdata[src[i] >> shift & 0xff]++] = src[i];
    }
    // the output of this pass is the input of the next one.
    uint32_t *tmp = src;
    src = dst;
    dst = tmp;
  }
  free(buf);
}
|
||||
// hand-unrolled fourpass.
//
// Sorts a[0..n) in ascending order with four stable counting-sort passes, one
// per byte starting at the least significant one. The passes are written out
// one after another instead of looping, and they alternate direction
// (a -> buf, buf -> a, a -> buf, buf -> a) so the result finishes in `a`.
//
// a: array of n elements, sorted in place.
// n: element count; n * sizeof(uint32_t) must fit in an int (asserted).
//
// Needs a temporary buffer of n elements. Aborts if it cannot be allocated.
void fourpassu(uint32_t *a, int n) {
  assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
  // Zero or one element is already sorted. Returning here also keeps n == 0
  // away from malloc(), which is allowed to return NULL for a zero size.
  if (n <= 1) return;
  // alloc helper buffers.
  std::vector<int> bucketdata(1 << 8);
  uint32_t *buf = static_cast<uint32_t *>(malloc(size_t(n) * sizeof(a[0])));
  // Checked unconditionally: an assert would vanish under NDEBUG and the
  // scatter below would then write through a null pointer.
  if (buf == NULL) {
    fputs("fourpassu: out of memory\n", stderr);
    abort();
  }
  // pass 1: sort by lower 8 bits.
  for (int i = 0; i < n; i++) bucketdata[a[i] & 0xff]++;
  int offset = 0;
  for (int i = 0; i < 1 << 8; i++) {
    // replace each count with the index of the bucket's first slot.
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xff]++] = a[i];
  // pass 2: sort by 2nd 8 bits.
  bucketdata.assign(bucketdata.size(), 0);
  for (int i = 0; i < n; i++) bucketdata[buf[i] >> 8 & 0xff]++;
  offset = 0;
  for (int i = 0; i < 1 << 8; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 8 & 0xff]++] = buf[i];
  // pass 3: sort by 3rd 8 bits.
  bucketdata.assign(bucketdata.size(), 0);
  for (int i = 0; i < n; i++) bucketdata[a[i] >> 16 & 0xff]++;
  offset = 0;
  for (int i = 0; i < 1 << 8; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  for (int i = 0; i < n; i++) buf[bucketdata[a[i] >> 16 & 0xff]++] = a[i];
  // pass 4: sort by 4th 8 bits.
  bucketdata.assign(bucketdata.size(), 0);
  for (int i = 0; i < n; i++) bucketdata[buf[i] >> 24 & 0xff]++;
  offset = 0;
  for (int i = 0; i < 1 << 8; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i];
  free(buf);
}
|
||||
// Sorts a[0..n) in ascending order with four byte-wise passes, least
// significant byte first, using one growable vector per byte value as the
// bucket.
//
// Each pass appends every element to the bucket selected by the current byte
// and then copies the buckets back into `a` in bucket order, which keeps
// elements with equal digits in their previous relative order.
//
// a: array of n elements, sorted in place.
// n: element count.
//
// The buckets are static thread_local, so they are cleared but not released
// between passes and between calls: their capacity is reused by later calls
// on the same thread.
void vsort(uint32_t *a, int n) {
  static thread_local std::vector<uint32_t> bts[256];
  for (int shift = 0; shift < 32; shift += 8) {
    for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]);
    uint32_t *out = a;
    for (auto &bt : bts) {
      // Skip empty buckets: indexing element 0 of an empty vector is
      // undefined, and there is nothing to copy anyway.
      if (bt.empty()) continue;
      memcpy(out, bt.data(), bt.size() * sizeof(a[0]));
      out += bt.size();
      bt.clear();
    }
  }
}
|
||||
// Sorts a[0..n) in ascending order with four byte-wise passes, least
// significant byte first, where every bucket is a chain of fixed-size pages
// taken from one shared pool.
//
// A bucket records its element count plus its first and last page; `next[p]`
// holds the page that follows page p inside a chain. A bucket always owns a
// writable last page: as soon as that page fills up, a fresh one is linked in.
// Two sets of 256 buckets are used in turn as input and output. Pages of an
// input bucket go back to the free list as soon as they have been read
// completely, so the pool stays close to the size of the data.
//
// a: array of n elements, sorted in place.
// n: element count.
//
// Aborts if the page pool cannot be allocated.
void pagedsort(uint32_t *a, int n) {
  // zero or one element is already sorted.
  if (n <= 1) return;
  enum { pagesize = 1024, nbuckets = 256 };
  // Pool size: the pages needed for the data itself, one spare page for each
  // bucket of the input and the output set, plus one more. The last page is
  // needed because an output bucket grabs its next page before the input page
  // that was just drained is returned; without it the free list runs dry when
  // n is a multiple of pagesize and every bucket holds whole pages only.
  const int pagecount = (n - 1) / pagesize + 1 + 2 * nbuckets + 1;
  std::vector<int> freelist(pagecount);
  std::vector<int> next(pagecount);
  std::iota(freelist.begin(), freelist.end(), 0);
  // Sizes and offsets are computed as size_t so they cannot overflow int for
  // large n.
  uint32_t *pd = static_cast<uint32_t *>(
      malloc(size_t(pagecount) * pagesize * sizeof(a[0])));
  if (pd == NULL) {
    fputs("pagedsort: out of memory\n", stderr);
    abort();
  }
  struct bucket {
    int len;
    int headpage, lastpage;
  };
  bucket bts[2 * nbuckets];
  // pops an unused page off the free list.
  auto takepage = [&freelist]() -> int {
    assert(!freelist.empty());
    int p = freelist.back();
    freelist.pop_back();
    return p;
  };
  // makes the 256 buckets starting at `base` empty, one fresh page each.
  auto resetbuckets = [&](int base) {
    for (int bt = 0; bt < nbuckets; bt++) {
      int p = takepage();
      bts[base + bt] = {0, p, p};
    }
  };
  // appends v to bt and links in a new page once the last one is full.
  auto append = [&](bucket *bt, uint32_t v) {
    pd[size_t(bt->lastpage) * pagesize + bt->len % pagesize] = v;
    bt->len++;
    if (bt->len % pagesize == 0) {
      int p = takepage();
      next[bt->lastpage] = p;
      bt->lastpage = p;
    }
  };
  // initial scatter.
  resetbuckets(0);
  for (int i = 0; i < n; i++) append(&bts[a[i] & 0xff], a[i]);
  // intermediate level scatters.
  int ibase = 0, obase = nbuckets;
  for (int shift = 8; shift < 32; shift += 8) {
    resetbuckets(obase);
    for (int ibti = 0; ibti < nbuckets; ibti++) {
      bucket *ibt = &bts[ibase + ibti];
      int page = ibt->headpage;
      for (int i = 0; i < ibt->len; i++) {
        uint32_t v = pd[size_t(page) * pagesize + i % pagesize];
        append(&bts[obase + (v >> shift & 0xff)], v);
        if (i % pagesize == pagesize - 1) {
          // this input page is fully consumed: recycle it and move on.
          freelist.push_back(page);
          page = next[page];
        }
      }
      // the last page of a chain is never full, so the loop above has not
      // released it yet.
      freelist.push_back(ibt->lastpage);
    }
    // the buckets just written become the input of the next pass.
    int tmp = ibase;
    ibase = obase;
    obase = tmp;
  }
  // the final gather.
  int k = 0;
  for (int ibti = 0; ibti < nbuckets; ibti++) {
    bucket *ibt = &bts[ibase + ibti];
    int page = ibt->headpage;
    for (int i = 0; i < ibt->len; i++) {
      a[k++] = pd[size_t(page) * pagesize + i % pagesize];
      if (i % pagesize == pagesize - 1) {
        page = next[page];
      }
    }
  }
  free(pd);
}
|
||||
|
||||
int main(void) {
|
||||
int n = 100000000;
|
||||
for (auto inputtype : inputtypes) {
|
||||
@ -101,6 +291,25 @@
|
||||
w = v;
|
||||
measure(inputtype, "magyar", [&] { MagyarSort::sort(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
|
||||
/*
|
||||
w = v;
|
||||
measure(inputtype, "2pass", [&] { twopass(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
w = v;
|
||||
measure(inputtype, "4pass", [&] { fourpass(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
w = v;
|
||||
measure(inputtype, "psort", [&] { pagedsort(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
*/
|
||||
w = v;
|
||||
measure(inputtype, "4pasu", [&] { fourpassu(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
w = v;
|
||||
measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
|
||||
for (auto r : results) printf("%9.3fs", r.second);
|
||||
puts("");
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user