magyarsort/ypsu.cpp
2021-12-15 12:52:33 +01:00

325 lines
10 KiB
C++

#include <algorithm>
#include <cassert>
#include <chrono>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <map>
#include <numeric>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "ska_sort.hpp"
#define MAGYAR_SORT_DEFAULT_REUSE
#include "magyarsort.h"
// Timing in seconds of the most recent run of each algorithm, keyed by the
// algorithm name passed to measure(). Overwritten on every call.
std::map<std::string, double> results;
// Slowest timing in seconds seen so far for each algorithm name.
std::map<std::string, double> worst;

// Runs `f` exactly once and records how long it took, in seconds, under
// `name`: `results[name]` gets this run's duration and `worst[name]` keeps
// the maximum over all runs with that name.
//
// `inputtype` is not used for bookkeeping; timings are keyed by `name` only.
void measure(const std::string &inputtype, const std::string &name,
             std::function<void()> f) {
  (void)inputtype;
  // steady_clock is monotonic, so the interval cannot go backwards or jump
  // if the system time is adjusted while `f` runs.
  const auto begin = std::chrono::steady_clock::now();
  f();
  const auto elapsed = std::chrono::steady_clock::now() - begin;
  // Convert directly to floating-point seconds instead of first truncating
  // to a whole number of milliseconds.
  const double seconds = std::chrono::duration<double>(elapsed).count();
  results[name] = seconds;
  double &slowest = worst[name];  // default-inserted as 0.0 on first use.
  slowest = std::max(slowest, seconds);
}
// Names of the input distributions the benchmark loops over, in the order
// they are run. Each entry is a `type` string handled by geninput().
std::vector<std::string> inputtypes{
    "constant",
    "asc",
    "desc",
    "ascasc",
    "ascdesc",
    "descasc",
    "descdesc",
    "smallrange",
    "rand",
};
// Builds an `n`-element test vector whose shape is selected by `type`:
//   "constant"   - every element is the same rand() value.
//   "asc"        - 0, 1, ..., n-1.
//   "desc"       - n, n-1, ..., 1.
//   "ascasc" / "ascdesc" / "descasc" / "descdesc"
//                - two runs of n/2 elements each; an ascending run counts
//                  k = 0, 1, ... and a descending run counts n - k. When n
//                  is odd the final element is left at 0.
//   "smallrange" - rand()/2 plus a fresh offset in [0, 100) per element.
//   "rand"       - one rand() value per element.
// Any other `type` yields `n` zeros.
std::vector<uint32_t> geninput(const std::string &type, int n) {
  std::vector<uint32_t> out(n);
  const int half = n / 2;

  // Writes the two half-length runs; each flag says whether that run rises
  // (value k) or falls (value n - k).
  const auto two_runs = [&](bool first_up, bool second_up) {
    for (int k = 0; k < half; ++k) {
      out[k] = first_up ? k : n - k;
      out[k + half] = second_up ? k : n - k;
    }
  };

  if (type == "constant") {
    const int value = rand();
    std::fill(out.begin(), out.end(), static_cast<uint32_t>(value));
  } else if (type == "asc") {
    std::iota(out.begin(), out.end(), uint32_t{0});
  } else if (type == "desc") {
    uint32_t countdown = static_cast<uint32_t>(n);
    for (uint32_t &slot : out) {
      slot = countdown--;
    }
  } else if (type == "ascasc") {
    two_runs(true, true);
  } else if (type == "ascdesc") {
    two_runs(true, false);
  } else if (type == "descasc") {
    two_runs(false, true);
  } else if (type == "descdesc") {
    two_runs(false, false);
  } else if (type == "smallrange") {
    const int base = rand() / 2;
    for (uint32_t &slot : out) {
      slot = base + rand() % 100;
    }
  } else if (type == "rand") {
    for (uint32_t &slot : out) {
      slot = rand();
    }
  }
  return out;
}
// Sorts a[0..n) ascending with an LSD radix sort in two passes of 16 bits
// each: first by the lower half-word into a temporary buffer, then by the
// upper half-word back into `a`. Each pass is a counting sort (histogram,
// exclusive prefix sum, scatter).
//
// Requires n * sizeof(uint32_t) <= INT_MAX. A non-positive `n` is a no-op.
void twopass(uint32_t *a, int n) {
  // Nothing to sort. This also avoids malloc(0), which is allowed to return
  // NULL and would trip the allocation assert below.
  if (n <= 0) return;
  assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
  // alloc helper buffers.
  int sz = n * sizeof(a[0]);
  std::vector<int> bucketdata(1 << 16);
  uint32_t *buf = (uint32_t *)malloc(sz);
  assert(buf != NULL);
  // pass 1: sort by lower 16 bits.
  for (int i = 0; i < n; i++) bucketdata[a[i] & 0xffff]++;
  // turn the counts into the start offset of each bucket.
  int offset = 0;
  for (int i = 0; i < 1 << 16; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xffff]++] = a[i];
  // pass 2: sort by upper 16 bits.
  memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
  for (int i = 0; i < n; i++) bucketdata[buf[i] >> 16]++;
  offset = 0;
  for (int i = 0; i < 1 << 16; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 16]++] = buf[i];
  free(buf);
}
// Sorts a[0..n) ascending with an LSD radix sort in four passes of 8 bits
// each, ping-ponging between `a` and a temporary buffer of the same size.
// After the fourth pass the data is back in `a`.
//
// Requires n * sizeof(uint32_t) <= INT_MAX. A non-positive `n` is a no-op.
void fourpass(uint32_t *a, int n) {
  // Nothing to sort. This also avoids malloc(0), which is allowed to return
  // NULL and would trip the allocation assert below.
  if (n <= 0) return;
  assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
  // alloc helper buffers.
  int sz = n * sizeof(a[0]);
  std::vector<int> bucketdata(1 << 8);
  uint32_t *buf = (uint32_t *)malloc(sz);
  assert(buf != NULL);
  uint32_t *src = a;
  uint32_t *dst = buf;
  for (int shift = 0; shift < 32; shift += 8) {
    // histogram of the current byte.
    memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
    for (int i = 0; i < n; i++) bucketdata[src[i] >> shift & 0xff]++;
    // turn the counts into the start offset of each bucket.
    int offset = 0;
    for (int i = 0; i < 1 << 8; i++) {
      int d = bucketdata[i];
      bucketdata[i] = offset;
      offset += d;
    }
    // stable scatter into the other buffer.
    for (int i = 0; i < n; i++) {
      dst[bucketdata[src[i] >> shift & 0xff]++] = src[i];
    }
    // the freshly written buffer is the input of the next pass.
    std::swap(src, dst);
  }
  free(buf);
}
// hand-unrolled fourpass.
//
// Sorts a[0..n) ascending with an LSD radix sort in four 8-bit passes, each
// written out explicitly: a -> buf, buf -> a, a -> buf, buf -> a. Every pass
// is a counting sort (histogram, exclusive prefix sum, scatter).
//
// Requires n * sizeof(uint32_t) <= INT_MAX. A non-positive `n` is a no-op.
void fourpassu(uint32_t *a, int n) {
  // Nothing to sort. This also avoids malloc(0), which is allowed to return
  // NULL and would trip the allocation assert below.
  if (n <= 0) return;
  assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
  // alloc helper buffers.
  int sz = n * sizeof(a[0]);
  std::vector<int> bucketdata(1 << 8);
  uint32_t *buf = (uint32_t *)malloc(sz);
  assert(buf != NULL);
  // pass 1: sort by lower 8 bits.
  for (int i = 0; i < n; i++) bucketdata[a[i] & 0xff]++;
  int offset = 0;
  for (int i = 0; i < 1 << 8; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xff]++] = a[i];
  // pass 2: sort by 2nd 8 bits.
  memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
  for (int i = 0; i < n; i++) bucketdata[buf[i] >> 8 & 0xff]++;
  offset = 0;
  for (int i = 0; i < 1 << 8; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 8 & 0xff]++] = buf[i];
  // pass 3: sort by 3rd 8 bits.
  memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
  for (int i = 0; i < n; i++) bucketdata[a[i] >> 16 & 0xff]++;
  offset = 0;
  for (int i = 0; i < 1 << 8; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  for (int i = 0; i < n; i++) buf[bucketdata[a[i] >> 16 & 0xff]++] = a[i];
  // pass 4: sort by 4th 8 bits.
  memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
  for (int i = 0; i < n; i++) bucketdata[buf[i] >> 24 & 0xff]++;
  offset = 0;
  for (int i = 0; i < 1 << 8; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i];
  free(buf);
}
// Sorts a[0..n) ascending with an LSD radix sort in four 8-bit passes. Each
// pass appends every element to one of 256 vectors chosen by the current
// byte, then copies the vectors back into `a` in bucket order.
//
// The buckets are static thread_local and only clear()ed between passes, so
// they persist across calls on the same thread.
void vsort(uint32_t *a, int n) {
  static thread_local std::vector<uint32_t> bts[256];
  for (int shift = 0; shift < 32; shift += 8) {
    for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]);
    int k = 0;
    for (std::vector<uint32_t> &bucket : bts) {
      // Skip empty buckets: indexing element 0 of an empty vector is
      // undefined behaviour, and there is nothing to copy anyway.
      if (bucket.empty()) continue;
      memcpy(a + k, bucket.data(), bucket.size() * sizeof(a[0]));
      k += static_cast<int>(bucket.size());
      bucket.clear();
    }
  }
}
// Sorts a[0..n) ascending with an LSD radix sort in four 8-bit passes whose
// buckets are linked lists of fixed-size pages instead of contiguous ranges.
//
// Storage layout:
//   pd       - one allocation holding `pagecount` pages of `pagesize` values.
//   freelist - stack of page numbers currently not owned by any bucket.
//   next     - next[p] is the page that follows page p inside a bucket.
//   bts      - two banks of 256 buckets (indices 0..255 and 256..511); one
//              bank is read while the other is filled, swapping every pass.
//
// A bucket takes a fresh page as soon as its length reaches a multiple of
// `pagesize`, so it always owns len / pagesize + 1 pages.
//
// Requires n * sizeof(uint32_t) <= INT_MAX. A non-positive `n` is a no-op.
void pagedsort(uint32_t *a, int n) {
  enum { pagesize = 1024 };
  if (n <= 0) return;
  assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
  // Pages needed: one per full page of data, plus one (possibly empty) tail
  // page for each of the 256 input and 256 output buckets, plus one slack
  // page. The slack page is required because an output bucket takes its next
  // page before the input page that was just exhausted goes back on the
  // freelist; without it the freelist underflows when n is a multiple of
  // pagesize and the data is concentrated in one bucket (e.g. n = 1024 equal
  // values).
  int pagecount = (n + pagesize - 1) / pagesize + 512 + 1;
  uint32_t *pd =
      (uint32_t *)malloc(size_t(pagecount) * pagesize * sizeof(a[0]));
  assert(pd != NULL);
  std::vector<int> freelist(pagecount);
  std::vector<int> next(pagecount);
  std::iota(std::begin(freelist), std::end(freelist), 0);
  // Pops one unused page number off the freelist.
  auto takepage = [&freelist]() {
    assert(!freelist.empty());
    int p = freelist.back();
    freelist.pop_back();
    return p;
  };
  struct bucket {
    int len;
    int headpage, lastpage;
  };
  bucket bts[512];
  // initial scatter.
  for (int bt = 0; bt < 256; bt++) {
    int p = takepage();
    bts[bt] = {0, p, p};
  }
  for (int i = 0; i < n; i++) {
    bucket *bt = &bts[a[i] & 0xff];
    pd[bt->lastpage * pagesize + bt->len++ % pagesize] = a[i];
    if (bt->len % pagesize == 0) {
      // last page is full: chain a new one behind it.
      int p = takepage();
      next[bt->lastpage] = p;
      bt->lastpage = p;
    }
  }
  // intermediate level scatters.
  int ibase = 0, obase = 256;
  for (int shift = 8; shift < 32; shift += 8) {
    for (int bt = 0; bt < 256; bt++) {
      int p = takepage();
      bts[obase + bt] = {0, p, p};
    }
    for (int ibti = 0; ibti < 256; ibti++) {
      struct bucket *ibt = &bts[ibase + ibti];
      int page = ibt->headpage;
      for (int i = 0; i < ibt->len; i++) {
        uint32_t v = pd[page * pagesize + i % pagesize];
        struct bucket *obt = &bts[obase + (v >> shift & 0xff)];
        pd[obt->lastpage * pagesize + obt->len++ % pagesize] = v;
        if (obt->len % pagesize == 0) {
          int p = takepage();
          next[obt->lastpage] = p;
          obt->lastpage = p;
        }
        if (i % pagesize == pagesize - 1) {
          // input page fully consumed: recycle it and follow the chain.
          freelist.push_back(page);
          page = next[page];
        }
      }
      // the tail page (partially filled or empty) was not recycled above.
      freelist.push_back(ibt->lastpage);
    }
    // swap the roles of the two bucket banks.
    ibase = 256 - ibase;
    obase = 256 - obase;
  }
  // the final gather.
  int k = 0;
  for (int ibti = 0; ibti < 256; ibti++) {
    struct bucket *ibt = &bts[ibase + ibti];
    int page = ibt->headpage;
    for (int i = 0; i < ibt->len; i++) {
      a[k++] = pd[page * pagesize + i % pagesize];
      if (i % pagesize == pagesize - 1) {
        page = next[page];
      }
    }
  }
  free(pd);
}
// Benchmark driver. For every entry of `inputtypes` it generates one input
// of `n` elements, times each sorting routine on a fresh copy of it via
// measure(), checks the output against std::sort's result, and prints one
// row of timings. A final "worst" row and a row of algorithm names follow.
//
// Columns come out in the iteration order of the `results` / `worst` maps
// (ordered by algorithm name), not in the order the algorithms are run.
int main(void) {
  const int n = 100000000;
  for (const auto &inputtype : inputtypes) {
    printf("%10s", inputtype.c_str());
    fflush(stdout);
    const std::vector<uint32_t> v = geninput(inputtype, n);
    // w and expected are sized up front so that the timed `w = v` below
    // measures only the copy, not the allocation.
    std::vector<uint32_t> w(n), expected(n);
    measure(inputtype, "copy", [&] { w = v; });
    w = v;
    measure(inputtype, "std", [&] { std::sort(std::begin(w), std::end(w)); });
    expected = w;  // reference result every other sort is checked against.
    w = v;
    measure(inputtype, "ska", [&] { ska_sort(std::begin(w), std::end(w)); });
    assert(w == expected);
    w = v;
    measure(inputtype, "ska_copy", [&] {
      std::vector<uint32_t> buf(w.size());
      // presumably a true return means the sorted data ended up in buf
      // rather than in w; verify against ska_sort.hpp.
      if (ska_sort_copy(std::begin(w), std::end(w), std::begin(buf))) {
        w.swap(buf);
      }
    });
    assert(w == expected);
    w = v;
    measure(inputtype, "magyar", [&] { MagyarSort::sort(&w[0], w.size()); });
    assert(w == expected);
    /*
    w = v;
    measure(inputtype, "2pass", [&] { twopass(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "4pass", [&] { fourpass(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "psort", [&] { pagedsort(&w[0], w.size()); });
    assert(w == expected);
    */
    w = v;
    measure(inputtype, "4pasu", [&] { fourpassu(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); });
    assert(w == expected);
    for (const auto &timing : results) printf("%9.3fs", timing.second);
    puts("");
  }
  puts("");
  printf("%10s", "worst");
  for (const auto &entry : worst) printf("%9.3fs", entry.second);
  puts("");
  // column captions, in the same map order as the timing rows above.
  printf("%10s", "");
  for (const auto &entry : worst) printf("%10s", entry.first.c_str());
  puts("");
  return 0;
}