Small optimization to GPT's and my space-partitioning bucket sort

This commit is contained in:
Richard Thier 2022-12-10 11:04:53 +01:00
parent 7e21807668
commit 50b1997d5c
3 changed files with 485 additions and 442 deletions

View File

@ -2,6 +2,7 @@
#include <vector> #include <vector>
#include <algorithm> #include <algorithm>
// ChatGPT and I wrote this space-partitioning bucket sort
void gpt_bucket_sort(uint32_t* array, int n) { void gpt_bucket_sort(uint32_t* array, int n) {
// Calculate the number of buckets to use // Calculate the number of buckets to use
int num_buckets = std::sqrt(n); int num_buckets = std::sqrt(n);
@ -36,3 +37,40 @@ void gpt_bucket_sort(uint32_t* array, int n) {
} }
} }
} }
// Further optimizations (no chatGPT)
//
// Sorts array[0..n) in ascending order, in place, with a value-range bucket
// sort: the span [min, max] of the input is cut into about sqrt(n) equally
// wide buckets, every element is appended to the bucket covering its value,
// each bucket is sorted with std::sort, and the buckets are written back in
// order.
//
// array: pointer to the first of n elements; not read when n < 2.
// n:     number of elements; values below 2 leave the array untouched.
void my_bucket_sort(uint32_t* array, int n) {
    // Nothing to do for 0 or 1 elements. Returning early also avoids using
    // zero buckets (division by zero below) and dereferencing the result of
    // minmax_element on an empty range.
    if (n < 2) {
        return;
    }

    // Calculate the number of buckets to use (at least 1 because n >= 2)
    const int num_buckets = static_cast<int>(std::sqrt(n));

    // Create a vector of buckets
    std::vector<std::vector<uint32_t>> buckets(num_buckets);

    // Calculate the range of values that each bucket can hold.
    const auto mm = std::minmax_element(array, array + n);
    const uint32_t min_value = *mm.first;
    const uint32_t max_value = *mm.second;

    // 64-bit arithmetic: max - min + 1 equals 2^32 when the input spans the
    // whole uint32_t range, which would wrap to 0 in 32 bits and send the
    // bucket index far out of bounds.
    const uint64_t range = static_cast<uint64_t>(max_value) - min_value + 1;

    // The "+ 1" makes bucket_size strictly greater than range / num_buckets,
    // so (value - min) / bucket_size is always below num_buckets.
    const uint64_t bucket_size = range / num_buckets + 1;

    // Distribute the elements of the array into the buckets
    for (int i = 0; i < n; i++) {
        const uint64_t bucket_index = (array[i] - min_value) / bucket_size;
        buckets[bucket_index].push_back(array[i]);
    }

    // Sort each bucket and append it to the output. The buckets hold copies,
    // so overwriting the input array while iterating them is safe.
    uint32_t* out = array;
    for (auto& bucket : buckets) {
        std::sort(bucket.begin(), bucket.end());
        out = std::copy(bucket.begin(), bucket.end(), out);
    }
}

View File

@ -31,5 +31,8 @@ clang_release: test.cpp magyarsort.h
clang_release3: test.cpp magyarsort.h clang_release3: test.cpp magyarsort.h
clang++ test.cpp -DNDEBUG -std=c++17 -O3 -o test.out clang++ test.cpp -DNDEBUG -std=c++17 -O3 -o test.out
clang_release_ypsu: ypsu.cpp magyarsort.h
clang++ ypsu.cpp -DNDEBUG -std=c++17 -O2 -o ypsu.out
clean: test.out clean: test.out
rm test.out rm test.out

886
ypsu.cpp
View File

@ -1,490 +1,492 @@
#include <algorithm> #include <algorithm>
#include <cassert> #include <cassert>
#include <chrono> #include <chrono>
#include <climits> #include <climits>
#include <cstdint> #include <cstdint>
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <functional> #include <functional>
#include <map> #include <map>
#include <set> #include <set>
#include <string> #include <string>
#include <vector> #include <vector>
#include <numeric> #include <numeric>
#include <sys/mman.h> // mlock & munlock #include <sys/mman.h> // mlock & munlock
#include "ska_sort.hpp" #include "ska_sort.hpp"
#include "gptsort.h" #include "gptsort.h"
#define MAGYAR_SORT_DEFAULT_REUSE #define MAGYAR_SORT_DEFAULT_REUSE
#include "magyarsort.h" #include "magyarsort.h"
#include "space_partitioning_sort/spsort.h" #include "space_partitioning_sort/spsort.h"
std::map<std::string, double> results; std::map<std::string, double> results;
std::map<std::string, double> worst; std::map<std::string, double> worst;
void measure(const std::string &inputtype, const std::string &name, void measure(const std::string &inputtype, const std::string &name,
std::function<void()> f) { std::function<void()> f) {
auto begin = std::chrono::high_resolution_clock::now(); auto begin = std::chrono::high_resolution_clock::now();
f(); f();
auto dur = std::chrono::high_resolution_clock::now() - begin; auto dur = std::chrono::high_resolution_clock::now() - begin;
double seconds = dur / std::chrono::milliseconds(1) / 1000.0; double seconds = dur / std::chrono::milliseconds(1) / 1000.0;
results[name] = seconds; results[name] = seconds;
worst[name] = std::max(worst[name], seconds); worst[name] = std::max(worst[name], seconds);
} }
std::vector<std::string> inputtypes = { std::vector<std::string> inputtypes = {
"constant", "asc", "desc", "ascasc", "ascdesc", "constant", "asc", "desc", "ascasc", "ascdesc",
"descasc", "descdesc", "smallrange", "rand", "descasc", "descdesc", "smallrange", "rand",
}; };
std::vector<uint32_t> geninput(const std::string &type, int n) { std::vector<uint32_t> geninput(const std::string &type, int n) {
std::vector<uint32_t> v(n); std::vector<uint32_t> v(n);
if (type == "constant") { if (type == "constant") {
int c = rand(); int c = rand();
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
v[i] = c; v[i] = c;
}
} else if (type == "asc") {
for (int i = 0; i < n; i++) {
v[i] = i;
}
} else if (type == "desc") {
for (int i = 0; i < n; i++) {
v[i] = n - i;
}
} else if (type == "ascasc") {
for (int i = 0; i < n / 2; i++) {
v[i] = i;
v[i + n / 2] = i;
}
} else if (type == "ascdesc") {
for (int i = 0; i < n / 2; i++) {
v[i] = i;
v[i + n / 2] = n - i;
}
} else if (type == "descasc") {
for (int i = 0; i < n / 2; i++) {
v[i] = n - i;
v[i + n / 2] = i;
}
} else if (type == "descdesc") {
for (int i = 0; i < n / 2; i++) {
v[i] = n - i;
v[i + n / 2] = n - i;
}
} else if (type == "smallrange") {
int c = rand() / 2;
for (int i = 0; i < n; i++) {
v[i] = c + rand() % 100;
}
} else if (type == "rand") {
for (int i = 0; i < n; i++) {
v[i] = rand();
}
} }
return v; } else if (type == "asc") {
} for (int i = 0; i < n; i++) {
v[i] = i;
void twopass(uint32_t *a, int n) {
assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
// alloc helper buffers.
int sz = n * sizeof(a[0]);
std::vector<int> bucketdata(1 << 16);
uint32_t *buf = (uint32_t *)malloc(sz);
assert(buf != NULL);
// pass 1: sort by lower 16 bits.
for (int i = 0; i < n; i++) bucketdata[a[i] & 0xffff]++;
int offset = 0;
for (int i = 0; i < 1 << 16; i++) {
int d = bucketdata[i];
bucketdata[i] = offset;
offset += d;
} }
for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xffff]++] = a[i]; } else if (type == "desc") {
// pass 2: sort by upper 16 bits. for (int i = 0; i < n; i++) {
v[i] = n - i;
}
} else if (type == "ascasc") {
for (int i = 0; i < n / 2; i++) {
v[i] = i;
v[i + n / 2] = i;
}
} else if (type == "ascdesc") {
for (int i = 0; i < n / 2; i++) {
v[i] = i;
v[i + n / 2] = n - i;
}
} else if (type == "descasc") {
for (int i = 0; i < n / 2; i++) {
v[i] = n - i;
v[i + n / 2] = i;
}
} else if (type == "descdesc") {
for (int i = 0; i < n / 2; i++) {
v[i] = n - i;
v[i + n / 2] = n - i;
}
} else if (type == "smallrange") {
int c = rand() / 2;
for (int i = 0; i < n; i++) {
v[i] = c + rand() % 100;
}
} else if (type == "rand") {
for (int i = 0; i < n; i++) {
v[i] = rand();
}
}
return v;
}
// Sorts a[0..n) ascending with a least-significant-digit radix sort that
// uses two passes over 16-bit digits: a -> scratch on the low half, then
// scratch -> a on the high half, so the result ends up back in a.
void twopass(uint32_t *a, int n) {
  assert(n * int64_t(sizeof(a[0])) <= INT_MAX);

  // Helper storage: one counter per possible 16-bit digit and an n-element
  // scratch array.
  int bytes = n * sizeof(a[0]);
  std::vector<int> counts(1 << 16);
  uint32_t *scratch = (uint32_t *)malloc(bytes);
  assert(scratch != NULL);

  // Pass 1: histogram of the low 16 bits.
  for (int idx = 0; idx < n; ++idx) {
    counts[a[idx] & 0xffff] += 1;
  }
  // Turn the histogram into start offsets (exclusive prefix sum).
  int running = 0;
  for (int &slot : counts) {
    const int occurrences = slot;
    slot = running;
    running += occurrences;
  }
  // Scatter in input order, which keeps equal digits in their original order.
  for (int idx = 0; idx < n; ++idx) {
    const uint32_t key = a[idx];
    scratch[counts[key & 0xffff]++] = key;
  }

  // Pass 2: same three steps on the high 16 bits, reading scratch, writing a.
  std::fill(counts.begin(), counts.end(), 0);
  for (int idx = 0; idx < n; ++idx) {
    counts[scratch[idx] >> 16] += 1;
  }
  running = 0;
  for (int &slot : counts) {
    const int occurrences = slot;
    slot = running;
    running += occurrences;
  }
  for (int idx = 0; idx < n; ++idx) {
    const uint32_t key = scratch[idx];
    a[counts[key >> 16]++] = key;
  }

  free(scratch);
}
void fourpass(uint32_t *a, int n) {
assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
// alloc helper buffers.
int sz = n * sizeof(a[0]);
std::vector<int> bucketdata(1 << 8);
uint32_t *buf = (uint32_t *)malloc(sz);
assert(buf != NULL);
uint32_t *src = a, *dst = buf;
uintptr_t swapmask = (uintptr_t)a ^ (uintptr_t)buf;
for (int shift = 0; shift < 32; shift += 8) {
memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0])); memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
for (int i = 0; i < n; i++) bucketdata[buf[i] >> 16]++; for (int i = 0; i < n; i++) bucketdata[src[i] >> shift & 0xff]++;
offset = 0;
for (int i = 0; i < 1 << 16; i++) {
int d = bucketdata[i];
bucketdata[i] = offset;
offset += d;
}
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 16]++] = buf[i];
free(buf);
}
// Sorts a[0..n) ascending with a least-significant-digit radix sort that
// uses four passes over 8-bit digits. The passes alternate between a and a
// scratch array; with an even number of passes the result lands back in a.
void fourpass(uint32_t *a, int n) {
  assert(n * int64_t(sizeof(a[0])) <= INT_MAX);

  // Helper storage: one counter per byte value and an n-element scratch array.
  int bytes = n * sizeof(a[0]);
  std::vector<int> counts(1 << 8);
  uint32_t *scratch = (uint32_t *)malloc(bytes);
  assert(scratch != NULL);

  uint32_t *from = a;
  uint32_t *to = scratch;
  for (int pass = 0; pass < 4; ++pass) {
    const int shift = pass * 8;

    // Histogram of the current byte.
    std::fill(counts.begin(), counts.end(), 0);
    for (int idx = 0; idx < n; ++idx) {
      counts[(from[idx] >> shift) & 0xff] += 1;
    }

    // Histogram -> start offsets (exclusive prefix sum).
    int running = 0;
    for (int &slot : counts) {
      const int occurrences = slot;
      slot = running;
      running += occurrences;
    }

    // Scatter in input order so earlier passes' ordering is preserved.
    for (int idx = 0; idx < n; ++idx) {
      const uint32_t key = from[idx];
      to[counts[(key >> shift) & 0xff]++] = key;
    }

    // The output of this pass is the input of the next one.
    std::swap(from, to);
  }

  free(scratch);
}
/** Zeroes exactly 1 << 8 ints starting at v; only meant for fourpassu's bucket table. */
void my_memset(int *v) {
  std::fill_n(v, 1 << 8, 0);
}
// hand-unrolled fourpass.
void fourpassu(uint32_t *a, int n) {
assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
// alloc helper buffers.
int sz = n * sizeof(a[0]);
static thread_local int bucketdata[1 << 8];
my_memset(bucketdata);
uint32_t *buf = (uint32_t *)malloc(sz);
assert(buf != NULL);
// pass 1: sort by lower 8 bits.
#pragma GCC unroll 32
for (int i = 0; i < n; i++) bucketdata[a[i] & 0xff]++;
int offset = 0; int offset = 0;
#pragma GCC unroll 8
for (int i = 0; i < 1 << 8; i++) { for (int i = 0; i < 1 << 8; i++) {
int d = bucketdata[i]; int d = bucketdata[i];
bucketdata[i] = offset; bucketdata[i] = offset;
offset += d; offset += d;
} }
for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xff]++] = a[i]; for (int i = 0; i < n; i++) {
// pass 2: sort by 2nd 8 bits. dst[bucketdata[src[i] >> shift & 0xff]++] = src[i];
my_memset(bucketdata);
#pragma GCC unroll 32
for (int i = 0; i < n; i++) bucketdata[buf[i] >> 8 & 0xff]++;
offset = 0;
#pragma GCC unroll 8
for (int i = 0; i < 1 << 8; i++) {
int d = bucketdata[i];
bucketdata[i] = offset;
offset += d;
} }
#pragma GCC unroll 64 src = (uint32_t *)((uintptr_t)src ^ swapmask);
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 8 & 0xff]++] = buf[i]; dst = (uint32_t *)((uintptr_t)dst ^ swapmask);
// pass 3: sort by 3rd 8 bits.
my_memset(bucketdata);
#pragma GCC unroll 32
for (int i = 0; i < n; i++) bucketdata[a[i] >> 16 & 0xff]++;
offset = 0;
#pragma GCC unroll 8
for (int i = 0; i < 1 << 8; i++) {
int d = bucketdata[i];
bucketdata[i] = offset;
offset += d;
}
#pragma GCC unroll 64
for (int i = 0; i < n; i++) buf[bucketdata[a[i] >> 16 & 0xff]++] = a[i];
// pass 4: sort by 4th 8 bits.
my_memset(bucketdata);
#pragma GCC unroll 32
for (int i = 0; i < n; i++) bucketdata[buf[i] >> 24 & 0xff]++;
offset = 0;
#pragma GCC unroll 8
for (int i = 0; i < 1 << 8; i++) {
int d = bucketdata[i];
bucketdata[i] = offset;
offset += d;
}
#pragma GCC unroll 32
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i];
free(buf);
} }
free(buf);
}
static inline uint32_t byterotate(uint32_t x) { return (x >> 8) | (x << 24); } /** Only werks für das fourpassu! */
void fourrots(uint32_t *arr, int n) { void my_memset(int *v) {
assert(n * int64_t(sizeof(arr[0])) <= INT_MAX); memset(v, 0, (1 << 8) * sizeof(int));
assert(n % 4 == 0); }
// alloc helper buffers.
int sz = n * sizeof(arr[0]); // hand-unrolled fourpass.
std::vector<int> bucketdata(1 << 8); void fourpassu(uint32_t *a, int n) {
int *btd = &bucketdata[0]; assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
uint32_t *buf = (uint32_t *)malloc(sz); // alloc helper buffers.
assert(buf != NULL); int sz = n * sizeof(a[0]);
uint32_t *src = arr, *dst = buf;
uintptr_t swapmask = (uintptr_t)arr ^ (uintptr_t)buf; static thread_local int bucketdata[1 << 8];
uint32_t a, b, c, d; my_memset(bucketdata);
uint32_t abt, bbt, cbt, dbt;
for (int shift = 0; shift < 32; shift += 8) { uint32_t *buf = (uint32_t *)malloc(sz);
memset(btd, 0, bucketdata.size() * sizeof(bucketdata[0])); assert(buf != NULL);
for (int i = 0; i < n; i += 4) { // pass 1: sort by lower 8 bits.
a = src[i]; #pragma GCC unroll 32
b = src[i + 1]; for (int i = 0; i < n; i++) bucketdata[a[i] & 0xff]++;
c = src[i + 2]; int offset = 0;
d = src[i + 3]; #pragma GCC unroll 8
abt = a & 0xff; for (int i = 0; i < 1 << 8; i++) {
bbt = b & 0xff; int d = bucketdata[i];
cbt = c & 0xff; bucketdata[i] = offset;
dbt = d & 0xff; offset += d;
btd[abt]++;
btd[bbt]++;
btd[cbt]++;
btd[dbt]++;
}
int offset = 0;
for (int i = 0; i < 1 << 8; i++) {
int d = bucketdata[i];
bucketdata[i] = offset;
offset += d;
}
for (int i = 0; i < n; i += 4) {
a = src[i];
b = src[i + 1];
c = src[i + 2];
d = src[i + 3];
abt = a & 0xff;
bbt = b & 0xff;
cbt = c & 0xff;
dbt = d & 0xff;
dst[btd[abt]++] = byterotate(a);
dst[btd[bbt]++] = byterotate(b);
dst[btd[cbt]++] = byterotate(c);
dst[btd[dbt]++] = byterotate(d);
}
src = (uint32_t *)((uintptr_t)src ^ swapmask);
dst = (uint32_t *)((uintptr_t)dst ^ swapmask);
}
free(buf);
} }
for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xff]++] = a[i];
// pass 2: sort by 2nd 8 bits.
my_memset(bucketdata);
#pragma GCC unroll 32
for (int i = 0; i < n; i++) bucketdata[buf[i] >> 8 & 0xff]++;
offset = 0;
#pragma GCC unroll 8
for (int i = 0; i < 1 << 8; i++) {
int d = bucketdata[i];
bucketdata[i] = offset;
offset += d;
}
#pragma GCC unroll 64
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 8 & 0xff]++] = buf[i];
// pass 3: sort by 3rd 8 bits.
my_memset(bucketdata);
#pragma GCC unroll 32
for (int i = 0; i < n; i++) bucketdata[a[i] >> 16 & 0xff]++;
offset = 0;
#pragma GCC unroll 8
for (int i = 0; i < 1 << 8; i++) {
int d = bucketdata[i];
bucketdata[i] = offset;
offset += d;
}
#pragma GCC unroll 64
for (int i = 0; i < n; i++) buf[bucketdata[a[i] >> 16 & 0xff]++] = a[i];
// pass 4: sort by 4th 8 bits.
my_memset(bucketdata);
#pragma GCC unroll 32
for (int i = 0; i < n; i++) bucketdata[buf[i] >> 24 & 0xff]++;
offset = 0;
#pragma GCC unroll 8
for (int i = 0; i < 1 << 8; i++) {
int d = bucketdata[i];
bucketdata[i] = offset;
offset += d;
}
#pragma GCC unroll 32
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i];
free(buf);
}
// frewr - four rewrites. static inline uint32_t byterotate(uint32_t x) { return (x >> 8) | (x << 24); }
void frewr(uint32_t *arr, int n) { void fourrots(uint32_t *arr, int n) {
uint32_t *tmpbuf = (uint32_t *)malloc(n * 4); assert(n * int64_t(sizeof(arr[0])) <= INT_MAX);
mlock(tmpbuf, n * 4); assert(n % 4 == 0);
int btoffsets[4][256] = {}; // alloc helper buffers.
int sz = n * sizeof(arr[0]);
std::vector<int> bucketdata(1 << 8);
int *btd = &bucketdata[0];
uint32_t *buf = (uint32_t *)malloc(sz);
assert(buf != NULL);
uint32_t *src = arr, *dst = buf;
uintptr_t swapmask = (uintptr_t)arr ^ (uintptr_t)buf;
uint32_t a, b, c, d;
uint32_t abt, bbt, cbt, dbt;
for (int shift = 0; shift < 32; shift += 8) {
memset(btd, 0, bucketdata.size() * sizeof(bucketdata[0]));
for (int i = 0; i < n; i += 4) {
a = src[i];
b = src[i + 1];
c = src[i + 2];
d = src[i + 3];
abt = a & 0xff;
bbt = b & 0xff;
cbt = c & 0xff;
dbt = d & 0xff;
btd[abt]++;
btd[bbt]++;
btd[cbt]++;
btd[dbt]++;
}
int offset = 0;
for (int i = 0; i < 1 << 8; i++) {
int d = bucketdata[i];
bucketdata[i] = offset;
offset += d;
}
for (int i = 0; i < n; i += 4) {
a = src[i];
b = src[i + 1];
c = src[i + 2];
d = src[i + 3];
abt = a & 0xff;
bbt = b & 0xff;
cbt = c & 0xff;
dbt = d & 0xff;
dst[btd[abt]++] = byterotate(a);
dst[btd[bbt]++] = byterotate(b);
dst[btd[cbt]++] = byterotate(c);
dst[btd[dbt]++] = byterotate(d);
}
src = (uint32_t *)((uintptr_t)src ^ swapmask);
dst = (uint32_t *)((uintptr_t)dst ^ swapmask);
}
free(buf);
}
// frewr - four rewrites.
void frewr(uint32_t *arr, int n) {
uint32_t *tmpbuf = (uint32_t *)malloc(n * 4);
mlock(tmpbuf, n * 4);
int btoffsets[4][256] = {};
#pragma GCC unroll 64
for (int i = n - 1; i >= 0; i--) {
uint32_t a = arr[i];
btoffsets[3][a & 0xff]++;
btoffsets[2][a >> 8 & 0xff]++;
btoffsets[1][a >> 16 & 0xff]++;
btoffsets[0][a >> 24 & 0xff]++;
}
int btend[4] = {n - 1, n - 1, n - 1, n - 1};
#pragma GCC unroll 16
for (int i = 255; i >= 0; i--) {
#pragma GCC unroll 4
for (int pass = 3; pass >= 0; pass--) {
int nbtend = btend[pass] - btoffsets[pass][i];
btoffsets[pass][i] = btend[pass];
btend[pass] = nbtend;
}
}
uint32_t *src = arr, *dst = tmpbuf;
#pragma GCC unroll 4
for (int pass = 3; pass >= 0; pass--) {
int *off = btoffsets[pass];
#pragma GCC unroll 64 #pragma GCC unroll 64
for (int i = n - 1; i >= 0; i--) { for (int i = n - 1; i >= 0; i--) {
uint32_t a = arr[i]; uint32_t v = src[i];
btoffsets[3][a & 0xff]++; dst[off[v & 0xff]--] = v >> 8 | v << 24;
btoffsets[2][a >> 8 & 0xff]++; __builtin_prefetch(&dst[off[v & 0xff] - 2]);
btoffsets[1][a >> 16 & 0xff]++;
btoffsets[0][a >> 24 & 0xff]++;
} }
int btend[4] = {n - 1, n - 1, n - 1, n - 1}; uint32_t *tmp = src;
#pragma GCC unroll 16 src = dst;
for (int i = 255; i >= 0; i--) { dst = tmp;
#pragma GCC unroll 4
for (int pass = 3; pass >= 0; pass--) {
int nbtend = btend[pass] - btoffsets[pass][i];
btoffsets[pass][i] = btend[pass];
btend[pass] = nbtend;
}
}
uint32_t *src = arr, *dst = tmpbuf;
#pragma GCC unroll 4
for (int pass = 3; pass >= 0; pass--) {
int *off = btoffsets[pass];
#pragma GCC unroll 64
for (int i = n - 1; i >= 0; i--) {
uint32_t v = src[i];
dst[off[v & 0xff]--] = v >> 8 | v << 24;
__builtin_prefetch(&dst[off[v & 0xff] - 2]);
}
uint32_t *tmp = src;
src = dst;
dst = tmp;
}
munlock(tmpbuf, n * 4);
free(tmpbuf);
} }
munlock(tmpbuf, n * 4);
free(tmpbuf);
}
void vsort(uint32_t *a, int n) { void vsort(uint32_t *a, int n) {
thread_local std::vector<uint32_t> bts[256]; thread_local std::vector<uint32_t> bts[256];
#pragma GCC unroll 4 #pragma GCC unroll 4
for (int shift = 0; shift < 32; shift += 8) { for (int shift = 0; shift < 32; shift += 8) {
#pragma GCC unroll 64 #pragma GCC unroll 64
for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]); for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]);
#pragma GCC unroll 64 #pragma GCC unroll 64
for (int bt = 0, k = 0; bt < 256; bt++) { for (int bt = 0, k = 0; bt < 256; bt++) {
memcpy(a + k, &bts[bt][0], bts[bt].size() * sizeof(a[0])); memcpy(a + k, &bts[bt][0], bts[bt].size() * sizeof(a[0]));
k += bts[bt].size(); k += bts[bt].size();
bts[bt].clear(); bts[bt].clear();
}
} }
} }
void pagedsort(uint32_t *a, int n) { }
enum { pagesize = 1024 }; void pagedsort(uint32_t *a, int n) {
int pagecount = (n + pagesize - 1) / pagesize + 512; enum { pagesize = 1024 };
uint32_t *pd = (uint32_t *)malloc(pagecount * pagesize * sizeof(a[0])); int pagecount = (n + pagesize - 1) / pagesize + 512;
std::vector<int> freelist(pagecount); uint32_t *pd = (uint32_t *)malloc(pagecount * pagesize * sizeof(a[0]));
std::vector<int> next(pagecount); std::vector<int> freelist(pagecount);
std::iota(std::begin(freelist), std::end(freelist), 0); std::vector<int> next(pagecount);
struct bucket { std::iota(std::begin(freelist), std::end(freelist), 0);
int len; struct bucket {
int headpage, lastpage; int len;
}; int headpage, lastpage;
bucket bts[512]; };
// initial scatter. bucket bts[512];
// initial scatter.
for (int bt = 0; bt < 256; bt++) {
int p = freelist.back();
freelist.pop_back();
bts[bt] = {0, p, p};
}
for (int i = 0; i < n; i++) {
bucket *bt = &bts[a[i] & 0xff];
pd[bt->lastpage * pagesize + bt->len++ % pagesize] = a[i];
if (bt->len % pagesize == 0) {
int p = freelist.back();
freelist.pop_back();
next[bt->lastpage] = p;
bt->lastpage = p;
}
}
// intermediate level scatters.
int ibase = 0, obase = 256;
for (int shift = 8; shift < 32; shift += 8) {
for (int bt = 0; bt < 256; bt++) { for (int bt = 0; bt < 256; bt++) {
int p = freelist.back(); int p = freelist.back();
freelist.pop_back(); freelist.pop_back();
bts[bt] = {0, p, p}; bts[obase + bt] = {0, p, p};
} }
for (int i = 0; i < n; i++) {
bucket *bt = &bts[a[i] & 0xff];
pd[bt->lastpage * pagesize + bt->len++ % pagesize] = a[i];
if (bt->len % pagesize == 0) {
int p = freelist.back();
freelist.pop_back();
next[bt->lastpage] = p;
bt->lastpage = p;
}
}
// intermediate level scatters.
int ibase = 0, obase = 256;
for (int shift = 8; shift < 32; shift += 8) {
for (int bt = 0; bt < 256; bt++) {
int p = freelist.back();
freelist.pop_back();
bts[obase + bt] = {0, p, p};
}
for (int ibti = 0; ibti < 256; ibti++) {
struct bucket *ibt = &bts[ibase + ibti];
int page = ibt->headpage;
for (int i = 0; i < ibt->len; i++) {
uint32_t v = pd[page * pagesize + i % pagesize];
struct bucket *obt = &bts[obase + (v >> shift & 0xff)];
pd[obt->lastpage * pagesize + obt->len++ % pagesize] = v;
if (obt->len % pagesize == 0) {
int p = freelist.back();
freelist.pop_back();
next[obt->lastpage] = p;
obt->lastpage = p;
}
if (i % pagesize == pagesize - 1) {
freelist.push_back(page);
page = next[page];
}
}
freelist.push_back(ibt->lastpage);
}
ibase = 256 - ibase;
obase = 256 - obase;
}
// the final gather.
int k = 0;
for (int ibti = 0; ibti < 256; ibti++) { for (int ibti = 0; ibti < 256; ibti++) {
struct bucket *ibt = &bts[ibase + ibti]; struct bucket *ibt = &bts[ibase + ibti];
int page = ibt->headpage; int page = ibt->headpage;
for (int i = 0; i < ibt->len; i++) { for (int i = 0; i < ibt->len; i++) {
a[k++] = pd[page * pagesize + i % pagesize]; uint32_t v = pd[page * pagesize + i % pagesize];
struct bucket *obt = &bts[obase + (v >> shift & 0xff)];
pd[obt->lastpage * pagesize + obt->len++ % pagesize] = v;
if (obt->len % pagesize == 0) {
int p = freelist.back();
freelist.pop_back();
next[obt->lastpage] = p;
obt->lastpage = p;
}
if (i % pagesize == pagesize - 1) { if (i % pagesize == pagesize - 1) {
freelist.push_back(page);
page = next[page]; page = next[page];
} }
} }
freelist.push_back(ibt->lastpage);
} }
free(pd); ibase = 256 - ibase;
obase = 256 - obase;
} }
// the final gather.
// to measure / profile a single variant int k = 0;
void measure_single(int n) { for (int ibti = 0; ibti < 256; ibti++) {
for (auto inputtype : inputtypes) { struct bucket *ibt = &bts[ibase + ibti];
printf("%10s", inputtype.c_str()); int page = ibt->headpage;
fflush(stdout); for (int i = 0; i < ibt->len; i++) {
std::vector<uint32_t> v(n); a[k++] = pd[page * pagesize + i % pagesize];
v = geninput(inputtype, n); if (i % pagesize == pagesize - 1) {
measure(inputtype, "sp", [&] { spsort(&v[0], v.size()); }); page = next[page];
}
for (auto r : results) printf("%9.3fs", r.second);
puts("");
} }
puts(""); }
printf("%10s", "worst"); free(pd);
for (auto w : worst) printf("%9.3fs", w.second); }
puts("");
printf("%10s", ""); // to measure / profile a single variant
for (auto w : worst) printf("%10s", w.first.c_str()); void measure_single(int n) {
for (auto inputtype : inputtypes) {
printf("%10s", inputtype.c_str());
fflush(stdout);
std::vector<uint32_t> v(n);
v = geninput(inputtype, n);
//measure(inputtype, "sp", [&] { spsort(&v[0], v.size()); });
measure(inputtype, "magyar", [&] { MagyarSort::sort<uint32_t>(&v[0], v.size()); });
for (auto r : results) printf("%9.3fs", r.second);
puts(""); puts("");
} }
puts("");
printf("%10s", "worst");
for (auto w : worst) printf("%9.3fs", w.second);
puts("");
printf("%10s", "");
for (auto w : worst) printf("%10s", w.first.c_str());
puts("");
}
int main(void) { int main(void) {
int n = 100000000; //int n = 100000000;
//int n = 10000000; int n = 10000000;
//int n = 100; //int n = 100;
// Uncomment this for profiling and alg! // Uncomment this for profiling and alg!
//measure_single(n); //measure_single(n);
//return 0; //return 0;
for (auto inputtype : inputtypes) { for (auto inputtype : inputtypes) {
printf("%10s", inputtype.c_str()); printf("%10s", inputtype.c_str());
fflush(stdout); fflush(stdout);
std::vector<uint32_t> v(n), w(n), expected(n); std::vector<uint32_t> v(n), w(n), expected(n);
v = geninput(inputtype, n); v = geninput(inputtype, n);
measure(inputtype, "copy", [&] { w = v; }); measure(inputtype, "copy", [&] { w = v; });
w = v; w = v;
measure(inputtype, "std", [&] { std::sort(std::begin(w), std::end(w)); }); measure(inputtype, "std", [&] { std::sort(std::begin(w), std::end(w)); });
expected = w; expected = w;
w = v; w = v;
measure(inputtype, "ska", [&] { ska_sort(std::begin(w), std::end(w)); }); measure(inputtype, "ska", [&] { ska_sort(std::begin(w), std::end(w)); });
w = v; w = v;
measure(inputtype, "ska_copy", [&] { measure(inputtype, "ska_copy", [&] {
std::vector<uint32_t> buf(w.size()); std::vector<uint32_t> buf(w.size());
if (ska_sort_copy(std::begin(w), std::end(w), std::begin(buf))) { if (ska_sort_copy(std::begin(w), std::end(w), std::begin(buf))) {
w.swap(buf); w.swap(buf);
} }
}); });
w = v; w = v;
measure(inputtype, "magyar", [&] { MagyarSort::sort<uint32_t>(&w[0], w.size()); }); measure(inputtype, "magyar", [&] { MagyarSort::sort<uint32_t>(&w[0], w.size()); });
assert(w == expected); assert(w == expected);
/* /*
w = v; w = v;
measure(inputtype, "2pass", [&] { twopass(&w[0], w.size()); }); measure(inputtype, "2pass", [&] { twopass(&w[0], w.size()); });
assert(w == expected); assert(w == expected);
w = v; w = v;
measure(inputtype, "4pass", [&] { fourpass(&w[0], w.size()); }); measure(inputtype, "4pass", [&] { fourpass(&w[0], w.size()); });
assert(w == expected); assert(w == expected);
w = v; w = v;
measure(inputtype, "psort", [&] { pagedsort(&w[0], w.size()); }); measure(inputtype, "psort", [&] { pagedsort(&w[0], w.size()); });
assert(w == expected); assert(w == expected);
w = v; w = v;
measure(inputtype, "4pasu", [&] { fourpassu(&w[0], w.size()); }); measure(inputtype, "4pasu", [&] { fourpassu(&w[0], w.size()); });
assert(w == expected); assert(w == expected);
*/ w = v;
w = v; measure(inputtype, "4rot", [&] { fourrots(&w[0], w.size()); });
measure(inputtype, "4rot", [&] { fourrots(&w[0], w.size()); }); assert(w == expected);
assert(w == expected); w = v;
w = v; measure(inputtype, "sp", [&] { spsort(&w[0], w.size()); });
/*measure(inputtype, "sp", [&] { spsort(&w[0], w.size()); }); assert(w == expected);
assert(w == expected); w = v;*/
w = v;*/ measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); });
measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); }); assert(w == expected);
assert(w == expected); measure(inputtype, "mybuck", [&] { my_bucket_sort(&w[0], w.size()); });
/* assert(w == expected);
w = v; /*
measure(inputtype, "frewr", [&] { frewr(&w[0], w.size()); }); w = v;
assert(w == expected); measure(inputtype, "frewr", [&] { frewr(&w[0], w.size()); });
w = v; assert(w == expected);
measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); }); w = v;
assert(w == expected); measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); });
*/ assert(w == expected);
*/
for (auto r : results) printf("%9.3fs", r.second); for (auto r : results) printf("%9.3fs", r.second);
puts("");
}
puts(""); puts("");
printf("%10s", "worst");
for (auto w : worst) printf("%9.3fs", w.second);
puts("");
printf("%10s", "");
for (auto w : worst) printf("%10s", w.first.c_str());
puts("");
return 0;
} }
puts("");
printf("%10s", "worst");
for (auto w : worst) printf("%9.3fs", w.second);
puts("");
printf("%10s", "");
for (auto w : worst) printf("%10s", w.first.c_str());
puts("");
return 0;
}