diff --git a/gptsort.h b/gptsort.h index 26b4fbe..d969f58 100644 --- a/gptsort.h +++ b/gptsort.h @@ -2,6 +2,7 @@ #include #include +// ChatGPT and me did this space partitioning bucket sort void gpt_bucket_sort(uint32_t* array, int n) { // Calculate the number of buckets to use int num_buckets = std::sqrt(n); @@ -36,3 +37,40 @@ void gpt_bucket_sort(uint32_t* array, int n) { } } } + +// Further optimizations (no chatGPT) +void my_bucket_sort(uint32_t* array, int n) { + // Calculate the number of buckets to use + int num_buckets = std::sqrt(n); + + // Create a vector of buckets + std::vector> buckets(num_buckets); + + // Calculate the range of values that each bucket can hold + auto mm = std::minmax_element(array, array + n); + uint32_t min_value = *mm.first; + uint32_t max_value = *mm.second; + uint32_t range = max_value - min_value + 1; + uint32_t bucket_size = range / num_buckets + 1; + + // Distribute the elements of the array into the buckets + for (int i = 0; i < n; i++) { + // Calculate the bucket index for this element + // using the range of values and the bucket size as the divisor + int bucket_index = (array[i] - min_value) / bucket_size; + buckets[bucket_index].push_back(array[i]); + } + + // Sort the elements in each bucket using std::sort + for (int i = 0; i < num_buckets; i++) { + std::sort(buckets[i].begin(), buckets[i].end()); + } + + // Concatenate the buckets to get the sorted array + int k = 0; + for (int i = 0; i < num_buckets; i++) { + for (int j = 0; j < buckets[i].size(); j++) { + array[k++] = buckets[i][j]; + } + } +} diff --git a/makefile b/makefile index 939f35a..9805cae 100644 --- a/makefile +++ b/makefile @@ -31,5 +31,8 @@ clang_release: test.cpp magyarsort.h clang_release3: test.cpp magyarsort.h clang++ test.cpp -DNDEBUG -std=c++17 -O3 -o test.out +clang_release_ypsu: ypsu.cpp magyarsort.h + clang++ ypsu.cpp -DNDEBUG -std=c++17 -O2 -o ypsu.out + clean: test.out rm test.out diff --git a/ypsu.cpp b/ypsu.cpp index 3241797..31e13d5 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -1,490 +1,492 @@ - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include - #include // mlock & munlock - #include "ska_sort.hpp" - #include "gptsort.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // mlock & munlock +#include "ska_sort.hpp" +#include "gptsort.h" - #define MAGYAR_SORT_DEFAULT_REUSE - #include "magyarsort.h" +#define MAGYAR_SORT_DEFAULT_REUSE +#include "magyarsort.h" - #include "space_partitioning_sort/spsort.h" +#include "space_partitioning_sort/spsort.h" - std::map results; - std::map worst; - void measure(const std::string &inputtype, const std::string &name, - std::function f) { - auto begin = std::chrono::high_resolution_clock::now(); - f(); - auto dur = std::chrono::high_resolution_clock::now() - begin; - double seconds = dur / std::chrono::milliseconds(1) / 1000.0; - results[name] = seconds; - worst[name] = std::max(worst[name], seconds); - } - std::vector inputtypes = { - "constant", "asc", "desc", "ascasc", "ascdesc", - "descasc", "descdesc", "smallrange", "rand", - }; - std::vector geninput(const std::string &type, int n) { - std::vector v(n); - if (type == "constant") { - int c = rand(); - for (int i = 0; i < n; i++) { - v[i] = c; - } - } else if (type == "asc") { - for (int i = 0; i < n; i++) { - v[i] = i; - } - } else if (type == "desc") { - for (int i = 0; i < n; i++) { - v[i] = n - i; - } - } else if (type == "ascasc") { - for (int i = 0; i < n / 2; i++) { - v[i] = i; - v[i + n / 2] = i; - } - } else if (type == "ascdesc") { - for (int i = 0; i < n / 2; i++) { - v[i] = i; - v[i + n / 2] = n - i; - } - } else if (type == "descasc") { - for (int i = 0; i < n / 2; i++) { - v[i] = n - i; - v[i + n / 2] = i; - } - } else if (type == "descdesc") { - for (int i = 0; i < n / 2; i++) { - v[i] = n - i; - v[i + n / 2] = n - i; - } - } else if (type == "smallrange") { - int c = rand() / 2; - for (int i = 0; i < n; i++) { - v[i] = c + rand() % 100; - } - } else if (type == "rand") { - for (int i = 0; i < n; i++) { - v[i] = rand(); - } +std::map results; +std::map worst; +void measure(const std::string &inputtype, const std::string &name, + std::function f) { + auto begin = std::chrono::high_resolution_clock::now(); + f(); + auto dur = std::chrono::high_resolution_clock::now() - begin; + double seconds = dur / std::chrono::milliseconds(1) / 1000.0; + results[name] = seconds; + worst[name] = std::max(worst[name], seconds); +} +std::vector inputtypes = { + "constant", "asc", "desc", "ascasc", "ascdesc", + "descasc", "descdesc", "smallrange", "rand", +}; +std::vector geninput(const std::string &type, int n) { + std::vector v(n); + if (type == "constant") { + int c = rand(); + for (int i = 0; i < n; i++) { + v[i] = c; } - return v; - } - - void twopass(uint32_t *a, int n) { - assert(n * int64_t(sizeof(a[0])) <= INT_MAX); - // alloc helper buffers. - int sz = n * sizeof(a[0]); - std::vector bucketdata(1 << 16); - uint32_t *buf = (uint32_t *)malloc(sz); - assert(buf != NULL); - // pass 1: sort by lower 16 bits. - for (int i = 0; i < n; i++) bucketdata[a[i] & 0xffff]++; - int offset = 0; - for (int i = 0; i < 1 << 16; i++) { - int d = bucketdata[i]; - bucketdata[i] = offset; - offset += d; + } else if (type == "asc") { + for (int i = 0; i < n; i++) { + v[i] = i; } - for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xffff]++] = a[i]; - // pass 2: sort by upper 16 bits. + } else if (type == "desc") { + for (int i = 0; i < n; i++) { + v[i] = n - i; + } + } else if (type == "ascasc") { + for (int i = 0; i < n / 2; i++) { + v[i] = i; + v[i + n / 2] = i; + } + } else if (type == "ascdesc") { + for (int i = 0; i < n / 2; i++) { + v[i] = i; + v[i + n / 2] = n - i; + } + } else if (type == "descasc") { + for (int i = 0; i < n / 2; i++) { + v[i] = n - i; + v[i + n / 2] = i; + } + } else if (type == "descdesc") { + for (int i = 0; i < n / 2; i++) { + v[i] = n - i; + v[i + n / 2] = n - i; + } + } else if (type == "smallrange") { + int c = rand() / 2; + for (int i = 0; i < n; i++) { + v[i] = c + rand() % 100; + } + } else if (type == "rand") { + for (int i = 0; i < n; i++) { + v[i] = rand(); + } + } + return v; +} + +void twopass(uint32_t *a, int n) { + assert(n * int64_t(sizeof(a[0])) <= INT_MAX); + // alloc helper buffers. + int sz = n * sizeof(a[0]); + std::vector bucketdata(1 << 16); + uint32_t *buf = (uint32_t *)malloc(sz); + assert(buf != NULL); + // pass 1: sort by lower 16 bits. + for (int i = 0; i < n; i++) bucketdata[a[i] & 0xffff]++; + int offset = 0; + for (int i = 0; i < 1 << 16; i++) { + int d = bucketdata[i]; + bucketdata[i] = offset; + offset += d; + } + for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xffff]++] = a[i]; + // pass 2: sort by upper 16 bits. + memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0])); + for (int i = 0; i < n; i++) bucketdata[buf[i] >> 16]++; + offset = 0; + for (int i = 0; i < 1 << 16; i++) { + int d = bucketdata[i]; + bucketdata[i] = offset; + offset += d; + } + for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 16]++] = buf[i]; + free(buf); +} +void fourpass(uint32_t *a, int n) { + assert(n * int64_t(sizeof(a[0])) <= INT_MAX); + // alloc helper buffers. + int sz = n * sizeof(a[0]); + std::vector bucketdata(1 << 8); + uint32_t *buf = (uint32_t *)malloc(sz); + assert(buf != NULL); + uint32_t *src = a, *dst = buf; + uintptr_t swapmask = (uintptr_t)a ^ (uintptr_t)buf; + for (int shift = 0; shift < 32; shift += 8) { memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0])); - for (int i = 0; i < n; i++) bucketdata[buf[i] >> 16]++; - offset = 0; - for (int i = 0; i < 1 << 16; i++) { - int d = bucketdata[i]; - bucketdata[i] = offset; - offset += d; - } - for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 16]++] = buf[i]; - free(buf); - } - void fourpass(uint32_t *a, int n) { - assert(n * int64_t(sizeof(a[0])) <= INT_MAX); - // alloc helper buffers. - int sz = n * sizeof(a[0]); - std::vector bucketdata(1 << 8); - uint32_t *buf = (uint32_t *)malloc(sz); - assert(buf != NULL); - uint32_t *src = a, *dst = buf; - uintptr_t swapmask = (uintptr_t)a ^ (uintptr_t)buf; - for (int shift = 0; shift < 32; shift += 8) { - memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0])); - for (int i = 0; i < n; i++) bucketdata[src[i] >> shift & 0xff]++; - int offset = 0; - for (int i = 0; i < 1 << 8; i++) { - int d = bucketdata[i]; - bucketdata[i] = offset; - offset += d; - } - for (int i = 0; i < n; i++) { - dst[bucketdata[src[i] >> shift & 0xff]++] = src[i]; - } - src = (uint32_t *)((uintptr_t)src ^ swapmask); - dst = (uint32_t *)((uintptr_t)dst ^ swapmask); - } - free(buf); - } - - /** Only werks für das fourpassu! */ - void my_memset(int *v) { - memset(v, 0, (1 << 8) * sizeof(int)); - } - - // hand-unrolled fourpass. - void fourpassu(uint32_t *a, int n) { - assert(n * int64_t(sizeof(a[0])) <= INT_MAX); - // alloc helper buffers. - int sz = n * sizeof(a[0]); - - static thread_local int bucketdata[1 << 8]; - my_memset(bucketdata); - - uint32_t *buf = (uint32_t *)malloc(sz); - assert(buf != NULL); - // pass 1: sort by lower 8 bits. - #pragma GCC unroll 32 - for (int i = 0; i < n; i++) bucketdata[a[i] & 0xff]++; + for (int i = 0; i < n; i++) bucketdata[src[i] >> shift & 0xff]++; int offset = 0; - #pragma GCC unroll 8 for (int i = 0; i < 1 << 8; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } - for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xff]++] = a[i]; - // pass 2: sort by 2nd 8 bits. - my_memset(bucketdata); - #pragma GCC unroll 32 - for (int i = 0; i < n; i++) bucketdata[buf[i] >> 8 & 0xff]++; - offset = 0; - #pragma GCC unroll 8 - for (int i = 0; i < 1 << 8; i++) { - int d = bucketdata[i]; - bucketdata[i] = offset; - offset += d; + for (int i = 0; i < n; i++) { + dst[bucketdata[src[i] >> shift & 0xff]++] = src[i]; } - #pragma GCC unroll 64 - for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 8 & 0xff]++] = buf[i]; - // pass 3: sort by 3rd 8 bits. - my_memset(bucketdata); - #pragma GCC unroll 32 - for (int i = 0; i < n; i++) bucketdata[a[i] >> 16 & 0xff]++; - offset = 0; - #pragma GCC unroll 8 - for (int i = 0; i < 1 << 8; i++) { - int d = bucketdata[i]; - bucketdata[i] = offset; - offset += d; - } - #pragma GCC unroll 64 - for (int i = 0; i < n; i++) buf[bucketdata[a[i] >> 16 & 0xff]++] = a[i]; - // pass 4: sort by 4th 8 bits. - my_memset(bucketdata); - #pragma GCC unroll 32 - for (int i = 0; i < n; i++) bucketdata[buf[i] >> 24 & 0xff]++; - offset = 0; - #pragma GCC unroll 8 - for (int i = 0; i < 1 << 8; i++) { - int d = bucketdata[i]; - bucketdata[i] = offset; - offset += d; - } - #pragma GCC unroll 32 - for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i]; - free(buf); + src = (uint32_t *)((uintptr_t)src ^ swapmask); + dst = (uint32_t *)((uintptr_t)dst ^ swapmask); } + free(buf); +} - static inline uint32_t byterotate(uint32_t x) { return (x >> 8) | (x << 24); } - void fourrots(uint32_t *arr, int n) { - assert(n * int64_t(sizeof(arr[0])) <= INT_MAX); - assert(n % 4 == 0); - // alloc helper buffers. - int sz = n * sizeof(arr[0]); - std::vector bucketdata(1 << 8); - int *btd = &bucketdata[0]; - uint32_t *buf = (uint32_t *)malloc(sz); - assert(buf != NULL); - uint32_t *src = arr, *dst = buf; - uintptr_t swapmask = (uintptr_t)arr ^ (uintptr_t)buf; - uint32_t a, b, c, d; - uint32_t abt, bbt, cbt, dbt; - for (int shift = 0; shift < 32; shift += 8) { - memset(btd, 0, bucketdata.size() * sizeof(bucketdata[0])); - for (int i = 0; i < n; i += 4) { - a = src[i]; - b = src[i + 1]; - c = src[i + 2]; - d = src[i + 3]; - abt = a & 0xff; - bbt = b & 0xff; - cbt = c & 0xff; - dbt = d & 0xff; - btd[abt]++; - btd[bbt]++; - btd[cbt]++; - btd[dbt]++; - } - int offset = 0; - for (int i = 0; i < 1 << 8; i++) { - int d = bucketdata[i]; - bucketdata[i] = offset; - offset += d; - } - for (int i = 0; i < n; i += 4) { - a = src[i]; - b = src[i + 1]; - c = src[i + 2]; - d = src[i + 3]; - abt = a & 0xff; - bbt = b & 0xff; - cbt = c & 0xff; - dbt = d & 0xff; - dst[btd[abt]++] = byterotate(a); - dst[btd[bbt]++] = byterotate(b); - dst[btd[cbt]++] = byterotate(c); - dst[btd[dbt]++] = byterotate(d); - } - src = (uint32_t *)((uintptr_t)src ^ swapmask); - dst = (uint32_t *)((uintptr_t)dst ^ swapmask); - } - free(buf); +/** Only werks für das fourpassu! */ +void my_memset(int *v) { + memset(v, 0, (1 << 8) * sizeof(int)); +} + +// hand-unrolled fourpass. +void fourpassu(uint32_t *a, int n) { + assert(n * int64_t(sizeof(a[0])) <= INT_MAX); + // alloc helper buffers. + int sz = n * sizeof(a[0]); + + static thread_local int bucketdata[1 << 8]; + my_memset(bucketdata); + + uint32_t *buf = (uint32_t *)malloc(sz); + assert(buf != NULL); + // pass 1: sort by lower 8 bits. + #pragma GCC unroll 32 + for (int i = 0; i < n; i++) bucketdata[a[i] & 0xff]++; + int offset = 0; + #pragma GCC unroll 8 + for (int i = 0; i < 1 << 8; i++) { + int d = bucketdata[i]; + bucketdata[i] = offset; + offset += d; } + for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xff]++] = a[i]; + // pass 2: sort by 2nd 8 bits. + my_memset(bucketdata); + #pragma GCC unroll 32 + for (int i = 0; i < n; i++) bucketdata[buf[i] >> 8 & 0xff]++; + offset = 0; + #pragma GCC unroll 8 + for (int i = 0; i < 1 << 8; i++) { + int d = bucketdata[i]; + bucketdata[i] = offset; + offset += d; + } + #pragma GCC unroll 64 + for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 8 & 0xff]++] = buf[i]; + // pass 3: sort by 3rd 8 bits. + my_memset(bucketdata); + #pragma GCC unroll 32 + for (int i = 0; i < n; i++) bucketdata[a[i] >> 16 & 0xff]++; + offset = 0; + #pragma GCC unroll 8 + for (int i = 0; i < 1 << 8; i++) { + int d = bucketdata[i]; + bucketdata[i] = offset; + offset += d; + } + #pragma GCC unroll 64 + for (int i = 0; i < n; i++) buf[bucketdata[a[i] >> 16 & 0xff]++] = a[i]; + // pass 4: sort by 4th 8 bits. + my_memset(bucketdata); + #pragma GCC unroll 32 + for (int i = 0; i < n; i++) bucketdata[buf[i] >> 24 & 0xff]++; + offset = 0; + #pragma GCC unroll 8 + for (int i = 0; i < 1 << 8; i++) { + int d = bucketdata[i]; + bucketdata[i] = offset; + offset += d; + } + #pragma GCC unroll 32 + for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i]; + free(buf); +} - // frewr - four rewrites. - void frewr(uint32_t *arr, int n) { - uint32_t *tmpbuf = (uint32_t *)malloc(n * 4); - mlock(tmpbuf, n * 4); - int btoffsets[4][256] = {}; +static inline uint32_t byterotate(uint32_t x) { return (x >> 8) | (x << 24); } +void fourrots(uint32_t *arr, int n) { + assert(n * int64_t(sizeof(arr[0])) <= INT_MAX); + assert(n % 4 == 0); + // alloc helper buffers. + int sz = n * sizeof(arr[0]); + std::vector bucketdata(1 << 8); + int *btd = &bucketdata[0]; + uint32_t *buf = (uint32_t *)malloc(sz); + assert(buf != NULL); + uint32_t *src = arr, *dst = buf; + uintptr_t swapmask = (uintptr_t)arr ^ (uintptr_t)buf; + uint32_t a, b, c, d; + uint32_t abt, bbt, cbt, dbt; + for (int shift = 0; shift < 32; shift += 8) { + memset(btd, 0, bucketdata.size() * sizeof(bucketdata[0])); + for (int i = 0; i < n; i += 4) { + a = src[i]; + b = src[i + 1]; + c = src[i + 2]; + d = src[i + 3]; + abt = a & 0xff; + bbt = b & 0xff; + cbt = c & 0xff; + dbt = d & 0xff; + btd[abt]++; + btd[bbt]++; + btd[cbt]++; + btd[dbt]++; + } + int offset = 0; + for (int i = 0; i < 1 << 8; i++) { + int d = bucketdata[i]; + bucketdata[i] = offset; + offset += d; + } + for (int i = 0; i < n; i += 4) { + a = src[i]; + b = src[i + 1]; + c = src[i + 2]; + d = src[i + 3]; + abt = a & 0xff; + bbt = b & 0xff; + cbt = c & 0xff; + dbt = d & 0xff; + dst[btd[abt]++] = byterotate(a); + dst[btd[bbt]++] = byterotate(b); + dst[btd[cbt]++] = byterotate(c); + dst[btd[dbt]++] = byterotate(d); + } + src = (uint32_t *)((uintptr_t)src ^ swapmask); + dst = (uint32_t *)((uintptr_t)dst ^ swapmask); + } + free(buf); +} + +// frewr - four rewrites. +void frewr(uint32_t *arr, int n) { + uint32_t *tmpbuf = (uint32_t *)malloc(n * 4); + mlock(tmpbuf, n * 4); + int btoffsets[4][256] = {}; + #pragma GCC unroll 64 + for (int i = n - 1; i >= 0; i--) { + uint32_t a = arr[i]; + btoffsets[3][a & 0xff]++; + btoffsets[2][a >> 8 & 0xff]++; + btoffsets[1][a >> 16 & 0xff]++; + btoffsets[0][a >> 24 & 0xff]++; + } + int btend[4] = {n - 1, n - 1, n - 1, n - 1}; + #pragma GCC unroll 16 + for (int i = 255; i >= 0; i--) { + #pragma GCC unroll 4 + for (int pass = 3; pass >= 0; pass--) { + int nbtend = btend[pass] - btoffsets[pass][i]; + btoffsets[pass][i] = btend[pass]; + btend[pass] = nbtend; + } + } + uint32_t *src = arr, *dst = tmpbuf; + #pragma GCC unroll 4 + for (int pass = 3; pass >= 0; pass--) { + int *off = btoffsets[pass]; #pragma GCC unroll 64 for (int i = n - 1; i >= 0; i--) { - uint32_t a = arr[i]; - btoffsets[3][a & 0xff]++; - btoffsets[2][a >> 8 & 0xff]++; - btoffsets[1][a >> 16 & 0xff]++; - btoffsets[0][a >> 24 & 0xff]++; + uint32_t v = src[i]; + dst[off[v & 0xff]--] = v >> 8 | v << 24; + __builtin_prefetch(&dst[off[v & 0xff] - 2]); } - int btend[4] = {n - 1, n - 1, n - 1, n - 1}; - #pragma GCC unroll 16 - for (int i = 255; i >= 0; i--) { - #pragma GCC unroll 4 - for (int pass = 3; pass >= 0; pass--) { - int nbtend = btend[pass] - btoffsets[pass][i]; - btoffsets[pass][i] = btend[pass]; - btend[pass] = nbtend; - } - } - uint32_t *src = arr, *dst = tmpbuf; - #pragma GCC unroll 4 - for (int pass = 3; pass >= 0; pass--) { - int *off = btoffsets[pass]; - #pragma GCC unroll 64 - for (int i = n - 1; i >= 0; i--) { - uint32_t v = src[i]; - dst[off[v & 0xff]--] = v >> 8 | v << 24; - __builtin_prefetch(&dst[off[v & 0xff] - 2]); - } - uint32_t *tmp = src; - src = dst; - dst = tmp; - } - munlock(tmpbuf, n * 4); - free(tmpbuf); + uint32_t *tmp = src; + src = dst; + dst = tmp; } + munlock(tmpbuf, n * 4); + free(tmpbuf); +} - void vsort(uint32_t *a, int n) { - thread_local std::vector bts[256]; - #pragma GCC unroll 4 - for (int shift = 0; shift < 32; shift += 8) { - #pragma GCC unroll 64 - for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]); - #pragma GCC unroll 64 - for (int bt = 0, k = 0; bt < 256; bt++) { - memcpy(a + k, &bts[bt][0], bts[bt].size() * sizeof(a[0])); - k += bts[bt].size(); - bts[bt].clear(); - } +void vsort(uint32_t *a, int n) { + thread_local std::vector bts[256]; + #pragma GCC unroll 4 + for (int shift = 0; shift < 32; shift += 8) { + #pragma GCC unroll 64 + for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]); + #pragma GCC unroll 64 + for (int bt = 0, k = 0; bt < 256; bt++) { + memcpy(a + k, &bts[bt][0], bts[bt].size() * sizeof(a[0])); + k += bts[bt].size(); + bts[bt].clear(); } } - void pagedsort(uint32_t *a, int n) { - enum { pagesize = 1024 }; - int pagecount = (n + pagesize - 1) / pagesize + 512; - uint32_t *pd = (uint32_t *)malloc(pagecount * pagesize * sizeof(a[0])); - std::vector freelist(pagecount); - std::vector next(pagecount); - std::iota(std::begin(freelist), std::end(freelist), 0); - struct bucket { - int len; - int headpage, lastpage; - }; - bucket bts[512]; - // initial scatter. +} +void pagedsort(uint32_t *a, int n) { + enum { pagesize = 1024 }; + int pagecount = (n + pagesize - 1) / pagesize + 512; + uint32_t *pd = (uint32_t *)malloc(pagecount * pagesize * sizeof(a[0])); + std::vector freelist(pagecount); + std::vector next(pagecount); + std::iota(std::begin(freelist), std::end(freelist), 0); + struct bucket { + int len; + int headpage, lastpage; + }; + bucket bts[512]; + // initial scatter. + for (int bt = 0; bt < 256; bt++) { + int p = freelist.back(); + freelist.pop_back(); + bts[bt] = {0, p, p}; + } + for (int i = 0; i < n; i++) { + bucket *bt = &bts[a[i] & 0xff]; + pd[bt->lastpage * pagesize + bt->len++ % pagesize] = a[i]; + if (bt->len % pagesize == 0) { + int p = freelist.back(); + freelist.pop_back(); + next[bt->lastpage] = p; + bt->lastpage = p; + } + } + // intermediate level scatters. + int ibase = 0, obase = 256; + for (int shift = 8; shift < 32; shift += 8) { for (int bt = 0; bt < 256; bt++) { int p = freelist.back(); freelist.pop_back(); - bts[bt] = {0, p, p}; + bts[obase + bt] = {0, p, p}; } - for (int i = 0; i < n; i++) { - bucket *bt = &bts[a[i] & 0xff]; - pd[bt->lastpage * pagesize + bt->len++ % pagesize] = a[i]; - if (bt->len % pagesize == 0) { - int p = freelist.back(); - freelist.pop_back(); - next[bt->lastpage] = p; - bt->lastpage = p; - } - } - // intermediate level scatters. - int ibase = 0, obase = 256; - for (int shift = 8; shift < 32; shift += 8) { - for (int bt = 0; bt < 256; bt++) { - int p = freelist.back(); - freelist.pop_back(); - bts[obase + bt] = {0, p, p}; - } - for (int ibti = 0; ibti < 256; ibti++) { - struct bucket *ibt = &bts[ibase + ibti]; - int page = ibt->headpage; - for (int i = 0; i < ibt->len; i++) { - uint32_t v = pd[page * pagesize + i % pagesize]; - struct bucket *obt = &bts[obase + (v >> shift & 0xff)]; - pd[obt->lastpage * pagesize + obt->len++ % pagesize] = v; - if (obt->len % pagesize == 0) { - int p = freelist.back(); - freelist.pop_back(); - next[obt->lastpage] = p; - obt->lastpage = p; - } - if (i % pagesize == pagesize - 1) { - freelist.push_back(page); - page = next[page]; - } - } - freelist.push_back(ibt->lastpage); - } - ibase = 256 - ibase; - obase = 256 - obase; - } - // the final gather. - int k = 0; for (int ibti = 0; ibti < 256; ibti++) { struct bucket *ibt = &bts[ibase + ibti]; int page = ibt->headpage; for (int i = 0; i < ibt->len; i++) { - a[k++] = pd[page * pagesize + i % pagesize]; + uint32_t v = pd[page * pagesize + i % pagesize]; + struct bucket *obt = &bts[obase + (v >> shift & 0xff)]; + pd[obt->lastpage * pagesize + obt->len++ % pagesize] = v; + if (obt->len % pagesize == 0) { + int p = freelist.back(); + freelist.pop_back(); + next[obt->lastpage] = p; + obt->lastpage = p; + } if (i % pagesize == pagesize - 1) { + freelist.push_back(page); page = next[page]; } } + freelist.push_back(ibt->lastpage); } - free(pd); + ibase = 256 - ibase; + obase = 256 - obase; } - - // to measure / profile a single variant - void measure_single(int n) { - for (auto inputtype : inputtypes) { - printf("%10s", inputtype.c_str()); - fflush(stdout); - std::vector v(n); - v = geninput(inputtype, n); - measure(inputtype, "sp", [&] { spsort(&v[0], v.size()); }); - - for (auto r : results) printf("%9.3fs", r.second); - puts(""); + // the final gather. + int k = 0; + for (int ibti = 0; ibti < 256; ibti++) { + struct bucket *ibt = &bts[ibase + ibti]; + int page = ibt->headpage; + for (int i = 0; i < ibt->len; i++) { + a[k++] = pd[page * pagesize + i % pagesize]; + if (i % pagesize == pagesize - 1) { + page = next[page]; + } } - puts(""); - printf("%10s", "worst"); - for (auto w : worst) printf("%9.3fs", w.second); - puts(""); - printf("%10s", ""); - for (auto w : worst) printf("%10s", w.first.c_str()); + } + free(pd); +} + +// to measure / profile a single variant +void measure_single(int n) { + for (auto inputtype : inputtypes) { + printf("%10s", inputtype.c_str()); + fflush(stdout); + std::vector v(n); + v = geninput(inputtype, n); + //measure(inputtype, "sp", [&] { spsort(&v[0], v.size()); }); + measure(inputtype, "magyar", [&] { MagyarSort::sort(&v[0], v.size()); }); + + for (auto r : results) printf("%9.3fs", r.second); puts(""); } + puts(""); + printf("%10s", "worst"); + for (auto w : worst) printf("%9.3fs", w.second); + puts(""); + printf("%10s", ""); + for (auto w : worst) printf("%10s", w.first.c_str()); + puts(""); +} - int main(void) { - int n = 100000000; - //int n = 10000000; - //int n = 100; +int main(void) { + //int n = 100000000; + int n = 10000000; + //int n = 100; - // Uncomment this for profiling and alg! - //measure_single(n); - //return 0; + // Uncomment this for profiling and alg! + //measure_single(n); + //return 0; - for (auto inputtype : inputtypes) { - printf("%10s", inputtype.c_str()); - fflush(stdout); - std::vector v(n), w(n), expected(n); - v = geninput(inputtype, n); - measure(inputtype, "copy", [&] { w = v; }); - w = v; - measure(inputtype, "std", [&] { std::sort(std::begin(w), std::end(w)); }); - expected = w; - w = v; - measure(inputtype, "ska", [&] { ska_sort(std::begin(w), std::end(w)); }); - w = v; - measure(inputtype, "ska_copy", [&] { - std::vector buf(w.size()); - if (ska_sort_copy(std::begin(w), std::end(w), std::begin(buf))) { - w.swap(buf); - } - }); - w = v; - measure(inputtype, "magyar", [&] { MagyarSort::sort(&w[0], w.size()); }); - assert(w == expected); + for (auto inputtype : inputtypes) { + printf("%10s", inputtype.c_str()); + fflush(stdout); + std::vector v(n), w(n), expected(n); + v = geninput(inputtype, n); + measure(inputtype, "copy", [&] { w = v; }); + w = v; + measure(inputtype, "std", [&] { std::sort(std::begin(w), std::end(w)); }); + expected = w; + w = v; + measure(inputtype, "ska", [&] { ska_sort(std::begin(w), std::end(w)); }); + w = v; + measure(inputtype, "ska_copy", [&] { + std::vector buf(w.size()); + if (ska_sort_copy(std::begin(w), std::end(w), std::begin(buf))) { + w.swap(buf); + } + }); + w = v; + measure(inputtype, "magyar", [&] { MagyarSort::sort(&w[0], w.size()); }); + assert(w == expected); - /* - w = v; - measure(inputtype, "2pass", [&] { twopass(&w[0], w.size()); }); - assert(w == expected); - w = v; - measure(inputtype, "4pass", [&] { fourpass(&w[0], w.size()); }); - assert(w == expected); - w = v; - measure(inputtype, "psort", [&] { pagedsort(&w[0], w.size()); }); - assert(w == expected); - w = v; - measure(inputtype, "4pasu", [&] { fourpassu(&w[0], w.size()); }); - assert(w == expected); - */ - w = v; - measure(inputtype, "4rot", [&] { fourrots(&w[0], w.size()); }); - assert(w == expected); - w = v; - /*measure(inputtype, "sp", [&] { spsort(&w[0], w.size()); }); - assert(w == expected); - w = v;*/ - measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); }); - assert(w == expected); - /* - w = v; - measure(inputtype, "frewr", [&] { frewr(&w[0], w.size()); }); - assert(w == expected); - w = v; - measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); }); - assert(w == expected); - */ + /* + w = v; + measure(inputtype, "2pass", [&] { twopass(&w[0], w.size()); }); + assert(w == expected); + w = v; + measure(inputtype, "4pass", [&] { fourpass(&w[0], w.size()); }); + assert(w == expected); + w = v; + measure(inputtype, "psort", [&] { pagedsort(&w[0], w.size()); }); + assert(w == expected); + w = v; + measure(inputtype, "4pasu", [&] { fourpassu(&w[0], w.size()); }); + assert(w == expected); + w = v; + measure(inputtype, "4rot", [&] { fourrots(&w[0], w.size()); }); + assert(w == expected); + w = v; + measure(inputtype, "sp", [&] { spsort(&w[0], w.size()); }); + assert(w == expected); + w = v;*/ + measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); }); + assert(w == expected); + measure(inputtype, "mybuck", [&] { my_bucket_sort(&w[0], w.size()); }); + assert(w == expected); + /* + w = v; + measure(inputtype, "frewr", [&] { frewr(&w[0], w.size()); }); + assert(w == expected); + w = v; + measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); }); + assert(w == expected); + */ - for (auto r : results) printf("%9.3fs", r.second); - puts(""); - } + for (auto r : results) printf("%9.3fs", r.second); puts(""); - printf("%10s", "worst"); - for (auto w : worst) printf("%9.3fs", w.second); - puts(""); - printf("%10s", ""); - for (auto w : worst) printf("%10s", w.first.c_str()); - puts(""); - return 0; } + puts(""); + printf("%10s", "worst"); + for (auto w : worst) printf("%9.3fs", w.second); + puts(""); + printf("%10s", ""); + for (auto w : worst) printf("%10s", w.first.c_str()); + puts(""); + return 0; +}