2024-04-11 23:59:13 +02:00
|
|
|
#include <utility> // std::pair
|
2022-12-10 11:04:53 +01:00
|
|
|
#include <algorithm>
|
|
|
|
|
#include <cassert>
|
|
|
|
|
#include <chrono>
|
|
|
|
|
#include <climits>
|
|
|
|
|
#include <cstdint>
|
|
|
|
|
#include <cstdio>
|
|
|
|
|
#include <cstring>
|
|
|
|
|
#include <functional>
|
|
|
|
|
#include <map>
|
|
|
|
|
#include <set>
|
|
|
|
|
#include <string>
|
|
|
|
|
#include <vector>
|
|
|
|
|
#include <numeric>
|
|
|
|
|
#include <sys/mman.h> // mlock & munlock
|
|
|
|
|
#include "ska_sort.hpp"
|
|
|
|
|
#include "gptsort.h"
|
2023-05-02 13:20:07 +02:00
|
|
|
#include "thiersort.h"
|
2025-09-11 20:42:04 +02:00
|
|
|
#include "thiersort2.h"
|
2025-05-06 03:06:37 +02:00
|
|
|
#include "qsort/qsort.h"
|
|
|
|
|
#include "qsort/zssort.h"
|
2025-05-09 01:10:12 +02:00
|
|
|
#include "qsort/schwab_sort.h"
|
2025-05-06 03:06:37 +02:00
|
|
|
#include "qsort/chatgpt_qs.h"
|
2025-09-29 02:24:50 +02:00
|
|
|
#include "threepass.h"
|
2025-09-29 18:18:37 +02:00
|
|
|
#include "thiersort3.h"
|
2025-10-01 16:49:00 +02:00
|
|
|
#include "randominus.h"
|
2021-12-15 03:14:35 +01:00
|
|
|
|
2024-04-11 16:59:09 +02:00
|
|
|
// #define MAGYAR_SORT_DEFAULT_REUSE
|
2022-12-10 11:04:53 +01:00
|
|
|
#include "magyarsort.h"
|
2021-12-15 03:14:35 +01:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
#include "space_partitioning_sort/spsort.h"
|
2022-08-16 03:29:08 +02:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
std::map<std::string, double> results;
|
|
|
|
|
std::map<std::string, double> worst;
|
|
|
|
|
void measure(const std::string &inputtype, const std::string &name,
|
|
|
|
|
std::function<void()> f) {
|
|
|
|
|
auto begin = std::chrono::high_resolution_clock::now();
|
|
|
|
|
f();
|
|
|
|
|
auto dur = std::chrono::high_resolution_clock::now() - begin;
|
|
|
|
|
double seconds = dur / std::chrono::milliseconds(1) / 1000.0;
|
|
|
|
|
results[name] = seconds;
|
|
|
|
|
worst[name] = std::max(worst[name], seconds);
|
|
|
|
|
}
|
|
|
|
|
std::vector<std::string> inputtypes = {
|
2024-04-11 16:41:08 +02:00
|
|
|
"constant",
|
|
|
|
|
"asc",
|
|
|
|
|
"desc",
|
|
|
|
|
"ascasc",
|
2024-04-11 06:13:51 +02:00
|
|
|
"ascdesc",
|
2024-04-11 16:41:08 +02:00
|
|
|
"descasc",
|
|
|
|
|
"descdesc",
|
2024-04-11 06:13:51 +02:00
|
|
|
"smallrange",
|
2025-05-06 03:06:37 +02:00
|
|
|
"rand",
|
2022-12-10 11:04:53 +01:00
|
|
|
};
|
|
|
|
|
std::vector<uint32_t> geninput(const std::string &type, int n) {
|
|
|
|
|
std::vector<uint32_t> v(n);
|
|
|
|
|
if (type == "constant") {
|
|
|
|
|
int c = rand();
|
|
|
|
|
for (int i = 0; i < n; i++) {
|
|
|
|
|
v[i] = c;
|
|
|
|
|
}
|
|
|
|
|
} else if (type == "asc") {
|
|
|
|
|
for (int i = 0; i < n; i++) {
|
|
|
|
|
v[i] = i;
|
|
|
|
|
}
|
|
|
|
|
} else if (type == "desc") {
|
|
|
|
|
for (int i = 0; i < n; i++) {
|
|
|
|
|
v[i] = n - i;
|
|
|
|
|
}
|
|
|
|
|
} else if (type == "ascasc") {
|
|
|
|
|
for (int i = 0; i < n / 2; i++) {
|
|
|
|
|
v[i] = i;
|
|
|
|
|
v[i + n / 2] = i;
|
|
|
|
|
}
|
|
|
|
|
} else if (type == "ascdesc") {
|
|
|
|
|
for (int i = 0; i < n / 2; i++) {
|
|
|
|
|
v[i] = i;
|
|
|
|
|
v[i + n / 2] = n - i;
|
|
|
|
|
}
|
|
|
|
|
} else if (type == "descasc") {
|
|
|
|
|
for (int i = 0; i < n / 2; i++) {
|
|
|
|
|
v[i] = n - i;
|
|
|
|
|
v[i + n / 2] = i;
|
|
|
|
|
}
|
|
|
|
|
} else if (type == "descdesc") {
|
|
|
|
|
for (int i = 0; i < n / 2; i++) {
|
|
|
|
|
v[i] = n - i;
|
|
|
|
|
v[i + n / 2] = n - i;
|
|
|
|
|
}
|
|
|
|
|
} else if (type == "smallrange") {
|
|
|
|
|
int c = rand() / 2;
|
|
|
|
|
for (int i = 0; i < n; i++) {
|
|
|
|
|
v[i] = c + rand() % 100;
|
|
|
|
|
}
|
|
|
|
|
} else if (type == "rand") {
|
|
|
|
|
for (int i = 0; i < n; i++) {
|
|
|
|
|
v[i] = rand();
|
2021-12-13 02:30:12 +01:00
|
|
|
}
|
|
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
return v;
|
|
|
|
|
}
|
2021-12-15 12:52:33 +01:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
void twopass(uint32_t *a, int n) {
|
|
|
|
|
assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
// alloc helper buffers.
|
|
|
|
|
int sz = n * sizeof(a[0]);
|
|
|
|
|
std::vector<int> bucketdata(1 << 16);
|
|
|
|
|
uint32_t *buf = (uint32_t *)malloc(sz);
|
|
|
|
|
assert(buf != NULL);
|
|
|
|
|
// pass 1: sort by lower 16 bits.
|
|
|
|
|
for (int i = 0; i < n; i++) bucketdata[a[i] & 0xffff]++;
|
|
|
|
|
int offset = 0;
|
|
|
|
|
for (int i = 0; i < 1 << 16; i++) {
|
|
|
|
|
int d = bucketdata[i];
|
|
|
|
|
bucketdata[i] = offset;
|
|
|
|
|
offset += d;
|
|
|
|
|
}
|
|
|
|
|
for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xffff]++] = a[i];
|
|
|
|
|
// pass 2: sort by upper 16 bits.
|
|
|
|
|
memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
|
|
|
|
|
for (int i = 0; i < n; i++) bucketdata[buf[i] >> 16]++;
|
|
|
|
|
offset = 0;
|
|
|
|
|
for (int i = 0; i < 1 << 16; i++) {
|
|
|
|
|
int d = bucketdata[i];
|
|
|
|
|
bucketdata[i] = offset;
|
|
|
|
|
offset += d;
|
|
|
|
|
}
|
|
|
|
|
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 16]++] = buf[i];
|
|
|
|
|
free(buf);
|
|
|
|
|
}
|
2024-04-11 06:13:51 +02:00
|
|
|
|
2025-05-06 03:06:37 +02:00
|
|
|
/** "Standardly" written inplace recursive quicksort */
|
|
|
|
|
static inline void do_qsort(uint32_t *a, int n) noexcept {
|
|
|
|
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
quicksort(a, 0, n - 1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline void do_qsr3(uint32_t *a, int n) noexcept {
|
|
|
|
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
rpivotstate state;
|
|
|
|
|
quicksort_rand3(a, 0, n - 1, &state);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** Quicksort with fast random pivoting */
|
|
|
|
|
static inline void do_qsr(uint32_t *a, int n) noexcept {
|
|
|
|
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
rpivotstate state;
|
|
|
|
|
quicksort_rand(a, 0, n - 1, &state);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** Zsolti's quicksort version with at most O(log(n)) memuse because loop instead half of the recursions */
|
|
|
|
|
static inline void do_zsssort(uint32_t *a, int n) noexcept {
|
|
|
|
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
zssort(a, 0, n - 1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** Fastrandomized zss */
|
|
|
|
|
static inline void do_zsr(uint32_t *a, int n) noexcept {
|
|
|
|
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
rpivotstate state;
|
|
|
|
|
zssort_rand(a, 0, n - 1, &state);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** Fastrandomized zss3 */
|
|
|
|
|
static inline void do_zsr3(uint32_t *a, int n) noexcept {
|
|
|
|
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
rpivotstate state;
|
|
|
|
|
zssort_rand3(a, 0, n - 1, &state);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** Fastrandomized zss3 single-pass threewayed */
|
|
|
|
|
static inline void do_zsr3_sp(uint32_t *a, int n) noexcept {
|
|
|
|
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
rpivotstate state;
|
|
|
|
|
zssort_rand3_sp(a, 0, n - 1, &state);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** Fastrandomized zss3 single-pass threewayed */
|
|
|
|
|
static inline void do_zsr3_sp2(uint32_t *a, int n) noexcept {
|
|
|
|
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
rpivotstate state;
|
|
|
|
|
zssort_rand3_sp2(a, 0, n - 1, &state);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** Fastrandomized zss with const input check */
|
|
|
|
|
static inline void do_zsrc(uint32_t *a, int n) noexcept {
|
|
|
|
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
rpivotstate state;
|
|
|
|
|
zsrc(a, 0, n - 1, &state);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** meanqs */
|
|
|
|
|
static inline void do_meanqs(uint32_t *a, int n) noexcept {
|
|
|
|
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
rpivotstate state;
|
|
|
|
|
meanqs(a, 0, n - 1, &state);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** neoqs */
|
|
|
|
|
static inline void do_neoqs(uint32_t *a, int n) noexcept {
|
|
|
|
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
rpivotstate state;
|
|
|
|
|
neoqs(a, 0, n - 1, &state);
|
|
|
|
|
}
|
2024-04-11 06:13:51 +02:00
|
|
|
|
2025-05-09 01:10:12 +02:00
|
|
|
/** schwab */
|
|
|
|
|
static inline void do_schwab(uint32_t *a, int n) noexcept {
|
|
|
|
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
uint32_t junk;
|
|
|
|
|
sch_rand_state state = schwab_rand_state(junk);
|
|
|
|
|
schwab_sort(a, 0, n - 1, &state);
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-11 20:42:04 +02:00
|
|
|
/** thier2 */
|
|
|
|
|
static inline void do_thier2(uint32_t *a, int n) noexcept {
|
|
|
|
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
uint32_t junk;
|
|
|
|
|
sch_rand_state state = schwab_rand_state(junk);
|
|
|
|
|
std::vector<uint32_t> tmp(n);
|
|
|
|
|
thiersort2(a, &(tmp[0]), n, &state);
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-29 18:18:37 +02:00
|
|
|
/** thier3 */
|
|
|
|
|
static inline void do_thier3(uint32_t *a, int n) noexcept {
|
|
|
|
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
std::vector<uint32_t> tmp(n);
|
|
|
|
|
thiersort3(a, &(tmp[0]), n);
|
|
|
|
|
}
|
|
|
|
|
|
2025-10-01 16:49:00 +02:00
|
|
|
/** rthier */
|
|
|
|
|
static inline void do_rthier(uint32_t *a, int n) noexcept {
|
|
|
|
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
uint32_t junk;
|
|
|
|
|
randominus(a, n, junk);
|
|
|
|
|
std::vector<uint32_t> tmp(n);
|
|
|
|
|
thiersort3(a, &(tmp[0]), n);
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-29 02:24:50 +02:00
|
|
|
/** 3+1 pass bottom-up radix */
|
|
|
|
|
static inline void do_threepass(uint32_t *a, int n) noexcept {
|
|
|
|
|
threepass(a, n);
|
|
|
|
|
}
|
|
|
|
|
|
2024-04-11 06:13:51 +02:00
|
|
|
// mormord — Today at 2:27 AM
|
|
|
|
|
// 1 2 2 2 3
|
|
|
|
|
//
|
|
|
|
|
// 0 1 2 3 4
|
|
|
|
|
// |1|2 2 3 2
|
|
|
|
|
// 1|2|2 3 2
|
|
|
|
|
// 1|3|2 2 2
|
|
|
|
|
// 1|2|2 2 3
|
|
|
|
|
// 1|2|2 2 3
|
|
|
|
|
// 1 2|2|2 3
|
|
|
|
|
// ^
|
|
|
|
|
// Pivot
|
|
|
|
|
//
|
|
|
|
|
// állítás: pivottól balra helyükön vannak az elemek rendezettségük szerint
|
|
|
|
|
//
|
|
|
|
|
// Kezdés Indexek = Prefix összeg - 1 (utolsó helyek az elemeknek)
|
|
|
|
|
//
|
|
|
|
|
// Ha pivot új helyének meghatározot index > pivot_index (|.| helye)
|
|
|
|
|
// swap
|
|
|
|
|
// --index
|
|
|
|
|
// különben
|
|
|
|
|
// ++pivot_index
|
2024-04-11 23:59:13 +02:00
|
|
|
template<int j>
|
|
|
|
|
static inline bool morbittop(uint32_t elem) noexcept {
|
|
|
|
|
return (elem >> (8 * j)) & 0x80; // Only top bit
|
|
|
|
|
}
|
|
|
|
|
template<int j>
|
|
|
|
|
static inline uint32_t morgrab(uint32_t elem) noexcept {
|
|
|
|
|
return (elem >> (8 * j)) & 0x7f; // Only 7-bit!
|
|
|
|
|
}
|
2024-04-11 21:43:18 +02:00
|
|
|
/**
|
2024-04-11 23:59:13 +02:00
|
|
|
* Count occurences AND divides array into two partitions by its topmost bit.
|
2024-04-11 21:43:18 +02:00
|
|
|
* - Similar to quicksort partitioning, but changed accordingly.
|
|
|
|
|
* - MSB 0 bit values will come first partition.
|
|
|
|
|
*
|
|
|
|
|
* @param a The array to partition and occurence count.
|
|
|
|
|
* @param n The length of the array.
|
2024-04-11 23:59:13 +02:00
|
|
|
* @param radics1 A 128-sized array for occurence counting the bottom partition.
|
|
|
|
|
* @param radics2 A 128-sized array for occurence counting the top partition.
|
|
|
|
|
* @param DIGIT The digit in question (for a morgrab<DIGIT>(..) call)
|
2024-04-12 00:37:50 +02:00
|
|
|
* @returns The partition bounds are: [0..first) and [second..n) with logical means to mark empty partitions.
|
2024-04-11 21:43:18 +02:00
|
|
|
*/
|
2024-04-11 23:59:13 +02:00
|
|
|
template<int DIGIT>
|
|
|
|
|
static inline std::pair<uint32_t, uint32_t> oc_bit_partition(
|
|
|
|
|
uint32_t *a, uint32_t n, uint32_t *radics1, uint32_t *radics2) noexcept {
|
|
|
|
|
// See Hoare's OG quicksort why
|
2024-04-12 00:37:50 +02:00
|
|
|
int64_t i = 0;
|
|
|
|
|
int64_t j = n - 1;
|
2024-04-11 21:43:18 +02:00
|
|
|
|
|
|
|
|
while(true) {
|
|
|
|
|
// Move past well-placed ones
|
2024-04-11 23:59:13 +02:00
|
|
|
// And occurence count them
|
|
|
|
|
// Rem.: In quicksort usually a do-while loop
|
2024-04-12 00:37:50 +02:00
|
|
|
while ((i < j) && !morbittop<DIGIT>(a[i])) {
|
2024-04-11 23:59:13 +02:00
|
|
|
++radics1[morgrab<DIGIT>(a[i])];
|
|
|
|
|
++i;
|
|
|
|
|
}
|
2024-04-12 00:37:50 +02:00
|
|
|
while ((i < j) && morbittop<DIGIT>(a[j])) {
|
2024-04-11 23:59:13 +02:00
|
|
|
++radics2[morgrab<DIGIT>(a[j])];
|
|
|
|
|
--j;
|
|
|
|
|
}
|
2024-04-11 21:43:18 +02:00
|
|
|
|
|
|
|
|
// If the indices crossed, return
|
2024-04-11 23:59:13 +02:00
|
|
|
// Rem.: Not >= to ensure occ. counts! See also: (*)
|
2024-04-12 00:37:50 +02:00
|
|
|
if(i > j) return std::make_pair(i, j + 1);
|
2024-04-11 21:43:18 +02:00
|
|
|
|
2024-04-12 00:37:50 +02:00
|
|
|
// Check for swap
|
2024-04-11 23:59:13 +02:00
|
|
|
if(i < j) {
|
2024-04-12 00:37:50 +02:00
|
|
|
// Swap
|
|
|
|
|
// No need occurence count here as above loops will handle!
|
2024-04-11 23:59:13 +02:00
|
|
|
uint32_t tmp = a[i];
|
|
|
|
|
a[i] = a[j];
|
|
|
|
|
a[j] = tmp;
|
2024-04-12 00:37:50 +02:00
|
|
|
} else {
|
|
|
|
|
// i == j case: count occurence properly for the one.
|
|
|
|
|
if(!morbittop<DIGIT>(a[j])) {
|
|
|
|
|
++radics1[morgrab<DIGIT>(a[i])];
|
|
|
|
|
++i;
|
|
|
|
|
} else {
|
|
|
|
|
++radics2[morgrab<DIGIT>(a[j])];
|
|
|
|
|
--j;
|
|
|
|
|
}
|
|
|
|
|
|
2024-04-11 23:59:13 +02:00
|
|
|
}
|
2024-04-11 21:43:18 +02:00
|
|
|
}
|
|
|
|
|
}
|
2024-04-11 19:00:52 +02:00
|
|
|
template<int j>
|
|
|
|
|
static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
|
2024-04-11 16:41:08 +02:00
|
|
|
/* Preparation */
|
2024-04-11 23:59:13 +02:00
|
|
|
uint32_t radics1[128] = {0};
|
|
|
|
|
uint32_t radics2[128] = {0};
|
2024-04-11 16:41:08 +02:00
|
|
|
/* [from, to) index: only where prefix sums change - usually nonfull */
|
2024-04-11 23:59:13 +02:00
|
|
|
uint32_t real_radics1[128 * 2] = {0};
|
|
|
|
|
uint32_t real_radics2[128 * 2] = {0};
|
|
|
|
|
|
|
|
|
|
// Count occurences and partition by topmost bit
|
2024-04-12 00:37:50 +02:00
|
|
|
std::pair<uint32_t, uint32_t> boundz = oc_bit_partition<j>(a, n, radics1, radics2);
|
2024-04-11 06:13:51 +02:00
|
|
|
|
2024-04-11 16:41:08 +02:00
|
|
|
/* Prefix sum + real radics calc O(256) */
|
|
|
|
|
/* Radics: */
|
|
|
|
|
/* fr: {10, 20, 10, 0, 5, 15,...} */
|
|
|
|
|
/* to: {10, 30, 40, 40, 45, 60,..} */
|
|
|
|
|
/* Real radics: */
|
|
|
|
|
/* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */
|
|
|
|
|
/* 0. 1. 2. 4. 5. */
|
|
|
|
|
/* (because radix value 3 is not found in input) */
|
2024-04-11 23:59:13 +02:00
|
|
|
uint32_t prev1 = 0;
|
|
|
|
|
uint32_t reali1 = 0;
|
|
|
|
|
uint32_t prev2 = 0;
|
|
|
|
|
uint32_t reali2 = 0;
|
2024-04-11 19:54:07 +02:00
|
|
|
#pragma GCC unroll 16
|
2024-04-11 23:59:13 +02:00
|
|
|
for(int i = 0; i < 128; ++i) {
|
|
|
|
|
// Hopefully we get more ILP out of this
|
|
|
|
|
// Also I tried branchless before adding
|
|
|
|
|
// ILP here and it slowed things, so first
|
|
|
|
|
// let us try it with branch prediction!
|
|
|
|
|
if(radics1[i] != 0) {
|
|
|
|
|
radics1[i] += prev1;
|
|
|
|
|
real_radics1[reali1] = prev1;
|
|
|
|
|
real_radics1[reali1 + 1] = radics1[i];
|
|
|
|
|
prev1 = radics1[i];
|
|
|
|
|
reali1 += 2;
|
2024-04-11 20:02:23 +02:00
|
|
|
} else {
|
2024-04-11 23:59:13 +02:00
|
|
|
radics1[i] += prev1;
|
|
|
|
|
prev1 = radics1[i];
|
2024-04-11 19:54:07 +02:00
|
|
|
}
|
2024-04-11 23:59:13 +02:00
|
|
|
if(radics2[i] != 0) {
|
|
|
|
|
radics2[i] += prev2;
|
|
|
|
|
real_radics2[reali2] = prev2;
|
|
|
|
|
real_radics2[reali2 + 1] = radics2[i];
|
|
|
|
|
prev2 = radics2[i];
|
|
|
|
|
reali2 += 2;
|
2024-04-11 20:02:23 +02:00
|
|
|
} else {
|
2024-04-11 23:59:13 +02:00
|
|
|
radics2[i] += prev2;
|
|
|
|
|
prev2 = radics2[i];
|
2024-04-11 20:02:23 +02:00
|
|
|
}
|
2024-04-11 06:13:51 +02:00
|
|
|
}
|
|
|
|
|
|
2024-04-11 23:59:13 +02:00
|
|
|
// Inplace swap, with added ILP / branchless opt.
|
|
|
|
|
// Without it its data dependent like crazy...
|
|
|
|
|
uint32_t pivoti1 = 0;
|
2024-04-12 00:37:50 +02:00
|
|
|
uint32_t pivoti2 = boundz.second;
|
2024-04-12 01:09:59 +02:00
|
|
|
while((pivoti1 < boundz.first) && (pivoti2 < n)) {
|
2024-04-11 23:59:13 +02:00
|
|
|
|
|
|
|
|
/* Pivot 1 */
|
|
|
|
|
|
|
|
|
|
uint32_t radixval1 = morgrab<j>(a[pivoti1]);
|
|
|
|
|
uint32_t targeti1 = --radics1[radixval1]; // dec index (!)
|
|
|
|
|
|
|
|
|
|
// Bitmask: true -> 11.....1; false -> 00.....0
|
|
|
|
|
uint32_t mask1 = ~((targeti1 > pivoti1) - 1);
|
|
|
|
|
|
|
|
|
|
// Branchless swap (using bitmask)
|
|
|
|
|
uint32_t delta1 = (a[pivoti1] ^ a[targeti1]) & mask1;
|
|
|
|
|
a[pivoti1] = a[pivoti1] ^ delta1;
|
|
|
|
|
a[targeti1] = a[targeti1] ^ delta1;
|
|
|
|
|
|
|
|
|
|
// "else" branch
|
|
|
|
|
pivoti1 += !mask1;
|
|
|
|
|
radics1[radixval1] += !mask1; // undec index (!)
|
|
|
|
|
|
|
|
|
|
/* Pivot 2 */
|
|
|
|
|
|
|
|
|
|
uint32_t radixval2 = morgrab<j>(a[pivoti2]);
|
2025-05-06 03:06:37 +02:00
|
|
|
uint32_t targeti2 = boundz.second + (--radics2[radixval2]); // dec index (!)
|
2024-04-11 23:59:13 +02:00
|
|
|
|
|
|
|
|
// Bitmask: true -> 11.....1; false -> 00.....0
|
|
|
|
|
uint32_t mask2 = ~((targeti2 > pivoti2) - 1);
|
|
|
|
|
|
|
|
|
|
// Branchless swap (using bitmask)
|
|
|
|
|
uint32_t delta2 = (a[pivoti2] ^ a[targeti2]) & mask2;
|
|
|
|
|
a[pivoti2] = a[pivoti2] ^ delta2;
|
|
|
|
|
a[targeti2] = a[targeti2] ^ delta2;
|
|
|
|
|
|
|
|
|
|
// "else" branch
|
|
|
|
|
pivoti2 += !mask2;
|
|
|
|
|
radics2[radixval2] += !mask2; // undec index (!)
|
|
|
|
|
}
|
2024-04-12 01:09:59 +02:00
|
|
|
// Finish pivot1 if there are still elements..
|
|
|
|
|
while(pivoti1 < boundz.first) {
|
|
|
|
|
|
|
|
|
|
/* Pivot 1+ */
|
|
|
|
|
|
|
|
|
|
uint32_t radixval1 = morgrab<j>(a[pivoti1]);
|
|
|
|
|
uint32_t targeti1 = --radics1[radixval1]; // dec index (!)
|
|
|
|
|
|
|
|
|
|
// Bitmask: true -> 11.....1; false -> 00.....0
|
|
|
|
|
uint32_t mask1 = ~((targeti1 > pivoti1) - 1);
|
|
|
|
|
|
|
|
|
|
// Branchless swap (using bitmask)
|
|
|
|
|
uint32_t delta1 = (a[pivoti1] ^ a[targeti1]) & mask1;
|
|
|
|
|
a[pivoti1] = a[pivoti1] ^ delta1;
|
|
|
|
|
a[targeti1] = a[targeti1] ^ delta1;
|
|
|
|
|
|
|
|
|
|
// "else" branch
|
|
|
|
|
pivoti1 += !mask1;
|
|
|
|
|
radics1[radixval1] += !mask1; // undec index (!)
|
|
|
|
|
}
|
|
|
|
|
// Finish pivot2 if there are still elements..
|
|
|
|
|
while(pivoti2 < n) {
|
|
|
|
|
|
|
|
|
|
/* Pivot 2+ */
|
|
|
|
|
|
|
|
|
|
uint32_t radixval2 = morgrab<j>(a[pivoti2]);
|
2025-05-06 03:06:37 +02:00
|
|
|
uint32_t targeti2 = boundz.second + (--radics2[radixval2]); // dec index (!)
|
2024-04-12 01:09:59 +02:00
|
|
|
|
|
|
|
|
// Bitmask: true -> 11.....1; false -> 00.....0
|
|
|
|
|
uint32_t mask2 = ~((targeti2 > pivoti2) - 1);
|
|
|
|
|
|
|
|
|
|
// Branchless swap (using bitmask)
|
|
|
|
|
uint32_t delta2 = (a[pivoti2] ^ a[targeti2]) & mask2;
|
|
|
|
|
a[pivoti2] = a[pivoti2] ^ delta2;
|
|
|
|
|
a[targeti2] = a[targeti2] ^ delta2;
|
|
|
|
|
|
|
|
|
|
// "else" branch
|
|
|
|
|
pivoti2 += !mask2;
|
|
|
|
|
radics2[radixval2] += !mask2; // undec index (!)
|
|
|
|
|
}
|
2024-04-11 23:59:13 +02:00
|
|
|
|
|
|
|
|
// Possible recursions
|
2024-04-11 19:00:52 +02:00
|
|
|
if constexpr (j != 0) {
|
2024-04-11 23:59:13 +02:00
|
|
|
/* Partition 1 recursions */
|
|
|
|
|
for(int i = 0; i < reali1; i += 2) {
|
|
|
|
|
/* inclusive */
|
|
|
|
|
uint32_t from = real_radics1[i];
|
|
|
|
|
/* non-inclusive */
|
|
|
|
|
uint32_t to = real_radics1[i + 1];
|
|
|
|
|
mormord_sort_impl<j - 1>(&a[from], (to - (from)));
|
|
|
|
|
}
|
|
|
|
|
/* Partition 2 recursions */
|
|
|
|
|
for(int i = 0; i < reali2; i += 2) {
|
2024-04-11 19:00:52 +02:00
|
|
|
/* inclusive */
|
2024-04-11 23:59:13 +02:00
|
|
|
uint32_t from = real_radics2[i];
|
2024-04-11 19:00:52 +02:00
|
|
|
/* non-inclusive */
|
2024-04-11 23:59:13 +02:00
|
|
|
uint32_t to = real_radics2[i + 1];
|
2024-04-11 19:00:52 +02:00
|
|
|
mormord_sort_impl<j - 1>(&a[from], (to - (from)));
|
|
|
|
|
}
|
2024-04-11 06:13:51 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
static inline void mormord_sort(uint32_t *a, int n) noexcept {
|
|
|
|
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
2024-04-11 19:00:52 +02:00
|
|
|
mormord_sort_impl<3>(a, n);
|
2024-04-11 06:13:51 +02:00
|
|
|
}
|
|
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
void fourpass(uint32_t *a, int n) {
|
|
|
|
|
assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
// alloc helper buffers.
|
|
|
|
|
int sz = n * sizeof(a[0]);
|
|
|
|
|
std::vector<int> bucketdata(1 << 8);
|
|
|
|
|
uint32_t *buf = (uint32_t *)malloc(sz);
|
|
|
|
|
assert(buf != NULL);
|
|
|
|
|
uint32_t *src = a, *dst = buf;
|
|
|
|
|
uintptr_t swapmask = (uintptr_t)a ^ (uintptr_t)buf;
|
|
|
|
|
for (int shift = 0; shift < 32; shift += 8) {
|
2021-12-15 12:52:33 +01:00
|
|
|
memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
|
2022-12-10 11:04:53 +01:00
|
|
|
for (int i = 0; i < n; i++) bucketdata[src[i] >> shift & 0xff]++;
|
|
|
|
|
int offset = 0;
|
|
|
|
|
for (int i = 0; i < 1 << 8; i++) {
|
2021-12-15 12:52:33 +01:00
|
|
|
int d = bucketdata[i];
|
|
|
|
|
bucketdata[i] = offset;
|
|
|
|
|
offset += d;
|
|
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
for (int i = 0; i < n; i++) {
|
|
|
|
|
dst[bucketdata[src[i] >> shift & 0xff]++] = src[i];
|
2021-12-15 12:52:33 +01:00
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
src = (uint32_t *)((uintptr_t)src ^ swapmask);
|
|
|
|
|
dst = (uint32_t *)((uintptr_t)dst ^ swapmask);
|
2021-12-15 12:52:33 +01:00
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
free(buf);
|
|
|
|
|
}
|
2021-12-15 16:09:40 +01:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
/** Only werks für das fourpassu! */
|
|
|
|
|
void my_memset(int *v) {
|
|
|
|
|
memset(v, 0, (1 << 8) * sizeof(int));
|
|
|
|
|
}
|
2021-12-15 16:09:40 +01:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
// hand-unrolled fourpass.
|
|
|
|
|
void fourpassu(uint32_t *a, int n) {
|
|
|
|
|
assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
|
|
|
|
|
// alloc helper buffers.
|
|
|
|
|
int sz = n * sizeof(a[0]);
|
2021-12-15 16:09:40 +01:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
static thread_local int bucketdata[1 << 8];
|
|
|
|
|
my_memset(bucketdata);
|
2021-12-15 16:09:40 +01:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
uint32_t *buf = (uint32_t *)malloc(sz);
|
|
|
|
|
assert(buf != NULL);
|
|
|
|
|
// pass 1: sort by lower 8 bits.
|
|
|
|
|
#pragma GCC unroll 32
|
|
|
|
|
for (int i = 0; i < n; i++) bucketdata[a[i] & 0xff]++;
|
|
|
|
|
int offset = 0;
|
|
|
|
|
#pragma GCC unroll 8
|
|
|
|
|
for (int i = 0; i < 1 << 8; i++) {
|
|
|
|
|
int d = bucketdata[i];
|
|
|
|
|
bucketdata[i] = offset;
|
|
|
|
|
offset += d;
|
|
|
|
|
}
|
|
|
|
|
for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xff]++] = a[i];
|
|
|
|
|
// pass 2: sort by 2nd 8 bits.
|
|
|
|
|
my_memset(bucketdata);
|
|
|
|
|
#pragma GCC unroll 32
|
|
|
|
|
for (int i = 0; i < n; i++) bucketdata[buf[i] >> 8 & 0xff]++;
|
|
|
|
|
offset = 0;
|
|
|
|
|
#pragma GCC unroll 8
|
|
|
|
|
for (int i = 0; i < 1 << 8; i++) {
|
|
|
|
|
int d = bucketdata[i];
|
|
|
|
|
bucketdata[i] = offset;
|
|
|
|
|
offset += d;
|
|
|
|
|
}
|
|
|
|
|
#pragma GCC unroll 64
|
|
|
|
|
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 8 & 0xff]++] = buf[i];
|
|
|
|
|
// pass 3: sort by 3rd 8 bits.
|
|
|
|
|
my_memset(bucketdata);
|
|
|
|
|
#pragma GCC unroll 32
|
|
|
|
|
for (int i = 0; i < n; i++) bucketdata[a[i] >> 16 & 0xff]++;
|
|
|
|
|
offset = 0;
|
|
|
|
|
#pragma GCC unroll 8
|
|
|
|
|
for (int i = 0; i < 1 << 8; i++) {
|
|
|
|
|
int d = bucketdata[i];
|
|
|
|
|
bucketdata[i] = offset;
|
|
|
|
|
offset += d;
|
|
|
|
|
}
|
|
|
|
|
#pragma GCC unroll 64
|
|
|
|
|
for (int i = 0; i < n; i++) buf[bucketdata[a[i] >> 16 & 0xff]++] = a[i];
|
|
|
|
|
// pass 4: sort by 4th 8 bits.
|
|
|
|
|
my_memset(bucketdata);
|
|
|
|
|
#pragma GCC unroll 32
|
|
|
|
|
for (int i = 0; i < n; i++) bucketdata[buf[i] >> 24 & 0xff]++;
|
|
|
|
|
offset = 0;
|
|
|
|
|
#pragma GCC unroll 8
|
|
|
|
|
for (int i = 0; i < 1 << 8; i++) {
|
|
|
|
|
int d = bucketdata[i];
|
|
|
|
|
bucketdata[i] = offset;
|
|
|
|
|
offset += d;
|
|
|
|
|
}
|
|
|
|
|
#pragma GCC unroll 32
|
|
|
|
|
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i];
|
|
|
|
|
free(buf);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline uint32_t byterotate(uint32_t x) { return (x >> 8) | (x << 24); }
|
|
|
|
|
void fourrots(uint32_t *arr, int n) {
|
|
|
|
|
assert(n * int64_t(sizeof(arr[0])) <= INT_MAX);
|
|
|
|
|
assert(n % 4 == 0);
|
|
|
|
|
// alloc helper buffers.
|
|
|
|
|
int sz = n * sizeof(arr[0]);
|
|
|
|
|
std::vector<int> bucketdata(1 << 8);
|
|
|
|
|
int *btd = &bucketdata[0];
|
|
|
|
|
uint32_t *buf = (uint32_t *)malloc(sz);
|
|
|
|
|
assert(buf != NULL);
|
|
|
|
|
uint32_t *src = arr, *dst = buf;
|
|
|
|
|
uintptr_t swapmask = (uintptr_t)arr ^ (uintptr_t)buf;
|
|
|
|
|
uint32_t a, b, c, d;
|
|
|
|
|
uint32_t abt, bbt, cbt, dbt;
|
|
|
|
|
for (int shift = 0; shift < 32; shift += 8) {
|
|
|
|
|
memset(btd, 0, bucketdata.size() * sizeof(bucketdata[0]));
|
|
|
|
|
for (int i = 0; i < n; i += 4) {
|
|
|
|
|
a = src[i];
|
|
|
|
|
b = src[i + 1];
|
|
|
|
|
c = src[i + 2];
|
|
|
|
|
d = src[i + 3];
|
|
|
|
|
abt = a & 0xff;
|
|
|
|
|
bbt = b & 0xff;
|
|
|
|
|
cbt = c & 0xff;
|
|
|
|
|
dbt = d & 0xff;
|
|
|
|
|
btd[abt]++;
|
|
|
|
|
btd[bbt]++;
|
|
|
|
|
btd[cbt]++;
|
|
|
|
|
btd[dbt]++;
|
2021-12-15 12:52:33 +01:00
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
int offset = 0;
|
2021-12-15 12:52:33 +01:00
|
|
|
for (int i = 0; i < 1 << 8; i++) {
|
|
|
|
|
int d = bucketdata[i];
|
|
|
|
|
bucketdata[i] = offset;
|
|
|
|
|
offset += d;
|
|
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
for (int i = 0; i < n; i += 4) {
|
|
|
|
|
a = src[i];
|
|
|
|
|
b = src[i + 1];
|
|
|
|
|
c = src[i + 2];
|
|
|
|
|
d = src[i + 3];
|
|
|
|
|
abt = a & 0xff;
|
|
|
|
|
bbt = b & 0xff;
|
|
|
|
|
cbt = c & 0xff;
|
|
|
|
|
dbt = d & 0xff;
|
|
|
|
|
dst[btd[abt]++] = byterotate(a);
|
|
|
|
|
dst[btd[bbt]++] = byterotate(b);
|
|
|
|
|
dst[btd[cbt]++] = byterotate(c);
|
|
|
|
|
dst[btd[dbt]++] = byterotate(d);
|
2021-12-15 12:52:33 +01:00
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
src = (uint32_t *)((uintptr_t)src ^ swapmask);
|
|
|
|
|
dst = (uint32_t *)((uintptr_t)dst ^ swapmask);
|
2021-12-15 12:52:33 +01:00
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
free(buf);
|
|
|
|
|
}
|
2021-12-15 16:09:40 +01:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
// frewr - four rewrites.
|
2025-10-01 19:09:18 +02:00
|
|
|
// Rem.: I realized this changes the keys (rerwrites) by rotating them by 8 in each pass
|
|
|
|
|
// So in pass 0 we just & 0xff and put in place, but stored v gets rotated right by 8!
|
|
|
|
|
// So in pass 1 we can just & 0xff again - and store again rotated
|
|
|
|
|
// Because there are 4 rotates and 4 radices - in the end the "real" values "came back" as keys.
|
|
|
|
|
// Currently (in October 1, 2025) this is fastest, but I KNOW this was slower than Magyar in 2021 dec (slightly)
|
|
|
|
|
// Interestingly if I build that old build now, its already faster than my Magyarsort (and it misses some later opt)
|
|
|
|
|
// So I imagine this is because compiler technology became better and favors this algorithm more now!
|
|
|
|
|
// This algorithm is made by Ypsu - my friend. He made it originally as a response to Magyarsort...
|
|
|
|
|
// I added and measured unrolls to this and some minor optimizations - but not understanding the alg back then...
|
2022-12-10 11:04:53 +01:00
|
|
|
void frewr(uint32_t *arr, int n) {
|
|
|
|
|
uint32_t *tmpbuf = (uint32_t *)malloc(n * 4);
|
|
|
|
|
mlock(tmpbuf, n * 4);
|
|
|
|
|
int btoffsets[4][256] = {};
|
|
|
|
|
#pragma GCC unroll 64
|
|
|
|
|
for (int i = n - 1; i >= 0; i--) {
|
|
|
|
|
uint32_t a = arr[i];
|
|
|
|
|
btoffsets[3][a & 0xff]++;
|
|
|
|
|
btoffsets[2][a >> 8 & 0xff]++;
|
|
|
|
|
btoffsets[1][a >> 16 & 0xff]++;
|
|
|
|
|
btoffsets[0][a >> 24 & 0xff]++;
|
|
|
|
|
}
|
|
|
|
|
int btend[4] = {n - 1, n - 1, n - 1, n - 1};
|
|
|
|
|
#pragma GCC unroll 16
|
|
|
|
|
for (int i = 255; i >= 0; i--) {
|
|
|
|
|
#pragma GCC unroll 4
|
|
|
|
|
for (int pass = 3; pass >= 0; pass--) {
|
|
|
|
|
int nbtend = btend[pass] - btoffsets[pass][i];
|
|
|
|
|
btoffsets[pass][i] = btend[pass];
|
|
|
|
|
btend[pass] = nbtend;
|
2021-12-17 19:20:58 +01:00
|
|
|
}
|
|
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
uint32_t *src = arr, *dst = tmpbuf;
|
|
|
|
|
#pragma GCC unroll 4
|
|
|
|
|
for (int pass = 3; pass >= 0; pass--) {
|
|
|
|
|
int *off = btoffsets[pass];
|
2021-12-19 21:55:48 +01:00
|
|
|
#pragma GCC unroll 64
|
|
|
|
|
for (int i = n - 1; i >= 0; i--) {
|
2022-12-10 11:04:53 +01:00
|
|
|
uint32_t v = src[i];
|
|
|
|
|
dst[off[v & 0xff]--] = v >> 8 | v << 24;
|
|
|
|
|
__builtin_prefetch(&dst[off[v & 0xff] - 2]);
|
2021-12-19 21:55:48 +01:00
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
uint32_t *tmp = src;
|
|
|
|
|
src = dst;
|
|
|
|
|
dst = tmp;
|
2021-12-19 21:55:48 +01:00
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
munlock(tmpbuf, n * 4);
|
|
|
|
|
free(tmpbuf);
|
|
|
|
|
}
|
2021-12-19 21:55:48 +01:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
void vsort(uint32_t *a, int n) {
|
|
|
|
|
thread_local std::vector<uint32_t> bts[256];
|
|
|
|
|
#pragma GCC unroll 4
|
|
|
|
|
for (int shift = 0; shift < 32; shift += 8) {
|
|
|
|
|
#pragma GCC unroll 64
|
|
|
|
|
for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]);
|
|
|
|
|
#pragma GCC unroll 64
|
|
|
|
|
for (int bt = 0, k = 0; bt < 256; bt++) {
|
|
|
|
|
memcpy(a + k, &bts[bt][0], bts[bt].size() * sizeof(a[0]));
|
|
|
|
|
k += bts[bt].size();
|
|
|
|
|
bts[bt].clear();
|
2021-12-15 12:52:33 +01:00
|
|
|
}
|
|
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
}
|
2023-06-30 16:39:33 +02:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
void pagedsort(uint32_t *a, int n) {
|
|
|
|
|
enum { pagesize = 1024 };
|
|
|
|
|
int pagecount = (n + pagesize - 1) / pagesize + 512;
|
|
|
|
|
uint32_t *pd = (uint32_t *)malloc(pagecount * pagesize * sizeof(a[0]));
|
|
|
|
|
std::vector<int> freelist(pagecount);
|
|
|
|
|
std::vector<int> next(pagecount);
|
|
|
|
|
std::iota(std::begin(freelist), std::end(freelist), 0);
|
|
|
|
|
struct bucket {
|
|
|
|
|
int len;
|
|
|
|
|
int headpage, lastpage;
|
|
|
|
|
};
|
|
|
|
|
bucket bts[512];
|
|
|
|
|
// initial scatter.
|
|
|
|
|
for (int bt = 0; bt < 256; bt++) {
|
|
|
|
|
int p = freelist.back();
|
|
|
|
|
freelist.pop_back();
|
|
|
|
|
bts[bt] = {0, p, p};
|
|
|
|
|
}
|
|
|
|
|
for (int i = 0; i < n; i++) {
|
|
|
|
|
bucket *bt = &bts[a[i] & 0xff];
|
|
|
|
|
pd[bt->lastpage * pagesize + bt->len++ % pagesize] = a[i];
|
|
|
|
|
if (bt->len % pagesize == 0) {
|
2021-12-15 12:52:33 +01:00
|
|
|
int p = freelist.back();
|
|
|
|
|
freelist.pop_back();
|
2022-12-10 11:04:53 +01:00
|
|
|
next[bt->lastpage] = p;
|
|
|
|
|
bt->lastpage = p;
|
2021-12-15 12:52:33 +01:00
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
}
|
|
|
|
|
// intermediate level scatters.
|
|
|
|
|
int ibase = 0, obase = 256;
|
|
|
|
|
for (int shift = 8; shift < 32; shift += 8) {
|
|
|
|
|
for (int bt = 0; bt < 256; bt++) {
|
|
|
|
|
int p = freelist.back();
|
|
|
|
|
freelist.pop_back();
|
|
|
|
|
bts[obase + bt] = {0, p, p};
|
2021-12-15 12:52:33 +01:00
|
|
|
}
|
|
|
|
|
for (int ibti = 0; ibti < 256; ibti++) {
|
|
|
|
|
struct bucket *ibt = &bts[ibase + ibti];
|
|
|
|
|
int page = ibt->headpage;
|
|
|
|
|
for (int i = 0; i < ibt->len; i++) {
|
2022-12-10 11:04:53 +01:00
|
|
|
uint32_t v = pd[page * pagesize + i % pagesize];
|
|
|
|
|
struct bucket *obt = &bts[obase + (v >> shift & 0xff)];
|
|
|
|
|
pd[obt->lastpage * pagesize + obt->len++ % pagesize] = v;
|
|
|
|
|
if (obt->len % pagesize == 0) {
|
|
|
|
|
int p = freelist.back();
|
|
|
|
|
freelist.pop_back();
|
|
|
|
|
next[obt->lastpage] = p;
|
|
|
|
|
obt->lastpage = p;
|
|
|
|
|
}
|
2021-12-15 12:52:33 +01:00
|
|
|
if (i % pagesize == pagesize - 1) {
|
2022-12-10 11:04:53 +01:00
|
|
|
freelist.push_back(page);
|
2021-12-15 12:52:33 +01:00
|
|
|
page = next[page];
|
|
|
|
|
}
|
|
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
freelist.push_back(ibt->lastpage);
|
2021-12-15 12:52:33 +01:00
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
ibase = 256 - ibase;
|
|
|
|
|
obase = 256 - obase;
|
2021-12-15 12:52:33 +01:00
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
// the final gather.
|
|
|
|
|
int k = 0;
|
|
|
|
|
for (int ibti = 0; ibti < 256; ibti++) {
|
|
|
|
|
struct bucket *ibt = &bts[ibase + ibti];
|
|
|
|
|
int page = ibt->headpage;
|
|
|
|
|
for (int i = 0; i < ibt->len; i++) {
|
|
|
|
|
a[k++] = pd[page * pagesize + i % pagesize];
|
|
|
|
|
if (i % pagesize == pagesize - 1) {
|
|
|
|
|
page = next[page];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
free(pd);
|
|
|
|
|
}
|
2021-12-15 12:52:33 +01:00
|
|
|
|
2023-06-30 18:00:44 +02:00
|
|
|
// I measured this being faster than std::sort which is giga-lol wtf...
|
|
|
|
|
void thier_quicksort(uint32_t *arr, int n) {
|
|
|
|
|
// Prepare: O(n)
|
|
|
|
|
tselem *tarr = thiersort_prepare_array(
|
|
|
|
|
arr,
|
|
|
|
|
// union tskey (askey)(void *elem),
|
|
|
|
|
[] (void *elem) {
|
|
|
|
|
tskey k;
|
|
|
|
|
k.u = *((uint32_t *)elem);
|
|
|
|
|
return k;
|
|
|
|
|
},
|
|
|
|
|
4, // elemsize,
|
|
|
|
|
n, // length,
|
|
|
|
|
malloc);
|
|
|
|
|
|
2023-07-01 04:50:32 +02:00
|
|
|
/*
|
2023-06-30 22:06:24 +02:00
|
|
|
for(uint32_t i = 0; i < n; ++i) {
|
|
|
|
|
printf("In: %d @%d\n", tarr[i].key.u, tarr[i].i);
|
|
|
|
|
}
|
2023-07-01 04:50:32 +02:00
|
|
|
*/
|
2023-06-30 22:06:24 +02:00
|
|
|
|
2023-06-30 18:00:44 +02:00
|
|
|
// Quicksort by me
|
|
|
|
|
ts_quicksort_inplace(
|
|
|
|
|
tarr,
|
|
|
|
|
0, // from
|
|
|
|
|
n, // to
|
|
|
|
|
ts_lt_uint,
|
|
|
|
|
nullptr);
|
|
|
|
|
|
2023-07-01 04:50:32 +02:00
|
|
|
/*
|
2023-06-30 22:06:24 +02:00
|
|
|
for(uint32_t i = 0; i < n; ++i) {
|
|
|
|
|
printf("Out: %d @%d\n", tarr[i].key.u, tarr[i].i);
|
|
|
|
|
}
|
2023-07-01 04:50:32 +02:00
|
|
|
*/
|
2023-06-30 22:06:24 +02:00
|
|
|
|
2023-06-30 18:00:44 +02:00
|
|
|
// Apply: O(n)
|
|
|
|
|
uint32_t tmp[1]; // needed for elem swaps
|
|
|
|
|
thiersort_apply(
|
|
|
|
|
tarr,
|
|
|
|
|
arr,
|
|
|
|
|
n,
|
|
|
|
|
4, // elemsize
|
|
|
|
|
tmp,
|
|
|
|
|
free);
|
|
|
|
|
}
|
|
|
|
|
|
2023-06-30 16:39:33 +02:00
|
|
|
void thiersort_uintkey8(uint32_t *arr, int n) {
|
|
|
|
|
// Prepare: O(n)
|
|
|
|
|
tselem *tarr = thiersort_prepare_array(
|
|
|
|
|
arr,
|
|
|
|
|
// union tskey (askey)(void *elem),
|
|
|
|
|
[] (void *elem) {
|
|
|
|
|
tskey k;
|
|
|
|
|
k.u = *((uint32_t *)elem);
|
|
|
|
|
return k;
|
|
|
|
|
},
|
|
|
|
|
4, // elemsize,
|
|
|
|
|
n, // length,
|
|
|
|
|
malloc);
|
|
|
|
|
|
2023-06-30 18:00:44 +02:00
|
|
|
/*
|
2023-06-30 16:39:33 +02:00
|
|
|
for(uint32_t i = 0; i < n; ++i) {
|
2023-06-30 17:00:37 +02:00
|
|
|
printf("In: %d @%d\n", tarr[i].key.u, tarr[i].i);
|
2023-06-30 16:39:33 +02:00
|
|
|
}
|
2023-06-30 18:00:44 +02:00
|
|
|
*/
|
2023-06-30 16:39:33 +02:00
|
|
|
|
|
|
|
|
// Sort: O(n*loglogn on amortized on random input):
|
|
|
|
|
thiersort8_uintkey(
|
|
|
|
|
tarr,
|
|
|
|
|
n,
|
|
|
|
|
malloc,
|
|
|
|
|
free);
|
|
|
|
|
|
2023-06-30 18:00:44 +02:00
|
|
|
/*
|
2023-06-30 16:39:33 +02:00
|
|
|
for(uint32_t i = 0; i < n; ++i) {
|
2023-06-30 17:00:37 +02:00
|
|
|
printf("Out: %d @%d\n", tarr[i].key.u, tarr[i].i);
|
2023-06-30 16:39:33 +02:00
|
|
|
}
|
2023-06-30 18:00:44 +02:00
|
|
|
*/
|
2023-06-30 16:39:33 +02:00
|
|
|
|
|
|
|
|
// Apply: O(n)
|
|
|
|
|
uint32_t tmp[1]; // needed for elem swaps
|
|
|
|
|
thiersort_apply(
|
|
|
|
|
tarr,
|
|
|
|
|
arr,
|
|
|
|
|
n,
|
|
|
|
|
4, // elemsize
|
|
|
|
|
tmp,
|
|
|
|
|
free);
|
|
|
|
|
}
|
|
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
// to measure / profile a single variant
|
|
|
|
|
void measure_single(int n) {
|
|
|
|
|
for (auto inputtype : inputtypes) {
|
|
|
|
|
printf("%10s", inputtype.c_str());
|
|
|
|
|
fflush(stdout);
|
|
|
|
|
std::vector<uint32_t> v(n);
|
|
|
|
|
v = geninput(inputtype, n);
|
|
|
|
|
//measure(inputtype, "sp", [&] { spsort(&v[0], v.size()); });
|
2025-09-12 02:25:57 +02:00
|
|
|
//measure(inputtype, "magyar", [&] { MagyarSort::sort<uint32_t>(&v[0], v.size()); });
|
2025-09-29 02:24:50 +02:00
|
|
|
//measure(inputtype, "thier2", [&] { do_thier2(&v[0], v.size()); });
|
2025-09-30 22:18:10 +02:00
|
|
|
measure(inputtype, "threep", [&] { do_threepass(&v[0], v.size()); });
|
2022-08-16 18:37:13 +02:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
for (auto r : results) printf("%9.3fs", r.second);
|
2022-08-16 18:37:13 +02:00
|
|
|
puts("");
|
|
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
puts("");
|
|
|
|
|
printf("%10s", "worst");
|
|
|
|
|
for (auto w : worst) printf("%9.3fs", w.second);
|
|
|
|
|
puts("");
|
|
|
|
|
printf("%10s", "");
|
|
|
|
|
for (auto w : worst) printf("%10s", w.first.c_str());
|
|
|
|
|
puts("");
|
|
|
|
|
}
|
2022-08-16 18:37:13 +02:00
|
|
|
|
2025-09-29 02:24:50 +02:00
|
|
|
int main(int argc, char **argv) {
|
2023-06-30 16:39:33 +02:00
|
|
|
//int n = 100000000;
|
2025-09-27 01:24:18 +02:00
|
|
|
//int n = 10000000;
|
|
|
|
|
int n = 5000000;
|
2025-09-12 02:25:57 +02:00
|
|
|
//int n = 1000000;
|
2023-07-01 06:37:20 +02:00
|
|
|
//int n = 100000;
|
2025-09-12 01:42:11 +02:00
|
|
|
//int n = 20001;
|
2025-05-06 03:06:37 +02:00
|
|
|
//int n = 20000;
|
2024-04-11 16:41:08 +02:00
|
|
|
//int n = 1000;
|
2025-05-06 03:06:37 +02:00
|
|
|
//int n = 200;
|
|
|
|
|
//int n = 170;
|
2023-06-30 18:00:44 +02:00
|
|
|
//int n = 100;
|
2025-05-06 03:06:37 +02:00
|
|
|
//int n = 180;
|
2025-09-11 20:42:04 +02:00
|
|
|
//int n = 20;
|
2023-06-30 16:39:33 +02:00
|
|
|
|
2025-09-29 02:24:50 +02:00
|
|
|
if(argc > 1) {
|
|
|
|
|
const char* arg = argv[1];
|
|
|
|
|
n = atoi(arg);
|
|
|
|
|
}
|
|
|
|
|
|
2023-06-30 16:39:33 +02:00
|
|
|
printf("Sorting %d elements:\n\n", n);
|
2022-08-16 18:37:13 +02:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
// Uncomment this for profiling and alg!
|
2025-10-01 16:49:00 +02:00
|
|
|
// measure_single(n);
|
|
|
|
|
// return 0;
|
2022-08-16 18:37:13 +02:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
for (auto inputtype : inputtypes) {
|
|
|
|
|
printf("%10s", inputtype.c_str());
|
2025-05-06 03:06:37 +02:00
|
|
|
// fflush(stdout);
|
2022-12-10 11:04:53 +01:00
|
|
|
std::vector<uint32_t> v(n), w(n), expected(n);
|
|
|
|
|
v = geninput(inputtype, n);
|
|
|
|
|
measure(inputtype, "copy", [&] { w = v; });
|
|
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "std", [&] { std::sort(std::begin(w), std::end(w)); });
|
|
|
|
|
expected = w;
|
2025-05-06 03:06:37 +02:00
|
|
|
|
|
|
|
|
/*
|
2022-12-10 11:04:53 +01:00
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "ska", [&] { ska_sort(std::begin(w), std::end(w)); });
|
2025-09-12 01:42:11 +02:00
|
|
|
*/
|
2022-12-10 11:04:53 +01:00
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "ska_copy", [&] {
|
|
|
|
|
std::vector<uint32_t> buf(w.size());
|
|
|
|
|
if (ska_sort_copy(std::begin(w), std::end(w), std::begin(buf))) {
|
|
|
|
|
w.swap(buf);
|
|
|
|
|
}
|
|
|
|
|
});
|
2025-09-29 02:24:50 +02:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
w = v;
|
2025-10-01 16:49:00 +02:00
|
|
|
measure(inputtype, "magyar", [&] {
|
|
|
|
|
MagyarSort::sort<uint32_t>(&w[0], w.size());
|
|
|
|
|
});
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
|
|
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "rmagyar", [&] {
|
|
|
|
|
uint32_t junk;
|
|
|
|
|
randominus(&w[0], w.size(), junk);
|
|
|
|
|
MagyarSort::sort<uint32_t>(&w[0], w.size());
|
|
|
|
|
});
|
2022-12-10 11:04:53 +01:00
|
|
|
assert(w == expected);
|
2025-05-06 03:06:37 +02:00
|
|
|
|
2025-09-12 01:42:11 +02:00
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
|
2025-05-06 03:06:37 +02:00
|
|
|
/*
|
2024-04-11 06:13:51 +02:00
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "mormord", [&] { mormord_sort(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
2021-12-15 12:52:33 +01:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "2pass", [&] { twopass(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "4pass", [&] { fourpass(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "psort", [&] { pagedsort(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "4pasu", [&] { fourpassu(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "4rot", [&] { fourrots(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "sp", [&] { spsort(&w[0], w.size()); });
|
2025-05-06 03:06:37 +02:00
|
|
|
assert(w == expected);
|
2023-06-30 22:06:24 +02:00
|
|
|
w = v;
|
2025-05-06 03:06:37 +02:00
|
|
|
measure(inputtype, "gpt_qsort", [&] { gpt_quicksort(w); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "qsort", [&] { do_qsort(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "zssort", [&] { do_zsssort(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "qsr", [&] { do_qsr(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "zsr", [&] { do_zsr(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "qsr3", [&] { do_qsr3(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
2025-05-09 01:10:12 +02:00
|
|
|
*/
|
2025-09-11 20:42:04 +02:00
|
|
|
|
|
|
|
|
/*
|
2025-05-06 03:06:37 +02:00
|
|
|
w = v;
|
2025-09-11 20:42:04 +02:00
|
|
|
measure(inputtype, "zsrc", [&] { do_zsrc(&w[0], w.size()); });
|
2025-05-06 03:06:37 +02:00
|
|
|
assert(w == expected);
|
2025-09-11 20:42:04 +02:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/*
|
2025-05-06 03:06:37 +02:00
|
|
|
w = v;
|
2025-09-11 20:42:04 +02:00
|
|
|
measure(inputtype, "meanqs", [&] { do_meanqs(&w[0], w.size()); });
|
2025-05-06 03:06:37 +02:00
|
|
|
assert(w == expected);
|
|
|
|
|
w = v;
|
2025-09-11 20:42:04 +02:00
|
|
|
measure(inputtype, "zsr3", [&] { do_zsr3(&w[0], w.size()); });
|
2025-05-06 03:06:37 +02:00
|
|
|
assert(w == expected);
|
|
|
|
|
w = v;
|
2025-09-11 20:42:04 +02:00
|
|
|
measure(inputtype, "zsr3_sp", [&] { do_zsr3_sp(&w[0], w.size()); });
|
2025-05-06 03:06:37 +02:00
|
|
|
assert(w == expected);
|
|
|
|
|
w = v;
|
2025-09-11 20:42:04 +02:00
|
|
|
measure(inputtype, "zsr3_sp2", [&] { do_zsr3_sp2(&w[0], w.size()); });
|
2025-05-06 03:06:37 +02:00
|
|
|
assert(w == expected);
|
2025-09-29 02:24:50 +02:00
|
|
|
*/
|
2025-05-06 03:06:37 +02:00
|
|
|
|
2025-09-29 02:24:50 +02:00
|
|
|
/*
|
|
|
|
|
// TODO: This is buggy! See valgrind!
|
2025-05-06 03:06:37 +02:00
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "neoqs", [&] { do_neoqs(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
2025-09-29 02:24:50 +02:00
|
|
|
*/
|
2025-05-06 03:06:37 +02:00
|
|
|
|
2025-05-09 01:10:12 +02:00
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "schwab", [&] { do_schwab(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
|
2025-09-11 20:42:04 +02:00
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "thier2", [&] { do_thier2(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
|
2025-09-29 18:18:37 +02:00
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "thier3", [&] { do_thier3(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
|
2025-10-01 16:49:00 +02:00
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "rthier", [&] { do_rthier(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
|
2025-09-29 02:24:50 +02:00
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "threep", [&] { do_threepass(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
|
2024-04-11 06:13:51 +02:00
|
|
|
/*
|
2023-06-30 22:06:24 +02:00
|
|
|
w = v;
|
2023-06-30 18:00:44 +02:00
|
|
|
measure(inputtype, "magbuck", [&] { magyar_bucket_sort(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
2023-06-30 22:06:24 +02:00
|
|
|
w = v;
|
2023-04-09 17:20:58 +02:00
|
|
|
measure(inputtype, "magbuck2", [&] { magyar_bucket_sort2(&w[0], w.size()); });
|
2022-12-10 11:04:53 +01:00
|
|
|
assert(w == expected);
|
2023-07-01 04:50:32 +02:00
|
|
|
w = v;
|
2023-07-02 15:56:21 +02:00
|
|
|
w = {10, 20, 20};
|
2023-06-30 18:00:44 +02:00
|
|
|
measure(inputtype, "qsmine", [&] { thier_quicksort(&w[0], w.size()); });
|
2023-07-02 13:33:27 +02:00
|
|
|
if(w != expected) {
|
|
|
|
|
for(uint32_t i = 0; i < n; ++i) {
|
|
|
|
|
// assert(w[i] == expected[i]);
|
|
|
|
|
if(w[i] != expected[i]) {
|
|
|
|
|
fprintf(stderr, "Difference at %d: %d != %d\n", i, w[i], expected[i]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2023-06-30 18:00:44 +02:00
|
|
|
assert(w == expected);
|
2023-06-30 22:06:24 +02:00
|
|
|
w = v;
|
2023-06-30 18:00:44 +02:00
|
|
|
measure(inputtype, "thier", [&] { thiersort_uintkey8(&w[0], w.size()); });
|
|
|
|
|
if(w != expected) {
|
|
|
|
|
for(uint32_t i = 0; i < n; ++i) {
|
2023-07-01 06:48:38 +02:00
|
|
|
// assert(w[i] == expected[i]);
|
|
|
|
|
if(w[i] != expected[i]) {
|
2023-07-02 13:33:27 +02:00
|
|
|
fprintf(stderr, "Difference at %d: %d != %d\n", i, w[i], expected[i]);
|
2023-07-01 06:48:38 +02:00
|
|
|
}
|
2023-06-30 18:00:44 +02:00
|
|
|
}
|
|
|
|
|
}
|
2023-06-30 16:39:33 +02:00
|
|
|
assert(w == expected);
|
2024-04-11 06:13:51 +02:00
|
|
|
*/
|
2025-10-01 16:49:00 +02:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "frewr", [&] { frewr(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
2025-10-01 16:49:00 +02:00
|
|
|
|
|
|
|
|
/*
|
2022-12-10 11:04:53 +01:00
|
|
|
w = v;
|
|
|
|
|
measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); });
|
|
|
|
|
assert(w == expected);
|
|
|
|
|
*/
|
2021-12-15 12:52:33 +01:00
|
|
|
|
2022-12-10 11:04:53 +01:00
|
|
|
for (auto r : results) printf("%9.3fs", r.second);
|
2021-12-13 02:30:12 +01:00
|
|
|
puts("");
|
|
|
|
|
}
|
2022-12-10 11:04:53 +01:00
|
|
|
puts("");
|
|
|
|
|
printf("%10s", "worst");
|
|
|
|
|
for (auto w : worst) printf("%9.3fs", w.second);
|
|
|
|
|
puts("");
|
|
|
|
|
printf("%10s", "");
|
|
|
|
|
for (auto w : worst) printf("%10s", w.first.c_str());
|
|
|
|
|
puts("");
|
|
|
|
|
return 0;
|
|
|
|
|
}
|