magyarsort/ypsu.cpp

#include <utility> // std::pair
#include <algorithm>
#include <cassert>
#include <chrono>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <functional>
#include <map>
#include <set>
#include <string>
#include <vector>
#include <numeric>
#include <sys/mman.h> // mlock & munlock
#include "ska_sort.hpp"
#include "gptsort.h"
#include "thiersort.h"
#include "qsort/qsort.h"
#include "qsort/zssort.h"
#include "qsort/schwab_sort.h"
#include "qsort/chatgpt_qs.h"

// #define MAGYAR_SORT_DEFAULT_REUSE
#include "magyarsort.h"

#include "space_partitioning_sort/spsort.h"

std::map<std::string, double> results;
std::map<std::string, double> worst;
void measure(const std::string &inputtype, const std::string &name,
             std::function<void()> f) {
  auto begin = std::chrono::high_resolution_clock::now();
  f();
  auto dur = std::chrono::high_resolution_clock::now() - begin;
  double seconds = dur / std::chrono::milliseconds(1) / 1000.0;
  results[name] = seconds;
  worst[name] = std::max(worst[name], seconds);
}
std::vector<std::string> inputtypes = {
	"constant",
	"asc",
	"desc",
	"ascasc",
	"ascdesc",
	"descasc",
	"descdesc",
	"smallrange",
	"rand",
};
std::vector<uint32_t> geninput(const std::string &type, int n) {
  std::vector<uint32_t> v(n);
  if (type == "constant") {
    int c = rand();
    for (int i = 0; i < n; i++) {
      v[i] = c;
    }
  } else if (type == "asc") {
    for (int i = 0; i < n; i++) {
      v[i] = i;
    }
  } else if (type == "desc") {
    for (int i = 0; i < n; i++) {
      v[i] = n - i;
    }
  } else if (type == "ascasc") {
    for (int i = 0; i < n / 2; i++) {
      v[i] = i;
      v[i + n / 2] = i;
    }
  } else if (type == "ascdesc") {
    for (int i = 0; i < n / 2; i++) {
      v[i] = i;
      v[i + n / 2] = n - i;
    }
  } else if (type == "descasc") {
    for (int i = 0; i < n / 2; i++) {
      v[i] = n - i;
      v[i + n / 2] = i;
    }
  } else if (type == "descdesc") {
    for (int i = 0; i < n / 2; i++) {
      v[i] = n - i;
      v[i + n / 2] = n - i;
    }
  } else if (type == "smallrange") {
    int c = rand() / 2;
    for (int i = 0; i < n; i++) {
      v[i] = c + rand() % 100;
    }
  } else if (type == "rand") {
    for (int i = 0; i < n; i++) {
      v[i] = rand();
    }
  }
  return v;
}

void twopass(uint32_t *a, int n) {
  assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
  // alloc helper buffers.
  int sz = n * sizeof(a[0]);
  std::vector<int> bucketdata(1 << 16);
  uint32_t *buf = (uint32_t *)malloc(sz);
  assert(buf != NULL);
  // pass 1: sort by lower 16 bits.
  for (int i = 0; i < n; i++) bucketdata[a[i] & 0xffff]++;
  int offset = 0;
  for (int i = 0; i < 1 << 16; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xffff]++] = a[i];
  // pass 2: sort by upper 16 bits.
  memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
  for (int i = 0; i < n; i++) bucketdata[buf[i] >> 16]++;
  offset = 0;
  for (int i = 0; i < 1 << 16; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 16]++] = buf[i];
  free(buf);
}

/** "Standardly" written inplace recursive quicksort */
static inline void do_qsort(uint32_t *a, int n) noexcept {
	assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
	quicksort(a, 0, n - 1);
}

static inline void do_qsr3(uint32_t *a, int n) noexcept {
	assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
	rpivotstate state;
	quicksort_rand3(a, 0, n - 1, &state);
}


/** Quicksort with fast random pivoting */
static inline void do_qsr(uint32_t *a, int n) noexcept {
	assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
	rpivotstate state;
	quicksort_rand(a, 0, n - 1, &state);
}

/** Zsolti's quicksort version with at most O(log(n)) memuse because loop instead half of the recursions */
static inline void do_zsssort(uint32_t *a, int n) noexcept {
	assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
	zssort(a, 0, n - 1);
}

/** Fastrandomized zss */
static inline void do_zsr(uint32_t *a, int n) noexcept {
	assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
	rpivotstate state;
	zssort_rand(a, 0, n - 1, &state);
}

/** Fastrandomized zss3 */
static inline void do_zsr3(uint32_t *a, int n) noexcept {
	assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
	rpivotstate state;
	zssort_rand3(a, 0, n - 1, &state);
}

/** Fastrandomized zss3 single-pass threewayed */
static inline void do_zsr3_sp(uint32_t *a, int n) noexcept {
	assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
	rpivotstate state;
	zssort_rand3_sp(a, 0, n - 1, &state);
}

/** Fastrandomized zss3 single-pass threewayed */
static inline void do_zsr3_sp2(uint32_t *a, int n) noexcept {
	assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
	rpivotstate state;
	zssort_rand3_sp2(a, 0, n - 1, &state);
}

/** Fastrandomized zss with const input check */
static inline void do_zsrc(uint32_t *a, int n) noexcept {
	assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
	rpivotstate state;
	zsrc(a, 0, n - 1, &state);
}

/** meanqs */
static inline void do_meanqs(uint32_t *a, int n) noexcept {
	assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
	rpivotstate state;
	meanqs(a, 0, n - 1, &state);
}

/** neoqs */
static inline void do_neoqs(uint32_t *a, int n) noexcept {
	assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
	rpivotstate state;
	neoqs(a, 0, n - 1, &state);
}

/** schwab */
static inline void do_schwab(uint32_t *a, int n) noexcept {
	assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
	uint32_t junk;
	sch_rand_state state = schwab_rand_state(junk);
	schwab_sort(a, 0, n - 1, &state);
}

// mormord — Today at 2:27 AM
// 1 2 2 2 3
// 
//  0 1 2 3 4
// |1|2 2 3 2 
//  1|2|2 3 2 
//  1|3|2 2 2 
//  1|2|2 2 3
//  1|2|2 2 3
//  1 2|2|2 3
//      ^
//     Pivot
// 
// állítás: pivottól balra helyükön vannak az elemek rendezettségük szerint
// 
// Kezdés Indexek = Prefix összeg - 1 (utolsó helyek az elemeknek)
// 
// Ha pivot új helyének meghatározot index > pivot_index (|.| helye)
//     swap
//     --index
// különben
//     ++pivot_index
template<int j>
static inline bool morbittop(uint32_t elem) noexcept {
	return (elem >> (8 * j)) & 0x80; // Only top bit
}
template<int j>
static inline uint32_t morgrab(uint32_t elem) noexcept {
	return (elem >> (8 * j)) & 0x7f; // Only 7-bit!
}
/**
 * Count occurences AND divides array into two partitions by its topmost bit.
 * - Similar to quicksort partitioning, but changed accordingly.
 * - MSB 0 bit values will come first partition.
 *
 * @param a The array to partition and occurence count.
 * @param n The length of the array.
 * @param radics1 A 128-sized array for occurence counting the bottom partition.
 * @param radics2 A 128-sized array for occurence counting the top partition.
 * @param DIGIT The digit in question (for a morgrab<DIGIT>(..) call)
 * @returns The partition bounds are: [0..first) and [second..n) with logical means to mark empty partitions.
 */
template<int DIGIT>
static inline std::pair<uint32_t, uint32_t> oc_bit_partition(
		uint32_t *a, uint32_t n, uint32_t *radics1, uint32_t *radics2) noexcept {
	// See Hoare's OG quicksort why
	int64_t i = 0;
	int64_t j = n - 1;

	while(true) {
		// Move past well-placed ones
		// And occurence count them
		// Rem.: In quicksort usually a do-while loop
		while ((i < j) && !morbittop<DIGIT>(a[i])) {
			++radics1[morgrab<DIGIT>(a[i])];
			++i;
		}
		while ((i < j) && morbittop<DIGIT>(a[j])) {
			++radics2[morgrab<DIGIT>(a[j])];
			--j;
		}

		// If the indices crossed, return
		// Rem.: Not >= to ensure occ. counts! See also: (*)
		if(i > j) return std::make_pair(i, j + 1);

		// Check for swap
		if(i < j) {
			// Swap
			// No need occurence count here as above loops will handle!
			uint32_t tmp = a[i];
			a[i] = a[j];
			a[j] = tmp;
		} else {
			// i == j case: count occurence properly for the one.
			if(!morbittop<DIGIT>(a[j])) {
				++radics1[morgrab<DIGIT>(a[i])];
				++i;
			} else {
				++radics2[morgrab<DIGIT>(a[j])];
				--j;
			}

		}
	}
}
template<int j>
static inline void mormord_sort_impl(uint32_t *a, int n) noexcept {
	/* Preparation */
	uint32_t radics1[128] = {0};
	uint32_t radics2[128] = {0};
	/* [from, to) index: only where prefix sums change - usually nonfull */
	uint32_t real_radics1[128 * 2] = {0};
	uint32_t real_radics2[128 * 2] = {0};

	// Count occurences and partition by topmost bit
	std::pair<uint32_t, uint32_t> boundz = oc_bit_partition<j>(a, n, radics1, radics2);

	/* Prefix sum + real radics calc O(256) */
	/* Radics: */
	/* fr: {10, 20, 10, 0, 5, 15,...} */
	/* to: {10, 30, 40, 40, 45, 60,..} */
	/* Real radics: */
	/* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */
	/*         0.        1.        2.        4.        5.    */
	/*       (because radix value 3 is not found in input)   */
	uint32_t prev1 = 0;
	uint32_t reali1 = 0;
	uint32_t prev2 = 0;
	uint32_t reali2 = 0;
	#pragma GCC unroll 16
	for(int i = 0; i < 128; ++i) {
		// Hopefully we get more ILP out of this
		// Also I tried branchless before adding
		// ILP here and it slowed things, so first
		// let us try it with branch prediction!
		if(radics1[i] != 0) {
			radics1[i] += prev1;
			real_radics1[reali1] = prev1;
			real_radics1[reali1 + 1] = radics1[i];
			prev1 = radics1[i];
			reali1 += 2;
		} else {
			radics1[i] += prev1;
			prev1 = radics1[i];
		}
		if(radics2[i] != 0) {
			radics2[i] += prev2;
			real_radics2[reali2] = prev2;
			real_radics2[reali2 + 1] = radics2[i];
			prev2 = radics2[i];
			reali2 += 2;
		} else {
			radics2[i] += prev2;
			prev2 = radics2[i];
		}
	}

	// Inplace swap, with added ILP / branchless opt.
	// Without it its data dependent like crazy...
	uint32_t pivoti1 = 0;
	uint32_t pivoti2 = boundz.second;
	while((pivoti1 < boundz.first) && (pivoti2 < n)) {

		/* Pivot 1 */

		uint32_t radixval1 = morgrab<j>(a[pivoti1]);
		uint32_t targeti1 = --radics1[radixval1]; // dec index (!)

		// Bitmask: true -> 11.....1; false -> 00.....0
		uint32_t mask1 = ~((targeti1 > pivoti1) - 1);

		// Branchless swap (using bitmask)
		uint32_t delta1 = (a[pivoti1] ^ a[targeti1]) & mask1;
		a[pivoti1] = a[pivoti1] ^ delta1;
		a[targeti1] = a[targeti1] ^ delta1;

		// "else" branch
		pivoti1 += !mask1;
		radics1[radixval1] += !mask1; // undec index (!)

		/* Pivot 2 */

		uint32_t radixval2 = morgrab<j>(a[pivoti2]);
		uint32_t targeti2 = boundz.second + (--radics2[radixval2]); // dec index (!)

		// Bitmask: true -> 11.....1; false -> 00.....0
		uint32_t mask2 = ~((targeti2 > pivoti2) - 1);

		// Branchless swap (using bitmask)
		uint32_t delta2 = (a[pivoti2] ^ a[targeti2]) & mask2;
		a[pivoti2] = a[pivoti2] ^ delta2;
		a[targeti2] = a[targeti2] ^ delta2;

		// "else" branch
		pivoti2 += !mask2;
		radics2[radixval2] += !mask2; // undec index (!)
	}
	// Finish pivot1 if there are still elements..
	while(pivoti1 < boundz.first) {

		/* Pivot 1+ */

		uint32_t radixval1 = morgrab<j>(a[pivoti1]);
		uint32_t targeti1 = --radics1[radixval1]; // dec index (!)

		// Bitmask: true -> 11.....1; false -> 00.....0
		uint32_t mask1 = ~((targeti1 > pivoti1) - 1);

		// Branchless swap (using bitmask)
		uint32_t delta1 = (a[pivoti1] ^ a[targeti1]) & mask1;
		a[pivoti1] = a[pivoti1] ^ delta1;
		a[targeti1] = a[targeti1] ^ delta1;

		// "else" branch
		pivoti1 += !mask1;
		radics1[radixval1] += !mask1; // undec index (!)
	}
	// Finish pivot2 if there are still elements..
	while(pivoti2 < n) {

		/* Pivot 2+ */

		uint32_t radixval2 = morgrab<j>(a[pivoti2]);
		uint32_t targeti2 = boundz.second + (--radics2[radixval2]); // dec index (!)

		// Bitmask: true -> 11.....1; false -> 00.....0
		uint32_t mask2 = ~((targeti2 > pivoti2) - 1);

		// Branchless swap (using bitmask)
		uint32_t delta2 = (a[pivoti2] ^ a[targeti2]) & mask2;
		a[pivoti2] = a[pivoti2] ^ delta2;
		a[targeti2] = a[targeti2] ^ delta2;

		// "else" branch
		pivoti2 += !mask2;
		radics2[radixval2] += !mask2; // undec index (!)
	}

	// Possible recursions
	if constexpr (j != 0) {
		/* Partition 1 recursions */
		for(int i = 0; i < reali1; i += 2) {
			/* inclusive */
			uint32_t from = real_radics1[i];
			/* non-inclusive */
			uint32_t to = real_radics1[i + 1];
			mormord_sort_impl<j - 1>(&a[from], (to - (from)));
		}
		/* Partition 2 recursions */
		for(int i = 0; i < reali2; i += 2) {
			/* inclusive */
			uint32_t from = real_radics2[i];
			/* non-inclusive */
			uint32_t to = real_radics2[i + 1];
			mormord_sort_impl<j - 1>(&a[from], (to - (from)));
		}
	}
}
static inline void mormord_sort(uint32_t *a, int n) noexcept {
	assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
	mormord_sort_impl<3>(a, n);
}

void fourpass(uint32_t *a, int n) {
  assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
  // alloc helper buffers.
  int sz = n * sizeof(a[0]);
  std::vector<int> bucketdata(1 << 8);
  uint32_t *buf = (uint32_t *)malloc(sz);
  assert(buf != NULL);
  uint32_t *src = a, *dst = buf;
  uintptr_t swapmask = (uintptr_t)a ^ (uintptr_t)buf;
  for (int shift = 0; shift < 32; shift += 8) {
    memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0]));
    for (int i = 0; i < n; i++) bucketdata[src[i] >> shift & 0xff]++;
    int offset = 0;
    for (int i = 0; i < 1 << 8; i++) {
      int d = bucketdata[i];
      bucketdata[i] = offset;
      offset += d;
    }
    for (int i = 0; i < n; i++) {
      dst[bucketdata[src[i] >> shift & 0xff]++] = src[i];
    }
    src = (uint32_t *)((uintptr_t)src ^ swapmask);
    dst = (uint32_t *)((uintptr_t)dst ^ swapmask);
  }
  free(buf);
}

/** Only werks für das fourpassu! */
void my_memset(int *v) {
  memset(v, 0, (1 << 8) * sizeof(int));
}

// hand-unrolled fourpass.
void fourpassu(uint32_t *a, int n) {
  assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
  // alloc helper buffers.
  int sz = n * sizeof(a[0]);

  static thread_local int bucketdata[1 << 8];
  my_memset(bucketdata);

  uint32_t *buf = (uint32_t *)malloc(sz);
  assert(buf != NULL);
  // pass 1: sort by lower 8 bits.
  #pragma GCC unroll 32
  for (int i = 0; i < n; i++) bucketdata[a[i] & 0xff]++;
  int offset = 0;
  #pragma GCC unroll 8
  for (int i = 0; i < 1 << 8; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xff]++] = a[i];
  // pass 2: sort by 2nd 8 bits.
  my_memset(bucketdata);
  #pragma GCC unroll 32
  for (int i = 0; i < n; i++) bucketdata[buf[i] >> 8 & 0xff]++;
  offset = 0;
  #pragma GCC unroll 8
  for (int i = 0; i < 1 << 8; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  #pragma GCC unroll 64
  for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 8 & 0xff]++] = buf[i];
  // pass 3: sort by 3rd 8 bits.
  my_memset(bucketdata);
  #pragma GCC unroll 32
  for (int i = 0; i < n; i++) bucketdata[a[i] >> 16 & 0xff]++;
  offset = 0;
  #pragma GCC unroll 8
  for (int i = 0; i < 1 << 8; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  #pragma GCC unroll 64
  for (int i = 0; i < n; i++) buf[bucketdata[a[i] >> 16 & 0xff]++] = a[i];
  // pass 4: sort by 4th 8 bits.
  my_memset(bucketdata);
  #pragma GCC unroll 32
  for (int i = 0; i < n; i++) bucketdata[buf[i] >> 24 & 0xff]++;
  offset = 0;
  #pragma GCC unroll 8
  for (int i = 0; i < 1 << 8; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  #pragma GCC unroll 32
  for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i];
  free(buf);
}

static inline uint32_t byterotate(uint32_t x) { return (x >> 8) | (x << 24); }
void fourrots(uint32_t *arr, int n) {
  assert(n * int64_t(sizeof(arr[0])) <= INT_MAX);
  assert(n % 4 == 0);
  // alloc helper buffers.
  int sz = n * sizeof(arr[0]);
  std::vector<int> bucketdata(1 << 8);
  int *btd = &bucketdata[0];
  uint32_t *buf = (uint32_t *)malloc(sz);
  assert(buf != NULL);
  uint32_t *src = arr, *dst = buf;
  uintptr_t swapmask = (uintptr_t)arr ^ (uintptr_t)buf;
  uint32_t a, b, c, d;
  uint32_t abt, bbt, cbt, dbt;
  for (int shift = 0; shift < 32; shift += 8) {
    memset(btd, 0, bucketdata.size() * sizeof(bucketdata[0]));
    for (int i = 0; i < n; i += 4) {
      a = src[i];
      b = src[i + 1];
      c = src[i + 2];
      d = src[i + 3];
      abt = a & 0xff;
      bbt = b & 0xff;
      cbt = c & 0xff;
      dbt = d & 0xff;
      btd[abt]++;
      btd[bbt]++;
      btd[cbt]++;
      btd[dbt]++;
    }
    int offset = 0;
    for (int i = 0; i < 1 << 8; i++) {
      int d = bucketdata[i];
      bucketdata[i] = offset;
      offset += d;
    }
    for (int i = 0; i < n; i += 4) {
      a = src[i];
      b = src[i + 1];
      c = src[i + 2];
      d = src[i + 3];
      abt = a & 0xff;
      bbt = b & 0xff;
      cbt = c & 0xff;
      dbt = d & 0xff;
      dst[btd[abt]++] = byterotate(a);
      dst[btd[bbt]++] = byterotate(b);
      dst[btd[cbt]++] = byterotate(c);
      dst[btd[dbt]++] = byterotate(d);
    }
    src = (uint32_t *)((uintptr_t)src ^ swapmask);
    dst = (uint32_t *)((uintptr_t)dst ^ swapmask);
  }
  free(buf);
}

// frewr - four rewrites.
void frewr(uint32_t *arr, int n) {
  uint32_t *tmpbuf = (uint32_t *)malloc(n * 4);
  mlock(tmpbuf, n * 4);
  int btoffsets[4][256] = {};
  #pragma GCC unroll 64
  for (int i = n - 1; i >= 0; i--) {
    uint32_t a = arr[i];
    btoffsets[3][a & 0xff]++;
    btoffsets[2][a >> 8 & 0xff]++;
    btoffsets[1][a >> 16 & 0xff]++;
    btoffsets[0][a >> 24 & 0xff]++;
  }
  int btend[4] = {n - 1, n - 1, n - 1, n - 1};
  #pragma GCC unroll 16
  for (int i = 255; i >= 0; i--) {
  #pragma GCC unroll 4
    for (int pass = 3; pass >= 0; pass--) {
      int nbtend = btend[pass] - btoffsets[pass][i];
      btoffsets[pass][i] = btend[pass];
      btend[pass] = nbtend;
    }
  }
  uint32_t *src = arr, *dst = tmpbuf;
  #pragma GCC unroll 4
  for (int pass = 3; pass >= 0; pass--) {
    int *off = btoffsets[pass];
    #pragma GCC unroll 64
    for (int i = n - 1; i >= 0; i--) {
      uint32_t v = src[i];
      dst[off[v & 0xff]--] = v >> 8 | v << 24;
      __builtin_prefetch(&dst[off[v & 0xff] - 2]);
    }
    uint32_t *tmp = src;
    src = dst;
    dst = tmp;
  }
  munlock(tmpbuf, n * 4);
  free(tmpbuf);
}

void vsort(uint32_t *a, int n) {
  thread_local std::vector<uint32_t> bts[256];
  #pragma GCC unroll 4
  for (int shift = 0; shift < 32; shift += 8) {
    #pragma GCC unroll 64
    for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]);
    #pragma GCC unroll 64
    for (int bt = 0, k = 0; bt < 256; bt++) {
      memcpy(a + k, &bts[bt][0], bts[bt].size() * sizeof(a[0]));
      k += bts[bt].size();
      bts[bt].clear();
    }
  }
}

void pagedsort(uint32_t *a, int n) {
  enum { pagesize = 1024 };
  int pagecount = (n + pagesize - 1) / pagesize + 512;
  uint32_t *pd = (uint32_t *)malloc(pagecount * pagesize * sizeof(a[0]));
  std::vector<int> freelist(pagecount);
  std::vector<int> next(pagecount);
  std::iota(std::begin(freelist), std::end(freelist), 0);
  struct bucket {
    int len;
    int headpage, lastpage;
  };
  bucket bts[512];
  // initial scatter.
  for (int bt = 0; bt < 256; bt++) {
    int p = freelist.back();
    freelist.pop_back();
    bts[bt] = {0, p, p};
  }
  for (int i = 0; i < n; i++) {
    bucket *bt = &bts[a[i] & 0xff];
    pd[bt->lastpage * pagesize + bt->len++ % pagesize] = a[i];
    if (bt->len % pagesize == 0) {
      int p = freelist.back();
      freelist.pop_back();
      next[bt->lastpage] = p;
      bt->lastpage = p;
    }
  }
  // intermediate level scatters.
  int ibase = 0, obase = 256;
  for (int shift = 8; shift < 32; shift += 8) {
    for (int bt = 0; bt < 256; bt++) {
      int p = freelist.back();
      freelist.pop_back();
      bts[obase + bt] = {0, p, p};
    }
    for (int ibti = 0; ibti < 256; ibti++) {
      struct bucket *ibt = &bts[ibase + ibti];
      int page = ibt->headpage;
      for (int i = 0; i < ibt->len; i++) {
        uint32_t v = pd[page * pagesize + i % pagesize];
        struct bucket *obt = &bts[obase + (v >> shift & 0xff)];
        pd[obt->lastpage * pagesize + obt->len++ % pagesize] = v;
        if (obt->len % pagesize == 0) {
          int p = freelist.back();
          freelist.pop_back();
          next[obt->lastpage] = p;
          obt->lastpage = p;
        }
        if (i % pagesize == pagesize - 1) {
          freelist.push_back(page);
          page = next[page];
        }
      }
      freelist.push_back(ibt->lastpage);
    }
    ibase = 256 - ibase;
    obase = 256 - obase;
  }
  // the final gather.
  int k = 0;
  for (int ibti = 0; ibti < 256; ibti++) {
    struct bucket *ibt = &bts[ibase + ibti];
    int page = ibt->headpage;
    for (int i = 0; i < ibt->len; i++) {
      a[k++] = pd[page * pagesize + i % pagesize];
      if (i % pagesize == pagesize - 1) {
        page = next[page];
      }
    }
  }
  free(pd);
}

// I measured this being faster than std::sort which is giga-lol wtf...
void thier_quicksort(uint32_t *arr, int n) {
	// Prepare: O(n)
	tselem *tarr = thiersort_prepare_array(
		arr,
		// union tskey (askey)(void *elem),
		[] (void *elem) {
			tskey k;
			k.u = *((uint32_t *)elem);
			return k;
		},
		4, // elemsize,
		n, // length,
		malloc);

	/*
	for(uint32_t i = 0; i < n; ++i) {
		printf("In: %d @%d\n", tarr[i].key.u, tarr[i].i);
	}
	*/

	// Quicksort by me
	ts_quicksort_inplace(
		tarr,
		0, // from
		n, // to
		ts_lt_uint,
		nullptr);

	/*
	for(uint32_t i = 0; i < n; ++i) {
		printf("Out: %d @%d\n", tarr[i].key.u, tarr[i].i);
	}
	*/

	// Apply: O(n)
	uint32_t tmp[1]; // needed for elem swaps
	thiersort_apply(
		tarr,
		arr,
		n,
		4, // elemsize
		tmp,
		free);
}

void thiersort_uintkey8(uint32_t *arr, int n) {
	// Prepare: O(n)
	tselem *tarr = thiersort_prepare_array(
		arr,
		// union tskey (askey)(void *elem),
		[] (void *elem) {
			tskey k;
			k.u = *((uint32_t *)elem);
			return k;
		},
		4, // elemsize,
		n, // length,
		malloc);

	/*
	for(uint32_t i = 0; i < n; ++i) {
		printf("In: %d @%d\n", tarr[i].key.u, tarr[i].i);
	}
	*/

	// Sort: O(n*loglogn on amortized on random input):
	thiersort8_uintkey(
		tarr,
		n,
		malloc,
		free);

	/*
	for(uint32_t i = 0; i < n; ++i) {
		printf("Out: %d @%d\n", tarr[i].key.u, tarr[i].i);
	}
	*/

	// Apply: O(n)
	uint32_t tmp[1]; // needed for elem swaps
	thiersort_apply(
		tarr,
		arr,
		n,
		4, // elemsize
		tmp,
		free);
}

// to measure / profile a single variant
void measure_single(int n) {
  for (auto inputtype : inputtypes) {
    printf("%10s", inputtype.c_str());
    fflush(stdout);
    std::vector<uint32_t> v(n);
    v = geninput(inputtype, n);
    //measure(inputtype, "sp", [&] { spsort(&v[0], v.size()); });
    measure(inputtype, "magyar", [&] { MagyarSort::sort<uint32_t>(&v[0], v.size()); });

    for (auto r : results) printf("%9.3fs", r.second);
    puts("");
  }
  puts("");
  printf("%10s", "worst");
  for (auto w : worst) printf("%9.3fs", w.second);
  puts("");
  printf("%10s", "");
  for (auto w : worst) printf("%10s", w.first.c_str());
  puts("");
}

int main(void) {
  //int n = 100000000;
  //int n = 10000000;
  int n = 1000000;
  //int n = 100000;
  //int n = 20000;
  //int n = 1000;
  //int n = 200;
  //int n = 170;
  //int n = 100;
  //int n = 180;
  //int n = 10;

  printf("Sorting %d elements:\n\n", n);

  // Uncomment this for profiling and alg!
  //measure_single(n);
  //return 0;

  for (auto inputtype : inputtypes) {
    printf("%10s", inputtype.c_str());
    // fflush(stdout);
    std::vector<uint32_t> v(n), w(n), expected(n);
    v = geninput(inputtype, n);
    measure(inputtype, "copy", [&] { w = v; });
    w = v;
    measure(inputtype, "std", [&] { std::sort(std::begin(w), std::end(w)); });
    expected = w;

	/*
    w = v;
    measure(inputtype, "ska", [&] { ska_sort(std::begin(w), std::end(w)); });
	*/
    w = v;
    measure(inputtype, "ska_copy", [&] {
      std::vector<uint32_t> buf(w.size());
      if (ska_sort_copy(std::begin(w), std::end(w), std::begin(buf))) {
        w.swap(buf);
      }
    });
	/*
    w = v;
    measure(inputtype, "magyar", [&] { MagyarSort::sort<uint32_t>(&w[0], w.size()); });
    assert(w == expected);
	*/

    /*
    w = v;
    measure(inputtype, "mormord", [&] { mormord_sort(&w[0], w.size()); });
    assert(w == expected);

    w = v;
    measure(inputtype, "2pass", [&] { twopass(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "4pass", [&] { fourpass(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "psort", [&] { pagedsort(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "4pasu", [&] { fourpassu(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "4rot", [&] { fourrots(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "sp", [&] { spsort(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "gpt_qsort", [&] { gpt_quicksort(w); });
    assert(w == expected);
    w = v;
    measure(inputtype, "qsort", [&] { do_qsort(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "zssort", [&] { do_zsssort(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "qsr", [&] { do_qsr(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "zsr", [&] { do_zsr(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "qsr3", [&] { do_qsr3(&w[0], w.size()); });
    assert(w == expected);
    */
    w = v;
    measure(inputtype, "zsr3", [&] { do_zsr3(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "zsr3_sp", [&] { do_zsr3_sp(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "zsr3_sp2", [&] { do_zsr3_sp2(&w[0], w.size()); });
    assert(w == expected);
	/*
    w = v;
    measure(inputtype, "zsrc", [&] { do_zsrc(&w[0], w.size()); });
    assert(w == expected);
	*/
    w = v;
    measure(inputtype, "meanqs", [&] { do_meanqs(&w[0], w.size()); });
    assert(w == expected);

    w = v;
    measure(inputtype, "neoqs", [&] { do_neoqs(&w[0], w.size()); });
    assert(w == expected);

    w = v;
    measure(inputtype, "schwab", [&] { do_schwab(&w[0], w.size()); });
    assert(w == expected);

    /*
    w = v;
    measure(inputtype, "magbuck", [&] { magyar_bucket_sort(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "magbuck2", [&] { magyar_bucket_sort2(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    w = {10, 20, 20};
    measure(inputtype, "qsmine", [&] { thier_quicksort(&w[0], w.size()); });
    if(w != expected) {
	    for(uint32_t i = 0; i < n; ++i) {
		    // assert(w[i] == expected[i]);
		    if(w[i] != expected[i]) {
			    fprintf(stderr, "Difference at %d: %d != %d\n", i, w[i], expected[i]);
		    }
	    }
    }
    assert(w == expected);
    w = v;
    measure(inputtype, "thier", [&] { thiersort_uintkey8(&w[0], w.size()); });
    if(w != expected) {
	    for(uint32_t i = 0; i < n; ++i) {
		    // assert(w[i] == expected[i]);
		    if(w[i] != expected[i]) {
			    fprintf(stderr, "Difference at %d: %d != %d\n", i, w[i], expected[i]);
		    }
	    }
    }
    assert(w == expected);
    */
    /*
    w = v;
    measure(inputtype, "frewr", [&] { frewr(&w[0], w.size()); });
    assert(w == expected);
    w = v;
    measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); });
    assert(w == expected);
    */

    for (auto r : results) printf("%9.3fs", r.second);
    puts("");
  }
  puts("");
  printf("%10s", "worst");
  for (auto w : worst) printf("%9.3fs", w.second);
  puts("");
  printf("%10s", "");
  for (auto w : worst) printf("%10s", w.first.c_str());
  puts("");
  return 0;
}