magyarsort/magyarsort.h
2021-12-13 03:48:17 +01:00

206 lines
6.6 KiB
C++

#ifndef MAGYAR_SORT_H
#define MAGYAR_SORT_H
/**
* single header lib: In-place, fast heavily modified and optimized radix sort.
*
* Only unsigned ints for now, but should be able to modify for int and float...
* This is the counting variant with smart changes (not per-bit).
*
* LICENCE: CC3 - look it up, you need to mention me but that is all
*/
#include <cstdio>
#include <cstdint>
#include <cstring> // memset
// TODO: Only for the regular radix I guess
#include <vector>
#include <algorithm> // std::swap
namespace MagyarSort {
/* CONFIG */
// Only change these if you know what you are doing
// I use these because I want to see if nibbles are
// better or something...
//
// Bytes of nibbles only:
// - DIGIT_RANGE and BITS_PER_DIGIT should correspond
// - DIGITS should also correspond with the uint32_t
// - and DIGIT_RANGE should be 2^n value (16 or 256)
#ifdef MAGYAR_SORT_NIBBLE
// Per-nibble digits sorting
static constexpr int DIGITS = 8; // "helyiérték"
static constexpr int BITS_PER_DIGIT = 4; // "bit / helyiérték"
static constexpr int DIGIT_RANGE = 16; // "helyiérték állapottér"
#else
// Per-byte digits sorting
static constexpr int DIGITS = 4; // "helyiérték"
static constexpr int BITS_PER_DIGIT = 8; // "bit / helyiérték"
static constexpr int DIGIT_RANGE = 256; // "helyiérték állapottér"
#endif
/* DEBUG */
void debugArr(uint32_t *arr, size_t size) {
for(int i = 0; i < size; ++i) {
printf("%x, ", arr[i]);
}
printf("\n");
}
void debugRadics(size_t *radics) {
for(size_t j = 0; j < DIGITS; ++j) {
printf("d%d: ", j);
for(size_t i = 0; i < DIGIT_RANGE; ++i) {
printf("%d,", radics[i + DIGIT_RANGE*j]);
}
printf("\n\n");
}
}
/* HELPERS */
template<int DIGIT_CHOICE>
static inline __attribute__((always_inline)) uint32_t getDigit(uint32_t num) noexcept {
static constexpr int SHIFT = DIGIT_CHOICE * BITS_PER_DIGIT;
uint32_t shifted = num >> SHIFT;
return shifted & (DIGIT_RANGE - 1);
}
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
template<int DIGIT>
struct OccurenceMagic : public OccurenceMagic<DIGIT - 1> {
inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], size_t i, size_t *radicsOut) noexcept
: OccurenceMagic<DIGIT -1 >(arr, i, radicsOut) {
// Parents run first so template recursion runs DIGIT=0 first...
++radicsOut[getDigit<DIGIT>(arr[i]) + DIGIT_RANGE * DIGIT];
}
};
/** Ends template recursion */
template<>
struct OccurenceMagic<-1> {
inline OccurenceMagic(uint32_t arr[], size_t i, size_t *radicsOut) noexcept {}
};
static inline void countOccurences(uint32_t arr[], size_t size, size_t *radicsOut) noexcept {
#pragma GCC unroll 64
for(size_t i = 0; i < size; ++i) {
// Creates no object, struct is empty
OccurenceMagic<DIGITS - 1>(arr, i, radicsOut);
}
}
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
template<int DIGIT>
struct PrefixMagic : public PrefixMagic<DIGIT - 1> {
inline __attribute__((always_inline)) PrefixMagic(size_t *radics, size_t *prev, int i) noexcept
: PrefixMagic<DIGIT - 1>(radics, prev, i) {
static constexpr int DSTART = (DIGIT * DIGIT_RANGE);
radics[DSTART + i] += prev[DIGIT];
prev[DIGIT] = radics[DSTART + i];
}
};
/** Ends template recursion */
template<>
struct PrefixMagic<-1> {
inline PrefixMagic(size_t *radics, size_t *prev, int i) noexcept {}
};
/** Gets REFERENCE to the given digit from the radix-array that has more than one digits */
template<int DIGIT>
static inline __attribute__((always_inline)) size_t &rGet(size_t *radics, size_t i) noexcept {
static constexpr int DSTART = (DIGIT * DIGIT_RANGE);
return radics[DSTART + i];
}
static inline void calcPrefixSums(size_t *radics) noexcept {
static thread_local size_t prev[DIGITS];
memset(prev, 0, sizeof(prev));
for(int i = 0; i < DIGIT_RANGE; ++i) {
// This is a template-unrolled loop too
PrefixMagic<DIGITS - 1>(radics, prev, i);
}
}
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
template<int DIGIT>
struct RadixMagic : public RadixMagic<DIGIT - 1> {
inline __attribute__((always_inline)) RadixMagic(size_t *radics, uint32_t *&from, uint32_t *&to, size_t size) noexcept // BEWARE: "*&" needed to swap pointers..
: RadixMagic<DIGIT - 1>(radics, from, to, size) {
// DEBUG
//printf("%d before: ", DIGIT);
//debugArr(from, size);
#pragma GCC unroll 64
for(size_t i = size; i > 0; --i) { // right-to-left to ensure already sorted digits order we keep for iterations
// Get num and its new offset / location
auto num = from[i - 1];
auto digVal = getDigit<DIGIT>(num);
auto offset = (--rGet<DIGIT>(radics, digVal));
// Add to the proper target location
to[offset] = num;
}
// DEBUG
//printf("%d after: ", DIGIT);
//debugArr(to, size);
// Only swaps pointers :-)
std::swap(from, to);
}
};
/** Ends template recursion */
template<>
struct RadixMagic<-1> {
inline RadixMagic(size_t *radics, uint32_t *&from, uint32_t *&to, size_t size) noexcept { }
};
/* SORT */
/** Sort the given array (in-place sorting) with the given size */
inline void sort(uint32_t arr[], size_t size) noexcept {
// Holds "digit" occurences, prefix sums, whatevers
// First "DIGIT_RANGE" elem is for MSB "DIGITS", last is for LSB
static thread_local size_t radics[DIGITS * DIGIT_RANGE];
memset(radics, 0, sizeof(radics));
// Calculate occurences of digits
countOccurences(arr, size, radics);
//debugRadics(radics);
// Calculate prefix sums
calcPrefixSums(radics);
//debugRadics(radics);
/* Regular (old) radix sort with small twist */
// Regular radix sort - I just changed occurence couting and prefix summing to have more ILP
// But because my approach does not use that, I want to keep this version in a branch for a
// regular radix sort using better ILP just to see how it is doing if I wrote those "Magic"
// above already anyways...
// Regular radix sort needs a copy, see: https://www.youtube.com/watch?v=ujb2CIWE8zY
std::vector<uint32_t> arc(size);
uint32_t *from = arr;
uint32_t *to = &arc[0];
RadixMagic<DIGITS - 1>(radics, from, to, size);
// With an other API we could spare this copy if we can delete original arr and return ptr or something...
// I am fine with this... this is not my main idea anyways, just little ILP tweak to regular radix sort
//if(to != arr) { // <- logically, but bad they are already swapped here!!! BEWARE
if(from != arr) { // <- in reality this is what we want because of last swap happened anyways!
memcpy(arr, from, size);
}
}
};
#endif