magyarsort/magyarsort.h
2022-09-01 02:14:30 +02:00

501 lines
20 KiB
C++

#ifndef MAGYAR_SORT_H
#define MAGYAR_SORT_H
/**
* single header lib: In-place, fast heavily modified and optimized radix sort.
*
* Only unsigned ints for now, but should be able to modify for int and float...
* This is the counting variant with smart changes (not per-bit).
*
* LICENCE: CC3 - look it up, you need to mention me but that is all
*/
/*
* Does not help much:
// #pragma GCC target ("avx2")
// #pragma GCC optimization ("unroll-loops")
*/
#include <cstdio>
#include <cstdint>
#include <cstring> // memset
// TODO: Only for the regular radix I guess
#include <vector>
#include <algorithm> // std::swap
#ifndef NO_MLOCK
#include <sys/mman.h> // mlock & munlock
#endif // !NO_MLOCK
namespace MagyarSort {
/* CONFIG */
// Only change these if you know what you are doing
// I use these because I want to see if nibbles are
// better or something...
//
// Bytes of nibbles only:
// - DIGIT_RANGE and BITS_PER_DIGIT should correspond
// - DIGITS should also correspond with the uint32_t
// - and DIGIT_RANGE should be 2^n value (16 or 256)
#ifdef MAGYAR_SORT_NIBBLE
// Per-nibble digits sorting
static constexpr int DIGITS = 8; // "helyiérték"
static constexpr int BITS_PER_DIGIT = 4; // "bit / helyiérték"
static constexpr int DIGIT_RANGE = 16; // "helyiérték állapottér"
#else
/*
// Per-word digits sorting
static constexpr int DIGITS = 2; // "helyiérték"
static constexpr int BITS_PER_DIGIT = 16; // "bit / helyiérték"
static constexpr int DIGIT_RANGE = 65536; // "helyiérték állapottér"
*/
// Per-byte digits sorting
static constexpr int DIGITS = 4; // "helyiérték"
static constexpr int BITS_PER_DIGIT = 8; // "bit / helyiérték"
static constexpr int DIGIT_RANGE = 256; // "helyiérték állapottér"
#endif
/* DEBUG */
/**
 * Debug helper: prints the array elements as hex, comma separated, then a newline.
 *
 * FIX: marked `inline` — a non-template, non-inline function definition in a
 * header violates the ODR and causes multiple-definition linker errors as soon
 * as the header is included from more than one translation unit.
 *
 * @param arr  The array to dump (read only; not modified).
 * @param size Number of elements to print.
 */
inline void debugArr(uint32_t *arr, size_t size) {
for(size_t i = 0; i < size; ++i) {
printf("%x, ", arr[i]);
}
printf("\n");
}
/**
 * Debug helper: prints each digit position's DIGIT_RANGE-wide counter row.
 *
 * FIX: the counters were passed straight to printf with "%zu", which is
 * undefined behaviour whenever COUNTER_TYP is not exactly size_t (e.g. the
 * default uint32_t). Cast every value to size_t to match the format specifier.
 *
 * @param radics Counter table laid out as DIGITS rows of DIGIT_RANGE entries.
 */
template<typename COUNTER_TYP>
void debugRadics(COUNTER_TYP *radics) {
for(size_t j = 0; j < DIGITS; ++j) {
printf("d%zu: ", j);
for(int i = 0; i < DIGIT_RANGE; ++i) {
printf("%zu,", (size_t)radics[i + DIGIT_RANGE*j]);
}
printf("\n\n");
}
}
/* HELPERS */
/** Extracts the DIGIT_CHOICE-th digit of num (DIGIT_RANGE is a power of two, so masking works). */
template<int DIGIT_CHOICE>
static inline __attribute__((always_inline)) uint32_t getDigit(uint32_t num) noexcept {
// Shift the wanted digit down to the low bits, then mask everything else off.
return (num >> (DIGIT_CHOICE * BITS_PER_DIGIT)) & (DIGIT_RANGE - 1);
}
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
// Template-recursive functor: counting arr[i] into EVERY digit position's
// histogram row in one pass. Instantiating OccurenceMagic<DIGITS-1, ...>
// chains down through the base classes, so each element is counted for
// digits 0..DIGITS-1 with no runtime loop. The structs are empty, so no
// object state is generated - only the inlined increments remain.
template<int DIGIT, typename COUNTER_TYP>
struct OccurenceMagic : public OccurenceMagic<DIGIT - 1, COUNTER_TYP> {
// arr: input array; i: element index; radicsOut: DIGITS x DIGIT_RANGE counter table.
inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *radicsOut) noexcept
: OccurenceMagic<DIGIT - 1 ,COUNTER_TYP>(arr, i, radicsOut) {
// Parents run first so template recursion runs DIGIT=0 first...
// Row DIGIT of the table holds the histogram for digit position DIGIT.
++radicsOut[getDigit<DIGIT>(arr[i]) + DIGIT_RANGE * DIGIT];
}
};
/** Ends template recursion */
/** Ends template recursion: the DIGIT == -1 specialization does nothing. */
template<typename COUNTER_TYP>
struct OccurenceMagic<-1, COUNTER_TYP> {
inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *radicsOut) noexcept {}
};
/** ARR_END must be an (STEP * k) */
// Template-recursive functor: issues __builtin_prefetch for arr[0], arr[STEP],
// ... arr[ARR_END - STEP] at compile-time-unrolled positions.
// NOTE: ARR_END MUST be an exact multiple of STEP - otherwise the recursion
// never reaches the 0 specialization below and instantiation diverges.
template<int ARR_END, int STEP, typename ARR_T, int R_OR_W = 0 /* 0:R, 1:W */, int LOCALITY = 3 /* 3 is best, 0 worst*/>
struct PrefetchMagic : public PrefetchMagic<(ARR_END - STEP), STEP, ARR_T, R_OR_W, LOCALITY> {
inline __attribute__((always_inline)) PrefetchMagic(ARR_T *arr) noexcept
: PrefetchMagic<(ARR_END - STEP), STEP, ARR_T, R_OR_W, LOCALITY>(arr) {
// Hint the cache line at this offset; R_OR_W selects read vs write intent.
__builtin_prefetch(&arr[ARR_END - STEP], R_OR_W, LOCALITY);
}
};
/** Ends template recursion: ARR_END == 0 means every position has been prefetched. */
template<int STEP, typename ARR_T, int R_OR_W, int LOCALITY>
struct PrefetchMagic<0, STEP, ARR_T, R_OR_W, LOCALITY> {
inline __attribute__((always_inline)) PrefetchMagic(ARR_T *arr) noexcept {}
};
/**
 * Counts digit-value occurences of every element of arr into radicsOut.
 *
 * The main loop is manually unrolled 64-wide (with software prefetch every
 * 16 elements) for ILP; a small tail loop handles the remainder.
 *
 * FIX: COUNTER_TYP is unsigned, so the original bound "i < size - 64"
 * underflowed to a huge value whenever size < 64, making the unrolled loop
 * read far past the end of arr. The subtraction is now guarded.
 *
 * @param arr       Input array (read only).
 * @param size      Number of elements in arr.
 * @param radicsOut DIGITS x DIGIT_RANGE counter table; caller must zero it first.
 */
template<typename COUNTER_TYP>
static inline void countOccurences(uint32_t arr[], COUNTER_TYP size, COUNTER_TYP *radicsOut) noexcept {
COUNTER_TYP i = 0;
// Guarded bound for the 64-wide unrolled loop (0 when size < 64, so the
// tail loop below handles everything - same behaviour as before for size >= 64).
const COUNTER_TYP unrolledEnd = (size >= 64) ? (COUNTER_TYP)(size - 64) : (COUNTER_TYP)0;
//#pragma GCC unroll 4
for(; i < unrolledEnd; i += 64) {
// Prefetch for read level-1 cache
//__builtin_prefetch(&arr[i + (1 * 16)], 0, 2); // r, L2 or L3 cache
__builtin_prefetch(&arr[i + (1 * 16)]);
// Creates no object, struct is empty
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 1, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 2, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 3, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 4, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 5, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 6, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 7, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 8, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 9, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 10, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 11, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 12, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 13, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 14, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 15, radicsOut);
// Prefetch for read level-1 cache
__builtin_prefetch(&arr[i + (2 * 16)]);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 16, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 17, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 18, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 19, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 20, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 21, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 22, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 23, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 24, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 25, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 26, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 27, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 28, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 29, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 30, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 31, radicsOut);
__builtin_prefetch(&arr[i + (3 * 16)]);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 32, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 33, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 34, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 35, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 36, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 37, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 38, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 39, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 40, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 41, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 42, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 43, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 44, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 45, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 46, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 47, radicsOut);
// __builtin_prefetch(&arr[i + (4 * 16)]); // Only needed for longer than 64 unrolls
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 48, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 49, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 50, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 51, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 52, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 53, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 54, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 55, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 56, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 57, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 58, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 59, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 60, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 61, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 62, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 63, radicsOut);
}
// Tail loop: counts the remaining (size % 64 plus the deliberately skipped
// last 64-wide chunk) elements one by one.
#pragma GCC unroll 4
for(; i < size; ++i) {
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i, radicsOut);
}
}
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
// Template-recursive functor: one prefix-sum STEP for counter value i, done
// for every digit position 0..DIGIT at once (the ctor chain unrolls the
// positions). prev[] carries the running total per digit position across
// successive i values.
template<int DIGIT, typename COUNTER_TYP>
struct PrefixMagic : public PrefixMagic<DIGIT - 1, COUNTER_TYP> {
// radics: DIGITS x DIGIT_RANGE table; prev: per-position running sums; i: counter value index.
inline __attribute__((always_inline)) PrefixMagic(COUNTER_TYP *radics, COUNTER_TYP *prev, int i) noexcept
: PrefixMagic<DIGIT - 1, COUNTER_TYP>(radics, prev, i) {
// Start offset of this digit position's row inside the table.
static constexpr int DSTART = (DIGIT * DIGIT_RANGE);
radics[DSTART + i] += prev[DIGIT];
prev[DIGIT] = radics[DSTART + i];
}
};
/** Ends template recursion */
/** Ends template recursion: the DIGIT == -1 specialization does nothing. */
template<typename COUNTER_TYP>
struct PrefixMagic<-1, COUNTER_TYP> {
inline PrefixMagic(COUNTER_TYP *radics, COUNTER_TYP *prev, int i) noexcept {}
};
/** Gets REFERENCE to the given digit from the radix-array that has more than one digits */
/** Returns a REFERENCE into the multi-digit radix table: entry i of digit position DIGIT's row. */
template<int DIGIT, typename COUNTER_TYP>
static inline __attribute__((always_inline)) COUNTER_TYP &rGet(COUNTER_TYP *radics, int i) noexcept {
// The table is DIGITS consecutive rows of DIGIT_RANGE counters each.
return radics[(DIGIT * DIGIT_RANGE) + i];
}
/** Helper for calcPrefixSums */
/** Helper for calcPrefixSums: template-unrolled outer loop over the counter
 * values 0..DIGIT; for each value the inner PrefixMagic chain updates every
 * digit position's running sum. */
template<int DIGIT, typename COUNTER_TYP>
struct PMagic2 : public PMagic2<DIGIT - 1, COUNTER_TYP> {
inline __attribute__((always_inline)) PMagic2(COUNTER_TYP *radics, COUNTER_TYP *prev)
: PMagic2<DIGIT - 1, COUNTER_TYP>(radics, prev) {
// Again first the 0th digit because of parent constructors!
// This is a template-unrolled loop too
PrefixMagic<DIGITS - 1, COUNTER_TYP>(radics, prev, DIGIT);
}
};
/** Template recursion endpoint */
/** Template recursion endpoint: DIGIT == -1 does nothing. */
template<typename COUNTER_TYP>
struct PMagic2<-1, COUNTER_TYP> {
inline __attribute__((always_inline)) PMagic2(COUNTER_TYP *radics, COUNTER_TYP *prev) {}
};
/**
 * Turns the occurence counters in radics into per-digit exclusive-end prefix
 * sums (each entry becomes the count of values <= that digit value), in place.
 *
 * FIX: removed the unused local `offset` and hoisted the loop-invariant
 * DSTART computation out of the inner loop in the generic path.
 *
 * @param radics DIGITS x DIGIT_RANGE counter table, already filled by countOccurences.
 */
template<typename COUNTER_TYP>
static inline void calcPrefixSums(COUNTER_TYP *__restrict radics) noexcept {
// Running total per digit position; reset on every call.
static thread_local COUNTER_TYP prev[DIGITS];
memset(prev, 0, sizeof(prev));
// This is a template-unrolled loop too
if constexpr (DIGIT_RANGE < 1024) {
// Extra optimization for bytes and nibbles - totally unrolled loop!
PMagic2<DIGIT_RANGE - 1, COUNTER_TYP>(radics, prev);
} else {
// The above would not work for words and higher up...
#pragma GCC unroll 16
for(int j = 0; j < DIGITS; ++j) {
// Row start of digit position j - invariant over the inner loop.
const int DSTART = (j * DIGIT_RANGE);
#pragma GCC unroll 64
for(int i = 0; i < DIGIT_RANGE; ++i) {
radics[DSTART + i] += prev[j];
prev[j] = radics[DSTART + i];
}
}
}
}
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
// Template-recursive functor: one stable counting-sort pass per digit
// position. The ctor chain runs DIGIT=0 (LSD) first; each pass scatters
// `from` into `to` using the prefix sums in radics. `from`/`to` are taken
// BY VALUE, so each level swaps only its own copies; the shared `swapped`
// flag (by reference) tells the next level whether the previous pass
// flipped the buffers, and tells the caller where the final data ended up.
template<int DIGIT, typename COUNTER_TYP>
struct RadixMagic : public RadixMagic<DIGIT - 1, COUNTER_TYP> {
inline __attribute__((always_inline)) RadixMagic(bool &swapped, COUNTER_TYP *__restrict radics, uint32_t *__restrict from, uint32_t *__restrict to, COUNTER_TYP size) noexcept
: RadixMagic<DIGIT - 1, COUNTER_TYP>(swapped, radics, from, to, size) {
// Tricky: see (**)
if(swapped) { // never true for DIGIT 0, see (***)
// Previous pass wrote into `to`, so read from there this time.
std::swap(from, to);
}
// DEBUG
//printf("%d before: ", DIGIT);
//debugArr(from, size);
#pragma GCC unroll 64
for(COUNTER_TYP i = size; i > 0; --i) { // right-to-left to ensure already sorted digits order we keep for iterations
// Prefetch caches
/*
__builtin_prefetch(&from[i]); // TODO: is good?
if(i >= 64) { __builtin_prefetch(&from[i - 64]); } // TODO: manually unroll?
*/
// Get num and its new offset / location
auto num = from[i - 1];
auto digVal = getDigit<DIGIT>(num);
// Pre-decrement turns the prefix sum into the element's final slot
// (and keeps the pass stable together with the reverse iteration).
auto offset = (--rGet<DIGIT>(radics, digVal));
// Add to the proper target location
to[offset] = num;
}
// DEBUG
//printf("%d after: ", DIGIT);
//debugArr(to, size);
// (**) Only swaps pointers above in the child class constructor IF NEEDED :-)
swapped = !swapped;
}
};
/** Ends template recursion */
/** Ends template recursion: the DIGIT == -1 specialization does nothing. */
template<typename COUNTER_TYP>
struct RadixMagic<-1, COUNTER_TYP> {
inline RadixMagic(bool swapped, COUNTER_TYP *__restrict radics, uint32_t *__restrict from, uint32_t *__restrict to, COUNTER_TYP size) noexcept {}
};
/* SORT */
/*
* Sort the given array (in-place sorting) with the given size.
*
* Rem.: If you use the VectorGiverWithReuse please remind yourself to Gc() it time-to-time!
*
* Beware: GC needs to happen on all threads that use us if you want to GC!
*
* @param arr The array to sort. Result will be in the same array - as sorted.
* @param size The length of the array - should fit in the COUNTER_TYP.
* @param COUNTER_TYP OPTIONAL: When set this type will be the counter type. For most cases uint32_t is enough.
* @param REUSE OPTIONAL: When true, we reuse the array instead of always gettin' and releasin' from da heap.
* @param GC OPTIONAL: When true, we garbage collect memory from previous sorts if REUSE is true.
* @param GC_WITHOUT_SORT OPTIONAL: When true, we "just GC" but do not sort in case of GC is true.
*/
/**
 * Core sort: LSD counting radix sort, ping-ponging between `arr` and a
 * thread_local scratch vector so the result lands back in `arr`.
 *
 * FIXES over the original:
 *  - memcpy copied `size` BYTES instead of `size * sizeof(uint32_t)` bytes,
 *    truncating the copy-back whenever the pass count left the data in the
 *    scratch buffer (any odd DIGITS configuration).
 *  - `&arc[0]` is undefined behaviour on an empty vector (size == 0);
 *    use arc.data() which is defined for the empty case.
 *  - dropped the redundant std::move on a prvalue vector.
 *
 * @param arr  The array to sort; sorted result is written back into it.
 * @param size Element count - must fit in COUNTER_TYP.
 * @param COUNTER_TYP OPTIONAL: counter type; uint32_t suffices for most sizes.
 * @param REUSE OPTIONAL: keep the scratch vector alive between calls.
 * @param GC OPTIONAL: release the cached scratch memory (only meaningful with REUSE).
 * @param GC_WITHOUT_SORT OPTIONAL: with GC, only collect - skip the sort entirely.
 */
template<typename COUNTER_TYP = uint32_t, bool REUSE = false, bool GC = false, bool GC_WITHOUT_SORT = false>
inline void __attribute__((always_inline)) sort_impl(uint32_t arr[], COUNTER_TYP size) noexcept {
// The "size * REUSE" multiply makes the initial allocation zero when not
// reusing, and pre-sizes the scratch on the first reusing call - a
// compile-time, ifdef-like trick that avoids a wasted allocation.
static thread_local std::vector<uint32_t> arc(size * REUSE);
#ifndef NO_MLOCK
mlock(arr, size * sizeof(uint32_t));
#endif // !NO_MLOCK
// "Garbage-collection": swap the cached vector for an empty one so its
// heap memory is actually freed (this static is only reachable here).
if constexpr (GC) {
arc = std::vector<uint32_t>();
if constexpr (GC_WITHOUT_SORT) {
return;
}
}
// Holds "digit" occurences, then prefix sums.
// Row 0 is the LSB digit position, the last row is the MSB.
static thread_local COUNTER_TYP radics[DIGITS * DIGIT_RANGE];
#ifndef NO_MLOCK
mlock(radics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP));
#endif // !NO_MLOCK
// Write-prefetch the counter table (only worthwhile for small tables).
if constexpr (DIGIT_RANGE <= 1024) {
PrefetchMagic<DIGITS * DIGIT_RANGE, (64/sizeof(COUNTER_TYP)), COUNTER_TYP, 1/*w*/> pm(radics);
}
memset(radics, 0, sizeof(radics));
// Calculate occurences of digits
countOccurences(arr, size, radics);
//debugRadics<COUNTER_TYP>(radics);
// Calculate prefix sums
calcPrefixSums(radics);
//debugRadics<COUNTER_TYP>(radics);
// Regular LSD radix sort needs a second buffer; ensure the scratch vector
// holds `size` elements. Branch resolved at compile time.
if constexpr (REUSE) {
arc.resize(size);
} else {
// Assignment (not .clear()) so memory of any previous sort is released.
arc = std::vector<uint32_t>(size);
}
#ifndef NO_MLOCK
mlock(arc.data(), size * sizeof(uint32_t));
#endif // !NO_MLOCK
uint32_t *from = arr;
uint32_t *to = arc.data();
static thread_local bool swapped;
swapped = false; // must be separate line
// Runs all DIGITS passes; `swapped` tracks which buffer holds the result.
RadixMagic<DIGITS - 1, COUNTER_TYP> r(swapped, radics, from, to, size);
// After an odd number of buffer flips the sorted data sits in the scratch
// vector - copy it back into the caller's array.
if(swapped) {
memcpy(arr, to, size * sizeof(uint32_t)); // FIX: was `size` bytes only
}
#ifndef NO_MLOCK
munlock(radics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP));
munlock(arc.data(), size * sizeof(uint32_t));
munlock(arr, size * sizeof(uint32_t));
#endif // !NO_MLOCK
}
/**
* Garbage collect reused data structures from last call.
*
* This is optimized and is a NO-OP if MAGYAR_SORT_DEFAULT_REUSE is not defined!
* - unless you use the FORCE! May it be with you if you need it.
*
* @param FORCE OPTIONAL: When true, the gc happens even if MAGYAR_SORT_DEFAULT_REUSE is not defined!
*/
/**
 * Garbage collect the reused thread_local scratch buffer from earlier sorts.
 *
 * A NO-OP unless MAGYAR_SORT_DEFAULT_REUSE is defined - or unless you use
 * the FORCE template parameter, which collects regardless of the macro.
 *
 * @param FORCE OPTIONAL: when true, collect even without MAGYAR_SORT_DEFAULT_REUSE.
 */
template<bool FORCE = false, typename COUNTER_TYP = size_t>
inline void gc() noexcept {
#ifdef MAGYAR_SORT_DEFAULT_REUSE
constexpr bool reuseIsDefault = true;
#else
constexpr bool reuseIsDefault = false;
#endif
if constexpr (FORCE || reuseIsDefault) {
// GC-only call: frees the cached buffer, performs no sorting.
MagyarSort::sort_impl<COUNTER_TYP, true, true, true>(nullptr, 0);
}
}
/**
* Sort the given array (in-place sorting) with the given size.
*
* Rem.: Please remind yourself to cc() from time-to-time!
* Rem.: Thread-safe to use!
*
* Beware: MagyarSort::gc<true>(); needs to happen on all threads that use this variant otherwise memory leaks away!
* Please mind the "true" template parameter that forces the GC even when sort by default not reuses...
*
* @param arr The array to sort. Result will be in the same array - as sorted.
* @param size The length of the array.
* @param COUNTER_TYP OPTIONAL: It is best kepts as size_t, but for smaller arrays, you can use uint32_t.
* @param GC OPTIONAL: When true, we garbage collect before this sort - so cached memory size will be "size" elems.
*/
/**
 * Sorts `arr` in place, reusing the thread_local scratch buffer across calls
 * (far fewer heap allocations; faster on gcc and somewhat on clang).
 *
 * FIX: the GC template parameter was accepted and documented but never
 * forwarded to sort_impl, so requesting a pre-sort collection silently did
 * nothing. It is now passed through.
 *
 * @param arr  The array to sort; result is written back into it.
 * @param size Element count.
 * @param GC OPTIONAL: when true, release cached scratch memory before sorting.
 */
template<typename COUNTER_TYP = size_t, bool GC = false>
inline void __attribute__((always_inline)) sort_reuse(uint32_t arr[], COUNTER_TYP size) noexcept {
MagyarSort::sort_impl<COUNTER_TYP, true, GC>(arr, size);
}
/**
* Sort the given array (in-place sorting) with the given size.
*
* Rem.: Thread-safe to use!
*
* Beware: MagyarSort::gc(); needs to happen on all threads that use this variant otherwise memory leaks away!
*
* @param arr The array to sort. Result will be in the same array - as sorted.
* @param size The length of the array.
* @param COUNTER_TYP: Should be size_t for HUGE arrays, but regular arrays, you can use uint32_t. Should be auto found-out
*/
/**
 * Sorts `arr` in place, allocating the scratch buffer fresh for this call.
 *
 * Safer default: nothing is cached, so no gc() bookkeeping is required
 * for this variant (at the cost of one heap allocation per sort).
 *
 * @param arr  The array to sort; result is written back into it.
 * @param size Element count.
 */
template<typename COUNTER_TYP>
inline void __attribute__((always_inline)) sort_no_reuse(uint32_t arr[], COUNTER_TYP size) noexcept {
MagyarSort::sort_impl<COUNTER_TYP>(arr, size);
}
/*
* Sort the given array (in-place sorting) with the given size.
*
* Rem.: If you use the VectorGiverWithReuse please remind yourself to Gc() it time-to-time!
*
* Beware: MagyarSort::gc(); should be called after "sort bursts" (consecutive fast sorts of when you need memory
* on all threads that use this variant otherwise memory leaks away as biggest sorted array keeps being in ram!
* This depends on the config #define MAGYAR_SORT_DEFAULT_REUSE is defined or not. Define and you get reuse
* and if you get reuse you can call multiple sorts with reused temporary buffers that you gc() afterwards!
*
* @param arr The array to sort. Result will be in the same array - as sorted.
* @param size The length of the array.
* @param COUNTER_TYP: Should be size_t for HUGE arrays, but regular arrays, you can use uint32_t. Should be auto found-out
*/
/**
 * Default entry point: sorts `arr` in place.
 *
 * Dispatches at preprocessing time: with MAGYAR_SORT_DEFAULT_REUSE defined
 * it delegates to sort_reuse (remember to gc() after sort bursts), otherwise
 * to sort_no_reuse.
 *
 * @param arr  The array to sort; result is written back into it.
 * @param size Element count.
 */
template<typename COUNTER_TYP>
inline void sort(uint32_t arr[], COUNTER_TYP size) noexcept {
#ifndef MAGYAR_SORT_DEFAULT_REUSE
MagyarSort::sort_no_reuse<COUNTER_TYP>(arr, size);
#else
MagyarSort::sort_reuse<COUNTER_TYP>(arr, size);
#endif
}
};
#endif