#ifndef THIER_SORT_H
#define THIER_SORT_H

/*
 * This sorting algorithm is a two-step bucket sort:
 * - Step 1 creates top-down pre-sorted buckets with a float radix (allocates)
 * - Step 2 uses M quicksort runs with source-to-destination backwrite.
 *
 * What is "float radix" sort?
 * - We convert each integer key to a 32-bit float value
 * - From that we create an 8-bit ((*): or 16-bit) approximator
 * - This is what we radix sort on, as if it were an 8-bit char (**)
 * - In both cases, those are just right-shifted floats (LSB bits lost)
 *
 * For calculating occurrence counts, we need one pass over the whole data
 * set. Then we need another pass - into a new array - to do the radix step;
 * with the 16-bit float variant we need two such passes (new + old array).
 *
 * For random data, fewer passes are likely best, so we do the 8-bit
 * version as the first implementation. This leaves more data in the
 * buckets for the "Step 2" quicksort part - which is not std::sort or
 * the like, but a variant we write that puts data back into the original
 * array, so no back-copy is necessary. For the (*) variant, with 16-bit
 * floats, we do not need a back-copy either and can quicksort in place!
 *
 * To avoid costly float conversions, we can use SSE2 here (or Neon)!
 *
 * These are useful for us:
 *
 * __m128i _mm_load_si128 (__m128i const* mem_addr)
 * #include <emmintrin.h>
 * Instruction: movdqa xmm, m128 [Latency: 6, Throughput: 0.33 - 0.5]
 * CPUID Flags: SSE2
 * Description
 * Load 128-bits of integer data from memory into dst. mem_addr must be
 * aligned on a 16-byte boundary or a general-protection exception may be
 * generated.
 *
 * __m128i _mm_add_epi8 (__m128i a, __m128i b)
 * #include <emmintrin.h>
 * Instruction: paddb xmm, xmm [Latency: 1, Throughput: 0.33]
 * CPUID Flags: SSE2
 * Description
 * Add packed 8-bit integers in a and b, and store the results in dst.
 *
 * __m128i _mm_and_si128 (__m128i a, __m128i b)
 * #include <emmintrin.h>
 * Instruction: pand xmm, xmm [Latency: 1, Throughput: 0.33]
 * CPUID Flags: SSE2
 * Description
 * Compute the bitwise AND of 128 bits (representing integer data)
 * in a and b, and store the result in dst.
 *
 * __m128i _mm_set_epi8 (
 *     char e15, char e14, char e13, char e12,
 *     char e11, char e10, char e9,  char e8,
 *     char e7,  char e6,  char e5,  char e4,
 *     char e3,  char e2,  char e1,  char e0)
 * #include <emmintrin.h>
 * Instruction: Sequence [XXX: slow - best outside loop]
 * CPUID Flags: SSE2
 * Description
 * Set packed 8-bit integers in dst with the supplied values.
 *
 * __m128 _mm_cvtepi32_ps (__m128i a)
 * #include <emmintrin.h>
 * Instruction: cvtdq2ps xmm, xmm [Latency: 4, Throughput: 0.5]
 * CPUID Flags: SSE2
 * Description
 * Convert packed signed 32-bit integers in a to packed single-precision
 * (32-bit) floating-point elements, and store the results in dst.
 *
 * __m128i _mm_castps_si128 (__m128 a)
 * #include <emmintrin.h>
 * CPUID Flags: SSE2
 * Description
 * Cast vector of type __m128 to type __m128i. This intrinsic is only
 * used for compilation and does not generate any instructions, thus it
 * has zero latency.
 *
 * void _mm_store_si128 (__m128i* mem_addr, __m128i a)
 * #include <emmintrin.h>
 * Instruction: movdqa m128, xmm [Latency: 1 - 5, Throughput: 0.5 - 1]
 * CPUID Flags: SSE2
 * Description
 * Store 128-bits of integer data from a into memory. mem_addr must be
 * aligned on a 16-byte boundary or a general-protection exception may
 * be generated.
 *
 * void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_addr)
 * #include <emmintrin.h>
 * Instruction: maskmovdqu xmm, xmm [Latency: 6, Throughput: 1 - XXX: NT!]
 * CPUID Flags: SSE2
 * Description
 * Conditionally store 8-bit integer elements from a into memory using
 * mask (elements are not stored when the highest bit is not set in the
 * corresponding element) and a non-temporal memory hint. mem_addr does
 * not need to be aligned on any particular boundary.
 *
 * int _mm_extract_epi16 (__m128i a, int imm8)
 * #include <emmintrin.h>
 * Instruction: pextrw r32, xmm, imm8 [Latency: 4, Throughput: 1]
 * CPUID Flags: SSE2
 * Description
 * Extract a 16-bit integer from a, selected with imm8,
 * and store the result in the lower element of dst.
 * Operation
 * dst[15:0] := (a[127:0] >> (imm8[2:0] * 16))[15:0]
 * dst[31:16] := 0
 *
 * __m128i _mm_srli_epi32 (__m128i a, int imm8)
 * #include <emmintrin.h>
 * Instruction: psrld xmm, imm8 [Latency: 1, Throughput: 0.5]
 * CPUID Flags: SSE2
 * Description
 * Shift packed 32-bit integers in a right by imm8 while shifting in
 * zeros, and store the results in dst.
 *
 * See also:
 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
 * https://www.laruence.com/x86/MASKMOVDQU.html
 * https://blogs.fau.de/hager/archives/2103
 *
 * For ARM / Neon:
 * https://github.com/DLTcollab/sse2neon
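 *
 * To make the above concrete - an illustrative sketch only, not the final
 * implementation (the function name radix8_x4 is made up) - this is
 * roughly how the listed intrinsics chain together to produce four 8-bit
 * approximators per step, assuming a 16-byte aligned key array:
 *
 *   #include <emmintrin.h>
 *   #include <stdint.h>
 *
 *   // Illustrative sketch: four int32 keys -> four 8-bit float
 *   // approximators (left in the low byte of each 32-bit lane).
 *   static inline __m128i radix8_x4(const int32_t *keys)
 *   {
 *       __m128i k  = _mm_load_si128((const __m128i *)keys); // 4 keys
 *       __m128  f  = _mm_cvtepi32_ps(k);                    // int -> float
 *       __m128i fb = _mm_castps_si128(f);                   // raw float bits
 *       return _mm_srli_epi32(fb, 24); // top 8 bits: sign + 7 exponent bits
 *   }
 *
 * Note that negative keys set the float sign bit, so they land in the
 * high buckets in reversed order - this is what the signed-mode handling
 * further below has to compensate for.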
 *
 * Data layout:
 *
 * [int32_t key][int32_t i][int32_t key][int32_t i]...
 *
 * That is, we extract keys and pointer-like offsets into an array of
 * the "real" elements. So you sort "short pointers".
 *
 * We might create a version that just sorts bare integers, but that is
 * only good for showing off speed compared to other algorithms.
 *
 * Main alg:
 * - We first process data to get 16-byte alignment (maybe 64-byte align?)
 * - We process the input in 64 bytes / loop: 3x SSE 128-bit reads + 1x
 *   normal (scalar) read
 * - We process the remaining part of the array
 *
 * The middle part should be skipped if there are fewer than 64 bytes...
 *
 * (pre-loop)
 * pat1 = set(1,1,1,1,0,0,0,0,1,1,1,1...)
 * pat2 = set(0,0,0,0,1,1,1,1,0,0,0,0...)
 *
 * (loop)
 * a = LOAD4int[+0]
 * b = LOAD4int[+4]
 * c = LOAD4int[+8]
 * <+d some int code>
 *
 * pata1 = a & pat1;
 * pata2 = a & pat2;
 * patb1 = b & pat1;
 * patb2 = b & pat2;
 * patc1 = c & pat1;
 * patc2 = c & pat2;
 * <+d some int code>
 *
 * rsa = pata1 >> 24;
 * rsb = patb1 >> 24;
 * rsc = patc1 >> 24;
 * <+d some int code>
 *
 * resa = rsa + pata2;
 * resb = rsb + patb2;
 * resc = rsc + patc2;
 * <+d some int code>
 *
 * store(resa, resb, resc) -> tmp
 *
 * The tmp should automatically be 16-byte aligned, being stack-allocated!
 *
 * The integer code that processes tmp depends on what phase we are in: in
 * the phase where we count occurrences, we do not need the pat[x]2 parts!
 * However, the reorganize-phase passes need them to do the copying around!
 *
 * This is how we do 64 bytes / iteration, and it should pipeline well!
 *
 * However, the "integer" code that processes tmp is the bottleneck: when
 * counting occurrences, that part has to process 16 counts, likely in (at
 * least) 4 steps on a 4-wide pipeline - and more in the other phase,
 * where we are not counting but moving elements to where they belong.
 * XXX: Now that I think more... are the pata2-like values even needed, or
 * do they only cause issues? I mean we could (should) copy the key/index
 * pair directly from the source!
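 *
 * For illustration only (a scalar reference that skips the SSE2 batching
 * described above; the names kv64, radix8 and count_buckets are made up),
 * the occurrence-counting pass plus the prefix sum that yields the bucket
 * start offsets could look like this:
 *
 *   #include <stdint.h>
 *   #include <string.h>
 *
 *   typedef struct { int32_t key; int32_t idx; } kv64; // [key][index] pair
 *
 *   static inline uint8_t radix8(int32_t key)  // scalar reference version
 *   {
 *       float f = (float)key;                  // int -> float
 *       uint32_t bits;
 *       memcpy(&bits, &f, sizeof bits);        // reinterpret the float bits
 *       return (uint8_t)(bits >> 24);          // 8-bit approximator
 *   }
 *
 *   // Pass 1: histogram of the approximators; an exclusive prefix sum
 *   // then turns the counts into bucket start offsets for the radix pass.
 *   static void count_buckets(const kv64 *a, size_t n, uint32_t start[256])
 *   {
 *       uint32_t count[256] = {0};
 *       for (size_t i = 0; i < n; i++)
 *           count[radix8(a[i].key)]++;
 *       for (uint32_t b = 0, pos = 0; b < 256; b++) {
 *           start[b] = pos;
 *           pos += count[b];
 *       }
 *   }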
 *
 * In the 8-bit float version, I guess we only have bits from the
 * exponent... In the 16-bit version, with an extra pass, we have a few
 * bits from the mantissa as well...
 *
 * I still think that in the random case, the fewer passes of the 8-bit
 * float version are worth it!
 *
 * String sorting:
 * - First create the [key,index] 64-bit data for the strings to sort.
 * - Key should be the first 4 characters (padded with zeroes if there
 *   are not enough characters)
 * - Do the sorting, with a comparator using both the first 4 chars AND strcmp
 *
 * ^^I actually think this gives a pretty fast string comparison, because
 * it does give a lot of distinct integer values even for short strings
 * (8 bit works)!
 *
 * XXX: We can do approximately 2x (1.5x?) the speed for integer-only sort!
 *
 * Unsigned/signed:
 * - Custom trickery is needed when processing tmp (both occurrence
 *   counting and moving)
 * - It should assign a different bucket based on the topmost bit in
 *   signed modes!
 * - Modes: msb_early, msb_late
 * - These two handle: integer, unsigned int, float32
 *
 * XXX: So basically a good sorting algorithm for float-keyed sorting!
 *
 * Number of passes (8 bit): 3.5
 * - 1x occurrence-counting pass
 * - 1x 8-bit float radix pass
 * - 1.5x quicksort pass [more elements]
 *
 * Number of passes (16 bit): 4.2
 * - 1x occurrence-counting pass
 * - 2x 8-bit float radix pass
 * - 1.2x quicksort pass [fewer elements]
 *
 * Rem.: Basically these are regular, pipelined radix passes, with only
 * the float conversions AND shifts somewhat vectorized with SSE...
 */

#endif /* THIER_SORT_H */