magyarsort/thiersort.h


#ifndef THIER_SORT_H
#define THIER_SORT_H
/*
* This is a two-step bucket sort algorithm:
* - Step 1 creates top-down pre-sorted buckets with a float radix (allocates)
* - Step 2 runs M quicksorts with source-to-destination write-back.
*
* What is "float radix" sort?
* - We convert each integer key to a 32 bit float value
* - We take that and create an 8 bit ((*): or 16 bit) approximator
* - This is what we radix sort on, as if it were an 8 bit char (**)
* - In both cases, those are just right-shifted floats (LSB bits lost)
*
* For calculating occurrence counts, we need a pass over the whole data
* set. Then we need another pass - into a new array - to do the radix
* step; the 16 bit float variant needs two such steps (new + old array).
*
* For random data, fewer passes likely win, so the 8 bit version is the
* first implementation. This leaves more data in the buckets for the
* "Step 2" quicksort part - which is not std::sort or the like, but a
* variant we write that puts the data back into the original array, so
* no back-copy is necessary. The (*) variant, with 16 bit floats, also
* needs no back-copy and can quicksort in place!
*
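* A minimal scalar sketch of the 8 bit float radix key (illustrative;
* the float_radix8 name is hypothetical - the real per-type handling
* lives in ts_radixi() below):
*
*   static inline uint8_t float_radix8(int32_t x) {
*       union { float f; uint32_t u; } k;
*       k.f = (float)x;              // what cvtdq2ps does 4-wide
*       return (uint8_t)(k.u >> 24); // sign bit + 7 exponent MSBs
*   }
*
* Negative keys additionally need the sign fixup of ts_floatsigntrick()
* below to keep the bucket order monotone.
*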
* To keep the costly float conversions cheap, we can use SSE2 here (or NEON)!
*
* These are useful for us:
*
* __m128i _mm_load_si128 (__m128i const* mem_addr)
* #include <emmintrin.h>
* Instruction: movdqa xmm, m128 [Latency: 6, Throughput: 0.33 - 0.5]
* CPUID Flags: SSE2
* Description
* Load 128-bits of integer data from memory into dst. mem_addr must be
* aligned on a 16-byte boundary or a general-protection exception may be
* generated.
*
* __m128i _mm_add_epi8 (__m128i a, __m128i b)
* #include <emmintrin.h>
* Instruction: paddb xmm, xmm [Latency: 1, Throughput: 0.33]
* CPUID Flags: SSE2
* Description
* Add packed 8-bit integers in a and b, and store the results in dst.
*
* __m128i _mm_and_si128 (__m128i a, __m128i b)
* #include <emmintrin.h>
* Instruction: pand xmm, xmm [Latency: 1, Throughput: 0.33]
* CPUID Flags: SSE2
* Description
* Compute the bitwise AND of 128 bits (representing integer data)
* in a and b, and store the result in dst.
*
* __m128i _mm_set_epi8 (
* char e15, char e14, char e13, char e12,
* char e11, char e10, char e9, char e8,
* char e7, char e6, char e5, char e4,
* char e3, char e2, char e1, char e0)
* #include <emmintrin.h>
* Instruction: Sequence [XXX: slow - best outside loop]
* CPUID Flags: SSE2
* Description
* Set packed 8-bit integers in dst with the supplied values.
*
* __m128 _mm_cvtepi32_ps (__m128i a)
* #include <emmintrin.h>
* Instruction: cvtdq2ps xmm, xmm [Latency: 4, Throughput: 0.5]
* CPUID Flags: SSE2
* Description
* Convert packed signed 32-bit integers in a to packed single-precision
* (32-bit) floating-point elements, and store the results in dst.
*
* __m128i _mm_castps_si128 (__m128 a)
* #include <emmintrin.h>
* CPUID Flags: SSE2
* Description
* Cast vector of type __m128 to type __m128i. This intrinsic is only
* used for compilation and does not generate any instructions, thus it
* has zero latency.
*
* void _mm_store_si128 (__m128i* mem_addr, __m128i a)
* #include <emmintrin.h>
* Instruction: movdqa m128, xmm [Latency: 1 - 5, Throughput: 0.5 - 1]
* CPUID Flags: SSE2
* Description
* Store 128-bits of integer data from a into memory. mem_addr must be
* aligned on a 16-byte boundary or a general-protection exception may
* be generated.
*
* void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_addr)
* #include <emmintrin.h>
* Instruction: maskmovdqu xmm, xmm [Latency: 6, Throughput: 1 - XXX: NT!]
* CPUID Flags: SSE2
* Description
* Conditionally store 8-bit integer elements from a into memory using
* mask (elements are not stored when the highest bit is not set in the
* corresponding element) and a non-temporal memory hint. mem_addr does
* not need to be aligned on any particular boundary.
*
* int _mm_extract_epi16 (__m128i a, int imm8)
* #include <emmintrin.h>
* Instruction: pextrw r32, xmm, imm8 [Latency: 4, Throughput: 1]
* CPUID Flags: SSE2
* Description
* Extract a 16-bit integer from a, selected with imm8,
* and store the result in the lower element of dst.
* Operation
* dst[15:0] := (a[127:0] >> (imm8[2:0] * 16))[15:0]
* dst[31:16] := 0
*
* __m128i _mm_srli_epi32 (__m128i a, int imm8)
* #include <emmintrin.h>
* Instruction: psrld xmm, imm8 [Latency: 1, Throughput: 0.5]
* CPUID Flags: SSE2
* Description
* Shift packed 32-bit integers in a right by imm8 while shifting in
* zeros, and store the results in dst.
*
* See also:
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
* https://www.laruence.com/x86/MASKMOVDQU.html
* https://blogs.fau.de/hager/archives/2103
*
* For ARM / Neon:
* https://github.com/DLTcollab/sse2neon
*
* Data layout:
*
* [int32_t key][int32_t i][int32_t key][int32_t i]...
*
* That is, we extract keys and "pointer"-like offsets into an
* array of the "real" elements. So you sort "short pointers".
*
* We might create a version that just sorts bare integers, but
* that is only good for showing off speed against other algorithms.
*
* Main alg:
* - We first process data up to the 16 byte alignment (maybe 64 byte align?)
* - We process the input at 64 bytes / loop: 3x SSE 128 bit reads + 1x normal
* - We process the remaining part of the array
*
* The middle part should be skipped if fewer than 64 bytes remain this way...
*
* (pre-loop)
* pat1 = set(1,1,1,1,0,0,0,0,1,1,1,1...)
* pat2 = set(0,0,0,0,1,1,1,1,0,0,0,0...)
*
* (loop)
* a = LOAD4int[+0]
* b = LOAD4int[+4]
* c = LOAD4int[+8]
* <+d some int code>
*
* pata1 = a & pat1;
* pata2 = a & pat2;
* patb1 = b & pat1;
* patb2 = b & pat2;
* patc1 = c & pat1;
* patc2 = c & pat2;
* <+d some int code>
*
* rsa = pata1 >> 24;
* rsb = patb1 >> 24;
* rsc = patc1 >> 24;
* <+d some int code>
*
* resa = rsa + pata2;
* resb = rsb + patb2;
* resc = rsc + patc2;
* <+d some int code>
*
* store(resa, resb, resc) -> tmp
*
* <integer code to process(tmp)>
*
* The tmp buffer must be 16 byte (not bit!) aligned; being on-stack,
* an explicit _Alignas(16) on it guarantees this!
*
* The integer code that processes tmp depends on which phase we are in:
* in the occurrence counting phase we do not need the pat[x]2 parts!
* The reorganize phase passes, however, need them to do the copying around!
*
* This is how we do 64 bytes / iteration, and it should pipeline well!
*
* However, the "integer" code that processes tmp is the bottleneck: when
* occurrence counting it has to process 16 counts, likely taking (at
* least) 4 steps on a 4-wide pipeline, and more in the other phase
* where we are not counting but moving elements to where they belong.
* XXX: Now that I think more... the pata2-likes are not needed and only
* cause issues? I mean we could (should) copy key/index directly from
* the source!
*
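* One possible intrinsics rendering of the convert+shift pipeline above
* (counting phase, one 4-key vector, ignoring the interleaved {key, i}
* layout and the pat masks for clarity - a sketch, not the tuned loop):
*
*   __m128i a  = _mm_load_si128((const __m128i*)src);       // 4 int32 keys
*   __m128  fa = _mm_cvtepi32_ps(a);                        // int32 -> float32
*   __m128i ra = _mm_srli_epi32(_mm_castps_si128(fa), 24);  // 8 bit approx.
*   _mm_store_si128((__m128i*)tmp, ra);                     // hand to int code
*   // integer code: ++counts[tmp[0]]; ... ++counts[tmp[3]];
*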
* In the 8 bit float version, I guess we only have bits from the sign
* and exponent.. In the 16 bit version, with an extra pass, we get a
* few bits from the mantissa too..
*
* I still think that in the random case the fewer passes of the 8 bit
* float version are worth it!
*
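* Worked example of the two approximator widths: the key 1000 converts
* to 1000.0f whose bits are 0x447A0000. The 8 bit approximator (>>24)
* is 0x44 - the sign bit plus the 7 exponent MSBs; a 16 bit one (>>16)
* would be 0x447A, adding the exponent LSB and 7 mantissa bits.
*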
* String sorting:
* - First create the [key, index] 64 bit data for the strings to sort.
* - The key should be the first 4 characters (zero padded if shorter)
* - Do the sorting, with a comparator using both the first 4 chars AND strcmp
*
* ^^I actually think this gives a pretty fast string comparison, because
* even short strings yield well-spread integer key values (8 bit works)!
*
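* A sketch of building the {key, index} elements for strings (the strs
* and elems names are hypothetical; big-endian packing makes unsigned
* integer order of the key match the lexicographic byte order):
*
*   for(uint32_t i = 0; i < n; ++i) {
*       uint32_t k = 0;
*       for(int c = 0; c < 4 && strs[i][c]; ++c)
*           k |= (uint32_t)(unsigned char)strs[i][c] << (24 - 8 * c);
*       elems[i].key.u = k; // zero padded when shorter than 4 chars
*       elems[i].i = i;     // "short pointer" back to the real string
*   }
*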
* XXX: We can reach approximately 2x (1.5x?) the speed for an integer-only sort!
*
* Unsigned/signed:
* - Custom trickery is needed when processing tmp (both the occurrence
*   counting and the move)
* - It should assign a different bucket based on the topmost bit in the
*   signed modes!
* - Modes: msb_early, msb_late
* - These two handle: integer, unsigned int, float32
*
* XXX: So basically a good sorting algo for float-keyed sorting!
*
* Number of passes (8 bit): 3.5
* - 1x occurrence counting pass
* - 1x 8 bit float radix pass
* - 1.5x quicksort pass [more elements per bucket]
*
* Number of passes (16 bit): 4.2
* - 1x occurrence counting pass
* - 2x 8 bit float radix pass
* - 1.2x quicksort pass [fewer elements per bucket]
*
* Rem.: Basically these are regular, pipelined radix passes, with only
* the float conversions AND shifts vectorized with SSE somewhat...
*/
#include <stdint.h>
#include <stddef.h> /* size_t for the allocator callbacks */
#ifdef TS_SSE2
#include <emmintrin.h>
#endif /* TS_SSE2 */
#ifdef TS_SSE2_NEON
#include "sse2neon.h"
#define TS_SSE2
#endif /* TS_SSE2_NEON */
#ifndef TS_CUSTOM_TYPES
#define TSBOOL uint8_t
#define TSTRUE 1
#define TSFALSE 0
#define TSU8 uint8_t
#define TSU32 uint32_t
#define TSI32 int32_t
#endif /* TS_CUSTOM_TYPES */
/* Nearly the same alg works for all these key types */
union tskey {
    TSI32 i;
    TSU32 u;
    float f;
};
struct tselem {
    union tskey key;
    TSU32 i;
};
/* Forward decl of internal code */
static inline void thiersort8_internal(
    struct tselem *arr,
    uint32_t length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr),
    TSBOOL (*lt)(
        const union tskey ak,
        const union tskey bk,
        const TSU32 ai,
        const TSU32 bi,
        void *reent_data),
    const TSBOOL isint,
    const TSBOOL isunsigned,
    const TSBOOL isfloat);
/** The '<' operation used for int32 */
static inline TSBOOL ts_lt_int(
    const union tskey ak,
    const union tskey bk,
    const TSU32 ai,
    const TSU32 bi,
    void *reent_data) { return (ak.i < bk.i); }
/** The '<' operation used for unsigned int32 */
static inline TSBOOL ts_lt_uint(
    const union tskey ak,
    const union tskey bk,
    const TSU32 ai,
    const TSU32 bi,
    void *reent_data) { return (ak.u < bk.u); }
/** The '<' operation used for float */
static inline TSBOOL ts_lt_float(
    const union tskey ak,
    const union tskey bk,
    const TSU32 ai,
    const TSU32 bi,
    void *reent_data) { return (ak.f < bk.f); }
/**
 * Sort {key, index32} elements where the key is a signed 32 bit integer.
 */
static inline void thiersort8_intkey(
    struct tselem *arr,
    uint32_t length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr)) {
    thiersort8_internal(
        arr,
        length,
        malloc,
        free,
        ts_lt_int,
        TSTRUE,
        TSFALSE,
        TSFALSE);
}
/**
 * Sort {key, index32} elements where the key is an unsigned 32 bit integer.
 */
static inline void thiersort8_uintkey(
    struct tselem *arr,
    uint32_t length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr)) {
    thiersort8_internal(
        arr,
        length,
        malloc,
        free,
        ts_lt_uint,
        TSFALSE,
        TSTRUE,
        TSFALSE);
}
/**
 * Sort {key, index32} elements where the key is a 32 bit float.
 */
static inline void thiersort8_floatkey(
    struct tselem *arr,
    uint32_t length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr)) {
    thiersort8_internal(
        arr,
        length,
        malloc,
        free,
        ts_lt_float,
        TSFALSE,
        TSFALSE,
        TSTRUE);
}
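/*
 * Example use - a sketch assuming <stdlib.h> malloc/free and values in
 * a separate float array (vals and n are hypothetical names):
 *
 *   struct tselem *elems = malloc(sizeof(struct tselem) * n);
 *   for(uint32_t i = 0; i < n; ++i) {
 *       elems[i].key.f = vals[i];
 *       elems[i].i = i;
 *   }
 *   thiersort8_floatkey(elems, n, malloc, free);
 *   // elems[0].i now indexes the smallest value in vals
 */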
/**
 * Do the sign-trick based radix transform on the non-2s-complement
 * (sign-magnitude) 8 bit float top byte!
 *
 * Reading the input byte as 2s complement vs. what it actually means:
 *
 * -1 means -127
 * -2 means -126
 * ...
 * -127 means -1
 * -128 means -0
 * 0 means +0
 * 1 means 1
 * ...
 * 127 means 127
 *
 * We convert the above (in this order) into [0..255]!
 */
static inline TSU8 ts_floatsigntrick(TSU8 floatsigned) {
    if((floatsigned & 0x80) == 0) {
        /* Positive: map [+0..127] onto [128..255] */
        return (TSU8)(128 + floatsigned);
    } else {
        /* Negative: reverse the magnitude to map [-127..-0] onto [0..127] */
        return (TSU8)(127 - (floatsigned & 0x7f));
    }
}
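/*
 * Spot-checking the mapping above: 0x7F (+127) -> 255, 0x00 (+0) -> 128,
 * 0x80 (-0) -> 127 and 0xFF (-127, the most negative) -> 0, so the
 * result is monotone over the sign-magnitude input as required.
 */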
/** Gets which bucket (index) the given key should go into */
static inline TSU8 ts_radixi(
    union tskey key,
    const TSBOOL isint,
    const TSBOOL isunsigned,
    const TSBOOL isfloat) {
    /* The compiler should optimize out the branches here! */
    union tskey k;
    if(isint) {
        /* Sign bit can be 1! */
        k.f = (float)(key.i);
        /* 8 bit float: keep the top byte only */
        k.u >>= 24;
        /* Need the float sign trickery */
        return ts_floatsigntrick((TSU8)k.u);
    }
    if(isunsigned) {
        /* Sign bit CANNOT be 1! */
        k.f = (float)(key.u);
        /* 8 bit float: keep the top byte only */
        k.u >>= 24;
        /* We are fine as-is! */
        return (TSU8)k.u;
    }
    if(isfloat) {
        /* Sign bit can be 1! */
        k.f = key.f;
        /* 8 bit float: keep the top byte only */
        k.u >>= 24;
        /* Need the float sign trickery */
        return ts_floatsigntrick((TSU8)k.u);
    }
    /* Unreachable when exactly one type flag is set */
    return 0;
}
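/*
 * Worked example for the signed int path: the key 1000 becomes 1000.0f
 * with bits 0x447A0000, >>24 gives 0x44 (68), and the positive branch
 * of ts_floatsigntrick() maps it to 128 + 68 = 196, its bucket index.
 */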
/** Internal code reuse (type-branches should be optimized out) */
static inline void thiersort8_internal(
    struct tselem *arr,
    const uint32_t length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr),
    TSBOOL (*lt)(
        const union tskey ak,
        const union tskey bk,
        const TSU32 ai,
        const TSU32 bi,
        void *reent_data),
    const TSBOOL isint,
    const TSBOOL isunsigned,
    const TSBOOL isfloat) {
    /* Preparation */
    uint32_t radics[256] = {0};
    /* [from, to) index pairs: only where prefix sums change - usually not full */
    uint32_t real_radics[256 * 2] = {0};
    /* Number of used entries in real_radics (needed by Step 2 below) */
    TSU32 reali = 0;
    struct tselem *arr2 = malloc(sizeof(struct tselem) * length);
    if(arr2 == 0) {
        return; /* Allocation failed - leave the input untouched */
    }
    /* Step 1) Bucketing */
#ifdef TS_SSE2
    /* TODO: implement with SSE2 (see the intrinsics listed above) */
#else
    /* Occurrence counting O(n) */
    for(TSU32 i = 0; i < length; ++i) {
        ++radics[ts_radixi(arr[i].key, isint, isunsigned, isfloat)];
    }
    /* Prefix sum + real radics calc O(256) */
    /* Radics: */
    /* fr: {10, 20, 10, 0, 5, 15,...} */
    /* to: {10, 30, 40, 40, 45, 60,..} */
    /* Real radics: */
    /* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */
    /*       0.       1.        2.        4.        5.      */
    /* (because radix value 3 is not found in the input) */
    TSU32 prev = 0;
    for(int i = 0; i < 256; ++i) {
        if(radics[i] != 0) {
            radics[i] += prev;
            real_radics[reali] = prev;
            real_radics[reali + 1] = radics[i];
            prev = radics[i];
            reali += 2;
        } else {
            radics[i] += prev;
            prev = radics[i];
        }
    }
    /* Move elements according to their buckets O(n) */
    /* Going right-to-left keeps the original internal order (stable) */
    for(TSU32 i = length; i > 0; --i) {
        TSU8 radi = ts_radixi(arr[i - 1].key, isint, isunsigned, isfloat);
        TSU32 offset = --radics[radi];
        arr2[offset] = arr[i - 1];
    }
#endif /* TS_SSE2 */
    /* Step 2) Sorting bucket contents - also moving back to the original array */
    for(TSU32 i = 0; i < reali; i += 2) {
        /* inclusive */
        TSU32 from = real_radics[i];
        /* non-inclusive */
        TSU32 to = real_radics[i + 1];
        /* TODO: Quicksort this bucket in O(m log m), where usually m << n */
        (void)from; (void)to; (void)lt; /* keep -Wunused quiet until then */
    }
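    /*
     * A minimal stand-in for the TODO in the loop above until the
     * write-back quicksort exists - a sketch of a comparator-driven
     * insertion sort of arr2[from..to) straight back into arr
     * (reent_data passed as 0 since nothing forwards it yet):
     *
     *   for(TSU32 j = from; j < to; ++j) {
     *       struct tselem e = arr2[j];
     *       TSU32 k = j;
     *       while(k > from && lt(e.key, arr[k - 1].key, e.i, arr[k - 1].i, 0)) {
     *           arr[k] = arr[k - 1];
     *           --k;
     *       }
     *       arr[k] = e;
     *   }
     */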
    /* Cleanup */
    free(arr2);
}
#endif /* THIER_SORT_H */