magyarsort/thiersort.h


#ifndef THIER_SORT_H
#define THIER_SORT_H
/*
* This is a two-step bucket sort algorithm:
* - Step 1 creates top-down pre-sorted buckets with a float radix (allocates)
* - Step 2 runs M quicksorts with source-to-destination write-back.
*
* What is "float radix" sort?
* - We convert each integer key to a 32 bit float value
* - We take that and create an 8 bit ((*): or 16 bit) approximator
* - This is what we radix sort on, as if it were an 8 bit char (**)
* - In both cases, those are just right-shifted floats (LSB bits lost)
*
* For calculating occurrence counts, we need a pass over the whole data
* set. Then we need another pass - into a new array - to do the radix
* step; the 16 bit float variant needs two such steps (new + old array).
*
* For random data, fewer passes likely win, so the 8 bit version is the
* first implementation. This leaves more data in the buckets for the
* "Step 2" quicksort part - which is not std::sort or the like, but a
* variant we write that puts the data back into the original array, so
* no back-copy is necessary. The (*) variant, with 16 bit floats, also
* needs no back-copy and can quicksort in place!
*
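* A minimal scalar sketch of the 8 bit float radix key (illustrative;
* the float_radix8 name is hypothetical - the real per-type handling
* lives in ts_radixi() below):
*
*   static inline uint8_t float_radix8(int32_t x) {
*       union { float f; uint32_t u; } k;
*       k.f = (float)x;              // what cvtdq2ps does 4-wide
*       return (uint8_t)(k.u >> 24); // sign bit + 7 exponent MSBs
*   }
*
* Negative keys additionally need the sign fixup of ts_floatsigntrick()
* below to keep the bucket order monotone.
*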
* To keep the costly float conversions cheap, we can use SSE2 here (or NEON)!
*
* These are useful for us:
*
* __m128i _mm_load_si128 (__m128i const* mem_addr)
* #include <emmintrin.h>
* Instruction: movdqa xmm, m128 [Latency: 6, Throughput: 0.33 - 0.5]
* CPUID Flags: SSE2
* Description
* Load 128-bits of integer data from memory into dst. mem_addr must be
* aligned on a 16-byte boundary or a general-protection exception may be
* generated.
*
* __m128i _mm_add_epi8 (__m128i a, __m128i b)
* #include <emmintrin.h>
* Instruction: paddb xmm, xmm [Latency: 1, Throughput: 0.33]
* CPUID Flags: SSE2
* Description
* Add packed 8-bit integers in a and b, and store the results in dst.
*
* __m128i _mm_and_si128 (__m128i a, __m128i b)
* #include <emmintrin.h>
* Instruction: pand xmm, xmm [Latency: 1, Throughput: 0.33]
* CPUID Flags: SSE2
* Description
* Compute the bitwise AND of 128 bits (representing integer data)
* in a and b, and store the result in dst.
*
* __m128i _mm_set_epi8 (
* char e15, char e14, char e13, char e12,
* char e11, char e10, char e9, char e8,
* char e7, char e6, char e5, char e4,
* char e3, char e2, char e1, char e0)
* #include <emmintrin.h>
* Instruction: Sequence [XXX: slow - best outside loop]
* CPUID Flags: SSE2
* Description
* Set packed 8-bit integers in dst with the supplied values.
*
* __m128 _mm_cvtepi32_ps (__m128i a)
* #include <emmintrin.h>
* Instruction: cvtdq2ps xmm, xmm [Latency: 4, Throughput: 0.5]
* CPUID Flags: SSE2
* Description
* Convert packed signed 32-bit integers in a to packed single-precision
* (32-bit) floating-point elements, and store the results in dst.
*
* __m128i _mm_castps_si128 (__m128 a)
* #include <emmintrin.h>
* CPUID Flags: SSE2
* Description
* Cast vector of type __m128 to type __m128i. This intrinsic is only
* used for compilation and does not generate any instructions, thus it
* has zero latency.
*
* void _mm_store_si128 (__m128i* mem_addr, __m128i a)
* #include <emmintrin.h>
* Instruction: movdqa m128, xmm [Latency: 1 - 5, Throughput: 0.5 - 1]
* CPUID Flags: SSE2
* Description
* Store 128-bits of integer data from a into memory. mem_addr must be
* aligned on a 16-byte boundary or a general-protection exception may
* be generated.
*
* void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_addr)
* #include <emmintrin.h>
* Instruction: maskmovdqu xmm, xmm [Latency: 6, Throughput: 1 - XXX: NT!]
* CPUID Flags: SSE2
* Description
* Conditionally store 8-bit integer elements from a into memory using
* mask (elements are not stored when the highest bit is not set in the
* corresponding element) and a non-temporal memory hint. mem_addr does
* not need to be aligned on any particular boundary.
*
* int _mm_extract_epi16 (__m128i a, int imm8)
* #include <emmintrin.h>
* Instruction: pextrw r32, xmm, imm8 [Latency: 4, Throughput: 1]
* CPUID Flags: SSE2
* Description
* Extract a 16-bit integer from a, selected with imm8,
* and store the result in the lower element of dst.
* Operation
* dst[15:0] := (a[127:0] >> (imm8[2:0] * 16))[15:0]
* dst[31:16] := 0
*
* __m128i _mm_srli_epi32 (__m128i a, int imm8)
* #include <emmintrin.h>
* Instruction: psrld xmm, imm8 [Latency: 1, Throughput: 0.5]
* CPUID Flags: SSE2
* Description
* Shift packed 32-bit integers in a right by imm8 while shifting in
* zeros, and store the results in dst.
*
* See also:
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
* https://www.laruence.com/x86/MASKMOVDQU.html
* https://blogs.fau.de/hager/archives/2103
*
* For ARM / Neon:
* https://github.com/DLTcollab/sse2neon
*
* Data layout:
*
* [int32_t key][int32_t i][int32_t key][int32_t i]...
*
* That is, we extract keys and "pointer"-like offsets into an
* array of the "real" elements. So you sort "short pointers".
*
* We might create a version that just sorts bare integers, but
* that is only good for showing off speed against other algorithms.
*
* Main alg:
* - We first process data up to the 16 byte alignment (maybe 64 byte align?)
* - We process the input at 64 bytes / loop: 3x SSE 128 bit reads + 1x normal
* - We process the remaining part of the array
*
* The middle part should be skipped if fewer than 64 bytes remain this way...
*
* (pre-loop)
* pat1 = set(1,1,1,1,0,0,0,0,1,1,1,1...)
* pat2 = set(0,0,0,0,1,1,1,1,0,0,0,0...)
*
* (loop)
* a = LOAD4int[+0]
* b = LOAD4int[+4]
* c = LOAD4int[+8]
* <+d some int code>
*
* pata1 = a & pat1;
* pata2 = a & pat2;
* patb1 = b & pat1;
* patb2 = b & pat2;
* patc1 = c & pat1;
* patc2 = c & pat2;
* <+d some int code>
*
* rsa = pata1 >> 24;
* rsb = patb1 >> 24;
* rsc = patc1 >> 24;
* <+d some int code>
*
* resa = rsa + pata2;
* resb = rsb + patb2;
* resc = rsc + patc2;
* <+d some int code>
*
* store(resa, resb, resc) -> tmp
*
* <integer code to process(tmp)>
*
* The tmp buffer must be 16 byte (not bit!) aligned; being on-stack,
* an explicit _Alignas(16) on it guarantees this!
*
* The integer code that processes tmp depends on which phase we are in:
* in the occurrence counting phase we do not need the pat[x]2 parts!
* The reorganize phase passes, however, need them to do the copying around!
*
* This is how we do 64 bytes / iteration, and it should pipeline well!
*
* However, the "integer" code that processes tmp is the bottleneck: when
* occurrence counting it has to process 16 counts, likely taking (at
* least) 4 steps on a 4-wide pipeline, and more in the other phase
* where we are not counting but moving elements to where they belong.
* XXX: Now that I think more... the pata2-likes are not needed and only
* cause issues? I mean we could (should) copy key/index directly from
* the source!
*
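* One possible intrinsics rendering of the convert+shift pipeline above
* (counting phase, one 4-key vector, ignoring the interleaved {key, i}
* layout and the pat masks for clarity - a sketch, not the tuned loop):
*
*   __m128i a  = _mm_load_si128((const __m128i*)src);       // 4 int32 keys
*   __m128  fa = _mm_cvtepi32_ps(a);                        // int32 -> float32
*   __m128i ra = _mm_srli_epi32(_mm_castps_si128(fa), 24);  // 8 bit approx.
*   _mm_store_si128((__m128i*)tmp, ra);                     // hand to int code
*   // integer code: ++counts[tmp[0]]; ... ++counts[tmp[3]];
*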
* In the 8 bit float version, I guess we only have bits from the sign
* and exponent.. In the 16 bit version, with an extra pass, we get a
* few bits from the mantissa too..
*
* I still think that in the random case the fewer passes of the 8 bit
* float version are worth it!
*
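* Worked example of the two approximator widths: the key 1000 converts
* to 1000.0f whose bits are 0x447A0000. The 8 bit approximator (>>24)
* is 0x44 - the sign bit plus the 7 exponent MSBs; a 16 bit one (>>16)
* would be 0x447A, adding the exponent LSB and 7 mantissa bits.
*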
* String sorting:
* - First create the [key, index] 64 bit data for the strings to sort.
* - The key should be the first 4 characters (zero padded if shorter)
* - Do the sorting, with a comparator using both the first 4 chars AND strcmp
*
* ^^I actually think this gives a pretty fast string comparison, because
* even short strings yield well-spread integer key values (8 bit works)!
*
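* A sketch of building the {key, index} elements for strings (the strs
* and elems names are hypothetical; big-endian packing makes unsigned
* integer order of the key match the lexicographic byte order):
*
*   for(uint32_t i = 0; i < n; ++i) {
*       uint32_t k = 0;
*       for(int c = 0; c < 4 && strs[i][c]; ++c)
*           k |= (uint32_t)(unsigned char)strs[i][c] << (24 - 8 * c);
*       elems[i].key.u = k; // zero padded when shorter than 4 chars
*       elems[i].i = i;     // "short pointer" back to the real string
*   }
*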
* XXX: We can reach approximately 2x (1.5x?) the speed for an integer-only sort!
*
* Unsigned/signed:
* - Custom trickery is needed when processing tmp (both the occurrence
*   counting and the move)
* - It should assign a different bucket based on the topmost bit in the
*   signed modes!
* - Modes: msb_early, msb_late
* - These two handle: integer, unsigned int, float32
*
* XXX: So basically a good sorting algo for float-keyed sorting!
*
* Number of passes (8 bit): 3.5
* - 1x occurrence counting pass
* - 1x 8 bit float radix pass
* - 1.5x quicksort pass [more elements per bucket]
*
* Number of passes (16 bit): 4.2
* - 1x occurrence counting pass
* - 2x 8 bit float radix pass
* - 1.2x quicksort pass [fewer elements per bucket]
*
* Rem.: Basically these are regular, pipelined radix passes, with only
* the float conversions AND shifts vectorized with SSE somewhat...
*/
#include <stdint.h>
#include <stddef.h> /* size_t for the allocator callbacks */
#ifdef TS_SSE2
#include <emmintrin.h>
#endif /* TS_SSE2 */
#ifdef TS_SSE2_NEON
#include "sse2neon.h"
#define TS_SSE2
#endif /* TS_SSE2_NEON */
#ifndef TS_CUSTOM_TYPES
#define TSBOOL uint8_t
#define TSTRUE 1
#define TSFALSE 0
#define TSU8 uint8_t
#define TSU32 uint32_t
#define TSI32 int32_t
#endif /* TS_CUSTOM_TYPES */
/* Nearly the same alg works for all these key types */
union tskey {
    TSI32 i;
    TSU32 u;
    float f;
};
struct tselem {
    union tskey key;
    TSU32 i;
};
/* Forward decl of internal code */
static inline void thiersort8_internal(
    struct tselem *arr,
    uint32_t length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr),
    TSBOOL (*lt)(
        const union tskey ak,
        const union tskey bk,
        const TSU32 ai,
        const TSU32 bi,
        void *reent_data),
    const TSBOOL isint,
    const TSBOOL isunsigned,
    const TSBOOL isfloat);
/** The '<' operation used for int32 */
static inline TSBOOL ts_lt_int(
    const union tskey ak,
    const union tskey bk,
    const TSU32 ai,
    const TSU32 bi,
    void *reent_data) { return (ak.i < bk.i); }
/** The '<' operation used for unsigned int32 */
static inline TSBOOL ts_lt_uint(
    const union tskey ak,
    const union tskey bk,
    const TSU32 ai,
    const TSU32 bi,
    void *reent_data) { return (ak.u < bk.u); }
/** The '<' operation used for float */
static inline TSBOOL ts_lt_float(
    const union tskey ak,
    const union tskey bk,
    const TSU32 ai,
    const TSU32 bi,
    void *reent_data) { return (ak.f < bk.f); }
/**
 * Sort {key, index32} elements where the key is a signed 32 bit integer.
 */
static inline void thiersort8_intkey(
    struct tselem *arr,
    uint32_t length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr)) {
    thiersort8_internal(
        arr,
        length,
        malloc,
        free,
        ts_lt_int,
        TSTRUE,
        TSFALSE,
        TSFALSE);
}
/**
 * Sort {key, index32} elements where the key is an unsigned 32 bit integer.
 */
static inline void thiersort8_uintkey(
    struct tselem *arr,
    uint32_t length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr)) {
    thiersort8_internal(
        arr,
        length,
        malloc,
        free,
        ts_lt_uint,
        TSFALSE,
        TSTRUE,
        TSFALSE);
}
/**
 * Sort {key, index32} elements where the key is a 32 bit float.
 */
static inline void thiersort8_floatkey(
    struct tselem *arr,
    uint32_t length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr)) {
    thiersort8_internal(
        arr,
        length,
        malloc,
        free,
        ts_lt_float,
        TSFALSE,
        TSFALSE,
        TSTRUE);
}
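/*
 * Example use - a sketch assuming <stdlib.h> malloc/free and values in
 * a separate float array (vals and n are hypothetical names):
 *
 *   struct tselem *elems = malloc(sizeof(struct tselem) * n);
 *   for(uint32_t i = 0; i < n; ++i) {
 *       elems[i].key.f = vals[i];
 *       elems[i].i = i;
 *   }
 *   thiersort8_floatkey(elems, n, malloc, free);
 *   // elems[0].i now indexes the smallest value in vals
 */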
/**
 * Do the sign-trick based radix transform on the non-2s-complement
 * (sign-magnitude) 8 bit float top byte!
 *
 * Reading the input byte as 2s complement vs. what it actually means:
 *
 * -1 means -127
 * -2 means -126
 * ...
 * -127 means -1
 * -128 means -0
 * 0 means +0
 * 1 means 1
 * ...
 * 127 means 127
 *
 * We convert the above (in this order) into [0..255]!
 */
static inline TSU8 ts_floatsigntrick(TSU8 floatsigned) {
    if((floatsigned & 0x80) == 0) {
        /* Positive: map [+0..127] onto [128..255] */
        return (TSU8)(128 + floatsigned);
    } else {
        /* Negative: reverse the magnitude to map [-127..-0] onto [0..127] */
        return (TSU8)(127 - (floatsigned & 0x7f));
    }
}
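/*
 * Spot-checking the mapping above: 0x7F (+127) -> 255, 0x00 (+0) -> 128,
 * 0x80 (-0) -> 127 and 0xFF (-127, the most negative) -> 0, so the
 * result is monotone over the sign-magnitude input as required.
 */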
/** Gets which bucket (index) the given key should go into */
static inline TSU8 ts_radixi(
    union tskey key,
    const TSBOOL isint,
    const TSBOOL isunsigned,
    const TSBOOL isfloat) {
    /* The compiler should optimize out the branches here! */
    union tskey k;
    if(isint) {
        /* Sign bit can be 1! */
        k.f = (float)(key.i);
        /* 8 bit float: keep the top byte only */
        k.u >>= 24;
        /* Need the float sign trickery */
        return ts_floatsigntrick((TSU8)k.u);
    }
    if(isunsigned) {
        /* Sign bit CANNOT be 1! */
        k.f = (float)(key.u);
        /* 8 bit float: keep the top byte only */
        k.u >>= 24;
        /* We are fine as-is! */
        return (TSU8)k.u;
    }
    if(isfloat) {
        /* Sign bit can be 1! */
        k.f = key.f;
        /* 8 bit float: keep the top byte only */
        k.u >>= 24;
        /* Need the float sign trickery */
        return ts_floatsigntrick((TSU8)k.u);
    }
    /* Unreachable when exactly one type flag is set */
    return 0;
}
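/*
 * Worked example for the signed int path: the key 1000 becomes 1000.0f
 * with bits 0x447A0000, >>24 gives 0x44 (68), and the positive branch
 * of ts_floatsigntrick() maps it to 128 + 68 = 196, its bucket index.
 */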
/** Internal code reuse (type-branches should be optimized out) */
static inline void thiersort8_internal(
    struct tselem *arr,
    const uint32_t length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr),
    TSBOOL (*lt)(
        const union tskey ak,
        const union tskey bk,
        const TSU32 ai,
        const TSU32 bi,
        void *reent_data),
    const TSBOOL isint,
    const TSBOOL isunsigned,
    const TSBOOL isfloat) {
    /* Preparation */
    uint32_t radics[256] = {0};
    /* [from, to) index pairs: only where prefix sums change - usually not full */
    uint32_t real_radics[256 * 2] = {0};
    /* Number of used entries in real_radics (needed by Step 2 below) */
    TSU32 reali = 0;
    struct tselem *arr2 = malloc(sizeof(struct tselem) * length);
    if(arr2 == 0) {
        return; /* Allocation failed - leave the input untouched */
    }
    /* Step 1) Bucketing */
#ifdef TS_SSE2
    /* TODO: implement with SSE2 (see the intrinsics listed above) */
#else
    /* Occurrence counting O(n) */
    for(TSU32 i = 0; i < length; ++i) {
        ++radics[ts_radixi(arr[i].key, isint, isunsigned, isfloat)];
    }
    /* Prefix sum + real radics calc O(256) */
    /* Radics: */
    /* fr: {10, 20, 10, 0, 5, 15,...} */
    /* to: {10, 30, 40, 40, 45, 60,..} */
    /* Real radics: */
    /* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */
    /*       0.       1.        2.        4.        5.      */
    /* (because radix value 3 is not found in the input) */
    TSU32 prev = 0;
    for(int i = 0; i < 256; ++i) {
        if(radics[i] != 0) {
            radics[i] += prev;
            real_radics[reali] = prev;
            real_radics[reali + 1] = radics[i];
            prev = radics[i];
            reali += 2;
        } else {
            radics[i] += prev;
            prev = radics[i];
        }
    }
    /* Move elements according to their buckets O(n) */
    /* Going right-to-left keeps the original internal order (stable) */
    for(TSU32 i = length; i > 0; --i) {
        TSU8 radi = ts_radixi(arr[i - 1].key, isint, isunsigned, isfloat);
        TSU32 offset = --radics[radi];
        arr2[offset] = arr[i - 1];
    }
#endif /* TS_SSE2 */
    /* Step 2) Sorting bucket contents - also moving back to the original array */
    for(TSU32 i = 0; i < reali; i += 2) {
        /* inclusive */
        TSU32 from = real_radics[i];
        /* non-inclusive */
        TSU32 to = real_radics[i + 1];
        /* TODO: Quicksort this bucket in O(m log m), where usually m << n */
        (void)from; (void)to; (void)lt; /* keep -Wunused quiet until then */
    }
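    /*
     * A minimal stand-in for the TODO in the loop above until the
     * write-back quicksort exists - a sketch of a comparator-driven
     * insertion sort of arr2[from..to) straight back into arr
     * (reent_data passed as 0 since nothing forwards it yet):
     *
     *   for(TSU32 j = from; j < to; ++j) {
     *       struct tselem e = arr2[j];
     *       TSU32 k = j;
     *       while(k > from && lt(e.key, arr[k - 1].key, e.i, arr[k - 1].i, 0)) {
     *           arr[k] = arr[k - 1];
     *           --k;
     *       }
     *       arr[k] = e;
     *   }
     */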
    /* Cleanup */
    free(arr2);
}
#endif /* THIER_SORT_H */