#ifndef THIER_SORT_H
#define THIER_SORT_H

/*
 * This is a two-step bucket sort algorithm:
 * - Step 1 creates top-down pre-sorted buckets with a float radix (allocates)
 * - Step 2 runs M quicksort passes with source-to-destination back-write.
 *
 * What is "float radix" sort?
 * - We convert each integer key to a 32 bit float value
 * - From that we create an 8 bit ((*): or 16 bit) approximator
 * - This is what we radix sort on, as if it were an 8 bit char (**)
 * - In both cases these are just right-shifted floats (LSB bits lost)
 *
 * For calculating occurrence counts, we need a pass over the whole data set.
 * Then we need another pass - into a new array - to do the radix step, or
 * in the 16 bit float variant we need two such steps (new + old array).
 *
 * For random data it is likely best to do fewer passes, so we implement
 * the 8 bit version first. This leaves more data in the buckets for the
 * "Step 2" quicksort part - which is not std::sort or similar, but a
 * variant we write that puts the data back into the original array, so
 * no back-copy is necessary. For the (*) variant, with 16 bit floats,
 * we do not need the back-copy and can quicksort in place!
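 *
 * An illustrative scalar sketch of the 8 bit approximator (this mirrors
 * what ts_radixi below does for signed integer keys; names here are for
 * exposition only):
 *
 *   union { float f; uint32_t u; } k;
 *   k.f = (float)key;            // 32 bit int key -> 32 bit float
 *   uint8_t approx = k.u >> 24;  // keep sign bit + top 7 exponent bits
 *   // approx is the "8 bit float" we radix on (after a sign fix-up)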
 *
 * To avoid costly float conversions, we can use SSE2 here (or NEON)!
 *
 * These intrinsics are useful for us:
 *
 * __m128i _mm_load_si128 (__m128i const* mem_addr)
 * #include <emmintrin.h>
 * Instruction: movdqa xmm, m128 [Latency: 6, Throughput: 0.33 - 0.5]
 * CPUID Flags: SSE2
 * Description
 * Load 128-bits of integer data from memory into dst. mem_addr must be
 * aligned on a 16-byte boundary or a general-protection exception may be
 * generated.
 *
 * __m128i _mm_add_epi8 (__m128i a, __m128i b)
 * #include <emmintrin.h>
 * Instruction: paddb xmm, xmm [Latency: 1, Throughput: 0.33]
 * CPUID Flags: SSE2
 * Description
 * Add packed 8-bit integers in a and b, and store the results in dst.
 *
 * __m128i _mm_and_si128 (__m128i a, __m128i b)
 * #include <emmintrin.h>
 * Instruction: pand xmm, xmm [Latency: 1, Throughput: 0.33]
 * CPUID Flags: SSE2
 * Description
 * Compute the bitwise AND of 128 bits (representing integer data)
 * in a and b, and store the result in dst.
 *
 * __m128i _mm_set_epi8 (
 *     char e15, char e14, char e13, char e12,
 *     char e11, char e10, char e9, char e8,
 *     char e7, char e6, char e5, char e4,
 *     char e3, char e2, char e1, char e0)
 * #include <emmintrin.h>
 * Instruction: Sequence [XXX: slow - best outside loop]
 * CPUID Flags: SSE2
 * Description
 * Set packed 8-bit integers in dst with the supplied values.
 *
 * __m128 _mm_cvtepi32_ps (__m128i a)
 * #include <emmintrin.h>
 * Instruction: cvtdq2ps xmm, xmm [Latency: 4, Throughput: 0.5]
 * CPUID Flags: SSE2
 * Description
 * Convert packed signed 32-bit integers in a to packed single-precision
 * (32-bit) floating-point elements, and store the results in dst.
 *
 * __m128i _mm_castps_si128 (__m128 a)
 * #include <emmintrin.h>
 * CPUID Flags: SSE2
 * Description
 * Cast vector of type __m128 to type __m128i. This intrinsic is only
 * used for compilation and does not generate any instructions, thus it
 * has zero latency.
 *
 * void _mm_store_si128 (__m128i* mem_addr, __m128i a)
 * #include <emmintrin.h>
 * Instruction: movdqa m128, xmm [Latency: 1 - 5, Throughput: 0.5 - 1]
 * CPUID Flags: SSE2
 * Description
 * Store 128-bits of integer data from a into memory. mem_addr must be
 * aligned on a 16-byte boundary or a general-protection exception may
 * be generated.
 *
 * void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_addr)
 * #include <emmintrin.h>
 * Instruction: maskmovdqu xmm, xmm [Latency: 6, Throughput: 1 - XXX: NT!]
 * CPUID Flags: SSE2
 * Description
 * Conditionally store 8-bit integer elements from a into memory using
 * mask (elements are not stored when the highest bit is not set in the
 * corresponding element) and a non-temporal memory hint. mem_addr does
 * not need to be aligned on any particular boundary.
 *
 * int _mm_extract_epi16 (__m128i a, int imm8)
 * #include <emmintrin.h>
 * Instruction: pextrw r32, xmm, imm8 [Latency: 4, Throughput: 1]
 * CPUID Flags: SSE2
 * Description
 * Extract a 16-bit integer from a, selected with imm8,
 * and store the result in the lower element of dst.
 * Operation
 * dst[15:0] := (a[127:0] >> (imm8[2:0] * 16))[15:0]
 * dst[31:16] := 0
 *
 * __m128i _mm_srli_epi32 (__m128i a, int imm8)
 * #include <emmintrin.h>
 * Instruction: psrld xmm, imm8 [Latency: 1, Throughput: 0.5]
 * CPUID Flags: SSE2
 * Description
 * Shift packed 32-bit integers in a right by imm8 while shifting in
 * zeros, and store the results in dst.
 *
 * See also:
 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
 * https://www.laruence.com/x86/MASKMOVDQU.html
 * https://blogs.fau.de/hager/archives/2103
 *
 * For ARM / Neon:
 * https://github.com/DLTcollab/sse2neon
 *
 * Data layout:
 *
 * [int32_t key][int32_t i][int32_t key][int32_t i]...
 *
 * That is, we extract keys and "pointer"-like offsets into an
 * array of the "real" elements. So you sort "short pointers".
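 *
 * For example, sorting elements with the keys {42, 7} lays out as:
 *
 *   [42][0][7][1]  ->  after sorting:  [7][1][42][0]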
 *
 * We might create a version that just sorts bare integers,
 * but that is only good for showing off speed against other algorithms.
 *
 * Main alg:
 * - We first process data to reach 16 byte alignment (maybe 64 byte align?)
 * - We process the input at 64 bytes / loop with 3x SSE 128 bit reads + 1x normal
 * - We process the remaining part of the array
 *
 * The middle part should be skipped if there are fewer than 64 bytes this way...
 *
 * (pre-loop)
 * pat1 = set(1,1,1,1,0,0,0,0,1,1,1,1...)
 * pat2 = set(0,0,0,0,1,1,1,1,0,0,0,0...)
 *
 * (loop)
 * a = LOAD4int[+0]
 * b = LOAD4int[+4]
 * c = LOAD4int[+8]
 * <+d some int code>
 *
 * pata1 = a & pat1;
 * pata2 = a & pat2;
 * patb1 = b & pat1;
 * patb2 = b & pat2;
 * patc1 = c & pat1;
 * patc2 = c & pat2;
 * <+d some int code>
 *
 * rsa = pata1 >> 24;
 * rsb = patb1 >> 24;
 * rsc = patc1 >> 24;
 * <+d some int code>
 *
 * resa = rsa + pata2;
 * resb = rsb + patb2;
 * resc = rsc + patc2;
 * <+d some int code>
 *
 * store(resa, resb, resc) -> tmp
 *
 * <integer code to process(tmp)>
 * (a concrete sketch using the intrinsics listed above follows below)
 *
 * The tmp buffer is automatically 16 byte aligned when declared on the
 * stack as an __m128i array!
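 *
 * A rough counting-pass sketch with the intrinsics above (illustrative
 * only: the pattern constant, pointer names and the scalar tail are
 * assumptions, not the final implementation):
 *
 *   // pat1 masks the key lanes of [key][i][key][i] (pre-loop constant)
 *   __m128i pat1 = _mm_set_epi8(0,0,0,0, -1,-1,-1,-1, 0,0,0,0, -1,-1,-1,-1);
 *   // per iteration (p walks the tselem array as 32 bit ints):
 *   __m128i a   = _mm_load_si128((const __m128i *)(p + 0));
 *   __m128  fa  = _mm_cvtepi32_ps(_mm_and_si128(a, pat1));
 *   __m128i rsa = _mm_srli_epi32(_mm_castps_si128(fa), 24);
 *   _mm_store_si128((__m128i *)tmp, rsa);
 *   // ...same for b and c, then scalar code bumps radics[] from tmp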
 *
 * The integer code that processes tmp depends on what phase we are in; in
 * the phase where we count occurrences, we do not need the pat[x]2 parts!
 * However the reorganize-phase passes need them to do the copying around!
 *
 * This is how we do 64 bytes / iteration, and it should pipeline well!
 *
 * However the "integer" code that processes tmp is the bottleneck, because
 * that part has to process 16 counts when occurrence counting, likely
 * with a 4-wide pipeline in (at least) 4 steps - and more in the other
 * phase where we are not counting but moving elements where they belong.
 * XXX: Now that I think about it more... the pata2-like values may not be
 * needed and only cause issues? I mean we could (should) copy the
 * key/index directly from the source!
 *
 * In the 8 bit float version, I guess we only have bits from the exponent..
 * In the 16 bit version, with an extra pass, we get a few mantissa bits too..
 *
 * I still think that in the random case the fewer passes of the 8 bit
 * float variant are worth it!
 *
 * String sorting:
 * - First create the [key,index] 64 bit data for the strings to sort.
 * - The key should be the first 4 characters (zero-padded if the string is
 *   shorter - a packing sketch is shown below)
 * - Do the sorting, with a comparator using both the first 4 chars AND strcmp
 *
 * ^^I actually think this gives a pretty fast string comparison because it
 * already yields a good spread of high integer values even for short
 * strings (the 8 bit variant works)!
 *
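 * A minimal packing sketch for the 4-character key (illustrative; the
 * helper name is an assumption - note the big-endian packing so that
 * unsigned integer comparison matches lexicographic order):
 *
 *   uint32_t ts_strkey(const char *s) {
 *       uint32_t k = 0;
 *       for(int i = 0; i < 4 && s[i]; ++i)
 *           k |= ((uint32_t)(unsigned char)s[i]) << (24 - 8 * i);
 *       return k; // shorter strings stay zero-padded
 *   }
 *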
 * XXX: We can reach approximately 2x (1.5x?) the speed for an integer-only sort!
 *
 * Unsigned/signed:
 * - Custom trickery is needed when processing tmp (both occurrence and move)
 * - It should assign a different bucket based on the topmost bit for signed modes!
 * - Modes: msb_early, msb_late
 * - These two handle: integer, unsigned int, float32
 *
 * XXX: So basically a good sorting algorithm for float-key sorting too!
 *
 * Number of passes (8 bit): 3.5
 * - 1x occurrence counting pass
 * - 1x 8 bit float radix pass
 * - 1.5x quicksort passes [more elements per bucket]
 *
 * Number of passes (16 bit): 4.2
 * - 1x occurrence counting pass
 * - 2x 8 bit float radix passes
 * - 1.2x quicksort passes [fewer elements per bucket]
 *
 * Rem.: Basically these are regular, pipelined radix passes, with only
 * the float conversions AND shifts somewhat vectorized with SSE...
 */

#include <stddef.h> /* size_t, NULL */
#include <stdint.h>
#include <string.h> /* memcpy */
#include <assert.h> /* assert */

#ifdef TS_SSE2
#include <emmintrin.h>
#endif /* TS_SSE2 */
#ifdef TS_SSE2_NEON
#include "sse2neon.h"
#define TS_SSE2
#endif /* TS_SSE2_NEON */

#ifndef TS_CUSTOM_TYPES
#define TSBOOL uint8_t
#define TSTRUE 1
#define TSFALSE 0
#define TSU8 uint8_t
#define TSU32 uint32_t
#define TSI32 int32_t
#endif /* TS_CUSTOM_TYPES */

/* Nearly the same algorithm for all these key types */
union tskey {
    TSI32 i;
    TSU32 u;
    float f;
};

struct tselem {
    union tskey key;
    TSU32 i;
};

/* Forward declaration of the internal code */
static inline void thiersort8_internal(
    struct tselem *arr,
    TSU32 length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr),
    const TSBOOL (*lt)(
        const union tskey ak,
        const union tskey bk,
        const TSU32 ai,
        const TSU32 bi,
        void *reent_data),
    const TSBOOL isint,
    const TSBOOL isunsigned,
    const TSBOOL isfloat,
    void *reent_data);

/** The '<' operation used for int32 keys */
static inline const TSBOOL ts_lt_int(
    const union tskey ak,
    const union tskey bk,
    const TSU32 ai,
    const TSU32 bi,
    void *reent_data) { return (ak.i < bk.i); }

/** The '<' operation used for unsigned int32 keys */
static inline const TSBOOL ts_lt_uint(
    const union tskey ak,
    const union tskey bk,
    const TSU32 ai,
    const TSU32 bi,
    void *reent_data) { return (ak.u < bk.u); }

/** The '<' operation used for float keys */
static inline const TSBOOL ts_lt_float(
    const union tskey ak,
    const union tskey bk,
    const TSU32 ai,
    const TSU32 bi,
    void *reent_data) { return (ak.f < bk.f); }

/**
 * Sort {key, index32} elements where the key is a signed 32 bit integer.
 */
static inline void thiersort8_intkey(
    struct tselem *arr,
    TSU32 length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr)) {
    thiersort8_internal(
        arr,
        length,
        malloc,
        free,
        ts_lt_int,
        TSTRUE,
        TSFALSE,
        TSFALSE,
        NULL);
}

/**
 * Sort {key, index32} elements where the key is an unsigned 32 bit integer.
 */
static inline void thiersort8_uintkey(
    struct tselem *arr,
    TSU32 length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr)) {
    thiersort8_internal(
        arr,
        length,
        malloc,
        free,
        ts_lt_uint,
        TSFALSE,
        TSTRUE,
        TSFALSE,
        NULL);
}

/**
 * Sort {key, index32} elements where the key is a 32 bit float.
 */
static inline void thiersort8_floatkey(
    struct tselem *arr,
    TSU32 length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr)) {
    thiersort8_internal(
        arr,
        length,
        malloc,
        free,
        ts_lt_float,
        TSFALSE,
        TSFALSE,
        TSTRUE,
        NULL);
}
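
/*
 * Usage sketch (illustrative; assumes the C stdlib malloc/free):
 *
 *   struct tselem e[3];
 *   e[0].key.f = 3.5f;  e[0].i = 0;
 *   e[1].key.f = -1.0f; e[1].i = 1;
 *   e[2].key.f = 2.0f;  e[2].i = 2;
 *   thiersort8_floatkey(e, 3, malloc, free);
 *   // now e[0].i == 1, e[1].i == 2, e[2].i == 0
 */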

/**
 * Create [key, index] pairs from your raw input data array. O(n) runtime.
 *
 * Useful if you have an array of elements to sort and want them prepared to be
 * properly thiersorted as key, index pairs. You can then apply that reordering
 * later using thiersort_apply(..).
 *
 * BEWARE: The returned array must be freed by you - with a corresponding free.
 * An alternative is to do a thiersort_apply(..) call (for the array case).
 *
 * @param arr The input array.
 * @param askey The key extractor. Gives the ith element's key.
 * @param elemsize The size of a single element.
 * @param length The length of the input sequence. Number of elements.
 * @param malloc The function to use for allocating the key-index array.
 * @returns The created array of keys+indices to sort.
 */
static inline struct tselem *thiersort_prepare_array(
    void *arr,
    union tskey (*askey)(void *elem),
    TSU32 elemsize,
    TSU32 length,
    void* (*malloc)(size_t size)) {
    /* Allocate */
    struct tselem *out = (struct tselem *)malloc((size_t)length * sizeof(struct tselem));

    /* Fill */
    TSU32 j = 0;
    for(size_t i = 0; i < ((size_t)length * elemsize); i += elemsize, ++j) {
        out[j].key = askey((void *)((TSU8 *)arr + i));
        out[j].i = j;
    }

    /* Return */
    return out;
}

/**
 * Apply the given [key,index] sort result back to an array.
 *
 * Rem.: Also frees the sort result array!
 *
 * @param sortres A thiersort result on the previously prepared arr.
 * @param arr The original array which was prepared for sort and was sorted.
 * @param length The number of elements (both in sortres and arr).
 * @param elemsize The size of a single element.
 * @param tmp An array of elemsize bytes used as swap space!
 * @param free The operation used for freeing the sort result array.
 */
static inline void thiersort_apply(
    struct tselem *sortres,
    void *arr,
    TSU32 length,
    TSU32 elemsize,
    void *tmp,
    void (*free)(void *ptr)) {
    /* Replace all positions */
    for(TSU32 i = 0; i < length; ++i) {
        /* Follow the whole cycle starting at i (j is the chain index) */
        /* This works because we can easily check what doesn't need moves anymore. */
        TSU32 j = i;
        TSU32 ni = sortres[j].i; /* (*) */

        /* Chain until an element would move to its own position */
        /* This also handles "already at the right place" originals. */
        while(ni != j) {
            /* xchg j and ni - but only if not already swapped! See: (*) */
            if(sortres[ni].i != ni) {
                memcpy(tmp, &((TSU8*)arr)[(size_t)j * elemsize], elemsize);
                memcpy(
                    &((TSU8*)arr)[(size_t)j * elemsize],
                    &((TSU8*)arr)[(size_t)ni * elemsize],
                    elemsize);
                memcpy(&((TSU8*)arr)[(size_t)ni * elemsize], tmp, elemsize);
            }

            /* Mark index j as done in sortres for the outer loop. */
            /* This is necessary for the inner loop to stop early */
            /* and for the outer loop to skip already processed locations. */
            sortres[j].i = j; /* (*) */

            /* Update j and ni */
            /* Now we must find what should be at the new location j */
            /* instead of what we swapped in there from the old j location. */
            j = ni;
            ni = sortres[j].i;
        }
    }

    /* Release memory - better that we do it, because we changed it behind the scenes! */
    free(sortres);
}
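
/*
 * End-to-end usage sketch (illustrative; myrec/mykey and the use of the
 * C stdlib malloc/free are assumptions made for the example):
 *
 *   struct myrec { float score; char name[8]; };
 *
 *   union tskey mykey(void *e) {
 *       union tskey k;
 *       k.f = ((struct myrec *)e)->score;
 *       return k;
 *   }
 *
 *   struct tselem *sr = thiersort_prepare_array(
 *       recs, mykey, sizeof(struct myrec), n, malloc);
 *   thiersort8_floatkey(sr, n, malloc, free);
 *   struct myrec tmp;
 *   thiersort_apply(sr, recs, n, sizeof(struct myrec), &tmp, free);
 *   // recs is now sorted by score (sr was freed by thiersort_apply)
 */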

/**
 * Do the sign-trick based radix transform on the non-two's-complement
 * 8 bit float (sign bit + 7 exponent bits)!
 *
 * -1 means -127
 * -2 means -126
 * ...
 * -127 means -0
 * 0 means +0
 * 1 means 1
 * ...
 * 127 means 127
 *
 * We convert the above (in this order) into [0..255]!
 */
static inline TSU8 ts_floatsigntrick(TSU8 floatsigned) {
    if((floatsigned & 0x80) == 0) {
        /* Positive: map [0..127] to [128..255] */
        return (TSU8)(128 + floatsigned);
    } else {
        /* Negative: reverse the magnitude so more negative sorts lower, */
        /* mapping [0x80..0xFF] to [127..0]. */
        return (TSU8)(127 - (floatsigned & 0x7F));
    }
}

/** Gets which bucket (index) the given key should go to */
static inline TSU8 ts_radixi(
    union tskey key,
    const TSBOOL isint,
    const TSBOOL isunsigned,
    const TSBOOL isfloat) {
    /* The compiler should optimize out the branching here! */
    union tskey k;
    if(isint) {
        /* The sign bit can be 1! */
        k.f = (float)(key.i);

        /* 8 bit float: sign bit + top 7 exponent bits */
        k.u = k.u >> 24;

        /* Needs the float sign trickery */
        return ts_floatsigntrick((TSU8)k.u);
    }
    if(isunsigned) {
        /* The sign bit CANNOT be 1! */
        k.f = (float)(key.u);

        /* 8 bit float */
        /* The top bit is always zero, so ignore it and use 8 useful bits */
        /* - this is why we shift by 23 and not 24 here */
        k.u = k.u >> 23;

        /* We are fine as-is! */
        return (TSU8)k.u;
    }
    if(isfloat) {
        /* The sign bit can be 1! */
        k.f = key.f;

        /* 8 bit float */
        k.u = k.u >> 24;

        /* Needs the float sign trickery */
        return ts_floatsigntrick((TSU8)k.u);
    }

    /* Should never happen */
    assert(0);
    return 0;
}
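
/*
 * Worked example for ts_radixi (signed int path):
 * key.i = 1000 -> (float)1000.0f has bit pattern 0x447A0000 -> top byte
 * 0x44 -> ts_floatsigntrick(0x44) = 128 + 0x44 = 196, so bucket 196.
 */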

/* Forward declaration */
static inline void ts_quicksort_inplace_impl(
    struct tselem *arr,
    TSU32 from,
    TSU32 to,
    const TSBOOL (*lt)(
        const union tskey ak,
        const union tskey bk,
        const TSU32 ai,
        const TSU32 bi,
        void *reent_data),
    void *reent_data,
    TSBOOL const_check);

/** Simple in-place quicksort for tselems */
static inline void ts_quicksort_inplace(
    struct tselem *arr,
    TSU32 from,
    TSU32 to,
    const TSBOOL (*lt)(
        const union tskey ak,
        const union tskey bk,
        const TSU32 ai,
        const TSU32 bi,
        void *reent_data),
    void *reent_data) {
    ts_quicksort_inplace_impl(
        arr,
        from,
        to,
        lt,
        reent_data,
        TSFALSE);
}

/** Implementation details for ts_quicksort_inplace(..) - use that instead! */
static inline void ts_quicksort_inplace_impl(
    struct tselem *arr,
    TSU32 from,
    TSU32 to,
    const TSBOOL (*lt)(
        const union tskey ak,
        const union tskey bk,
        const TSU32 ai,
        const TSU32 bi,
        void *reent_data),
    void *reent_data,
    TSBOOL const_check) {
    /* Must do this early exit! */
    // TODO: TS_UNLIKELY
    if(to == 0) return;
    if(from >= to - 1) return;

    /* For detecting constant arrays (all-same values are worst cases) */
    TSU32 pivcnt = 0;

    /* Pivoting */
    TSU32 len = (to - from);
    TSU32 mid = from + len / 2;
    const union tskey pivotkey = arr[mid].key;
    TSU32 pivoti = arr[mid].i;

    /* Main loop */
    TSU32 left = from;
    TSU32 right = to - 1;
    while(left < right) {
        /* Step over correctly positioned elems from the left */
        union tskey leftkey = arr[left].key;
        TSU32 lefti = arr[left].i;
        while((left < right) && lt(leftkey, pivotkey, lefti, pivoti, reent_data)) {
            /* Step */
            ++left;
            leftkey = arr[left].key;
            lefti = arr[left].i;
        }

        /* Step over correctly positioned elems from the right */
        union tskey rightkey = arr[right].key;
        TSU32 righti = arr[right].i;
        while((left < right) && !lt(rightkey, pivotkey, righti, pivoti, reent_data)) {
            /* The compiler should optimize this branch out at compile time! */
            if(const_check) {
                /* (**): Check for stepping over everything because it's all the same.. */
                /* Together with the enclosing !lt this gives two >= tests, i.e. an equality check */
                if(!lt(pivotkey, rightkey, pivoti, righti, reent_data)) {
                    ++pivcnt;
                }
            }

            /* Step */
            --right;
            rightkey = arr[right].key;
            righti = arr[right].i;
        }

        /* Swap wrongly positioned elements */
        if(left < right) {
            const struct tselem tmp = arr[left];
            arr[left] = arr[right];
            arr[right] = tmp;
            ++left;
            --right;
        }
    }

    /* Fix left to be after the point of separation for odd lengths */
    if(left == right) {
        const union tskey leftkey = arr[left].key;
        TSU32 lefti = arr[left].i;
        left += (lt(leftkey, pivotkey, lefti, pivoti, reent_data));
    }

    /* Check the edge case when the left side became empty. */
    /* Rem.: Right moves over the pivot invariably, as left cannot pass over it! */
    /* This means the right subset can never be empty, but the left can! */
    /* It also means that if left is empty, everything was just stepped */
    /* over (from the right) and nothing moved from its original position! */
    // TODO: TS_UNLIKELY
    if(from < left) {
        /* Good case: cutting the array somewhere around the half */
        ts_quicksort_inplace(arr, from, left, lt, reent_data);
        ts_quicksort_inplace(arr, left, to, lt, reent_data);
    } else {
        /* The left sub-array turned out empty - but to make progress with */
        /* the recursion we need to shrink sizes, which is possible by moving */
        /* the pivot to the start of the right side (we know it is a minimal elem!) */
        /* XXX: If we collect up all pivots, we could move all of them here if we want.. */

        /* Swap */
        const struct tselem tmp = arr[left];
        arr[left] = arr[mid];
        arr[mid] = tmp;

        /* XXX: Instead of (**) above, we could do some other sort here as a fallback (like a heapsort or merge sort) */
        /* This skips arrays that consist of a single constant value */
        if(pivcnt < len - 1) {
            /* Bad case: could shrink the array by one element only */
            ts_quicksort_inplace_impl(arr, left + 1, to, lt, reent_data, TSTRUE);
        }
    }
}

/**
 * Simple quicksort implementation that also moves the data.
 *
 * Deliberately not in-place: the partition doubles as the copy from
 * src to dst, so no separate array copy is needed.
 */
static inline void ts_quicksort_fromto(
    struct tselem *src,
    struct tselem *dst,
    TSU32 from,
    TSU32 to,
    const TSBOOL (*lt)(
        const union tskey ak,
        const union tskey bk,
        const TSU32 ai,
        const TSU32 bi,
        void *reent_data),
    void *reent_data) {
    // TODO: Check if the code around here is good - tests look OK, but I thought it was buggy!?
    if(from >= to) return;

    TSU32 len = (to - from);
    TSU32 mid = from + len / 2;
    const union tskey pivotkey = src[mid].key;
    TSU32 pivoti = src[mid].i;

    /* Partition src into dst: smaller-than-pivot from the left, the rest from the right */
    TSU32 li = from;
    TSU32 ri = to - 1;
    for(TSU32 i = from; i < to; ++i) {
        const struct tselem e = src[i];
        if(lt(e.key, pivotkey, e.i, pivoti, reent_data)) {
            dst[li++] = e;
        } else {
            dst[ri--] = e;
        }
    }

    /* Can use in-place steps from now on */
    ts_quicksort_inplace(dst, from, li, lt, reent_data);
    ts_quicksort_inplace(dst, li, to, lt, reent_data);
}

/** Internal code reuse (the type branches should be optimized out) */
static inline void thiersort8_internal(
    struct tselem *arr,
    const TSU32 length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr),
    const TSBOOL (*lt)(
        const union tskey ak,
        const union tskey bk,
        const TSU32 ai,
        const TSU32 bi,
        void *reent_data),
    const TSBOOL isint,
    const TSBOOL isunsigned,
    const TSBOOL isfloat,
    void *reent_data) {

    /* Preparation */
    TSU32 radics[256] = {0};
    /* [from, to) index pairs: only where prefix sums change - usually not full */
    TSU32 real_radics[256 * 2] = {0};
    TSU32 reali = 0; /* number of used entries in real_radics */
    struct tselem *arr2 = (struct tselem *)malloc((sizeof(struct tselem)) * length);

    /* Step 1) Bucketing */
#ifdef TS_SSE2
    /* TODO: implement with SSE2 */
#else
    /* TODO: We can go both down- and upwards here to increase ILP! */
    /* Occurrence counting O(n) */
    for(TSU32 i = 0; i < length; ++i) {
        ++radics[ts_radixi(arr[i].key, isint, isunsigned, isfloat)];
        // FIXME: remove this debug code!
        // arr[i].i = ts_radixi(arr[i].key, isint, isunsigned, isfloat);
    }

    /* Prefix sum + real radics calculation O(256) */
    /* Radics: */
    /* fr: {10, 20, 10, 0, 5, 15,...} */
    /* to: {10, 30, 40, 40, 45, 60,..} */
    /* Real radics: */
    /* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */
    /*       0.       1.        2.        4.        5.       */
    /* (because radix value 3 is not found in the input) */
    TSU32 prev = 0;
    for(int i = 0; i < 256; ++i) {
        if(radics[i] != 0) {
            radics[i] += prev;
            real_radics[reali] = prev;
            real_radics[reali + 1] = radics[i];
            prev = radics[i];
            reali += 2;
        } else {
            radics[i] += prev;
            prev = radics[i];
        }
    }

    /* Move elements according to their buckets O(n) */
    /* Right-to-left ensures keeping the internal order */
    for(TSU32 i = length; i > 0; --i) {
        TSU8 radi = ts_radixi(arr[i - 1].key, isint, isunsigned, isfloat);
        TSU32 offset = --radics[radi];
        arr2[offset] = arr[i - 1];
    }
#endif /* TS_SSE2 */

    /* Step 2) Sorting the buckets' contents - also moving back to the original array */
    for(TSU32 i = 0; i < reali; i += 2) {
        /* inclusive */
        TSU32 from = real_radics[i];
        /* non-inclusive */
        TSU32 to = real_radics[i + 1];

        /* Quicksort this bucket in O(m log m), where usually m << n */
        ts_quicksort_fromto(arr2, arr, from, to, lt, reent_data);
    }

    /* Cleanup */
    free(arr2);
}

#endif /* THIER_SORT_H */