#ifndef THIER_SORT_H
#define THIER_SORT_H

/*
 * This sort is a two-step bucket sort algorithm:
 * - Step 1 creates top-down pre-sorted buckets with a float radix (allocates)
 * - Step 2 runs M quicksorts with source-to-destination backwrite.
 *
 * What is "float radix" sort?
 * - We convert each integer key to a 32 bit float value
 * - We take that and create an 8 bit ((*): or 16 bit) approximator
 * - This is what we radix sort on, as if it were an 8 bit char (**)
 * - In both cases, those are just right-shifted floats (LSB bits lost)
 *
 * For calculating occurrence counts, we need a pass over the whole data set.
 * Then we need another pass - into a new array - to do the radix step, or
 * if we do the 16 bit float variant we need two steps (new + old array).
 *
 * For random data, fewer passes likely win, so we do the 8 bit version
 * as the first implementation. This leaves more data in the buckets for
 * the "Step 2" quicksort part - which is not std::sort or the like, but
 * a variant that puts data back into the original array, so no back-copy
 * is necessary. For the (*) variant, with 16 bit floats, we do not need
 * a back-copy and can quicksort in place!
 *
 * To avoid costly float conversions, we can use SSE2 here (or NEON)!
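 *
 * Worked example of the 8 bit approximator (signed int key path), with
 * values computed here just for illustration:
 *   key =       5 -> 5.0f       = 0x40A00000 -> top byte 0x40
 *   key = 1000000 -> 1000000.0f = 0x49742400 -> top byte 0x49
 * 0x40 < 0x49, so bucket order follows key order; the dropped low bits
 * only mean that nearby keys share a bucket, and Step 2 separates them.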
 *
 * These are useful for us:
 *
 * __m128i _mm_load_si128 (__m128i const* mem_addr)
 * #include <emmintrin.h>
 * Instruction: movdqa xmm, m128 [Latency: 6, Throughput: 0.33 - 0.5]
 * CPUID Flags: SSE2
 * Description
 * Load 128-bits of integer data from memory into dst. mem_addr must be
 * aligned on a 16-byte boundary or a general-protection exception may be
 * generated.
 *
 * __m128i _mm_add_epi8 (__m128i a, __m128i b)
 * #include <emmintrin.h>
 * Instruction: paddb xmm, xmm [Latency: 1, Throughput: 0.33]
 * CPUID Flags: SSE2
 * Description
 * Add packed 8-bit integers in a and b, and store the results in dst.
 *
 * __m128i _mm_and_si128 (__m128i a, __m128i b)
 * #include <emmintrin.h>
 * Instruction: pand xmm, xmm [Latency: 1, Throughput: 0.33]
 * CPUID Flags: SSE2
 * Description
 * Compute the bitwise AND of 128 bits (representing integer data)
 * in a and b, and store the result in dst.
 *
 * __m128i _mm_set_epi8 (
 *     char e15, char e14, char e13, char e12,
 *     char e11, char e10, char e9, char e8,
 *     char e7, char e6, char e5, char e4,
 *     char e3, char e2, char e1, char e0)
 * #include <emmintrin.h>
 * Instruction: Sequence [XXX: slow - best outside loop]
 * CPUID Flags: SSE2
 * Description
 * Set packed 8-bit integers in dst with the supplied values.
 *
 * __m128 _mm_cvtepi32_ps (__m128i a)
 * #include <emmintrin.h>
 * Instruction: cvtdq2ps xmm, xmm [Latency: 4, Throughput: 0.5]
 * CPUID Flags: SSE2
 * Description
 * Convert packed signed 32-bit integers in a to packed single-precision
 * (32-bit) floating-point elements, and store the results in dst.
 *
 * __m128i _mm_castps_si128 (__m128 a)
 * #include <emmintrin.h>
 * CPUID Flags: SSE2
 * Description
 * Cast vector of type __m128 to type __m128i. This intrinsic is only
 * used for compilation and does not generate any instructions, thus it
 * has zero latency.
 *
 * void _mm_store_si128 (__m128i* mem_addr, __m128i a)
 * #include <emmintrin.h>
 * Instruction: movdqa m128, xmm [Latency: 1 - 5, Throughput: 0.5 - 1]
 * CPUID Flags: SSE2
 * Description
 * Store 128-bits of integer data from a into memory. mem_addr must be
 * aligned on a 16-byte boundary or a general-protection exception may
 * be generated.
 *
 * void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_addr)
 * #include <emmintrin.h>
 * Instruction: maskmovdqu xmm, xmm [Latency: 6, Throughput: 1 - XXX: NT!]
 * CPUID Flags: SSE2
 * Description
 * Conditionally store 8-bit integer elements from a into memory using
 * mask (elements are not stored when the highest bit is not set in the
 * corresponding element) and a non-temporal memory hint. mem_addr does
 * not need to be aligned on any particular boundary.
 *
 * int _mm_extract_epi16 (__m128i a, int imm8)
 * #include <emmintrin.h>
 * Instruction: pextrw r32, xmm, imm8 [Latency: 4, Throughput: 1]
 * CPUID Flags: SSE2
 * Description
 * Extract a 16-bit integer from a, selected with imm8,
 * and store the result in the lower element of dst.
 * Operation
 * dst[15:0] := (a[127:0] >> (imm8[2:0] * 16))[15:0]
 * dst[31:16] := 0
 *
 * __m128i _mm_srli_epi32 (__m128i a, int imm8)
 * #include <emmintrin.h>
 * Instruction: psrld xmm, imm8 [Latency: 1, Throughput: 0.5]
 * CPUID Flags: SSE2
 * Description
 * Shift packed 32-bit integers in a right by imm8 while shifting in
 * zeros, and store the results in dst.
 *
 * See also:
 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
 * https://www.laruence.com/x86/MASKMOVDQU.html
 * https://blogs.fau.de/hager/archives/2103
 *
 * For ARM / NEON:
 * https://github.com/DLTcollab/sse2neon
 *
 * Data layout:
 *
 * [int32_t key][int32_t i][int32_t key][int32_t i]...
 *
 * That is, we extract keys and "pointer"-like offsets into an
 * array of the "real" elements. So you sort "short pointers".
 *
 * We might create a version that just sorts bare integers,
 * but that is only good to show off speed compared to other algorithms.
 *
 * Main algorithm:
 * - We first process data to reach 16 byte alignment (maybe 64 byte align?)
 * - We process the input 64 bytes / loop with 3x SSE 128 bit reads + 1x normal
 * - We process the remaining part of the array
 *
 * The middle part should be skipped if there are less than 64 bytes this way...
 * A sketch of this kernel appears after the type definitions below.
 *
 * (pre-loop)
 * pat1 = set(1,1,1,1,0,0,0,0,1,1,1,1...)
 * pat2 = set(0,0,0,0,1,1,1,1,0,0,0,0...)
 *
 * (loop)
 * a = LOAD4int[+0]
 * b = LOAD4int[+4]
 * c = LOAD4int[+8]
 * <+d some int code>
 *
 * pata1 = a & pat1;
 * pata2 = a & pat2;
 * patb1 = b & pat1;
 * patb2 = b & pat2;
 * patc1 = c & pat1;
 * patc2 = c & pat2;
 * <+d some int code>
 *
 * rsa = pata1 >> 24;
 * rsb = patb1 >> 24;
 * rsc = patc1 >> 24;
 * <+d some int code>
 *
 * resa = rsa + pata2;
 * resb = rsb + patb2;
 * resc = rsc + patc2;
 * <+d some int code>
 *
 * store(resa, resb, resc) -> tmp
 *
 * The tmp should automatically be 16 byte aligned, being allocated on-stack!
 *
 * How the integer code processes tmp depends on the phase we are in: in
 * the phase where we count occurrences, we do not need the pat[x]2 parts!
 * However, the reorganizing passes need them to do the copying around!
 *
 * This is how we do 64 bytes / iteration, and it should pipeline well!
 *
 * However, the "integer" code that processes tmp is the bottleneck, because
 * that part has to process 16 counts when counting occurrences, likely
 * with a 4-wide pipeline in (at least) 4 steps - and more in the other
 * phase, where we are not counting but moving elements where they belong.
 * XXX: Now that I think about it more... the pata2-likes are not needed and
 * only cause issues? I mean we could (should) copy key/index directly from
 * the source!
 *
 * In the 8 bit float version, I guess we only have bits from the exponent..
 * In the 16 bit version, with an extra pass, we have a few mantissa bits too..
 *
 * I still think that in the random case the fewer passes of the 8 bit
 * float variant are worth it!
 *
 * String sorting:
 * - First create the [key,index] 64 bit data for the strings to sort.
 * - The key should be the first 4 characters (zero-padded if there are not
 *   enough characters) - see the example after thiersort_apply() below.
 * - Do the sorting, with a comparator using both the first 4 chars AND strcmp.
 *
 * ^^I actually think this gives a pretty fast string comparison, because it
 * does yield many distinct high integer values even for short strings
 * (8 bits work)!
 *
 * XXX: We can reach approximately 2x (1.5x?) this speed for integer-only sort!
 *
 * Unsigned/signed:
 * - Custom trickery is needed when processing tmp (both occurrence and move)
 * - It should assign a different bucket based on the topmost bit in signed
 *   modes!
 * - Modes: msb_early, msb_late
 * - These two handle: integer, unsigned int, float32
 *
 * XXX: So basically a good sorting algorithm for float-keyed sorting!
 *
 * Number of passes (8 bit): 3.5
 * - 1x occurrence counting pass
 * - 1x 8 bit float radix pass
 * - 1.5x quicksort pass [more elements]
 *
 * Number of passes (16 bit): 4.2
 * - 1x occurrence counting pass
 * - 2x 8 bit float radix pass
 * - 1.2x quicksort pass [fewer elements]
 *
 * Rem.: Basically these are regular, pipelined radix passes, with only
 * the float conversions AND shifts vectorized with SSE somewhat...
 */

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <assert.h>

#ifdef TS_SSE2
#include <emmintrin.h>
#endif /* TS_SSE2 */

#ifdef TS_SSE2_NEON
#include "sse2neon.h"
#define TS_SSE2
#endif /* TS_SSE2_NEON */

#ifndef TS_CUSTOM_TYPES
#define TSBOOL uint8_t
#define TSTRUE 1
#define TSFALSE 0
#define TSU8 uint8_t
#define TSU32 uint32_t
#define TSI32 int32_t
#endif /* TS_CUSTOM_TYPES */

/* Nearly the same algorithm for all these key types */
union tskey {
    TSI32 i;
    TSU32 u;
    float f;
};

struct tselem {
    union tskey key;
    TSU32 i;
};

/* Forward declaration of internal code */
static inline void thiersort8_internal(
    struct tselem *arr,
    TSU32 length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr),
    const TSBOOL (*lt)(
        const union tskey ak,
        const union tskey bk,
        const TSU32 ai,
        const TSU32 bi,
        void *reent_data),
    const TSBOOL isint,
    const TSBOOL isunsigned,
    const TSBOOL isfloat,
    void *reent_data);
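/*
 * Minimal sketch (an assumption, not yet wired into thiersort8_internal):
 * how the float conversion + shift from the "Main algorithm" pseudocode in
 * the header comment could look with SSE2 for one group of 4 keys. Only the
 * signed-int key path is shown; the sign trick and the handling of the
 * produced bucket indices stay scalar. ts_radix_shift4_sketch is an
 * illustrative name, not part of the API.
 */
#ifdef TS_SSE2
static inline void ts_radix_shift4_sketch(const TSI32 *keys4, TSU32 *out4)
{
    /* Load 4 signed 32-bit keys (unaligned load keeps the sketch simple) */
    __m128i k = _mm_loadu_si128((const __m128i *)keys4);
    /* cvtdq2ps: 4 keys -> 4 floats */
    __m128 f = _mm_cvtepi32_ps(k);
    /* Reinterpret the float bits as integers (emits no instruction) */
    __m128i fbits = _mm_castps_si128(f);
    /* psrld: keep only the top 8 bits (sign + 7 exponent bits) per lane */
    __m128i r = _mm_srli_epi32(fbits, 24);
    /* Each lane of out4 is now in [0..255], ready for ts_floatsigntrick() */
    _mm_storeu_si128((__m128i *)out4, r);
}
#endif /* TS_SSE2 */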
/** The '<' operation used for int32 */
static inline const TSBOOL ts_lt_int(
    const union tskey ak,
    const union tskey bk,
    const TSU32 ai,
    const TSU32 bi,
    void *reent_data)
{
    return (ak.i < bk.i);
}

/** The '<' operation used for unsigned int32 */
static inline const TSBOOL ts_lt_uint(
    const union tskey ak,
    const union tskey bk,
    const TSU32 ai,
    const TSU32 bi,
    void *reent_data)
{
    return (ak.u < bk.u);
}

/** The '<' operation used for float */
static inline const TSBOOL ts_lt_float(
    const union tskey ak,
    const union tskey bk,
    const TSU32 ai,
    const TSU32 bi,
    void *reent_data)
{
    return (ak.f < bk.f);
}

/**
 * Sort {key, index32} elements where key is a signed 32 bit integer.
 */
static inline void thiersort8_intkey(
    struct tselem *arr,
    TSU32 length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr))
{
    thiersort8_internal(
        arr, length, malloc, free, ts_lt_int,
        TSTRUE, TSFALSE, TSFALSE, NULL);
}

/**
 * Sort {key, index32} elements where key is an unsigned 32 bit integer.
 */
static inline void thiersort8_uintkey(
    struct tselem *arr,
    TSU32 length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr))
{
    thiersort8_internal(
        arr, length, malloc, free, ts_lt_uint,
        TSFALSE, TSTRUE, TSFALSE, NULL);
}

/**
 * Sort {key, index32} elements where key is a 32 bit float.
 */
static inline void thiersort8_floatkey(
    struct tselem *arr,
    TSU32 length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr))
{
    thiersort8_internal(
        arr, length, malloc, free, ts_lt_float,
        TSFALSE, TSFALSE, TSTRUE, NULL);
}
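/*
 * Usage example (a minimal sketch, assuming a C99 compiler and the C
 * library allocator; the values are illustrative only):
 *
 *   #include <stdlib.h>
 *   #include "thier_sort.h"
 *
 *   struct tselem e[3] = {
 *       { { .f = 3.5f }, 0 },
 *       { { .f = -1.0f }, 1 },
 *       { { .f = 0.25f }, 2 },
 *   };
 *   thiersort8_floatkey(e, 3, malloc, free);
 *   // Keys are now ordered: -1.0f, 0.25f, 3.5f and each e[x].i still
 *   // tells where that element originally was (1, 2, 0 here).
 */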
/**
 * Create [key, index] pairs from your raw input data array. O(n) runtime.
 *
 * Useful if you have an array of elements to sort and want them prepared to
 * be properly thiersorted as key-index pairs. You can then apply the
 * resulting reordering later using thiersort_apply(..).
 *
 * BEWARE: The returned array must be freed by you - with a corresponding free.
 * An alternative is to do a thiersort_apply(..) call (for the array case).
 *
 * @param arr The input array.
 * @param askey Key extractor: returns the key of the element pointed to.
 * @param elemsize The size of a single element.
 * @param length The length of the input sequence. Number of elements.
 * @param malloc The function to use for allocating the key-index array.
 * @returns Created array of keys+indices for that key to sort.
 */
static inline struct tselem *thiersort_prepare_array(
    void *arr,
    union tskey (*askey)(void *elem),
    TSU32 elemsize,
    TSU32 length,
    void* (*malloc)(size_t size))
{
    /* Allocate */
    struct tselem *out = (struct tselem *)malloc(length * sizeof(struct tselem));
    /* Fill */
    TSU32 j = 0;
    for(size_t i = 0; i < ((size_t)length * elemsize); i += elemsize, ++j) {
        out[j].key = askey((void *)((TSU8 *)arr + i));
        out[j].i = j;
    }
    /* Return */
    return out;
}

/**
 * Apply the given [key,index] sort result back to an array.
 *
 * Rem.: Also frees the sort result array!
 *
 * @param sortres A thiersort result on the previously prepared arr.
 * @param arr The original array which was prepared for sort and was sorted.
 * @param length The number of elements (both in sortres and arr).
 * @param elemsize Size of a single element.
 * @param tmp An array of elemsize bytes to store values for swapping!
 * @param free The operation used for freeing the sort result array.
 */
static inline void thiersort_apply(
    struct tselem *sortres,
    void *arr,
    TSU32 length,
    TSU32 elemsize,
    void *tmp,
    void (*free)(void *ptr))
{
    /* Replace all positions */
    for(TSU32 i = 0; i < length; ++i) {
        /* Replace the whole chain starting at i (j is the chain index) */
        /* This works because we can easily check what doesn't need moves anymore. */
        TSU32 j = i;
        TSU32 ni = sortres[j].i; /* (*) */
        /* Chain until we would move to our own position */
        /* This also handles "already at the right place" originals. */
        while(ni != j) {
            /* xchg j and ni - but only if not already swapped! See: (*) */
            if(sortres[ni].i != ni) {
                memcpy(tmp, &((TSU8*)arr)[(size_t)j * elemsize], elemsize);
                memcpy(
                    &((TSU8*)arr)[(size_t)j * elemsize],
                    &((TSU8*)arr)[(size_t)ni * elemsize],
                    elemsize);
                memcpy(&((TSU8*)arr)[(size_t)ni * elemsize], tmp, elemsize);
            }
            /* Mark index j as done in sortres for the outer loop. */
            /* This is necessary for the inner loop to stop early */
            /* and for the outer loop to catch up on already processed */
            /* locations. */
            sortres[j].i = j; /* (*) */
            /* Update j and ni */
            /* Now we must find what should be at the new location j */
            /* instead of what we swapped in there from the old j location. */
            j = ni;
            ni = sortres[j].i;
        }
    }
    /* Release the memory - better that we do it, because we changed it */
    /* behind the scenes! */
    free(sortres);
}
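/*
 * String sorting example (a minimal sketch of the "String sorting" notes in
 * the header comment; ts_strkey and ts_lt_str are illustrative helpers, not
 * part of the API). The key is the first 4 bytes packed MSB-first, so the
 * unsigned compare agrees with byte order; equal prefixes fall back to
 * strcmp through reent_data. We call thiersort8_internal directly because
 * the public wrappers fix the comparator:
 *
 *   #include <stdlib.h>
 *   #include <string.h>
 *
 *   static union tskey ts_strkey(void *elem)
 *   {
 *       const char *s = *(const char **)elem;
 *       union tskey k;
 *       k.u = 0;
 *       for(int b = 0; b < 4 && s[b] != '\0'; ++b) {
 *           k.u |= ((TSU32)(TSU8)s[b]) << (24 - 8 * b);
 *       }
 *       return k;
 *   }
 *
 *   static const TSBOOL ts_lt_str(
 *       const union tskey ak, const union tskey bk,
 *       const TSU32 ai, const TSU32 bi, void *reent_data)
 *   {
 *       const char **strs = (const char **)reent_data;
 *       if(ak.u != bk.u) return (ak.u < bk.u);
 *       return (strcmp(strs[ai], strs[bi]) < 0);
 *   }
 *
 *   const char *strs[] = { "banana", "apple", "app" };
 *   const char *tmp;
 *   struct tselem *se = thiersort_prepare_array(
 *       (void *)strs, ts_strkey, sizeof(const char *), 3, malloc);
 *   thiersort8_internal(se, 3, malloc, free, ts_lt_str,
 *       TSFALSE, TSTRUE, TSFALSE, (void *)strs);
 *   thiersort_apply(se, (void *)strs, 3, sizeof(const char *), &tmp, free);
 *   // strs is now { "app", "apple", "banana" } and se has been freed.
 */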
/**
 * Do the sign-trick based radix transform on the non-2's-complement
 * (sign-magnitude) 8 bit "float" byte!
 *
 * Reading the input byte as a 2's complement value:
 *
 * -1 means -127
 * -2 means -126
 * ...
 * -127 means -1
 * -128 means -0
 * 0 means +0
 * 1 means 1
 * ...
 * 127 means 127
 *
 * We convert the above (in this order) into [0..255]!
 */
static inline TSU8 ts_floatsigntrick(TSU8 floatsigned)
{
    if((floatsigned & 0x80) == 0) {
        /* Positive: [+0 .. 127] maps onto [128 .. 255] */
        return (TSU8)(128 + floatsigned);
    } else {
        /* Negative: bigger magnitude sorts earlier, so */
        /* [-127 .. -0] (magnitudes 127..0) maps onto [0 .. 127] */
        return (TSU8)(127 - (floatsigned & 0x7F));
    }
}

/** Gets which bucket (index) the given key should go to */
static inline TSU8 ts_radixi(
    union tskey key,
    const TSBOOL isint,
    const TSBOOL isunsigned,
    const TSBOOL isfloat)
{
    /* The compiler should optimize out the branches here! */
    union tskey k;
    if(isint) {
        /* Sign bit can be 1! */
        k.f = (float)(key.i); /* 8 bit float */
        k.u = k.u >> 24;
        /* Need the float sign trickery */
        return ts_floatsigntrick((TSU8)k.u);
    }
    if(isunsigned) {
        /* Sign bit CANNOT be 1! */
        k.f = (float)(key.u); /* 8 bit float */
        /* The top bit is always zero, so ignore it and use 8 useful bits */
        /* - this is why we shift by 23 and not 24 here */
        k.u = k.u >> 23;
        /* We are fine! */
        return (TSU8)k.u;
    }
    if(isfloat) {
        /* Sign bit can be 1! */
        k.f = key.f; /* 8 bit float */
        k.u = k.u >> 24;
        /* Need the float sign trickery */
        return ts_floatsigntrick((TSU8)k.u);
    }
    /* Should never happen */
    assert(0);
    return 0;
}
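/*
 * A small illustrative self-check (a sketch, not part of the API): all of
 * Step 1 relies on ts_radixi being monotone in the key, otherwise Step 2
 * could not sort the buckets independently. The bucket values in the
 * comments below follow from the shift and sign trick above.
 */
static inline void ts_radixi_monotone_check(void)
{
    union tskey a, b;
    a.f = -2.0f; /* bits 0xC0000000 -> top byte 0xC0 -> bucket 63 */
    b.f = 0.5f;  /* bits 0x3F000000 -> top byte 0x3F -> bucket 191 */
    /* A more negative key must land in a lower (or equal) bucket */
    assert(ts_radixi(a, TSFALSE, TSFALSE, TSTRUE)
        <= ts_radixi(b, TSFALSE, TSFALSE, TSTRUE));
}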
/** Simple in-place quicksort for tselems */
static inline void ts_quicksort_inplace(
    struct tselem *arr,
    TSU32 from,
    TSU32 to,
    const TSBOOL (*lt)(
        const union tskey ak,
        const union tskey bk,
        const TSU32 ai,
        const TSU32 bi,
        void *reent_data),
    void *reent_data)
{
    /* Must do this early exit (to is unsigned)! */
    // TODO: TS_UNLIKELY
    if(to == 0) return;
    if(from >= to - 1) return;
    TSU32 len = (to - from);
    TSU32 mid = from + len / 2;
    const union tskey pivotkey = arr[mid].key;
    TSU32 pivoti = arr[mid].i;
    TSU32 left = from;
    TSU32 right = to - 1;
    while(left < right) {
        /* Step over correctly positioned elements from the left */
        union tskey leftkey = arr[left].key;
        TSU32 lefti = arr[left].i;
        while((left < right) && lt(leftkey, pivotkey, lefti, pivoti, reent_data)) {
            ++left;
            leftkey = arr[left].key;
            lefti = arr[left].i;
        }
        /* Step over correctly positioned elements from the right */
        union tskey rightkey = arr[right].key;
        TSU32 righti = arr[right].i;
        while((left < right) && !lt(rightkey, pivotkey, righti, pivoti, reent_data)) {
            --right;
            rightkey = arr[right].key;
            righti = arr[right].i;
        }
        /* Swap the wrongly positioned elements */
        if(left < right) {
            const struct tselem tmp = arr[left];
            arr[left] = arr[right];
            arr[right] = tmp;
            ++left;
            --right;
        }
    }
    /* Fix left to be after the point of separation for odd lengths */
    if(left == right) {
        const union tskey leftkey = arr[left].key;
        TSU32 lefti = arr[left].i;
        left += (lt(leftkey, pivotkey, lefti, pivoti, reent_data));
    }
    /* Check the edge case when the left side ends up empty. */
    /* Rem.: right moves over the pivot invariably, as left cannot pass over it! */
    /* This means the right subset can never be empty, but the left can! */
    /* It also means that if the left is empty, everything was just */
    /* stepped over (from the right) and nothing moved from its original */
    /* position! */
    // TODO: TS_UNLIKELY
    if(from < left) {
        /* Good case: we cut the array somewhere around the half */
        ts_quicksort_inplace(arr, from, left, lt, reent_data);
        ts_quicksort_inplace(arr, left, to, lt, reent_data);
    } else {
        /* The left sub-array got to be empty - but to make progress with */
        /* the recursion we need to shrink sizes, which is possible by moving */
        /* the pivot to the start of the right (we know it is a minimal elem!) */
        /* XXX: If we collect up all pivots, we can move all of them here if */
        /* we want.. */
        /* Swap */
        const struct tselem tmp = arr[left];
        arr[left] = arr[mid];
        arr[mid] = tmp;
        /* Bad case: we could shrink the array by one element only */
        ts_quicksort_inplace(arr, left + 1, to, lt, reent_data);
    }
}

/**
 * Simple quicksort implementation that also moves the data.
 *
 * Deliberately not in-place: this avoids the array copy that a
 * non-in-place algorithm would otherwise need afterwards.
 */
static inline void ts_quicksort_fromto(
    struct tselem *src,
    struct tselem *dst,
    TSU32 from,
    TSU32 to,
    const TSBOOL (*lt)(
        const union tskey ak,
        const union tskey bk,
        const TSU32 ai,
        const TSU32 bi,
        void *reent_data),
    void *reent_data)
{
    // FIXME: Code is rotten here - see the variant above pls!
    if(from >= to) return;
    TSU32 len = (to - from);
    TSU32 mid = from + len / 2;
    const union tskey pivotkey = src[mid].key;
    TSU32 pivoti = src[mid].i;
    TSU32 li = from;
    TSU32 ri = to - 1;
    for(TSU32 i = from; i < to; ++i) {
        const struct tselem e = src[i];
        if(lt(e.key, pivotkey, e.i, pivoti, reent_data)) {
            dst[li++] = e;
        } else {
            dst[ri--] = e;
        }
    }
    /* Can use in-place steps from now on */
    ts_quicksort_inplace(dst, from, li, lt, reent_data);
    ts_quicksort_inplace(dst, li, to, lt, reent_data);
}

/** Internal code reuse (the type branches should be optimized out) */
static inline void thiersort8_internal(
    struct tselem *arr,
    const TSU32 length,
    void* (*malloc)(size_t size),
    void (*free)(void *ptr),
    const TSBOOL (*lt)(
        const union tskey ak,
        const union tskey bk,
        const TSU32 ai,
        const TSU32 bi,
        void *reent_data),
    const TSBOOL isint,
    const TSBOOL isunsigned,
    const TSBOOL isfloat,
    void *reent_data)
{
    /* Preparation */
    TSU32 radics[256] = {0};
    /* [from, to) indices: only where prefix sums change - usually not full */
    TSU32 real_radics[256 * 2] = {0};
    /* Number of used entries in real_radics (shared with Step 2 below) */
    TSU32 reali = 0;
    struct tselem *arr2 = (struct tselem *)malloc((sizeof(struct tselem)) * length);

    /* Step 1) Bucketing */
#ifdef TS_SSE2
    /* TODO: implement with SSE2 - see ts_radix_shift4_sketch() above */
#else
    /* TODO: We can go both down and upwards here to increase ILP! */
    /* Occurrence counting O(n) */
    for(TSU32 i = 0; i < length; ++i) {
        ++radics[ts_radixi(arr[i].key, isint, isunsigned, isfloat)];
        // FIXME: remove this debug code!
        // arr[i].i = ts_radixi(arr[i].key, isint, isunsigned, isfloat);
    }
    /* Prefix sum + real radics calculation O(256) */
    /* Radics: */
    /* fr: {10, 20, 10, 0, 5, 15,...} */
    /* to: {10, 30, 40, 40, 45, 60,..} */
    /* Real radics: */
    /* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */
    /*        0.       1.        2.        4.        5.     */
    /* (because radix value 3 is not found in the input) */
    TSU32 prev = 0;
    for(int i = 0; i < 256; ++i) {
        if(radics[i] != 0) {
            radics[i] += prev;
            real_radics[reali] = prev;
            real_radics[reali + 1] = radics[i];
            prev = radics[i];
            reali += 2;
        } else {
            radics[i] += prev;
            prev = radics[i];
        }
    }
    /* Move elements according to their buckets O(n) */
    /* Right-to-left ensures keeping the internal order */
    for(TSU32 i = length; i > 0; --i) {
        TSU8 radi = ts_radixi(arr[i - 1].key, isint, isunsigned, isfloat);
        TSU32 offset = --radics[radi];
        arr2[offset] = arr[i - 1];
    }
#endif /* TS_SSE2 */

    /* Step 2) Sorting bucket contents - also moving back to the original array */
    for(TSU32 i = 0; i < reali; i += 2) {
        /* inclusive */
        TSU32 from = real_radics[i];
        /* non-inclusive */
        TSU32 to = real_radics[i + 1];
        /* Quicksort this bucket in O(m log m), where m << n usually */
        ts_quicksort_fromto(arr2, arr, from, to, lt, reent_data);
    }

    /* Cleanup */
    free(arr2);
}

#endif /* THIER_SORT_H */