From 3c0b2d5202fafa23664d2f795710a1bdc61e9c30 Mon Sep 17 00:00:00 2001 From: Richard Thier Date: Sun, 9 Apr 2023 23:13:16 +0200 Subject: [PATCH] thiersort non-SSE implementation, but missing quicksort at end --- thiersort.h | 284 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 249 insertions(+), 35 deletions(-) diff --git a/thiersort.h b/thiersort.h index 776912c..8caceb0 100644 --- a/thiersort.h +++ b/thiersort.h @@ -233,59 +233,273 @@ * the float conversions AND shifts vectorized with SSE somewhat... */ - #include -#define TS_BOOL uint8_t -#define TS_TRUE 1 -#define TS_FALSE 0 + +#ifdef TS_SSE2 +#include +#endif /* TS_SSE2 */ +#ifdef TS_SSE2_NEON +#include "sse2neon.h" +#define TS_SSE2 +#endif /* TS_SSE2_NEON */ + +#ifndef TS_CUSTOM_TYPES +#define TSBOOL uint8_t +#define TSTRUE 1 +#define TSFALSE 0 +#define TSU8 uint8_t +#define TSU32 uint32_t +#define TSI32 int32_t +#endif /* TS_CUSTOM_TYPES */ /* Nearly same alg for all these keys */ -union thier_key { - int32_t i; - uint32_t u; +union tskey { + TSI32 i; + TSU32 u; float f; }; -struct thier_elem { - union thier_key key; - uint32_t i; +struct tselem { + union tskey key; + TSU32 i; }; /* Forward decl of internal code */ static inline void thiersort8_internal( - const thier_elem *arr, + const tselem *arr, uint32_t length, - const int (*compar)( - const union thier_key ak, - const union thier_key bk, - const void *aiptr, - const void *biptr, + void* (*malloc)(size_t size), + void (*free)(void *ptr), + const TSBOOL (*lt)( + const union tskey ak, + const union tskey bk, + const TSU32 ai, + const TSU32 bi, void *reent_data), - const TS_BOOL isint, - const TS_BOOL isunsigned, - const TS_BOOL isfloat); + const TSBOOL isint, + const TSBOOL isunsigned, + const TSBOOL isfloat); + +/** The '<' operation used for int32 */ +static inline const int ts_lt_int( + const union tskey ak, + const union tskey bk, + const TSU32 ai, + const TSU32 bi, + void *reent_data) { return (ak.i < bk.i); } + +/** The '<' operation used for unsigned int32 */ +static inline const int ts_lt_uint( + const union tskey ak, + const union tskey bk, + const TSU32 ai, + const TSU32 bi, + void *reent_data) { return (ak.u < bk.u); } + +/** The '<' operation used for float */ +static inline const int ts_lt_float( + const union tskey ak, + const union tskey bk, + const TSU32 ai, + const TSU32 bi, + void *reent_data) { return (ak.f < bk.f); } /** * Sort {key, index32} elements where key is 32 bit integer. */ static inline void thiersort8_intkey( - const thier_elem *arr, - uint32_t length) { + const tselem *arr, + uint32_t length, + void* (*malloc)(size_t size), + void (*free)(void *ptr)) { + thiersort8_internal( + arr, + length, + malloc, + free, + ts_lt_int, + TSTRUE, + TSFALSE, + TSFALSE); } -/** Internal code reuse (branches shoul be optimized out) */ -static inline void thiersort8_internal( - thier_elem *arr, - const uint32_t length, - const int (*compar)( - const union thier_key ak, - const union thier_key bk, - const void *aiptr, - const void *biptr, - void *reent_data), - const TS_BOOL isint, - const TS_BOOL isunsigned, - const TS_BOOL isfloat) { - // TODO: implement +/** + * Sort {key, index32} elements where key is unsigned 32 bit integer. + */ +static inline void thiersort8_uintkey( + const tselem *arr, + uint32_t length, + void* (*malloc)(size_t size), + void (*free)(void *ptr)) { + thiersort8_internal( + arr, + length, + malloc, + free, + ts_lt_uint, + TSFALSE, + TSTRUE, + TSFALSE); } + +/** + * Sort {key, index32} elements where key is 32 bit integer. + */ +static inline void thiersort8_floatkey( + const tselem *arr, + uint32_t length, + void* (*malloc)(size_t size), + void (*free)(void *ptr)) { + thiersort8_internal( + arr, + length, + malloc, + free, + ts_lt_float, + TSFALSE, + TSFALSE, + TSTRUE); +} + +/** + * Do sign-trick based radix transform on nom-2s-complement 8 bit float! + * + * -1 means -127 + * -2 means -126 + * ... + * -127 means -0 + * 0 means +0 + * 1 means 1 + * ... + * 127 means 127 + * + * We convert the above (in this order) into [0..255]! + */ +static inline TSU8 ts_floatsigntrick(TSU8 floatsigned) { + if((floatsigned & 0x80) == 0) { + /* Positive */ + return (128 + floatsigned); + } else { + /* Negative */ + return (128 - (floatsigned & 0x80)); + } +} + +/** Gets which bucket (index) the given key should go */ +static inline TSU8 ts_radixi( + union tskey key, + const TSBOOL isint, + const TSBOOL isunsigned, + const TSBOOL isfloat) { + /* Compiler should optimize out branch here! */ + union tskey k; + if(isint) { + /* Sign bit can be 1! */ + k.f = (float)(arr[i].i); + + /* 8 bit float */ + k.u >> 24; + + /* Need float sign trickery */ + return ts_floatsigntrick((TSU8)k.u); + } + if(isunsigned) { + /* Sign bit CANNOT be 1! */ + k.f = (float)(arr[i].u); + + /* 8 bit float */ + k.u >> 24; + + /* We are fine! */ + return (TSU8)k.u; + } + if(isfloat) { + /* Sign bit can be 1! */ + k.f = (float) (arr[i].f); + + /* 8 bit float */ + k.u >> 24; + + /* Need float sign trickery */ + return ts_floatsigntrick((TSU8)k.u); + } +} + +/** Internal code reuse (type-branches should be optimized out) */ +static inline void thiersort8_internal( + struct tselem *arr, + const uint32_t length, + void* (*malloc)(size_t size), + void (*free)(void *ptr), + const TSBOOL (*lt)( + const union tskey ak, + const union tskey bk, + const TSU32 ai, + const TSU32 bi, + void *reent_data), + const TSBOOL isint, + const TSBOOL isunsigned, + const TSBOOL isfloat) { + + /* Preparation */ + uint32_t radics[256] = {0}; + /* [from, to) index: only where prefix sums change - usually nonfull */ + uint32_t real_radics[256 * 2] = {0}; + tselem *arr2 = malloc((sizeof struct tselem) * length); + + /* Step 1) Bucketing */ +#ifdef TS_SSE2 + // TODO: implement with SSE2 +#else + /* Occurence counting O(n) */ + for(TSU32 i = 0; i < length; ++i) { + ++radics[ts_radixi(arr[i].key, isint, isunsigned, isfloat)]; + } + + /* Prefix sum + real radics calc O(256) */ + /* Radics: */ + /* fr: {10, 20, 10, 0, 5, 15,...} */ + /* to: {10, 30, 40, 40, 45, 60,..} */ + /* Real radics: */ + /* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */ + /* 0. 1. 2. 4. 5. */ + /* (because radix value 3 is not found in input) */ + TSU32 prev = 0; + TSU32 reali = 0; + for(int i = 0; i < 256; ++i) { + if(radics[i] != 0) { + radics[i] += prev; + real_radics[reali] = prev; + real_radics[reali + 1] = radics[i]; + prev = radics[i]; + reali += 2; + } else { + radics[i] += prev; + prev = radics[i]; + } + } + + /* Move elements according to their buckets O(n) */ + /* Right-to-left ensures keeping internal order */ + for(TSU32 i = length; i > 0; ++i) { + TSU8 radi = ts_radixi(arr[i - 1].key, isint, isunsigned, isfloat); + TSU32 offset = --radics[radi]; + arr2[offset] = arr[i - 1]; + } +#endif /* TS_SSE2 */ + + /* Step 2) Sorting buckets contents - also moving back to original array */ + for(int i = 0; i < reali; i += 2) { + /* inclusive */ + TSU32 from = real_radics[i]; + /* non-inclusive */ + TSU32 to = real_radics[i + 1]; + + /* TODO: Quicksort this bucket in O(mlogm), where m << n usually */ + } + + /* Cleanup */ + free(arr2); +} + #endif /* THIER_SORT_H */