thiersort non-SSE implementation, but missing quicksort at end

This commit is contained in:
Richard Thier 2023-04-09 23:13:16 +02:00
parent caa3aecaad
commit 3c0b2d5202

View File

@ -233,59 +233,273 @@
* the float conversions AND shifts vectorized with SSE somewhat...
*/
#include <stdint.h>
#define TS_BOOL uint8_t
#define TS_TRUE 1
#define TS_FALSE 0
#ifdef TS_SSE2
#include <emmintrin.h>
#endif /* TS_SSE2 */
#ifdef TS_SSE2_NEON
#include "sse2neon.h"
#define TS_SSE2
#endif /* TS_SSE2_NEON */
#ifndef TS_CUSTOM_TYPES
#define TSBOOL uint8_t
#define TSTRUE 1
#define TSFALSE 0
#define TSU8 uint8_t
#define TSU32 uint32_t
#define TSI32 int32_t
#endif /* TS_CUSTOM_TYPES */
/* Nearly same alg for all these keys */
union thier_key {
int32_t i;
uint32_t u;
union tskey {
TSI32 i;
TSU32 u;
float f;
};
struct thier_elem {
union thier_key key;
uint32_t i;
struct tselem {
union tskey key;
TSU32 i;
};
/* Forward decl of internal code */
static inline void thiersort8_internal(
const thier_elem *arr,
const tselem *arr,
uint32_t length,
const int (*compar)(
const union thier_key ak,
const union thier_key bk,
const void *aiptr,
const void *biptr,
void* (*malloc)(size_t size),
void (*free)(void *ptr),
const TSBOOL (*lt)(
const union tskey ak,
const union tskey bk,
const TSU32 ai,
const TSU32 bi,
void *reent_data),
const TS_BOOL isint,
const TS_BOOL isunsigned,
const TS_BOOL isfloat);
const TSBOOL isint,
const TSBOOL isunsigned,
const TSBOOL isfloat);
/** The '<' operation used for int32 */
static inline const int ts_lt_int(
const union tskey ak,
const union tskey bk,
const TSU32 ai,
const TSU32 bi,
void *reent_data) { return (ak.i < bk.i); }
/** The '<' operation used for unsigned int32 */
static inline const int ts_lt_uint(
const union tskey ak,
const union tskey bk,
const TSU32 ai,
const TSU32 bi,
void *reent_data) { return (ak.u < bk.u); }
/** The '<' operation used for float */
static inline const int ts_lt_float(
const union tskey ak,
const union tskey bk,
const TSU32 ai,
const TSU32 bi,
void *reent_data) { return (ak.f < bk.f); }
/**
* Sort {key, index32} elements where key is 32 bit integer.
*/
static inline void thiersort8_intkey(
const thier_elem *arr,
uint32_t length) {
const tselem *arr,
uint32_t length,
void* (*malloc)(size_t size),
void (*free)(void *ptr)) {
thiersort8_internal(
arr,
length,
malloc,
free,
ts_lt_int,
TSTRUE,
TSFALSE,
TSFALSE);
}
/** Internal code reuse (branches shoul be optimized out) */
static inline void thiersort8_internal(
thier_elem *arr,
const uint32_t length,
const int (*compar)(
const union thier_key ak,
const union thier_key bk,
const void *aiptr,
const void *biptr,
void *reent_data),
const TS_BOOL isint,
const TS_BOOL isunsigned,
const TS_BOOL isfloat) {
// TODO: implement
/**
* Sort {key, index32} elements where key is unsigned 32 bit integer.
*/
static inline void thiersort8_uintkey(
const tselem *arr,
uint32_t length,
void* (*malloc)(size_t size),
void (*free)(void *ptr)) {
thiersort8_internal(
arr,
length,
malloc,
free,
ts_lt_uint,
TSFALSE,
TSTRUE,
TSFALSE);
}
/**
* Sort {key, index32} elements where key is 32 bit integer.
*/
static inline void thiersort8_floatkey(
const tselem *arr,
uint32_t length,
void* (*malloc)(size_t size),
void (*free)(void *ptr)) {
thiersort8_internal(
arr,
length,
malloc,
free,
ts_lt_float,
TSFALSE,
TSFALSE,
TSTRUE);
}
/**
* Do sign-trick based radix transform on nom-2s-complement 8 bit float!
*
* -1 means -127
* -2 means -126
* ...
* -127 means -0
* 0 means +0
* 1 means 1
* ...
* 127 means 127
*
* We convert the above (in this order) into [0..255]!
*/
static inline TSU8 ts_floatsigntrick(TSU8 floatsigned) {
if((floatsigned & 0x80) == 0) {
/* Positive */
return (128 + floatsigned);
} else {
/* Negative */
return (128 - (floatsigned & 0x80));
}
}
/** Gets which bucket (index) the given key should go */
static inline TSU8 ts_radixi(
union tskey key,
const TSBOOL isint,
const TSBOOL isunsigned,
const TSBOOL isfloat) {
/* Compiler should optimize out branch here! */
union tskey k;
if(isint) {
/* Sign bit can be 1! */
k.f = (float)(arr[i].i);
/* 8 bit float */
k.u >> 24;
/* Need float sign trickery */
return ts_floatsigntrick((TSU8)k.u);
}
if(isunsigned) {
/* Sign bit CANNOT be 1! */
k.f = (float)(arr[i].u);
/* 8 bit float */
k.u >> 24;
/* We are fine! */
return (TSU8)k.u;
}
if(isfloat) {
/* Sign bit can be 1! */
k.f = (float) (arr[i].f);
/* 8 bit float */
k.u >> 24;
/* Need float sign trickery */
return ts_floatsigntrick((TSU8)k.u);
}
}
/** Internal code reuse (type-branches should be optimized out) */
static inline void thiersort8_internal(
struct tselem *arr,
const uint32_t length,
void* (*malloc)(size_t size),
void (*free)(void *ptr),
const TSBOOL (*lt)(
const union tskey ak,
const union tskey bk,
const TSU32 ai,
const TSU32 bi,
void *reent_data),
const TSBOOL isint,
const TSBOOL isunsigned,
const TSBOOL isfloat) {
/* Preparation */
uint32_t radics[256] = {0};
/* [from, to) index: only where prefix sums change - usually nonfull */
uint32_t real_radics[256 * 2] = {0};
tselem *arr2 = malloc((sizeof struct tselem) * length);
/* Step 1) Bucketing */
#ifdef TS_SSE2
// TODO: implement with SSE2
#else
/* Occurence counting O(n) */
for(TSU32 i = 0; i < length; ++i) {
++radics[ts_radixi(arr[i].key, isint, isunsigned, isfloat)];
}
/* Prefix sum + real radics calc O(256) */
/* Radics: */
/* fr: {10, 20, 10, 0, 5, 15,...} */
/* to: {10, 30, 40, 40, 45, 60,..} */
/* Real radics: */
/* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */
/* 0. 1. 2. 4. 5. */
/* (because radix value 3 is not found in input) */
TSU32 prev = 0;
TSU32 reali = 0;
for(int i = 0; i < 256; ++i) {
if(radics[i] != 0) {
radics[i] += prev;
real_radics[reali] = prev;
real_radics[reali + 1] = radics[i];
prev = radics[i];
reali += 2;
} else {
radics[i] += prev;
prev = radics[i];
}
}
/* Move elements according to their buckets O(n) */
/* Right-to-left ensures keeping internal order */
for(TSU32 i = length; i > 0; ++i) {
TSU8 radi = ts_radixi(arr[i - 1].key, isint, isunsigned, isfloat);
TSU32 offset = --radics[radi];
arr2[offset] = arr[i - 1];
}
#endif /* TS_SSE2 */
/* Step 2) Sorting buckets contents - also moving back to original array */
for(int i = 0; i < reali; i += 2) {
/* inclusive */
TSU32 from = real_radics[i];
/* non-inclusive */
TSU32 to = real_radics[i + 1];
/* TODO: Quicksort this bucket in O(mlogm), where m << n usually */
}
/* Cleanup */
free(arr2);
}
#endif /* THIER_SORT_H */