thiersort non-SSE implementation, but missing quicksort at end
This commit is contained in:
parent
caa3aecaad
commit
3c0b2d5202
284
thiersort.h
284
thiersort.h
@ -233,59 +233,273 @@
|
|||||||
* the float conversions AND shifts vectorized with SSE somewhat...
|
* the float conversions AND shifts vectorized with SSE somewhat...
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#define TS_BOOL uint8_t
|
|
||||||
#define TS_TRUE 1
|
#ifdef TS_SSE2
|
||||||
#define TS_FALSE 0
|
#include <emmintrin.h>
|
||||||
|
#endif /* TS_SSE2 */
|
||||||
|
#ifdef TS_SSE2_NEON
|
||||||
|
#include "sse2neon.h"
|
||||||
|
#define TS_SSE2
|
||||||
|
#endif /* TS_SSE2_NEON */
|
||||||
|
|
||||||
|
#ifndef TS_CUSTOM_TYPES
|
||||||
|
#define TSBOOL uint8_t
|
||||||
|
#define TSTRUE 1
|
||||||
|
#define TSFALSE 0
|
||||||
|
#define TSU8 uint8_t
|
||||||
|
#define TSU32 uint32_t
|
||||||
|
#define TSI32 int32_t
|
||||||
|
#endif /* TS_CUSTOM_TYPES */
|
||||||
|
|
||||||
/* Nearly same alg for all these keys */
|
/* Nearly same alg for all these keys */
|
||||||
union thier_key {
|
union tskey {
|
||||||
int32_t i;
|
TSI32 i;
|
||||||
uint32_t u;
|
TSU32 u;
|
||||||
float f;
|
float f;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct thier_elem {
|
struct tselem {
|
||||||
union thier_key key;
|
union tskey key;
|
||||||
uint32_t i;
|
TSU32 i;
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Forward decl of internal code */
|
/* Forward decl of internal code */
|
||||||
static inline void thiersort8_internal(
|
static inline void thiersort8_internal(
|
||||||
const thier_elem *arr,
|
const tselem *arr,
|
||||||
uint32_t length,
|
uint32_t length,
|
||||||
const int (*compar)(
|
void* (*malloc)(size_t size),
|
||||||
const union thier_key ak,
|
void (*free)(void *ptr),
|
||||||
const union thier_key bk,
|
const TSBOOL (*lt)(
|
||||||
const void *aiptr,
|
const union tskey ak,
|
||||||
const void *biptr,
|
const union tskey bk,
|
||||||
|
const TSU32 ai,
|
||||||
|
const TSU32 bi,
|
||||||
void *reent_data),
|
void *reent_data),
|
||||||
const TS_BOOL isint,
|
const TSBOOL isint,
|
||||||
const TS_BOOL isunsigned,
|
const TSBOOL isunsigned,
|
||||||
const TS_BOOL isfloat);
|
const TSBOOL isfloat);
|
||||||
|
|
||||||
|
/** The '<' operation used for int32 */
|
||||||
|
static inline const int ts_lt_int(
|
||||||
|
const union tskey ak,
|
||||||
|
const union tskey bk,
|
||||||
|
const TSU32 ai,
|
||||||
|
const TSU32 bi,
|
||||||
|
void *reent_data) { return (ak.i < bk.i); }
|
||||||
|
|
||||||
|
/** The '<' operation used for unsigned int32 */
|
||||||
|
static inline const int ts_lt_uint(
|
||||||
|
const union tskey ak,
|
||||||
|
const union tskey bk,
|
||||||
|
const TSU32 ai,
|
||||||
|
const TSU32 bi,
|
||||||
|
void *reent_data) { return (ak.u < bk.u); }
|
||||||
|
|
||||||
|
/** The '<' operation used for float */
|
||||||
|
static inline const int ts_lt_float(
|
||||||
|
const union tskey ak,
|
||||||
|
const union tskey bk,
|
||||||
|
const TSU32 ai,
|
||||||
|
const TSU32 bi,
|
||||||
|
void *reent_data) { return (ak.f < bk.f); }
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sort {key, index32} elements where key is 32 bit integer.
|
* Sort {key, index32} elements where key is 32 bit integer.
|
||||||
*/
|
*/
|
||||||
static inline void thiersort8_intkey(
|
static inline void thiersort8_intkey(
|
||||||
const thier_elem *arr,
|
const tselem *arr,
|
||||||
uint32_t length) {
|
uint32_t length,
|
||||||
|
void* (*malloc)(size_t size),
|
||||||
|
void (*free)(void *ptr)) {
|
||||||
|
thiersort8_internal(
|
||||||
|
arr,
|
||||||
|
length,
|
||||||
|
malloc,
|
||||||
|
free,
|
||||||
|
ts_lt_int,
|
||||||
|
TSTRUE,
|
||||||
|
TSFALSE,
|
||||||
|
TSFALSE);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Internal code reuse (branches shoul be optimized out) */
|
/**
|
||||||
static inline void thiersort8_internal(
|
* Sort {key, index32} elements where key is unsigned 32 bit integer.
|
||||||
thier_elem *arr,
|
*/
|
||||||
const uint32_t length,
|
static inline void thiersort8_uintkey(
|
||||||
const int (*compar)(
|
const tselem *arr,
|
||||||
const union thier_key ak,
|
uint32_t length,
|
||||||
const union thier_key bk,
|
void* (*malloc)(size_t size),
|
||||||
const void *aiptr,
|
void (*free)(void *ptr)) {
|
||||||
const void *biptr,
|
thiersort8_internal(
|
||||||
void *reent_data),
|
arr,
|
||||||
const TS_BOOL isint,
|
length,
|
||||||
const TS_BOOL isunsigned,
|
malloc,
|
||||||
const TS_BOOL isfloat) {
|
free,
|
||||||
// TODO: implement
|
ts_lt_uint,
|
||||||
|
TSFALSE,
|
||||||
|
TSTRUE,
|
||||||
|
TSFALSE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sort {key, index32} elements where key is 32 bit integer.
|
||||||
|
*/
|
||||||
|
static inline void thiersort8_floatkey(
|
||||||
|
const tselem *arr,
|
||||||
|
uint32_t length,
|
||||||
|
void* (*malloc)(size_t size),
|
||||||
|
void (*free)(void *ptr)) {
|
||||||
|
thiersort8_internal(
|
||||||
|
arr,
|
||||||
|
length,
|
||||||
|
malloc,
|
||||||
|
free,
|
||||||
|
ts_lt_float,
|
||||||
|
TSFALSE,
|
||||||
|
TSFALSE,
|
||||||
|
TSTRUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Do sign-trick based radix transform on nom-2s-complement 8 bit float!
|
||||||
|
*
|
||||||
|
* -1 means -127
|
||||||
|
* -2 means -126
|
||||||
|
* ...
|
||||||
|
* -127 means -0
|
||||||
|
* 0 means +0
|
||||||
|
* 1 means 1
|
||||||
|
* ...
|
||||||
|
* 127 means 127
|
||||||
|
*
|
||||||
|
* We convert the above (in this order) into [0..255]!
|
||||||
|
*/
|
||||||
|
static inline TSU8 ts_floatsigntrick(TSU8 floatsigned) {
|
||||||
|
if((floatsigned & 0x80) == 0) {
|
||||||
|
/* Positive */
|
||||||
|
return (128 + floatsigned);
|
||||||
|
} else {
|
||||||
|
/* Negative */
|
||||||
|
return (128 - (floatsigned & 0x80));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Gets which bucket (index) the given key should go */
|
||||||
|
static inline TSU8 ts_radixi(
|
||||||
|
union tskey key,
|
||||||
|
const TSBOOL isint,
|
||||||
|
const TSBOOL isunsigned,
|
||||||
|
const TSBOOL isfloat) {
|
||||||
|
/* Compiler should optimize out branch here! */
|
||||||
|
union tskey k;
|
||||||
|
if(isint) {
|
||||||
|
/* Sign bit can be 1! */
|
||||||
|
k.f = (float)(arr[i].i);
|
||||||
|
|
||||||
|
/* 8 bit float */
|
||||||
|
k.u >> 24;
|
||||||
|
|
||||||
|
/* Need float sign trickery */
|
||||||
|
return ts_floatsigntrick((TSU8)k.u);
|
||||||
|
}
|
||||||
|
if(isunsigned) {
|
||||||
|
/* Sign bit CANNOT be 1! */
|
||||||
|
k.f = (float)(arr[i].u);
|
||||||
|
|
||||||
|
/* 8 bit float */
|
||||||
|
k.u >> 24;
|
||||||
|
|
||||||
|
/* We are fine! */
|
||||||
|
return (TSU8)k.u;
|
||||||
|
}
|
||||||
|
if(isfloat) {
|
||||||
|
/* Sign bit can be 1! */
|
||||||
|
k.f = (float) (arr[i].f);
|
||||||
|
|
||||||
|
/* 8 bit float */
|
||||||
|
k.u >> 24;
|
||||||
|
|
||||||
|
/* Need float sign trickery */
|
||||||
|
return ts_floatsigntrick((TSU8)k.u);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Internal code reuse (type-branches should be optimized out) */
|
||||||
|
static inline void thiersort8_internal(
|
||||||
|
struct tselem *arr,
|
||||||
|
const uint32_t length,
|
||||||
|
void* (*malloc)(size_t size),
|
||||||
|
void (*free)(void *ptr),
|
||||||
|
const TSBOOL (*lt)(
|
||||||
|
const union tskey ak,
|
||||||
|
const union tskey bk,
|
||||||
|
const TSU32 ai,
|
||||||
|
const TSU32 bi,
|
||||||
|
void *reent_data),
|
||||||
|
const TSBOOL isint,
|
||||||
|
const TSBOOL isunsigned,
|
||||||
|
const TSBOOL isfloat) {
|
||||||
|
|
||||||
|
/* Preparation */
|
||||||
|
uint32_t radics[256] = {0};
|
||||||
|
/* [from, to) index: only where prefix sums change - usually nonfull */
|
||||||
|
uint32_t real_radics[256 * 2] = {0};
|
||||||
|
tselem *arr2 = malloc((sizeof struct tselem) * length);
|
||||||
|
|
||||||
|
/* Step 1) Bucketing */
|
||||||
|
#ifdef TS_SSE2
|
||||||
|
// TODO: implement with SSE2
|
||||||
|
#else
|
||||||
|
/* Occurence counting O(n) */
|
||||||
|
for(TSU32 i = 0; i < length; ++i) {
|
||||||
|
++radics[ts_radixi(arr[i].key, isint, isunsigned, isfloat)];
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Prefix sum + real radics calc O(256) */
|
||||||
|
/* Radics: */
|
||||||
|
/* fr: {10, 20, 10, 0, 5, 15,...} */
|
||||||
|
/* to: {10, 30, 40, 40, 45, 60,..} */
|
||||||
|
/* Real radics: */
|
||||||
|
/* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */
|
||||||
|
/* 0. 1. 2. 4. 5. */
|
||||||
|
/* (because radix value 3 is not found in input) */
|
||||||
|
TSU32 prev = 0;
|
||||||
|
TSU32 reali = 0;
|
||||||
|
for(int i = 0; i < 256; ++i) {
|
||||||
|
if(radics[i] != 0) {
|
||||||
|
radics[i] += prev;
|
||||||
|
real_radics[reali] = prev;
|
||||||
|
real_radics[reali + 1] = radics[i];
|
||||||
|
prev = radics[i];
|
||||||
|
reali += 2;
|
||||||
|
} else {
|
||||||
|
radics[i] += prev;
|
||||||
|
prev = radics[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Move elements according to their buckets O(n) */
|
||||||
|
/* Right-to-left ensures keeping internal order */
|
||||||
|
for(TSU32 i = length; i > 0; ++i) {
|
||||||
|
TSU8 radi = ts_radixi(arr[i - 1].key, isint, isunsigned, isfloat);
|
||||||
|
TSU32 offset = --radics[radi];
|
||||||
|
arr2[offset] = arr[i - 1];
|
||||||
|
}
|
||||||
|
#endif /* TS_SSE2 */
|
||||||
|
|
||||||
|
/* Step 2) Sorting buckets contents - also moving back to original array */
|
||||||
|
for(int i = 0; i < reali; i += 2) {
|
||||||
|
/* inclusive */
|
||||||
|
TSU32 from = real_radics[i];
|
||||||
|
/* non-inclusive */
|
||||||
|
TSU32 to = real_radics[i + 1];
|
||||||
|
|
||||||
|
/* TODO: Quicksort this bucket in O(mlogm), where m << n usually */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Cleanup */
|
||||||
|
free(arr2);
|
||||||
|
}
|
||||||
|
|
||||||
#endif /* THIER_SORT_H */
|
#endif /* THIER_SORT_H */
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user