minor speed tweaks by being able to define the counter type
This commit is contained in:
parent
1686967f10
commit
0b4eb5e5a6
162
magyarsort.h
162
magyarsort.h
@ -43,16 +43,17 @@ namespace MagyarSort {
|
||||
/* DEBUG */
|
||||
|
||||
void debugArr(uint32_t *arr, size_t size) {
|
||||
for(int i = 0; i < size; ++i) {
|
||||
for(size_t i = 0; i < size; ++i) {
|
||||
printf("%x, ", arr[i]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void debugRadics(size_t *radics) {
|
||||
template<typename COUNTER_TYP>
|
||||
void debugRadics(COUNTER_TYP *radics) {
|
||||
for(size_t j = 0; j < DIGITS; ++j) {
|
||||
printf("d%zu: ", j);
|
||||
for(size_t i = 0; i < DIGIT_RANGE; ++i) {
|
||||
for(int i = 0; i < DIGIT_RANGE; ++i) {
|
||||
printf("%zu,", radics[i + DIGIT_RANGE*j]);
|
||||
}
|
||||
printf("\n\n");
|
||||
@ -70,72 +71,74 @@ namespace MagyarSort {
|
||||
}
|
||||
|
||||
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
|
||||
template<int DIGIT>
|
||||
struct OccurenceMagic : public OccurenceMagic<DIGIT - 1> {
|
||||
inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], size_t i, size_t *radicsOut) noexcept
|
||||
: OccurenceMagic<DIGIT -1 >(arr, i, radicsOut) {
|
||||
template<int DIGIT, typename COUNTER_TYP>
|
||||
struct OccurenceMagic : public OccurenceMagic<DIGIT - 1, COUNTER_TYP> {
|
||||
inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *radicsOut) noexcept
|
||||
: OccurenceMagic<DIGIT - 1 ,COUNTER_TYP>(arr, i, radicsOut) {
|
||||
// Parents run first so template recursion runs DIGIT=0 first...
|
||||
++radicsOut[getDigit<DIGIT>(arr[i]) + DIGIT_RANGE * DIGIT];
|
||||
}
|
||||
};
|
||||
/** Ends template recursion */
|
||||
template<>
|
||||
struct OccurenceMagic<-1> {
|
||||
inline OccurenceMagic(uint32_t arr[], size_t i, size_t *radicsOut) noexcept {}
|
||||
template<typename COUNTER_TYP>
|
||||
struct OccurenceMagic<-1, COUNTER_TYP> {
|
||||
inline OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *radicsOut) noexcept {}
|
||||
};
|
||||
|
||||
static inline void countOccurences(uint32_t arr[], size_t size, size_t *radicsOut) noexcept {
|
||||
template<typename COUNTER_TYP>
|
||||
static inline void countOccurences(uint32_t arr[], COUNTER_TYP size, COUNTER_TYP *radicsOut) noexcept {
|
||||
#pragma GCC unroll 64
|
||||
for(size_t i = 0; i < size; ++i) {
|
||||
for(COUNTER_TYP i = 0; i < size; ++i) {
|
||||
// Creates no object, struct is empty
|
||||
OccurenceMagic<DIGITS - 1>(arr, i, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i, radicsOut);
|
||||
}
|
||||
}
|
||||
|
||||
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
|
||||
template<int DIGIT>
|
||||
struct PrefixMagic : public PrefixMagic<DIGIT - 1> {
|
||||
inline __attribute__((always_inline)) PrefixMagic(size_t *radics, size_t *prev, int i) noexcept
|
||||
: PrefixMagic<DIGIT - 1>(radics, prev, i) {
|
||||
template<int DIGIT, typename COUNTER_TYP>
|
||||
struct PrefixMagic : public PrefixMagic<DIGIT - 1, COUNTER_TYP> {
|
||||
inline __attribute__((always_inline)) PrefixMagic(COUNTER_TYP *radics, COUNTER_TYP *prev, int i) noexcept
|
||||
: PrefixMagic<DIGIT - 1, COUNTER_TYP>(radics, prev, i) {
|
||||
static constexpr int DSTART = (DIGIT * DIGIT_RANGE);
|
||||
radics[DSTART + i] += prev[DIGIT];
|
||||
prev[DIGIT] = radics[DSTART + i];
|
||||
}
|
||||
};
|
||||
/** Ends template recursion */
|
||||
template<>
|
||||
struct PrefixMagic<-1> {
|
||||
inline PrefixMagic(size_t *radics, size_t *prev, int i) noexcept {}
|
||||
template<typename COUNTER_TYP>
|
||||
struct PrefixMagic<-1, COUNTER_TYP> {
|
||||
inline PrefixMagic(COUNTER_TYP *radics, COUNTER_TYP *prev, int i) noexcept {}
|
||||
};
|
||||
|
||||
/** Gets REFERENCE to the given digit from the radix-array that has more than one digits */
|
||||
template<int DIGIT>
|
||||
static inline __attribute__((always_inline)) size_t &rGet(size_t *radics, size_t i) noexcept {
|
||||
template<int DIGIT, typename COUNTER_TYP>
|
||||
static inline __attribute__((always_inline)) COUNTER_TYP &rGet(COUNTER_TYP *radics, int i) noexcept {
|
||||
static constexpr int DSTART = (DIGIT * DIGIT_RANGE);
|
||||
return radics[DSTART + i];
|
||||
}
|
||||
|
||||
static inline void calcPrefixSums(size_t *radics) noexcept {
|
||||
static thread_local size_t prev[DIGITS];
|
||||
template<typename COUNTER_TYP>
|
||||
static inline void calcPrefixSums(COUNTER_TYP *radics) noexcept {
|
||||
static thread_local COUNTER_TYP prev[DIGITS];
|
||||
memset(prev, 0, sizeof(prev));
|
||||
|
||||
for(int i = 0; i < DIGIT_RANGE; ++i) {
|
||||
// This is a template-unrolled loop too
|
||||
PrefixMagic<DIGITS - 1>(radics, prev, i);
|
||||
PrefixMagic<DIGITS - 1, COUNTER_TYP>(radics, prev, i);
|
||||
}
|
||||
}
|
||||
|
||||
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
|
||||
template<int DIGIT>
|
||||
struct RadixMagic : public RadixMagic<DIGIT - 1> {
|
||||
inline __attribute__((always_inline)) RadixMagic(size_t *radics, uint32_t *&from, uint32_t *&to, size_t size) noexcept // BEWARE: "*&" needed to swap pointers..
|
||||
: RadixMagic<DIGIT - 1>(radics, from, to, size) {
|
||||
template<int DIGIT, typename COUNTER_TYP>
|
||||
struct RadixMagic : public RadixMagic<DIGIT - 1, COUNTER_TYP> {
|
||||
inline __attribute__((always_inline)) RadixMagic(COUNTER_TYP *radics, uint32_t *&from, uint32_t *&to, COUNTER_TYP size) noexcept // BEWARE: "*&" needed to swap pointers..
|
||||
: RadixMagic<DIGIT - 1, COUNTER_TYP>(radics, from, to, size) {
|
||||
// DEBUG
|
||||
//printf("%d before: ", DIGIT);
|
||||
//debugArr(from, size);
|
||||
|
||||
#pragma GCC unroll 64
|
||||
for(size_t i = size; i > 0; --i) { // right-to-left to ensure already sorted digits order we keep for iterations
|
||||
for(COUNTER_TYP i = size; i > 0; --i) { // right-to-left to ensure already sorted digits order we keep for iterations
|
||||
// Get num and its new offset / location
|
||||
auto num = from[i - 1];
|
||||
auto digVal = getDigit<DIGIT>(num);
|
||||
@ -154,68 +157,13 @@ namespace MagyarSort {
|
||||
}
|
||||
};
|
||||
/** Ends template recursion */
|
||||
template<>
|
||||
struct RadixMagic<-1> {
|
||||
inline RadixMagic(size_t *radics, uint32_t *&from, uint32_t *&to, size_t size) noexcept { }
|
||||
template<typename COUNTER_TYP>
|
||||
struct RadixMagic<-1, COUNTER_TYP> {
|
||||
inline RadixMagic(COUNTER_TYP *radics, uint32_t *&from, uint32_t *&to, COUNTER_TYP size) noexcept { }
|
||||
};
|
||||
|
||||
/* SORT */
|
||||
|
||||
/**
|
||||
* Example: A simple "vector-giver" which provides a static thread_local that is reused
|
||||
*
|
||||
* This is to be used when you will call sort many times successively!
|
||||
* If you forget to garbage-collect manually, use a VectorGiverHeap.
|
||||
*
|
||||
* XXX - BEWARE: This gives references - that is also acceptable and supported!
|
||||
*
|
||||
* This is thread-safe (the Heap one also).
|
||||
*/
|
||||
struct VectorGiverWithReuse {
|
||||
/**
|
||||
* Give a reference to the vector to use as temporary.
|
||||
* Will be resized, is reused so "leaks" memory to be the biggest sorted array size, but you can "Gc()".
|
||||
*
|
||||
* @param s The given vector should have this size.
|
||||
* @param gc OPTIONAL: When true, we create a new empty shared vector. This saves memory after a big sort!
|
||||
* @returns A reference that never goes out of scope!
|
||||
*/
|
||||
static inline __attribute__((always_inline)) std::vector<uint32_t> &Give(size_t s, const bool gc = false) noexcept {
|
||||
static thread_local std::vector<uint32_t> arc(s); // saves time on first call to have size here!
|
||||
if(gc) { arc = std::vector<uint32_t>(); } // by default optimized out!
|
||||
arc.resize(s); // JHP
|
||||
// Safe because of static it will not go out of scope
|
||||
return arc; // just a reference - no copy!
|
||||
}
|
||||
|
||||
/** Release memory back to zero. After this, the first sort will need memory from heap again. */
|
||||
inline __attribute__((always_inline)) void Gc() noexcept {
|
||||
VectorGiverWithReuse::Give(0, true);
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Example: A simple "vector-giver" which provides new vector from heap.
|
||||
*
|
||||
* This is thread-safe (the VectorGiverWithReuse one also).
|
||||
*/
|
||||
struct VectorGiverHeap {
|
||||
/**
|
||||
* Give a temporary vector which is to be created on heap and freed after sort.
|
||||
*
|
||||
* XXX - BEWARE: Please mind we do not return reference, but value here!
|
||||
* This works because standard ENSURES return value optimization!
|
||||
*
|
||||
* @param s The given vector should have this size.
|
||||
* @param gc OPTIONAL: When true, we create a new empty shared vector. This saves memory after a big sort!
|
||||
* @returns A vector of appropriate size.
|
||||
*/
|
||||
inline __attribute__((always_inline)) std::vector<uint32_t> Give(size_t s) noexcept {
|
||||
return std::vector<uint32_t>(s); // RVO ensured!
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
/*
|
||||
* Sort the given array (in-place sorting) with the given size.
|
||||
*
|
||||
@ -224,13 +172,14 @@ namespace MagyarSort {
|
||||
* Beware: GC needs to happen on all threads that use us!
|
||||
*
|
||||
* @param arr The array to sort. Result will be in the same array - as sorted.
|
||||
* @param size The length of the array.
|
||||
* @param size The length of the array - should fit in the COUNTER_TYP.
|
||||
* @param COUNTER_TYP OPTIONAL: When set this type will be the counter type.
|
||||
* @param REUSE OPTIONAL: When true, we reuse the array instead of always gettin' and releasin' from da heap.
|
||||
* @param GC OPTIONAL: When true, we garbage collect memory from previous sorts if REUSE is true.
|
||||
* @param GC_WITHOUT_SORT OPTIONAL: When true, we "just GC" but do not sort in case of GC is true.
|
||||
*/
|
||||
template<bool REUSE = false, bool GC = false, bool GC_WITHOUT_SORT = false>
|
||||
inline void __attribute__((always_inline)) sort_impl(uint32_t arr[], size_t size) noexcept {
|
||||
template<typename COUNTER_TYP = size_t, bool REUSE = false, bool GC = false, bool GC_WITHOUT_SORT = false>
|
||||
inline void __attribute__((always_inline)) sort_impl(uint32_t arr[], COUNTER_TYP size) noexcept {
|
||||
// Most funny optimization is this multiply here :-)
|
||||
//
|
||||
// Literally.. come on.. this makes it nearly a compile-time, macro-like
|
||||
@ -250,18 +199,18 @@ namespace MagyarSort {
|
||||
|
||||
// Holds "digit" occurences, prefix sums, whatevers
|
||||
// First "DIGIT_RANGE" elem is for MSB "DIGITS", last is for LSB
|
||||
static thread_local size_t radics[DIGITS * DIGIT_RANGE];
|
||||
static thread_local COUNTER_TYP radics[DIGITS * DIGIT_RANGE];
|
||||
memset(radics, 0, sizeof(radics));
|
||||
|
||||
// Calculate occurences of digits
|
||||
countOccurences(arr, size, radics);
|
||||
|
||||
//debugRadics(radics);
|
||||
//debugRadics<COUNTER_TYP>(radics);
|
||||
|
||||
// Calculate prefix sums
|
||||
calcPrefixSums(radics);
|
||||
|
||||
//debugRadics(radics);
|
||||
//debugRadics<COUNTER_TYP>(radics);
|
||||
|
||||
/* Regular (old) radix sort with small twist */
|
||||
|
||||
@ -288,7 +237,7 @@ namespace MagyarSort {
|
||||
uint32_t *from = arr;
|
||||
uint32_t *to = &arc[0];
|
||||
|
||||
RadixMagic<DIGITS - 1>(radics, from, to, size);
|
||||
RadixMagic<DIGITS - 1, COUNTER_TYP>(radics, from, to, size);
|
||||
|
||||
// With an other API we could spare this copy if we can delete original arr and return ptr or something...
|
||||
// I am fine with this... this is not my main idea anyways, just little ILP tweak to regular radix sort
|
||||
@ -306,15 +255,15 @@ namespace MagyarSort {
|
||||
*
|
||||
* @param FORCE OPTIONAL: When true, the gc happens even if MAGYAR_SORT_DEFAULT_REUSE is not defined!
|
||||
*/
|
||||
template<bool FORCE = false>
|
||||
template<bool FORCE = false, typename COUNTER_TYP = size_t>
|
||||
inline void gc() noexcept {
|
||||
if(FORCE) {
|
||||
// Only GC-ing
|
||||
MagyarSort::sort_impl<true, true, true>(nullptr, 0);
|
||||
MagyarSort::sort_impl<COUNTER_TYP, true, true, true>(nullptr, 0);
|
||||
} else {
|
||||
#ifdef MAGYAR_SORT_DEFAULT_REUSE
|
||||
// Only GC-ing
|
||||
MagyarSort::sort_impl<true, true, true>(nullptr, 0);
|
||||
MagyarSort::sort_impl<COUNTER_TYP, true, true, true>(nullptr, 0);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@ -330,14 +279,15 @@ namespace MagyarSort {
|
||||
*
|
||||
* @param arr The array to sort. Result will be in the same array - as sorted.
|
||||
* @param size The length of the array.
|
||||
* @param COUNTER_TYP OPTIONAL: It is best kept as size_t, but for smaller arrays, you can use uint32_t.
|
||||
* @param GC OPTIONAL: When true, we garbage collect before this sort - so cached memory size will be "size" elems.
|
||||
*/
|
||||
template<bool GC = false>
|
||||
inline void __attribute__((always_inline)) sort_reuse(uint32_t arr[], size_t size) noexcept {
|
||||
template<typename COUNTER_TYP = size_t, bool GC = false>
|
||||
inline void __attribute__((always_inline)) sort_reuse(uint32_t arr[], COUNTER_TYP size) noexcept {
|
||||
// Reuse the temporary vectors across runs
|
||||
// This results in much less heap allocations and much faster on gcc
|
||||
// and also a bit faster on clang too.
|
||||
MagyarSort::sort_impl<true>(arr, size);
|
||||
MagyarSort::sort_impl<COUNTER_TYP, true>(arr, size);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -349,11 +299,13 @@ namespace MagyarSort {
|
||||
*
|
||||
* @param arr The array to sort. Result will be in the same array - as sorted.
|
||||
* @param size The length of the array.
|
||||
* @param COUNTER_TYP OPTIONAL: It is best kept as size_t, but for smaller arrays, you can use uint32_t.
|
||||
*/
|
||||
template<typename COUNTER_TYP = size_t>
|
||||
inline void __attribute__((always_inline)) sort_no_reuse(uint32_t arr[], size_t size) noexcept {
|
||||
// We use the heap once per every call...
|
||||
// This is safer and we do not need garbage collecting
|
||||
MagyarSort::sort_impl(arr, size);
|
||||
MagyarSort::sort_impl<COUNTER_TYP>(arr, size);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -368,12 +320,14 @@ namespace MagyarSort {
|
||||
*
|
||||
* @param arr The array to sort. Result will be in the same array - as sorted.
|
||||
* @param size The length of the array.
|
||||
* @param COUNTER_TYP OPTIONAL: It is best kept as size_t, but for smaller arrays, you can use uint32_t.
|
||||
*/
|
||||
template<typename COUNTER_TYP = size_t>
|
||||
inline void sort(uint32_t arr[], size_t size) noexcept {
|
||||
#ifdef MAGYAR_SORT_DEFAULT_REUSE
|
||||
MagyarSort::sort_reuse(arr, size);
|
||||
MagyarSort::sort_reuse<COUNTER_TYP>(arr, size);
|
||||
#else
|
||||
MagyarSort::sort_no_reuse(arr, size);
|
||||
MagyarSort::sort_no_reuse<COUNTER_TYP>(arr, size);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user