minor speed tweaks by being able to define the counter type

This commit is contained in:
Richard Thier 2021-12-17 21:17:53 +01:00
parent 1686967f10
commit 0b4eb5e5a6
2 changed files with 59 additions and 105 deletions

View File

@ -43,16 +43,17 @@ namespace MagyarSort {
/* DEBUG */
void debugArr(uint32_t *arr, size_t size) {
for(int i = 0; i < size; ++i) {
for(size_t i = 0; i < size; ++i) {
printf("%x, ", arr[i]);
}
printf("\n");
}
void debugRadics(size_t *radics) {
template<typename COUNTER_TYP>
void debugRadics(COUNTER_TYP *radics) {
for(size_t j = 0; j < DIGITS; ++j) {
printf("d%zu: ", j);
for(size_t i = 0; i < DIGIT_RANGE; ++i) {
for(int i = 0; i < DIGIT_RANGE; ++i) {
printf("%zu,", radics[i + DIGIT_RANGE*j]);
}
printf("\n\n");
@ -70,72 +71,74 @@ namespace MagyarSort {
}
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
template<int DIGIT>
struct OccurenceMagic : public OccurenceMagic<DIGIT - 1> {
inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], size_t i, size_t *radicsOut) noexcept
: OccurenceMagic<DIGIT -1 >(arr, i, radicsOut) {
template<int DIGIT, typename COUNTER_TYP>
struct OccurenceMagic : public OccurenceMagic<DIGIT - 1, COUNTER_TYP> {
inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *radicsOut) noexcept
: OccurenceMagic<DIGIT - 1 ,COUNTER_TYP>(arr, i, radicsOut) {
// Parents run first so template recursion runs DIGIT=0 first...
++radicsOut[getDigit<DIGIT>(arr[i]) + DIGIT_RANGE * DIGIT];
}
};
/** Ends template recursion */
template<>
struct OccurenceMagic<-1> {
inline OccurenceMagic(uint32_t arr[], size_t i, size_t *radicsOut) noexcept {}
template<typename COUNTER_TYP>
struct OccurenceMagic<-1, COUNTER_TYP> {
inline OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *radicsOut) noexcept {}
};
static inline void countOccurences(uint32_t arr[], size_t size, size_t *radicsOut) noexcept {
template<typename COUNTER_TYP>
static inline void countOccurences(uint32_t arr[], COUNTER_TYP size, COUNTER_TYP *radicsOut) noexcept {
#pragma GCC unroll 64
for(size_t i = 0; i < size; ++i) {
for(COUNTER_TYP i = 0; i < size; ++i) {
// Creates no object, struct is empty
OccurenceMagic<DIGITS - 1>(arr, i, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i, radicsOut);
}
}
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
template<int DIGIT>
struct PrefixMagic : public PrefixMagic<DIGIT - 1> {
inline __attribute__((always_inline)) PrefixMagic(size_t *radics, size_t *prev, int i) noexcept
: PrefixMagic<DIGIT - 1>(radics, prev, i) {
template<int DIGIT, typename COUNTER_TYP>
struct PrefixMagic : public PrefixMagic<DIGIT - 1, COUNTER_TYP> {
inline __attribute__((always_inline)) PrefixMagic(COUNTER_TYP *radics, COUNTER_TYP *prev, int i) noexcept
: PrefixMagic<DIGIT - 1, COUNTER_TYP>(radics, prev, i) {
static constexpr int DSTART = (DIGIT * DIGIT_RANGE);
radics[DSTART + i] += prev[DIGIT];
prev[DIGIT] = radics[DSTART + i];
}
};
/** Ends template recursion */
template<>
struct PrefixMagic<-1> {
inline PrefixMagic(size_t *radics, size_t *prev, int i) noexcept {}
template<typename COUNTER_TYP>
struct PrefixMagic<-1, COUNTER_TYP> {
inline PrefixMagic(COUNTER_TYP *radics, COUNTER_TYP *prev, int i) noexcept {}
};
/** Gets REFERENCE to the given digit from the radix-array that has more than one digits */
template<int DIGIT>
static inline __attribute__((always_inline)) size_t &rGet(size_t *radics, size_t i) noexcept {
template<int DIGIT, typename COUNTER_TYP>
static inline __attribute__((always_inline)) COUNTER_TYP &rGet(COUNTER_TYP *radics, int i) noexcept {
static constexpr int DSTART = (DIGIT * DIGIT_RANGE);
return radics[DSTART + i];
}
static inline void calcPrefixSums(size_t *radics) noexcept {
static thread_local size_t prev[DIGITS];
template<typename COUNTER_TYP>
static inline void calcPrefixSums(COUNTER_TYP *radics) noexcept {
static thread_local COUNTER_TYP prev[DIGITS];
memset(prev, 0, sizeof(prev));
for(int i = 0; i < DIGIT_RANGE; ++i) {
// This is a template-unrolled loop too
PrefixMagic<DIGITS - 1>(radics, prev, i);
PrefixMagic<DIGITS - 1, COUNTER_TYP>(radics, prev, i);
}
}
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
template<int DIGIT>
struct RadixMagic : public RadixMagic<DIGIT - 1> {
inline __attribute__((always_inline)) RadixMagic(size_t *radics, uint32_t *&from, uint32_t *&to, size_t size) noexcept // BEWARE: "*&" needed to swap pointers..
: RadixMagic<DIGIT - 1>(radics, from, to, size) {
template<int DIGIT, typename COUNTER_TYP>
struct RadixMagic : public RadixMagic<DIGIT - 1, COUNTER_TYP> {
inline __attribute__((always_inline)) RadixMagic(COUNTER_TYP *radics, uint32_t *&from, uint32_t *&to, COUNTER_TYP size) noexcept // BEWARE: "*&" needed to swap pointers..
: RadixMagic<DIGIT - 1, COUNTER_TYP>(radics, from, to, size) {
// DEBUG
//printf("%d before: ", DIGIT);
//debugArr(from, size);
#pragma GCC unroll 64
for(size_t i = size; i > 0; --i) { // right-to-left to ensure already sorted digits order we keep for iterations
for(COUNTER_TYP i = size; i > 0; --i) { // right-to-left to ensure already sorted digits order we keep for iterations
// Get num and its new offset / location
auto num = from[i - 1];
auto digVal = getDigit<DIGIT>(num);
@ -154,68 +157,13 @@ namespace MagyarSort {
}
};
/** Ends template recursion */
template<>
struct RadixMagic<-1> {
inline RadixMagic(size_t *radics, uint32_t *&from, uint32_t *&to, size_t size) noexcept { }
template<typename COUNTER_TYP>
struct RadixMagic<-1, COUNTER_TYP> {
inline RadixMagic(COUNTER_TYP *radics, uint32_t *&from, uint32_t *&to, COUNTER_TYP size) noexcept { }
};
/* SORT */
/**
 * Example "vector-giver" backed by a single static thread_local vector that is reused.
 *
 * Meant for the case where sort is called many times in a row.
 * The buffer is only released by an explicit garbage collection (see Gc());
 * if you tend to forget that, use a VectorGiverHeap instead.
 *
 * XXX - BEWARE: Give() returns a reference - this is also acceptable and supported!
 *
 * The storage is thread_local, so every thread works on its own vector.
 */
struct VectorGiverWithReuse {
    /**
     * Hand out a reference to the vector that serves as the temporary.
     *
     * The vector is resized to the requested length on every call. Because it is
     * reused, its storage is kept between calls unless a garbage collection is requested.
     *
     * @param s The size the returned vector will have.
     * @param gc OPTIONAL: When true, the reused vector is first replaced by a new empty one.
     * @returns A reference to the static thread_local vector (it never goes out of scope).
     */
    static inline __attribute__((always_inline)) std::vector<uint32_t> &Give(size_t s, const bool gc = false) noexcept {
        // Sized at construction, which only happens on the first call of a thread
        static thread_local std::vector<uint32_t> reused(s);

        if(gc) {
            // Swap in a brand new empty vector so the old contents are dropped
            reused = std::vector<uint32_t>();
        }

        // Every call ends up with exactly "s" elements
        reused.resize(s);

        // Static storage: handing out a reference is fine and avoids a copy
        return reused;
    }

    /** Shrink the reused vector back to empty via Give(0, true). */
    inline __attribute__((always_inline)) void Gc() noexcept {
        Give(0, true);
    }
};
/**
 * Example "vector-giver" that creates a new vector on every request.
 *
 * Nothing is shared between calls, so there is nothing to garbage-collect.
 */
struct VectorGiverHeap {
    /**
     * Create the temporary vector for one sort.
     *
     * XXX - BEWARE: This returns a value, not a reference!
     *               The returned expression is a temporary, so no copy is made.
     *
     * @param s The size the returned vector will have.
     * @returns A new vector with "s" elements.
     */
    inline __attribute__((always_inline)) std::vector<uint32_t> Give(size_t s) noexcept {
        using Buffer = std::vector<uint32_t>;
        return Buffer(s); // still a temporary: constructed directly in the caller's object
    }
};
/*
* Sort the given array (in-place sorting) with the given size.
*
@ -224,13 +172,14 @@ namespace MagyarSort {
* Beware: GC needs to happen on all threads that use us!
*
* @param arr The array to sort. Result will be in the same array - as sorted.
* @param size The length of the array.
* @param size The length of the array - should fit in the COUNTER_TYP.
* @param COUNTER_TYP OPTIONAL: When set this type will be the counter type.
* @param REUSE OPTIONAL: When true, we reuse the array instead of always gettin' and releasin' from da heap.
* @param GC OPTIONAL: When true, we garbage collect memory from previous sorts if REUSE is true.
* @param GC_WITHOUT_SORT OPTIONAL: When true, we "just GC" but do not sort in case of GC is true.
*/
template<bool REUSE = false, bool GC = false, bool GC_WITHOUT_SORT = false>
inline void __attribute__((always_inline)) sort_impl(uint32_t arr[], size_t size) noexcept {
template<typename COUNTER_TYP = size_t, bool REUSE = false, bool GC = false, bool GC_WITHOUT_SORT = false>
inline void __attribute__((always_inline)) sort_impl(uint32_t arr[], COUNTER_TYP size) noexcept {
// Most funny optimization is this multiply here :-)
//
// Literally.. come on.. this makes it nearly a compile-time, macro-like
@ -250,18 +199,18 @@ namespace MagyarSort {
// Holds "digit" occurences, prefix sums, whatevers
// First "DIGIT_RANGE" elem is for MSB "DIGITS", last is for LSB
static thread_local size_t radics[DIGITS * DIGIT_RANGE];
static thread_local COUNTER_TYP radics[DIGITS * DIGIT_RANGE];
memset(radics, 0, sizeof(radics));
// Calculate occurences of digits
countOccurences(arr, size, radics);
//debugRadics(radics);
//debugRadics<COUNTER_TYP>(radics);
// Calculate prefix sums
calcPrefixSums(radics);
//debugRadics(radics);
//debugRadics<COUNTER_TYP>(radics);
/* Regular (old) radix sort with small twist */
@ -288,7 +237,7 @@ namespace MagyarSort {
uint32_t *from = arr;
uint32_t *to = &arc[0];
RadixMagic<DIGITS - 1>(radics, from, to, size);
RadixMagic<DIGITS - 1, COUNTER_TYP>(radics, from, to, size);
// With an other API we could spare this copy if we can delete original arr and return ptr or something...
// I am fine with this... this is not my main idea anyways, just little ILP tweak to regular radix sort
@ -306,15 +255,15 @@ namespace MagyarSort {
*
* @param FORCE OPTIONAL: When true, the gc happens even if MAGYAR_SORT_DEFAULT_REUSE is not defined!
*/
template<bool FORCE = false>
template<bool FORCE = false, typename COUNTER_TYP = size_t>
inline void gc() noexcept {
if(FORCE) {
// Only GC-ing
MagyarSort::sort_impl<true, true, true>(nullptr, 0);
MagyarSort::sort_impl<COUNTER_TYP, true, true, true>(nullptr, 0);
} else {
#ifdef MAGYAR_SORT_DEFAULT_REUSE
// Only GC-ing
MagyarSort::sort_impl<true, true, true>(nullptr, 0);
MagyarSort::sort_impl<COUNTER_TYP, true, true, true>(nullptr, 0);
#endif
}
}
@ -330,14 +279,15 @@ namespace MagyarSort {
*
* @param arr The array to sort. Result will be in the same array - as sorted.
* @param size The length of the array.
* @param COUNTER_TYP OPTIONAL: It is best kept as size_t, but for smaller arrays, you can use uint32_t.
* @param GC OPTIONAL: When true, we garbage collect before this sort - so cached memory size will be "size" elems.
*/
template<bool GC = false>
inline void __attribute__((always_inline)) sort_reuse(uint32_t arr[], size_t size) noexcept {
template<typename COUNTER_TYP = size_t, bool GC = false>
inline void __attribute__((always_inline)) sort_reuse(uint32_t arr[], COUNTER_TYP size) noexcept {
// Reuse the temporary vectors across runs
// This results in much less heap allocations and much faster on gcc
// and also a bit faster on clang too.
MagyarSort::sort_impl<true>(arr, size);
MagyarSort::sort_impl<COUNTER_TYP, true>(arr, size);
}
/**
@ -349,11 +299,13 @@ namespace MagyarSort {
*
* @param arr The array to sort. Result will be in the same array - as sorted.
* @param size The length of the array.
* @param COUNTER_TYP OPTIONAL: It is best kept as size_t, but for smaller arrays, you can use uint32_t.
*/
template<typename COUNTER_TYP = size_t>
inline void __attribute__((always_inline)) sort_no_reuse(uint32_t arr[], size_t size) noexcept {
// We use the heap once per every call...
// This is safer and we do not need garbage collecting
MagyarSort::sort_impl(arr, size);
MagyarSort::sort_impl<COUNTER_TYP>(arr, size);
}
/*
@ -368,12 +320,14 @@ namespace MagyarSort {
*
* @param arr The array to sort. Result will be in the same array - as sorted.
* @param size The length of the array.
* @param COUNTER_TYP OPTIONAL: It is best kept as size_t, but for smaller arrays, you can use uint32_t.
*/
template<typename COUNTER_TYP = size_t>
inline void sort(uint32_t arr[], size_t size) noexcept {
#ifdef MAGYAR_SORT_DEFAULT_REUSE
MagyarSort::sort_reuse(arr, size);
MagyarSort::sort_reuse<COUNTER_TYP>(arr, size);
#else
MagyarSort::sort_no_reuse(arr, size);
MagyarSort::sort_no_reuse<COUNTER_TYP>(arr, size);
#endif
}
};

View File

@ -369,7 +369,7 @@
}
});
w = v;
measure(inputtype, "magyar", [&] { MagyarSort::sort(&w[0], w.size()); });
measure(inputtype, "magyar", [&] { MagyarSort::sort<uint32_t>(&w[0], w.size()); });
assert(w == expected);
/*