mormordsort - buggy version (I actually think it's a variant of Magyarsort 2.x in this form, but it needs fixing)
This commit is contained in:
parent
6426560519
commit
55583bcb4a
204
magyarsort.h
204
magyarsort.h
@ -66,11 +66,11 @@ namespace MagyarSort {
|
||||
}
|
||||
|
||||
template<typename COUNTER_TYP>
|
||||
void debugRadics(COUNTER_TYP *radics) {
|
||||
void debugRadics(COUNTER_TYP *magics) {
|
||||
for(size_t j = 0; j < DIGITS; ++j) {
|
||||
printf("d%zu: ", j);
|
||||
for(int i = 0; i < DIGIT_RANGE; ++i) {
|
||||
printf("%zu,", radics[i + DIGIT_RANGE*j]);
|
||||
printf("%zu,", magics[i + DIGIT_RANGE*j]);
|
||||
}
|
||||
printf("\n\n");
|
||||
}
|
||||
@ -89,16 +89,16 @@ namespace MagyarSort {
|
||||
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
|
||||
template<int DIGIT, typename COUNTER_TYP>
|
||||
struct OccurenceMagic : public OccurenceMagic<DIGIT - 1, COUNTER_TYP> {
|
||||
inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *radicsOut) noexcept
|
||||
: OccurenceMagic<DIGIT - 1 ,COUNTER_TYP>(arr, i, radicsOut) {
|
||||
inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *magicsOut) noexcept
|
||||
: OccurenceMagic<DIGIT - 1 ,COUNTER_TYP>(arr, i, magicsOut) {
|
||||
// Parents run first so template recursion runs DIGIT=0 first...
|
||||
++radicsOut[getDigit<DIGIT>(arr[i]) + DIGIT_RANGE * DIGIT];
|
||||
++magicsOut[getDigit<DIGIT>(arr[i]) + DIGIT_RANGE * DIGIT];
|
||||
}
|
||||
};
|
||||
/** Ends template recursion */
|
||||
template<typename COUNTER_TYP>
|
||||
struct OccurenceMagic<-1, COUNTER_TYP> {
|
||||
inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *radicsOut) noexcept {}
|
||||
inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *magicsOut) noexcept {}
|
||||
};
|
||||
|
||||
/** ARR_END must be an (STEP * k) */
|
||||
@ -116,7 +116,7 @@ namespace MagyarSort {
|
||||
};
|
||||
|
||||
template<typename COUNTER_TYP>
|
||||
static inline void countOccurences(uint32_t arr[], COUNTER_TYP size, COUNTER_TYP *radicsOut) noexcept {
|
||||
static inline void countOccurences(uint32_t arr[], COUNTER_TYP size, COUNTER_TYP *magicsOut) noexcept {
|
||||
COUNTER_TYP i = 0;
|
||||
//#pragma GCC unroll 4
|
||||
for(; i < size - 64; i += 64) {
|
||||
@ -124,131 +124,131 @@ namespace MagyarSort {
|
||||
//__builtin_prefetch(&arr[i + (1 * 16)], 0, 2); // r, L2 or L3 cache
|
||||
__builtin_prefetch(&arr[i + (1 * 16)]);
|
||||
// Creates no object, struct is empty
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 1, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 2, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 3, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 4, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 5, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 6, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 7, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 8, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 9, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 10, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 11, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 12, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 13, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 14, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 15, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 1, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 2, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 3, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 4, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 5, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 6, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 7, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 8, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 9, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 10, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 11, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 12, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 13, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 14, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 15, magicsOut);
|
||||
// Prefetch for read level-1 cache
|
||||
__builtin_prefetch(&arr[i + (2 * 16)]);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 16, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 17, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 18, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 19, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 20, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 21, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 22, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 23, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 24, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 25, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 26, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 27, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 28, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 29, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 30, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 31, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 16, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 17, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 18, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 19, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 20, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 21, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 22, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 23, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 24, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 25, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 26, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 27, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 28, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 29, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 30, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 31, magicsOut);
|
||||
__builtin_prefetch(&arr[i + (3 * 16)]);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 32, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 33, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 34, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 35, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 36, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 37, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 38, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 39, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 40, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 41, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 42, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 43, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 44, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 45, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 46, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 47, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 32, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 33, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 34, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 35, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 36, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 37, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 38, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 39, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 40, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 41, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 42, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 43, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 44, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 45, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 46, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 47, magicsOut);
|
||||
// __builtin_prefetch(&arr[i + (4 * 16)]); // Only needed for longer than 64 unrolls
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 48, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 49, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 50, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 51, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 52, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 53, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 54, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 55, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 56, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 57, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 58, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 59, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 60, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 61, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 62, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 63, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 48, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 49, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 50, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 51, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 52, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 53, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 54, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 55, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 56, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 57, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 58, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 59, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 60, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 61, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 62, magicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 63, magicsOut);
|
||||
}
|
||||
|
||||
#pragma GCC unroll 4
|
||||
for(; i < size; ++i) {
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i, radicsOut);
|
||||
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i, magicsOut);
|
||||
}
|
||||
}
|
||||
|
||||
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
|
||||
template<int DIGIT, typename COUNTER_TYP>
|
||||
struct PrefixMagic : public PrefixMagic<DIGIT - 1, COUNTER_TYP> {
|
||||
inline __attribute__((always_inline)) PrefixMagic(COUNTER_TYP *radics, COUNTER_TYP *prev, int i) noexcept
|
||||
: PrefixMagic<DIGIT - 1, COUNTER_TYP>(radics, prev, i) {
|
||||
inline __attribute__((always_inline)) PrefixMagic(COUNTER_TYP *magics, COUNTER_TYP *prev, int i) noexcept
|
||||
: PrefixMagic<DIGIT - 1, COUNTER_TYP>(magics, prev, i) {
|
||||
static constexpr int DSTART = (DIGIT * DIGIT_RANGE);
|
||||
radics[DSTART + i] += prev[DIGIT];
|
||||
prev[DIGIT] = radics[DSTART + i];
|
||||
magics[DSTART + i] += prev[DIGIT];
|
||||
prev[DIGIT] = magics[DSTART + i];
|
||||
}
|
||||
};
|
||||
/** Ends template recursion */
|
||||
template<typename COUNTER_TYP>
|
||||
struct PrefixMagic<-1, COUNTER_TYP> {
|
||||
inline PrefixMagic(COUNTER_TYP *radics, COUNTER_TYP *prev, int i) noexcept {}
|
||||
inline PrefixMagic(COUNTER_TYP *magics, COUNTER_TYP *prev, int i) noexcept {}
|
||||
};
|
||||
|
||||
/** Gets REFERENCE to the given digit from the radix-array that has more than one digits */
|
||||
template<int DIGIT, typename COUNTER_TYP>
|
||||
static inline __attribute__((always_inline)) COUNTER_TYP &rGet(COUNTER_TYP *radics, int i) noexcept {
|
||||
static inline __attribute__((always_inline)) COUNTER_TYP &rGet(COUNTER_TYP *magics, int i) noexcept {
|
||||
static constexpr int DSTART = (DIGIT * DIGIT_RANGE);
|
||||
return radics[DSTART + i];
|
||||
return magics[DSTART + i];
|
||||
}
|
||||
|
||||
/** Helper for calcPrefixSums */
|
||||
template<int DIGIT, typename COUNTER_TYP>
|
||||
struct PMagic2 : public PMagic2<DIGIT - 1, COUNTER_TYP> {
|
||||
inline __attribute__((always_inline)) PMagic2(COUNTER_TYP *radics, COUNTER_TYP *prev)
|
||||
: PMagic2<DIGIT - 1, COUNTER_TYP>(radics, prev) {
|
||||
inline __attribute__((always_inline)) PMagic2(COUNTER_TYP *magics, COUNTER_TYP *prev)
|
||||
: PMagic2<DIGIT - 1, COUNTER_TYP>(magics, prev) {
|
||||
// Again first the 0th digit because of parent constructors!
|
||||
// This is a template-unrolled loop too
|
||||
PrefixMagic<DIGITS - 1, COUNTER_TYP>(radics, prev, DIGIT);
|
||||
PrefixMagic<DIGITS - 1, COUNTER_TYP>(magics, prev, DIGIT);
|
||||
}
|
||||
};
|
||||
|
||||
/** Template recursion endpoint */
|
||||
template<typename COUNTER_TYP>
|
||||
struct PMagic2<-1, COUNTER_TYP> {
|
||||
inline __attribute__((always_inline)) PMagic2(COUNTER_TYP *radics, COUNTER_TYP *prev) {}
|
||||
inline __attribute__((always_inline)) PMagic2(COUNTER_TYP *magics, COUNTER_TYP *prev) {}
|
||||
};
|
||||
|
||||
template<typename COUNTER_TYP>
|
||||
static inline void calcPrefixSums(COUNTER_TYP *radics) noexcept {
|
||||
static inline void calcPrefixSums(COUNTER_TYP *magics) noexcept {
|
||||
static thread_local COUNTER_TYP prev[DIGITS];
|
||||
memset(prev, 0, sizeof(prev));
|
||||
|
||||
// This is a template-unrolled loop too
|
||||
if constexpr (DIGIT_RANGE < 1024) {
|
||||
// Extra optimization for bytes and nibbles - totally unrolled loop!
|
||||
PMagic2<DIGIT_RANGE - 1, COUNTER_TYP>(radics, prev);
|
||||
PMagic2<DIGIT_RANGE - 1, COUNTER_TYP>(magics, prev);
|
||||
} else {
|
||||
// The above would not work for words and higher up...
|
||||
#pragma GCC unroll 16
|
||||
@ -257,8 +257,8 @@ namespace MagyarSort {
|
||||
#pragma GCC unroll 64
|
||||
for(int i = 0; i < DIGIT_RANGE; ++i) {
|
||||
int DSTART = (j * DIGIT_RANGE);
|
||||
radics[DSTART + i] += prev[j];
|
||||
prev[j] = radics[DSTART + i];
|
||||
magics[DSTART + i] += prev[j];
|
||||
prev[j] = magics[DSTART + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -267,8 +267,8 @@ namespace MagyarSort {
|
||||
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
|
||||
template<int DIGIT, typename COUNTER_TYP>
|
||||
struct RadixMagic : public RadixMagic<DIGIT - 1, COUNTER_TYP> {
|
||||
inline __attribute__((always_inline)) RadixMagic(bool &swapped, COUNTER_TYP *radics, uint32_t *from, uint32_t *to, COUNTER_TYP size) noexcept
|
||||
: RadixMagic<DIGIT - 1, COUNTER_TYP>(swapped, radics, from, to, size) {
|
||||
inline __attribute__((always_inline)) RadixMagic(bool &swapped, COUNTER_TYP *magics, uint32_t *from, uint32_t *to, COUNTER_TYP size) noexcept
|
||||
: RadixMagic<DIGIT - 1, COUNTER_TYP>(swapped, magics, from, to, size) {
|
||||
// Tricky: see (**)
|
||||
if(swapped) { // never true for DIGIT 0, see (***)
|
||||
std::swap(from, to);
|
||||
@ -288,7 +288,7 @@ namespace MagyarSort {
|
||||
// Get num and its new offset / location
|
||||
auto num = from[i - 1];
|
||||
auto digVal = getDigit<DIGIT>(num);
|
||||
auto offset = (--rGet<DIGIT>(radics, digVal));
|
||||
auto offset = (--rGet<DIGIT>(magics, digVal));
|
||||
|
||||
// Add to the proper target location
|
||||
to[offset] = num;
|
||||
@ -305,7 +305,7 @@ namespace MagyarSort {
|
||||
/** Ends template recursion */
|
||||
template<typename COUNTER_TYP>
|
||||
struct RadixMagic<-1, COUNTER_TYP> {
|
||||
inline RadixMagic(bool swapped, COUNTER_TYP *radics, uint32_t *&from, uint32_t *&to, COUNTER_TYP size) noexcept {}
|
||||
inline RadixMagic(bool swapped, COUNTER_TYP *magics, uint32_t *&from, uint32_t *&to, COUNTER_TYP size) noexcept {}
|
||||
};
|
||||
|
||||
/* SORT */
|
||||
@ -349,26 +349,26 @@ namespace MagyarSort {
|
||||
|
||||
// Holds "digit" occurences, prefix sums, whatevers
|
||||
// First "DIGIT_RANGE" elem is for MSB "DIGITS", last is for LSB
|
||||
static thread_local COUNTER_TYP radics[DIGITS * DIGIT_RANGE];
|
||||
static thread_local COUNTER_TYP magics[DIGITS * DIGIT_RANGE];
|
||||
#ifndef NO_MLOCK
|
||||
mlock(radics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP));
|
||||
mlock(magics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP));
|
||||
#endif // !NO_MLOCK
|
||||
// Write prefetchin'
|
||||
//__builtin_prefetch(&radicsOut[..], 1);
|
||||
//__builtin_prefetch(&magicsOut[..], 1);
|
||||
if constexpr (DIGIT_RANGE <= 1024) {
|
||||
PrefetchMagic<DIGITS * DIGIT_RANGE, (64/sizeof(COUNTER_TYP)), COUNTER_TYP, 1/*w*/> pm(radics);
|
||||
PrefetchMagic<DIGITS * DIGIT_RANGE, (64/sizeof(COUNTER_TYP)), COUNTER_TYP, 1/*w*/> pm(magics);
|
||||
}
|
||||
memset(radics, 0, sizeof(radics));
|
||||
memset(magics, 0, sizeof(magics));
|
||||
|
||||
// Calculate occurences of digits
|
||||
countOccurences(arr, size, radics);
|
||||
countOccurences(arr, size, magics);
|
||||
|
||||
//debugRadics<COUNTER_TYP>(radics);
|
||||
//debugRadics<COUNTER_TYP>(magics);
|
||||
|
||||
// Calculate prefix sums
|
||||
calcPrefixSums(radics);
|
||||
calcPrefixSums(magics);
|
||||
|
||||
//debugRadics<COUNTER_TYP>(radics);
|
||||
//debugRadics<COUNTER_TYP>(magics);
|
||||
|
||||
/* Regular (old) radix sort with small twist */
|
||||
|
||||
@ -397,7 +397,7 @@ namespace MagyarSort {
|
||||
static thread_local bool swapped;
|
||||
swapped = false; // must be separate line
|
||||
|
||||
RadixMagic<DIGITS - 1, COUNTER_TYP> r(swapped, radics, from, to, size);
|
||||
RadixMagic<DIGITS - 1, COUNTER_TYP> r(swapped, magics, from, to, size);
|
||||
|
||||
// With an other API we could spare this copy if we can delete original arr and return ptr or something...
|
||||
// I am fine with this... this is not my main idea anyways, just little ILP tweak to regular radix sort
|
||||
@ -406,7 +406,7 @@ namespace MagyarSort {
|
||||
memcpy(arr, to, size);
|
||||
}
|
||||
#ifndef NO_MLOCK
|
||||
munlock(radics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP));
|
||||
munlock(magics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP));
|
||||
munlock(&arc[0], size * sizeof(uint32_t));
|
||||
munlock(arr, size * sizeof(uint32_t));
|
||||
#endif // !NO_MLOCK
|
||||
|
||||
6
test.cpp
6
test.cpp
@ -9,8 +9,8 @@
|
||||
//#define INPUT_MOD (65536*128)
|
||||
|
||||
// Number of input elements to generate - unused when CREEL is defined!
|
||||
//#define SORT_WIDTH 200000000
|
||||
#define SORT_WIDTH 40000000
|
||||
#define SORT_WIDTH 100000000
|
||||
//#define SORT_WIDTH 40000000
|
||||
// Uncomment this to use nibbles as digits and not bytes - CREEL defines this anyways
|
||||
//#define MAGYAR_SORT_NIBBLE
|
||||
|
||||
@ -18,7 +18,7 @@
|
||||
//#define PRINT_OUTPUT
|
||||
|
||||
// Uncomment if you want to see how many input elements are unique and how many are duplicates (debugging info)
|
||||
#define COUNT_DUPLICANTS
|
||||
// #define COUNT_DUPLICANTS
|
||||
|
||||
//#define SKA_SORT
|
||||
|
||||
|
||||
107
ypsu.cpp
107
ypsu.cpp
@ -33,9 +33,17 @@ void measure(const std::string &inputtype, const std::string &name,
|
||||
worst[name] = std::max(worst[name], seconds);
|
||||
}
|
||||
std::vector<std::string> inputtypes = {
|
||||
/*"constant", "asc", "desc", "ascasc", "ascdesc",
|
||||
"descasc", "descdesc", "smallrange",*/
|
||||
"rand",
|
||||
/*
|
||||
"constant"
|
||||
"asc"
|
||||
"desc"
|
||||
"ascasc"
|
||||
"ascdesc",
|
||||
"descasc"
|
||||
"descdesc"
|
||||
"rand",
|
||||
*/
|
||||
"smallrange",
|
||||
};
|
||||
std::vector<uint32_t> geninput(const std::string &type, int n) {
|
||||
std::vector<uint32_t> v(n);
|
||||
@ -113,6 +121,84 @@ void twopass(uint32_t *a, int n) {
|
||||
for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 16]++] = buf[i];
|
||||
free(buf);
|
||||
}
|
||||
|
||||
// TODO: zssort (quicksort improvement)
|
||||
|
||||
// mormord — Today at 2:27 AM
|
||||
// 1 2 2 2 3
|
||||
//
|
||||
// 0 1 2 3 4
|
||||
// |1|2 2 3 2
|
||||
// 1|2|2 3 2
|
||||
// 1|3|2 2 2
|
||||
// 1|2|2 2 3
|
||||
// 1|2|2 2 3
|
||||
// 1 2|2|2 3
|
||||
// ^
|
||||
// Pivot
|
||||
//
|
||||
// Claim: the elements to the left of the pivot are already in their final sorted positions
|
||||
//
|
||||
// Start indexes = prefix sum - 1 (the last slot belonging to each element value)
|
||||
//
|
||||
// If the index determined as the pivot element's new place > pivot_index (the position of |.|)
|
||||
// swap
|
||||
// --index
|
||||
// otherwise
|
||||
// ++pivot_index
|
||||
/** Extracts byte number j of elem (j = 0 is the least significant byte) as a radix digit. */
static inline uint32_t morgrab(uint32_t elem, uint32_t j) noexcept {
  return (elem >> (8 * j)) & 0xff;
}

/**
 * In-place MSD radix sort of a[0..n): partitions by byte j, then recurses into
 * every bucket with byte j - 1 until byte 0 has been processed.
 *
 * Partitioning idea (mormord): every bucket is filled from its END towards its
 * start. The element under the pivot is swapped into the last free slot of its
 * own bucket as long as that slot lies to the right of the pivot; otherwise the
 * element is already at its place and the pivot advances. Invariant: everything
 * to the left of the pivot is in its final bucket.
 *
 * @param a  array to sort (modified in place)
 * @param n  number of elements in a; values below 2 are a no-op
 * @param j  index of the byte to partition on (3 = most significant for uint32_t)
 */
static inline void mormord_sort_impl(uint32_t *a, int n, int j) noexcept {
  // Fewer than two elements are already ordered; this also rejects n <= 0 so the
  // unsigned loop bounds below can never come from a negative value.
  if(n < 2) return;
  const uint32_t count = static_cast<uint32_t>(n);
  const uint32_t digit = static_cast<uint32_t>(j);

  // Occurrence count, turned into inclusive prefix sums:
  // ends[d] is one past the last slot of bucket d, ends[d - 1] is its first slot.
  uint32_t ends[256] = { 0 };
  for(uint32_t i = 0; i < count; ++i) {
    ++ends[morgrab(a[i], digit)];
  }
  for(uint32_t d = 1; d < 256; ++d) {
    ends[d] += ends[d - 1];
  }

  // next[d] is one past the next free slot of bucket d (buckets fill downwards).
  uint32_t next[256];
  for(uint32_t d = 0; d < 256; ++d) {
    next[d] = ends[d];
  }

  // In-place permutation
  uint32_t pivoti = 0;
  while(pivoti < count) {
    const uint32_t radixval = morgrab(a[pivoti], digit);
    // next[radixval] is >= 1 here: it starts at the (non-zero) bucket end and is
    // only ever lowered to a target that was strictly greater than the pivot.
    const uint32_t targeti = next[radixval] - 1;
    if(targeti > pivoti) {
      // Move the element to the last free slot of its bucket and look at
      // whatever came back to the pivot position in the next iteration.
      const uint32_t tmp = a[pivoti];
      a[pivoti] = a[targeti];
      a[targeti] = tmp;
      --next[radixval];
    } else {
      // Element already sits inside the settled part of its bucket.
      ++pivoti;
    }
  }

  // Ends recursion: the least significant byte has just been processed
  if(j == 0) return;

  // Recurse into every bucket holding at least two elements.
  // The bucket range MUST come from the prefix sums: next[d] ends up at either
  // the bucket start or start + 1 depending on how the bucket was filled, so
  // deriving the range from it can pull in the previous bucket's last element.
  uint32_t start = 0;
  for(uint32_t d = 0; d < 256; ++d) {
    const uint32_t end = ends[d];
    if(end - start > 1) {
      mormord_sort_impl(a + start, static_cast<int>(end - start), j - 1);
    }
    start = end;
  }
}

/**
 * Sorts a[0..n) ascending, in place, with the mormord MSD byte-radix sort.
 *
 * @param a  array to sort
 * @param n  number of elements in a
 */
static inline void mormord_sort(uint32_t *a, int n) noexcept {
  // Same guard as fourpass(): do the multiplication in 64 bits so it cannot wrap.
  assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
  mormord_sort_impl(a, n, 3);
}
|
||||
|
||||
void fourpass(uint32_t *a, int n) {
|
||||
assert(n * int64_t(sizeof(a[0])) <= INT_MAX);
|
||||
// alloc helper buffers.
|
||||
@ -510,11 +596,11 @@ void measure_single(int n) {
|
||||
int main(void) {
|
||||
//int n = 100000000;
|
||||
//int n = 10000000;
|
||||
int n = 1000000;
|
||||
//int n = 1000000;
|
||||
//int n = 100000;
|
||||
//int n = 10000;
|
||||
//int n = 100;
|
||||
//int n = 10;
|
||||
int n = 10;
|
||||
|
||||
printf("Sorting %d elements:\n\n", n);
|
||||
|
||||
@ -532,7 +618,6 @@ int main(void) {
|
||||
measure(inputtype, "std", [&] { std::sort(std::begin(w), std::end(w)); });
|
||||
expected = w;
|
||||
w = v;
|
||||
/*
|
||||
measure(inputtype, "ska", [&] { ska_sort(std::begin(w), std::end(w)); });
|
||||
w = v;
|
||||
measure(inputtype, "ska_copy", [&] {
|
||||
@ -541,9 +626,15 @@ int main(void) {
|
||||
w.swap(buf);
|
||||
}
|
||||
});
|
||||
/*
|
||||
w = v;
|
||||
measure(inputtype, "magyar", [&] { MagyarSort::sort<uint32_t>(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
*/
|
||||
w = v;
|
||||
measure(inputtype, "mormord", [&] { mormord_sort(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
/*
|
||||
|
||||
w = v;
|
||||
measure(inputtype, "2pass", [&] { twopass(&w[0], w.size()); });
|
||||
@ -563,17 +654,16 @@ int main(void) {
|
||||
w = v;
|
||||
measure(inputtype, "sp", [&] { spsort(&w[0], w.size()); });
|
||||
assert(w == expected);*/
|
||||
/*
|
||||
w = v;
|
||||
measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
/*
|
||||
w = v;
|
||||
measure(inputtype, "magbuck", [&] { magyar_bucket_sort(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
w = v;
|
||||
measure(inputtype, "magbuck2", [&] { magyar_bucket_sort2(&w[0], w.size()); });
|
||||
assert(w == expected);
|
||||
*/
|
||||
w = v;
|
||||
w = {10, 20, 20};
|
||||
measure(inputtype, "qsmine", [&] { thier_quicksort(&w[0], w.size()); });
|
||||
@ -597,6 +687,7 @@ int main(void) {
|
||||
}
|
||||
}
|
||||
assert(w == expected);
|
||||
*/
|
||||
/*
|
||||
w = v;
|
||||
measure(inputtype, "frewr", [&] { frewr(&w[0], w.size()); });
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user