basic prefetch optimizations

This commit is contained in:
Richard Thier 2021-12-18 01:23:06 +01:00
parent e5d4ff74ad
commit e7b677e4db

View File

@ -88,16 +88,30 @@ namespace MagyarSort {
/**
 * Ends template recursion.
 *
 * Partial specialization of OccurenceMagic for a first template argument of -1.
 * Its constructor ignores every argument and has an empty body, so constructing
 * it performs no work. The struct declares no data members.
 *
 * NOTE(review): the primary template is not visible here; presumably it counts
 * its first argument down from DIGITS - 1 until it reaches this terminator.
 *
 * Only one constructor is declared: two overloads with the same signature in
 * one class would be an ill-formed redeclaration.
 */
template<typename COUNTER_TYP>
struct OccurenceMagic<-1, COUNTER_TYP> {
    // Forced inline so the terminating step never shows up as a real call.
    inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *radicsOut) noexcept {}
};
/**
 * Compile-time unrolled prefetch of the array range [0, ARR_END).
 *
 * Constructing this object issues one __builtin_prefetch for each of the
 * addresses &arr[0], &arr[STEP], ..., &arr[ARR_END - STEP], in ascending order
 * (the base-class constructor runs before this level's own prefetch).
 * The struct has no data members; it exists only for its constructor.
 *
 * @tparam ARR_END   One past the last element index to cover. Must be a
 *                   positive multiple of STEP (ARR_END == STEP * k); this is
 *                   enforced with a static_assert.
 * @tparam STEP      Distance, in elements, between two prefetched addresses.
 * @tparam ARR_T     Element type of the array.
 * @tparam R_OR_W    Passed as the "rw" argument of __builtin_prefetch:
 *                   0 prepares for a read, 1 prepares for a write.
 * @tparam LOCALITY  Passed as the "locality" argument of __builtin_prefetch
 *                   (0..3; 3 is best, 0 worst).
 */
template<int ARR_END, int STEP, typename ARR_T, int R_OR_W = 0 /* 0:R, 1:W */, int LOCALITY = 3 /* 3 is best, 0 worst*/>
struct PrefetchMagic
    // For invalid arguments the base collapses directly to the <0, ...> terminator,
    // so the static_assert below is reported instead of the recursion running
    // past zero until the compiler's template instantiation depth limit.
    : public PrefetchMagic<((STEP > 0 && ARR_END > 0 && (ARR_END % STEP) == 0) ? (ARR_END - STEP) : 0),
                           STEP, ARR_T, R_OR_W, LOCALITY> {
    static_assert(STEP > 0 && ARR_END > 0 && (ARR_END % STEP) == 0,
                  "PrefetchMagic: ARR_END must be a positive multiple of STEP");

    /** The next recursion level; covers [0, ARR_END - STEP). */
    using Next = PrefetchMagic<((STEP > 0 && ARR_END > 0 && (ARR_END % STEP) == 0) ? (ARR_END - STEP) : 0),
                               STEP, ARR_T, R_OR_W, LOCALITY>;

    /**
     * Prefetches the lower part of the range through the base class first,
     * then the last STEP-sized chunk of this level.
     *
     * @param arr Pointer to the first element of the array to prefetch.
     */
    inline __attribute__((always_inline)) PrefetchMagic(ARR_T *arr) noexcept
        : Next(arr) {
        __builtin_prefetch(&arr[ARR_END - STEP], R_OR_W, LOCALITY);
    }
};

/** Ends the PrefetchMagic recursion: nothing is left to prefetch at ARR_END == 0. */
template<int STEP, typename ARR_T, int R_OR_W, int LOCALITY>
struct PrefetchMagic<0, STEP, ARR_T, R_OR_W, LOCALITY> {
    inline __attribute__((always_inline)) PrefetchMagic(ARR_T *arr) noexcept {}
};
template<typename COUNTER_TYP>
static inline void countOccurences(uint32_t arr[], COUNTER_TYP size, COUNTER_TYP *radicsOut) noexcept {
// #pragma GCC unroll 64
COUNTER_TYP i = 0;
// #pragma GCC unroll 4
for(; i < size - 64; i += 64) {
// Prefetch caches
//__builtin_prefetch(&arr[i + 64]);
// Prefetch for read level-1 cache
__builtin_prefetch(&arr[i + (1 * 16)]);
// Creates no object, struct is empty
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 1, radicsOut);
@ -115,6 +129,8 @@ namespace MagyarSort {
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 13, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 14, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 15, radicsOut);
// Prefetch for read level-1 cache
__builtin_prefetch(&arr[i + (2 * 16)]);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 16, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 17, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 18, radicsOut);
@ -131,6 +147,7 @@ namespace MagyarSort {
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 29, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 30, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 31, radicsOut);
__builtin_prefetch(&arr[i + (3 * 16)]);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 32, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 33, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 34, radicsOut);
@ -147,6 +164,7 @@ namespace MagyarSort {
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 45, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 46, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 47, radicsOut);
// __builtin_prefetch(&arr[i + (4 * 16)]); // Only needed for longer than 64 unrolls
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 48, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 49, radicsOut);
OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i + 50, radicsOut);
@ -296,6 +314,9 @@ namespace MagyarSort {
// Holds the per-"digit" occurrence counts, prefix sums, and similar bookkeeping.
// The first DIGIT_RANGE elements belong to the MSB digit, the last DIGIT_RANGE to the LSB.
static thread_local COUNTER_TYP radics[DIGITS * DIGIT_RANGE];
// Prefetch the counter array for writing
//__builtin_prefetch(&radicsOut[..], 1);
PrefetchMagic<DIGITS * DIGIT_RANGE, (64/sizeof(COUNTER_TYP)), COUNTER_TYP, 1/*w*/> pm(radics);
memset(radics, 0, sizeof(radics));
// Calculate occurences of digits