diff --git a/magyarsort.h b/magyarsort.h index a5eff63..8a189d5 100644 --- a/magyarsort.h +++ b/magyarsort.h @@ -88,16 +88,30 @@ namespace MagyarSort { /** Ends template recursion */ template struct OccurenceMagic<-1, COUNTER_TYP> { - inline OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *radicsOut) noexcept {} + inline __attribute__((always_inline)) OccurenceMagic(uint32_t arr[], COUNTER_TYP i, COUNTER_TYP *radicsOut) noexcept {} + }; + + /** Compile-time unrolled prefetch: the base-class constructor (ARR_END - STEP) runs first, then this level issues __builtin_prefetch on &arr[ARR_END - STEP] with R_OR_W and LOCALITY, so &arr[0], &arr[STEP], ..., &arr[ARR_END - STEP] are prefetched in ascending order. The recursion stops only at the ARR_END == 0 specialization, so ARR_END must be a multiple of STEP (ARR_END == STEP * k). */ + template + struct PrefetchMagic : public PrefetchMagic<(ARR_END - STEP), STEP, ARR_T, R_OR_W, LOCALITY> { + inline __attribute__((always_inline)) PrefetchMagic(ARR_T *arr) noexcept + : PrefetchMagic<(ARR_END - STEP), STEP, ARR_T, R_OR_W, LOCALITY>(arr) { + __builtin_prefetch(&arr[ARR_END - STEP], R_OR_W, LOCALITY); + } + }; + + template + struct PrefetchMagic<0, STEP, ARR_T, R_OR_W, LOCALITY> { + inline __attribute__((always_inline)) PrefetchMagic(ARR_T *arr) noexcept {} }; template static inline void countOccurences(uint32_t arr[], COUNTER_TYP size, COUNTER_TYP *radicsOut) noexcept { - // #pragma GCC unroll 64 COUNTER_TYP i = 0; + // #pragma GCC unroll 4 for(; i < size - 64; i += 64) { - // Prefetch caches - //__builtin_prefetch(&arr[i + 64]); + // Prefetch (read, default rw/locality arguments) the data 16 elements past i. NOTE(review): the loop bound `size - 64` would wrap around if COUNTER_TYP is unsigned and size < 64; confirm callers guarantee size >= 64 + __builtin_prefetch(&arr[i + (1 * 16)]); // Creates no object, struct is empty OccurenceMagic(arr, i, radicsOut); OccurenceMagic(arr, i + 1, radicsOut); @@ -115,6 +129,8 @@ namespace MagyarSort { OccurenceMagic(arr, i + 13, radicsOut); OccurenceMagic(arr, i + 14, radicsOut); OccurenceMagic(arr, i + 15, radicsOut); + // Prefetch (read, default rw/locality arguments) the data 32 elements past i + __builtin_prefetch(&arr[i + (2 * 16)]); OccurenceMagic(arr, i + 16, radicsOut); OccurenceMagic(arr, i + 17, radicsOut); OccurenceMagic(arr, i + 18, radicsOut); @@ -131,6 +147,7 @@ namespace MagyarSort { OccurenceMagic(arr, i + 29, radicsOut); OccurenceMagic(arr, i + 30, radicsOut); OccurenceMagic(arr, i + 31, radicsOut); + __builtin_prefetch(&arr[i + (3 * 
16)]); OccurenceMagic(arr, i + 32, radicsOut); OccurenceMagic(arr, i + 33, radicsOut); OccurenceMagic(arr, i + 34, radicsOut); @@ -147,6 +164,7 @@ namespace MagyarSort { OccurenceMagic(arr, i + 45, radicsOut); OccurenceMagic(arr, i + 46, radicsOut); OccurenceMagic(arr, i + 47, radicsOut); + // __builtin_prefetch(&arr[i + (4 * 16)]); // Only needed when unrolling more than 64 elements per iteration. NOTE(review): in the lines visible here nothing prefetches arr[i + 64] through arr[i + 79], the first 16 elements of the next iteration; confirm that is intended OccurenceMagic(arr, i + 48, radicsOut); OccurenceMagic(arr, i + 49, radicsOut); OccurenceMagic(arr, i + 50, radicsOut); @@ -296,6 +314,9 @@ namespace MagyarSort { // Holds "digit" occurences, prefix sums, whatevers // First "DIGIT_RANGE" elem is for MSB "DIGITS", last is for LSB static thread_local COUNTER_TYP radics[DIGITS * DIGIT_RANGE]; + // Prefetch the radics counter table ahead of the memset below (presumably for write; the PrefetchMagic template arguments are not visible here, confirm) + //__builtin_prefetch(&radicsOut[..], 1); + PrefetchMagic pm(radics); memset(radics, 0, sizeof(radics)); // Calculate occurences of digits