Minor threepass optimizations, plus a thier2 (thiersort2) variant that uses threepass (although it does unnecessary work in that case: allocation, extra copies, an extra partitioning step, etc.)
This commit is contained in:
parent
a17b284c8a
commit
86f81d2a1c
@ -2,7 +2,8 @@
|
||||
#define THIER_SORT2_H
|
||||
#include <stdint.h>
|
||||
#include "qsort/schwab_sort.h"
|
||||
/* A non-in-place, tricky float-hack based bucket sort variant. Uses schwabsort! */
|
||||
#include "threepass.h"
|
||||
/* A non-in-place, tricky float-hack based bucket sort variant. Uses schwabsort or threepass! */
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define KM_PREFETCH(x)
|
||||
@ -147,8 +148,13 @@ static inline void thiersort2(uint32_t *arr, uint32_t *temparr, int n, sch_rand_
|
||||
}
|
||||
|
||||
/* Sort only [begin..smalli) and (biggi..end) - using schwabsort or threepass */
|
||||
#ifdef USE_SCHWAB
|
||||
schwab_sort(arr, begin, smalli - 1, rstate);
|
||||
schwab_sort(arr, biggi + 1, end - 1, rstate);
|
||||
#else
|
||||
threepass(&arr[begin], smalli - begin);
|
||||
threepass(&arr[biggi + 1], end - (biggi + 1));
|
||||
#endif /* USE_SCHWAB */
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
14
threepass.h
14
threepass.h
@ -2,9 +2,9 @@
|
||||
#define THREE_PASS_H
|
||||
|
||||
/* How the 32 bits get split into the three digits */
|
||||
#define TPB1 11 // top
|
||||
#define TPB1 12 // top
|
||||
#define TPB2 11 // mid
|
||||
#define TPB3 10 // bottom
|
||||
#define TPB3 9 // bottom
|
||||
|
||||
static inline constexpr uint32_t min3u32(uint32_t a, uint32_t b, uint32_t c) {
|
||||
return (a <= b) ?
|
||||
@ -33,7 +33,6 @@ static inline void threepass(uint32_t *a, int n) noexcept {
|
||||
|
||||
uint32_t *buf = (uint32_t *)malloc(sz);
|
||||
assert(buf != NULL);
|
||||
memset(buf, 0, n * sizeof(uint32_t)); // XXX: TODO: REMOVE
|
||||
|
||||
/* Count occurrences (can count together with good ILP) */
|
||||
#pragma GCC unroll 64
|
||||
@ -80,9 +79,10 @@ static inline void threepass(uint32_t *a, int n) noexcept {
|
||||
|
||||
// Bottom digit
|
||||
// Iterate right-to-left so that the order established by already-sorted digits is kept across passes
|
||||
#pragma GCC unroll 64
|
||||
#pragma GCC unroll 16
|
||||
for(uint32_t i = n; i > 0; --i) {
|
||||
// Prefetch caches
|
||||
//__builtin_prefetch(&a[i-8]);
|
||||
// Get num and its new offset / location
|
||||
auto num = a[i - 1];
|
||||
auto bkeyni = (num >> shr3) & mask3;
|
||||
@ -93,9 +93,10 @@ static inline void threepass(uint32_t *a, int n) noexcept {
|
||||
}
|
||||
// Mid digit
|
||||
// Iterate right-to-left so that the order established by already-sorted digits is kept across passes
|
||||
#pragma GCC unroll 64
|
||||
#pragma GCC unroll 16
|
||||
for(uint32_t i = n; i > 0; --i) {
|
||||
// Prefetch caches
|
||||
//__builtin_prefetch(&buf[i-8]);
|
||||
// Get num and its new offset / location
|
||||
auto num = buf[i - 1];
|
||||
auto bkeyni = (num >> shr2) & mask2;
|
||||
@ -106,9 +107,10 @@ static inline void threepass(uint32_t *a, int n) noexcept {
|
||||
}
|
||||
// Top digit
|
||||
// Iterate right-to-left so that the order established by already-sorted digits is kept across passes
|
||||
#pragma GCC unroll 64
|
||||
#pragma GCC unroll 16
|
||||
for(uint32_t i = n; i > 0; --i) {
|
||||
// Prefetch caches
|
||||
// __builtin_prefetch(&a[i-16]);
|
||||
// Get num and its new offset / location
|
||||
auto num = a[i - 1];
|
||||
auto bkeyni = (num >> shr1) & mask1;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user