Minor threepass optimizations, plus a thiersort2 variant that uses threepass (although in that case it does unnecessary work: allocation, extra copies, an extra partitioning step, etc.)
This commit is contained in:
parent
a17b284c8a
commit
86f81d2a1c
@ -2,7 +2,8 @@
|
|||||||
#define THIER_SORT2_H
|
#define THIER_SORT2_H
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "qsort/schwab_sort.h"
|
#include "qsort/schwab_sort.h"
|
||||||
/* A non-implace tricky float-hackz based bucket sort variant. Uses schwabsort! */
|
#include "threepass.h"
|
||||||
|
/* A non-implace tricky float-hackz based bucket sort variant. Uses schwabsort or threepass! */
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
#define KM_PREFETCH(x)
|
#define KM_PREFETCH(x)
|
||||||
@ -147,8 +148,13 @@ static inline void thiersort2(uint32_t *arr, uint32_t *temparr, int n, sch_rand_
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Call schwabsort - only to [begin..smalli) and (biggie..end) */
|
/* Call schwabsort - only to [begin..smalli) and (biggie..end) */
|
||||||
|
#ifdef USE_SCHWAB
|
||||||
schwab_sort(arr, begin, smalli - 1, rstate);
|
schwab_sort(arr, begin, smalli - 1, rstate);
|
||||||
schwab_sort(arr, biggi + 1, end - 1, rstate);
|
schwab_sort(arr, biggi + 1, end - 1, rstate);
|
||||||
|
#else
|
||||||
|
threepass(&arr[begin], smalli - begin);
|
||||||
|
threepass(&arr[biggi + 1], end - (biggi + 1));
|
||||||
|
#endif /* USE_SCHWAB */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
14
threepass.h
14
threepass.h
@ -2,9 +2,9 @@
|
|||||||
#define THREE_PASS_H
|
#define THREE_PASS_H
|
||||||
|
|
||||||
/* How the 32 bits gets separated? */
|
/* How the 32 bits gets separated? */
|
||||||
#define TPB1 11 // top
|
#define TPB1 12 // top
|
||||||
#define TPB2 11 // mid
|
#define TPB2 11 // mid
|
||||||
#define TPB3 10 // bottom
|
#define TPB3 9 // bottom
|
||||||
|
|
||||||
static inline constexpr uint32_t min3u32(uint32_t a, uint32_t b, uint32_t c) {
|
static inline constexpr uint32_t min3u32(uint32_t a, uint32_t b, uint32_t c) {
|
||||||
return (a <= b) ?
|
return (a <= b) ?
|
||||||
@ -33,7 +33,6 @@ static inline void threepass(uint32_t *a, int n) noexcept {
|
|||||||
|
|
||||||
uint32_t *buf = (uint32_t *)malloc(sz);
|
uint32_t *buf = (uint32_t *)malloc(sz);
|
||||||
assert(buf != NULL);
|
assert(buf != NULL);
|
||||||
memset(buf, 0, n * sizeof(uint32_t)); // XXX: TODO: REMOVE
|
|
||||||
|
|
||||||
/* Count occurences (can count together with good ILP) */
|
/* Count occurences (can count together with good ILP) */
|
||||||
#pragma GCC unroll 64
|
#pragma GCC unroll 64
|
||||||
@ -80,9 +79,10 @@ static inline void threepass(uint32_t *a, int n) noexcept {
|
|||||||
|
|
||||||
// Bottom digit
|
// Bottom digit
|
||||||
// right-to-left to ensure already sorted digits order we keep for iterations
|
// right-to-left to ensure already sorted digits order we keep for iterations
|
||||||
#pragma GCC unroll 64
|
#pragma GCC unroll 16
|
||||||
for(uint32_t i = n; i > 0; --i) {
|
for(uint32_t i = n; i > 0; --i) {
|
||||||
// Prefetch caches
|
// Prefetch caches
|
||||||
|
//__builtin_prefetch(&a[i-8]);
|
||||||
// Get num and its new offset / location
|
// Get num and its new offset / location
|
||||||
auto num = a[i - 1];
|
auto num = a[i - 1];
|
||||||
auto bkeyni = (num >> shr3) & mask3;
|
auto bkeyni = (num >> shr3) & mask3;
|
||||||
@ -93,9 +93,10 @@ static inline void threepass(uint32_t *a, int n) noexcept {
|
|||||||
}
|
}
|
||||||
// Mid digit
|
// Mid digit
|
||||||
// right-to-left to ensure already sorted digits order we keep for iterations
|
// right-to-left to ensure already sorted digits order we keep for iterations
|
||||||
#pragma GCC unroll 64
|
#pragma GCC unroll 16
|
||||||
for(uint32_t i = n; i > 0; --i) {
|
for(uint32_t i = n; i > 0; --i) {
|
||||||
// Prefetch caches
|
// Prefetch caches
|
||||||
|
//__builtin_prefetch(&buf[i-8]);
|
||||||
// Get num and its new offset / location
|
// Get num and its new offset / location
|
||||||
auto num = buf[i - 1];
|
auto num = buf[i - 1];
|
||||||
auto bkeyni = (num >> shr2) & mask2;
|
auto bkeyni = (num >> shr2) & mask2;
|
||||||
@ -106,9 +107,10 @@ static inline void threepass(uint32_t *a, int n) noexcept {
|
|||||||
}
|
}
|
||||||
// Top digit
|
// Top digit
|
||||||
// right-to-left to ensure already sorted digits order we keep for iterations
|
// right-to-left to ensure already sorted digits order we keep for iterations
|
||||||
#pragma GCC unroll 64
|
#pragma GCC unroll 16
|
||||||
for(uint32_t i = n; i > 0; --i) {
|
for(uint32_t i = n; i > 0; --i) {
|
||||||
// Prefetch caches
|
// Prefetch caches
|
||||||
|
// __builtin_prefetch(&a[i-16]);
|
||||||
// Get num and its new offset / location
|
// Get num and its new offset / location
|
||||||
auto num = a[i - 1];
|
auto num = a[i - 1];
|
||||||
auto bkeyni = (num >> shr1) & mask1;
|
auto bkeyni = (num >> shr1) & mask1;
|
||||||
|
|||||||
4
ypsu.cpp
4
ypsu.cpp
@ -887,8 +887,8 @@ int main(int argc, char **argv) {
|
|||||||
printf("Sorting %d elements:\n\n", n);
|
printf("Sorting %d elements:\n\n", n);
|
||||||
|
|
||||||
// Uncomment this for profiling and alg!
|
// Uncomment this for profiling and alg!
|
||||||
measure_single(n);
|
//measure_single(n);
|
||||||
return 0;
|
//return 0;
|
||||||
|
|
||||||
for (auto inputtype : inputtypes) {
|
for (auto inputtype : inputtypes) {
|
||||||
printf("%10s", inputtype.c_str());
|
printf("%10s", inputtype.c_str());
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user