Minor threepass optimizations, plus a thier2 variant that uses threepass (but it does unnecessary work in that case: allocation, extra copies, an extra partitioning step, etc.)

This commit is contained in:
Richard Thier 2025-09-29 03:31:06 +02:00
parent a17b284c8a
commit 86f81d2a1c
3 changed files with 17 additions and 9 deletions

View File

@ -2,7 +2,8 @@
#define THIER_SORT2_H #define THIER_SORT2_H
#include <stdint.h> #include <stdint.h>
#include "qsort/schwab_sort.h" #include "qsort/schwab_sort.h"
/* A non-implace tricky float-hackz based bucket sort variant. Uses schwabsort! */ #include "threepass.h"
/* A non-in-place tricky float-hackz based bucket sort variant. Uses schwabsort or threepass! */
#ifdef _MSC_VER #ifdef _MSC_VER
#define KM_PREFETCH(x) #define KM_PREFETCH(x)
@ -147,8 +148,13 @@ static inline void thiersort2(uint32_t *arr, uint32_t *temparr, int n, sch_rand_
} }
/* Call schwabsort - only to [begin..smalli) and (biggi..end) */ /* Call schwabsort - only to [begin..smalli) and (biggi..end) */
#ifdef USE_SCHWAB
schwab_sort(arr, begin, smalli - 1, rstate); schwab_sort(arr, begin, smalli - 1, rstate);
schwab_sort(arr, biggi + 1, end - 1, rstate); schwab_sort(arr, biggi + 1, end - 1, rstate);
#else
threepass(&arr[begin], smalli - begin);
threepass(&arr[biggi + 1], end - (biggi + 1));
#endif /* USE_SCHWAB */
} }
} }

View File

@ -2,9 +2,9 @@
#define THREE_PASS_H #define THREE_PASS_H
/* How do the 32 bits get separated? */ /* How do the 32 bits get separated? */
#define TPB1 11 // top #define TPB1 12 // top
#define TPB2 11 // mid #define TPB2 11 // mid
#define TPB3 10 // bottom #define TPB3 9 // bottom
static inline constexpr uint32_t min3u32(uint32_t a, uint32_t b, uint32_t c) { static inline constexpr uint32_t min3u32(uint32_t a, uint32_t b, uint32_t c) {
return (a <= b) ? return (a <= b) ?
@ -33,7 +33,6 @@ static inline void threepass(uint32_t *a, int n) noexcept {
uint32_t *buf = (uint32_t *)malloc(sz); uint32_t *buf = (uint32_t *)malloc(sz);
assert(buf != NULL); assert(buf != NULL);
memset(buf, 0, n * sizeof(uint32_t)); // XXX: TODO: REMOVE
/* Count occurrences (can count together with good ILP) */ /* Count occurrences (can count together with good ILP) */
#pragma GCC unroll 64 #pragma GCC unroll 64
@ -80,9 +79,10 @@ static inline void threepass(uint32_t *a, int n) noexcept {
// Bottom digit // Bottom digit
// right-to-left to ensure already sorted digits order we keep for iterations // right-to-left to ensure already sorted digits order we keep for iterations
#pragma GCC unroll 64 #pragma GCC unroll 16
for(uint32_t i = n; i > 0; --i) { for(uint32_t i = n; i > 0; --i) {
// Prefetch caches // Prefetch caches
//__builtin_prefetch(&a[i-8]);
// Get num and its new offset / location // Get num and its new offset / location
auto num = a[i - 1]; auto num = a[i - 1];
auto bkeyni = (num >> shr3) & mask3; auto bkeyni = (num >> shr3) & mask3;
@ -93,9 +93,10 @@ static inline void threepass(uint32_t *a, int n) noexcept {
} }
// Mid digit // Mid digit
// right-to-left to ensure already sorted digits order we keep for iterations // right-to-left to ensure already sorted digits order we keep for iterations
#pragma GCC unroll 64 #pragma GCC unroll 16
for(uint32_t i = n; i > 0; --i) { for(uint32_t i = n; i > 0; --i) {
// Prefetch caches // Prefetch caches
//__builtin_prefetch(&buf[i-8]);
// Get num and its new offset / location // Get num and its new offset / location
auto num = buf[i - 1]; auto num = buf[i - 1];
auto bkeyni = (num >> shr2) & mask2; auto bkeyni = (num >> shr2) & mask2;
@ -106,9 +107,10 @@ static inline void threepass(uint32_t *a, int n) noexcept {
} }
// Top digit // Top digit
// right-to-left to ensure already sorted digits order we keep for iterations // right-to-left to ensure already sorted digits order we keep for iterations
#pragma GCC unroll 64 #pragma GCC unroll 16
for(uint32_t i = n; i > 0; --i) { for(uint32_t i = n; i > 0; --i) {
// Prefetch caches // Prefetch caches
// __builtin_prefetch(&a[i-16]);
// Get num and its new offset / location // Get num and its new offset / location
auto num = a[i - 1]; auto num = a[i - 1];
auto bkeyni = (num >> shr1) & mask1; auto bkeyni = (num >> shr1) & mask1;

View File

@ -887,8 +887,8 @@ int main(int argc, char **argv) {
printf("Sorting %d elements:\n\n", n); printf("Sorting %d elements:\n\n", n);
// Uncomment this for profiling and alg! // Uncomment this for profiling and alg!
measure_single(n); //measure_single(n);
return 0; //return 0;
for (auto inputtype : inputtypes) { for (auto inputtype : inputtypes) {
printf("%10s", inputtype.c_str()); printf("%10s", inputtype.c_str());