Compare commits

...

5 Commits

Author SHA1 Message Date
Richard Thier
1d1f151c07 thier3: tricky rotation based state storing... 2025-10-02 05:48:24 +02:00
Richard Thier
7ef63734a1 magyarsort: comment about GC totally not working in my opinion 2025-10-02 04:52:13 +02:00
Richard Thier
12431f229e rthier randomized only above threshold 2025-10-02 04:51:33 +02:00
Richard Thier
9b9997cbdb Revert "simpler occurence template"
This reverts commit d487bb111b93f4ab186147fd876373f46eff0e59.
2025-10-02 02:28:54 +02:00
Richard Thier
d487bb111b simpler occurence template 2025-10-02 02:28:46 +02:00
3 changed files with 37 additions and 41 deletions

View File

@ -310,6 +310,7 @@ namespace MagyarSort {
/* SORT */ /* SORT */
// FIXME: I think the GC is not working because these are separate functions generated with separate static thread-locals!!!
/* /*
* Sort the given array (in-place sorting) with the given size. * Sort the given array (in-place sorting) with the given size.
* *

View File

@ -55,9 +55,9 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
} }
/* Count prefix sums - try as much ILP as possible because bigger arrays than usual! */ /* Count prefix sums - try as much ILP as possible because bigger arrays than usual! */
uint32_t prev1 = 0; int prev1 = -1;
uint32_t prev2 = 0; int prev2 = -1;
uint32_t prev3 = 0; int prev3 = -1;
uint32_t common = min3u32_xb( uint32_t common = min3u32_xb(
(1 << TPBX1), (1 << TPBX1),
(1 << TPBX2), (1 << TPBX2),
@ -89,47 +89,37 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
prev3 = bucket3[j]; prev3 = bucket3[j];
} }
// Bottom digit a->buf /* Move elements (the 3 passes): this uses the frewr trick to overwrite keys (elements) with their right-rotated (ror) form */
// right-to-left to ensure already sorted digits order we keep for iterations /* This rewrites the key into its rotated state to save some pipeline stalls. (3 rotations happen, and in the end the original key is restored.) */
#pragma GCC unroll 48 /* right-to-left to ensure already sorted digits order we keep for iterations */
for(uint32_t i = n; i > 0; --i) {
// Prefetch caches
//__builtin_prefetch(&a[i-8]);
// Get num and its new offset / location
auto num = a[i - 1];
auto bkeyni = (num >> shr3) & mask3;
auto offset = --bucket3[bkeyni];
// Add to the proper target location /* Bottom digit a->buf */
buf[offset] = num; #pragma GCC unroll 80
for(uint32_t i = n; i > 0; --i) {
uint32_t num = a[i - 1];
/* save rotated into the masked, bucketed loc */
buf[bucket3[num & mask3]--] = (num >> TPBX3) | (num << (32 - TPBX3));
//__builtin_prefetch(&buf[bucket3[num & mask3] - 2]);
} }
// Mid digit buf->a
// right-to-left to ensure already sorted digits order we keep for iterations
#pragma GCC unroll 48
for(uint32_t i = n; i > 0; --i) {
// Prefetch caches
//__builtin_prefetch(&buf[i-8]);
// Get num and its new offset / location
auto num = buf[i - 1];
auto bkeyni = (num >> shr2) & mask2;
auto offset = --bucket2[bkeyni];
// Add to the proper target location /* Mid digit buf->a */
a[offset] = num; #pragma GCC unroll 80
for(uint32_t i = n; i > 0; --i) {
uint32_t num = buf[i - 1];
/* save rotated into the masked, bucketed loc */
a[bucket2[num & mask2]--] = (num >> TPBX2) | (num << (32 - TPBX2));
//__builtin_prefetch(&a[bucket2[num & mask2] - 2]);
} }
// Top digit a->buf
// right-to-left to ensure already sorted digits order we keep for iterations
#pragma GCC unroll 48
for(uint32_t i = n; i > 0; --i) {
// Prefetch caches
// __builtin_prefetch(&a[i-16]);
// Get num and its new offset / location
auto num = a[i - 1];
auto bkeyni = (num >> shr1) & mask1;
auto offset = --bucket1[bkeyni];
// Add to the proper target location /* Top digit a->buf */
buf[offset] = num; #pragma GCC unroll 80
for(uint32_t i = n; i > 0; --i) {
uint32_t num = a[i - 1];
/* Reconstruct the original key in this element by where its stuff is stored */
constexpr int rot = TPBX1 + (32 - TPBX1 - TPBX2 - TPBX3);
/* save rotated into the masked, bucketed loc */
buf[bucket1[num & mask1]--] = (num >> rot) | (num << (32 - rot));
//__builtin_prefetch(&buf[bucket1[num & mask1] - 2]);
} }
} }

View File

@ -231,8 +231,11 @@ static inline void do_thier3(uint32_t *a, int n) noexcept {
/** rthier */ /** rthier */
static inline void do_rthier(uint32_t *a, int n) noexcept { static inline void do_rthier(uint32_t *a, int n) noexcept {
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
uint32_t junk; if(n > 140000000) {
randominus(a, n, junk); /* Helps a bit against our worst cases in big numbers */
uint32_t junk;
randominus(a, n, junk);
}
std::vector<uint32_t> tmp(n); std::vector<uint32_t> tmp(n);
thiersort3(a, &(tmp[0]), n); thiersort3(a, &(tmp[0]), n);
} }
@ -945,6 +948,7 @@ int main(int argc, char **argv) {
}); });
assert(w == expected); assert(w == expected);
/*
w = v; w = v;
measure(inputtype, "rmagyar", [&] { measure(inputtype, "rmagyar", [&] {
uint32_t junk; uint32_t junk;
@ -952,6 +956,7 @@ int main(int argc, char **argv) {
MagyarSort::sort<uint32_t>(&w[0], w.size()); MagyarSort::sort<uint32_t>(&w[0], w.size());
}); });
assert(w == expected); assert(w == expected);
*/
w = v; w = v;
measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); }); measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); });