Compare commits
5 Commits
b5aeaa1bdb
...
1d1f151c07
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1d1f151c07 | ||
|
|
7ef63734a1 | ||
|
|
12431f229e | ||
|
|
9b9997cbdb | ||
|
|
d487bb111b |
@ -310,6 +310,7 @@ namespace MagyarSort {
|
|||||||
|
|
||||||
/* SORT */
|
/* SORT */
|
||||||
|
|
||||||
|
// FIXME: I think the GC is not working because these are separately generated functions, each with its own separate static thread-locals!
|
||||||
/*
|
/*
|
||||||
* Sort the given array (in-place sorting) with the given size.
|
* Sort the given array (in-place sorting) with the given size.
|
||||||
*
|
*
|
||||||
|
|||||||
@ -55,9 +55,9 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Count prefix sums - try as much ILP as possible because bigger arrays than usual! */
|
/* Count prefix sums - try as much ILP as possible because bigger arrays than usual! */
|
||||||
uint32_t prev1 = 0;
|
int prev1 = -1;
|
||||||
uint32_t prev2 = 0;
|
int prev2 = -1;
|
||||||
uint32_t prev3 = 0;
|
int prev3 = -1;
|
||||||
uint32_t common = min3u32_xb(
|
uint32_t common = min3u32_xb(
|
||||||
(1 << TPBX1),
|
(1 << TPBX1),
|
||||||
(1 << TPBX2),
|
(1 << TPBX2),
|
||||||
@ -89,47 +89,37 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
|||||||
prev3 = bucket3[j];
|
prev3 = bucket3[j];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Bottom digit a->buf
|
/* Move elements (the 3 passes): this uses frewr's trick of overwriting the keys (elements) with their right-rotated (ror) form */
|
||||||
// right-to-left to ensure already sorted digits order we keep for iterations
|
/* Each pass rewrites the key into its rotated state to save some pipeline stalls. (Three rotations happen in total, adding up to 32 bits, so in the end the original key is restored.) */
|
||||||
#pragma GCC unroll 48
|
/* Iterate right-to-left so that the order established by the already sorted digits is kept across passes */
|
||||||
for(uint32_t i = n; i > 0; --i) {
|
|
||||||
// Prefetch caches
|
|
||||||
//__builtin_prefetch(&a[i-8]);
|
|
||||||
// Get num and its new offset / location
|
|
||||||
auto num = a[i - 1];
|
|
||||||
auto bkeyni = (num >> shr3) & mask3;
|
|
||||||
auto offset = --bucket3[bkeyni];
|
|
||||||
|
|
||||||
// Add to the proper target location
|
/* Bottom digit a->buf */
|
||||||
buf[offset] = num;
|
#pragma GCC unroll 80
|
||||||
|
for(uint32_t i = n; i > 0; --i) {
|
||||||
|
uint32_t num = a[i - 1];
|
||||||
|
/* save rotated into the masked, bucketed loc */
|
||||||
|
buf[bucket3[num & mask3]--] = (num >> TPBX3) | (num << (32 - TPBX3));
|
||||||
|
//__builtin_prefetch(&buf[bucket3[num & mask3] - 2]);
|
||||||
}
|
}
|
||||||
// Mid digit buf->a
|
|
||||||
// right-to-left to ensure already sorted digits order we keep for iterations
|
|
||||||
#pragma GCC unroll 48
|
|
||||||
for(uint32_t i = n; i > 0; --i) {
|
|
||||||
// Prefetch caches
|
|
||||||
//__builtin_prefetch(&buf[i-8]);
|
|
||||||
// Get num and its new offset / location
|
|
||||||
auto num = buf[i - 1];
|
|
||||||
auto bkeyni = (num >> shr2) & mask2;
|
|
||||||
auto offset = --bucket2[bkeyni];
|
|
||||||
|
|
||||||
// Add to the proper target location
|
/* Mid digit buf->a */
|
||||||
a[offset] = num;
|
#pragma GCC unroll 80
|
||||||
|
for(uint32_t i = n; i > 0; --i) {
|
||||||
|
uint32_t num = buf[i - 1];
|
||||||
|
/* save rotated into the masked, bucketed loc */
|
||||||
|
a[bucket2[num & mask2]--] = (num >> TPBX2) | (num << (32 - TPBX2));
|
||||||
|
//__builtin_prefetch(&a[bucket2[num & mask2] - 2]);
|
||||||
}
|
}
|
||||||
// Top digit a->buf
|
|
||||||
// right-to-left to ensure already sorted digits order we keep for iterations
|
|
||||||
#pragma GCC unroll 48
|
|
||||||
for(uint32_t i = n; i > 0; --i) {
|
|
||||||
// Prefetch caches
|
|
||||||
// __builtin_prefetch(&a[i-16]);
|
|
||||||
// Get num and its new offset / location
|
|
||||||
auto num = a[i - 1];
|
|
||||||
auto bkeyni = (num >> shr1) & mask1;
|
|
||||||
auto offset = --bucket1[bkeyni];
|
|
||||||
|
|
||||||
// Add to the proper target location
|
/* Top digit a->buf */
|
||||||
buf[offset] = num;
|
#pragma GCC unroll 80
|
||||||
|
for(uint32_t i = n; i > 0; --i) {
|
||||||
|
uint32_t num = a[i - 1];
|
||||||
|
/* Reconstruct the original key in this element by where its stuff is stored */
|
||||||
|
constexpr int rot = TPBX1 + (32 - TPBX1 - TPBX2 - TPBX3);
|
||||||
|
/* save rotated into the masked, bucketed loc */
|
||||||
|
buf[bucket1[num & mask1]--] = (num >> rot) | (num << (32 - rot));
|
||||||
|
//__builtin_prefetch(&buf[bucket1[num & mask1] - 2]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
5
ypsu.cpp
5
ypsu.cpp
@ -231,8 +231,11 @@ static inline void do_thier3(uint32_t *a, int n) noexcept {
|
|||||||
/** rthier */
|
/** rthier */
|
||||||
static inline void do_rthier(uint32_t *a, int n) noexcept {
|
static inline void do_rthier(uint32_t *a, int n) noexcept {
|
||||||
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
assert(n * uint32_t(sizeof(a[0])) <= INT_MAX);
|
||||||
|
if(n > 140000000) {
|
||||||
|
/* Helps a bit against our worst cases when the element count is very large */
|
||||||
uint32_t junk;
|
uint32_t junk;
|
||||||
randominus(a, n, junk);
|
randominus(a, n, junk);
|
||||||
|
}
|
||||||
std::vector<uint32_t> tmp(n);
|
std::vector<uint32_t> tmp(n);
|
||||||
thiersort3(a, &(tmp[0]), n);
|
thiersort3(a, &(tmp[0]), n);
|
||||||
}
|
}
|
||||||
@ -945,6 +948,7 @@ int main(int argc, char **argv) {
|
|||||||
});
|
});
|
||||||
assert(w == expected);
|
assert(w == expected);
|
||||||
|
|
||||||
|
/*
|
||||||
w = v;
|
w = v;
|
||||||
measure(inputtype, "rmagyar", [&] {
|
measure(inputtype, "rmagyar", [&] {
|
||||||
uint32_t junk;
|
uint32_t junk;
|
||||||
@ -952,6 +956,7 @@ int main(int argc, char **argv) {
|
|||||||
MagyarSort::sort<uint32_t>(&w[0], w.size());
|
MagyarSort::sort<uint32_t>(&w[0], w.size());
|
||||||
});
|
});
|
||||||
assert(w == expected);
|
assert(w == expected);
|
||||||
|
*/
|
||||||
|
|
||||||
w = v;
|
w = v;
|
||||||
measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); });
|
measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); });
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user