Compare commits
No commits in common. "66376651a3f27407623d9eee8514ed5a6dea5ccf" and "1d1f151c0730314ee4370eb288bf1f8c09824b02" have entirely different histories.
66376651a3
...
1d1f151c07
6
makefile
6
makefile
@ -13,15 +13,9 @@ release_debug_sym: test.cpp magyarsort.h
|
||||
release: test.cpp magyarsort.h
|
||||
g++ test.cpp -DNDEBUG -std=c++17 -O2 -o test.out
|
||||
|
||||
release_march: test.cpp magyarsort.h
|
||||
g++ test.cpp -DNDEBUG -std=c++17 -O2 -march=native -o test.out
|
||||
|
||||
release_ypsu: ypsu.cpp magyarsort.h
|
||||
g++ ypsu.cpp -DNDEBUG -std=c++17 -O2 -o ypsu.out
|
||||
|
||||
release_ypsu_march: ypsu.cpp magyarsort.h
|
||||
g++ ypsu.cpp -DNDEBUG -std=c++17 -O2 -march=native -fschedule-insns -o ypsu.out
|
||||
|
||||
release3_ypsu: ypsu.cpp magyarsort.h
|
||||
g++ ypsu.cpp -DNDEBUG -std=c++17 -O3 -o ypsu.out
|
||||
|
||||
|
||||
@ -55,9 +55,9 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
||||
}
|
||||
|
||||
/* Compute prefix sums - expose as much ILP as possible because the arrays are bigger than usual! */
|
||||
uint32_t prev1 = 0;
|
||||
uint32_t prev2 = 0;
|
||||
uint32_t prev3 = 0;
|
||||
int prev1 = -1;
|
||||
int prev2 = -1;
|
||||
int prev3 = -1;
|
||||
uint32_t common = min3u32_xb(
|
||||
(1 << TPBX1),
|
||||
(1 << TPBX2),
|
||||
@ -89,47 +89,37 @@ static inline void threepass_xb(uint32_t *a, uint32_t *buf, int n) noexcept {
|
||||
prev3 = bucket3[j];
|
||||
}
|
||||
|
||||
// Bottom digit a->buf
|
||||
// Iterate right-to-left so that the order established by the already-sorted digits is kept across passes
|
||||
#pragma GCC unroll 48
|
||||
for(uint32_t i = n; i > 0; --i) {
|
||||
// Prefetch caches
|
||||
//__builtin_prefetch(&a[i-8]);
|
||||
// Get num and its new offset / location
|
||||
auto num = a[i - 1];
|
||||
auto bkeyni = (num >> shr3) & mask3;
|
||||
auto offset = --bucket3[bkeyni];
|
||||
/* Move elements (the 3 passes): this uses the frewr trick of overwriting the keys (elements) by ror-ing (rotating right) them */
|
||||
/* This rewrites the key into its rotated state to save some pipeline stalls. (Three rotates happen, adding up to a full 32-bit rotation, so in the end the key is back in its original form.) */
|
||||
/* Iterate right-to-left so that the order established by the already-sorted digits is kept across passes */
|
||||
|
||||
// Add to the proper target location
|
||||
buf[offset] = num;
|
||||
/* Bottom digit a->buf */
|
||||
#pragma GCC unroll 80
|
||||
for(uint32_t i = n; i > 0; --i) {
|
||||
uint32_t num = a[i - 1];
|
||||
/* save rotated into the masked, bucketed loc */
|
||||
buf[bucket3[num & mask3]--] = (num >> TPBX3) | (num << (32 - TPBX3));
|
||||
//__builtin_prefetch(&buf[bucket3[num & mask3] - 2]);
|
||||
}
|
||||
// Mid digit buf->a
|
||||
// Iterate right-to-left so that the order established by the already-sorted digits is kept across passes
|
||||
#pragma GCC unroll 48
|
||||
for(uint32_t i = n; i > 0; --i) {
|
||||
// Prefetch caches
|
||||
//__builtin_prefetch(&buf[i-8]);
|
||||
// Get num and its new offset / location
|
||||
auto num = buf[i - 1];
|
||||
auto bkeyni = (num >> shr2) & mask2;
|
||||
auto offset = --bucket2[bkeyni];
|
||||
|
||||
// Add to the proper target location
|
||||
a[offset] = num;
|
||||
/* Mid digit buf->a */
|
||||
#pragma GCC unroll 80
|
||||
for(uint32_t i = n; i > 0; --i) {
|
||||
uint32_t num = buf[i - 1];
|
||||
/* save rotated into the masked, bucketed loc */
|
||||
a[bucket2[num & mask2]--] = (num >> TPBX2) | (num << (32 - TPBX2));
|
||||
//__builtin_prefetch(&a[bucket2[num & mask2] - 2]);
|
||||
}
|
||||
// Top digit a->buf
|
||||
// Iterate right-to-left so that the order established by the already-sorted digits is kept across passes
|
||||
#pragma GCC unroll 48
|
||||
for(uint32_t i = n; i > 0; --i) {
|
||||
// Prefetch caches
|
||||
// __builtin_prefetch(&a[i-16]);
|
||||
// Get num and its new offset / location
|
||||
auto num = a[i - 1];
|
||||
auto bkeyni = (num >> shr1) & mask1;
|
||||
auto offset = --bucket1[bkeyni];
|
||||
|
||||
// Add to the proper target location
|
||||
buf[offset] = num;
|
||||
/* Top digit a->buf */
|
||||
#pragma GCC unroll 80
|
||||
for(uint32_t i = n; i > 0; --i) {
|
||||
uint32_t num = a[i - 1];
|
||||
/* Reconstruct the original key in this element by where its stuff is stored */
|
||||
constexpr int rot = TPBX1 + (32 - TPBX1 - TPBX2 - TPBX3);
|
||||
/* save rotated into the masked, bucketed loc */
|
||||
buf[bucket1[num & mask1]--] = (num >> rot) | (num << (32 - rot));
|
||||
//__builtin_prefetch(&buf[bucket1[num & mask1] - 2]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user