trying more ILP in spsort - not much success and will be reverted
This commit is contained in:
parent
f8793f71ab
commit
c7e794b7ea
@ -133,11 +133,19 @@ inline uint32_t internal_mid(uint32_t low, uint32_t high) {
|
|||||||
inline void spsort(uint32_t *t, int n, int m = 32);
|
inline void spsort(uint32_t *t, int n, int m = 32);
|
||||||
|
|
||||||
/** Helper function that puts elements higher than mid to the top of the array and lower to the bottom. Returns number of bottoms */
|
/** Helper function that puts elements higher than mid to the top of the array and lower to the bottom. Returns number of bottoms */
|
||||||
inline int internal_array_separate(uint32_t *t, int n, uint32_t mid) {
|
inline int internal_array_separate(uint32_t *t, int n, uint32_t mid, int bulk_xchg = 32) {
|
||||||
if(n > 0) {
|
if(n > 0) {
|
||||||
// Two heads that also read & write (both)
|
// Two heads that also read & write (both)
|
||||||
int left = 0;
|
int left = 0;
|
||||||
int right = n - 1;
|
int right = n - 1;
|
||||||
|
|
||||||
|
// These are needed for more ILP so that we can do the xchg operations in bulk
|
||||||
|
// and without data dependencies just do it in an unrolled loop from time to time!
|
||||||
|
std::vector<int> xchg_left(0);
|
||||||
|
std::vector<int> xchg_right(0);
|
||||||
|
xchg_left.reserve(bulk_xchg);
|
||||||
|
xchg_right.reserve(bulk_xchg);
|
||||||
|
|
||||||
while(left < right) {
|
while(left < right) {
|
||||||
// Step over already good positioned values from left
|
// Step over already good positioned values from left
|
||||||
while((left < right) && (t[left] < mid)) {
|
while((left < right) && (t[left] < mid)) {
|
||||||
@ -152,12 +160,30 @@ inline int internal_array_separate(uint32_t *t, int n, uint32_t mid) {
|
|||||||
// Extra check needed for edge-case!
|
// Extra check needed for edge-case!
|
||||||
if(left < right) {
|
if(left < right) {
|
||||||
// Both in wrong location - xchg them!
|
// Both in wrong location - xchg them!
|
||||||
auto tmp = t[right];
|
// instead of doing it right here, collect them up! (*)
|
||||||
t[right] = t[left];
|
xchg_left.push_back(left);
|
||||||
t[left] = tmp;
|
xchg_right.push_back(right);
|
||||||
++left;
|
|
||||||
--right;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// See if we can do some bulk-exchange now (*)
|
||||||
|
// This loop the compiler should more easily unroll
|
||||||
|
// and the CPU should be able to schedule ILP-wise!
|
||||||
|
if(xchg_left.size() <= bulk_xchg) {
|
||||||
|
for(int i = 0; i < xchg_left.size(); ++i) {
|
||||||
|
auto tmp = t[xchg_left[i]];
|
||||||
|
t[xchg_left[i]] = t[xchg_right[i]];
|
||||||
|
t[xchg_right[i]] = tmp;
|
||||||
|
}
|
||||||
|
xchg_left.resize(0);
|
||||||
|
xchg_right.resize(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Finish the remaining bulk exchanges (*)
|
||||||
|
for(int i = 0; i < xchg_left.size(); ++i) {
|
||||||
|
auto tmp = t[xchg_left[i]];
|
||||||
|
t[xchg_left[i]] = t[xchg_right[i]];
|
||||||
|
t[xchg_right[i]] = tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Edge-case increment if single elem happens in middle in the end
|
// Edge-case increment if single elem happens in middle in the end
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user