From dac044fe91363183291f085ce99c40cbdc40b80d Mon Sep 17 00:00:00 2001
From: Richard Thier <magosit@outlook.hu>
Date: Tue, 16 Aug 2022 19:24:35 +0200
Subject: [PATCH] Revert "trying more ILP in spsort - not much success and will
 be reverted"

This reverts commit c7e794b7ea1e10f25e64ba29f740890f3aa93b0a.
---
 space_partitioning_sort/spsort.h | 38 +++++---------------------------
 1 file changed, 6 insertions(+), 32 deletions(-)
diff --git a/space_partitioning_sort/spsort.h b/space_partitioning_sort/spsort.h
index 313bbd0..c406526 100644
--- a/space_partitioning_sort/spsort.h
+++ b/space_partitioning_sort/spsort.h
@@ -133,19 +133,11 @@ inline uint32_t internal_mid(uint32_t low, uint32_t high) {
 inline void spsort(uint32_t *t, int n, int m = 32);
 
 /** Helper function that puts elements higher then mid to the top of the array and lower to the bottom. Returns number of bottoms */
-inline int internal_array_separate(uint32_t *t, int n, uint32_t mid, int bulk_xchg = 32) {
+inline int internal_array_separate(uint32_t *t, int n, uint32_t mid) {
 	if(n > 0) {
 		// Two heads that also read & write (both)
 		int left = 0;
 		int right = n - 1;
-
-		// These are needed for more ILP so that we can do the xchg operations in bulk
-		// and without data dependencies just do it in an unrolled loop from time to time!
-		std::vector<int> xchg_left(0);
-		std::vector<int> xchg_right(0);
-		xchg_left.reserve(bulk_xchg);
-		xchg_right.reserve(bulk_xchg);
-
 		while(left < right) {
 			// Step over already good positioned values from left
 			while((left < right) && (t[left] < mid)) {
@@ -160,30 +152,12 @@ inline int internal_array_separate(uint32_t *t, int n, uint32_t mid, int bulk_xc
 			// Extra check needed for edge-case!
 			if(left < right) {
 				// Both in wrong location - xchg them!
-				// instead of doing it right here, collect them up! (*)
-				xchg_left.push_back(left);
-				xchg_right.push_back(right);
+				auto tmp = t[right];
+				t[right] = t[left];
+				t[left] = tmp;
+				++left;
+				--right;
 			}
-
-			// See if we can do some bulk-exchange now (*)
-			// This loop the compiler should more easily unroll
-			// and the CPU should be able to schedule ILP-wise!
-			if(xchg_left.size() <= bulk_xchg) {
-				for(int i = 0; i < xchg_left.size(); ++i) {
-					auto tmp = t[xchg_left[i]];
-					t[xchg_left[i]] = t[xchg_right[i]];
-					t[xchg_right[i]] = tmp;
-				}
-				xchg_left.resize(0);
-				xchg_right.resize(0);
-			}
-		}
-
-		// Finish the remaining bulk exchanges (*)
-		for(int i = 0; i < xchg_left.size(); ++i) {
-			auto tmp = t[xchg_left[i]];
-			t[xchg_left[i]] = t[xchg_right[i]];
-			t[xchg_right[i]] = tmp;
 		}
 
 		// Edge-case increment if single elem happens in middle in the end