diff --git a/data.inc b/data.inc
index 895633f..85b44e7 100644
--- a/data.inc
+++ b/data.inc
@@ -1,6 +1,7 @@
 int data[] = {
-	8, 7, 2, 1, 0, 9, 6,1
-	/*8, 7, 2, 1, 0, 9, 6,
+	8, 7, 2, 1, 0, 9, 6,1,
+	8, 7, 2, 1, 0, 9, 6,
+	8, 7, 2, 1, 0, 9, 6,
 	8, 7, 2, 1, 0, 9, 6,
 	8, 7, 2, 1, 0, 9, 6,
 	8, 7, 2, 1, 0, 9, 6,
@@ -9,5 +10,4 @@ int data[] = {
 	8, 7, 2, 1, 0, 9, 6,
 	8, 7, 2, 1, 0, 9, 6,
 	8, 7, 2, 1, 0, 9, 6,
-	8, 7, 2, 1, 0, 9, 6,*/
 };
diff --git a/schwab_sort.h b/schwab_sort.h
index f287883..f026d40 100644
--- a/schwab_sort.h
+++ b/schwab_sort.h
@@ -32,7 +32,7 @@ static inline uint32_t schwab_pick_pivot(sch_rand_state *state, uint32_t len) {
 /**
  * 4-way partitioning
  *
- * Expects: arr[plo] <= arr[pmid] <= arr[phi]
+ * Expects: arr[plo] <= kmid <= arr[phi]
  * Results: arr[low..plo - 1] <= arr[plo..pmid - 1] <= arr[pmid..phi - 1] <= arr[phi.. high]
  *
  * Also: Adding together lengths of all results arrays shrinks by 1 compared to start arr.
@@ -42,7 +42,8 @@ static inline uint32_t schwab_pick_pivot(sch_rand_state *state, uint32_t len) {
  * @param low Inclusive smallest index.
  * @param high Inclusive highest index.
  * @param plo IN-OUT: input low pivot, output index until elements <= low pivot.
- * @param pmid IN-OUT: input mid pivot, output index until elements <= mid pivot.
+ * @param kmid IN: The mid splitting value (like a pivot value, but it need not exist in arr)
+ * @param pmid OUT: output index until elements <= mid pivot.
  * @param phi IN-OUT: input high pivot, output index until elements <= high pivot.
  */
 static inline void schwab_partition(
@@ -50,9 +51,14 @@ static inline void schwab_partition(
 		int low,
 		int high,
 		int *plo,
+		uint32_t kmid,
 		int *pmid,
 		int *phi) {
 
+	/* Keys only - no element copy is made here */
+	uint32_t klo = arr[*plo];
+	uint32_t khi = arr[*phi];
+
 	/* [*] Swapping arr[phi]<->arr[high] ensures stop condition later */
 	uint32_t tmphi = arr[*phi];
 	arr[*phi] = arr[high];
@@ -61,11 +67,6 @@ static inline void schwab_partition(
 	/* Aren't inclusive end indices of 4 "blocks" - b0 is smallest vals */
 	int b0 = low, b1 = low, b2 = low, b3 = low;
 
-	/* Keys only - no element copy is made here */
-	uint32_t klo = arr[*plo];
-	uint32_t kmid = arr[*pmid];
-	uint32_t khi = arr[*phi];
-
 	while(b3 < high) {
 		/* This I moved to be first for hot code path for constant / smallrange */
 		if(arr[b3] >= khi) {
@@ -106,8 +107,9 @@ static inline void schwab_partition(
 	/* This way we can return bigger index and by that */
 	/* this always removes an element per run at least */
 	tmphi = arr[b2];
-	arr[b2++] = arr[high];
+	arr[b2] = arr[high];
 	arr[high] = tmphi;
+	++b2;
 
 	/* Handle output vars as per doc comment */
 	*plo = b0;
@@ -115,7 +117,7 @@ static inline void schwab_partition(
 	*phi = b2; /* Because of: [*] */
 }
 
-/** Always at most log(n) space needing 4-way quicksort-like alg */
+/** Swabic-sort: somewhat similar to quicksort, but 4-way and tricky */
 static inline void schwab_sort(
 		uint32_t *array,
 		int low,
@@ -126,70 +128,50 @@ static inline void schwab_sort(
 	while(low < high) {
 		int r0 = schwab_pick_pivot(state, (high + 1) - low) + low;
 		int r1 = schwab_pick_pivot(state, (high + 1) - low) + low;
-
-		uint32_t k0 = array[r0];
-		uint32_t k1 = array[r1];
+		uint32_t klo = array[r0];
+		uint32_t khi = array[r1];
 		int plo = r0;
 		int phi = r1;
-		if(k0 > k1) {
+		if(klo > khi) {
+			uint32_t ktmp = klo;
+			klo = khi;
+			khi = ktmp;
+
 			plo = r1;
 			phi = r0;
-			uint32_t tmp = k0;
-			k0 = k1;
-			k1 = tmp;
 		}
 
-		int r2 = schwab_pick_pivot(state, (phi + 1) - plo) + plo;
-		uint32_t k2 = array[r2];
-		int pmid = r2;
-		if(k2 < k0) {
-			int tmp = plo;
-			plo = pmid;
-			pmid = tmp;
-		} else if(k2 > k1) {
-			int tmp = phi;
-			phi = pmid;
-			pmid = tmp;
-		}
+		uint32_t kmid = klo + (khi - klo) / 2;
 
-		schwab_partition(array, low, high, &plo, &pmid, &phi);
+		int pmid;
+		schwab_partition(array, low, high, &plo, kmid, &pmid, &phi);
 
 		/* See where NOT to recurse to avoid worst case stack depth */
 		/* Rem.: These might be "not real" length but we only use them to comparisons */
 		/* REM.: The "real" lengths might be off-by-one but these are FASTER! */
 		int lolen = plo - low;
-		int lomidlen = pmid - plo;
-		int himidlen = phi - pmid;
-		int hilen = high -phi;
-		int lomax = (lolen > lomidlen) ? lolen : lomidlen;
-		int himax = (hilen > himidlen) ? hilen : himidlen;
+		int hilen = high - phi;
 
 		/* Rewrite loop for worst subtask goal and recurse others! */
 		/* Let the branch predictor try to predict input data path */
-		if(lomax < himax) {
+		/* Rem.: Best would be to check for the biggest of all 4   */
+		/*       blocks, but that would complicate the code above  */
+		/* Rem.: Order of operations tries to be as cache-friendly */
+		/*       as possible; the loop updates had to go last      */
+		if(lolen < hilen) {
 			schwab_sort(array, low, plo - 1, state);
 			schwab_sort(array, plo, pmid - 1, state);
-			if(hilen > himidlen) {
-				schwab_sort(array, pmid, phi - 1, state);
-				low = phi;
-				/* high = high; */
-			} else {
-				schwab_sort(array, phi, high, state);
-				low = pmid;
-				high = phi - 1;
-			}
-		} else {
 			schwab_sort(array, pmid, phi - 1, state);
+
+			low = phi;
+			/* high = high; */
+		} else {
 			schwab_sort(array, phi, high, state);
-			if(lolen < lomidlen) {
-				schwab_sort(array, low, plo - 1, state);
-				low = plo;
-				high = pmid - 1;
-			} else {
-				schwab_sort(array, plo, pmid - 1, state);
-				/* low = low; */
-				high = plo - 1;
-			}
+			schwab_sort(array, pmid, phi - 1, state);
+			schwab_sort(array, plo, pmid - 1, state);
+
+			/* low = low; */
+			high = plo - 1;
 		}
 	}
 }