From b32c7540a21f3412a427350c59abf553877d99d6 Mon Sep 17 00:00:00 2001 From: Richard Thier Date: Fri, 9 May 2025 03:29:20 +0200 Subject: [PATCH] schwab: goto-optimized - barely slower on my machine --- schwab_sort.h | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/schwab_sort.h b/schwab_sort.h index 36bd2d6..9cd2846 100644 --- a/schwab_sort.h +++ b/schwab_sort.h @@ -158,11 +158,24 @@ static inline int schwab_partition( /* TODO: should be copy of whole element when not just uint32s! */ uint32_t curr = arr[b3]; - /* TODO: We can do "ILP-memcpy"s here: - * - * Key from b2->b3, value from b2->b3, key from b1->b2, value from b1... etc - * This is likely faster than calling a memcpy if we code this for not just uint32s! - */ + /* Half-branchless and half-goto trickery */ + int where = (curr < klo) ? 0 : + ((curr < kmid) ? 1 : 2); + int target = (curr < klo) ? b0 : + ((curr < kmid) ? b1 : b2); + + arr[b3] = arr[b2]; + if(where == 2) goto auss; + arr[b2] = arr[b1]; + if(where == 1) goto auss; + arr[b1] = arr[b0]; +auss: + ++b2; + b1 += (where < 2); + b0 += (where < 1); + arr[target] = curr; + + /* Same as this would have been: if(curr < klo) { arr[b3] = arr[b2]; arr[b2] = arr[b1]; @@ -176,12 +189,13 @@ static inline int schwab_partition( arr[b3] = arr[b2]; arr[b2] = arr[b1]; arr[b1] = curr; - ++b1; ++b2; + ++b1; } else { arr[b3] = arr[b2]; arr[b2] = curr; - ++b2; } + ++b2; + */ } /* [*] Swap the chosen pivot to begin of last block */