From 2d2cad2c5a4fbae0d2f008b4164ffb1a49ba3a88 Mon Sep 17 00:00:00 2001 From: Richard Thier Date: Thu, 11 Apr 2024 19:45:30 +0200 Subject: [PATCH] mormord sort more branchless plus extra edge-case handling for empty sized calls --- ypsu.cpp | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/ypsu.cpp b/ypsu.cpp index 81d579f..0f778d7 100644 --- a/ypsu.cpp +++ b/ypsu.cpp @@ -150,6 +150,9 @@ static inline uint32_t morgrab(uint32_t elem) noexcept { } template static inline void mormord_sort_impl(uint32_t *a, int n) noexcept { + /* Edge-case */ + if(n == 0) return; + /* Preparation */ uint32_t radics[256] = {0}; uint32_t radics2[256] = {0}; @@ -184,34 +187,41 @@ static inline void mormord_sort_impl(uint32_t *a, int n) noexcept { uint32_t reali = 0; #pragma GCC unroll 16 for(int i = 0; i < 256; ++i) { + radics[i] += prev; if(radics[i] != 0) { - radics[i] += prev; real_radics[reali] = prev; real_radics[reali + 1] = radics[i]; - prev = radics[i]; reali += 2; - } else { - radics[i] += prev; - prev = radics[i]; } + prev = radics[i]; } - // Inplace swap + + // Inplace swap - own ideas + some ideas based-on "famous" ct-swap (for branchless / more ILP): + // void ct-swap(bool secret, uint64_t a[], uint64_t b[], size_t len) { + // uint64_t mask = ~((uint64_t)secret - 1); // 1->111....111; 0->000....000 + // for (size_t i = 0; i < len; i++) { + // uint64_t delta = (a[i] ^ b[i]) & mask; + // a[i] = a[i] ^ delta; // b[i] if secret - otherwise a[i] + // b[i] = b[i] ^ delta; // vice versa + // } + // } uint32_t pivoti = 0; while(pivoti < n) { uint32_t radixval = morgrab(a[pivoti]); - uint32_t targeti = radics[radixval] - 1; - if(targeti > pivoti) { - // swap - uint32_t tmp = a[pivoti]; - a[pivoti] = a[targeti]; - a[targeti] = tmp; - // dec index - --radics[radixval]; - } else { - // progress pivot - ++pivoti; - } + uint32_t targeti = --radics[radixval]; // dec index (!) 
+ + // Bitmask: true -> 11.....1; false -> 00.....0 + uint32_t mask = ~((targeti > pivoti) - 1); + + // Branchless swap (using bitmask) + uint32_t delta = (a[pivoti] ^ a[targeti]) & mask; + a[pivoti] = a[pivoti] ^ delta; + a[targeti] = a[targeti] ^ delta; + + // "else" branch + pivoti += !mask; + radics[radixval] += !mask; // undec index (!) } // Ends recursion