magyarsort/simd-sort/avx2-altquicksort.h

462 lines
15 KiB
C
Raw Permalink Normal View History

#pragma once
#include <x86intrin.h>
#include <cstdint>
/**
* This is an alternative approach to SIMD quicksort, implemented by D. Lemire.
* It is not meant to be fast as such, but it can serve as a useful reference.
*/
// When defined smaller reverseshufflemask is used.
//#define BYTE_PATTERN
// When defined histogram for comparison mask is collected
//#define WITH_PVBYTE_HISTOGRAM
#if WITH_RUNTIME_STATS == 0 && defined(WITH_PVBYTE_HISTOGRAM)
# undef WITH_RUNTIME_STATS
#endif
// can be replaced with VCOMPRESS on AVX-512
static
#ifdef BYTE_PATTERN
uint8_t
#else
uint32_t
#endif
reverseshufflemask[256 * 8] __attribute__((aligned(0x100))) = {
0, 1, 2, 3, 4, 5, 6, 7, /* 0*/
1, 2, 3, 4, 5, 6, 7, 0, /* 1*/
0, 2, 3, 4, 5, 6, 7, 1, /* 2*/
2, 3, 4, 5, 6, 7, 0, 1, /* 3*/
0, 1, 3, 4, 5, 6, 7, 2, /* 4*/
1, 3, 4, 5, 6, 7, 0, 2, /* 5*/
0, 3, 4, 5, 6, 7, 1, 2, /* 6*/
3, 4, 5, 6, 7, 0, 1, 2, /* 7*/
0, 1, 2, 4, 5, 6, 7, 3, /* 8*/
1, 2, 4, 5, 6, 7, 0, 3, /* 9*/
0, 2, 4, 5, 6, 7, 1, 3, /* 10*/
2, 4, 5, 6, 7, 0, 1, 3, /* 11*/
0, 1, 4, 5, 6, 7, 2, 3, /* 12*/
1, 4, 5, 6, 7, 0, 2, 3, /* 13*/
0, 4, 5, 6, 7, 1, 2, 3, /* 14*/
4, 5, 6, 7, 0, 1, 2, 3, /* 15*/
0, 1, 2, 3, 5, 6, 7, 4, /* 16*/
1, 2, 3, 5, 6, 7, 0, 4, /* 17*/
0, 2, 3, 5, 6, 7, 1, 4, /* 18*/
2, 3, 5, 6, 7, 0, 1, 4, /* 19*/
0, 1, 3, 5, 6, 7, 2, 4, /* 20*/
1, 3, 5, 6, 7, 0, 2, 4, /* 21*/
0, 3, 5, 6, 7, 1, 2, 4, /* 22*/
3, 5, 6, 7, 0, 1, 2, 4, /* 23*/
0, 1, 2, 5, 6, 7, 3, 4, /* 24*/
1, 2, 5, 6, 7, 0, 3, 4, /* 25*/
0, 2, 5, 6, 7, 1, 3, 4, /* 26*/
2, 5, 6, 7, 0, 1, 3, 4, /* 27*/
0, 1, 5, 6, 7, 2, 3, 4, /* 28*/
1, 5, 6, 7, 0, 2, 3, 4, /* 29*/
0, 5, 6, 7, 1, 2, 3, 4, /* 30*/
5, 6, 7, 0, 1, 2, 3, 4, /* 31*/
0, 1, 2, 3, 4, 6, 7, 5, /* 32*/
1, 2, 3, 4, 6, 7, 0, 5, /* 33*/
0, 2, 3, 4, 6, 7, 1, 5, /* 34*/
2, 3, 4, 6, 7, 0, 1, 5, /* 35*/
0, 1, 3, 4, 6, 7, 2, 5, /* 36*/
1, 3, 4, 6, 7, 0, 2, 5, /* 37*/
0, 3, 4, 6, 7, 1, 2, 5, /* 38*/
3, 4, 6, 7, 0, 1, 2, 5, /* 39*/
0, 1, 2, 4, 6, 7, 3, 5, /* 40*/
1, 2, 4, 6, 7, 0, 3, 5, /* 41*/
0, 2, 4, 6, 7, 1, 3, 5, /* 42*/
2, 4, 6, 7, 0, 1, 3, 5, /* 43*/
0, 1, 4, 6, 7, 2, 3, 5, /* 44*/
1, 4, 6, 7, 0, 2, 3, 5, /* 45*/
0, 4, 6, 7, 1, 2, 3, 5, /* 46*/
4, 6, 7, 0, 1, 2, 3, 5, /* 47*/
0, 1, 2, 3, 6, 7, 4, 5, /* 48*/
1, 2, 3, 6, 7, 0, 4, 5, /* 49*/
0, 2, 3, 6, 7, 1, 4, 5, /* 50*/
2, 3, 6, 7, 0, 1, 4, 5, /* 51*/
0, 1, 3, 6, 7, 2, 4, 5, /* 52*/
1, 3, 6, 7, 0, 2, 4, 5, /* 53*/
0, 3, 6, 7, 1, 2, 4, 5, /* 54*/
3, 6, 7, 0, 1, 2, 4, 5, /* 55*/
0, 1, 2, 6, 7, 3, 4, 5, /* 56*/
1, 2, 6, 7, 0, 3, 4, 5, /* 57*/
0, 2, 6, 7, 1, 3, 4, 5, /* 58*/
2, 6, 7, 0, 1, 3, 4, 5, /* 59*/
0, 1, 6, 7, 2, 3, 4, 5, /* 60*/
1, 6, 7, 0, 2, 3, 4, 5, /* 61*/
0, 6, 7, 1, 2, 3, 4, 5, /* 62*/
6, 7, 0, 1, 2, 3, 4, 5, /* 63*/
0, 1, 2, 3, 4, 5, 7, 6, /* 64*/
1, 2, 3, 4, 5, 7, 0, 6, /* 65*/
0, 2, 3, 4, 5, 7, 1, 6, /* 66*/
2, 3, 4, 5, 7, 0, 1, 6, /* 67*/
0, 1, 3, 4, 5, 7, 2, 6, /* 68*/
1, 3, 4, 5, 7, 0, 2, 6, /* 69*/
0, 3, 4, 5, 7, 1, 2, 6, /* 70*/
3, 4, 5, 7, 0, 1, 2, 6, /* 71*/
0, 1, 2, 4, 5, 7, 3, 6, /* 72*/
1, 2, 4, 5, 7, 0, 3, 6, /* 73*/
0, 2, 4, 5, 7, 1, 3, 6, /* 74*/
2, 4, 5, 7, 0, 1, 3, 6, /* 75*/
0, 1, 4, 5, 7, 2, 3, 6, /* 76*/
1, 4, 5, 7, 0, 2, 3, 6, /* 77*/
0, 4, 5, 7, 1, 2, 3, 6, /* 78*/
4, 5, 7, 0, 1, 2, 3, 6, /* 79*/
0, 1, 2, 3, 5, 7, 4, 6, /* 80*/
1, 2, 3, 5, 7, 0, 4, 6, /* 81*/
0, 2, 3, 5, 7, 1, 4, 6, /* 82*/
2, 3, 5, 7, 0, 1, 4, 6, /* 83*/
0, 1, 3, 5, 7, 2, 4, 6, /* 84*/
1, 3, 5, 7, 0, 2, 4, 6, /* 85*/
0, 3, 5, 7, 1, 2, 4, 6, /* 86*/
3, 5, 7, 0, 1, 2, 4, 6, /* 87*/
0, 1, 2, 5, 7, 3, 4, 6, /* 88*/
1, 2, 5, 7, 0, 3, 4, 6, /* 89*/
0, 2, 5, 7, 1, 3, 4, 6, /* 90*/
2, 5, 7, 0, 1, 3, 4, 6, /* 91*/
0, 1, 5, 7, 2, 3, 4, 6, /* 92*/
1, 5, 7, 0, 2, 3, 4, 6, /* 93*/
0, 5, 7, 1, 2, 3, 4, 6, /* 94*/
5, 7, 0, 1, 2, 3, 4, 6, /* 95*/
0, 1, 2, 3, 4, 7, 5, 6, /* 96*/
1, 2, 3, 4, 7, 0, 5, 6, /* 97*/
0, 2, 3, 4, 7, 1, 5, 6, /* 98*/
2, 3, 4, 7, 0, 1, 5, 6, /* 99*/
0, 1, 3, 4, 7, 2, 5, 6, /* 100*/
1, 3, 4, 7, 0, 2, 5, 6, /* 101*/
0, 3, 4, 7, 1, 2, 5, 6, /* 102*/
3, 4, 7, 0, 1, 2, 5, 6, /* 103*/
0, 1, 2, 4, 7, 3, 5, 6, /* 104*/
1, 2, 4, 7, 0, 3, 5, 6, /* 105*/
0, 2, 4, 7, 1, 3, 5, 6, /* 106*/
2, 4, 7, 0, 1, 3, 5, 6, /* 107*/
0, 1, 4, 7, 2, 3, 5, 6, /* 108*/
1, 4, 7, 0, 2, 3, 5, 6, /* 109*/
0, 4, 7, 1, 2, 3, 5, 6, /* 110*/
4, 7, 0, 1, 2, 3, 5, 6, /* 111*/
0, 1, 2, 3, 7, 4, 5, 6, /* 112*/
1, 2, 3, 7, 0, 4, 5, 6, /* 113*/
0, 2, 3, 7, 1, 4, 5, 6, /* 114*/
2, 3, 7, 0, 1, 4, 5, 6, /* 115*/
0, 1, 3, 7, 2, 4, 5, 6, /* 116*/
1, 3, 7, 0, 2, 4, 5, 6, /* 117*/
0, 3, 7, 1, 2, 4, 5, 6, /* 118*/
3, 7, 0, 1, 2, 4, 5, 6, /* 119*/
0, 1, 2, 7, 3, 4, 5, 6, /* 120*/
1, 2, 7, 0, 3, 4, 5, 6, /* 121*/
0, 2, 7, 1, 3, 4, 5, 6, /* 122*/
2, 7, 0, 1, 3, 4, 5, 6, /* 123*/
0, 1, 7, 2, 3, 4, 5, 6, /* 124*/
1, 7, 0, 2, 3, 4, 5, 6, /* 125*/
0, 7, 1, 2, 3, 4, 5, 6, /* 126*/
7, 0, 1, 2, 3, 4, 5, 6, /* 127*/
0, 1, 2, 3, 4, 5, 6, 7, /* 128*/
1, 2, 3, 4, 5, 6, 0, 7, /* 129*/
0, 2, 3, 4, 5, 6, 1, 7, /* 130*/
2, 3, 4, 5, 6, 0, 1, 7, /* 131*/
0, 1, 3, 4, 5, 6, 2, 7, /* 132*/
1, 3, 4, 5, 6, 0, 2, 7, /* 133*/
0, 3, 4, 5, 6, 1, 2, 7, /* 134*/
3, 4, 5, 6, 0, 1, 2, 7, /* 135*/
0, 1, 2, 4, 5, 6, 3, 7, /* 136*/
1, 2, 4, 5, 6, 0, 3, 7, /* 137*/
0, 2, 4, 5, 6, 1, 3, 7, /* 138*/
2, 4, 5, 6, 0, 1, 3, 7, /* 139*/
0, 1, 4, 5, 6, 2, 3, 7, /* 140*/
1, 4, 5, 6, 0, 2, 3, 7, /* 141*/
0, 4, 5, 6, 1, 2, 3, 7, /* 142*/
4, 5, 6, 0, 1, 2, 3, 7, /* 143*/
0, 1, 2, 3, 5, 6, 4, 7, /* 144*/
1, 2, 3, 5, 6, 0, 4, 7, /* 145*/
0, 2, 3, 5, 6, 1, 4, 7, /* 146*/
2, 3, 5, 6, 0, 1, 4, 7, /* 147*/
0, 1, 3, 5, 6, 2, 4, 7, /* 148*/
1, 3, 5, 6, 0, 2, 4, 7, /* 149*/
0, 3, 5, 6, 1, 2, 4, 7, /* 150*/
3, 5, 6, 0, 1, 2, 4, 7, /* 151*/
0, 1, 2, 5, 6, 3, 4, 7, /* 152*/
1, 2, 5, 6, 0, 3, 4, 7, /* 153*/
0, 2, 5, 6, 1, 3, 4, 7, /* 154*/
2, 5, 6, 0, 1, 3, 4, 7, /* 155*/
0, 1, 5, 6, 2, 3, 4, 7, /* 156*/
1, 5, 6, 0, 2, 3, 4, 7, /* 157*/
0, 5, 6, 1, 2, 3, 4, 7, /* 158*/
5, 6, 0, 1, 2, 3, 4, 7, /* 159*/
0, 1, 2, 3, 4, 6, 5, 7, /* 160*/
1, 2, 3, 4, 6, 0, 5, 7, /* 161*/
0, 2, 3, 4, 6, 1, 5, 7, /* 162*/
2, 3, 4, 6, 0, 1, 5, 7, /* 163*/
0, 1, 3, 4, 6, 2, 5, 7, /* 164*/
1, 3, 4, 6, 0, 2, 5, 7, /* 165*/
0, 3, 4, 6, 1, 2, 5, 7, /* 166*/
3, 4, 6, 0, 1, 2, 5, 7, /* 167*/
0, 1, 2, 4, 6, 3, 5, 7, /* 168*/
1, 2, 4, 6, 0, 3, 5, 7, /* 169*/
0, 2, 4, 6, 1, 3, 5, 7, /* 170*/
2, 4, 6, 0, 1, 3, 5, 7, /* 171*/
0, 1, 4, 6, 2, 3, 5, 7, /* 172*/
1, 4, 6, 0, 2, 3, 5, 7, /* 173*/
0, 4, 6, 1, 2, 3, 5, 7, /* 174*/
4, 6, 0, 1, 2, 3, 5, 7, /* 175*/
0, 1, 2, 3, 6, 4, 5, 7, /* 176*/
1, 2, 3, 6, 0, 4, 5, 7, /* 177*/
0, 2, 3, 6, 1, 4, 5, 7, /* 178*/
2, 3, 6, 0, 1, 4, 5, 7, /* 179*/
0, 1, 3, 6, 2, 4, 5, 7, /* 180*/
1, 3, 6, 0, 2, 4, 5, 7, /* 181*/
0, 3, 6, 1, 2, 4, 5, 7, /* 182*/
3, 6, 0, 1, 2, 4, 5, 7, /* 183*/
0, 1, 2, 6, 3, 4, 5, 7, /* 184*/
1, 2, 6, 0, 3, 4, 5, 7, /* 185*/
0, 2, 6, 1, 3, 4, 5, 7, /* 186*/
2, 6, 0, 1, 3, 4, 5, 7, /* 187*/
0, 1, 6, 2, 3, 4, 5, 7, /* 188*/
1, 6, 0, 2, 3, 4, 5, 7, /* 189*/
0, 6, 1, 2, 3, 4, 5, 7, /* 190*/
6, 0, 1, 2, 3, 4, 5, 7, /* 191*/
0, 1, 2, 3, 4, 5, 6, 7, /* 192*/
1, 2, 3, 4, 5, 0, 6, 7, /* 193*/
0, 2, 3, 4, 5, 1, 6, 7, /* 194*/
2, 3, 4, 5, 0, 1, 6, 7, /* 195*/
0, 1, 3, 4, 5, 2, 6, 7, /* 196*/
1, 3, 4, 5, 0, 2, 6, 7, /* 197*/
0, 3, 4, 5, 1, 2, 6, 7, /* 198*/
3, 4, 5, 0, 1, 2, 6, 7, /* 199*/
0, 1, 2, 4, 5, 3, 6, 7, /* 200*/
1, 2, 4, 5, 0, 3, 6, 7, /* 201*/
0, 2, 4, 5, 1, 3, 6, 7, /* 202*/
2, 4, 5, 0, 1, 3, 6, 7, /* 203*/
0, 1, 4, 5, 2, 3, 6, 7, /* 204*/
1, 4, 5, 0, 2, 3, 6, 7, /* 205*/
0, 4, 5, 1, 2, 3, 6, 7, /* 206*/
4, 5, 0, 1, 2, 3, 6, 7, /* 207*/
0, 1, 2, 3, 5, 4, 6, 7, /* 208*/
1, 2, 3, 5, 0, 4, 6, 7, /* 209*/
0, 2, 3, 5, 1, 4, 6, 7, /* 210*/
2, 3, 5, 0, 1, 4, 6, 7, /* 211*/
0, 1, 3, 5, 2, 4, 6, 7, /* 212*/
1, 3, 5, 0, 2, 4, 6, 7, /* 213*/
0, 3, 5, 1, 2, 4, 6, 7, /* 214*/
3, 5, 0, 1, 2, 4, 6, 7, /* 215*/
0, 1, 2, 5, 3, 4, 6, 7, /* 216*/
1, 2, 5, 0, 3, 4, 6, 7, /* 217*/
0, 2, 5, 1, 3, 4, 6, 7, /* 218*/
2, 5, 0, 1, 3, 4, 6, 7, /* 219*/
0, 1, 5, 2, 3, 4, 6, 7, /* 220*/
1, 5, 0, 2, 3, 4, 6, 7, /* 221*/
0, 5, 1, 2, 3, 4, 6, 7, /* 222*/
5, 0, 1, 2, 3, 4, 6, 7, /* 223*/
0, 1, 2, 3, 4, 5, 6, 7, /* 224*/
1, 2, 3, 4, 0, 5, 6, 7, /* 225*/
0, 2, 3, 4, 1, 5, 6, 7, /* 226*/
2, 3, 4, 0, 1, 5, 6, 7, /* 227*/
0, 1, 3, 4, 2, 5, 6, 7, /* 228*/
1, 3, 4, 0, 2, 5, 6, 7, /* 229*/
0, 3, 4, 1, 2, 5, 6, 7, /* 230*/
3, 4, 0, 1, 2, 5, 6, 7, /* 231*/
0, 1, 2, 4, 3, 5, 6, 7, /* 232*/
1, 2, 4, 0, 3, 5, 6, 7, /* 233*/
0, 2, 4, 1, 3, 5, 6, 7, /* 234*/
2, 4, 0, 1, 3, 5, 6, 7, /* 235*/
0, 1, 4, 2, 3, 5, 6, 7, /* 236*/
1, 4, 0, 2, 3, 5, 6, 7, /* 237*/
0, 4, 1, 2, 3, 5, 6, 7, /* 238*/
4, 0, 1, 2, 3, 5, 6, 7, /* 239*/
0, 1, 2, 3, 4, 5, 6, 7, /* 240*/
1, 2, 3, 0, 4, 5, 6, 7, /* 241*/
0, 2, 3, 1, 4, 5, 6, 7, /* 242*/
2, 3, 0, 1, 4, 5, 6, 7, /* 243*/
0, 1, 3, 2, 4, 5, 6, 7, /* 244*/
1, 3, 0, 2, 4, 5, 6, 7, /* 245*/
0, 3, 1, 2, 4, 5, 6, 7, /* 246*/
3, 0, 1, 2, 4, 5, 6, 7, /* 247*/
0, 1, 2, 3, 4, 5, 6, 7, /* 248*/
1, 2, 0, 3, 4, 5, 6, 7, /* 249*/
0, 2, 1, 3, 4, 5, 6, 7, /* 250*/
2, 0, 1, 3, 4, 5, 6, 7, /* 251*/
0, 1, 2, 3, 4, 5, 6, 7, /* 252*/
1, 0, 2, 3, 4, 5, 6, 7, /* 253*/
0, 1, 2, 3, 4, 5, 6, 7, /* 254*/
0, 1, 2, 3, 4, 5, 6, 7, /* 255*/
};
static FORCE_INLINE __m256i get_permutation_vector(int pvbyte) {
#ifdef BYTE_PATTERN
__m256i shufm;
asm volatile (
"vpmovzxbd (%1), %0"
: "=X" (shufm)
: "r" (reverseshufflemask + 8 * pvbyte)
);
return shufm;
#else
return _mm256_load_si256((__m256i *)(reverseshufflemask + 8 * pvbyte));
#endif
}
static uint32_t avx_pivot_on_last_value(int32_t *array, size_t length) {
/* we run through the data. Anything in [0,boundary) is smaller or equal
* than the pivot, and the value at boundary - 1 is going to be equal to the
* pivot at the end,
* anything in (boundary, i) is greater than the pivot
* stuff in [i,...) is grey
* the function returns the location of the boundary.
*/
if (length <= 1)
return 1;
{ // we exchange the last value for the middle value for a better pivot
int32_t ival = array[length / 2];
int32_t bval = array[length - 1];
array[length / 2] = bval;
array[length - 1] = ival;
}
#if WITH_RUNTIME_STATS
statistics.partition_calls += 1;
statistics.items_processed += length;
#endif
uint32_t boundary = 0;
uint32_t i = 0;
int32_t pivot = array[length - 1]; // we always pick the pivot at the end
const __m256i P = _mm256_set1_epi32(pivot);
while ( i + 8 + 1 <= length) {
__m256i allgrey = _mm256_lddqu_si256((__m256i *)(array + i));
int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(allgrey, P));
#if WITH_RUNTIME_STATS && defined(WITH_PVBYTE_HISTOGRAM)
statistics.pvbyte_histogram.hit(pvbyte);
#endif
if(pvbyte == 0) { // might be frequent
i += 8; //nothing to do
boundary = i;
} else if (pvbyte == 0xFF) { // called once
boundary = i;
i += 8;
break; // exit
} else {
// hot path
switch (pvbyte) {
// for pvbyte = 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff
// there is no change in order, just advance boundary
// Note: case 0x00 & 0xff are already handled
case 0x80: i += 8 - 1; break;
case 0xc0: i += 8 - 2; break;
case 0xe0: i += 8 - 3; break;
case 0xf0: i += 8 - 4; break;
case 0xf8: i += 8 - 5; break;
case 0xfc: i += 8 - 6; break;
case 0xfe: i += 8 - 7; break;
default: {
uint32_t cnt =
8 - _mm_popcnt_u32(pvbyte); // might be faster with table look-up?
__m256i blackthenwhite = _mm256_permutevar8x32_epi32(allgrey, get_permutation_vector(pvbyte));
_mm256_storeu_si256((__m256i *)(array + i), blackthenwhite);
i += cnt;
}
} // switch
boundary = i; // this doesn't need updating each and every time
}
}
for (; i + 8 + 1 <= length ;) {
__m256i allgrey =
_mm256_lddqu_si256((__m256i *)(array + i)); // this is all grey
int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(allgrey, P));
if (pvbyte == 0xFF) { // called once
// nothing to do
} else {
uint32_t cnt =
8 - _mm_popcnt_u32(pvbyte); // might be faster with table look-up?
__m256i allwhite = _mm256_lddqu_si256(
(__m256i *)(array + boundary)); // this is all white
// we shuffle allgrey so that the first part is black and the second part
// is white
__m256i blackthenwhite = _mm256_permutevar8x32_epi32(allgrey, get_permutation_vector(pvbyte));
_mm256_storeu_si256((__m256i *)(array + boundary), blackthenwhite);
_mm256_storeu_si256((__m256i *)(array + i), allwhite);
boundary += cnt; // might be faster with table look-up?
}
i += 8;
}
while (i + 1 < length) {
int32_t ival = array[i];
if (ival <= pivot) {
int32_t bval = array[boundary];
array[i] = bval;
array[boundary] = ival;
boundary++;
}
i++;
}
int32_t ival = array[i];
int32_t bval = array[boundary];
array[length - 1] = bval;
array[boundary] = ival;
boundary++;
return boundary;
}
// for fallback
void scalar_partition(int32_t* array, const int32_t pivot, int& left, int& right) {
while (left <= right) {
while (array[left] < pivot) {
left += 1;
}
while (array[right] > pivot) {
right -= 1;
}
if (left <= right) {
const uint32_t t = array[left];
array[left] = array[right];
array[right] = t;
left += 1;
right -= 1;
}
}
}
//fallback
void scalar_quicksort(int32_t* array, int left, int right) {
#ifdef WITH_RUNTIME_STATS
statistics.scalar__partition_calls += 1;
statistics.scalar__items_processed += right - left + 1;
#endif
int i = left;
int j = right;
const int32_t pivot = array[(i + j)/2];
scalar_partition(array, pivot, i, j);
if (left < j) {
scalar_quicksort(array, left, j);
}
if (i < right) {
scalar_quicksort(array, i, right);
}
}
void avx2_pivotonlast_sort(int32_t *array, const uint32_t length) {
uint32_t sep = avx_pivot_on_last_value(array, length);
if(sep == length) {
// we have an ineffective pivot. Let us give up.
if(length > 1) scalar_quicksort(array,0,length - 1);
} else {
if (sep > 2) {
avx2_pivotonlast_sort(array, sep - 1);
}
if (sep + 1 < length) {
avx2_pivotonlast_sort(array + sep, length - sep);
}
}
}
void wrapped_avx2_pivotonlast_sort(uint32_t *array, int left, int right) {
avx2_pivotonlast_sort((int32_t *)array + left, right - left + 1);
}