462 lines
15 KiB
C
462 lines
15 KiB
C
|
|
#pragma once
|
||
|
|
|
||
|
|
#include <x86intrin.h>
|
||
|
|
#include <cstdint>
|
||
|
|
|
||
|
|
/**
|
||
|
|
* This is an alternative approach to SIMD quicksort, implemented by D. Lemire.
|
||
|
|
* It is not meant to be fast as such, but it can serve as a useful reference.
|
||
|
|
*/
|
||
|
|
|
||
|
|
// When defined smaller reverseshufflemask is used.
|
||
|
|
//#define BYTE_PATTERN
|
||
|
|
|
||
|
|
// When defined histogram for comparison mask is collected
|
||
|
|
//#define WITH_PVBYTE_HISTOGRAM
|
||
|
|
|
||
|
|
#if WITH_RUNTIME_STATS == 0 && defined(WITH_PVBYTE_HISTOGRAM)
|
||
|
|
# undef WITH_RUNTIME_STATS
|
||
|
|
#endif
|
||
|
|
|
||
|
|
// can be replaced with VCOMPRESS on AVX-512
|
||
|
|
static
|
||
|
|
#ifdef BYTE_PATTERN
|
||
|
|
uint8_t
|
||
|
|
#else
|
||
|
|
uint32_t
|
||
|
|
#endif
|
||
|
|
reverseshufflemask[256 * 8] __attribute__((aligned(0x100))) = {
|
||
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 0*/
|
||
|
|
1, 2, 3, 4, 5, 6, 7, 0, /* 1*/
|
||
|
|
0, 2, 3, 4, 5, 6, 7, 1, /* 2*/
|
||
|
|
2, 3, 4, 5, 6, 7, 0, 1, /* 3*/
|
||
|
|
0, 1, 3, 4, 5, 6, 7, 2, /* 4*/
|
||
|
|
1, 3, 4, 5, 6, 7, 0, 2, /* 5*/
|
||
|
|
0, 3, 4, 5, 6, 7, 1, 2, /* 6*/
|
||
|
|
3, 4, 5, 6, 7, 0, 1, 2, /* 7*/
|
||
|
|
0, 1, 2, 4, 5, 6, 7, 3, /* 8*/
|
||
|
|
1, 2, 4, 5, 6, 7, 0, 3, /* 9*/
|
||
|
|
0, 2, 4, 5, 6, 7, 1, 3, /* 10*/
|
||
|
|
2, 4, 5, 6, 7, 0, 1, 3, /* 11*/
|
||
|
|
0, 1, 4, 5, 6, 7, 2, 3, /* 12*/
|
||
|
|
1, 4, 5, 6, 7, 0, 2, 3, /* 13*/
|
||
|
|
0, 4, 5, 6, 7, 1, 2, 3, /* 14*/
|
||
|
|
4, 5, 6, 7, 0, 1, 2, 3, /* 15*/
|
||
|
|
0, 1, 2, 3, 5, 6, 7, 4, /* 16*/
|
||
|
|
1, 2, 3, 5, 6, 7, 0, 4, /* 17*/
|
||
|
|
0, 2, 3, 5, 6, 7, 1, 4, /* 18*/
|
||
|
|
2, 3, 5, 6, 7, 0, 1, 4, /* 19*/
|
||
|
|
0, 1, 3, 5, 6, 7, 2, 4, /* 20*/
|
||
|
|
1, 3, 5, 6, 7, 0, 2, 4, /* 21*/
|
||
|
|
0, 3, 5, 6, 7, 1, 2, 4, /* 22*/
|
||
|
|
3, 5, 6, 7, 0, 1, 2, 4, /* 23*/
|
||
|
|
0, 1, 2, 5, 6, 7, 3, 4, /* 24*/
|
||
|
|
1, 2, 5, 6, 7, 0, 3, 4, /* 25*/
|
||
|
|
0, 2, 5, 6, 7, 1, 3, 4, /* 26*/
|
||
|
|
2, 5, 6, 7, 0, 1, 3, 4, /* 27*/
|
||
|
|
0, 1, 5, 6, 7, 2, 3, 4, /* 28*/
|
||
|
|
1, 5, 6, 7, 0, 2, 3, 4, /* 29*/
|
||
|
|
0, 5, 6, 7, 1, 2, 3, 4, /* 30*/
|
||
|
|
5, 6, 7, 0, 1, 2, 3, 4, /* 31*/
|
||
|
|
0, 1, 2, 3, 4, 6, 7, 5, /* 32*/
|
||
|
|
1, 2, 3, 4, 6, 7, 0, 5, /* 33*/
|
||
|
|
0, 2, 3, 4, 6, 7, 1, 5, /* 34*/
|
||
|
|
2, 3, 4, 6, 7, 0, 1, 5, /* 35*/
|
||
|
|
0, 1, 3, 4, 6, 7, 2, 5, /* 36*/
|
||
|
|
1, 3, 4, 6, 7, 0, 2, 5, /* 37*/
|
||
|
|
0, 3, 4, 6, 7, 1, 2, 5, /* 38*/
|
||
|
|
3, 4, 6, 7, 0, 1, 2, 5, /* 39*/
|
||
|
|
0, 1, 2, 4, 6, 7, 3, 5, /* 40*/
|
||
|
|
1, 2, 4, 6, 7, 0, 3, 5, /* 41*/
|
||
|
|
0, 2, 4, 6, 7, 1, 3, 5, /* 42*/
|
||
|
|
2, 4, 6, 7, 0, 1, 3, 5, /* 43*/
|
||
|
|
0, 1, 4, 6, 7, 2, 3, 5, /* 44*/
|
||
|
|
1, 4, 6, 7, 0, 2, 3, 5, /* 45*/
|
||
|
|
0, 4, 6, 7, 1, 2, 3, 5, /* 46*/
|
||
|
|
4, 6, 7, 0, 1, 2, 3, 5, /* 47*/
|
||
|
|
0, 1, 2, 3, 6, 7, 4, 5, /* 48*/
|
||
|
|
1, 2, 3, 6, 7, 0, 4, 5, /* 49*/
|
||
|
|
0, 2, 3, 6, 7, 1, 4, 5, /* 50*/
|
||
|
|
2, 3, 6, 7, 0, 1, 4, 5, /* 51*/
|
||
|
|
0, 1, 3, 6, 7, 2, 4, 5, /* 52*/
|
||
|
|
1, 3, 6, 7, 0, 2, 4, 5, /* 53*/
|
||
|
|
0, 3, 6, 7, 1, 2, 4, 5, /* 54*/
|
||
|
|
3, 6, 7, 0, 1, 2, 4, 5, /* 55*/
|
||
|
|
0, 1, 2, 6, 7, 3, 4, 5, /* 56*/
|
||
|
|
1, 2, 6, 7, 0, 3, 4, 5, /* 57*/
|
||
|
|
0, 2, 6, 7, 1, 3, 4, 5, /* 58*/
|
||
|
|
2, 6, 7, 0, 1, 3, 4, 5, /* 59*/
|
||
|
|
0, 1, 6, 7, 2, 3, 4, 5, /* 60*/
|
||
|
|
1, 6, 7, 0, 2, 3, 4, 5, /* 61*/
|
||
|
|
0, 6, 7, 1, 2, 3, 4, 5, /* 62*/
|
||
|
|
6, 7, 0, 1, 2, 3, 4, 5, /* 63*/
|
||
|
|
0, 1, 2, 3, 4, 5, 7, 6, /* 64*/
|
||
|
|
1, 2, 3, 4, 5, 7, 0, 6, /* 65*/
|
||
|
|
0, 2, 3, 4, 5, 7, 1, 6, /* 66*/
|
||
|
|
2, 3, 4, 5, 7, 0, 1, 6, /* 67*/
|
||
|
|
0, 1, 3, 4, 5, 7, 2, 6, /* 68*/
|
||
|
|
1, 3, 4, 5, 7, 0, 2, 6, /* 69*/
|
||
|
|
0, 3, 4, 5, 7, 1, 2, 6, /* 70*/
|
||
|
|
3, 4, 5, 7, 0, 1, 2, 6, /* 71*/
|
||
|
|
0, 1, 2, 4, 5, 7, 3, 6, /* 72*/
|
||
|
|
1, 2, 4, 5, 7, 0, 3, 6, /* 73*/
|
||
|
|
0, 2, 4, 5, 7, 1, 3, 6, /* 74*/
|
||
|
|
2, 4, 5, 7, 0, 1, 3, 6, /* 75*/
|
||
|
|
0, 1, 4, 5, 7, 2, 3, 6, /* 76*/
|
||
|
|
1, 4, 5, 7, 0, 2, 3, 6, /* 77*/
|
||
|
|
0, 4, 5, 7, 1, 2, 3, 6, /* 78*/
|
||
|
|
4, 5, 7, 0, 1, 2, 3, 6, /* 79*/
|
||
|
|
0, 1, 2, 3, 5, 7, 4, 6, /* 80*/
|
||
|
|
1, 2, 3, 5, 7, 0, 4, 6, /* 81*/
|
||
|
|
0, 2, 3, 5, 7, 1, 4, 6, /* 82*/
|
||
|
|
2, 3, 5, 7, 0, 1, 4, 6, /* 83*/
|
||
|
|
0, 1, 3, 5, 7, 2, 4, 6, /* 84*/
|
||
|
|
1, 3, 5, 7, 0, 2, 4, 6, /* 85*/
|
||
|
|
0, 3, 5, 7, 1, 2, 4, 6, /* 86*/
|
||
|
|
3, 5, 7, 0, 1, 2, 4, 6, /* 87*/
|
||
|
|
0, 1, 2, 5, 7, 3, 4, 6, /* 88*/
|
||
|
|
1, 2, 5, 7, 0, 3, 4, 6, /* 89*/
|
||
|
|
0, 2, 5, 7, 1, 3, 4, 6, /* 90*/
|
||
|
|
2, 5, 7, 0, 1, 3, 4, 6, /* 91*/
|
||
|
|
0, 1, 5, 7, 2, 3, 4, 6, /* 92*/
|
||
|
|
1, 5, 7, 0, 2, 3, 4, 6, /* 93*/
|
||
|
|
0, 5, 7, 1, 2, 3, 4, 6, /* 94*/
|
||
|
|
5, 7, 0, 1, 2, 3, 4, 6, /* 95*/
|
||
|
|
0, 1, 2, 3, 4, 7, 5, 6, /* 96*/
|
||
|
|
1, 2, 3, 4, 7, 0, 5, 6, /* 97*/
|
||
|
|
0, 2, 3, 4, 7, 1, 5, 6, /* 98*/
|
||
|
|
2, 3, 4, 7, 0, 1, 5, 6, /* 99*/
|
||
|
|
0, 1, 3, 4, 7, 2, 5, 6, /* 100*/
|
||
|
|
1, 3, 4, 7, 0, 2, 5, 6, /* 101*/
|
||
|
|
0, 3, 4, 7, 1, 2, 5, 6, /* 102*/
|
||
|
|
3, 4, 7, 0, 1, 2, 5, 6, /* 103*/
|
||
|
|
0, 1, 2, 4, 7, 3, 5, 6, /* 104*/
|
||
|
|
1, 2, 4, 7, 0, 3, 5, 6, /* 105*/
|
||
|
|
0, 2, 4, 7, 1, 3, 5, 6, /* 106*/
|
||
|
|
2, 4, 7, 0, 1, 3, 5, 6, /* 107*/
|
||
|
|
0, 1, 4, 7, 2, 3, 5, 6, /* 108*/
|
||
|
|
1, 4, 7, 0, 2, 3, 5, 6, /* 109*/
|
||
|
|
0, 4, 7, 1, 2, 3, 5, 6, /* 110*/
|
||
|
|
4, 7, 0, 1, 2, 3, 5, 6, /* 111*/
|
||
|
|
0, 1, 2, 3, 7, 4, 5, 6, /* 112*/
|
||
|
|
1, 2, 3, 7, 0, 4, 5, 6, /* 113*/
|
||
|
|
0, 2, 3, 7, 1, 4, 5, 6, /* 114*/
|
||
|
|
2, 3, 7, 0, 1, 4, 5, 6, /* 115*/
|
||
|
|
0, 1, 3, 7, 2, 4, 5, 6, /* 116*/
|
||
|
|
1, 3, 7, 0, 2, 4, 5, 6, /* 117*/
|
||
|
|
0, 3, 7, 1, 2, 4, 5, 6, /* 118*/
|
||
|
|
3, 7, 0, 1, 2, 4, 5, 6, /* 119*/
|
||
|
|
0, 1, 2, 7, 3, 4, 5, 6, /* 120*/
|
||
|
|
1, 2, 7, 0, 3, 4, 5, 6, /* 121*/
|
||
|
|
0, 2, 7, 1, 3, 4, 5, 6, /* 122*/
|
||
|
|
2, 7, 0, 1, 3, 4, 5, 6, /* 123*/
|
||
|
|
0, 1, 7, 2, 3, 4, 5, 6, /* 124*/
|
||
|
|
1, 7, 0, 2, 3, 4, 5, 6, /* 125*/
|
||
|
|
0, 7, 1, 2, 3, 4, 5, 6, /* 126*/
|
||
|
|
7, 0, 1, 2, 3, 4, 5, 6, /* 127*/
|
||
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 128*/
|
||
|
|
1, 2, 3, 4, 5, 6, 0, 7, /* 129*/
|
||
|
|
0, 2, 3, 4, 5, 6, 1, 7, /* 130*/
|
||
|
|
2, 3, 4, 5, 6, 0, 1, 7, /* 131*/
|
||
|
|
0, 1, 3, 4, 5, 6, 2, 7, /* 132*/
|
||
|
|
1, 3, 4, 5, 6, 0, 2, 7, /* 133*/
|
||
|
|
0, 3, 4, 5, 6, 1, 2, 7, /* 134*/
|
||
|
|
3, 4, 5, 6, 0, 1, 2, 7, /* 135*/
|
||
|
|
0, 1, 2, 4, 5, 6, 3, 7, /* 136*/
|
||
|
|
1, 2, 4, 5, 6, 0, 3, 7, /* 137*/
|
||
|
|
0, 2, 4, 5, 6, 1, 3, 7, /* 138*/
|
||
|
|
2, 4, 5, 6, 0, 1, 3, 7, /* 139*/
|
||
|
|
0, 1, 4, 5, 6, 2, 3, 7, /* 140*/
|
||
|
|
1, 4, 5, 6, 0, 2, 3, 7, /* 141*/
|
||
|
|
0, 4, 5, 6, 1, 2, 3, 7, /* 142*/
|
||
|
|
4, 5, 6, 0, 1, 2, 3, 7, /* 143*/
|
||
|
|
0, 1, 2, 3, 5, 6, 4, 7, /* 144*/
|
||
|
|
1, 2, 3, 5, 6, 0, 4, 7, /* 145*/
|
||
|
|
0, 2, 3, 5, 6, 1, 4, 7, /* 146*/
|
||
|
|
2, 3, 5, 6, 0, 1, 4, 7, /* 147*/
|
||
|
|
0, 1, 3, 5, 6, 2, 4, 7, /* 148*/
|
||
|
|
1, 3, 5, 6, 0, 2, 4, 7, /* 149*/
|
||
|
|
0, 3, 5, 6, 1, 2, 4, 7, /* 150*/
|
||
|
|
3, 5, 6, 0, 1, 2, 4, 7, /* 151*/
|
||
|
|
0, 1, 2, 5, 6, 3, 4, 7, /* 152*/
|
||
|
|
1, 2, 5, 6, 0, 3, 4, 7, /* 153*/
|
||
|
|
0, 2, 5, 6, 1, 3, 4, 7, /* 154*/
|
||
|
|
2, 5, 6, 0, 1, 3, 4, 7, /* 155*/
|
||
|
|
0, 1, 5, 6, 2, 3, 4, 7, /* 156*/
|
||
|
|
1, 5, 6, 0, 2, 3, 4, 7, /* 157*/
|
||
|
|
0, 5, 6, 1, 2, 3, 4, 7, /* 158*/
|
||
|
|
5, 6, 0, 1, 2, 3, 4, 7, /* 159*/
|
||
|
|
0, 1, 2, 3, 4, 6, 5, 7, /* 160*/
|
||
|
|
1, 2, 3, 4, 6, 0, 5, 7, /* 161*/
|
||
|
|
0, 2, 3, 4, 6, 1, 5, 7, /* 162*/
|
||
|
|
2, 3, 4, 6, 0, 1, 5, 7, /* 163*/
|
||
|
|
0, 1, 3, 4, 6, 2, 5, 7, /* 164*/
|
||
|
|
1, 3, 4, 6, 0, 2, 5, 7, /* 165*/
|
||
|
|
0, 3, 4, 6, 1, 2, 5, 7, /* 166*/
|
||
|
|
3, 4, 6, 0, 1, 2, 5, 7, /* 167*/
|
||
|
|
0, 1, 2, 4, 6, 3, 5, 7, /* 168*/
|
||
|
|
1, 2, 4, 6, 0, 3, 5, 7, /* 169*/
|
||
|
|
0, 2, 4, 6, 1, 3, 5, 7, /* 170*/
|
||
|
|
2, 4, 6, 0, 1, 3, 5, 7, /* 171*/
|
||
|
|
0, 1, 4, 6, 2, 3, 5, 7, /* 172*/
|
||
|
|
1, 4, 6, 0, 2, 3, 5, 7, /* 173*/
|
||
|
|
0, 4, 6, 1, 2, 3, 5, 7, /* 174*/
|
||
|
|
4, 6, 0, 1, 2, 3, 5, 7, /* 175*/
|
||
|
|
0, 1, 2, 3, 6, 4, 5, 7, /* 176*/
|
||
|
|
1, 2, 3, 6, 0, 4, 5, 7, /* 177*/
|
||
|
|
0, 2, 3, 6, 1, 4, 5, 7, /* 178*/
|
||
|
|
2, 3, 6, 0, 1, 4, 5, 7, /* 179*/
|
||
|
|
0, 1, 3, 6, 2, 4, 5, 7, /* 180*/
|
||
|
|
1, 3, 6, 0, 2, 4, 5, 7, /* 181*/
|
||
|
|
0, 3, 6, 1, 2, 4, 5, 7, /* 182*/
|
||
|
|
3, 6, 0, 1, 2, 4, 5, 7, /* 183*/
|
||
|
|
0, 1, 2, 6, 3, 4, 5, 7, /* 184*/
|
||
|
|
1, 2, 6, 0, 3, 4, 5, 7, /* 185*/
|
||
|
|
0, 2, 6, 1, 3, 4, 5, 7, /* 186*/
|
||
|
|
2, 6, 0, 1, 3, 4, 5, 7, /* 187*/
|
||
|
|
0, 1, 6, 2, 3, 4, 5, 7, /* 188*/
|
||
|
|
1, 6, 0, 2, 3, 4, 5, 7, /* 189*/
|
||
|
|
0, 6, 1, 2, 3, 4, 5, 7, /* 190*/
|
||
|
|
6, 0, 1, 2, 3, 4, 5, 7, /* 191*/
|
||
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 192*/
|
||
|
|
1, 2, 3, 4, 5, 0, 6, 7, /* 193*/
|
||
|
|
0, 2, 3, 4, 5, 1, 6, 7, /* 194*/
|
||
|
|
2, 3, 4, 5, 0, 1, 6, 7, /* 195*/
|
||
|
|
0, 1, 3, 4, 5, 2, 6, 7, /* 196*/
|
||
|
|
1, 3, 4, 5, 0, 2, 6, 7, /* 197*/
|
||
|
|
0, 3, 4, 5, 1, 2, 6, 7, /* 198*/
|
||
|
|
3, 4, 5, 0, 1, 2, 6, 7, /* 199*/
|
||
|
|
0, 1, 2, 4, 5, 3, 6, 7, /* 200*/
|
||
|
|
1, 2, 4, 5, 0, 3, 6, 7, /* 201*/
|
||
|
|
0, 2, 4, 5, 1, 3, 6, 7, /* 202*/
|
||
|
|
2, 4, 5, 0, 1, 3, 6, 7, /* 203*/
|
||
|
|
0, 1, 4, 5, 2, 3, 6, 7, /* 204*/
|
||
|
|
1, 4, 5, 0, 2, 3, 6, 7, /* 205*/
|
||
|
|
0, 4, 5, 1, 2, 3, 6, 7, /* 206*/
|
||
|
|
4, 5, 0, 1, 2, 3, 6, 7, /* 207*/
|
||
|
|
0, 1, 2, 3, 5, 4, 6, 7, /* 208*/
|
||
|
|
1, 2, 3, 5, 0, 4, 6, 7, /* 209*/
|
||
|
|
0, 2, 3, 5, 1, 4, 6, 7, /* 210*/
|
||
|
|
2, 3, 5, 0, 1, 4, 6, 7, /* 211*/
|
||
|
|
0, 1, 3, 5, 2, 4, 6, 7, /* 212*/
|
||
|
|
1, 3, 5, 0, 2, 4, 6, 7, /* 213*/
|
||
|
|
0, 3, 5, 1, 2, 4, 6, 7, /* 214*/
|
||
|
|
3, 5, 0, 1, 2, 4, 6, 7, /* 215*/
|
||
|
|
0, 1, 2, 5, 3, 4, 6, 7, /* 216*/
|
||
|
|
1, 2, 5, 0, 3, 4, 6, 7, /* 217*/
|
||
|
|
0, 2, 5, 1, 3, 4, 6, 7, /* 218*/
|
||
|
|
2, 5, 0, 1, 3, 4, 6, 7, /* 219*/
|
||
|
|
0, 1, 5, 2, 3, 4, 6, 7, /* 220*/
|
||
|
|
1, 5, 0, 2, 3, 4, 6, 7, /* 221*/
|
||
|
|
0, 5, 1, 2, 3, 4, 6, 7, /* 222*/
|
||
|
|
5, 0, 1, 2, 3, 4, 6, 7, /* 223*/
|
||
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 224*/
|
||
|
|
1, 2, 3, 4, 0, 5, 6, 7, /* 225*/
|
||
|
|
0, 2, 3, 4, 1, 5, 6, 7, /* 226*/
|
||
|
|
2, 3, 4, 0, 1, 5, 6, 7, /* 227*/
|
||
|
|
0, 1, 3, 4, 2, 5, 6, 7, /* 228*/
|
||
|
|
1, 3, 4, 0, 2, 5, 6, 7, /* 229*/
|
||
|
|
0, 3, 4, 1, 2, 5, 6, 7, /* 230*/
|
||
|
|
3, 4, 0, 1, 2, 5, 6, 7, /* 231*/
|
||
|
|
0, 1, 2, 4, 3, 5, 6, 7, /* 232*/
|
||
|
|
1, 2, 4, 0, 3, 5, 6, 7, /* 233*/
|
||
|
|
0, 2, 4, 1, 3, 5, 6, 7, /* 234*/
|
||
|
|
2, 4, 0, 1, 3, 5, 6, 7, /* 235*/
|
||
|
|
0, 1, 4, 2, 3, 5, 6, 7, /* 236*/
|
||
|
|
1, 4, 0, 2, 3, 5, 6, 7, /* 237*/
|
||
|
|
0, 4, 1, 2, 3, 5, 6, 7, /* 238*/
|
||
|
|
4, 0, 1, 2, 3, 5, 6, 7, /* 239*/
|
||
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 240*/
|
||
|
|
1, 2, 3, 0, 4, 5, 6, 7, /* 241*/
|
||
|
|
0, 2, 3, 1, 4, 5, 6, 7, /* 242*/
|
||
|
|
2, 3, 0, 1, 4, 5, 6, 7, /* 243*/
|
||
|
|
0, 1, 3, 2, 4, 5, 6, 7, /* 244*/
|
||
|
|
1, 3, 0, 2, 4, 5, 6, 7, /* 245*/
|
||
|
|
0, 3, 1, 2, 4, 5, 6, 7, /* 246*/
|
||
|
|
3, 0, 1, 2, 4, 5, 6, 7, /* 247*/
|
||
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 248*/
|
||
|
|
1, 2, 0, 3, 4, 5, 6, 7, /* 249*/
|
||
|
|
0, 2, 1, 3, 4, 5, 6, 7, /* 250*/
|
||
|
|
2, 0, 1, 3, 4, 5, 6, 7, /* 251*/
|
||
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 252*/
|
||
|
|
1, 0, 2, 3, 4, 5, 6, 7, /* 253*/
|
||
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 254*/
|
||
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 255*/
|
||
|
|
};
|
||
|
|
|
||
|
|
|
||
|
|
static FORCE_INLINE __m256i get_permutation_vector(int pvbyte) {
|
||
|
|
#ifdef BYTE_PATTERN
|
||
|
|
__m256i shufm;
|
||
|
|
asm volatile (
|
||
|
|
"vpmovzxbd (%1), %0"
|
||
|
|
: "=X" (shufm)
|
||
|
|
: "r" (reverseshufflemask + 8 * pvbyte)
|
||
|
|
);
|
||
|
|
|
||
|
|
return shufm;
|
||
|
|
#else
|
||
|
|
return _mm256_load_si256((__m256i *)(reverseshufflemask + 8 * pvbyte));
|
||
|
|
#endif
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
static uint32_t avx_pivot_on_last_value(int32_t *array, size_t length) {
|
||
|
|
/* we run through the data. Anything in [0,boundary) is smaller or equal
|
||
|
|
* than the pivot, and the value at boundary - 1 is going to be equal to the
|
||
|
|
* pivot at the end,
|
||
|
|
* anything in (boundary, i) is greater than the pivot
|
||
|
|
* stuff in [i,...) is grey
|
||
|
|
* the function returns the location of the boundary.
|
||
|
|
*/
|
||
|
|
if (length <= 1)
|
||
|
|
return 1;
|
||
|
|
{ // we exchange the last value for the middle value for a better pivot
|
||
|
|
int32_t ival = array[length / 2];
|
||
|
|
int32_t bval = array[length - 1];
|
||
|
|
array[length / 2] = bval;
|
||
|
|
array[length - 1] = ival;
|
||
|
|
}
|
||
|
|
#if WITH_RUNTIME_STATS
|
||
|
|
statistics.partition_calls += 1;
|
||
|
|
statistics.items_processed += length;
|
||
|
|
#endif
|
||
|
|
uint32_t boundary = 0;
|
||
|
|
uint32_t i = 0;
|
||
|
|
int32_t pivot = array[length - 1]; // we always pick the pivot at the end
|
||
|
|
const __m256i P = _mm256_set1_epi32(pivot);
|
||
|
|
while ( i + 8 + 1 <= length) {
|
||
|
|
__m256i allgrey = _mm256_lddqu_si256((__m256i *)(array + i));
|
||
|
|
int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(allgrey, P));
|
||
|
|
#if WITH_RUNTIME_STATS && defined(WITH_PVBYTE_HISTOGRAM)
|
||
|
|
statistics.pvbyte_histogram.hit(pvbyte);
|
||
|
|
#endif
|
||
|
|
if(pvbyte == 0) { // might be frequent
|
||
|
|
i += 8; //nothing to do
|
||
|
|
boundary = i;
|
||
|
|
} else if (pvbyte == 0xFF) { // called once
|
||
|
|
boundary = i;
|
||
|
|
i += 8;
|
||
|
|
break; // exit
|
||
|
|
} else {
|
||
|
|
|
||
|
|
// hot path
|
||
|
|
switch (pvbyte) {
|
||
|
|
// for pvbyte = 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff
|
||
|
|
// there is no change in order, just advance boundary
|
||
|
|
// Note: case 0x00 & 0xff are already handled
|
||
|
|
case 0x80: i += 8 - 1; break;
|
||
|
|
case 0xc0: i += 8 - 2; break;
|
||
|
|
case 0xe0: i += 8 - 3; break;
|
||
|
|
case 0xf0: i += 8 - 4; break;
|
||
|
|
case 0xf8: i += 8 - 5; break;
|
||
|
|
case 0xfc: i += 8 - 6; break;
|
||
|
|
case 0xfe: i += 8 - 7; break;
|
||
|
|
|
||
|
|
default: {
|
||
|
|
uint32_t cnt =
|
||
|
|
8 - _mm_popcnt_u32(pvbyte); // might be faster with table look-up?
|
||
|
|
__m256i blackthenwhite = _mm256_permutevar8x32_epi32(allgrey, get_permutation_vector(pvbyte));
|
||
|
|
_mm256_storeu_si256((__m256i *)(array + i), blackthenwhite);
|
||
|
|
i += cnt;
|
||
|
|
}
|
||
|
|
} // switch
|
||
|
|
|
||
|
|
boundary = i; // this doesn't need updating each and every time
|
||
|
|
}
|
||
|
|
}
|
||
|
|
for (; i + 8 + 1 <= length ;) {
|
||
|
|
__m256i allgrey =
|
||
|
|
_mm256_lddqu_si256((__m256i *)(array + i)); // this is all grey
|
||
|
|
int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(allgrey, P));
|
||
|
|
if (pvbyte == 0xFF) { // called once
|
||
|
|
// nothing to do
|
||
|
|
} else {
|
||
|
|
|
||
|
|
uint32_t cnt =
|
||
|
|
8 - _mm_popcnt_u32(pvbyte); // might be faster with table look-up?
|
||
|
|
__m256i allwhite = _mm256_lddqu_si256(
|
||
|
|
(__m256i *)(array + boundary)); // this is all white
|
||
|
|
// we shuffle allgrey so that the first part is black and the second part
|
||
|
|
// is white
|
||
|
|
__m256i blackthenwhite = _mm256_permutevar8x32_epi32(allgrey, get_permutation_vector(pvbyte));
|
||
|
|
_mm256_storeu_si256((__m256i *)(array + boundary), blackthenwhite);
|
||
|
|
_mm256_storeu_si256((__m256i *)(array + i), allwhite);
|
||
|
|
boundary += cnt; // might be faster with table look-up?
|
||
|
|
}
|
||
|
|
i += 8;
|
||
|
|
}
|
||
|
|
while (i + 1 < length) {
|
||
|
|
int32_t ival = array[i];
|
||
|
|
if (ival <= pivot) {
|
||
|
|
int32_t bval = array[boundary];
|
||
|
|
array[i] = bval;
|
||
|
|
array[boundary] = ival;
|
||
|
|
boundary++;
|
||
|
|
}
|
||
|
|
i++;
|
||
|
|
}
|
||
|
|
int32_t ival = array[i];
|
||
|
|
int32_t bval = array[boundary];
|
||
|
|
array[length - 1] = bval;
|
||
|
|
array[boundary] = ival;
|
||
|
|
boundary++;
|
||
|
|
return boundary;
|
||
|
|
}
|
||
|
|
|
||
|
|
// for fallback
|
||
|
|
void scalar_partition(int32_t* array, const int32_t pivot, int& left, int& right) {
|
||
|
|
|
||
|
|
while (left <= right) {
|
||
|
|
while (array[left] < pivot) {
|
||
|
|
left += 1;
|
||
|
|
}
|
||
|
|
while (array[right] > pivot) {
|
||
|
|
right -= 1;
|
||
|
|
}
|
||
|
|
if (left <= right) {
|
||
|
|
const uint32_t t = array[left];
|
||
|
|
array[left] = array[right];
|
||
|
|
array[right] = t;
|
||
|
|
left += 1;
|
||
|
|
right -= 1;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
//fallback
|
||
|
|
void scalar_quicksort(int32_t* array, int left, int right) {
|
||
|
|
#ifdef WITH_RUNTIME_STATS
|
||
|
|
statistics.scalar__partition_calls += 1;
|
||
|
|
statistics.scalar__items_processed += right - left + 1;
|
||
|
|
#endif
|
||
|
|
int i = left;
|
||
|
|
int j = right;
|
||
|
|
const int32_t pivot = array[(i + j)/2];
|
||
|
|
scalar_partition(array, pivot, i, j);
|
||
|
|
if (left < j) {
|
||
|
|
scalar_quicksort(array, left, j);
|
||
|
|
}
|
||
|
|
if (i < right) {
|
||
|
|
scalar_quicksort(array, i, right);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
void avx2_pivotonlast_sort(int32_t *array, const uint32_t length) {
|
||
|
|
uint32_t sep = avx_pivot_on_last_value(array, length);
|
||
|
|
if(sep == length) {
|
||
|
|
// we have an ineffective pivot. Let us give up.
|
||
|
|
if(length > 1) scalar_quicksort(array,0,length - 1);
|
||
|
|
} else {
|
||
|
|
if (sep > 2) {
|
||
|
|
avx2_pivotonlast_sort(array, sep - 1);
|
||
|
|
}
|
||
|
|
if (sep + 1 < length) {
|
||
|
|
avx2_pivotonlast_sort(array + sep, length - sep);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
void wrapped_avx2_pivotonlast_sort(uint32_t *array, int left, int right) {
|
||
|
|
avx2_pivotonlast_sort((int32_t *)array + left, right - left + 1);
|
||
|
|
}
|