410 lines
14 KiB
C++
410 lines
14 KiB
C++
#pragma once
|
|
|
|
#include <x86intrin.h>
|
|
#include <cstdint>
|
|
#include <memory.h>
|
|
#include "avx2-altquicksort.h"
|
|
/**
|
|
* This is an alternative approach to SIMD quicksort, based on a description by
|
|
* Nathan Kurz. Daniel Lemire implemented it to avoid the Dutch national flag
|
|
* problem as an experiment. This is not meant to be fast, as implemented. It is
|
|
* an experiment.
|
|
*/
|
|
|
|
/*
|
|
I'm going to use "vector" as a unit, but it doesn't have to be the
|
|
native vector size. In particular, I think we can read multiple
|
|
vector register's worth of data with minimal added time per loop
|
|
iteration.
|
|
|
|
First, read two vectors from the head of the list, and two from the
|
|
tail of the list. Store these, either in registers or written to a
|
|
buffer. We're not going to use these until the end. We're just
|
|
getting them out of the way so that we have more than one vectors
|
|
writeable space at the head and tail.
|
|
|
|
wh--rh------------------rt--wt-
|
|
|
|
wh = write head
|
|
rh = read head
|
|
rt = read tail
|
|
wt = write tail
|
|
|
|
Then read one vector from head, and advance rh accordingly. Call it nextVector.
|
|
|
|
Loop:
|
|
|
|
Set currentVector = nextVector.
|
|
|
|
Set nextVectorPtr to either rh or rt depending on which has the least
|
|
"writable space" available. That is, if (rh - wh) < (wt - rt), read
|
|
from and advance rh, else read from and update rt. These updates
|
|
need to be branchless.
|
|
|
|
Read nextVector = *nextVectorPtr.
|
|
|
|
Partition currentVector (lookup, shuffle) and write to both wh and wt
|
|
(tail offset so end hits wt).
|
|
Advance wh and wt based on number of valid entries. Use a second byte
|
|
lookup issued as early as possible to determine this count. Our
|
|
performance is dependent on the latency of this step.
|
|
|
|
-----
|
|
*/
|
|
static uint32_t shufflemask[256 * 8] __attribute__((aligned(0x100))) = {
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 0*/
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 1*/
|
|
1, 0, 2, 3, 4, 5, 6, 7, /* 2*/
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 3*/
|
|
2, 0, 1, 3, 4, 5, 6, 7, /* 4*/
|
|
0, 2, 1, 3, 4, 5, 6, 7, /* 5*/
|
|
1, 2, 0, 3, 4, 5, 6, 7, /* 6*/
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 7*/
|
|
3, 0, 1, 2, 4, 5, 6, 7, /* 8*/
|
|
0, 3, 1, 2, 4, 5, 6, 7, /* 9*/
|
|
1, 3, 0, 2, 4, 5, 6, 7, /* 10*/
|
|
0, 1, 3, 2, 4, 5, 6, 7, /* 11*/
|
|
2, 3, 0, 1, 4, 5, 6, 7, /* 12*/
|
|
0, 2, 3, 1, 4, 5, 6, 7, /* 13*/
|
|
1, 2, 3, 0, 4, 5, 6, 7, /* 14*/
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 15*/
|
|
4, 0, 1, 2, 3, 5, 6, 7, /* 16*/
|
|
0, 4, 1, 2, 3, 5, 6, 7, /* 17*/
|
|
1, 4, 0, 2, 3, 5, 6, 7, /* 18*/
|
|
0, 1, 4, 2, 3, 5, 6, 7, /* 19*/
|
|
2, 4, 0, 1, 3, 5, 6, 7, /* 20*/
|
|
0, 2, 4, 1, 3, 5, 6, 7, /* 21*/
|
|
1, 2, 4, 0, 3, 5, 6, 7, /* 22*/
|
|
0, 1, 2, 4, 3, 5, 6, 7, /* 23*/
|
|
3, 4, 0, 1, 2, 5, 6, 7, /* 24*/
|
|
0, 3, 4, 1, 2, 5, 6, 7, /* 25*/
|
|
1, 3, 4, 0, 2, 5, 6, 7, /* 26*/
|
|
0, 1, 3, 4, 2, 5, 6, 7, /* 27*/
|
|
2, 3, 4, 0, 1, 5, 6, 7, /* 28*/
|
|
0, 2, 3, 4, 1, 5, 6, 7, /* 29*/
|
|
1, 2, 3, 4, 0, 5, 6, 7, /* 30*/
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 31*/
|
|
5, 0, 1, 2, 3, 4, 6, 7, /* 32*/
|
|
0, 5, 1, 2, 3, 4, 6, 7, /* 33*/
|
|
1, 5, 0, 2, 3, 4, 6, 7, /* 34*/
|
|
0, 1, 5, 2, 3, 4, 6, 7, /* 35*/
|
|
2, 5, 0, 1, 3, 4, 6, 7, /* 36*/
|
|
0, 2, 5, 1, 3, 4, 6, 7, /* 37*/
|
|
1, 2, 5, 0, 3, 4, 6, 7, /* 38*/
|
|
0, 1, 2, 5, 3, 4, 6, 7, /* 39*/
|
|
3, 5, 0, 1, 2, 4, 6, 7, /* 40*/
|
|
0, 3, 5, 1, 2, 4, 6, 7, /* 41*/
|
|
1, 3, 5, 0, 2, 4, 6, 7, /* 42*/
|
|
0, 1, 3, 5, 2, 4, 6, 7, /* 43*/
|
|
2, 3, 5, 0, 1, 4, 6, 7, /* 44*/
|
|
0, 2, 3, 5, 1, 4, 6, 7, /* 45*/
|
|
1, 2, 3, 5, 0, 4, 6, 7, /* 46*/
|
|
0, 1, 2, 3, 5, 4, 6, 7, /* 47*/
|
|
4, 5, 0, 1, 2, 3, 6, 7, /* 48*/
|
|
0, 4, 5, 1, 2, 3, 6, 7, /* 49*/
|
|
1, 4, 5, 0, 2, 3, 6, 7, /* 50*/
|
|
0, 1, 4, 5, 2, 3, 6, 7, /* 51*/
|
|
2, 4, 5, 0, 1, 3, 6, 7, /* 52*/
|
|
0, 2, 4, 5, 1, 3, 6, 7, /* 53*/
|
|
1, 2, 4, 5, 0, 3, 6, 7, /* 54*/
|
|
0, 1, 2, 4, 5, 3, 6, 7, /* 55*/
|
|
3, 4, 5, 0, 1, 2, 6, 7, /* 56*/
|
|
0, 3, 4, 5, 1, 2, 6, 7, /* 57*/
|
|
1, 3, 4, 5, 0, 2, 6, 7, /* 58*/
|
|
0, 1, 3, 4, 5, 2, 6, 7, /* 59*/
|
|
2, 3, 4, 5, 0, 1, 6, 7, /* 60*/
|
|
0, 2, 3, 4, 5, 1, 6, 7, /* 61*/
|
|
1, 2, 3, 4, 5, 0, 6, 7, /* 62*/
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 63*/
|
|
6, 0, 1, 2, 3, 4, 5, 7, /* 64*/
|
|
0, 6, 1, 2, 3, 4, 5, 7, /* 65*/
|
|
1, 6, 0, 2, 3, 4, 5, 7, /* 66*/
|
|
0, 1, 6, 2, 3, 4, 5, 7, /* 67*/
|
|
2, 6, 0, 1, 3, 4, 5, 7, /* 68*/
|
|
0, 2, 6, 1, 3, 4, 5, 7, /* 69*/
|
|
1, 2, 6, 0, 3, 4, 5, 7, /* 70*/
|
|
0, 1, 2, 6, 3, 4, 5, 7, /* 71*/
|
|
3, 6, 0, 1, 2, 4, 5, 7, /* 72*/
|
|
0, 3, 6, 1, 2, 4, 5, 7, /* 73*/
|
|
1, 3, 6, 0, 2, 4, 5, 7, /* 74*/
|
|
0, 1, 3, 6, 2, 4, 5, 7, /* 75*/
|
|
2, 3, 6, 0, 1, 4, 5, 7, /* 76*/
|
|
0, 2, 3, 6, 1, 4, 5, 7, /* 77*/
|
|
1, 2, 3, 6, 0, 4, 5, 7, /* 78*/
|
|
0, 1, 2, 3, 6, 4, 5, 7, /* 79*/
|
|
4, 6, 0, 1, 2, 3, 5, 7, /* 80*/
|
|
0, 4, 6, 1, 2, 3, 5, 7, /* 81*/
|
|
1, 4, 6, 0, 2, 3, 5, 7, /* 82*/
|
|
0, 1, 4, 6, 2, 3, 5, 7, /* 83*/
|
|
2, 4, 6, 0, 1, 3, 5, 7, /* 84*/
|
|
0, 2, 4, 6, 1, 3, 5, 7, /* 85*/
|
|
1, 2, 4, 6, 0, 3, 5, 7, /* 86*/
|
|
0, 1, 2, 4, 6, 3, 5, 7, /* 87*/
|
|
3, 4, 6, 0, 1, 2, 5, 7, /* 88*/
|
|
0, 3, 4, 6, 1, 2, 5, 7, /* 89*/
|
|
1, 3, 4, 6, 0, 2, 5, 7, /* 90*/
|
|
0, 1, 3, 4, 6, 2, 5, 7, /* 91*/
|
|
2, 3, 4, 6, 0, 1, 5, 7, /* 92*/
|
|
0, 2, 3, 4, 6, 1, 5, 7, /* 93*/
|
|
1, 2, 3, 4, 6, 0, 5, 7, /* 94*/
|
|
0, 1, 2, 3, 4, 6, 5, 7, /* 95*/
|
|
5, 6, 0, 1, 2, 3, 4, 7, /* 96*/
|
|
0, 5, 6, 1, 2, 3, 4, 7, /* 97*/
|
|
1, 5, 6, 0, 2, 3, 4, 7, /* 98*/
|
|
0, 1, 5, 6, 2, 3, 4, 7, /* 99*/
|
|
2, 5, 6, 0, 1, 3, 4, 7, /* 100*/
|
|
0, 2, 5, 6, 1, 3, 4, 7, /* 101*/
|
|
1, 2, 5, 6, 0, 3, 4, 7, /* 102*/
|
|
0, 1, 2, 5, 6, 3, 4, 7, /* 103*/
|
|
3, 5, 6, 0, 1, 2, 4, 7, /* 104*/
|
|
0, 3, 5, 6, 1, 2, 4, 7, /* 105*/
|
|
1, 3, 5, 6, 0, 2, 4, 7, /* 106*/
|
|
0, 1, 3, 5, 6, 2, 4, 7, /* 107*/
|
|
2, 3, 5, 6, 0, 1, 4, 7, /* 108*/
|
|
0, 2, 3, 5, 6, 1, 4, 7, /* 109*/
|
|
1, 2, 3, 5, 6, 0, 4, 7, /* 110*/
|
|
0, 1, 2, 3, 5, 6, 4, 7, /* 111*/
|
|
4, 5, 6, 0, 1, 2, 3, 7, /* 112*/
|
|
0, 4, 5, 6, 1, 2, 3, 7, /* 113*/
|
|
1, 4, 5, 6, 0, 2, 3, 7, /* 114*/
|
|
0, 1, 4, 5, 6, 2, 3, 7, /* 115*/
|
|
2, 4, 5, 6, 0, 1, 3, 7, /* 116*/
|
|
0, 2, 4, 5, 6, 1, 3, 7, /* 117*/
|
|
1, 2, 4, 5, 6, 0, 3, 7, /* 118*/
|
|
0, 1, 2, 4, 5, 6, 3, 7, /* 119*/
|
|
3, 4, 5, 6, 0, 1, 2, 7, /* 120*/
|
|
0, 3, 4, 5, 6, 1, 2, 7, /* 121*/
|
|
1, 3, 4, 5, 6, 0, 2, 7, /* 122*/
|
|
0, 1, 3, 4, 5, 6, 2, 7, /* 123*/
|
|
2, 3, 4, 5, 6, 0, 1, 7, /* 124*/
|
|
0, 2, 3, 4, 5, 6, 1, 7, /* 125*/
|
|
1, 2, 3, 4, 5, 6, 0, 7, /* 126*/
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 127*/
|
|
7, 0, 1, 2, 3, 4, 5, 6, /* 128*/
|
|
0, 7, 1, 2, 3, 4, 5, 6, /* 129*/
|
|
1, 7, 0, 2, 3, 4, 5, 6, /* 130*/
|
|
0, 1, 7, 2, 3, 4, 5, 6, /* 131*/
|
|
2, 7, 0, 1, 3, 4, 5, 6, /* 132*/
|
|
0, 2, 7, 1, 3, 4, 5, 6, /* 133*/
|
|
1, 2, 7, 0, 3, 4, 5, 6, /* 134*/
|
|
0, 1, 2, 7, 3, 4, 5, 6, /* 135*/
|
|
3, 7, 0, 1, 2, 4, 5, 6, /* 136*/
|
|
0, 3, 7, 1, 2, 4, 5, 6, /* 137*/
|
|
1, 3, 7, 0, 2, 4, 5, 6, /* 138*/
|
|
0, 1, 3, 7, 2, 4, 5, 6, /* 139*/
|
|
2, 3, 7, 0, 1, 4, 5, 6, /* 140*/
|
|
0, 2, 3, 7, 1, 4, 5, 6, /* 141*/
|
|
1, 2, 3, 7, 0, 4, 5, 6, /* 142*/
|
|
0, 1, 2, 3, 7, 4, 5, 6, /* 143*/
|
|
4, 7, 0, 1, 2, 3, 5, 6, /* 144*/
|
|
0, 4, 7, 1, 2, 3, 5, 6, /* 145*/
|
|
1, 4, 7, 0, 2, 3, 5, 6, /* 146*/
|
|
0, 1, 4, 7, 2, 3, 5, 6, /* 147*/
|
|
2, 4, 7, 0, 1, 3, 5, 6, /* 148*/
|
|
0, 2, 4, 7, 1, 3, 5, 6, /* 149*/
|
|
1, 2, 4, 7, 0, 3, 5, 6, /* 150*/
|
|
0, 1, 2, 4, 7, 3, 5, 6, /* 151*/
|
|
3, 4, 7, 0, 1, 2, 5, 6, /* 152*/
|
|
0, 3, 4, 7, 1, 2, 5, 6, /* 153*/
|
|
1, 3, 4, 7, 0, 2, 5, 6, /* 154*/
|
|
0, 1, 3, 4, 7, 2, 5, 6, /* 155*/
|
|
2, 3, 4, 7, 0, 1, 5, 6, /* 156*/
|
|
0, 2, 3, 4, 7, 1, 5, 6, /* 157*/
|
|
1, 2, 3, 4, 7, 0, 5, 6, /* 158*/
|
|
0, 1, 2, 3, 4, 7, 5, 6, /* 159*/
|
|
5, 7, 0, 1, 2, 3, 4, 6, /* 160*/
|
|
0, 5, 7, 1, 2, 3, 4, 6, /* 161*/
|
|
1, 5, 7, 0, 2, 3, 4, 6, /* 162*/
|
|
0, 1, 5, 7, 2, 3, 4, 6, /* 163*/
|
|
2, 5, 7, 0, 1, 3, 4, 6, /* 164*/
|
|
0, 2, 5, 7, 1, 3, 4, 6, /* 165*/
|
|
1, 2, 5, 7, 0, 3, 4, 6, /* 166*/
|
|
0, 1, 2, 5, 7, 3, 4, 6, /* 167*/
|
|
3, 5, 7, 0, 1, 2, 4, 6, /* 168*/
|
|
0, 3, 5, 7, 1, 2, 4, 6, /* 169*/
|
|
1, 3, 5, 7, 0, 2, 4, 6, /* 170*/
|
|
0, 1, 3, 5, 7, 2, 4, 6, /* 171*/
|
|
2, 3, 5, 7, 0, 1, 4, 6, /* 172*/
|
|
0, 2, 3, 5, 7, 1, 4, 6, /* 173*/
|
|
1, 2, 3, 5, 7, 0, 4, 6, /* 174*/
|
|
0, 1, 2, 3, 5, 7, 4, 6, /* 175*/
|
|
4, 5, 7, 0, 1, 2, 3, 6, /* 176*/
|
|
0, 4, 5, 7, 1, 2, 3, 6, /* 177*/
|
|
1, 4, 5, 7, 0, 2, 3, 6, /* 178*/
|
|
0, 1, 4, 5, 7, 2, 3, 6, /* 179*/
|
|
2, 4, 5, 7, 0, 1, 3, 6, /* 180*/
|
|
0, 2, 4, 5, 7, 1, 3, 6, /* 181*/
|
|
1, 2, 4, 5, 7, 0, 3, 6, /* 182*/
|
|
0, 1, 2, 4, 5, 7, 3, 6, /* 183*/
|
|
3, 4, 5, 7, 0, 1, 2, 6, /* 184*/
|
|
0, 3, 4, 5, 7, 1, 2, 6, /* 185*/
|
|
1, 3, 4, 5, 7, 0, 2, 6, /* 186*/
|
|
0, 1, 3, 4, 5, 7, 2, 6, /* 187*/
|
|
2, 3, 4, 5, 7, 0, 1, 6, /* 188*/
|
|
0, 2, 3, 4, 5, 7, 1, 6, /* 189*/
|
|
1, 2, 3, 4, 5, 7, 0, 6, /* 190*/
|
|
0, 1, 2, 3, 4, 5, 7, 6, /* 191*/
|
|
6, 7, 0, 1, 2, 3, 4, 5, /* 192*/
|
|
0, 6, 7, 1, 2, 3, 4, 5, /* 193*/
|
|
1, 6, 7, 0, 2, 3, 4, 5, /* 194*/
|
|
0, 1, 6, 7, 2, 3, 4, 5, /* 195*/
|
|
2, 6, 7, 0, 1, 3, 4, 5, /* 196*/
|
|
0, 2, 6, 7, 1, 3, 4, 5, /* 197*/
|
|
1, 2, 6, 7, 0, 3, 4, 5, /* 198*/
|
|
0, 1, 2, 6, 7, 3, 4, 5, /* 199*/
|
|
3, 6, 7, 0, 1, 2, 4, 5, /* 200*/
|
|
0, 3, 6, 7, 1, 2, 4, 5, /* 201*/
|
|
1, 3, 6, 7, 0, 2, 4, 5, /* 202*/
|
|
0, 1, 3, 6, 7, 2, 4, 5, /* 203*/
|
|
2, 3, 6, 7, 0, 1, 4, 5, /* 204*/
|
|
0, 2, 3, 6, 7, 1, 4, 5, /* 205*/
|
|
1, 2, 3, 6, 7, 0, 4, 5, /* 206*/
|
|
0, 1, 2, 3, 6, 7, 4, 5, /* 207*/
|
|
4, 6, 7, 0, 1, 2, 3, 5, /* 208*/
|
|
0, 4, 6, 7, 1, 2, 3, 5, /* 209*/
|
|
1, 4, 6, 7, 0, 2, 3, 5, /* 210*/
|
|
0, 1, 4, 6, 7, 2, 3, 5, /* 211*/
|
|
2, 4, 6, 7, 0, 1, 3, 5, /* 212*/
|
|
0, 2, 4, 6, 7, 1, 3, 5, /* 213*/
|
|
1, 2, 4, 6, 7, 0, 3, 5, /* 214*/
|
|
0, 1, 2, 4, 6, 7, 3, 5, /* 215*/
|
|
3, 4, 6, 7, 0, 1, 2, 5, /* 216*/
|
|
0, 3, 4, 6, 7, 1, 2, 5, /* 217*/
|
|
1, 3, 4, 6, 7, 0, 2, 5, /* 218*/
|
|
0, 1, 3, 4, 6, 7, 2, 5, /* 219*/
|
|
2, 3, 4, 6, 7, 0, 1, 5, /* 220*/
|
|
0, 2, 3, 4, 6, 7, 1, 5, /* 221*/
|
|
1, 2, 3, 4, 6, 7, 0, 5, /* 222*/
|
|
0, 1, 2, 3, 4, 6, 7, 5, /* 223*/
|
|
5, 6, 7, 0, 1, 2, 3, 4, /* 224*/
|
|
0, 5, 6, 7, 1, 2, 3, 4, /* 225*/
|
|
1, 5, 6, 7, 0, 2, 3, 4, /* 226*/
|
|
0, 1, 5, 6, 7, 2, 3, 4, /* 227*/
|
|
2, 5, 6, 7, 0, 1, 3, 4, /* 228*/
|
|
0, 2, 5, 6, 7, 1, 3, 4, /* 229*/
|
|
1, 2, 5, 6, 7, 0, 3, 4, /* 230*/
|
|
0, 1, 2, 5, 6, 7, 3, 4, /* 231*/
|
|
3, 5, 6, 7, 0, 1, 2, 4, /* 232*/
|
|
0, 3, 5, 6, 7, 1, 2, 4, /* 233*/
|
|
1, 3, 5, 6, 7, 0, 2, 4, /* 234*/
|
|
0, 1, 3, 5, 6, 7, 2, 4, /* 235*/
|
|
2, 3, 5, 6, 7, 0, 1, 4, /* 236*/
|
|
0, 2, 3, 5, 6, 7, 1, 4, /* 237*/
|
|
1, 2, 3, 5, 6, 7, 0, 4, /* 238*/
|
|
0, 1, 2, 3, 5, 6, 7, 4, /* 239*/
|
|
4, 5, 6, 7, 0, 1, 2, 3, /* 240*/
|
|
0, 4, 5, 6, 7, 1, 2, 3, /* 241*/
|
|
1, 4, 5, 6, 7, 0, 2, 3, /* 242*/
|
|
0, 1, 4, 5, 6, 7, 2, 3, /* 243*/
|
|
2, 4, 5, 6, 7, 0, 1, 3, /* 244*/
|
|
0, 2, 4, 5, 6, 7, 1, 3, /* 245*/
|
|
1, 2, 4, 5, 6, 7, 0, 3, /* 246*/
|
|
0, 1, 2, 4, 5, 6, 7, 3, /* 247*/
|
|
3, 4, 5, 6, 7, 0, 1, 2, /* 248*/
|
|
0, 3, 4, 5, 6, 7, 1, 2, /* 249*/
|
|
1, 3, 4, 5, 6, 7, 0, 2, /* 250*/
|
|
0, 1, 3, 4, 5, 6, 7, 2, /* 251*/
|
|
2, 3, 4, 5, 6, 7, 0, 1, /* 252*/
|
|
0, 2, 3, 4, 5, 6, 7, 1, /* 253*/
|
|
1, 2, 3, 4, 5, 6, 7, 0, /* 254*/
|
|
0, 1, 2, 3, 4, 5, 6, 7, /* 255*/
|
|
};
|
|
|
|
static inline __m256i _mm256_cmplt_epi32(__m256i a, __m256i b) {
|
|
return _mm256_cmpgt_epi32(b, a);
|
|
}
|
|
|
|
|
|
static void avx_natenodutch_partition_epi32(int32_t *array, const int32_t pivot,
|
|
int &left, int &right) {
|
|
const __m256i P = _mm256_set1_epi32(pivot);
|
|
const int valuespervector = sizeof(__m256i) / sizeof(int32_t);
|
|
if (right - left + 1 <
|
|
4 * valuespervector) { // not enough space for nate's algo to make sense, falling back
|
|
scalar_partition(array, pivot, left, right);
|
|
return;
|
|
}
|
|
int readleft = left + valuespervector;
|
|
int readright = right - valuespervector;
|
|
int32_t buffer[2 * valuespervector]; // tmp buffer
|
|
memcpy(buffer, array + left, valuespervector * sizeof(int32_t));
|
|
memcpy(buffer + valuespervector, array + right - valuespervector + 1,
|
|
valuespervector * sizeof(int32_t));
|
|
while (readright - readleft >= valuespervector) {
|
|
__m256i *nextVectorPtr;
|
|
if ((readleft - left) > (right - readright)) {
|
|
nextVectorPtr = (__m256i *)(array + readright - valuespervector + 1);
|
|
readright -= valuespervector;
|
|
} else {
|
|
nextVectorPtr = (__m256i *)(array + readleft);
|
|
readleft += valuespervector;
|
|
}
|
|
__m256i currentVector = _mm256_loadu_si256(nextVectorPtr);
|
|
/******
|
|
* we need two comparisons if we are to avoid the Dutch national flag
|
|
* problem,
|
|
* Reference: https://en.wikipedia.org/wiki/Dutch_national_flag_problem
|
|
*/
|
|
int greaterthanpivot =
|
|
_mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(currentVector, P));
|
|
int lesserthanpivot =
|
|
_mm256_movemask_ps((__m256)_mm256_cmplt_epi32(currentVector, P));
|
|
__m256i greaterthanpivot_permvector = _mm256_load_si256(
|
|
(__m256i *)(reverseshufflemask + 8 * greaterthanpivot));
|
|
__m256i lesserthanpivot_permvector =
|
|
_mm256_load_si256((__m256i *)(shufflemask + 8 * lesserthanpivot));
|
|
|
|
int count_greaterthanpivot = _mm_popcnt_u32(greaterthanpivot);
|
|
int count_lesserthanpivot = _mm_popcnt_u32(lesserthanpivot);
|
|
__m256i greaterthanpivot_vector =
|
|
_mm256_permutevar8x32_epi32(currentVector, greaterthanpivot_permvector);
|
|
__m256i lesserthanpivot_vector =
|
|
_mm256_permutevar8x32_epi32(currentVector, lesserthanpivot_permvector);
|
|
_mm256_storeu_si256((__m256i *)(array + left), lesserthanpivot_vector);
|
|
|
|
_mm256_storeu_si256((__m256i *)(array + right - valuespervector + 1),
|
|
greaterthanpivot_vector);
|
|
|
|
left += count_lesserthanpivot;
|
|
right -= count_greaterthanpivot;
|
|
}
|
|
|
|
int howmanyleft = readleft - left;
|
|
if (howmanyleft > 2 * valuespervector) {
|
|
howmanyleft = 2 * valuespervector;
|
|
for (int k = howmanyleft; k < readleft - left; ++k)
|
|
array[left + k] = pivot;
|
|
}
|
|
memcpy(array + left, buffer, howmanyleft * sizeof(int32_t));
|
|
|
|
int howmanyright = right - readright;
|
|
if (howmanyright + howmanyleft > 2 * valuespervector) {
|
|
howmanyright = 2 * valuespervector - howmanyleft;
|
|
for (int k = howmanyright; k < right - readright; ++k)
|
|
array[readright + 1 + k] = pivot;
|
|
}
|
|
|
|
memcpy(array + readright + 1, buffer + howmanyleft,
|
|
howmanyright * sizeof(int32_t));
|
|
scalar_partition(array, pivot, left, right);
|
|
}
|
|
|
|
|
|
// it is really signed int sorting, but the API seems to expect uint32_t...
|
|
void avx_natenodutch_quicksort(uint32_t *array, int left, int right) {
|
|
|
|
int i = left;
|
|
int j = right;
|
|
const int32_t pivot = array[(i + j) / 2];
|
|
|
|
avx_natenodutch_partition_epi32((int32_t *) array, pivot, i, j);
|
|
|
|
if (left < j) {
|
|
avx_natenodutch_quicksort(array, left, j);
|
|
}
|
|
|
|
if (i < right) {
|
|
avx_natenodutch_quicksort(array, i, right);
|
|
}
|
|
}
|