added simd-sort - basically the whole repo, but I haxed-in magyarsort as measure

This commit is contained in:
Richard Thier 2021-12-14 17:30:07 +01:00
parent c4ed2994ea
commit 05235e269f
32 changed files with 4139 additions and 0 deletions

25
simd-sort/LICENSE Normal file
View File

@ -0,0 +1,25 @@
Copyright (c) 2016, Wojciech Muła, Daniel Lemire
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

57
simd-sort/Makefile Normal file
View File

@ -0,0 +1,57 @@
.SUFFIXES:
.PHONY: all clean
FLAGS=-std=c++11 -mbmi2 -Wall -pedantic -Wextra
FLAGS_AVX512=$(FLAGS) -mavx512f -DHAVE_AVX512F_INSTRUCTIONS -DHAVE_AVX2_INSTRUCTIONS
FLAGS_AVX2=$(FLAGS) -mavx2 -DHAVE_AVX2_INSTRUCTIONS
DEPS_SORT=partition.cpp \
avx2-partition.cpp \
avx2-quicksort.cpp \
avx2-altquicksort.h \
avx2-natenodutch-quicksort.h \
avx2-nate-quicksort.cpp \
avx512-swap.cpp \
avx512-partition.cpp \
avx512-auxbuffer-partition.cpp \
avx512-bmi2-partition.cpp \
avx512-popcnt-partition.cpp \
avx512-quicksort.cpp \
avx512-sort-register.cpp \
avx512-partition-register.cpp \
quicksort.cpp
SPEED_DEPS=$(DEPS_SORT) speed.cpp gettime.cpp rdtsc.cpp runtime_stats.cpp
SPEED_FLAGS=-O3 -DNDEBUG
ALL=test speed test_avx2 speed_avx2 speed_stats speed_avx2_stats
all: $(ALL)
test: test.cpp input_data.cpp $(DEPS_SORT)
$(CXX) $(FLAGS_AVX512) -fsanitize=address test.cpp -o $@
test_avx2: test.cpp input_data.cpp $(DEPS_SORT)
#$(CXX) $(FLAGS_AVX2) -fsanitize=address test.cpp -o $@
$(CXX) $(FLAGS_AVX2) test.cpp -o $@
speed: $(SPEED_DEPS)
$(CXX) $(FLAGS_AVX512) $(SPEED_FLAGS) speed.cpp -o $@
speed_avx2: $(SPEED_DEPS)
$(CXX) $(FLAGS_AVX2) $(SPEED_FLAGS) speed.cpp -o $@
speed_stats: $(SPEED_DEPS)
$(CXX) $(FLAGS_AVX512) $(SPEED_FLAGS) -DWITH_RUNTIME_STATS speed.cpp -o $@
speed_avx2_stats: $(SPEED_DEPS)
$(CXX) $(FLAGS_AVX2) $(SPEED_FLAGS) -DWITH_RUNTIME_STATS speed.cpp -o $@
run: test
sde -cnl -- ./$^
run_avx2: test_avx2
sde -cnl -- ./$^
clean:
rm -f $(ALL)

48
simd-sort/README.rst Normal file
View File

@ -0,0 +1,48 @@
================================================================================
SIMD sorting
================================================================================
Overview
--------------------------------------------------
This repository contains SIMD versions of quicksort, using **AVX512F**
and **AVX2** instruction sets. The subdirectory ``results`` contains
performance results.
The code is still unfinished, it may be subject of changes.
Although programs are written in C++, most procedures are nearly
plain C, except use of namespaces and references in place of
pointers.
Building
--------------------------------------------------
At least GCC 5.3.0 is needed. Type ``make`` to build AVX512F
and AVX2 versions of two programs:
* ``test``/``test_avx2`` --- tests if SIMD-ized sorting procedures
work correctly for various inputs.
* ``speed``/``speed_avx2`` --- compares speed of different procedures,
scalar and SIMD for various inputs.
* ``speed_stats``/``speed_avx2_stats`` --- similar to ``speed``, but
some procedures collect and print runtime statistics. Note that
this gives overhead, so for real performance tests the basic variant
of the programs should be used.
AVX512
--------------------------------------------------
`Intel Software Development Emulator`__ can be used to run AVX512 variants.
__ https://software.intel.com/en-us/articles/intel-software-development-emulator
References
--------------------------------------------------
- Fast Sorting Algorithms using AVX-512 on Intel Knights Landing https://arxiv.org/pdf/1704.08579.pdf

View File

@ -0,0 +1,461 @@
#pragma once
#include <x86intrin.h>
#include <cstdint>
/**
* This is an alternative approach to SIMD quicksort, implemented by D. Lemire.
* It is not meant to be fast as such, but it can serve as a useful reference.
*/
// When defined smaller reverseshufflemask is used.
//#define BYTE_PATTERN
// When defined histogram for comparison mask is collected
//#define WITH_PVBYTE_HISTOGRAM
#if WITH_RUNTIME_STATS == 0 && defined(WITH_PVBYTE_HISTOGRAM)
# undef WITH_RUNTIME_STATS
#endif
// can be replaced with VCOMPRESS on AVX-512
static
#ifdef BYTE_PATTERN
uint8_t
#else
uint32_t
#endif
reverseshufflemask[256 * 8] __attribute__((aligned(0x100))) = {
0, 1, 2, 3, 4, 5, 6, 7, /* 0*/
1, 2, 3, 4, 5, 6, 7, 0, /* 1*/
0, 2, 3, 4, 5, 6, 7, 1, /* 2*/
2, 3, 4, 5, 6, 7, 0, 1, /* 3*/
0, 1, 3, 4, 5, 6, 7, 2, /* 4*/
1, 3, 4, 5, 6, 7, 0, 2, /* 5*/
0, 3, 4, 5, 6, 7, 1, 2, /* 6*/
3, 4, 5, 6, 7, 0, 1, 2, /* 7*/
0, 1, 2, 4, 5, 6, 7, 3, /* 8*/
1, 2, 4, 5, 6, 7, 0, 3, /* 9*/
0, 2, 4, 5, 6, 7, 1, 3, /* 10*/
2, 4, 5, 6, 7, 0, 1, 3, /* 11*/
0, 1, 4, 5, 6, 7, 2, 3, /* 12*/
1, 4, 5, 6, 7, 0, 2, 3, /* 13*/
0, 4, 5, 6, 7, 1, 2, 3, /* 14*/
4, 5, 6, 7, 0, 1, 2, 3, /* 15*/
0, 1, 2, 3, 5, 6, 7, 4, /* 16*/
1, 2, 3, 5, 6, 7, 0, 4, /* 17*/
0, 2, 3, 5, 6, 7, 1, 4, /* 18*/
2, 3, 5, 6, 7, 0, 1, 4, /* 19*/
0, 1, 3, 5, 6, 7, 2, 4, /* 20*/
1, 3, 5, 6, 7, 0, 2, 4, /* 21*/
0, 3, 5, 6, 7, 1, 2, 4, /* 22*/
3, 5, 6, 7, 0, 1, 2, 4, /* 23*/
0, 1, 2, 5, 6, 7, 3, 4, /* 24*/
1, 2, 5, 6, 7, 0, 3, 4, /* 25*/
0, 2, 5, 6, 7, 1, 3, 4, /* 26*/
2, 5, 6, 7, 0, 1, 3, 4, /* 27*/
0, 1, 5, 6, 7, 2, 3, 4, /* 28*/
1, 5, 6, 7, 0, 2, 3, 4, /* 29*/
0, 5, 6, 7, 1, 2, 3, 4, /* 30*/
5, 6, 7, 0, 1, 2, 3, 4, /* 31*/
0, 1, 2, 3, 4, 6, 7, 5, /* 32*/
1, 2, 3, 4, 6, 7, 0, 5, /* 33*/
0, 2, 3, 4, 6, 7, 1, 5, /* 34*/
2, 3, 4, 6, 7, 0, 1, 5, /* 35*/
0, 1, 3, 4, 6, 7, 2, 5, /* 36*/
1, 3, 4, 6, 7, 0, 2, 5, /* 37*/
0, 3, 4, 6, 7, 1, 2, 5, /* 38*/
3, 4, 6, 7, 0, 1, 2, 5, /* 39*/
0, 1, 2, 4, 6, 7, 3, 5, /* 40*/
1, 2, 4, 6, 7, 0, 3, 5, /* 41*/
0, 2, 4, 6, 7, 1, 3, 5, /* 42*/
2, 4, 6, 7, 0, 1, 3, 5, /* 43*/
0, 1, 4, 6, 7, 2, 3, 5, /* 44*/
1, 4, 6, 7, 0, 2, 3, 5, /* 45*/
0, 4, 6, 7, 1, 2, 3, 5, /* 46*/
4, 6, 7, 0, 1, 2, 3, 5, /* 47*/
0, 1, 2, 3, 6, 7, 4, 5, /* 48*/
1, 2, 3, 6, 7, 0, 4, 5, /* 49*/
0, 2, 3, 6, 7, 1, 4, 5, /* 50*/
2, 3, 6, 7, 0, 1, 4, 5, /* 51*/
0, 1, 3, 6, 7, 2, 4, 5, /* 52*/
1, 3, 6, 7, 0, 2, 4, 5, /* 53*/
0, 3, 6, 7, 1, 2, 4, 5, /* 54*/
3, 6, 7, 0, 1, 2, 4, 5, /* 55*/
0, 1, 2, 6, 7, 3, 4, 5, /* 56*/
1, 2, 6, 7, 0, 3, 4, 5, /* 57*/
0, 2, 6, 7, 1, 3, 4, 5, /* 58*/
2, 6, 7, 0, 1, 3, 4, 5, /* 59*/
0, 1, 6, 7, 2, 3, 4, 5, /* 60*/
1, 6, 7, 0, 2, 3, 4, 5, /* 61*/
0, 6, 7, 1, 2, 3, 4, 5, /* 62*/
6, 7, 0, 1, 2, 3, 4, 5, /* 63*/
0, 1, 2, 3, 4, 5, 7, 6, /* 64*/
1, 2, 3, 4, 5, 7, 0, 6, /* 65*/
0, 2, 3, 4, 5, 7, 1, 6, /* 66*/
2, 3, 4, 5, 7, 0, 1, 6, /* 67*/
0, 1, 3, 4, 5, 7, 2, 6, /* 68*/
1, 3, 4, 5, 7, 0, 2, 6, /* 69*/
0, 3, 4, 5, 7, 1, 2, 6, /* 70*/
3, 4, 5, 7, 0, 1, 2, 6, /* 71*/
0, 1, 2, 4, 5, 7, 3, 6, /* 72*/
1, 2, 4, 5, 7, 0, 3, 6, /* 73*/
0, 2, 4, 5, 7, 1, 3, 6, /* 74*/
2, 4, 5, 7, 0, 1, 3, 6, /* 75*/
0, 1, 4, 5, 7, 2, 3, 6, /* 76*/
1, 4, 5, 7, 0, 2, 3, 6, /* 77*/
0, 4, 5, 7, 1, 2, 3, 6, /* 78*/
4, 5, 7, 0, 1, 2, 3, 6, /* 79*/
0, 1, 2, 3, 5, 7, 4, 6, /* 80*/
1, 2, 3, 5, 7, 0, 4, 6, /* 81*/
0, 2, 3, 5, 7, 1, 4, 6, /* 82*/
2, 3, 5, 7, 0, 1, 4, 6, /* 83*/
0, 1, 3, 5, 7, 2, 4, 6, /* 84*/
1, 3, 5, 7, 0, 2, 4, 6, /* 85*/
0, 3, 5, 7, 1, 2, 4, 6, /* 86*/
3, 5, 7, 0, 1, 2, 4, 6, /* 87*/
0, 1, 2, 5, 7, 3, 4, 6, /* 88*/
1, 2, 5, 7, 0, 3, 4, 6, /* 89*/
0, 2, 5, 7, 1, 3, 4, 6, /* 90*/
2, 5, 7, 0, 1, 3, 4, 6, /* 91*/
0, 1, 5, 7, 2, 3, 4, 6, /* 92*/
1, 5, 7, 0, 2, 3, 4, 6, /* 93*/
0, 5, 7, 1, 2, 3, 4, 6, /* 94*/
5, 7, 0, 1, 2, 3, 4, 6, /* 95*/
0, 1, 2, 3, 4, 7, 5, 6, /* 96*/
1, 2, 3, 4, 7, 0, 5, 6, /* 97*/
0, 2, 3, 4, 7, 1, 5, 6, /* 98*/
2, 3, 4, 7, 0, 1, 5, 6, /* 99*/
0, 1, 3, 4, 7, 2, 5, 6, /* 100*/
1, 3, 4, 7, 0, 2, 5, 6, /* 101*/
0, 3, 4, 7, 1, 2, 5, 6, /* 102*/
3, 4, 7, 0, 1, 2, 5, 6, /* 103*/
0, 1, 2, 4, 7, 3, 5, 6, /* 104*/
1, 2, 4, 7, 0, 3, 5, 6, /* 105*/
0, 2, 4, 7, 1, 3, 5, 6, /* 106*/
2, 4, 7, 0, 1, 3, 5, 6, /* 107*/
0, 1, 4, 7, 2, 3, 5, 6, /* 108*/
1, 4, 7, 0, 2, 3, 5, 6, /* 109*/
0, 4, 7, 1, 2, 3, 5, 6, /* 110*/
4, 7, 0, 1, 2, 3, 5, 6, /* 111*/
0, 1, 2, 3, 7, 4, 5, 6, /* 112*/
1, 2, 3, 7, 0, 4, 5, 6, /* 113*/
0, 2, 3, 7, 1, 4, 5, 6, /* 114*/
2, 3, 7, 0, 1, 4, 5, 6, /* 115*/
0, 1, 3, 7, 2, 4, 5, 6, /* 116*/
1, 3, 7, 0, 2, 4, 5, 6, /* 117*/
0, 3, 7, 1, 2, 4, 5, 6, /* 118*/
3, 7, 0, 1, 2, 4, 5, 6, /* 119*/
0, 1, 2, 7, 3, 4, 5, 6, /* 120*/
1, 2, 7, 0, 3, 4, 5, 6, /* 121*/
0, 2, 7, 1, 3, 4, 5, 6, /* 122*/
2, 7, 0, 1, 3, 4, 5, 6, /* 123*/
0, 1, 7, 2, 3, 4, 5, 6, /* 124*/
1, 7, 0, 2, 3, 4, 5, 6, /* 125*/
0, 7, 1, 2, 3, 4, 5, 6, /* 126*/
7, 0, 1, 2, 3, 4, 5, 6, /* 127*/
0, 1, 2, 3, 4, 5, 6, 7, /* 128*/
1, 2, 3, 4, 5, 6, 0, 7, /* 129*/
0, 2, 3, 4, 5, 6, 1, 7, /* 130*/
2, 3, 4, 5, 6, 0, 1, 7, /* 131*/
0, 1, 3, 4, 5, 6, 2, 7, /* 132*/
1, 3, 4, 5, 6, 0, 2, 7, /* 133*/
0, 3, 4, 5, 6, 1, 2, 7, /* 134*/
3, 4, 5, 6, 0, 1, 2, 7, /* 135*/
0, 1, 2, 4, 5, 6, 3, 7, /* 136*/
1, 2, 4, 5, 6, 0, 3, 7, /* 137*/
0, 2, 4, 5, 6, 1, 3, 7, /* 138*/
2, 4, 5, 6, 0, 1, 3, 7, /* 139*/
0, 1, 4, 5, 6, 2, 3, 7, /* 140*/
1, 4, 5, 6, 0, 2, 3, 7, /* 141*/
0, 4, 5, 6, 1, 2, 3, 7, /* 142*/
4, 5, 6, 0, 1, 2, 3, 7, /* 143*/
0, 1, 2, 3, 5, 6, 4, 7, /* 144*/
1, 2, 3, 5, 6, 0, 4, 7, /* 145*/
0, 2, 3, 5, 6, 1, 4, 7, /* 146*/
2, 3, 5, 6, 0, 1, 4, 7, /* 147*/
0, 1, 3, 5, 6, 2, 4, 7, /* 148*/
1, 3, 5, 6, 0, 2, 4, 7, /* 149*/
0, 3, 5, 6, 1, 2, 4, 7, /* 150*/
3, 5, 6, 0, 1, 2, 4, 7, /* 151*/
0, 1, 2, 5, 6, 3, 4, 7, /* 152*/
1, 2, 5, 6, 0, 3, 4, 7, /* 153*/
0, 2, 5, 6, 1, 3, 4, 7, /* 154*/
2, 5, 6, 0, 1, 3, 4, 7, /* 155*/
0, 1, 5, 6, 2, 3, 4, 7, /* 156*/
1, 5, 6, 0, 2, 3, 4, 7, /* 157*/
0, 5, 6, 1, 2, 3, 4, 7, /* 158*/
5, 6, 0, 1, 2, 3, 4, 7, /* 159*/
0, 1, 2, 3, 4, 6, 5, 7, /* 160*/
1, 2, 3, 4, 6, 0, 5, 7, /* 161*/
0, 2, 3, 4, 6, 1, 5, 7, /* 162*/
2, 3, 4, 6, 0, 1, 5, 7, /* 163*/
0, 1, 3, 4, 6, 2, 5, 7, /* 164*/
1, 3, 4, 6, 0, 2, 5, 7, /* 165*/
0, 3, 4, 6, 1, 2, 5, 7, /* 166*/
3, 4, 6, 0, 1, 2, 5, 7, /* 167*/
0, 1, 2, 4, 6, 3, 5, 7, /* 168*/
1, 2, 4, 6, 0, 3, 5, 7, /* 169*/
0, 2, 4, 6, 1, 3, 5, 7, /* 170*/
2, 4, 6, 0, 1, 3, 5, 7, /* 171*/
0, 1, 4, 6, 2, 3, 5, 7, /* 172*/
1, 4, 6, 0, 2, 3, 5, 7, /* 173*/
0, 4, 6, 1, 2, 3, 5, 7, /* 174*/
4, 6, 0, 1, 2, 3, 5, 7, /* 175*/
0, 1, 2, 3, 6, 4, 5, 7, /* 176*/
1, 2, 3, 6, 0, 4, 5, 7, /* 177*/
0, 2, 3, 6, 1, 4, 5, 7, /* 178*/
2, 3, 6, 0, 1, 4, 5, 7, /* 179*/
0, 1, 3, 6, 2, 4, 5, 7, /* 180*/
1, 3, 6, 0, 2, 4, 5, 7, /* 181*/
0, 3, 6, 1, 2, 4, 5, 7, /* 182*/
3, 6, 0, 1, 2, 4, 5, 7, /* 183*/
0, 1, 2, 6, 3, 4, 5, 7, /* 184*/
1, 2, 6, 0, 3, 4, 5, 7, /* 185*/
0, 2, 6, 1, 3, 4, 5, 7, /* 186*/
2, 6, 0, 1, 3, 4, 5, 7, /* 187*/
0, 1, 6, 2, 3, 4, 5, 7, /* 188*/
1, 6, 0, 2, 3, 4, 5, 7, /* 189*/
0, 6, 1, 2, 3, 4, 5, 7, /* 190*/
6, 0, 1, 2, 3, 4, 5, 7, /* 191*/
0, 1, 2, 3, 4, 5, 6, 7, /* 192*/
1, 2, 3, 4, 5, 0, 6, 7, /* 193*/
0, 2, 3, 4, 5, 1, 6, 7, /* 194*/
2, 3, 4, 5, 0, 1, 6, 7, /* 195*/
0, 1, 3, 4, 5, 2, 6, 7, /* 196*/
1, 3, 4, 5, 0, 2, 6, 7, /* 197*/
0, 3, 4, 5, 1, 2, 6, 7, /* 198*/
3, 4, 5, 0, 1, 2, 6, 7, /* 199*/
0, 1, 2, 4, 5, 3, 6, 7, /* 200*/
1, 2, 4, 5, 0, 3, 6, 7, /* 201*/
0, 2, 4, 5, 1, 3, 6, 7, /* 202*/
2, 4, 5, 0, 1, 3, 6, 7, /* 203*/
0, 1, 4, 5, 2, 3, 6, 7, /* 204*/
1, 4, 5, 0, 2, 3, 6, 7, /* 205*/
0, 4, 5, 1, 2, 3, 6, 7, /* 206*/
4, 5, 0, 1, 2, 3, 6, 7, /* 207*/
0, 1, 2, 3, 5, 4, 6, 7, /* 208*/
1, 2, 3, 5, 0, 4, 6, 7, /* 209*/
0, 2, 3, 5, 1, 4, 6, 7, /* 210*/
2, 3, 5, 0, 1, 4, 6, 7, /* 211*/
0, 1, 3, 5, 2, 4, 6, 7, /* 212*/
1, 3, 5, 0, 2, 4, 6, 7, /* 213*/
0, 3, 5, 1, 2, 4, 6, 7, /* 214*/
3, 5, 0, 1, 2, 4, 6, 7, /* 215*/
0, 1, 2, 5, 3, 4, 6, 7, /* 216*/
1, 2, 5, 0, 3, 4, 6, 7, /* 217*/
0, 2, 5, 1, 3, 4, 6, 7, /* 218*/
2, 5, 0, 1, 3, 4, 6, 7, /* 219*/
0, 1, 5, 2, 3, 4, 6, 7, /* 220*/
1, 5, 0, 2, 3, 4, 6, 7, /* 221*/
0, 5, 1, 2, 3, 4, 6, 7, /* 222*/
5, 0, 1, 2, 3, 4, 6, 7, /* 223*/
0, 1, 2, 3, 4, 5, 6, 7, /* 224*/
1, 2, 3, 4, 0, 5, 6, 7, /* 225*/
0, 2, 3, 4, 1, 5, 6, 7, /* 226*/
2, 3, 4, 0, 1, 5, 6, 7, /* 227*/
0, 1, 3, 4, 2, 5, 6, 7, /* 228*/
1, 3, 4, 0, 2, 5, 6, 7, /* 229*/
0, 3, 4, 1, 2, 5, 6, 7, /* 230*/
3, 4, 0, 1, 2, 5, 6, 7, /* 231*/
0, 1, 2, 4, 3, 5, 6, 7, /* 232*/
1, 2, 4, 0, 3, 5, 6, 7, /* 233*/
0, 2, 4, 1, 3, 5, 6, 7, /* 234*/
2, 4, 0, 1, 3, 5, 6, 7, /* 235*/
0, 1, 4, 2, 3, 5, 6, 7, /* 236*/
1, 4, 0, 2, 3, 5, 6, 7, /* 237*/
0, 4, 1, 2, 3, 5, 6, 7, /* 238*/
4, 0, 1, 2, 3, 5, 6, 7, /* 239*/
0, 1, 2, 3, 4, 5, 6, 7, /* 240*/
1, 2, 3, 0, 4, 5, 6, 7, /* 241*/
0, 2, 3, 1, 4, 5, 6, 7, /* 242*/
2, 3, 0, 1, 4, 5, 6, 7, /* 243*/
0, 1, 3, 2, 4, 5, 6, 7, /* 244*/
1, 3, 0, 2, 4, 5, 6, 7, /* 245*/
0, 3, 1, 2, 4, 5, 6, 7, /* 246*/
3, 0, 1, 2, 4, 5, 6, 7, /* 247*/
0, 1, 2, 3, 4, 5, 6, 7, /* 248*/
1, 2, 0, 3, 4, 5, 6, 7, /* 249*/
0, 2, 1, 3, 4, 5, 6, 7, /* 250*/
2, 0, 1, 3, 4, 5, 6, 7, /* 251*/
0, 1, 2, 3, 4, 5, 6, 7, /* 252*/
1, 0, 2, 3, 4, 5, 6, 7, /* 253*/
0, 1, 2, 3, 4, 5, 6, 7, /* 254*/
0, 1, 2, 3, 4, 5, 6, 7, /* 255*/
};
static FORCE_INLINE __m256i get_permutation_vector(int pvbyte) {
#ifdef BYTE_PATTERN
__m256i shufm;
asm volatile (
"vpmovzxbd (%1), %0"
: "=X" (shufm)
: "r" (reverseshufflemask + 8 * pvbyte)
);
return shufm;
#else
return _mm256_load_si256((__m256i *)(reverseshufflemask + 8 * pvbyte));
#endif
}
static uint32_t avx_pivot_on_last_value(int32_t *array, size_t length) {
/* we run through the data. Anything in [0,boundary) is smaller or equal
* than the pivot, and the value at boundary - 1 is going to be equal to the
* pivot at the end,
* anything in (boundary, i) is greater than the pivot
* stuff in [i,...) is grey
* the function returns the location of the boundary.
*/
if (length <= 1)
return 1;
{ // we exchange the last value for the middle value for a better pivot
int32_t ival = array[length / 2];
int32_t bval = array[length - 1];
array[length / 2] = bval;
array[length - 1] = ival;
}
#if WITH_RUNTIME_STATS
statistics.partition_calls += 1;
statistics.items_processed += length;
#endif
uint32_t boundary = 0;
uint32_t i = 0;
int32_t pivot = array[length - 1]; // we always pick the pivot at the end
const __m256i P = _mm256_set1_epi32(pivot);
while ( i + 8 + 1 <= length) {
__m256i allgrey = _mm256_lddqu_si256((__m256i *)(array + i));
int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(allgrey, P));
#if WITH_RUNTIME_STATS && defined(WITH_PVBYTE_HISTOGRAM)
statistics.pvbyte_histogram.hit(pvbyte);
#endif
if(pvbyte == 0) { // might be frequent
i += 8; //nothing to do
boundary = i;
} else if (pvbyte == 0xFF) { // called once
boundary = i;
i += 8;
break; // exit
} else {
// hot path
switch (pvbyte) {
// for pvbyte = 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff
// there is no change in order, just advance boundary
// Note: case 0x00 & 0xff are already handled
case 0x80: i += 8 - 1; break;
case 0xc0: i += 8 - 2; break;
case 0xe0: i += 8 - 3; break;
case 0xf0: i += 8 - 4; break;
case 0xf8: i += 8 - 5; break;
case 0xfc: i += 8 - 6; break;
case 0xfe: i += 8 - 7; break;
default: {
uint32_t cnt =
8 - _mm_popcnt_u32(pvbyte); // might be faster with table look-up?
__m256i blackthenwhite = _mm256_permutevar8x32_epi32(allgrey, get_permutation_vector(pvbyte));
_mm256_storeu_si256((__m256i *)(array + i), blackthenwhite);
i += cnt;
}
} // switch
boundary = i; // this doesn't need updating each and every time
}
}
for (; i + 8 + 1 <= length ;) {
__m256i allgrey =
_mm256_lddqu_si256((__m256i *)(array + i)); // this is all grey
int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(allgrey, P));
if (pvbyte == 0xFF) { // called once
// nothing to do
} else {
uint32_t cnt =
8 - _mm_popcnt_u32(pvbyte); // might be faster with table look-up?
__m256i allwhite = _mm256_lddqu_si256(
(__m256i *)(array + boundary)); // this is all white
// we shuffle allgrey so that the first part is black and the second part
// is white
__m256i blackthenwhite = _mm256_permutevar8x32_epi32(allgrey, get_permutation_vector(pvbyte));
_mm256_storeu_si256((__m256i *)(array + boundary), blackthenwhite);
_mm256_storeu_si256((__m256i *)(array + i), allwhite);
boundary += cnt; // might be faster with table look-up?
}
i += 8;
}
while (i + 1 < length) {
int32_t ival = array[i];
if (ival <= pivot) {
int32_t bval = array[boundary];
array[i] = bval;
array[boundary] = ival;
boundary++;
}
i++;
}
int32_t ival = array[i];
int32_t bval = array[boundary];
array[length - 1] = bval;
array[boundary] = ival;
boundary++;
return boundary;
}
// for fallback
void scalar_partition(int32_t* array, const int32_t pivot, int& left, int& right) {
while (left <= right) {
while (array[left] < pivot) {
left += 1;
}
while (array[right] > pivot) {
right -= 1;
}
if (left <= right) {
const uint32_t t = array[left];
array[left] = array[right];
array[right] = t;
left += 1;
right -= 1;
}
}
}
//fallback
void scalar_quicksort(int32_t* array, int left, int right) {
#ifdef WITH_RUNTIME_STATS
statistics.scalar__partition_calls += 1;
statistics.scalar__items_processed += right - left + 1;
#endif
int i = left;
int j = right;
const int32_t pivot = array[(i + j)/2];
scalar_partition(array, pivot, i, j);
if (left < j) {
scalar_quicksort(array, left, j);
}
if (i < right) {
scalar_quicksort(array, i, right);
}
}
void avx2_pivotonlast_sort(int32_t *array, const uint32_t length) {
uint32_t sep = avx_pivot_on_last_value(array, length);
if(sep == length) {
// we have an ineffective pivot. Let us give up.
if(length > 1) scalar_quicksort(array,0,length - 1);
} else {
if (sep > 2) {
avx2_pivotonlast_sort(array, sep - 1);
}
if (sep + 1 < length) {
avx2_pivotonlast_sort(array + sep, length - sep);
}
}
}
void wrapped_avx2_pivotonlast_sort(uint32_t *array, int left, int right) {
avx2_pivotonlast_sort((int32_t *)array + left, right - left + 1);
}

View File

@ -0,0 +1,436 @@
#pragma once
#include <x86intrin.h>
#include <cstdint>
namespace nate {
const int AVX2_REGISTER_SIZE = 8; // in 32-bit words
/*
Algorithm of vectorized partition by Nathan Kurz.
Everything else was borrowed from Daniel's code (avx2-altquicksort.h)
*/
static uint32_t reverseshufflemask[256 * 8] __attribute__((aligned(0x100))) = {
0, 1, 2, 3, 4, 5, 6, 7, /* 0*/
1, 2, 3, 4, 5, 6, 7, 0, /* 1*/
0, 2, 3, 4, 5, 6, 7, 1, /* 2*/
2, 3, 4, 5, 6, 7, 0, 1, /* 3*/
0, 1, 3, 4, 5, 6, 7, 2, /* 4*/
1, 3, 4, 5, 6, 7, 0, 2, /* 5*/
0, 3, 4, 5, 6, 7, 1, 2, /* 6*/
3, 4, 5, 6, 7, 0, 1, 2, /* 7*/
0, 1, 2, 4, 5, 6, 7, 3, /* 8*/
1, 2, 4, 5, 6, 7, 0, 3, /* 9*/
0, 2, 4, 5, 6, 7, 1, 3, /* 10*/
2, 4, 5, 6, 7, 0, 1, 3, /* 11*/
0, 1, 4, 5, 6, 7, 2, 3, /* 12*/
1, 4, 5, 6, 7, 0, 2, 3, /* 13*/
0, 4, 5, 6, 7, 1, 2, 3, /* 14*/
4, 5, 6, 7, 0, 1, 2, 3, /* 15*/
0, 1, 2, 3, 5, 6, 7, 4, /* 16*/
1, 2, 3, 5, 6, 7, 0, 4, /* 17*/
0, 2, 3, 5, 6, 7, 1, 4, /* 18*/
2, 3, 5, 6, 7, 0, 1, 4, /* 19*/
0, 1, 3, 5, 6, 7, 2, 4, /* 20*/
1, 3, 5, 6, 7, 0, 2, 4, /* 21*/
0, 3, 5, 6, 7, 1, 2, 4, /* 22*/
3, 5, 6, 7, 0, 1, 2, 4, /* 23*/
0, 1, 2, 5, 6, 7, 3, 4, /* 24*/
1, 2, 5, 6, 7, 0, 3, 4, /* 25*/
0, 2, 5, 6, 7, 1, 3, 4, /* 26*/
2, 5, 6, 7, 0, 1, 3, 4, /* 27*/
0, 1, 5, 6, 7, 2, 3, 4, /* 28*/
1, 5, 6, 7, 0, 2, 3, 4, /* 29*/
0, 5, 6, 7, 1, 2, 3, 4, /* 30*/
5, 6, 7, 0, 1, 2, 3, 4, /* 31*/
0, 1, 2, 3, 4, 6, 7, 5, /* 32*/
1, 2, 3, 4, 6, 7, 0, 5, /* 33*/
0, 2, 3, 4, 6, 7, 1, 5, /* 34*/
2, 3, 4, 6, 7, 0, 1, 5, /* 35*/
0, 1, 3, 4, 6, 7, 2, 5, /* 36*/
1, 3, 4, 6, 7, 0, 2, 5, /* 37*/
0, 3, 4, 6, 7, 1, 2, 5, /* 38*/
3, 4, 6, 7, 0, 1, 2, 5, /* 39*/
0, 1, 2, 4, 6, 7, 3, 5, /* 40*/
1, 2, 4, 6, 7, 0, 3, 5, /* 41*/
0, 2, 4, 6, 7, 1, 3, 5, /* 42*/
2, 4, 6, 7, 0, 1, 3, 5, /* 43*/
0, 1, 4, 6, 7, 2, 3, 5, /* 44*/
1, 4, 6, 7, 0, 2, 3, 5, /* 45*/
0, 4, 6, 7, 1, 2, 3, 5, /* 46*/
4, 6, 7, 0, 1, 2, 3, 5, /* 47*/
0, 1, 2, 3, 6, 7, 4, 5, /* 48*/
1, 2, 3, 6, 7, 0, 4, 5, /* 49*/
0, 2, 3, 6, 7, 1, 4, 5, /* 50*/
2, 3, 6, 7, 0, 1, 4, 5, /* 51*/
0, 1, 3, 6, 7, 2, 4, 5, /* 52*/
1, 3, 6, 7, 0, 2, 4, 5, /* 53*/
0, 3, 6, 7, 1, 2, 4, 5, /* 54*/
3, 6, 7, 0, 1, 2, 4, 5, /* 55*/
0, 1, 2, 6, 7, 3, 4, 5, /* 56*/
1, 2, 6, 7, 0, 3, 4, 5, /* 57*/
0, 2, 6, 7, 1, 3, 4, 5, /* 58*/
2, 6, 7, 0, 1, 3, 4, 5, /* 59*/
0, 1, 6, 7, 2, 3, 4, 5, /* 60*/
1, 6, 7, 0, 2, 3, 4, 5, /* 61*/
0, 6, 7, 1, 2, 3, 4, 5, /* 62*/
6, 7, 0, 1, 2, 3, 4, 5, /* 63*/
0, 1, 2, 3, 4, 5, 7, 6, /* 64*/
1, 2, 3, 4, 5, 7, 0, 6, /* 65*/
0, 2, 3, 4, 5, 7, 1, 6, /* 66*/
2, 3, 4, 5, 7, 0, 1, 6, /* 67*/
0, 1, 3, 4, 5, 7, 2, 6, /* 68*/
1, 3, 4, 5, 7, 0, 2, 6, /* 69*/
0, 3, 4, 5, 7, 1, 2, 6, /* 70*/
3, 4, 5, 7, 0, 1, 2, 6, /* 71*/
0, 1, 2, 4, 5, 7, 3, 6, /* 72*/
1, 2, 4, 5, 7, 0, 3, 6, /* 73*/
0, 2, 4, 5, 7, 1, 3, 6, /* 74*/
2, 4, 5, 7, 0, 1, 3, 6, /* 75*/
0, 1, 4, 5, 7, 2, 3, 6, /* 76*/
1, 4, 5, 7, 0, 2, 3, 6, /* 77*/
0, 4, 5, 7, 1, 2, 3, 6, /* 78*/
4, 5, 7, 0, 1, 2, 3, 6, /* 79*/
0, 1, 2, 3, 5, 7, 4, 6, /* 80*/
1, 2, 3, 5, 7, 0, 4, 6, /* 81*/
0, 2, 3, 5, 7, 1, 4, 6, /* 82*/
2, 3, 5, 7, 0, 1, 4, 6, /* 83*/
0, 1, 3, 5, 7, 2, 4, 6, /* 84*/
1, 3, 5, 7, 0, 2, 4, 6, /* 85*/
0, 3, 5, 7, 1, 2, 4, 6, /* 86*/
3, 5, 7, 0, 1, 2, 4, 6, /* 87*/
0, 1, 2, 5, 7, 3, 4, 6, /* 88*/
1, 2, 5, 7, 0, 3, 4, 6, /* 89*/
0, 2, 5, 7, 1, 3, 4, 6, /* 90*/
2, 5, 7, 0, 1, 3, 4, 6, /* 91*/
0, 1, 5, 7, 2, 3, 4, 6, /* 92*/
1, 5, 7, 0, 2, 3, 4, 6, /* 93*/
0, 5, 7, 1, 2, 3, 4, 6, /* 94*/
5, 7, 0, 1, 2, 3, 4, 6, /* 95*/
0, 1, 2, 3, 4, 7, 5, 6, /* 96*/
1, 2, 3, 4, 7, 0, 5, 6, /* 97*/
0, 2, 3, 4, 7, 1, 5, 6, /* 98*/
2, 3, 4, 7, 0, 1, 5, 6, /* 99*/
0, 1, 3, 4, 7, 2, 5, 6, /* 100*/
1, 3, 4, 7, 0, 2, 5, 6, /* 101*/
0, 3, 4, 7, 1, 2, 5, 6, /* 102*/
3, 4, 7, 0, 1, 2, 5, 6, /* 103*/
0, 1, 2, 4, 7, 3, 5, 6, /* 104*/
1, 2, 4, 7, 0, 3, 5, 6, /* 105*/
0, 2, 4, 7, 1, 3, 5, 6, /* 106*/
2, 4, 7, 0, 1, 3, 5, 6, /* 107*/
0, 1, 4, 7, 2, 3, 5, 6, /* 108*/
1, 4, 7, 0, 2, 3, 5, 6, /* 109*/
0, 4, 7, 1, 2, 3, 5, 6, /* 110*/
4, 7, 0, 1, 2, 3, 5, 6, /* 111*/
0, 1, 2, 3, 7, 4, 5, 6, /* 112*/
1, 2, 3, 7, 0, 4, 5, 6, /* 113*/
0, 2, 3, 7, 1, 4, 5, 6, /* 114*/
2, 3, 7, 0, 1, 4, 5, 6, /* 115*/
0, 1, 3, 7, 2, 4, 5, 6, /* 116*/
1, 3, 7, 0, 2, 4, 5, 6, /* 117*/
0, 3, 7, 1, 2, 4, 5, 6, /* 118*/
3, 7, 0, 1, 2, 4, 5, 6, /* 119*/
0, 1, 2, 7, 3, 4, 5, 6, /* 120*/
1, 2, 7, 0, 3, 4, 5, 6, /* 121*/
0, 2, 7, 1, 3, 4, 5, 6, /* 122*/
2, 7, 0, 1, 3, 4, 5, 6, /* 123*/
0, 1, 7, 2, 3, 4, 5, 6, /* 124*/
1, 7, 0, 2, 3, 4, 5, 6, /* 125*/
0, 7, 1, 2, 3, 4, 5, 6, /* 126*/
7, 0, 1, 2, 3, 4, 5, 6, /* 127*/
0, 1, 2, 3, 4, 5, 6, 7, /* 128*/
1, 2, 3, 4, 5, 6, 0, 7, /* 129*/
0, 2, 3, 4, 5, 6, 1, 7, /* 130*/
2, 3, 4, 5, 6, 0, 1, 7, /* 131*/
0, 1, 3, 4, 5, 6, 2, 7, /* 132*/
1, 3, 4, 5, 6, 0, 2, 7, /* 133*/
0, 3, 4, 5, 6, 1, 2, 7, /* 134*/
3, 4, 5, 6, 0, 1, 2, 7, /* 135*/
0, 1, 2, 4, 5, 6, 3, 7, /* 136*/
1, 2, 4, 5, 6, 0, 3, 7, /* 137*/
0, 2, 4, 5, 6, 1, 3, 7, /* 138*/
2, 4, 5, 6, 0, 1, 3, 7, /* 139*/
0, 1, 4, 5, 6, 2, 3, 7, /* 140*/
1, 4, 5, 6, 0, 2, 3, 7, /* 141*/
0, 4, 5, 6, 1, 2, 3, 7, /* 142*/
4, 5, 6, 0, 1, 2, 3, 7, /* 143*/
0, 1, 2, 3, 5, 6, 4, 7, /* 144*/
1, 2, 3, 5, 6, 0, 4, 7, /* 145*/
0, 2, 3, 5, 6, 1, 4, 7, /* 146*/
2, 3, 5, 6, 0, 1, 4, 7, /* 147*/
0, 1, 3, 5, 6, 2, 4, 7, /* 148*/
1, 3, 5, 6, 0, 2, 4, 7, /* 149*/
0, 3, 5, 6, 1, 2, 4, 7, /* 150*/
3, 5, 6, 0, 1, 2, 4, 7, /* 151*/
0, 1, 2, 5, 6, 3, 4, 7, /* 152*/
1, 2, 5, 6, 0, 3, 4, 7, /* 153*/
0, 2, 5, 6, 1, 3, 4, 7, /* 154*/
2, 5, 6, 0, 1, 3, 4, 7, /* 155*/
0, 1, 5, 6, 2, 3, 4, 7, /* 156*/
1, 5, 6, 0, 2, 3, 4, 7, /* 157*/
0, 5, 6, 1, 2, 3, 4, 7, /* 158*/
5, 6, 0, 1, 2, 3, 4, 7, /* 159*/
0, 1, 2, 3, 4, 6, 5, 7, /* 160*/
1, 2, 3, 4, 6, 0, 5, 7, /* 161*/
0, 2, 3, 4, 6, 1, 5, 7, /* 162*/
2, 3, 4, 6, 0, 1, 5, 7, /* 163*/
0, 1, 3, 4, 6, 2, 5, 7, /* 164*/
1, 3, 4, 6, 0, 2, 5, 7, /* 165*/
0, 3, 4, 6, 1, 2, 5, 7, /* 166*/
3, 4, 6, 0, 1, 2, 5, 7, /* 167*/
0, 1, 2, 4, 6, 3, 5, 7, /* 168*/
1, 2, 4, 6, 0, 3, 5, 7, /* 169*/
0, 2, 4, 6, 1, 3, 5, 7, /* 170*/
2, 4, 6, 0, 1, 3, 5, 7, /* 171*/
0, 1, 4, 6, 2, 3, 5, 7, /* 172*/
1, 4, 6, 0, 2, 3, 5, 7, /* 173*/
0, 4, 6, 1, 2, 3, 5, 7, /* 174*/
4, 6, 0, 1, 2, 3, 5, 7, /* 175*/
0, 1, 2, 3, 6, 4, 5, 7, /* 176*/
1, 2, 3, 6, 0, 4, 5, 7, /* 177*/
0, 2, 3, 6, 1, 4, 5, 7, /* 178*/
2, 3, 6, 0, 1, 4, 5, 7, /* 179*/
0, 1, 3, 6, 2, 4, 5, 7, /* 180*/
1, 3, 6, 0, 2, 4, 5, 7, /* 181*/
0, 3, 6, 1, 2, 4, 5, 7, /* 182*/
3, 6, 0, 1, 2, 4, 5, 7, /* 183*/
0, 1, 2, 6, 3, 4, 5, 7, /* 184*/
1, 2, 6, 0, 3, 4, 5, 7, /* 185*/
0, 2, 6, 1, 3, 4, 5, 7, /* 186*/
2, 6, 0, 1, 3, 4, 5, 7, /* 187*/
0, 1, 6, 2, 3, 4, 5, 7, /* 188*/
1, 6, 0, 2, 3, 4, 5, 7, /* 189*/
0, 6, 1, 2, 3, 4, 5, 7, /* 190*/
6, 0, 1, 2, 3, 4, 5, 7, /* 191*/
0, 1, 2, 3, 4, 5, 6, 7, /* 192*/
1, 2, 3, 4, 5, 0, 6, 7, /* 193*/
0, 2, 3, 4, 5, 1, 6, 7, /* 194*/
2, 3, 4, 5, 0, 1, 6, 7, /* 195*/
0, 1, 3, 4, 5, 2, 6, 7, /* 196*/
1, 3, 4, 5, 0, 2, 6, 7, /* 197*/
0, 3, 4, 5, 1, 2, 6, 7, /* 198*/
3, 4, 5, 0, 1, 2, 6, 7, /* 199*/
0, 1, 2, 4, 5, 3, 6, 7, /* 200*/
1, 2, 4, 5, 0, 3, 6, 7, /* 201*/
0, 2, 4, 5, 1, 3, 6, 7, /* 202*/
2, 4, 5, 0, 1, 3, 6, 7, /* 203*/
0, 1, 4, 5, 2, 3, 6, 7, /* 204*/
1, 4, 5, 0, 2, 3, 6, 7, /* 205*/
0, 4, 5, 1, 2, 3, 6, 7, /* 206*/
4, 5, 0, 1, 2, 3, 6, 7, /* 207*/
0, 1, 2, 3, 5, 4, 6, 7, /* 208*/
1, 2, 3, 5, 0, 4, 6, 7, /* 209*/
0, 2, 3, 5, 1, 4, 6, 7, /* 210*/
2, 3, 5, 0, 1, 4, 6, 7, /* 211*/
0, 1, 3, 5, 2, 4, 6, 7, /* 212*/
1, 3, 5, 0, 2, 4, 6, 7, /* 213*/
0, 3, 5, 1, 2, 4, 6, 7, /* 214*/
3, 5, 0, 1, 2, 4, 6, 7, /* 215*/
0, 1, 2, 5, 3, 4, 6, 7, /* 216*/
1, 2, 5, 0, 3, 4, 6, 7, /* 217*/
0, 2, 5, 1, 3, 4, 6, 7, /* 218*/
2, 5, 0, 1, 3, 4, 6, 7, /* 219*/
0, 1, 5, 2, 3, 4, 6, 7, /* 220*/
1, 5, 0, 2, 3, 4, 6, 7, /* 221*/
0, 5, 1, 2, 3, 4, 6, 7, /* 222*/
5, 0, 1, 2, 3, 4, 6, 7, /* 223*/
0, 1, 2, 3, 4, 5, 6, 7, /* 224*/
1, 2, 3, 4, 0, 5, 6, 7, /* 225*/
0, 2, 3, 4, 1, 5, 6, 7, /* 226*/
2, 3, 4, 0, 1, 5, 6, 7, /* 227*/
0, 1, 3, 4, 2, 5, 6, 7, /* 228*/
1, 3, 4, 0, 2, 5, 6, 7, /* 229*/
0, 3, 4, 1, 2, 5, 6, 7, /* 230*/
3, 4, 0, 1, 2, 5, 6, 7, /* 231*/
0, 1, 2, 4, 3, 5, 6, 7, /* 232*/
1, 2, 4, 0, 3, 5, 6, 7, /* 233*/
0, 2, 4, 1, 3, 5, 6, 7, /* 234*/
2, 4, 0, 1, 3, 5, 6, 7, /* 235*/
0, 1, 4, 2, 3, 5, 6, 7, /* 236*/
1, 4, 0, 2, 3, 5, 6, 7, /* 237*/
0, 4, 1, 2, 3, 5, 6, 7, /* 238*/
4, 0, 1, 2, 3, 5, 6, 7, /* 239*/
0, 1, 2, 3, 4, 5, 6, 7, /* 240*/
1, 2, 3, 0, 4, 5, 6, 7, /* 241*/
0, 2, 3, 1, 4, 5, 6, 7, /* 242*/
2, 3, 0, 1, 4, 5, 6, 7, /* 243*/
0, 1, 3, 2, 4, 5, 6, 7, /* 244*/
1, 3, 0, 2, 4, 5, 6, 7, /* 245*/
0, 3, 1, 2, 4, 5, 6, 7, /* 246*/
3, 0, 1, 2, 4, 5, 6, 7, /* 247*/
0, 1, 2, 3, 4, 5, 6, 7, /* 248*/
1, 2, 0, 3, 4, 5, 6, 7, /* 249*/
0, 2, 1, 3, 4, 5, 6, 7, /* 250*/
2, 0, 1, 3, 4, 5, 6, 7, /* 251*/
0, 1, 2, 3, 4, 5, 6, 7, /* 252*/
1, 0, 2, 3, 4, 5, 6, 7, /* 253*/
0, 1, 2, 3, 4, 5, 6, 7, /* 254*/
0, 1, 2, 3, 4, 5, 6, 7, /* 255*/
};
static uint32_t avx_pivot_on_last_value(int32_t *array, size_t length) {
if (length <= 1)
return 1;
{ // we exchange the last value for the middle value for a better pivot
int32_t ival = array[length / 2];
int32_t bval = array[length - 1];
array[length / 2] = bval;
array[length - 1] = ival;
}
int wh, rh;
int wt, rt;
rh = 0;
rt = length - 1;
wh = 0;
wt = length - 1;
__m256i h0 = _mm256_loadu_si256((__m256i*)(array + rh));
__m256i h1 = _mm256_loadu_si256((__m256i*)(array + rh + AVX2_REGISTER_SIZE));
rh += 2*AVX2_REGISTER_SIZE;
__m256i t0 = _mm256_loadu_si256((__m256i*)(array + rt - AVX2_REGISTER_SIZE));
__m256i t1 = _mm256_loadu_si256((__m256i*)(array + rt - 2*AVX2_REGISTER_SIZE));
rt -= 3*AVX2_REGISTER_SIZE;
__m256i current = _mm256_loadu_si256((__m256i*)(array + rh));
__m256i next;
rh += AVX2_REGISTER_SIZE;
const int32_t pivot = array[length - 1];
const __m256i P = _mm256_set1_epi32(pivot);
// 1. the main loop
while (wh - wt < 4*AVX2_REGISTER_SIZE) {
const bool which = (rh - wh) < (wt - rt);
// I believe that a compiler will emit branchless code
__m256i* next_ptr = (__m256i*)(which ? array + rh : array + rt);
const int adv_rh = which ? AVX2_REGISTER_SIZE : 0;
const int adv_rt = which ? 0 : AVX2_REGISTER_SIZE;
// My faith ends here
next = _mm256_loadu_si256(next_ptr);
const int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(current, P));
const uint32_t cnt = 8 - _mm_popcnt_u32(pvbyte);
const __m256i shuf = _mm256_load_si256((__m256i *)(reverseshufflemask + 8 * pvbyte));
const __m256i ordered = _mm256_permutevar8x32_epi32(current, shuf);
_mm256_storeu_si256((__m256i*)(array + wh), ordered);
_mm256_storeu_si256((__m256i*)(array + wt - 8), ordered);
rh += adv_rh;
rt -= adv_rt;
wh += cnt;
wt -= 8 - cnt;
current = next;
}
// 2. partition remaining part
while (wh - wt > 4*AVX2_REGISTER_SIZE) {
const int32_t v = array[rh++];
if (v < pivot) {
array[wh++] = v;
} else {
array[wt--] = v;
}
}
// 3. partition 4 registers loaded in the beginning
static uint32_t tmp[4*AVX2_REGISTER_SIZE];
_mm256_storeu_si256((__m256i*)(tmp + 0*AVX2_REGISTER_SIZE), h0);
_mm256_storeu_si256((__m256i*)(tmp + 1*AVX2_REGISTER_SIZE), h1);
_mm256_storeu_si256((__m256i*)(tmp + 2*AVX2_REGISTER_SIZE), t0);
_mm256_storeu_si256((__m256i*)(tmp + 3*AVX2_REGISTER_SIZE), t1);
for (int i=0; i < 4*AVX2_REGISTER_SIZE; i++) {
const int32_t v = array[i];
if (v < pivot) {
array[wh++] = v;
} else {
array[wt--] = v;
}
}
{
const int32_t a = array[length - 1];
const int32_t b = array[wh];
array[length - 1] = b;
array[wh] = a;
}
return wh;
}
// for fallback
void scalar_partition(int32_t* array, const int32_t pivot, int& left, int& right) {
while (left <= right) {
while (array[left] < pivot) {
left += 1;
}
while (array[right] > pivot) {
right -= 1;
}
if (left <= right) {
const uint32_t t = array[left];
array[left] = array[right];
array[right] = t;
left += 1;
right -= 1;
}
}
}
//fallback
void scalar_quicksort(int32_t* array, int left, int right) {
#ifdef WITH_RUNTIME_STATS
statistics.scalar__partition_calls += 1;
statistics.scalar__items_processed += right - left + 1;
#endif
int i = left;
int j = right;
const int32_t pivot = array[(i + j)/2];
scalar_partition(array, pivot, i, j);
if (left < j) {
scalar_quicksort(array, left, j);
}
if (i < right) {
scalar_quicksort(array, i, right);
}
}
void avx2_pivotonlast_sort(int32_t *array, const uint32_t length) {
uint32_t sep;
if (length > 8*AVX2_REGISTER_SIZE) {
sep = avx_pivot_on_last_value(array, length);
} else {
sep = lomuto_partition_epi32((uint32_t*)array, 0, length - 1);
}
if(sep == length) {
// we have an ineffective pivot. Let us give up.
if(length > 1) scalar_quicksort(array,0,length - 1);
} else {
if (sep > 2) {
avx2_pivotonlast_sort(array, sep - 1);
}
if (sep + 1 < length) {
avx2_pivotonlast_sort(array + sep, length - sep);
}
}
}
void wrapped_avx2_pivotonlast_sort(uint32_t *array, int left, int right) {
avx2_pivotonlast_sort((int32_t *)array + left, right - left + 1);
}
} // namespace nate

View File

@ -0,0 +1,409 @@
#pragma once
#include <x86intrin.h>
#include <cstdint>
#include <memory.h>
#include "avx2-altquicksort.h"
/**
* This is an alternative approach to SIMD quicksort, based on a description by
* Nathan Kurz. Daniel Lemire implemented it to avoid the Dutch national flag
* problem as an experiment. This is not meant to be fast, as implemented. It is
* an experiment.
*/
/*
I'm going to use "vector" as a unit, but it doesn't have to be the
native vector size. In particular, I think we can read multiple
vector register's worth of data with minimal added time per loop
iteration.
First, read two vectors from the head of the list, and two from the
tail of the list. Store these, either in registers or written to a
buffer. We're not going to use these until the end. We're just
getting them out of the way so that we have more than one vectors
writeable space at the head and tail.
wh--rh------------------rt--wt-
wh = write head
rh = read head
rt = read tail
wt = write tail
Then read one vector from head, and advance rh accordingly. Call it nextVector.
Loop:
Set currentVector = nextVector.
Set nextVectorPtr to either rh or rt depending on which has the least
"writable space" available. That is, if (rh - wh) < (wt - rt), read
from and advance rh, else read from and update rt. These updates
need to be branchless.
Read nextVector = *nextVectorPtr.
Partition currentVector (lookup, shuffle) and write to both wh and wt
(tail offset so end hits wt).
Advance wh and wt based on number of valid entries. Use a second byte
lookup issued as early as possible to determine this count. Our
performance is dependent on the latency of this step.
-----
*/
static uint32_t shufflemask[256 * 8] __attribute__((aligned(0x100))) = {
0, 1, 2, 3, 4, 5, 6, 7, /* 0*/
0, 1, 2, 3, 4, 5, 6, 7, /* 1*/
1, 0, 2, 3, 4, 5, 6, 7, /* 2*/
0, 1, 2, 3, 4, 5, 6, 7, /* 3*/
2, 0, 1, 3, 4, 5, 6, 7, /* 4*/
0, 2, 1, 3, 4, 5, 6, 7, /* 5*/
1, 2, 0, 3, 4, 5, 6, 7, /* 6*/
0, 1, 2, 3, 4, 5, 6, 7, /* 7*/
3, 0, 1, 2, 4, 5, 6, 7, /* 8*/
0, 3, 1, 2, 4, 5, 6, 7, /* 9*/
1, 3, 0, 2, 4, 5, 6, 7, /* 10*/
0, 1, 3, 2, 4, 5, 6, 7, /* 11*/
2, 3, 0, 1, 4, 5, 6, 7, /* 12*/
0, 2, 3, 1, 4, 5, 6, 7, /* 13*/
1, 2, 3, 0, 4, 5, 6, 7, /* 14*/
0, 1, 2, 3, 4, 5, 6, 7, /* 15*/
4, 0, 1, 2, 3, 5, 6, 7, /* 16*/
0, 4, 1, 2, 3, 5, 6, 7, /* 17*/
1, 4, 0, 2, 3, 5, 6, 7, /* 18*/
0, 1, 4, 2, 3, 5, 6, 7, /* 19*/
2, 4, 0, 1, 3, 5, 6, 7, /* 20*/
0, 2, 4, 1, 3, 5, 6, 7, /* 21*/
1, 2, 4, 0, 3, 5, 6, 7, /* 22*/
0, 1, 2, 4, 3, 5, 6, 7, /* 23*/
3, 4, 0, 1, 2, 5, 6, 7, /* 24*/
0, 3, 4, 1, 2, 5, 6, 7, /* 25*/
1, 3, 4, 0, 2, 5, 6, 7, /* 26*/
0, 1, 3, 4, 2, 5, 6, 7, /* 27*/
2, 3, 4, 0, 1, 5, 6, 7, /* 28*/
0, 2, 3, 4, 1, 5, 6, 7, /* 29*/
1, 2, 3, 4, 0, 5, 6, 7, /* 30*/
0, 1, 2, 3, 4, 5, 6, 7, /* 31*/
5, 0, 1, 2, 3, 4, 6, 7, /* 32*/
0, 5, 1, 2, 3, 4, 6, 7, /* 33*/
1, 5, 0, 2, 3, 4, 6, 7, /* 34*/
0, 1, 5, 2, 3, 4, 6, 7, /* 35*/
2, 5, 0, 1, 3, 4, 6, 7, /* 36*/
0, 2, 5, 1, 3, 4, 6, 7, /* 37*/
1, 2, 5, 0, 3, 4, 6, 7, /* 38*/
0, 1, 2, 5, 3, 4, 6, 7, /* 39*/
3, 5, 0, 1, 2, 4, 6, 7, /* 40*/
0, 3, 5, 1, 2, 4, 6, 7, /* 41*/
1, 3, 5, 0, 2, 4, 6, 7, /* 42*/
0, 1, 3, 5, 2, 4, 6, 7, /* 43*/
2, 3, 5, 0, 1, 4, 6, 7, /* 44*/
0, 2, 3, 5, 1, 4, 6, 7, /* 45*/
1, 2, 3, 5, 0, 4, 6, 7, /* 46*/
0, 1, 2, 3, 5, 4, 6, 7, /* 47*/
4, 5, 0, 1, 2, 3, 6, 7, /* 48*/
0, 4, 5, 1, 2, 3, 6, 7, /* 49*/
1, 4, 5, 0, 2, 3, 6, 7, /* 50*/
0, 1, 4, 5, 2, 3, 6, 7, /* 51*/
2, 4, 5, 0, 1, 3, 6, 7, /* 52*/
0, 2, 4, 5, 1, 3, 6, 7, /* 53*/
1, 2, 4, 5, 0, 3, 6, 7, /* 54*/
0, 1, 2, 4, 5, 3, 6, 7, /* 55*/
3, 4, 5, 0, 1, 2, 6, 7, /* 56*/
0, 3, 4, 5, 1, 2, 6, 7, /* 57*/
1, 3, 4, 5, 0, 2, 6, 7, /* 58*/
0, 1, 3, 4, 5, 2, 6, 7, /* 59*/
2, 3, 4, 5, 0, 1, 6, 7, /* 60*/
0, 2, 3, 4, 5, 1, 6, 7, /* 61*/
1, 2, 3, 4, 5, 0, 6, 7, /* 62*/
0, 1, 2, 3, 4, 5, 6, 7, /* 63*/
6, 0, 1, 2, 3, 4, 5, 7, /* 64*/
0, 6, 1, 2, 3, 4, 5, 7, /* 65*/
1, 6, 0, 2, 3, 4, 5, 7, /* 66*/
0, 1, 6, 2, 3, 4, 5, 7, /* 67*/
2, 6, 0, 1, 3, 4, 5, 7, /* 68*/
0, 2, 6, 1, 3, 4, 5, 7, /* 69*/
1, 2, 6, 0, 3, 4, 5, 7, /* 70*/
0, 1, 2, 6, 3, 4, 5, 7, /* 71*/
3, 6, 0, 1, 2, 4, 5, 7, /* 72*/
0, 3, 6, 1, 2, 4, 5, 7, /* 73*/
1, 3, 6, 0, 2, 4, 5, 7, /* 74*/
0, 1, 3, 6, 2, 4, 5, 7, /* 75*/
2, 3, 6, 0, 1, 4, 5, 7, /* 76*/
0, 2, 3, 6, 1, 4, 5, 7, /* 77*/
1, 2, 3, 6, 0, 4, 5, 7, /* 78*/
0, 1, 2, 3, 6, 4, 5, 7, /* 79*/
4, 6, 0, 1, 2, 3, 5, 7, /* 80*/
0, 4, 6, 1, 2, 3, 5, 7, /* 81*/
1, 4, 6, 0, 2, 3, 5, 7, /* 82*/
0, 1, 4, 6, 2, 3, 5, 7, /* 83*/
2, 4, 6, 0, 1, 3, 5, 7, /* 84*/
0, 2, 4, 6, 1, 3, 5, 7, /* 85*/
1, 2, 4, 6, 0, 3, 5, 7, /* 86*/
0, 1, 2, 4, 6, 3, 5, 7, /* 87*/
3, 4, 6, 0, 1, 2, 5, 7, /* 88*/
0, 3, 4, 6, 1, 2, 5, 7, /* 89*/
1, 3, 4, 6, 0, 2, 5, 7, /* 90*/
0, 1, 3, 4, 6, 2, 5, 7, /* 91*/
2, 3, 4, 6, 0, 1, 5, 7, /* 92*/
0, 2, 3, 4, 6, 1, 5, 7, /* 93*/
1, 2, 3, 4, 6, 0, 5, 7, /* 94*/
0, 1, 2, 3, 4, 6, 5, 7, /* 95*/
5, 6, 0, 1, 2, 3, 4, 7, /* 96*/
0, 5, 6, 1, 2, 3, 4, 7, /* 97*/
1, 5, 6, 0, 2, 3, 4, 7, /* 98*/
0, 1, 5, 6, 2, 3, 4, 7, /* 99*/
2, 5, 6, 0, 1, 3, 4, 7, /* 100*/
0, 2, 5, 6, 1, 3, 4, 7, /* 101*/
1, 2, 5, 6, 0, 3, 4, 7, /* 102*/
0, 1, 2, 5, 6, 3, 4, 7, /* 103*/
3, 5, 6, 0, 1, 2, 4, 7, /* 104*/
0, 3, 5, 6, 1, 2, 4, 7, /* 105*/
1, 3, 5, 6, 0, 2, 4, 7, /* 106*/
0, 1, 3, 5, 6, 2, 4, 7, /* 107*/
2, 3, 5, 6, 0, 1, 4, 7, /* 108*/
0, 2, 3, 5, 6, 1, 4, 7, /* 109*/
1, 2, 3, 5, 6, 0, 4, 7, /* 110*/
0, 1, 2, 3, 5, 6, 4, 7, /* 111*/
4, 5, 6, 0, 1, 2, 3, 7, /* 112*/
0, 4, 5, 6, 1, 2, 3, 7, /* 113*/
1, 4, 5, 6, 0, 2, 3, 7, /* 114*/
0, 1, 4, 5, 6, 2, 3, 7, /* 115*/
2, 4, 5, 6, 0, 1, 3, 7, /* 116*/
0, 2, 4, 5, 6, 1, 3, 7, /* 117*/
1, 2, 4, 5, 6, 0, 3, 7, /* 118*/
0, 1, 2, 4, 5, 6, 3, 7, /* 119*/
3, 4, 5, 6, 0, 1, 2, 7, /* 120*/
0, 3, 4, 5, 6, 1, 2, 7, /* 121*/
1, 3, 4, 5, 6, 0, 2, 7, /* 122*/
0, 1, 3, 4, 5, 6, 2, 7, /* 123*/
2, 3, 4, 5, 6, 0, 1, 7, /* 124*/
0, 2, 3, 4, 5, 6, 1, 7, /* 125*/
1, 2, 3, 4, 5, 6, 0, 7, /* 126*/
0, 1, 2, 3, 4, 5, 6, 7, /* 127*/
7, 0, 1, 2, 3, 4, 5, 6, /* 128*/
0, 7, 1, 2, 3, 4, 5, 6, /* 129*/
1, 7, 0, 2, 3, 4, 5, 6, /* 130*/
0, 1, 7, 2, 3, 4, 5, 6, /* 131*/
2, 7, 0, 1, 3, 4, 5, 6, /* 132*/
0, 2, 7, 1, 3, 4, 5, 6, /* 133*/
1, 2, 7, 0, 3, 4, 5, 6, /* 134*/
0, 1, 2, 7, 3, 4, 5, 6, /* 135*/
3, 7, 0, 1, 2, 4, 5, 6, /* 136*/
0, 3, 7, 1, 2, 4, 5, 6, /* 137*/
1, 3, 7, 0, 2, 4, 5, 6, /* 138*/
0, 1, 3, 7, 2, 4, 5, 6, /* 139*/
2, 3, 7, 0, 1, 4, 5, 6, /* 140*/
0, 2, 3, 7, 1, 4, 5, 6, /* 141*/
1, 2, 3, 7, 0, 4, 5, 6, /* 142*/
0, 1, 2, 3, 7, 4, 5, 6, /* 143*/
4, 7, 0, 1, 2, 3, 5, 6, /* 144*/
0, 4, 7, 1, 2, 3, 5, 6, /* 145*/
1, 4, 7, 0, 2, 3, 5, 6, /* 146*/
0, 1, 4, 7, 2, 3, 5, 6, /* 147*/
2, 4, 7, 0, 1, 3, 5, 6, /* 148*/
0, 2, 4, 7, 1, 3, 5, 6, /* 149*/
1, 2, 4, 7, 0, 3, 5, 6, /* 150*/
0, 1, 2, 4, 7, 3, 5, 6, /* 151*/
3, 4, 7, 0, 1, 2, 5, 6, /* 152*/
0, 3, 4, 7, 1, 2, 5, 6, /* 153*/
1, 3, 4, 7, 0, 2, 5, 6, /* 154*/
0, 1, 3, 4, 7, 2, 5, 6, /* 155*/
2, 3, 4, 7, 0, 1, 5, 6, /* 156*/
0, 2, 3, 4, 7, 1, 5, 6, /* 157*/
1, 2, 3, 4, 7, 0, 5, 6, /* 158*/
0, 1, 2, 3, 4, 7, 5, 6, /* 159*/
5, 7, 0, 1, 2, 3, 4, 6, /* 160*/
0, 5, 7, 1, 2, 3, 4, 6, /* 161*/
1, 5, 7, 0, 2, 3, 4, 6, /* 162*/
0, 1, 5, 7, 2, 3, 4, 6, /* 163*/
2, 5, 7, 0, 1, 3, 4, 6, /* 164*/
0, 2, 5, 7, 1, 3, 4, 6, /* 165*/
1, 2, 5, 7, 0, 3, 4, 6, /* 166*/
0, 1, 2, 5, 7, 3, 4, 6, /* 167*/
3, 5, 7, 0, 1, 2, 4, 6, /* 168*/
0, 3, 5, 7, 1, 2, 4, 6, /* 169*/
1, 3, 5, 7, 0, 2, 4, 6, /* 170*/
0, 1, 3, 5, 7, 2, 4, 6, /* 171*/
2, 3, 5, 7, 0, 1, 4, 6, /* 172*/
0, 2, 3, 5, 7, 1, 4, 6, /* 173*/
1, 2, 3, 5, 7, 0, 4, 6, /* 174*/
0, 1, 2, 3, 5, 7, 4, 6, /* 175*/
4, 5, 7, 0, 1, 2, 3, 6, /* 176*/
0, 4, 5, 7, 1, 2, 3, 6, /* 177*/
1, 4, 5, 7, 0, 2, 3, 6, /* 178*/
0, 1, 4, 5, 7, 2, 3, 6, /* 179*/
2, 4, 5, 7, 0, 1, 3, 6, /* 180*/
0, 2, 4, 5, 7, 1, 3, 6, /* 181*/
1, 2, 4, 5, 7, 0, 3, 6, /* 182*/
0, 1, 2, 4, 5, 7, 3, 6, /* 183*/
3, 4, 5, 7, 0, 1, 2, 6, /* 184*/
0, 3, 4, 5, 7, 1, 2, 6, /* 185*/
1, 3, 4, 5, 7, 0, 2, 6, /* 186*/
0, 1, 3, 4, 5, 7, 2, 6, /* 187*/
2, 3, 4, 5, 7, 0, 1, 6, /* 188*/
0, 2, 3, 4, 5, 7, 1, 6, /* 189*/
1, 2, 3, 4, 5, 7, 0, 6, /* 190*/
0, 1, 2, 3, 4, 5, 7, 6, /* 191*/
6, 7, 0, 1, 2, 3, 4, 5, /* 192*/
0, 6, 7, 1, 2, 3, 4, 5, /* 193*/
1, 6, 7, 0, 2, 3, 4, 5, /* 194*/
0, 1, 6, 7, 2, 3, 4, 5, /* 195*/
2, 6, 7, 0, 1, 3, 4, 5, /* 196*/
0, 2, 6, 7, 1, 3, 4, 5, /* 197*/
1, 2, 6, 7, 0, 3, 4, 5, /* 198*/
0, 1, 2, 6, 7, 3, 4, 5, /* 199*/
3, 6, 7, 0, 1, 2, 4, 5, /* 200*/
0, 3, 6, 7, 1, 2, 4, 5, /* 201*/
1, 3, 6, 7, 0, 2, 4, 5, /* 202*/
0, 1, 3, 6, 7, 2, 4, 5, /* 203*/
2, 3, 6, 7, 0, 1, 4, 5, /* 204*/
0, 2, 3, 6, 7, 1, 4, 5, /* 205*/
1, 2, 3, 6, 7, 0, 4, 5, /* 206*/
0, 1, 2, 3, 6, 7, 4, 5, /* 207*/
4, 6, 7, 0, 1, 2, 3, 5, /* 208*/
0, 4, 6, 7, 1, 2, 3, 5, /* 209*/
1, 4, 6, 7, 0, 2, 3, 5, /* 210*/
0, 1, 4, 6, 7, 2, 3, 5, /* 211*/
2, 4, 6, 7, 0, 1, 3, 5, /* 212*/
0, 2, 4, 6, 7, 1, 3, 5, /* 213*/
1, 2, 4, 6, 7, 0, 3, 5, /* 214*/
0, 1, 2, 4, 6, 7, 3, 5, /* 215*/
3, 4, 6, 7, 0, 1, 2, 5, /* 216*/
0, 3, 4, 6, 7, 1, 2, 5, /* 217*/
1, 3, 4, 6, 7, 0, 2, 5, /* 218*/
0, 1, 3, 4, 6, 7, 2, 5, /* 219*/
2, 3, 4, 6, 7, 0, 1, 5, /* 220*/
0, 2, 3, 4, 6, 7, 1, 5, /* 221*/
1, 2, 3, 4, 6, 7, 0, 5, /* 222*/
0, 1, 2, 3, 4, 6, 7, 5, /* 223*/
5, 6, 7, 0, 1, 2, 3, 4, /* 224*/
0, 5, 6, 7, 1, 2, 3, 4, /* 225*/
1, 5, 6, 7, 0, 2, 3, 4, /* 226*/
0, 1, 5, 6, 7, 2, 3, 4, /* 227*/
2, 5, 6, 7, 0, 1, 3, 4, /* 228*/
0, 2, 5, 6, 7, 1, 3, 4, /* 229*/
1, 2, 5, 6, 7, 0, 3, 4, /* 230*/
0, 1, 2, 5, 6, 7, 3, 4, /* 231*/
3, 5, 6, 7, 0, 1, 2, 4, /* 232*/
0, 3, 5, 6, 7, 1, 2, 4, /* 233*/
1, 3, 5, 6, 7, 0, 2, 4, /* 234*/
0, 1, 3, 5, 6, 7, 2, 4, /* 235*/
2, 3, 5, 6, 7, 0, 1, 4, /* 236*/
0, 2, 3, 5, 6, 7, 1, 4, /* 237*/
1, 2, 3, 5, 6, 7, 0, 4, /* 238*/
0, 1, 2, 3, 5, 6, 7, 4, /* 239*/
4, 5, 6, 7, 0, 1, 2, 3, /* 240*/
0, 4, 5, 6, 7, 1, 2, 3, /* 241*/
1, 4, 5, 6, 7, 0, 2, 3, /* 242*/
0, 1, 4, 5, 6, 7, 2, 3, /* 243*/
2, 4, 5, 6, 7, 0, 1, 3, /* 244*/
0, 2, 4, 5, 6, 7, 1, 3, /* 245*/
1, 2, 4, 5, 6, 7, 0, 3, /* 246*/
0, 1, 2, 4, 5, 6, 7, 3, /* 247*/
3, 4, 5, 6, 7, 0, 1, 2, /* 248*/
0, 3, 4, 5, 6, 7, 1, 2, /* 249*/
1, 3, 4, 5, 6, 7, 0, 2, /* 250*/
0, 1, 3, 4, 5, 6, 7, 2, /* 251*/
2, 3, 4, 5, 6, 7, 0, 1, /* 252*/
0, 2, 3, 4, 5, 6, 7, 1, /* 253*/
1, 2, 3, 4, 5, 6, 7, 0, /* 254*/
0, 1, 2, 3, 4, 5, 6, 7, /* 255*/
};
static inline __m256i _mm256_cmplt_epi32(__m256i a, __m256i b) {
return _mm256_cmpgt_epi32(b, a);
}
static void avx_natenodutch_partition_epi32(int32_t *array, const int32_t pivot,
int &left, int &right) {
const __m256i P = _mm256_set1_epi32(pivot);
const int valuespervector = sizeof(__m256i) / sizeof(int32_t);
if (right - left + 1 <
4 * valuespervector) { // not enough space for nate's algo to make sense, falling back
scalar_partition(array, pivot, left, right);
return;
}
int readleft = left + valuespervector;
int readright = right - valuespervector;
int32_t buffer[2 * valuespervector]; // tmp buffer
memcpy(buffer, array + left, valuespervector * sizeof(int32_t));
memcpy(buffer + valuespervector, array + right - valuespervector + 1,
valuespervector * sizeof(int32_t));
while (readright - readleft >= valuespervector) {
__m256i *nextVectorPtr;
if ((readleft - left) > (right - readright)) {
nextVectorPtr = (__m256i *)(array + readright - valuespervector + 1);
readright -= valuespervector;
} else {
nextVectorPtr = (__m256i *)(array + readleft);
readleft += valuespervector;
}
__m256i currentVector = _mm256_loadu_si256(nextVectorPtr);
/******
* we need two comparisons if we are to avoid the Dutch national flag
* problem,
* Reference: https://en.wikipedia.org/wiki/Dutch_national_flag_problem
*/
int greaterthanpivot =
_mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(currentVector, P));
int lesserthanpivot =
_mm256_movemask_ps((__m256)_mm256_cmplt_epi32(currentVector, P));
__m256i greaterthanpivot_permvector = _mm256_load_si256(
(__m256i *)(reverseshufflemask + 8 * greaterthanpivot));
__m256i lesserthanpivot_permvector =
_mm256_load_si256((__m256i *)(shufflemask + 8 * lesserthanpivot));
int count_greaterthanpivot = _mm_popcnt_u32(greaterthanpivot);
int count_lesserthanpivot = _mm_popcnt_u32(lesserthanpivot);
__m256i greaterthanpivot_vector =
_mm256_permutevar8x32_epi32(currentVector, greaterthanpivot_permvector);
__m256i lesserthanpivot_vector =
_mm256_permutevar8x32_epi32(currentVector, lesserthanpivot_permvector);
_mm256_storeu_si256((__m256i *)(array + left), lesserthanpivot_vector);
_mm256_storeu_si256((__m256i *)(array + right - valuespervector + 1),
greaterthanpivot_vector);
left += count_lesserthanpivot;
right -= count_greaterthanpivot;
}
int howmanyleft = readleft - left;
if (howmanyleft > 2 * valuespervector) {
howmanyleft = 2 * valuespervector;
for (int k = howmanyleft; k < readleft - left; ++k)
array[left + k] = pivot;
}
memcpy(array + left, buffer, howmanyleft * sizeof(int32_t));
int howmanyright = right - readright;
if (howmanyright + howmanyleft > 2 * valuespervector) {
howmanyright = 2 * valuespervector - howmanyleft;
for (int k = howmanyright; k < right - readright; ++k)
array[readright + 1 + k] = pivot;
}
memcpy(array + readright + 1, buffer + howmanyleft,
howmanyright * sizeof(int32_t));
scalar_partition(array, pivot, left, right);
}
// it is really signed int sorting, but the API seems to expect uint32_t...
void avx_natenodutch_quicksort(uint32_t *array, int left, int right) {
int i = left;
int j = right;
const int32_t pivot = array[(i + j) / 2];
avx_natenodutch_partition_epi32((int32_t *) array, pivot, i, j);
if (left < j) {
avx_natenodutch_quicksort(array, left, j);
}
if (i < right) {
avx_natenodutch_quicksort(array, i, right);
}
}

View File

@ -0,0 +1,197 @@
#include <x86intrin.h>
#include <cstdint>
namespace qs {
namespace avx2 {
__m256i FORCE_INLINE bitmask_to_bytemask_epi32(uint8_t bm) {
const __m256i mask = _mm256_set1_epi32(bm);
const __m256i bits = _mm256_setr_epi32(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
const __m256i tmp = _mm256_and_si256(mask, bits);
return _mm256_cmpeq_epi32(tmp, bits);
}
void FORCE_INLINE align_masks(uint8_t& a, uint8_t& b, uint8_t& rem_a, uint8_t& rem_b, __m256i& shuffle_a, __m256i& shuffle_b) {
assert(a != 0);
assert(b != 0);
uint8_t tmpA = a;
uint8_t tmpB = b;
uint32_t __attribute__((__aligned__(32))) tmpshufa[8];
uint32_t __attribute__((__aligned__(32))) tmpshufb[8];
while (tmpA != 0 && tmpB != 0) {
int idx_a = __builtin_ctz(tmpA);
int idx_b = __builtin_ctz(tmpB);
tmpA = tmpA & (tmpA - 1);
tmpB = tmpB & (tmpB - 1);
tmpshufb[idx_a] = idx_b;
tmpshufa[idx_b] = idx_a;
}
a = a ^ tmpA;
b = b ^ tmpB;
assert(a != 0);
assert(b != 0);
assert(_mm_popcnt_u64(a) == _mm_popcnt_u64(b));
rem_a = tmpA;
rem_b = tmpB;
shuffle_a = _mm256_load_si256((__m256i*)tmpshufa);
shuffle_b = _mm256_load_si256((__m256i*)tmpshufb);
}
__m256i FORCE_INLINE merge(const __m256i mask, const __m256i a, const __m256i b) {
return _mm256_or_si256(
_mm256_and_si256(mask, a),
_mm256_andnot_si256(mask, b)
);
}
void FORCE_INLINE swap_epi32(
__m256i& a, __m256i& b,
uint8_t mask_a, const __m256i shuffle_a,
uint8_t mask_b, const __m256i shuffle_b) {
const __m256i to_swap_b = _mm256_permutevar8x32_epi32(a, shuffle_a);
const __m256i to_swap_a = _mm256_permutevar8x32_epi32(b, shuffle_b);
const __m256i ma = bitmask_to_bytemask_epi32(mask_a);
const __m256i mb = bitmask_to_bytemask_epi32(mask_b);
a = merge(ma, to_swap_a, a);
b = merge(mb, to_swap_b, b);
}
#define _mm256_iszero(vec) (_mm256_testz_si256(vec, vec) != 0)
void FORCE_INLINE partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) {
const int N = 8; // the number of items in a register (256/32)
__m256i L;
__m256i R;
uint8_t maskL = 0;
uint8_t maskR = 0;
const __m256i pivot = _mm256_set1_epi32(pv);
int origL = left;
int origR = right;
while (true) {
if (maskL == 0) {
while (true) {
if (right - (left + N) + 1 < 2*N) {
goto end;
}
L = _mm256_loadu_si256((__m256i*)(array + left));
const __m256i bytemask = _mm256_cmpgt_epi32(pivot, L);
if (_mm256_testc_ps((__m256)bytemask, (__m256)_mm256_set1_epi32(-1))) {
left += N;
} else {
maskL = ~_mm256_movemask_ps((__m256)bytemask);
break;
}
}
}
if (maskR == 0) {
while (true) {
if ((right - N) - left + 1 < 2*N) {
goto end;
}
R = _mm256_loadu_si256((__m256i*)(array + right - N + 1));
const __m256i bytemask = _mm256_cmpgt_epi32(pivot, R);
if (_mm256_iszero(bytemask)) {
right -= N;
} else {
maskR = _mm256_movemask_ps((__m256)bytemask);
break;
}
}
}
assert(left <= right);
assert(maskL != 0);
assert(maskR != 0);
uint8_t mL;
uint8_t mR;
__m256i shuffleL;
__m256i shuffleR;
align_masks(maskL, maskR, mL, mR, shuffleL, shuffleR);
swap_epi32(L, R, maskL, shuffleL, maskR, shuffleR);
maskL = mL;
maskR = mR;
if (maskL == 0) {
_mm256_storeu_si256((__m256i*)(array + left), L);
left += N;
}
if (maskR == 0) {
_mm256_storeu_si256((__m256i*)(array + right - N + 1), R);
right -= N;
}
} // while
end:
assert(!(maskL != 0 && maskR != 0));
if (maskL != 0) {
_mm256_storeu_si256((__m256i*)(array + left), L);
} else if (maskR != 0) {
_mm256_storeu_si256((__m256i*)(array + right - N + 1), R);
}
if (left < right) {
int less = 0;
int greater = 0;
const int all = right - left + 1;
for (int i=left; i <= right; i++) {
less += int(array[i] < pv);
greater += int(array[i] > pv);
}
if (all == less) {
// all elements in range [left, right] less than pivot
scalar_partition_epi32(array, pv, origL, left);
} else if (all == greater) {
// all elements in range [left, right] greater than pivot
scalar_partition_epi32(array, pv, left, origR);
} else {
scalar_partition_epi32(array, pv, left, right);
}
}
}
} // namespace avx2
} // namespace qs

View File

@ -0,0 +1,32 @@
#include "avx2-partition.cpp"
namespace qs {
namespace avx2 {
void quicksort(uint32_t* array, int left, int right) {
int i = left;
int j = right;
const uint32_t pivot = array[(i + j)/2];
const int AVX2_REGISTER_SIZE = 8; // in 32-bit words
if (j - i >= 2 * AVX2_REGISTER_SIZE) {
qs::avx2::partition_epi32(array, pivot, i, j);
} else {
scalar_partition_epi32(array, pivot, i, j);
}
if (left < j) {
quicksort(array, left, j);
}
if (i < right) {
quicksort(array, i, right);
}
}
} // namespace avx2
} // namespace qs

View File

@ -0,0 +1,90 @@
namespace qs {
namespace avx512 {
void memset_epi32(uint32_t* array, uint32_t w, size_t n) {
const int N = 16;
const __m512i word = _mm512_set1_epi32(w);
for (size_t i=0; i < n/N; i++) {
_mm512_storeu_si512(array + i*N, word);
}
for (size_t i=n/N * N; i < n; i++) {
array[i] = w;
}
}
void memcpy_epi32(uint32_t* dst, uint32_t* src, size_t n) {
const int N = 16;
for (size_t i=0; i < n/N; i++) {
_mm512_storeu_si512(dst + i*N, _mm512_loadu_si512(src + i*N));
}
for (size_t i=n/N * N; i < n; i++) {
dst[i] = src[i];
}
}
// parition array[0..n-1]
uint32_t FORCE_INLINE partition_auxbuffer_epi32(uint32_t* array, size_t n, uint32_t pv) {
const int N = 16;
const int AUX_COUNT = 1024; // 4kB
static uint32_t gt_buf[AUX_COUNT + N];
size_t lt_count = 0;
size_t gt_count = 0;
const __m512i pivot = _mm512_set1_epi32(pv);
// 1. copy greater and less values into separate buffers
for (size_t i=0; i < n / N; i++) {
const __m512i v = _mm512_loadu_si512(array + i*N);
const __mmask16 lt = _mm512_cmplt_epi32_mask(v, pivot);
const __mmask16 gt = _mm512_cmpgt_epi32_mask(v, pivot);
const __m512i less = _mm512_maskz_compress_epi32(lt, v);
const __m512i greater = _mm512_maskz_compress_epi32(gt, v);
_mm512_storeu_si512(array + lt_count, less);
_mm512_storeu_si512(gt_buf + gt_count, greater);
lt_count += _mm_popcnt_u32(lt);
gt_count += _mm_popcnt_u32(gt);
}
for (size_t i=0; i < n % N; i++) {
const uint32_t v = array[(n/N) * N + i];
if (v < pv) {
array[lt_count++] = v;
} else if (v > pv) {
gt_buf[gt_count++] = v;
}
}
const size_t eq_count = n - (lt_count + gt_count);
// 2. replace array with partially ordered data
// 2.a. pivots
memset_epi32(array + lt_count, pv, eq_count);
// 2.b. all values greater than pivot
memcpy_epi32(array + lt_count + eq_count, gt_buf, gt_count);
// 3. index before the first pivot
return lt_count;
}
} // namespace avx512
} // namespace qa

View File

@ -0,0 +1,105 @@
namespace qs {
namespace avx512 {
void FORCE_INLINE bmi2_align_masks(__mmask16& a, __mmask16& b, __mmask16& rem_a, __mmask16& rem_b) {
uint32_t tmpA = _pext_u32(0xffffffff, a);
uint32_t tmpB = _pext_u32(0xffffffff, b);
uint32_t tmp = tmpA & tmpB;
__mmask16 new_a = _pdep_u32(tmp, a);
__mmask16 new_b = _pdep_u32(tmp, a);
rem_a = a ^ new_a;
rem_b = b ^ new_b;
a = new_a;
b = new_b;
assert(a != 0);
assert(b != 0);
assert(_mm_popcnt_u64(a) == _mm_popcnt_u64(b));
}
void FORCE_INLINE bmi2_partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) {
const int N = 16;
__m512i L;
__m512i R;
__mmask16 maskL = 0;
__mmask16 maskR = 0;
const __m512i pivot = _mm512_set1_epi32(pv);
while (right - left + 1 >= 2*N) {
while (maskL == 0) {
if (right - (left + N) + 1 < 2*N) {
goto end;
}
L = _mm512_loadu_si512(array + left);
maskL = _mm512_cmpge_epi32_mask(L, pivot);
if (maskL == 0) {
left += N;
}
}
while (maskR == 0) {
if ((right - N) - left + 1 < 2*N) {
goto end;
}
R = _mm512_loadu_si512(array + right - N + 1);
maskR = _mm512_cmple_epi32_mask(R, pivot);
if (maskR == 0) {
right -= N;
}
}
assert(left <= right);
assert(maskL != 0);
assert(maskR != 0);
__mmask16 mL;
__mmask16 mR;
bmi2_align_masks(maskL, maskR, mL, mR);
swap_epi32(L, R, maskL, maskR);
maskL = mL;
maskR = mR;
if (maskL == 0) {
_mm512_storeu_si512(array + left, L);
left += N;
}
if (maskR == 0) {
_mm512_storeu_si512(array + right - N + 1, R);
right -= N;
}
} // while
end:
if (maskL != 0) {
_mm512_storeu_si512(array + left, L);
}
if (maskR != 0) {
_mm512_storeu_si512(array + right - N + 1, R);
}
if (left < right) {
partition_epi32(array, pv, left, right);
}
}
} // namespace avx512
} // namespace qs

View File

@ -0,0 +1,58 @@
namespace qs {
namespace avx512 {
__mmask16 FORCE_INLINE get_range_mask(uint32_t position, uint32_t length) {
return (uint32_t(1) << (position + length)) - (uint32_t(1) << position);
}
/*
r stores 1 to 16 elements to partition, mask select leading element
*/
void partition_register(__m512i& r, const __m512i pivot, __mmask16 mask, int& left, int& right) {
// example:
// r = [ 1, 2, 3, 100, 200, 300, 4, 5, 50, 900, -1, -2, -3, -4, -5, -6]
// mask = 0x03ff -- first 10 elements are subject to parition
// pivot = packed_dword(50)
const __mmask16 less_mask = _mm512_mask_cmplt_epi32_mask(mask, r, pivot);
const __mmask16 equal_mask = _mm512_mask_cmpeq_epi32_mask(mask, r, pivot);
// less = [1, 2, 3, 4, 5, ... rest are **pivots**]
const __m512i less_equal = _mm512_mask_compress_epi32(pivot, less_mask, r);
// greater = [100, 200, 300, 900, ... rest are zeros]
const __m512i greater = _mm512_maskz_compress_epi32(~(less_mask | equal_mask) & mask, r);
// less_cnt = 5
// less_equal_cnt = 6
const int less_cnt = _mm_popcnt_u32(less_mask);
const int less_equal_cnt = _mm_popcnt_u32(less_mask | equal_mask);
const __mmask16 store_less_equal = get_range_mask(0, less_equal_cnt);
const __mmask16 store_greater = ~store_less_equal & mask;
// merge less or eqaul with input
//
// r = [ 1, 2, 3, 4, 5, 50, 4, 5, 50, 900, -1, -2, -3, -4, -5, -6]
// ^ ^ ^ ^ ^ ^^
// merged
r = _mm512_mask_mov_epi32(r, store_less_equal, less_equal);
// merge greater than pivot
//
// r = [ 1, 2, 3, 4, 5, 50, 100, 200, 300, 900, -1, -2, -3, -4, -5, -6]
// ^^^ ^^^ ^^^ ^^^
// merged
r = _mm512_mask_expand_epi32(r, store_greater, greater);
right = left + less_cnt;
left = left + less_equal_cnt;
}
} // namespace avx512
} // namespace qs

View File

@ -0,0 +1,129 @@
//#define USE_AVX512_PIVOT
namespace qs {
namespace avx512 {
void FORCE_INLINE align_masks(__mmask16& a, __mmask16& b, __mmask16& rem_a, __mmask16& rem_b) {
assert(a != 0);
assert(b != 0);
uint16_t tmpA = a;
uint16_t tmpB = b;
while (tmpA != 0 && tmpB != 0) {
tmpA = tmpA & (tmpA - 1);
tmpB = tmpB & (tmpB - 1);
}
a = a ^ tmpA;
b = b ^ tmpB;
assert(a != 0);
assert(b != 0);
assert(_mm_popcnt_u64(a) == _mm_popcnt_u64(b));
rem_a = tmpA;
rem_b = tmpB;
}
void FORCE_INLINE partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) {
const int N = 16;
__m512i L;
__m512i R;
__mmask16 maskL = 0;
__mmask16 maskR = 0;
const __m512i pivot = _mm512_set1_epi32(pv);
int count = (right - left + 1)/N;
while (true) {
while (maskL == 0) {
if (count <= 1) {
goto end;
}
L = _mm512_loadu_si512(array + left);
maskL = _mm512_cmpge_epi32_mask(L, pivot);
if (maskL == 0) {
left += N;
count -= 1;
}
}
while (maskR == 0) {
if (count <= 1) {
goto end;
}
R = _mm512_loadu_si512(array + right - N + 1);
maskR = _mm512_cmple_epi32_mask(R, pivot);
if (maskR == 0) {
right -= N;
count -= 1;
}
}
assert(left <= right);
assert(maskL != 0);
assert(maskR != 0);
__mmask16 mL;
__mmask16 mR;
align_masks(maskL, maskR, mL, mR);
swap_epi32(L, R, maskL, maskR);
maskL = mL;
maskR = mR;
if (maskL == 0) {
_mm512_storeu_si512(array + left, L);
left += N;
count -= 1;
}
if (maskR == 0) {
_mm512_storeu_si512(array + right - N + 1, R);
right -= N;
count -= 1;
}
} // while
end:
assert(!(maskL != 0 && maskR != 0));
if (maskL != 0) {
_mm512_storeu_si512(array + left, L);
} else if (maskR != 0) {
_mm512_storeu_si512(array + right - N + 1, R);
}
if (left < right) {
#if USE_AVX512_PIVOT
scalar_partition_epi32(array, pv, left, right);
#else
const int size = right - left + 1;
if (size > N) {
scalar_partition_epi32(array, pv, left, right);
} else {
const int k = left;
__mmask16 mask = (uint32_t(1) << size) - 1;
__m512i v = _mm512_loadu_si512(array + k);
partition_register(v, pivot, mask, left, right);
_mm512_storeu_si512(array + k, v);
}
#endif
}
}
} // namespace avx512
} // namespace qa

View File

@ -0,0 +1,119 @@
namespace qs {
namespace avx512 {
void FORCE_INLINE popcnt_partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) {
const int N = 16;
__m512i L;
__m512i R;
__mmask16 maskL = 0;
__mmask16 maskR = 0;
int popcntL;
int popcntR;
const __m512i pivot = _mm512_set1_epi32(pv);
while (true) {
while (maskL == 0) {
if (right - (left + N) + 1 < N) {
goto end;
}
L = _mm512_loadu_si512(array + left);
maskL = _mm512_cmpge_epi32_mask(L, pivot);
if (maskL == 0) {
left += N;
}
popcntL = _mm_popcnt_u32(maskL);
}
while (maskR == 0) {
if ((right - N) - left + 1 < N) {
goto end;
}
R = _mm512_loadu_si512(array + right - N + 1);
maskR = _mm512_cmple_epi32_mask(R, pivot);
if (maskR == 0) {
right -= N;
}
popcntR = _mm_popcnt_u32(maskR);
}
assert(left <= right);
assert(maskL != 0);
assert(maskR != 0);
if (popcntL == popcntR) {
// fast path
swap_epi32(L, R, maskL, maskR);
maskL = maskR = 0;
_mm512_storeu_si512(array + left, L);
left += N;
_mm512_storeu_si512(array + right - N + 1, R);
right -= N;
} else {
if (popcntL < popcntR) {
int n = popcntR - popcntL;
int k = n;
__mmask16 tmp = maskR;
while (k--) {
maskR = maskR & (maskR - 1);
}
swap_epi32(L, R, maskL, maskR);
maskR = tmp ^ maskR;
popcntR = n;
_mm512_storeu_si512(array + left, L);
left += N;
maskL = 0;
} else {
int n = popcntL - popcntR;
int k = n;
__mmask16 tmp = maskL;
while (k--) {
maskL = maskL & (maskL - 1);
}
swap_epi32(L, R, maskL, maskR);
maskL = tmp ^ maskL;
popcntL = n;
_mm512_storeu_si512(array + right - N + 1, R);
right -= N;
maskR = 0;
}
}
} // while
end:
assert(!(maskL != 0 && maskR != 0));
if (maskL != 0) {
_mm512_storeu_si512(array + left, L);
} else if (maskR != 0) {
_mm512_storeu_si512(array + right - N + 1, R);
}
if (left < right) {
scalar_partition_epi32(array, pv, left, right);
}
}
} // namespace avx512
} // namespace qs

View File

@ -0,0 +1,211 @@
#include "avx512-swap.cpp"
#include "avx512-partition-register.cpp"
#include "avx512-partition.cpp"
#include "avx512-popcnt-partition.cpp"
#include "avx512-bmi2-partition.cpp"
#include "avx512-auxbuffer-partition.cpp"
#include "avx512-sort-register.cpp"
namespace qs {
namespace avx512 {
void dump(const char* name, const __m512i v) {
uint32_t tmp[16];
printf("%-10s = [", name);
_mm512_storeu_si512((__m512i*)tmp, v);
for (int i=0; i < 16; i++) {
if (i > 0) {
putchar(' ');
}
printf("%08x", tmp[i]);
}
printf("]\n");
}
void quicksort(uint32_t* array, int left, int right) {
int i = left;
int j = right;
const int size = j - i + 1;
if (size <= 16) {
const __m512i in = _mm512_loadu_si512(array + i);
__m512i result;
switch (size) {
case 16: result = avx512_sort16_epi32(in); break;
case 15: result = avx512_sort15_epi32(in); break;
case 14: result = avx512_sort14_epi32(in); break;
case 13: result = avx512_sort13_epi32(in); break;
case 12: result = avx512_sort12_epi32(in); break;
case 11: result = avx512_sort11_epi32(in); break;
case 10: result = avx512_sort10_epi32(in); break;
case 9: result = avx512_sort9_epi32(in); break;
case 8: result = avx512_sort8_epi32(in); break;
case 7: result = avx512_sort7_epi32(in); break;
case 6: result = avx512_sort6_epi32(in); break;
case 5: result = avx512_sort5_epi32(in); break;
case 4: result = avx512_sort4_epi32(in); break;
case 3: result = avx512_sort3_epi32(in); break;
case 2: {
if (array[i] > array[j]) {
uint32_t t = array[i];
array[i] = array[j];
array[j] = t;
}
return;
}
case 1:
case 0:
return;
default:
assert(false);
break;
}
_mm512_storeu_si512(array + i, result);
return;
}
const uint32_t pivot = array[left];
const int AVX512_REGISTER_SIZE = 16; // in 32-bit words
if (size >= 2 * AVX512_REGISTER_SIZE) {
::qs::avx512::partition_epi32(array, pivot, i, j);
} else {
scalar_partition_epi32(array, pivot, i, j);
}
if (left < j) {
quicksort(array, left, j);
}
if (i < right) {
quicksort(array, i, right);
}
}
void popcnt_quicksort(uint32_t* array, int left, int right) {
int i = left;
int j = right;
const int size = j - i + 1;
if (size <= 16) {
const __m512i in = _mm512_loadu_si512(array + i);
__m512i result;
switch (size) {
case 16: result = avx512_sort16_epi32(in); break;
case 15: result = avx512_sort15_epi32(in); break;
case 14: result = avx512_sort14_epi32(in); break;
case 13: result = avx512_sort13_epi32(in); break;
case 12: result = avx512_sort12_epi32(in); break;
case 11: result = avx512_sort11_epi32(in); break;
case 10: result = avx512_sort10_epi32(in); break;
case 9: result = avx512_sort9_epi32(in); break;
case 8: result = avx512_sort8_epi32(in); break;
case 7: result = avx512_sort7_epi32(in); break;
case 6: result = avx512_sort6_epi32(in); break;
case 5: result = avx512_sort5_epi32(in); break;
case 4: result = avx512_sort4_epi32(in); break;
case 3: result = avx512_sort3_epi32(in); break;
case 2: {
if (array[i] > array[j]) {
uint32_t t = array[i];
array[i] = array[j];
array[j] = t;
}
return;
}
case 1:
case 0:
return;
default:
assert(false);
break;
}
_mm512_storeu_si512(array + i, result);
return;
}
const uint32_t pivot = array[(i + j)/2];
const int AVX512_REGISTER_SIZE = 16; // in 32-bit words
if (size >= 2 * AVX512_REGISTER_SIZE) {
::qs::avx512::popcnt_partition_epi32(array, pivot, i, j);
} else {
scalar_partition_epi32(array, pivot, i, j);
}
if (left < j) {
popcnt_quicksort(array, left, j);
}
if (i < right) {
popcnt_quicksort(array, i, right);
}
}
void bmi2_quicksort(uint32_t* array, int left, int right) {
int i = left;
int j = right;
const uint32_t pivot = array[left];
const int AVX512_REGISTER_SIZE = 16; // in 32-bit words
if (j - i >= 2 * AVX512_REGISTER_SIZE) {
::qs::avx512::bmi2_partition_epi32(array, pivot, i, j);
} else {
scalar_partition_epi32(array, pivot, i, j);
}
if (left < j) {
bmi2_quicksort(array, left, j);
}
if (i < right) {
bmi2_quicksort(array, i, right);
}
}
void auxbuffer_quicksort(uint32_t* array, int lo, int hi) {
if (lo >= hi) {
return;
}
int p;
#if 0
p = lomuto_partition_epi32(array, lo, hi);
#else
const auto n = (hi - lo + 1);
if (n > 1024) {
p = lomuto_partition_epi32(array, lo, hi);
} else if (n > 8) {
const uint32_t pivot = array[(hi + lo)/2];
p = ::qs::avx512::partition_auxbuffer_epi32(array + lo, n, pivot) + lo;
} else {
p = lomuto_partition_epi32(array, lo, hi);
}
#endif
auxbuffer_quicksort(array, lo, p-1);
auxbuffer_quicksort(array, p+1, hi);
}
} // namespace avx512
} // namespace qs

View File

@ -0,0 +1,323 @@
//#define POPCNT_LOOKUP
#ifdef POPCNT_LOOKUP
uint16_t lookup[65536];
// The lookup table replaces 2 x popcount and 2 x shift with 1 bit-or and 2 x memory fetch
void prepare_lookup() {
for (int i=0; i < 65536; i++) {
lookup[i] = 1 << _mm_popcnt_u32(i);
}
}
#define STEP(unused) { \
const __m512i b = _mm512_permutexvar_epi32(index, v); \
index = _mm512_add_epi32(index, incr); \
const uint16_t lt = _mm512_mask_cmplt_epi32_mask(sort_mask, v, b); \
const uint16_t eq = _mm512_mask_cmpeq_epi32_mask(sort_mask, v, b); \
const uint16_t mask = lookup[lt | eq] - lookup[lt]; \
result = _mm512_mask_mov_epi32(result, mask, b); \
}
#else
#define STEP(unused) { \
const __m512i b = _mm512_permutexvar_epi32(index, v); \
index = _mm512_add_epi32(index, incr); \
const uint16_t lt = _mm_popcnt_u32(_mm512_mask_cmplt_epi32_mask(sort_mask, v, b)); \
const uint16_t eq = _mm_popcnt_u32(_mm512_mask_cmpeq_epi32_mask(sort_mask, v, b)); \
const uint16_t mask = (uint32_t(1) << (lt + eq)) - (uint32_t(1) << lt); \
result = _mm512_mask_mov_epi32(result, mask, b); \
}
#endif
__m512i avx512_sort16_epi32(const __m512i v) {
__m512i result = v;
__m512i index = _mm512_setzero_si512();
__m512i incr = _mm512_set1_epi32(1);
const uint16_t sort_mask = 0xffff;
STEP(0);
STEP(1);
STEP(2);
STEP(3);
STEP(4);
STEP(5);
STEP(6);
STEP(7);
STEP(8);
STEP(9);
STEP(10);
STEP(11);
STEP(12);
STEP(13);
STEP(14);
STEP(15);
return result;
}
__m512i avx512_sort15_epi32(const __m512i v) {
__m512i result = v;
__m512i index = _mm512_setzero_si512();
__m512i incr = _mm512_set1_epi32(1);
const uint16_t sort_mask = 0x7fff;
STEP(0);
STEP(1);
STEP(2);
STEP(3);
STEP(4);
STEP(5);
STEP(6);
STEP(7);
STEP(8);
STEP(9);
STEP(10);
STEP(11);
STEP(12);
STEP(13);
STEP(14);
return result;
}
__m512i avx512_sort14_epi32(const __m512i v) {
__m512i result = v;
__m512i index = _mm512_setzero_si512();
__m512i incr = _mm512_set1_epi32(1);
const uint16_t sort_mask = 0x3fff;
STEP(0);
STEP(1);
STEP(2);
STEP(3);
STEP(4);
STEP(5);
STEP(6);
STEP(7);
STEP(8);
STEP(9);
STEP(10);
STEP(11);
STEP(12);
STEP(13);
return result;
}
__m512i avx512_sort13_epi32(const __m512i v) {
__m512i result = v;
__m512i index = _mm512_setzero_si512();
__m512i incr = _mm512_set1_epi32(1);
const uint16_t sort_mask = 0x1fff;
STEP(0);
STEP(1);
STEP(2);
STEP(3);
STEP(4);
STEP(5);
STEP(6);
STEP(7);
STEP(8);
STEP(9);
STEP(10);
STEP(11);
STEP(12);
return result;
}
__m512i avx512_sort12_epi32(const __m512i v) {
__m512i result = v;
__m512i index = _mm512_setzero_si512();
__m512i incr = _mm512_set1_epi32(1);
const uint16_t sort_mask = 0x0fff;
STEP(0);
STEP(1);
STEP(2);
STEP(3);
STEP(4);
STEP(5);
STEP(6);
STEP(7);
STEP(8);
STEP(9);
STEP(10);
STEP(11);
return result;
}
__m512i avx512_sort11_epi32(const __m512i v) {
__m512i result = v;
__m512i index = _mm512_setzero_si512();
__m512i incr = _mm512_set1_epi32(1);
const uint16_t sort_mask = 0x07ff;
STEP(0);
STEP(1);
STEP(2);
STEP(3);
STEP(4);
STEP(5);
STEP(6);
STEP(7);
STEP(8);
STEP(9);
STEP(10);
return result;
}
__m512i avx512_sort10_epi32(const __m512i v) {
__m512i result = v;
__m512i index = _mm512_setzero_si512();
__m512i incr = _mm512_set1_epi32(1);
const uint16_t sort_mask = 0x03ff;
STEP(0);
STEP(1);
STEP(2);
STEP(3);
STEP(4);
STEP(5);
STEP(6);
STEP(7);
STEP(8);
STEP(9);
return result;
}
__m512i avx512_sort9_epi32(const __m512i v) {
__m512i result = v;
__m512i index = _mm512_setzero_si512();
__m512i incr = _mm512_set1_epi32(1);
const uint16_t sort_mask = 0x01ff;
STEP(0);
STEP(1);
STEP(2);
STEP(3);
STEP(4);
STEP(5);
STEP(6);
STEP(7);
STEP(8);
return result;
}
__m512i avx512_sort8_epi32(const __m512i v) {
__m512i result = v;
__m512i index = _mm512_setzero_si512();
__m512i incr = _mm512_set1_epi32(1);
const uint16_t sort_mask = 0x00ff;
STEP(0);
STEP(1);
STEP(2);
STEP(3);
STEP(4);
STEP(5);
STEP(6);
STEP(7);
return result;
}
__m512i avx512_sort7_epi32(const __m512i v) {
__m512i result = v;
__m512i index = _mm512_setzero_si512();
__m512i incr = _mm512_set1_epi32(1);
const uint16_t sort_mask = 0x007f;
STEP(0);
STEP(1);
STEP(2);
STEP(3);
STEP(4);
STEP(5);
STEP(6);
return result;
}
__m512i avx512_sort6_epi32(const __m512i v) {
__m512i result = v;
__m512i index = _mm512_setzero_si512();
__m512i incr = _mm512_set1_epi32(1);
const uint16_t sort_mask = 0x003f;
STEP(0);
STEP(1);
STEP(2);
STEP(3);
STEP(4);
STEP(5);
return result;
}
__m512i avx512_sort5_epi32(const __m512i v) {
__m512i result = v;
__m512i index = _mm512_setzero_si512();
__m512i incr = _mm512_set1_epi32(1);
const uint16_t sort_mask = 0x001f;
STEP(0);
STEP(1);
STEP(2);
STEP(3);
STEP(4);
return result;
}
__m512i avx512_sort4_epi32(const __m512i v) {
__m512i result = v;
__m512i index = _mm512_setzero_si512();
__m512i incr = _mm512_set1_epi32(1);
const uint16_t sort_mask = 0x000f;
STEP(0);
STEP(1);
STEP(2);
STEP(3);
return result;
}
__m512i avx512_sort3_epi32(const __m512i v) {
__m512i result = v;
__m512i index = _mm512_setzero_si512();
__m512i incr = _mm512_set1_epi32(1);
const uint16_t sort_mask = 0x0007;
STEP(0);
STEP(1);
STEP(2);
return result;
}
#undef STEP

20
simd-sort/avx512-swap.cpp Normal file
View File

@ -0,0 +1,20 @@
namespace qs {
namespace avx512 {
void swap_epi32(__m512i& a, __m512i& b, __mmask16 mask_a, __mmask16 mask_b) {
assert(mask_a != 0);
assert(mask_b != 0);
assert(_mm_popcnt_u64(mask_a) == _mm_popcnt_u64(mask_b));
const __m512i to_swap_b = _mm512_mask_compress_epi32(_mm512_setzero_si512(), mask_a, a);
const __m512i to_swap_a = _mm512_mask_compress_epi32(_mm512_setzero_si512(), mask_b, b);
a = _mm512_mask_expand_epi32(a, mask_a, to_swap_a);
b = _mm512_mask_expand_epi32(b, mask_b, to_swap_b);
}
} // namespace avx512
} // namespace qa

28
simd-sort/cmdline.cpp Normal file
View File

@ -0,0 +1,28 @@
#include <cstring>
#include <string>
#include <unordered_set>
class CommandLine final {
int argc;
char** argv;
std::unordered_set<std::string> arguments;
public:
CommandLine(int argc, char* argv[])
: argc(argc)
, argv(argv) {
for (int i=1; i < argc; i++) {
arguments.insert(argv[i]);
}
}
bool has(const std::string& name) const {
return arguments.count(name);
}
bool empty() const {
return arguments.empty();
}
};

3
simd-sort/common.h Normal file
View File

@ -0,0 +1,3 @@
#pragma once
#define FORCE_INLINE inline __attribute__((always_inline))

45
simd-sort/gather_results.sh Executable file
View File

@ -0,0 +1,45 @@
#!/bin/bash
if [[ $1 == "avx2" ]]
then
SPEED=./speed_avx2
if [[ ! -x $SPEED ]]
then
echo "AVX2 test program not found. Run make speed_avx2 to build it."
exit 1
fi
else
SPEED=./speed
if [[ ! -x $SPEED ]]
then
echo "AVX512 test program not found. Run make speed to build it."
exit 1
fi
fi
$SPEED 10000 3 asc | tee -a results.txt
$SPEED 100000 3 asc | tee -a results.txt
$SPEED 1000000 3 asc | tee -a results.txt
$SPEED 2000000 3 asc | tee -a results.txt
$SPEED 5000000 3 asc | tee -a results.txt
$SPEED 10000000 3 asc | tee -a results.txt
$SPEED 20000000 3 asc | tee -a results.txt
$SPEED 10000 3 dsc | tee -a results.txt
$SPEED 100000 3 dsc | tee -a results.txt
$SPEED 1000000 3 dsc | tee -a results.txt
$SPEED 2000000 3 dsc | tee -a results.txt
$SPEED 5000000 3 dsc | tee -a results.txt
$SPEED 10000000 3 dsc | tee -a results.txt
$SPEED 20000000 3 dsc | tee -a results.txt
$SPEED 10000 3 rnd | tee -a results.txt
$SPEED 100000 3 rnd | tee -a results.txt
$SPEED 1000000 3 rnd | tee -a results.txt
$SPEED 2000000 3 rnd | tee -a results.txt
$SPEED 5000000 3 rnd | tee -a results.txt
$SPEED 10000000 3 rnd | tee -a results.txt
$SPEED 20000000 3 rnd | tee -a results.txt
echo "results.txt created"

8
simd-sort/gettime.cpp Normal file
View File

@ -0,0 +1,8 @@
#include <time.h>
#include <sys/time.h>
uint32_t get_time() {
struct timeval T;
gettimeofday(&T, NULL);
return (T.tv_sec * 1000000) + T.tv_usec;
}

106
simd-sort/input_data.cpp Normal file
View File

@ -0,0 +1,106 @@
class InputData {
protected:
uint32_t* array;
size_t n;
public:
InputData(size_t count)
: n(count) {
assert(n > 0);
array = new uint32_t[n];
}
~InputData() {
delete[] array;
}
public:
uint32_t* pointer() {
return array;
}
size_t count() const {
return n;
}
size_t size() const {
return n * sizeof(uint32_t);
}
};
class InputAscending: public InputData {
using super = InputData;
public:
InputAscending(size_t count) : super(count) {
for (size_t i=0; i < n; i++) {
array[i] = i;
}
}
};
class InputDescending: public InputData {
using super = InputData;
public:
InputDescending(size_t count) : super(count) {
for (size_t i=0; i < n; i++) {
array[i] = n - i + 1;
}
}
};
class InputRandomFew: public InputData {
using super = InputData;
public:
InputRandomFew(size_t count) : super(count) {
for (size_t i=0; i < n; i++) {
array[i] = random() % 10;
}
}
};
class InputRandom: public InputData {
using super = InputData;
public:
InputRandom(size_t count) : super(count) {
for (size_t i=0; i < n; i++) {
array[i] = random();
}
}
};
class InputRandomUnique: public InputData {
using super = InputData;
public:
InputRandomUnique(size_t count) : super(count) {
for (size_t i=0; i < n; i++) {
array[i] = i;
}
shuffle();
}
private:
void shuffle() {
for (size_t i=0; i < n; i++) {
size_t j = rand() % (n - i);
const uint32_t t = array[i];
array[i] = array[j];
array[j] = t;
}
}
};

51
simd-sort/partition.cpp Normal file
View File

@ -0,0 +1,51 @@
void scalar_partition_epi32(uint32_t* array, const uint32_t pivot, int& left, int& right) {
while (left <= right) {
while (array[left] < pivot) {
left += 1;
}
while (array[right] > pivot) {
right -= 1;
}
if (left <= right) {
const uint32_t t = array[left];
array[left] = array[right];
array[right] = t;
left += 1;
right -= 1;
}
}
}
int lomuto_partition_epi32(uint32_t* array, int lo, int hi) {
const uint32_t pivot = array[(lo + hi)/2];
const uint32_t hi_value = array[hi];
array[(lo + hi)/2] = hi_value;
array[hi] = pivot;
int i = lo;
for (int j=lo; j < hi; j++) {
if (array[j] <= pivot) {
const uint32_t t = array[i];
array[i] = array[j];
array[j] = t;
i += 1;
}
}
{
const uint32_t t = array[i];
array[i] = array[hi];
array[hi] = t;
i += 1;
}
return i;
}

View File

@ -0,0 +1,9 @@
#include "common.h"
#include "quicksort.cpp"
#ifdef HAVE_AVX2_INSTRUCTIONS
# include "avx2-quicksort.cpp"
#endif
#ifdef HAVE_AVX512F_INSTRUCTIONS
# include "avx512-quicksort.cpp"
#endif

19
simd-sort/quicksort.cpp Normal file
View File

@ -0,0 +1,19 @@
#include "partition.cpp"
void quicksort(uint32_t* array, int left, int right) {
int i = left;
int j = right;
const uint32_t pivot = array[(i + j)/2];
scalar_partition_epi32(array, pivot, i, j);
if (left < j) {
quicksort(array, left, j);
}
if (i < right) {
quicksort(array, i, right);
}
}

52
simd-sort/rdtsc.cpp Normal file
View File

@ -0,0 +1,52 @@
#define RDTSC_START(cycles) \
do { \
uint32_t cyc_high, cyc_low; \
__asm volatile("cpuid\n" \
"rdtsc\n" \
"mov %%edx, %0\n" \
"mov %%eax, %1" : \
"=r" (cyc_high), \
"=r"(cyc_low) : \
: /* no read only */ \
"%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \
); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
#define RDTSC_STOP(cycles) \
do { \
uint32_t cyc_high, cyc_low; \
__asm volatile("rdtscp\n" \
"mov %%edx, %0\n" \
"mov %%eax, %1\n" \
"cpuid" : \
"=r"(cyc_high), \
"=r"(cyc_low) : \
/* no read only registers */ : \
"%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \
); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
static __attribute__ ((noinline))
uint64_t rdtsc_overhead_func(uint64_t dummy) {
return dummy;
}
uint64_t global_rdtsc_overhead = (uint64_t) UINT64_MAX;
#define RDTSC_SET_OVERHEAD(test, repeat) \
do { \
uint64_t cycles_start, cycles_final, cycles_diff; \
uint64_t min_diff = UINT64_MAX; \
for (int i = 0; i < repeat; i++) { \
__asm volatile("" ::: /* pretend to clobber */ "memory"); \
RDTSC_START(cycles_start); \
test; \
RDTSC_STOP(cycles_final); \
cycles_diff = (cycles_final - cycles_start); \
if (cycles_diff < min_diff) min_diff = cycles_diff; \
} \
global_rdtsc_overhead = min_diff; \
printf("rdtsc_overhead set to %d\n", (int)global_rdtsc_overhead); \
} while (0)

View File

@ -0,0 +1,84 @@
items count: 10000 (40000 bytes), input ascending
std::sort ... 0.0001 s
quick sort ... 0.0001 s (1.24)
AVX2 quick sort ... 0.0001 s (1.48)
items count: 100000 (400000 bytes), input ascending
std::sort ... 0.0010 s
quick sort ... 0.0009 s (1.17)
AVX2 quick sort ... 0.0007 s (1.48)
items count: 1000000 (4000000 bytes), input ascending
std::sort ... 0.0117 s
quick sort ... 0.0092 s (1.27)
AVX2 quick sort ... 0.0066 s (1.77)
items count: 2000000 (8000000 bytes), input ascending
std::sort ... 0.0244 s
quick sort ... 0.0192 s (1.28)
AVX2 quick sort ... 0.0137 s (1.79)
items count: 5000000 (20000000 bytes), input ascending
std::sort ... 0.0700 s
quick sort ... 0.0535 s (1.31)
AVX2 quick sort ... 0.0378 s (1.85)
items count: 10000000 (40000000 bytes), input ascending
std::sort ... 0.1462 s
quick sort ... 0.1108 s (1.32)
AVX2 quick sort ... 0.0782 s (1.87)
items count: 20000000 (80000000 bytes), input ascending
std::sort ... 0.3049 s
quick sort ... 0.2301 s (1.33)
AVX2 quick sort ... 0.1614 s (1.89)
items count: 10000 (40000 bytes), input descending
std::sort ... 0.0001 s
quick sort ... 0.0001 s (0.92)
AVX2 quick sort ... 0.0001 s (1.12)
items count: 100000 (400000 bytes), input descending
std::sort ... 0.0008 s
quick sort ... 0.0009 s (0.87)
AVX2 quick sort ... 0.0007 s (1.10)
items count: 1000000 (4000000 bytes), input descending
std::sort ... 0.0087 s
quick sort ... 0.0092 s (0.95)
AVX2 quick sort ... 0.0066 s (1.32)
items count: 2000000 (8000000 bytes), input descending
std::sort ... 0.0183 s
quick sort ... 0.0192 s (0.95)
AVX2 quick sort ... 0.0138 s (1.33)
items count: 5000000 (20000000 bytes), input descending
std::sort ... 0.0514 s
quick sort ... 0.0536 s (0.96)
AVX2 quick sort ... 0.0378 s (1.36)
items count: 10000000 (40000000 bytes), input descending
std::sort ... 0.1068 s
quick sort ... 0.1111 s (0.96)
AVX2 quick sort ... 0.0783 s (1.36)
items count: 20000000 (80000000 bytes), input descending
std::sort ... 0.2221 s
quick sort ... 0.2299 s (0.97)
AVX2 quick sort ... 0.1613 s (1.38)
items count: 10000 (40000 bytes), input random
std::sort ... 0.0001 s
quick sort ... 0.0001 s (1.25)
AVX2 quick sort ... 0.0001 s (1.53)
items count: 100000 (400000 bytes), input random
std::sort ... 0.0010 s
quick sort ... 0.0009 s (1.16)
AVX2 quick sort ... 0.0007 s (1.48)
items count: 1000000 (4000000 bytes), input random
std::sort ... 0.0118 s
quick sort ... 0.0091 s (1.29)
AVX2 quick sort ... 0.0066 s (1.78)
items count: 2000000 (8000000 bytes), input random
std::sort ... 0.0246 s
quick sort ... 0.0192 s (1.28)
AVX2 quick sort ... 0.0137 s (1.79)
items count: 5000000 (20000000 bytes), input random
std::sort ... 0.0703 s
quick sort ... 0.0537 s (1.31)
AVX2 quick sort ... 0.0378 s (1.86)
items count: 10000000 (40000000 bytes), input random
std::sort ... 0.1468 s
quick sort ... 0.1106 s (1.33)
AVX2 quick sort ... 0.0786 s (1.87)
items count: 20000000 (80000000 bytes), input random
std::sort ... 0.3058 s
quick sort ... 0.2304 s (1.33)
AVX2 quick sort ... 0.1628 s (1.88)

View File

@ -0,0 +1,140 @@
items count: 100000 (400000 bytes), input ascending
std::sort ... 0.0063 s
quick sort ... 0.0067 s (0.93)
AVX2 quick sort ... 0.0071 s (0.88)
AVX512 quick sort ... 0.0055 s (1.14)
AVX512 + popcnt quick sort ... 0.0056 s (1.13)
AVX512 + BMI2 quick sort ... 0.0058 s (1.08)
items count: 1000000 (4000000 bytes), input ascending
std::sort ... 0.0728 s
quick sort ... 0.0735 s (0.99)
AVX2 quick sort ... 0.0685 s (1.06)
AVX512 quick sort ... 0.0522 s (1.40)
AVX512 + popcnt quick sort ... 0.0528 s (1.38)
AVX512 + BMI2 quick sort ... 0.0554 s (1.31)
items count: 2000000 (8000000 bytes), input ascending
std::sort ... 0.1529 s
quick sort ... 0.1523 s (1.00)
AVX2 quick sort ... 0.1402 s (1.09)
AVX512 quick sort ... 0.1057 s (1.45)
AVX512 + popcnt quick sort ... 0.1063 s (1.44)
AVX512 + BMI2 quick sort ... 0.1105 s (1.38)
items count: 5000000 (20000000 bytes), input ascending
std::sort ... 0.4433 s
quick sort ... 0.3935 s (1.13)
AVX2 quick sort ... 0.3520 s (1.26)
AVX512 quick sort ... 0.2609 s (1.70)
AVX512 + popcnt quick sort ... 0.2609 s (1.70)
AVX512 + BMI2 quick sort ... 0.2668 s (1.66)
items count: 10000000 (40000000 bytes), input ascending
std::sort ... 0.9257 s
quick sort ... 0.8108 s (1.14)
AVX2 quick sort ... 0.7222 s (1.28)
AVX512 quick sort ... 0.5265 s (1.76)
AVX512 + popcnt quick sort ... 0.5288 s (1.75)
AVX512 + BMI2 quick sort ... 0.5402 s (1.71)
items count: 20000000 (80000000 bytes), input ascending
std::sort ... 1.9285 s
quick sort ... 1.6725 s (1.15)
AVX2 quick sort ... 1.4766 s (1.31)
AVX512 quick sort ... 1.0681 s (1.81)
AVX512 + popcnt quick sort ... 1.0699 s (1.80)
AVX512 + BMI2 quick sort ... 1.0949 s (1.76)
items count: 10000 (40000 bytes), input descending
std::sort ... 0.0004 s
quick sort ... 0.0006 s (0.73)
AVX2 quick sort ... 0.0006 s (0.74)
AVX512 quick sort ... 0.0005 s (0.88)
AVX512 + popcnt quick sort ... 0.0005 s (0.89)
AVX512 + BMI2 quick sort ... 0.0005 s (0.87)
items count: 100000 (400000 bytes), input descending
std::sort ... 0.0048 s
quick sort ... 0.0067 s (0.71)
AVX2 quick sort ... 0.0070 s (0.68)
AVX512 quick sort ... 0.0055 s (0.87)
AVX512 + popcnt quick sort ... 0.0056 s (0.86)
AVX512 + BMI2 quick sort ... 0.0058 s (0.83)
items count: 1000000 (4000000 bytes), input descending
std::sort ... 0.0551 s
quick sort ... 0.0734 s (0.75)
AVX2 quick sort ... 0.0684 s (0.81)
AVX512 quick sort ... 0.0524 s (1.05)
AVX512 + popcnt quick sort ... 0.0526 s (1.05)
AVX512 + BMI2 quick sort ... 0.0547 s (1.01)
items count: 2000000 (8000000 bytes), input descending
std::sort ... 0.1154 s
quick sort ... 0.1523 s (0.76)
AVX2 quick sort ... 0.1407 s (0.82)
AVX512 quick sort ... 0.1055 s (1.09)
AVX512 + popcnt quick sort ... 0.1065 s (1.08)
AVX512 + BMI2 quick sort ... 0.1111 s (1.04)
items count: 5000000 (20000000 bytes), input descending
std::sort ... 0.3300 s
quick sort ... 0.3932 s (0.84)
AVX2 quick sort ... 0.3516 s (0.94)
AVX512 quick sort ... 0.2606 s (1.27)
AVX512 + popcnt quick sort ... 0.2602 s (1.27)
AVX512 + BMI2 quick sort ... 0.2657 s (1.24)
items count: 10000000 (40000000 bytes), input descending
std::sort ... 0.6890 s
quick sort ... 0.8134 s (0.85)
AVX2 quick sort ... 0.7216 s (0.95)
AVX512 quick sort ... 0.5273 s (1.31)
AVX512 + popcnt quick sort ... 0.5351 s (1.29)
AVX512 + BMI2 quick sort ... 0.5392 s (1.28)
items count: 20000000 (80000000 bytes), input descending
std::sort ... 1.4358 s
quick sort ... 1.6744 s (0.86)
AVX2 quick sort ... 1.4772 s (0.97)
AVX512 quick sort ... 1.0694 s (1.34)
AVX512 + popcnt quick sort ... 1.0675 s (1.34)
AVX512 + BMI2 quick sort ... 1.0929 s (1.31)
items count: 10000 (40000 bytes), input random
std::sort ... 0.0005 s
quick sort ... 0.0006 s (0.98)
AVX2 quick sort ... 0.0006 s (0.99)
AVX512 quick sort ... 0.0005 s (1.18)
AVX512 + popcnt quick sort ... 0.0005 s (1.18)
AVX512 + BMI2 quick sort ... 0.0005 s (1.16)
items count: 100000 (400000 bytes), input random
std::sort ... 0.0063 s
quick sort ... 0.0067 s (0.94)
AVX2 quick sort ... 0.0071 s (0.89)
AVX512 quick sort ... 0.0055 s (1.14)
AVX512 + popcnt quick sort ... 0.0056 s (1.13)
AVX512 + BMI2 quick sort ... 0.0058 s (1.08)
items count: 1000000 (4000000 bytes), input random
std::sort ... 0.0727 s
quick sort ... 0.0742 s (0.98)
AVX2 quick sort ... 0.0684 s (1.06)
AVX512 quick sort ... 0.0522 s (1.39)
AVX512 + popcnt quick sort ... 0.0526 s (1.38)
AVX512 + BMI2 quick sort ... 0.0544 s (1.34)
items count: 2000000 (8000000 bytes), input random
std::sort ... 0.1533 s
quick sort ... 0.1532 s (1.00)
AVX2 quick sort ... 0.1403 s (1.09)
AVX512 quick sort ... 0.1057 s (1.45)
AVX512 + popcnt quick sort ... 0.1066 s (1.44)
AVX512 + BMI2 quick sort ... 0.1101 s (1.39)
items count: 5000000 (20000000 bytes), input random
std::sort ... 0.4436 s
quick sort ... 0.3932 s (1.13)
AVX2 quick sort ... 0.3529 s (1.26)
AVX512 quick sort ... 0.2622 s (1.69)
AVX512 + popcnt quick sort ... 0.2632 s (1.69)
AVX512 + BMI2 quick sort ... 0.2657 s (1.67)
items count: 10000000 (40000000 bytes), input random
std::sort ... 0.9235 s
quick sort ... 0.8115 s (1.14)
AVX2 quick sort ... 0.7233 s (1.28)
AVX512 quick sort ... 0.5305 s (1.74)
AVX512 + popcnt quick sort ... 0.5328 s (1.73)
AVX512 + BMI2 quick sort ... 0.5379 s (1.72)
items count: 20000000 (80000000 bytes), input random
std::sort ... 1.9264 s
quick sort ... 1.6716 s (1.15)
AVX2 quick sort ... 1.4898 s (1.29)
AVX512 quick sort ... 1.0853 s (1.77)
AVX512 + popcnt quick sort ... 1.0849 s (1.78)
AVX512 + BMI2 quick sort ... 1.0984 s (1.75)

View File

@ -0,0 +1,80 @@
items count: 100000 (400000 bytes), input ascending
std::sort ... 0.0011 s
quick sort ... 0.0009 s (1.14)
AVX2 quick sort ... 0.0007 s (1.59)
items count: 1000000 (4000000 bytes), input ascending
std::sort ... 0.0122 s
quick sort ... 0.0099 s (1.23)
AVX2 quick sort ... 0.0065 s (1.89)
items count: 2000000 (8000000 bytes), input ascending
std::sort ... 0.0257 s
quick sort ... 0.0204 s (1.26)
AVX2 quick sort ... 0.0133 s (1.94)
items count: 5000000 (20000000 bytes), input ascending
std::sort ... 0.0724 s
quick sort ... 0.0576 s (1.26)
AVX2 quick sort ... 0.0376 s (1.93)
items count: 10000000 (40000000 bytes), input ascending
std::sort ... 0.1516 s
quick sort ... 0.1194 s (1.27)
AVX2 quick sort ... 0.0773 s (1.96)
items count: 20000000 (80000000 bytes), input ascending
std::sort ... 0.3167 s
quick sort ... 0.2480 s (1.28)
AVX2 quick sort ... 0.1598 s (1.98)
items count: 10000 (40000 bytes), input descending
std::sort ... 0.0001 s
quick sort ... 0.0001 s (0.91)
AVX2 quick sort ... 0.0001 s (1.15)
items count: 100000 (400000 bytes), input descending
std::sort ... 0.0009 s
quick sort ... 0.0009 s (0.91)
AVX2 quick sort ... 0.0007 s (1.26)
items count: 1000000 (4000000 bytes), input descending
std::sort ... 0.0095 s
quick sort ... 0.0099 s (0.96)
AVX2 quick sort ... 0.0065 s (1.47)
items count: 2000000 (8000000 bytes), input descending
std::sort ... 0.0196 s
quick sort ... 0.0207 s (0.95)
AVX2 quick sort ... 0.0132 s (1.48)
items count: 5000000 (20000000 bytes), input descending
std::sort ... 0.0539 s
quick sort ... 0.0576 s (0.94)
AVX2 quick sort ... 0.0377 s (1.43)
items count: 10000000 (40000000 bytes), input descending
std::sort ... 0.1130 s
quick sort ... 0.1195 s (0.95)
AVX2 quick sort ... 0.0778 s (1.45)
items count: 20000000 (80000000 bytes), input descending
std::sort ... 0.2353 s
quick sort ... 0.2479 s (0.95)
AVX2 quick sort ... 0.1604 s (1.47)
items count: 10000 (40000 bytes), input random
std::sort ... 0.0001 s
quick sort ... 0.0001 s (1.19)
AVX2 quick sort ... 0.0001 s (1.51)
items count: 100000 (400000 bytes), input random
std::sort ... 0.0011 s
quick sort ... 0.0009 s (1.13)
AVX2 quick sort ... 0.0007 s (1.59)
items count: 1000000 (4000000 bytes), input random
std::sort ... 0.0123 s
quick sort ... 0.0099 s (1.23)
AVX2 quick sort ... 0.0064 s (1.91)
items count: 2000000 (8000000 bytes), input random
std::sort ... 0.0257 s
quick sort ... 0.0207 s (1.24)
AVX2 quick sort ... 0.0133 s (1.93)
items count: 5000000 (20000000 bytes), input random
std::sort ... 0.0726 s
quick sort ... 0.0578 s (1.26)
AVX2 quick sort ... 0.0377 s (1.93)
items count: 10000000 (40000000 bytes), input random
std::sort ... 0.1518 s
quick sort ... 0.1200 s (1.26)
AVX2 quick sort ... 0.0778 s (1.95)
items count: 20000000 (80000000 bytes), input random
std::sort ... 0.3170 s
quick sort ... 0.2490 s (1.27)
AVX2 quick sort ... 0.1614 s (1.96)

View File

@ -0,0 +1,76 @@
class Histogram {
size_t hist[256];
public:
Histogram() {
reset();
}
void hit(uint8_t v) {
hist[v] += 1;
}
void reset() {
memset(hist, 0, sizeof(hist));
}
bool empty() const {
for (int i=0; i < 256; i++) {
if (hist[i] > 0) {
return false;
}
}
return true;
}
public:
void print() const {
size_t total = 0;
for (int i=0; i < 256; i++) {
total += hist[i];
}
if (total == 0) return;
for (int i=0; i < 256; i++) {
if (hist[i] == 0) continue;
printf("%02x: %5lu (%5.2f%%)\n", i, hist[i], 100.0 * hist[i]/total);
}
}
};
class Statistics {
public:
size_t partition_calls;
size_t items_processed;
size_t scalar__partition_calls;
size_t scalar__items_processed;
Histogram pvbyte_histogram;
public:
Statistics() {
reset();
}
void reset() {
partition_calls = 0;
items_processed = 0;
scalar__partition_calls = 0;
scalar__items_processed = 0;
pvbyte_histogram.reset();
}
bool anything_collected() const {
return (partition_calls > 0)
|| (items_processed > 0)
|| (scalar__partition_calls > 0)
|| (scalar__items_processed > 0)
|| (!pvbyte_histogram.empty());
}
};
Statistics statistics;

457
simd-sort/speed.cpp Normal file
View File

@ -0,0 +1,457 @@
#include <cstdlib>
#include <cstdio>
#include <cstdint>
#include <cstring>
#include <cassert>
#include <algorithm>
#include <memory>
#include "cmdline.cpp"
#ifdef WITH_RUNTIME_STATS
# include "runtime_stats.cpp" // must be included before anything else
#endif
#include "input_data.cpp"
#include "quicksort-all.cpp"
#include "avx2-altquicksort.h"
//#include "avx2-nate-quicksort.cpp"
#include "../magyarsort.h" // mine
#include "avx2-natenodutch-quicksort.h"
#define USE_RDTSC // undef to get measurments in seconds
#ifdef USE_RDTSC
# include "rdtsc.cpp"
#else
# include "gettime.cpp"
#endif
void magyarsort_it(uint32_t* array, int left, int right) {
MagyarSort::sort(array + left, right - left);
}
class PerformanceTest final {
int iterations;
InputData& input;
uint32_t* tmp;
public:
PerformanceTest(int n, InputData& input)
: iterations(n)
, input(input) {
assert(iterations > 0);
tmp = new uint32_t[input.count()];
}
~PerformanceTest() {
delete[] tmp;
}
public:
template <typename SORT_FUNCTION>
uint64_t run(SORT_FUNCTION sort) {
uint64_t time = 0;
int k = iterations;
while (k--) {
memcpy(tmp, input.pointer(), input.size());
uint64_t t1, t2;
#ifdef USE_RDTSC
RDTSC_START(t1);
#else
t1 = get_time();
#endif
sort(input.pointer(), 0, input.count() - 1);
#ifdef USE_RDTSC
RDTSC_START(t2);
#else
t2 = get_time();
#endif
const uint64_t dt = t2 - t1;
if (time == 0) {
time = dt;
} else if (dt < time) {
time = dt;
}
}
return time;
}
};
enum class InputType {
randomfew,
randomuniq,
random,
ascending,
descending,
};
const char* as_string(InputType type) {
switch (type) {
case InputType::randomfew:
return "randomfew";
case InputType::randomuniq:
return "randomuniq";
case InputType::random:
return "random";
case InputType::ascending:
return "ascending";
case InputType::descending:
return "descending";
default:
return "<unknown>";
}
}
void std_qsort_wrapper(uint32_t* array, int left, int right) {
std::qsort(array + left, right - left + 1, sizeof(uint32_t), [](const void* a, const void* b)
{
uint32_t a1 = *static_cast<const uint32_t*>(a);
uint32_t a2 = *static_cast<const uint32_t*>(b);
if(a1 < a2) return -1;
if(a1 > a2) return 1;
return 0;
});
}
void std_stable_sort_wrapper(uint32_t* array, int left, int right) {
std::stable_sort(array + left, array + right + 1);
}
void std_sort_wrapper(uint32_t* array, int left, int right) {
std::sort(array + left, array + right + 1);
}
class Flags {
public:
bool std_sort;
bool std_qsort;
bool std_stable_sort;
bool quicksort;
bool avx2;
bool magyar;
bool avx2_alt;
bool avx2_natenodutch;
bool avx512;
bool avx512_buf;
bool avx512_popcnt;
bool avx512_bmi;
public:
Flags(const CommandLine& cmd) {
enable_all(false);
bool any_set = false;
if (cmd.has("-std-sort")) {
std_sort = true;
any_set = true;
}
if (cmd.has("-std-qsort")) {
std_qsort = true;
any_set = true;
}
if (cmd.has("-std-stable-sort") || cmd.has("-std-stable")) {
std_stable_sort = true;
any_set = true;
}
if (cmd.has("-quicksort")) {
quicksort = true;
any_set = true;
}
if (cmd.has("-avx2")) {
avx2 = true;
any_set = true;
}
if (cmd.has("-avx2-alt")) {
avx2_alt = true;
any_set = true;
}
if (cmd.has("magyar")) {
magyar = true;
avx2_natenodutch = true;
any_set = true;
}
if (cmd.has("-avx512")) {
avx512 = true;
any_set = true;
}
if (cmd.has("-avx512-buf")) {
avx512_buf = true;
any_set = true;
}
if (cmd.has("-avx512-popcnt")) {
avx512_popcnt = true;
any_set = true;
}
if (cmd.has("-avx512-bmi")) {
avx512_bmi = true;
any_set = true;
}
if (!any_set) {
enable_all(true);
}
}
void enable_all(bool val) {
std_sort = val;
std_qsort = val;
std_stable_sort = val;
quicksort = val;
avx2 = val;
magyar = val;
avx2_alt = val;
avx2_natenodutch = val;
avx512 = val;
avx512_buf = val;
avx512_popcnt = val;
avx512_bmi = val;
}
};
class Test {
std::unique_ptr<InputData> data;
InputType type;
size_t count;
int iterations;
Flags flags;
uint64_t ref;
public:
Test(InputType type, size_t count, int iterations, Flags&& flags)
: type(type)
, count(count)
, iterations(iterations)
, flags(std::move(flags)) {
switch (type) {
case InputType::randomfew:
data.reset(new InputRandomFew(count));
break;
case InputType::randomuniq:
data.reset(new InputRandomUnique(count));
break;
case InputType::random:
data.reset(new InputRandom(count));
break;
case InputType::ascending:
data.reset(new InputAscending(count));
break;
case InputType::descending:
data.reset(new InputDescending(count));
break;
}
}
void run() {
printf("items count: %lu (%lu bytes), input %s\n", data->count(), data->size(), as_string(type));
ref = 0;
if (flags.std_sort) {
measure("std::sort", std_sort_wrapper);
}
if (flags.std_qsort) {
measure("std::qsort", std_qsort_wrapper);
}
if (flags.std_stable_sort) {
measure("std::stable_sort", std_stable_sort_wrapper);
}
if (flags.std_qsort) {
measure("quick sort", quicksort);
}
#ifdef HAVE_AVX2_INSTRUCTIONS
if (flags.avx2) {
measure("AVX2 quick sort", qs::avx2::quicksort);
}
if (flags.avx2_natenodutch) {
measure("AVX2 nate nodutch", avx_natenodutch_quicksort);
}
if (flags.avx2_alt) {
measure("AVX2 alt quicksort", wrapped_avx2_pivotonlast_sort);
}
if (flags.magyar) {
measure("Magyarsort variant", magyarsort_it);
}
#endif
#ifdef HAVE_AVX512F_INSTRUCTIONS
if (flags.avx512) {
measure("AVX512 quick sort", qs::avx512::quicksort);
}
if (flags.avx512_buf) {
measure("AVX512 quick sort - aux buf", qs::avx512::auxbuffer_quicksort);
}
if (flags.avx512_popcnt) {
measure("AVX512 + popcnt quick sort", qs::avx512::popcnt_quicksort);
}
if (flags.avx512_bmi) {
measure("AVX512 + BMI2 quick sort", qs::avx512::bmi2_quicksort);
}
#endif
}
private:
template <typename SORT_FUNCTION>
void measure(const char* name, SORT_FUNCTION sort) {
PerformanceTest test(iterations, *data);
printf("%30s ... ", name); fflush(stdout);
#ifdef WITH_RUNTIME_STATS
statistics.reset();
#endif
uint64_t time = test.run(sort);
#ifdef USE_RDTSC
printf("%10lu cycles", time);
if (ref > 0) {
printf(" (%0.2f)", ref/double(time));
}
# ifdef WITH_RUNTIME_STATS
if (statistics.anything_collected()) {
printf("\n");
printf("\t\tpartition calls: %lu (+%lu scalar)\n", statistics.partition_calls, statistics.scalar__partition_calls);
printf("\t\titems processed: %lu (+%lu by scalar partition)\n", statistics.items_processed, statistics.scalar__items_processed);
const size_t total_items = statistics.items_processed + statistics.scalar__items_processed;
if (total_items != 0) {
const double cpi = double(time)/total_items;
printf("\t\t : %0.4f cycles/item\n", cpi * iterations);
}
if (!statistics.pvbyte_histogram.empty()) {
puts("Histogram for pvbyte:");
statistics.pvbyte_histogram.print();
}
}
# endif // WITH_RUNTIME_STATS
#else
printf("%0.4f s", time/1000000.0);
if (ref > 0) {
printf(" (%0.2f)\n", ref/double(time));
}
#endif
putchar('\n');
if (ref == 0) {
ref = time;
}
}
};
// ------------------------------------------------------------
void usage() {
puts("usage:");
puts("speed SIZE ITERATIONS INPUT [options]");
puts("");
puts("where");
puts("* SIZE - number of 32-bit elements");
puts("* ITERATIONS - number of iterations");
puts("* INPUT - one of:");
puts(" ascending (or asc)");
puts(" descending (or dsc, desc)");
puts(" random (or rnd, rand)");
puts(" randomfew");
puts(" randomuniq");
puts("options - optional name of procedure(s) to run");
}
int main(int argc, char* argv[]) {
if (argc < 4) {
usage();
return EXIT_FAILURE;
}
int count = atoi(argv[1]);
int iterations = atoi(argv[2]);
InputType type;
#define is_keyword(key) (strcmp(argv[3], key) == 0)
if (is_keyword("descending") || is_keyword("desc") || is_keyword("dsc")) {
type = InputType::descending;
} else if (is_keyword("ascending") || is_keyword("asc")) {
type = InputType::ascending;
} else if (is_keyword("random") || is_keyword("rnd") || is_keyword("rand")) {
type = InputType::random;
} else if (is_keyword("randomfew")) {
type = InputType::randomfew;
} else if (is_keyword("randomuniq")) {
type = InputType::randomuniq;
} else {
usage();
return EXIT_FAILURE;
}
#undef is_keyword
#ifdef HAVE_AVX512F_INSTRUCTIONS
#ifdef POPCNT_LOOKUP
prepare_lookup();
#endif
#endif
#ifdef USE_RDTSC
RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), iterations);
#endif
CommandLine cmd(argc, argv);
Flags flags(cmd);
Test test(type, count, iterations, std::move(flags));
test.run();
return EXIT_SUCCESS;
}

BIN
simd-sort/speed_avx2 Executable file

Binary file not shown.

261
simd-sort/test.cpp Normal file
View File

@ -0,0 +1,261 @@
#include <cstdlib>
#include <cstdio>
#include <cstdint>
#include <cstring>
#include <cassert>
#include <immintrin.h>
#include "cmdline.cpp"
#include "input_data.cpp"
#include "quicksort-all.cpp"
#include "avx2-altquicksort.h"
#include "avx2-nate-quicksort.cpp"
bool is_sorted(uint32_t* array, size_t n) {
assert(n > 0);
for (size_t i=1; i < n; i++) {
if (array[i - 1] > array[i]) {
printf("mismatch at %lu\n", i);
return false;
}
}
return true;
}
const size_t AVX512_REGISTER_SIZE = 16;
class Test {
bool verbose;
public:
Test(bool v = true) : verbose(v) {}
template <typename SORT_FN>
bool run(SORT_FN sort) {
const size_t start = 2*AVX512_REGISTER_SIZE;
const size_t end = 256*AVX512_REGISTER_SIZE;
if (verbose) {
putchar('\n');
}
for (size_t size=start; size < end; size += 1) {
if (verbose) {
printf("%lu/%lu\r", size, end);
fflush(stdout);
}
InputAscending asc(size);
InputDescending dsc(size);
InputRandom rnd(size);
InputRandomFew rndfew(size);
if (!test(sort, asc)) {
printf("failed for size %lu, intput ascending\n", size);
return false;
}
if (!test(sort, dsc)) {
printf("failed for size %lu, intput descending\n", size);
return false;
}
if (!test(sort, rnd)) {
printf("failed for size %lu, intput random\n", size);
return false;
}
if (!test(sort, rndfew)) {
printf("failed for size %lu, intput random few\n", size);
return false;
}
} // for
if (verbose) {
putchar('\n');
}
return true;
}
private:
template <typename SORT_FN>
bool test(SORT_FN sort, InputData& data) {
sort(data.pointer(), 0, data.count() - 1);
return is_sorted(data.pointer(), data.count());
}
};
class Flags {
public:
bool avx2;
bool avx2_alt;
bool avx2_nate;
bool avx512;
bool avx512_buf;
bool avx512_popcnt;
bool avx512_bmi;
public:
Flags(const CommandLine& cmd) {
enable_all(false);
bool any_set = false;
if (cmd.has("-avx2")) {
avx2 = true;
any_set = true;
}
if (cmd.has("-avx2-alt")) {
avx2_alt = true;
any_set = true;
}
if (cmd.has("-avx2-nate")) {
avx2_nate = true;
any_set = true;
}
if (cmd.has("-avx512")) {
avx512 = true;
any_set = true;
}
if (cmd.has("-avx512-buf")) {
avx512_buf = true;
any_set = true;
}
if (cmd.has("-avx512-popcnt")) {
avx512_popcnt = true;
any_set = true;
}
if (cmd.has("-avx512-bmi")) {
avx512_bmi = true;
any_set = true;
}
if (!any_set) {
enable_all(true);
}
}
void enable_all(bool val) {
avx2 = val;
avx2_nate = val;
avx2_alt = val;
avx512 = val;
avx512_buf = val;
avx512_popcnt = val;
avx512_bmi = val;
}
};
int main(int argc, char* argv[]) {
CommandLine cmd(argc, argv);
puts("Please wait, it might take a while...");
puts("");
bool verbose = cmd.has("-v") || cmd.has("--verbose");
Flags flags(cmd);
Test test(verbose);
int ret = EXIT_SUCCESS;
#ifdef HAVE_AVX2_INSTRUCTIONS
if (flags.avx2) {
printf("AVX2 base version... "); fflush(stdout);
if (test.run(qs::avx2::quicksort)) {
puts("OK");
} else {
puts("FAILED");
ret = EXIT_FAILURE;
}
}
if (flags.avx2_alt) {
printf("AVX2 alt version... "); fflush(stdout);
if (test.run(wrapped_avx2_pivotonlast_sort)) {
puts("OK");
} else {
puts("FAILED");
ret = EXIT_FAILURE;
}
}
if (flags.avx2_nate) {
printf("AVX2 Nate's variant... "); fflush(stdout);
if (test.run(nate::wrapped_avx2_pivotonlast_sort)) {
puts("OK");
} else {
puts("FAILED");
ret = EXIT_FAILURE;
}
}
#endif
#ifdef HAVE_AVX512F_INSTRUCTIONS
#ifdef POPCNT_LOOKUP
prepare_lookup();
#endif
if (flags.avx512) {
printf("AVX512 base version... "); fflush(stdout);
if (test.run(qs::avx512::quicksort)) {
puts("OK");
} else {
puts("FAILED");
ret = EXIT_FAILURE;
}
}
if (flags.avx512_popcnt) {
printf("AVX512 + popcnt version... "); fflush(stdout);
if (test.run(qs::avx512::popcnt_quicksort)) {
puts("OK");
} else {
puts("FAILED");
ret = EXIT_FAILURE;
}
}
if (flags.avx512_buf) {
printf("AVX512 with aux buffers... "); fflush(stdout);
if (test.run(qs::avx512::auxbuffer_quicksort)) {
puts("OK");
} else {
puts("FAILED");
ret = EXIT_FAILURE;
}
}
#if 0
if (flags.avx512_bmi) {
printf("AVX512 + bmi2 version ... "); fflush(stdout);
if (test.run(qs::avx512::bmi2_quicksort)) {
puts("OK");
} else {
puts("FAILED");
ret = EXIT_FAILURE;
}
}
#endif
#endif // HAVE_AVX512F_INSTRUCTIONS
return ret;
}