added simd-sort - basically the whole repo, but I haxed-in magyarsort as measure

2021-12-14 17:30:07 +01:00 · 2021-12-14 17:30:07 +01:00 · 05235e269f
commit 05235e269f
parent c4ed2994ea
32 changed files with 4139 additions and 0 deletions
--- a/simd-sort/LICENSE
+++ b/simd-sort/LICENSE
@ -0,0 +1,25 @@
+Copyright (c) 2016, Wojciech Muła, Daniel Lemire
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/simd-sort/Makefile
+++ b/simd-sort/Makefile
@ -0,0 +1,57 @@
+.SUFFIXES:
+.PHONY: all clean
+
+FLAGS=-std=c++11 -mbmi2 -Wall -pedantic -Wextra
+FLAGS_AVX512=$(FLAGS) -mavx512f -DHAVE_AVX512F_INSTRUCTIONS -DHAVE_AVX2_INSTRUCTIONS
+FLAGS_AVX2=$(FLAGS) -mavx2 -DHAVE_AVX2_INSTRUCTIONS
+
+DEPS_SORT=partition.cpp \
+          avx2-partition.cpp \
+          avx2-quicksort.cpp \
+          avx2-altquicksort.h \
+          avx2-natenodutch-quicksort.h \
+          avx2-nate-quicksort.cpp \
+          avx512-swap.cpp \
+          avx512-partition.cpp \
+          avx512-auxbuffer-partition.cpp \
+          avx512-bmi2-partition.cpp \
+          avx512-popcnt-partition.cpp \
+          avx512-quicksort.cpp \
+          avx512-sort-register.cpp \
+          avx512-partition-register.cpp \
+          quicksort.cpp
+
+SPEED_DEPS=$(DEPS_SORT) speed.cpp gettime.cpp rdtsc.cpp runtime_stats.cpp
+SPEED_FLAGS=-O3 -DNDEBUG
+
+ALL=test speed test_avx2 speed_avx2 speed_stats speed_avx2_stats
+
+all: $(ALL)
+
+test: test.cpp input_data.cpp $(DEPS_SORT)
+	$(CXX) $(FLAGS_AVX512) -fsanitize=address test.cpp -o $@
+
+test_avx2: test.cpp input_data.cpp $(DEPS_SORT)
+	#$(CXX) $(FLAGS_AVX2) -fsanitize=address test.cpp -o $@
+	$(CXX) $(FLAGS_AVX2) test.cpp -o $@
+
+speed: $(SPEED_DEPS)
+	$(CXX) $(FLAGS_AVX512) $(SPEED_FLAGS) speed.cpp -o $@
+
+speed_avx2: $(SPEED_DEPS)
+	$(CXX) $(FLAGS_AVX2) $(SPEED_FLAGS) speed.cpp -o $@
+
+speed_stats: $(SPEED_DEPS)
+	$(CXX) $(FLAGS_AVX512) $(SPEED_FLAGS) -DWITH_RUNTIME_STATS speed.cpp -o $@
+
+speed_avx2_stats: $(SPEED_DEPS)
+	$(CXX) $(FLAGS_AVX2) $(SPEED_FLAGS) -DWITH_RUNTIME_STATS speed.cpp -o $@
+
+run: test
+	sde -cnl -- ./$^
+
+run_avx2: test_avx2
+	sde -cnl -- ./$^
+
+clean:
+	rm -f $(ALL)
--- a/simd-sort/README.rst
+++ b/simd-sort/README.rst
@ -0,0 +1,48 @@
+================================================================================
+                                SIMD sorting
+================================================================================
+
+Overview
+--------------------------------------------------
+
+This repository contains SIMD versions of quicksort, using **AVX512F**
+and **AVX2** instruction sets. The subdirectory ``results`` contains
+performance results.
+
+The code is still unfinished, it may be subject of changes.
+
+Although programs are written in C++, most procedures are nearly
+plain C, except use of namespaces and references in place of
+pointers.
+
+
+Building
+--------------------------------------------------
+
+At least GCC 5.3.0 is needed.  Type ``make`` to build AVX512F
+and AVX2 versions of two programs:
+
+* ``test``/``test_avx2`` --- tests if SIMD-ized sorting procedures
+  work correctly for various inputs.
+
+* ``speed``/``speed_avx2`` --- compares speed of different procedures,
+  scalar and SIMD for various inputs.
+
+* ``speed_stats``/``speed_avx2_stats`` --- similar to ``speed``, but
+  some procedures collect and print runtime statistics. Note that
+  this gives overhead, so for real performance tests the basic variant
+  of the programs should be used.
+
+
+AVX512
+--------------------------------------------------
+
+`Intel Software Development Emulator`__ can be used to run AVX512 variants.
+
+__ https://software.intel.com/en-us/articles/intel-software-development-emulator
+
+
+References
+--------------------------------------------------
+
+- Fast Sorting Algorithms using AVX-512 on Intel Knights Landing https://arxiv.org/pdf/1704.08579.pdf
--- a/simd-sort/avx2-altquicksort.h
+++ b/simd-sort/avx2-altquicksort.h
@ -0,0 +1,461 @@
+#pragma once
+
+#include <x86intrin.h>
+#include <cstdint>
+
+/**
+* This is an alternative approach to SIMD quicksort, implemented by D. Lemire.
+* It is not meant to be fast as such, but it can serve as a useful reference.
+*/
+
+// When defined smaller reverseshufflemask is used.
+//#define BYTE_PATTERN
+
+// When defined histogram for comparison mask is collected
+//#define WITH_PVBYTE_HISTOGRAM
+
+#if WITH_RUNTIME_STATS == 0 && defined(WITH_PVBYTE_HISTOGRAM)
+#   undef WITH_RUNTIME_STATS
+#endif
+
+// can be replaced with VCOMPRESS on AVX-512
+static
+#ifdef BYTE_PATTERN
+    uint8_t
+#else
+    uint32_t
+#endif
+reverseshufflemask[256 * 8] __attribute__((aligned(0x100))) = {
+    0, 1, 2, 3, 4, 5, 6, 7, /* 0*/
+    1, 2, 3, 4, 5, 6, 7, 0, /* 1*/
+    0, 2, 3, 4, 5, 6, 7, 1, /* 2*/
+    2, 3, 4, 5, 6, 7, 0, 1, /* 3*/
+    0, 1, 3, 4, 5, 6, 7, 2, /* 4*/
+    1, 3, 4, 5, 6, 7, 0, 2, /* 5*/
+    0, 3, 4, 5, 6, 7, 1, 2, /* 6*/
+    3, 4, 5, 6, 7, 0, 1, 2, /* 7*/
+    0, 1, 2, 4, 5, 6, 7, 3, /* 8*/
+    1, 2, 4, 5, 6, 7, 0, 3, /* 9*/
+    0, 2, 4, 5, 6, 7, 1, 3, /* 10*/
+    2, 4, 5, 6, 7, 0, 1, 3, /* 11*/
+    0, 1, 4, 5, 6, 7, 2, 3, /* 12*/
+    1, 4, 5, 6, 7, 0, 2, 3, /* 13*/
+    0, 4, 5, 6, 7, 1, 2, 3, /* 14*/
+    4, 5, 6, 7, 0, 1, 2, 3, /* 15*/
+    0, 1, 2, 3, 5, 6, 7, 4, /* 16*/
+    1, 2, 3, 5, 6, 7, 0, 4, /* 17*/
+    0, 2, 3, 5, 6, 7, 1, 4, /* 18*/
+    2, 3, 5, 6, 7, 0, 1, 4, /* 19*/
+    0, 1, 3, 5, 6, 7, 2, 4, /* 20*/
+    1, 3, 5, 6, 7, 0, 2, 4, /* 21*/
+    0, 3, 5, 6, 7, 1, 2, 4, /* 22*/
+    3, 5, 6, 7, 0, 1, 2, 4, /* 23*/
+    0, 1, 2, 5, 6, 7, 3, 4, /* 24*/
+    1, 2, 5, 6, 7, 0, 3, 4, /* 25*/
+    0, 2, 5, 6, 7, 1, 3, 4, /* 26*/
+    2, 5, 6, 7, 0, 1, 3, 4, /* 27*/
+    0, 1, 5, 6, 7, 2, 3, 4, /* 28*/
+    1, 5, 6, 7, 0, 2, 3, 4, /* 29*/
+    0, 5, 6, 7, 1, 2, 3, 4, /* 30*/
+    5, 6, 7, 0, 1, 2, 3, 4, /* 31*/
+    0, 1, 2, 3, 4, 6, 7, 5, /* 32*/
+    1, 2, 3, 4, 6, 7, 0, 5, /* 33*/
+    0, 2, 3, 4, 6, 7, 1, 5, /* 34*/
+    2, 3, 4, 6, 7, 0, 1, 5, /* 35*/
+    0, 1, 3, 4, 6, 7, 2, 5, /* 36*/
+    1, 3, 4, 6, 7, 0, 2, 5, /* 37*/
+    0, 3, 4, 6, 7, 1, 2, 5, /* 38*/
+    3, 4, 6, 7, 0, 1, 2, 5, /* 39*/
+    0, 1, 2, 4, 6, 7, 3, 5, /* 40*/
+    1, 2, 4, 6, 7, 0, 3, 5, /* 41*/
+    0, 2, 4, 6, 7, 1, 3, 5, /* 42*/
+    2, 4, 6, 7, 0, 1, 3, 5, /* 43*/
+    0, 1, 4, 6, 7, 2, 3, 5, /* 44*/
+    1, 4, 6, 7, 0, 2, 3, 5, /* 45*/
+    0, 4, 6, 7, 1, 2, 3, 5, /* 46*/
+    4, 6, 7, 0, 1, 2, 3, 5, /* 47*/
+    0, 1, 2, 3, 6, 7, 4, 5, /* 48*/
+    1, 2, 3, 6, 7, 0, 4, 5, /* 49*/
+    0, 2, 3, 6, 7, 1, 4, 5, /* 50*/
+    2, 3, 6, 7, 0, 1, 4, 5, /* 51*/
+    0, 1, 3, 6, 7, 2, 4, 5, /* 52*/
+    1, 3, 6, 7, 0, 2, 4, 5, /* 53*/
+    0, 3, 6, 7, 1, 2, 4, 5, /* 54*/
+    3, 6, 7, 0, 1, 2, 4, 5, /* 55*/
+    0, 1, 2, 6, 7, 3, 4, 5, /* 56*/
+    1, 2, 6, 7, 0, 3, 4, 5, /* 57*/
+    0, 2, 6, 7, 1, 3, 4, 5, /* 58*/
+    2, 6, 7, 0, 1, 3, 4, 5, /* 59*/
+    0, 1, 6, 7, 2, 3, 4, 5, /* 60*/
+    1, 6, 7, 0, 2, 3, 4, 5, /* 61*/
+    0, 6, 7, 1, 2, 3, 4, 5, /* 62*/
+    6, 7, 0, 1, 2, 3, 4, 5, /* 63*/
+    0, 1, 2, 3, 4, 5, 7, 6, /* 64*/
+    1, 2, 3, 4, 5, 7, 0, 6, /* 65*/
+    0, 2, 3, 4, 5, 7, 1, 6, /* 66*/
+    2, 3, 4, 5, 7, 0, 1, 6, /* 67*/
+    0, 1, 3, 4, 5, 7, 2, 6, /* 68*/
+    1, 3, 4, 5, 7, 0, 2, 6, /* 69*/
+    0, 3, 4, 5, 7, 1, 2, 6, /* 70*/
+    3, 4, 5, 7, 0, 1, 2, 6, /* 71*/
+    0, 1, 2, 4, 5, 7, 3, 6, /* 72*/
+    1, 2, 4, 5, 7, 0, 3, 6, /* 73*/
+    0, 2, 4, 5, 7, 1, 3, 6, /* 74*/
+    2, 4, 5, 7, 0, 1, 3, 6, /* 75*/
+    0, 1, 4, 5, 7, 2, 3, 6, /* 76*/
+    1, 4, 5, 7, 0, 2, 3, 6, /* 77*/
+    0, 4, 5, 7, 1, 2, 3, 6, /* 78*/
+    4, 5, 7, 0, 1, 2, 3, 6, /* 79*/
+    0, 1, 2, 3, 5, 7, 4, 6, /* 80*/
+    1, 2, 3, 5, 7, 0, 4, 6, /* 81*/
+    0, 2, 3, 5, 7, 1, 4, 6, /* 82*/
+    2, 3, 5, 7, 0, 1, 4, 6, /* 83*/
+    0, 1, 3, 5, 7, 2, 4, 6, /* 84*/
+    1, 3, 5, 7, 0, 2, 4, 6, /* 85*/
+    0, 3, 5, 7, 1, 2, 4, 6, /* 86*/
+    3, 5, 7, 0, 1, 2, 4, 6, /* 87*/
+    0, 1, 2, 5, 7, 3, 4, 6, /* 88*/
+    1, 2, 5, 7, 0, 3, 4, 6, /* 89*/
+    0, 2, 5, 7, 1, 3, 4, 6, /* 90*/
+    2, 5, 7, 0, 1, 3, 4, 6, /* 91*/
+    0, 1, 5, 7, 2, 3, 4, 6, /* 92*/
+    1, 5, 7, 0, 2, 3, 4, 6, /* 93*/
+    0, 5, 7, 1, 2, 3, 4, 6, /* 94*/
+    5, 7, 0, 1, 2, 3, 4, 6, /* 95*/
+    0, 1, 2, 3, 4, 7, 5, 6, /* 96*/
+    1, 2, 3, 4, 7, 0, 5, 6, /* 97*/
+    0, 2, 3, 4, 7, 1, 5, 6, /* 98*/
+    2, 3, 4, 7, 0, 1, 5, 6, /* 99*/
+    0, 1, 3, 4, 7, 2, 5, 6, /* 100*/
+    1, 3, 4, 7, 0, 2, 5, 6, /* 101*/
+    0, 3, 4, 7, 1, 2, 5, 6, /* 102*/
+    3, 4, 7, 0, 1, 2, 5, 6, /* 103*/
+    0, 1, 2, 4, 7, 3, 5, 6, /* 104*/
+    1, 2, 4, 7, 0, 3, 5, 6, /* 105*/
+    0, 2, 4, 7, 1, 3, 5, 6, /* 106*/
+    2, 4, 7, 0, 1, 3, 5, 6, /* 107*/
+    0, 1, 4, 7, 2, 3, 5, 6, /* 108*/
+    1, 4, 7, 0, 2, 3, 5, 6, /* 109*/
+    0, 4, 7, 1, 2, 3, 5, 6, /* 110*/
+    4, 7, 0, 1, 2, 3, 5, 6, /* 111*/
+    0, 1, 2, 3, 7, 4, 5, 6, /* 112*/
+    1, 2, 3, 7, 0, 4, 5, 6, /* 113*/
+    0, 2, 3, 7, 1, 4, 5, 6, /* 114*/
+    2, 3, 7, 0, 1, 4, 5, 6, /* 115*/
+    0, 1, 3, 7, 2, 4, 5, 6, /* 116*/
+    1, 3, 7, 0, 2, 4, 5, 6, /* 117*/
+    0, 3, 7, 1, 2, 4, 5, 6, /* 118*/
+    3, 7, 0, 1, 2, 4, 5, 6, /* 119*/
+    0, 1, 2, 7, 3, 4, 5, 6, /* 120*/
+    1, 2, 7, 0, 3, 4, 5, 6, /* 121*/
+    0, 2, 7, 1, 3, 4, 5, 6, /* 122*/
+    2, 7, 0, 1, 3, 4, 5, 6, /* 123*/
+    0, 1, 7, 2, 3, 4, 5, 6, /* 124*/
+    1, 7, 0, 2, 3, 4, 5, 6, /* 125*/
+    0, 7, 1, 2, 3, 4, 5, 6, /* 126*/
+    7, 0, 1, 2, 3, 4, 5, 6, /* 127*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 128*/
+    1, 2, 3, 4, 5, 6, 0, 7, /* 129*/
+    0, 2, 3, 4, 5, 6, 1, 7, /* 130*/
+    2, 3, 4, 5, 6, 0, 1, 7, /* 131*/
+    0, 1, 3, 4, 5, 6, 2, 7, /* 132*/
+    1, 3, 4, 5, 6, 0, 2, 7, /* 133*/
+    0, 3, 4, 5, 6, 1, 2, 7, /* 134*/
+    3, 4, 5, 6, 0, 1, 2, 7, /* 135*/
+    0, 1, 2, 4, 5, 6, 3, 7, /* 136*/
+    1, 2, 4, 5, 6, 0, 3, 7, /* 137*/
+    0, 2, 4, 5, 6, 1, 3, 7, /* 138*/
+    2, 4, 5, 6, 0, 1, 3, 7, /* 139*/
+    0, 1, 4, 5, 6, 2, 3, 7, /* 140*/
+    1, 4, 5, 6, 0, 2, 3, 7, /* 141*/
+    0, 4, 5, 6, 1, 2, 3, 7, /* 142*/
+    4, 5, 6, 0, 1, 2, 3, 7, /* 143*/
+    0, 1, 2, 3, 5, 6, 4, 7, /* 144*/
+    1, 2, 3, 5, 6, 0, 4, 7, /* 145*/
+    0, 2, 3, 5, 6, 1, 4, 7, /* 146*/
+    2, 3, 5, 6, 0, 1, 4, 7, /* 147*/
+    0, 1, 3, 5, 6, 2, 4, 7, /* 148*/
+    1, 3, 5, 6, 0, 2, 4, 7, /* 149*/
+    0, 3, 5, 6, 1, 2, 4, 7, /* 150*/
+    3, 5, 6, 0, 1, 2, 4, 7, /* 151*/
+    0, 1, 2, 5, 6, 3, 4, 7, /* 152*/
+    1, 2, 5, 6, 0, 3, 4, 7, /* 153*/
+    0, 2, 5, 6, 1, 3, 4, 7, /* 154*/
+    2, 5, 6, 0, 1, 3, 4, 7, /* 155*/
+    0, 1, 5, 6, 2, 3, 4, 7, /* 156*/
+    1, 5, 6, 0, 2, 3, 4, 7, /* 157*/
+    0, 5, 6, 1, 2, 3, 4, 7, /* 158*/
+    5, 6, 0, 1, 2, 3, 4, 7, /* 159*/
+    0, 1, 2, 3, 4, 6, 5, 7, /* 160*/
+    1, 2, 3, 4, 6, 0, 5, 7, /* 161*/
+    0, 2, 3, 4, 6, 1, 5, 7, /* 162*/
+    2, 3, 4, 6, 0, 1, 5, 7, /* 163*/
+    0, 1, 3, 4, 6, 2, 5, 7, /* 164*/
+    1, 3, 4, 6, 0, 2, 5, 7, /* 165*/
+    0, 3, 4, 6, 1, 2, 5, 7, /* 166*/
+    3, 4, 6, 0, 1, 2, 5, 7, /* 167*/
+    0, 1, 2, 4, 6, 3, 5, 7, /* 168*/
+    1, 2, 4, 6, 0, 3, 5, 7, /* 169*/
+    0, 2, 4, 6, 1, 3, 5, 7, /* 170*/
+    2, 4, 6, 0, 1, 3, 5, 7, /* 171*/
+    0, 1, 4, 6, 2, 3, 5, 7, /* 172*/
+    1, 4, 6, 0, 2, 3, 5, 7, /* 173*/
+    0, 4, 6, 1, 2, 3, 5, 7, /* 174*/
+    4, 6, 0, 1, 2, 3, 5, 7, /* 175*/
+    0, 1, 2, 3, 6, 4, 5, 7, /* 176*/
+    1, 2, 3, 6, 0, 4, 5, 7, /* 177*/
+    0, 2, 3, 6, 1, 4, 5, 7, /* 178*/
+    2, 3, 6, 0, 1, 4, 5, 7, /* 179*/
+    0, 1, 3, 6, 2, 4, 5, 7, /* 180*/
+    1, 3, 6, 0, 2, 4, 5, 7, /* 181*/
+    0, 3, 6, 1, 2, 4, 5, 7, /* 182*/
+    3, 6, 0, 1, 2, 4, 5, 7, /* 183*/
+    0, 1, 2, 6, 3, 4, 5, 7, /* 184*/
+    1, 2, 6, 0, 3, 4, 5, 7, /* 185*/
+    0, 2, 6, 1, 3, 4, 5, 7, /* 186*/
+    2, 6, 0, 1, 3, 4, 5, 7, /* 187*/
+    0, 1, 6, 2, 3, 4, 5, 7, /* 188*/
+    1, 6, 0, 2, 3, 4, 5, 7, /* 189*/
+    0, 6, 1, 2, 3, 4, 5, 7, /* 190*/
+    6, 0, 1, 2, 3, 4, 5, 7, /* 191*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 192*/
+    1, 2, 3, 4, 5, 0, 6, 7, /* 193*/
+    0, 2, 3, 4, 5, 1, 6, 7, /* 194*/
+    2, 3, 4, 5, 0, 1, 6, 7, /* 195*/
+    0, 1, 3, 4, 5, 2, 6, 7, /* 196*/
+    1, 3, 4, 5, 0, 2, 6, 7, /* 197*/
+    0, 3, 4, 5, 1, 2, 6, 7, /* 198*/
+    3, 4, 5, 0, 1, 2, 6, 7, /* 199*/
+    0, 1, 2, 4, 5, 3, 6, 7, /* 200*/
+    1, 2, 4, 5, 0, 3, 6, 7, /* 201*/
+    0, 2, 4, 5, 1, 3, 6, 7, /* 202*/
+    2, 4, 5, 0, 1, 3, 6, 7, /* 203*/
+    0, 1, 4, 5, 2, 3, 6, 7, /* 204*/
+    1, 4, 5, 0, 2, 3, 6, 7, /* 205*/
+    0, 4, 5, 1, 2, 3, 6, 7, /* 206*/
+    4, 5, 0, 1, 2, 3, 6, 7, /* 207*/
+    0, 1, 2, 3, 5, 4, 6, 7, /* 208*/
+    1, 2, 3, 5, 0, 4, 6, 7, /* 209*/
+    0, 2, 3, 5, 1, 4, 6, 7, /* 210*/
+    2, 3, 5, 0, 1, 4, 6, 7, /* 211*/
+    0, 1, 3, 5, 2, 4, 6, 7, /* 212*/
+    1, 3, 5, 0, 2, 4, 6, 7, /* 213*/
+    0, 3, 5, 1, 2, 4, 6, 7, /* 214*/
+    3, 5, 0, 1, 2, 4, 6, 7, /* 215*/
+    0, 1, 2, 5, 3, 4, 6, 7, /* 216*/
+    1, 2, 5, 0, 3, 4, 6, 7, /* 217*/
+    0, 2, 5, 1, 3, 4, 6, 7, /* 218*/
+    2, 5, 0, 1, 3, 4, 6, 7, /* 219*/
+    0, 1, 5, 2, 3, 4, 6, 7, /* 220*/
+    1, 5, 0, 2, 3, 4, 6, 7, /* 221*/
+    0, 5, 1, 2, 3, 4, 6, 7, /* 222*/
+    5, 0, 1, 2, 3, 4, 6, 7, /* 223*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 224*/
+    1, 2, 3, 4, 0, 5, 6, 7, /* 225*/
+    0, 2, 3, 4, 1, 5, 6, 7, /* 226*/
+    2, 3, 4, 0, 1, 5, 6, 7, /* 227*/
+    0, 1, 3, 4, 2, 5, 6, 7, /* 228*/
+    1, 3, 4, 0, 2, 5, 6, 7, /* 229*/
+    0, 3, 4, 1, 2, 5, 6, 7, /* 230*/
+    3, 4, 0, 1, 2, 5, 6, 7, /* 231*/
+    0, 1, 2, 4, 3, 5, 6, 7, /* 232*/
+    1, 2, 4, 0, 3, 5, 6, 7, /* 233*/
+    0, 2, 4, 1, 3, 5, 6, 7, /* 234*/
+    2, 4, 0, 1, 3, 5, 6, 7, /* 235*/
+    0, 1, 4, 2, 3, 5, 6, 7, /* 236*/
+    1, 4, 0, 2, 3, 5, 6, 7, /* 237*/
+    0, 4, 1, 2, 3, 5, 6, 7, /* 238*/
+    4, 0, 1, 2, 3, 5, 6, 7, /* 239*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 240*/
+    1, 2, 3, 0, 4, 5, 6, 7, /* 241*/
+    0, 2, 3, 1, 4, 5, 6, 7, /* 242*/
+    2, 3, 0, 1, 4, 5, 6, 7, /* 243*/
+    0, 1, 3, 2, 4, 5, 6, 7, /* 244*/
+    1, 3, 0, 2, 4, 5, 6, 7, /* 245*/
+    0, 3, 1, 2, 4, 5, 6, 7, /* 246*/
+    3, 0, 1, 2, 4, 5, 6, 7, /* 247*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 248*/
+    1, 2, 0, 3, 4, 5, 6, 7, /* 249*/
+    0, 2, 1, 3, 4, 5, 6, 7, /* 250*/
+    2, 0, 1, 3, 4, 5, 6, 7, /* 251*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 252*/
+    1, 0, 2, 3, 4, 5, 6, 7, /* 253*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 254*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 255*/
+};
+
+
+static FORCE_INLINE __m256i get_permutation_vector(int pvbyte) {
+#ifdef BYTE_PATTERN
+    __m256i shufm;
+    asm volatile (
+        "vpmovzxbd (%1), %0"
+        : "=X" (shufm)
+        : "r" (reverseshufflemask + 8 * pvbyte)
+    );
+
+    return shufm;
+#else
+    return _mm256_load_si256((__m256i *)(reverseshufflemask + 8 * pvbyte));
+#endif
+}
+
+
+static uint32_t avx_pivot_on_last_value(int32_t *array, size_t length) {
+  /* we run through the data. Anything in [0,boundary) is smaller or equal
+  * than the pivot, and the value at boundary - 1 is going to be equal to the
+  * pivot at the end,
+  * anything in (boundary, i) is greater than the pivot
+  * stuff in [i,...) is grey
+  * the function returns the location of the boundary.
+  */
+  if (length <= 1)
+    return 1;
+  { // we exchange the last value for the middle value for a better pivot
+    int32_t ival = array[length / 2];
+    int32_t bval = array[length - 1];
+    array[length / 2] = bval;
+    array[length - 1] = ival;
+  }
+#if WITH_RUNTIME_STATS
+  statistics.partition_calls += 1;
+  statistics.items_processed += length;
+#endif
+  uint32_t boundary = 0;
+  uint32_t i = 0;
+  int32_t pivot = array[length - 1]; // we always pick the pivot at the end
+  const __m256i P = _mm256_set1_epi32(pivot);
+  while ( i + 8 + 1 <= length) {
+      __m256i allgrey = _mm256_lddqu_si256((__m256i *)(array + i));
+      int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(allgrey, P));
+#if WITH_RUNTIME_STATS && defined(WITH_PVBYTE_HISTOGRAM)
+      statistics.pvbyte_histogram.hit(pvbyte);
+#endif
+      if(pvbyte == 0) { // might be frequent
+        i += 8; //nothing to do
+        boundary = i;
+      } else if (pvbyte == 0xFF) { // called once
+        boundary = i;
+        i += 8;
+        break; // exit
+      } else {
+
+        // hot path
+        switch (pvbyte) {
+            // for pvbyte = 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff
+            //              there is no change in order, just advance boundary
+            //              Note: case 0x00 & 0xff are already handled
+            case 0x80: i += 8 - 1; break;
+            case 0xc0: i += 8 - 2; break;
+            case 0xe0: i += 8 - 3; break;
+            case 0xf0: i += 8 - 4; break;
+            case 0xf8: i += 8 - 5; break;
+            case 0xfc: i += 8 - 6; break;
+            case 0xfe: i += 8 - 7; break;
+
+            default: {
+              uint32_t cnt =
+                  8 - _mm_popcnt_u32(pvbyte); // might be faster with table look-up?
+              __m256i blackthenwhite = _mm256_permutevar8x32_epi32(allgrey, get_permutation_vector(pvbyte));
+              _mm256_storeu_si256((__m256i *)(array + i), blackthenwhite);
+              i += cnt;
+            }
+        } // switch
+
+        boundary = i; // this doesn't need updating each and every time
+    }
+  }
+  for (; i + 8 + 1 <= length ;) {
+      __m256i allgrey =
+          _mm256_lddqu_si256((__m256i *)(array + i)); // this is all grey
+      int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(allgrey, P));
+      if (pvbyte == 0xFF) { // called once
+        // nothing to do
+      } else {
+
+      uint32_t cnt =
+          8 - _mm_popcnt_u32(pvbyte); // might be faster with table look-up?
+      __m256i allwhite = _mm256_lddqu_si256(
+          (__m256i *)(array + boundary)); // this is all white
+      // we shuffle allgrey so that the first part is black and the second part
+      // is white
+      __m256i blackthenwhite = _mm256_permutevar8x32_epi32(allgrey, get_permutation_vector(pvbyte));
+      _mm256_storeu_si256((__m256i *)(array + boundary), blackthenwhite);
+      _mm256_storeu_si256((__m256i *)(array + i), allwhite);
+      boundary += cnt; // might be faster with table look-up?
+    }
+      i += 8;
+  }
+  while (i + 1 < length) {
+    int32_t ival = array[i];
+    if (ival <= pivot) {
+      int32_t bval = array[boundary];
+      array[i] = bval;
+      array[boundary] = ival;
+      boundary++;
+    }
+    i++;
+  }
+  int32_t ival = array[i];
+  int32_t bval = array[boundary];
+  array[length - 1] = bval;
+  array[boundary] = ival;
+  boundary++;
+  return boundary;
+}
+
+// for fallback
+void scalar_partition(int32_t* array, const int32_t pivot, int& left, int& right) {
+
+    while (left <= right) {
+        while (array[left] < pivot) {
+            left += 1;
+        }
+        while (array[right] > pivot) {
+            right -= 1;
+        }
+        if (left <= right) {
+            const uint32_t t = array[left];
+            array[left]      = array[right];
+            array[right]     = t;
+            left  += 1;
+            right -= 1;
+        }
+    }
+}
+
+//fallback
+void scalar_quicksort(int32_t* array, int left, int right) {
+#ifdef WITH_RUNTIME_STATS
+    statistics.scalar__partition_calls += 1;
+    statistics.scalar__items_processed += right - left + 1;
+#endif
+    int i = left;
+    int j = right;
+    const int32_t pivot = array[(i + j)/2];
+    scalar_partition(array, pivot, i, j);
+    if (left < j) {
+        scalar_quicksort(array, left, j);
+    }
+    if (i < right) {
+        scalar_quicksort(array, i, right);
+    }
+}
+
+void avx2_pivotonlast_sort(int32_t *array, const uint32_t length) {
+  uint32_t sep = avx_pivot_on_last_value(array, length);
+  if(sep == length) {
+    // we have an ineffective pivot. Let us give up.
+    if(length > 1) scalar_quicksort(array,0,length - 1);
+  } else {
+    if (sep > 2) {
+      avx2_pivotonlast_sort(array, sep - 1);
+    }
+    if (sep + 1 < length) {
+      avx2_pivotonlast_sort(array + sep, length - sep);
+    }
+  }
+}
+void wrapped_avx2_pivotonlast_sort(uint32_t *array, int left, int right) {
+  avx2_pivotonlast_sort((int32_t *)array + left, right - left + 1);
+}
--- a/simd-sort/avx2-nate-quicksort.cpp
+++ b/simd-sort/avx2-nate-quicksort.cpp
@ -0,0 +1,436 @@
+#pragma once
+
+#include <x86intrin.h>
+#include <cstdint>
+
+namespace nate {
+
+const int AVX2_REGISTER_SIZE = 8; // in 32-bit words
+
+/*
+    Algorithm of vectorized partition by Nathan Kurz.
+    Everything else was borrowed from Daniel's code (avx2-altquicksort.h)
+*/
+
+static uint32_t reverseshufflemask[256 * 8] __attribute__((aligned(0x100))) = {
+    0, 1, 2, 3, 4, 5, 6, 7, /* 0*/
+    1, 2, 3, 4, 5, 6, 7, 0, /* 1*/
+    0, 2, 3, 4, 5, 6, 7, 1, /* 2*/
+    2, 3, 4, 5, 6, 7, 0, 1, /* 3*/
+    0, 1, 3, 4, 5, 6, 7, 2, /* 4*/
+    1, 3, 4, 5, 6, 7, 0, 2, /* 5*/
+    0, 3, 4, 5, 6, 7, 1, 2, /* 6*/
+    3, 4, 5, 6, 7, 0, 1, 2, /* 7*/
+    0, 1, 2, 4, 5, 6, 7, 3, /* 8*/
+    1, 2, 4, 5, 6, 7, 0, 3, /* 9*/
+    0, 2, 4, 5, 6, 7, 1, 3, /* 10*/
+    2, 4, 5, 6, 7, 0, 1, 3, /* 11*/
+    0, 1, 4, 5, 6, 7, 2, 3, /* 12*/
+    1, 4, 5, 6, 7, 0, 2, 3, /* 13*/
+    0, 4, 5, 6, 7, 1, 2, 3, /* 14*/
+    4, 5, 6, 7, 0, 1, 2, 3, /* 15*/
+    0, 1, 2, 3, 5, 6, 7, 4, /* 16*/
+    1, 2, 3, 5, 6, 7, 0, 4, /* 17*/
+    0, 2, 3, 5, 6, 7, 1, 4, /* 18*/
+    2, 3, 5, 6, 7, 0, 1, 4, /* 19*/
+    0, 1, 3, 5, 6, 7, 2, 4, /* 20*/
+    1, 3, 5, 6, 7, 0, 2, 4, /* 21*/
+    0, 3, 5, 6, 7, 1, 2, 4, /* 22*/
+    3, 5, 6, 7, 0, 1, 2, 4, /* 23*/
+    0, 1, 2, 5, 6, 7, 3, 4, /* 24*/
+    1, 2, 5, 6, 7, 0, 3, 4, /* 25*/
+    0, 2, 5, 6, 7, 1, 3, 4, /* 26*/
+    2, 5, 6, 7, 0, 1, 3, 4, /* 27*/
+    0, 1, 5, 6, 7, 2, 3, 4, /* 28*/
+    1, 5, 6, 7, 0, 2, 3, 4, /* 29*/
+    0, 5, 6, 7, 1, 2, 3, 4, /* 30*/
+    5, 6, 7, 0, 1, 2, 3, 4, /* 31*/
+    0, 1, 2, 3, 4, 6, 7, 5, /* 32*/
+    1, 2, 3, 4, 6, 7, 0, 5, /* 33*/
+    0, 2, 3, 4, 6, 7, 1, 5, /* 34*/
+    2, 3, 4, 6, 7, 0, 1, 5, /* 35*/
+    0, 1, 3, 4, 6, 7, 2, 5, /* 36*/
+    1, 3, 4, 6, 7, 0, 2, 5, /* 37*/
+    0, 3, 4, 6, 7, 1, 2, 5, /* 38*/
+    3, 4, 6, 7, 0, 1, 2, 5, /* 39*/
+    0, 1, 2, 4, 6, 7, 3, 5, /* 40*/
+    1, 2, 4, 6, 7, 0, 3, 5, /* 41*/
+    0, 2, 4, 6, 7, 1, 3, 5, /* 42*/
+    2, 4, 6, 7, 0, 1, 3, 5, /* 43*/
+    0, 1, 4, 6, 7, 2, 3, 5, /* 44*/
+    1, 4, 6, 7, 0, 2, 3, 5, /* 45*/
+    0, 4, 6, 7, 1, 2, 3, 5, /* 46*/
+    4, 6, 7, 0, 1, 2, 3, 5, /* 47*/
+    0, 1, 2, 3, 6, 7, 4, 5, /* 48*/
+    1, 2, 3, 6, 7, 0, 4, 5, /* 49*/
+    0, 2, 3, 6, 7, 1, 4, 5, /* 50*/
+    2, 3, 6, 7, 0, 1, 4, 5, /* 51*/
+    0, 1, 3, 6, 7, 2, 4, 5, /* 52*/
+    1, 3, 6, 7, 0, 2, 4, 5, /* 53*/
+    0, 3, 6, 7, 1, 2, 4, 5, /* 54*/
+    3, 6, 7, 0, 1, 2, 4, 5, /* 55*/
+    0, 1, 2, 6, 7, 3, 4, 5, /* 56*/
+    1, 2, 6, 7, 0, 3, 4, 5, /* 57*/
+    0, 2, 6, 7, 1, 3, 4, 5, /* 58*/
+    2, 6, 7, 0, 1, 3, 4, 5, /* 59*/
+    0, 1, 6, 7, 2, 3, 4, 5, /* 60*/
+    1, 6, 7, 0, 2, 3, 4, 5, /* 61*/
+    0, 6, 7, 1, 2, 3, 4, 5, /* 62*/
+    6, 7, 0, 1, 2, 3, 4, 5, /* 63*/
+    0, 1, 2, 3, 4, 5, 7, 6, /* 64*/
+    1, 2, 3, 4, 5, 7, 0, 6, /* 65*/
+    0, 2, 3, 4, 5, 7, 1, 6, /* 66*/
+    2, 3, 4, 5, 7, 0, 1, 6, /* 67*/
+    0, 1, 3, 4, 5, 7, 2, 6, /* 68*/
+    1, 3, 4, 5, 7, 0, 2, 6, /* 69*/
+    0, 3, 4, 5, 7, 1, 2, 6, /* 70*/
+    3, 4, 5, 7, 0, 1, 2, 6, /* 71*/
+    0, 1, 2, 4, 5, 7, 3, 6, /* 72*/
+    1, 2, 4, 5, 7, 0, 3, 6, /* 73*/
+    0, 2, 4, 5, 7, 1, 3, 6, /* 74*/
+    2, 4, 5, 7, 0, 1, 3, 6, /* 75*/
+    0, 1, 4, 5, 7, 2, 3, 6, /* 76*/
+    1, 4, 5, 7, 0, 2, 3, 6, /* 77*/
+    0, 4, 5, 7, 1, 2, 3, 6, /* 78*/
+    4, 5, 7, 0, 1, 2, 3, 6, /* 79*/
+    0, 1, 2, 3, 5, 7, 4, 6, /* 80*/
+    1, 2, 3, 5, 7, 0, 4, 6, /* 81*/
+    0, 2, 3, 5, 7, 1, 4, 6, /* 82*/
+    2, 3, 5, 7, 0, 1, 4, 6, /* 83*/
+    0, 1, 3, 5, 7, 2, 4, 6, /* 84*/
+    1, 3, 5, 7, 0, 2, 4, 6, /* 85*/
+    0, 3, 5, 7, 1, 2, 4, 6, /* 86*/
+    3, 5, 7, 0, 1, 2, 4, 6, /* 87*/
+    0, 1, 2, 5, 7, 3, 4, 6, /* 88*/
+    1, 2, 5, 7, 0, 3, 4, 6, /* 89*/
+    0, 2, 5, 7, 1, 3, 4, 6, /* 90*/
+    2, 5, 7, 0, 1, 3, 4, 6, /* 91*/
+    0, 1, 5, 7, 2, 3, 4, 6, /* 92*/
+    1, 5, 7, 0, 2, 3, 4, 6, /* 93*/
+    0, 5, 7, 1, 2, 3, 4, 6, /* 94*/
+    5, 7, 0, 1, 2, 3, 4, 6, /* 95*/
+    0, 1, 2, 3, 4, 7, 5, 6, /* 96*/
+    1, 2, 3, 4, 7, 0, 5, 6, /* 97*/
+    0, 2, 3, 4, 7, 1, 5, 6, /* 98*/
+    2, 3, 4, 7, 0, 1, 5, 6, /* 99*/
+    0, 1, 3, 4, 7, 2, 5, 6, /* 100*/
+    1, 3, 4, 7, 0, 2, 5, 6, /* 101*/
+    0, 3, 4, 7, 1, 2, 5, 6, /* 102*/
+    3, 4, 7, 0, 1, 2, 5, 6, /* 103*/
+    0, 1, 2, 4, 7, 3, 5, 6, /* 104*/
+    1, 2, 4, 7, 0, 3, 5, 6, /* 105*/
+    0, 2, 4, 7, 1, 3, 5, 6, /* 106*/
+    2, 4, 7, 0, 1, 3, 5, 6, /* 107*/
+    0, 1, 4, 7, 2, 3, 5, 6, /* 108*/
+    1, 4, 7, 0, 2, 3, 5, 6, /* 109*/
+    0, 4, 7, 1, 2, 3, 5, 6, /* 110*/
+    4, 7, 0, 1, 2, 3, 5, 6, /* 111*/
+    0, 1, 2, 3, 7, 4, 5, 6, /* 112*/
+    1, 2, 3, 7, 0, 4, 5, 6, /* 113*/
+    0, 2, 3, 7, 1, 4, 5, 6, /* 114*/
+    2, 3, 7, 0, 1, 4, 5, 6, /* 115*/
+    0, 1, 3, 7, 2, 4, 5, 6, /* 116*/
+    1, 3, 7, 0, 2, 4, 5, 6, /* 117*/
+    0, 3, 7, 1, 2, 4, 5, 6, /* 118*/
+    3, 7, 0, 1, 2, 4, 5, 6, /* 119*/
+    0, 1, 2, 7, 3, 4, 5, 6, /* 120*/
+    1, 2, 7, 0, 3, 4, 5, 6, /* 121*/
+    0, 2, 7, 1, 3, 4, 5, 6, /* 122*/
+    2, 7, 0, 1, 3, 4, 5, 6, /* 123*/
+    0, 1, 7, 2, 3, 4, 5, 6, /* 124*/
+    1, 7, 0, 2, 3, 4, 5, 6, /* 125*/
+    0, 7, 1, 2, 3, 4, 5, 6, /* 126*/
+    7, 0, 1, 2, 3, 4, 5, 6, /* 127*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 128*/
+    1, 2, 3, 4, 5, 6, 0, 7, /* 129*/
+    0, 2, 3, 4, 5, 6, 1, 7, /* 130*/
+    2, 3, 4, 5, 6, 0, 1, 7, /* 131*/
+    0, 1, 3, 4, 5, 6, 2, 7, /* 132*/
+    1, 3, 4, 5, 6, 0, 2, 7, /* 133*/
+    0, 3, 4, 5, 6, 1, 2, 7, /* 134*/
+    3, 4, 5, 6, 0, 1, 2, 7, /* 135*/
+    0, 1, 2, 4, 5, 6, 3, 7, /* 136*/
+    1, 2, 4, 5, 6, 0, 3, 7, /* 137*/
+    0, 2, 4, 5, 6, 1, 3, 7, /* 138*/
+    2, 4, 5, 6, 0, 1, 3, 7, /* 139*/
+    0, 1, 4, 5, 6, 2, 3, 7, /* 140*/
+    1, 4, 5, 6, 0, 2, 3, 7, /* 141*/
+    0, 4, 5, 6, 1, 2, 3, 7, /* 142*/
+    4, 5, 6, 0, 1, 2, 3, 7, /* 143*/
+    0, 1, 2, 3, 5, 6, 4, 7, /* 144*/
+    1, 2, 3, 5, 6, 0, 4, 7, /* 145*/
+    0, 2, 3, 5, 6, 1, 4, 7, /* 146*/
+    2, 3, 5, 6, 0, 1, 4, 7, /* 147*/
+    0, 1, 3, 5, 6, 2, 4, 7, /* 148*/
+    1, 3, 5, 6, 0, 2, 4, 7, /* 149*/
+    0, 3, 5, 6, 1, 2, 4, 7, /* 150*/
+    3, 5, 6, 0, 1, 2, 4, 7, /* 151*/
+    0, 1, 2, 5, 6, 3, 4, 7, /* 152*/
+    1, 2, 5, 6, 0, 3, 4, 7, /* 153*/
+    0, 2, 5, 6, 1, 3, 4, 7, /* 154*/
+    2, 5, 6, 0, 1, 3, 4, 7, /* 155*/
+    0, 1, 5, 6, 2, 3, 4, 7, /* 156*/
+    1, 5, 6, 0, 2, 3, 4, 7, /* 157*/
+    0, 5, 6, 1, 2, 3, 4, 7, /* 158*/
+    5, 6, 0, 1, 2, 3, 4, 7, /* 159*/
+    0, 1, 2, 3, 4, 6, 5, 7, /* 160*/
+    1, 2, 3, 4, 6, 0, 5, 7, /* 161*/
+    0, 2, 3, 4, 6, 1, 5, 7, /* 162*/
+    2, 3, 4, 6, 0, 1, 5, 7, /* 163*/
+    0, 1, 3, 4, 6, 2, 5, 7, /* 164*/
+    1, 3, 4, 6, 0, 2, 5, 7, /* 165*/
+    0, 3, 4, 6, 1, 2, 5, 7, /* 166*/
+    3, 4, 6, 0, 1, 2, 5, 7, /* 167*/
+    0, 1, 2, 4, 6, 3, 5, 7, /* 168*/
+    1, 2, 4, 6, 0, 3, 5, 7, /* 169*/
+    0, 2, 4, 6, 1, 3, 5, 7, /* 170*/
+    2, 4, 6, 0, 1, 3, 5, 7, /* 171*/
+    0, 1, 4, 6, 2, 3, 5, 7, /* 172*/
+    1, 4, 6, 0, 2, 3, 5, 7, /* 173*/
+    0, 4, 6, 1, 2, 3, 5, 7, /* 174*/
+    4, 6, 0, 1, 2, 3, 5, 7, /* 175*/
+    0, 1, 2, 3, 6, 4, 5, 7, /* 176*/
+    1, 2, 3, 6, 0, 4, 5, 7, /* 177*/
+    0, 2, 3, 6, 1, 4, 5, 7, /* 178*/
+    2, 3, 6, 0, 1, 4, 5, 7, /* 179*/
+    0, 1, 3, 6, 2, 4, 5, 7, /* 180*/
+    1, 3, 6, 0, 2, 4, 5, 7, /* 181*/
+    0, 3, 6, 1, 2, 4, 5, 7, /* 182*/
+    3, 6, 0, 1, 2, 4, 5, 7, /* 183*/
+    0, 1, 2, 6, 3, 4, 5, 7, /* 184*/
+    1, 2, 6, 0, 3, 4, 5, 7, /* 185*/
+    0, 2, 6, 1, 3, 4, 5, 7, /* 186*/
+    2, 6, 0, 1, 3, 4, 5, 7, /* 187*/
+    0, 1, 6, 2, 3, 4, 5, 7, /* 188*/
+    1, 6, 0, 2, 3, 4, 5, 7, /* 189*/
+    0, 6, 1, 2, 3, 4, 5, 7, /* 190*/
+    6, 0, 1, 2, 3, 4, 5, 7, /* 191*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 192*/
+    1, 2, 3, 4, 5, 0, 6, 7, /* 193*/
+    0, 2, 3, 4, 5, 1, 6, 7, /* 194*/
+    2, 3, 4, 5, 0, 1, 6, 7, /* 195*/
+    0, 1, 3, 4, 5, 2, 6, 7, /* 196*/
+    1, 3, 4, 5, 0, 2, 6, 7, /* 197*/
+    0, 3, 4, 5, 1, 2, 6, 7, /* 198*/
+    3, 4, 5, 0, 1, 2, 6, 7, /* 199*/
+    0, 1, 2, 4, 5, 3, 6, 7, /* 200*/
+    1, 2, 4, 5, 0, 3, 6, 7, /* 201*/
+    0, 2, 4, 5, 1, 3, 6, 7, /* 202*/
+    2, 4, 5, 0, 1, 3, 6, 7, /* 203*/
+    0, 1, 4, 5, 2, 3, 6, 7, /* 204*/
+    1, 4, 5, 0, 2, 3, 6, 7, /* 205*/
+    0, 4, 5, 1, 2, 3, 6, 7, /* 206*/
+    4, 5, 0, 1, 2, 3, 6, 7, /* 207*/
+    0, 1, 2, 3, 5, 4, 6, 7, /* 208*/
+    1, 2, 3, 5, 0, 4, 6, 7, /* 209*/
+    0, 2, 3, 5, 1, 4, 6, 7, /* 210*/
+    2, 3, 5, 0, 1, 4, 6, 7, /* 211*/
+    0, 1, 3, 5, 2, 4, 6, 7, /* 212*/
+    1, 3, 5, 0, 2, 4, 6, 7, /* 213*/
+    0, 3, 5, 1, 2, 4, 6, 7, /* 214*/
+    3, 5, 0, 1, 2, 4, 6, 7, /* 215*/
+    0, 1, 2, 5, 3, 4, 6, 7, /* 216*/
+    1, 2, 5, 0, 3, 4, 6, 7, /* 217*/
+    0, 2, 5, 1, 3, 4, 6, 7, /* 218*/
+    2, 5, 0, 1, 3, 4, 6, 7, /* 219*/
+    0, 1, 5, 2, 3, 4, 6, 7, /* 220*/
+    1, 5, 0, 2, 3, 4, 6, 7, /* 221*/
+    0, 5, 1, 2, 3, 4, 6, 7, /* 222*/
+    5, 0, 1, 2, 3, 4, 6, 7, /* 223*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 224*/
+    1, 2, 3, 4, 0, 5, 6, 7, /* 225*/
+    0, 2, 3, 4, 1, 5, 6, 7, /* 226*/
+    2, 3, 4, 0, 1, 5, 6, 7, /* 227*/
+    0, 1, 3, 4, 2, 5, 6, 7, /* 228*/
+    1, 3, 4, 0, 2, 5, 6, 7, /* 229*/
+    0, 3, 4, 1, 2, 5, 6, 7, /* 230*/
+    3, 4, 0, 1, 2, 5, 6, 7, /* 231*/
+    0, 1, 2, 4, 3, 5, 6, 7, /* 232*/
+    1, 2, 4, 0, 3, 5, 6, 7, /* 233*/
+    0, 2, 4, 1, 3, 5, 6, 7, /* 234*/
+    2, 4, 0, 1, 3, 5, 6, 7, /* 235*/
+    0, 1, 4, 2, 3, 5, 6, 7, /* 236*/
+    1, 4, 0, 2, 3, 5, 6, 7, /* 237*/
+    0, 4, 1, 2, 3, 5, 6, 7, /* 238*/
+    4, 0, 1, 2, 3, 5, 6, 7, /* 239*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 240*/
+    1, 2, 3, 0, 4, 5, 6, 7, /* 241*/
+    0, 2, 3, 1, 4, 5, 6, 7, /* 242*/
+    2, 3, 0, 1, 4, 5, 6, 7, /* 243*/
+    0, 1, 3, 2, 4, 5, 6, 7, /* 244*/
+    1, 3, 0, 2, 4, 5, 6, 7, /* 245*/
+    0, 3, 1, 2, 4, 5, 6, 7, /* 246*/
+    3, 0, 1, 2, 4, 5, 6, 7, /* 247*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 248*/
+    1, 2, 0, 3, 4, 5, 6, 7, /* 249*/
+    0, 2, 1, 3, 4, 5, 6, 7, /* 250*/
+    2, 0, 1, 3, 4, 5, 6, 7, /* 251*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 252*/
+    1, 0, 2, 3, 4, 5, 6, 7, /* 253*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 254*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 255*/
+};
+
+
+static uint32_t avx_pivot_on_last_value(int32_t *array, size_t length) {
+    if (length <= 1)
+        return 1;
+
+
+    { // we exchange the last value for the middle value for a better pivot
+        int32_t ival = array[length / 2];
+        int32_t bval = array[length - 1];
+        array[length / 2] = bval;
+        array[length - 1] = ival;
+    }
+
+    int wh, rh;
+    int wt, rt;
+
+    rh = 0;
+    rt = length - 1;
+    wh = 0;
+    wt = length - 1;
+
+    __m256i h0 = _mm256_loadu_si256((__m256i*)(array + rh));
+    __m256i h1 = _mm256_loadu_si256((__m256i*)(array + rh + AVX2_REGISTER_SIZE));
+    rh += 2*AVX2_REGISTER_SIZE;
+
+    __m256i t0 = _mm256_loadu_si256((__m256i*)(array + rt -   AVX2_REGISTER_SIZE));
+    __m256i t1 = _mm256_loadu_si256((__m256i*)(array + rt - 2*AVX2_REGISTER_SIZE));
+    rt -= 3*AVX2_REGISTER_SIZE;
+
+    __m256i current = _mm256_loadu_si256((__m256i*)(array + rh));
+    __m256i next;
+    rh += AVX2_REGISTER_SIZE;
+
+    const int32_t pivot = array[length - 1];
+    const __m256i P = _mm256_set1_epi32(pivot);
+
+    // 1. the main loop
+    while (wh - wt < 4*AVX2_REGISTER_SIZE) {
+        const bool which = (rh - wh) < (wt - rt);
+
+        // I believe that a compiler will emit branchless code
+        __m256i* next_ptr = (__m256i*)(which ? array + rh : array + rt);
+        const int adv_rh = which ? AVX2_REGISTER_SIZE : 0;
+        const int adv_rt = which ? 0 : AVX2_REGISTER_SIZE;
+        // My faith ends here
+
+        next = _mm256_loadu_si256(next_ptr);
+
+        const int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(current, P));
+        const uint32_t cnt = 8 - _mm_popcnt_u32(pvbyte);
+        const __m256i shuf =  _mm256_load_si256((__m256i *)(reverseshufflemask + 8 * pvbyte));
+
+        const __m256i ordered = _mm256_permutevar8x32_epi32(current, shuf);
+        _mm256_storeu_si256((__m256i*)(array + wh),     ordered);
+        _mm256_storeu_si256((__m256i*)(array + wt - 8), ordered);
+
+        rh += adv_rh;
+        rt -= adv_rt;
+        wh += cnt;
+        wt -= 8 - cnt;
+
+        current = next;
+    }
+
+    // 2. partition remaining part
+    while (wh - wt > 4*AVX2_REGISTER_SIZE) {
+        const int32_t v = array[rh++];
+        if (v < pivot) {
+            array[wh++] = v;
+        } else {
+            array[wt--] = v;
+        }
+    }
+
+    // 3. partition 4 registers loaded in the beginning
+    static uint32_t tmp[4*AVX2_REGISTER_SIZE];
+    _mm256_storeu_si256((__m256i*)(tmp + 0*AVX2_REGISTER_SIZE), h0);
+    _mm256_storeu_si256((__m256i*)(tmp + 1*AVX2_REGISTER_SIZE), h1);
+    _mm256_storeu_si256((__m256i*)(tmp + 2*AVX2_REGISTER_SIZE), t0);
+    _mm256_storeu_si256((__m256i*)(tmp + 3*AVX2_REGISTER_SIZE), t1);
+    for (int i=0; i < 4*AVX2_REGISTER_SIZE; i++) {
+        const int32_t v = array[i];
+        if (v < pivot) {
+            array[wh++] = v;
+        } else {
+            array[wt--] = v;
+        }
+    }
+
+    {
+        const int32_t a = array[length - 1];
+        const int32_t b = array[wh];
+
+        array[length - 1] = b;
+        array[wh] = a;
+    }
+
+    return wh;
+}
+
+// for fallback
+void scalar_partition(int32_t* array, const int32_t pivot, int& left, int& right) {
+
+    while (left <= right) {
+        while (array[left] < pivot) {
+            left += 1;
+        }
+        while (array[right] > pivot) {
+            right -= 1;
+        }
+        if (left <= right) {
+            const uint32_t t = array[left];
+            array[left]      = array[right];
+            array[right]     = t;
+            left  += 1;
+            right -= 1;
+        }
+    }
+}
+
+//fallback
+void scalar_quicksort(int32_t* array, int left, int right) {
+#ifdef WITH_RUNTIME_STATS
+    statistics.scalar__partition_calls += 1;
+    statistics.scalar__items_processed += right - left + 1;
+#endif
+    int i = left;
+    int j = right;
+    const int32_t pivot = array[(i + j)/2];
+    scalar_partition(array, pivot, i, j);
+    if (left < j) {
+        scalar_quicksort(array, left, j);
+    }
+    if (i < right) {
+        scalar_quicksort(array, i, right);
+    }
+}
+
+void avx2_pivotonlast_sort(int32_t *array, const uint32_t length) {
+  uint32_t sep;
+
+  if (length > 8*AVX2_REGISTER_SIZE) {
+    sep = avx_pivot_on_last_value(array, length);
+  } else {
+    sep = lomuto_partition_epi32((uint32_t*)array, 0, length - 1);
+  }
+  if(sep == length) {
+    // we have an ineffective pivot. Let us give up.
+    if(length > 1) scalar_quicksort(array,0,length - 1);
+  } else {
+    if (sep > 2) {
+      avx2_pivotonlast_sort(array, sep - 1);
+    }
+    if (sep + 1 < length) {
+      avx2_pivotonlast_sort(array + sep, length - sep);
+    }
+  }
+}
+void wrapped_avx2_pivotonlast_sort(uint32_t *array, int left, int right) {
+  avx2_pivotonlast_sort((int32_t *)array + left, right - left + 1);
+}
+
+} // namespace nate
--- a/simd-sort/avx2-natenodutch-quicksort.h
+++ b/simd-sort/avx2-natenodutch-quicksort.h
@ -0,0 +1,409 @@
+#pragma once
+
+#include <x86intrin.h>
+#include <cstdint>
+#include <memory.h>
+#include "avx2-altquicksort.h"
+/**
+* This is an alternative approach to SIMD quicksort, based on a description by
+* Nathan Kurz. Daniel Lemire implemented it to avoid the  Dutch national flag
+* problem as an experiment. This is not meant to be fast, as implemented. It is
+* an experiment.
+*/
+
+/*
+I'm going to use "vector" as a unit, but it doesn't have to be the
+native vector size.  In particular, I think we can read multiple
+vector register's worth of data with minimal added time per loop
+iteration.
+
+First, read two vectors from the head of the list, and two from the
+tail of the list.   Store these, either in registers or written to a
+buffer.  We're not going to use these until the end.  We're just
+getting them out of the way so that we have more than one vectors
+writeable space at the head and tail.
+
+wh--rh------------------rt--wt-
+
+wh = write head
+rh = read head
+rt = read tail
+wt = write tail
+
+Then read one vector from head, and advance rh accordingly.  Call it nextVector.
+
+Loop:
+
+Set currentVector = nextVector.
+
+Set nextVectorPtr to either rh or rt depending on which has the least
+"writable space" available.  That is, if (rh - wh) < (wt - rt), read
+from and advance rh, else read from and update rt.   These updates
+need to be branchless.
+
+Read nextVector = *nextVectorPtr.
+
+Partition currentVector (lookup, shuffle) and write to both wh and wt
+(tail offset so end hits wt).
+Advance wh and wt based on number of valid entries.  Use a second byte
+lookup issued as early as possible to determine this count.  Our
+performance is dependent on  the latency of this step.
+
+-----
+*/
+static uint32_t shufflemask[256 * 8] __attribute__((aligned(0x100))) = {
+    0, 1, 2, 3, 4, 5, 6, 7, /* 0*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 1*/
+    1, 0, 2, 3, 4, 5, 6, 7, /* 2*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 3*/
+    2, 0, 1, 3, 4, 5, 6, 7, /* 4*/
+    0, 2, 1, 3, 4, 5, 6, 7, /* 5*/
+    1, 2, 0, 3, 4, 5, 6, 7, /* 6*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 7*/
+    3, 0, 1, 2, 4, 5, 6, 7, /* 8*/
+    0, 3, 1, 2, 4, 5, 6, 7, /* 9*/
+    1, 3, 0, 2, 4, 5, 6, 7, /* 10*/
+    0, 1, 3, 2, 4, 5, 6, 7, /* 11*/
+    2, 3, 0, 1, 4, 5, 6, 7, /* 12*/
+    0, 2, 3, 1, 4, 5, 6, 7, /* 13*/
+    1, 2, 3, 0, 4, 5, 6, 7, /* 14*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 15*/
+    4, 0, 1, 2, 3, 5, 6, 7, /* 16*/
+    0, 4, 1, 2, 3, 5, 6, 7, /* 17*/
+    1, 4, 0, 2, 3, 5, 6, 7, /* 18*/
+    0, 1, 4, 2, 3, 5, 6, 7, /* 19*/
+    2, 4, 0, 1, 3, 5, 6, 7, /* 20*/
+    0, 2, 4, 1, 3, 5, 6, 7, /* 21*/
+    1, 2, 4, 0, 3, 5, 6, 7, /* 22*/
+    0, 1, 2, 4, 3, 5, 6, 7, /* 23*/
+    3, 4, 0, 1, 2, 5, 6, 7, /* 24*/
+    0, 3, 4, 1, 2, 5, 6, 7, /* 25*/
+    1, 3, 4, 0, 2, 5, 6, 7, /* 26*/
+    0, 1, 3, 4, 2, 5, 6, 7, /* 27*/
+    2, 3, 4, 0, 1, 5, 6, 7, /* 28*/
+    0, 2, 3, 4, 1, 5, 6, 7, /* 29*/
+    1, 2, 3, 4, 0, 5, 6, 7, /* 30*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 31*/
+    5, 0, 1, 2, 3, 4, 6, 7, /* 32*/
+    0, 5, 1, 2, 3, 4, 6, 7, /* 33*/
+    1, 5, 0, 2, 3, 4, 6, 7, /* 34*/
+    0, 1, 5, 2, 3, 4, 6, 7, /* 35*/
+    2, 5, 0, 1, 3, 4, 6, 7, /* 36*/
+    0, 2, 5, 1, 3, 4, 6, 7, /* 37*/
+    1, 2, 5, 0, 3, 4, 6, 7, /* 38*/
+    0, 1, 2, 5, 3, 4, 6, 7, /* 39*/
+    3, 5, 0, 1, 2, 4, 6, 7, /* 40*/
+    0, 3, 5, 1, 2, 4, 6, 7, /* 41*/
+    1, 3, 5, 0, 2, 4, 6, 7, /* 42*/
+    0, 1, 3, 5, 2, 4, 6, 7, /* 43*/
+    2, 3, 5, 0, 1, 4, 6, 7, /* 44*/
+    0, 2, 3, 5, 1, 4, 6, 7, /* 45*/
+    1, 2, 3, 5, 0, 4, 6, 7, /* 46*/
+    0, 1, 2, 3, 5, 4, 6, 7, /* 47*/
+    4, 5, 0, 1, 2, 3, 6, 7, /* 48*/
+    0, 4, 5, 1, 2, 3, 6, 7, /* 49*/
+    1, 4, 5, 0, 2, 3, 6, 7, /* 50*/
+    0, 1, 4, 5, 2, 3, 6, 7, /* 51*/
+    2, 4, 5, 0, 1, 3, 6, 7, /* 52*/
+    0, 2, 4, 5, 1, 3, 6, 7, /* 53*/
+    1, 2, 4, 5, 0, 3, 6, 7, /* 54*/
+    0, 1, 2, 4, 5, 3, 6, 7, /* 55*/
+    3, 4, 5, 0, 1, 2, 6, 7, /* 56*/
+    0, 3, 4, 5, 1, 2, 6, 7, /* 57*/
+    1, 3, 4, 5, 0, 2, 6, 7, /* 58*/
+    0, 1, 3, 4, 5, 2, 6, 7, /* 59*/
+    2, 3, 4, 5, 0, 1, 6, 7, /* 60*/
+    0, 2, 3, 4, 5, 1, 6, 7, /* 61*/
+    1, 2, 3, 4, 5, 0, 6, 7, /* 62*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 63*/
+    6, 0, 1, 2, 3, 4, 5, 7, /* 64*/
+    0, 6, 1, 2, 3, 4, 5, 7, /* 65*/
+    1, 6, 0, 2, 3, 4, 5, 7, /* 66*/
+    0, 1, 6, 2, 3, 4, 5, 7, /* 67*/
+    2, 6, 0, 1, 3, 4, 5, 7, /* 68*/
+    0, 2, 6, 1, 3, 4, 5, 7, /* 69*/
+    1, 2, 6, 0, 3, 4, 5, 7, /* 70*/
+    0, 1, 2, 6, 3, 4, 5, 7, /* 71*/
+    3, 6, 0, 1, 2, 4, 5, 7, /* 72*/
+    0, 3, 6, 1, 2, 4, 5, 7, /* 73*/
+    1, 3, 6, 0, 2, 4, 5, 7, /* 74*/
+    0, 1, 3, 6, 2, 4, 5, 7, /* 75*/
+    2, 3, 6, 0, 1, 4, 5, 7, /* 76*/
+    0, 2, 3, 6, 1, 4, 5, 7, /* 77*/
+    1, 2, 3, 6, 0, 4, 5, 7, /* 78*/
+    0, 1, 2, 3, 6, 4, 5, 7, /* 79*/
+    4, 6, 0, 1, 2, 3, 5, 7, /* 80*/
+    0, 4, 6, 1, 2, 3, 5, 7, /* 81*/
+    1, 4, 6, 0, 2, 3, 5, 7, /* 82*/
+    0, 1, 4, 6, 2, 3, 5, 7, /* 83*/
+    2, 4, 6, 0, 1, 3, 5, 7, /* 84*/
+    0, 2, 4, 6, 1, 3, 5, 7, /* 85*/
+    1, 2, 4, 6, 0, 3, 5, 7, /* 86*/
+    0, 1, 2, 4, 6, 3, 5, 7, /* 87*/
+    3, 4, 6, 0, 1, 2, 5, 7, /* 88*/
+    0, 3, 4, 6, 1, 2, 5, 7, /* 89*/
+    1, 3, 4, 6, 0, 2, 5, 7, /* 90*/
+    0, 1, 3, 4, 6, 2, 5, 7, /* 91*/
+    2, 3, 4, 6, 0, 1, 5, 7, /* 92*/
+    0, 2, 3, 4, 6, 1, 5, 7, /* 93*/
+    1, 2, 3, 4, 6, 0, 5, 7, /* 94*/
+    0, 1, 2, 3, 4, 6, 5, 7, /* 95*/
+    5, 6, 0, 1, 2, 3, 4, 7, /* 96*/
+    0, 5, 6, 1, 2, 3, 4, 7, /* 97*/
+    1, 5, 6, 0, 2, 3, 4, 7, /* 98*/
+    0, 1, 5, 6, 2, 3, 4, 7, /* 99*/
+    2, 5, 6, 0, 1, 3, 4, 7, /* 100*/
+    0, 2, 5, 6, 1, 3, 4, 7, /* 101*/
+    1, 2, 5, 6, 0, 3, 4, 7, /* 102*/
+    0, 1, 2, 5, 6, 3, 4, 7, /* 103*/
+    3, 5, 6, 0, 1, 2, 4, 7, /* 104*/
+    0, 3, 5, 6, 1, 2, 4, 7, /* 105*/
+    1, 3, 5, 6, 0, 2, 4, 7, /* 106*/
+    0, 1, 3, 5, 6, 2, 4, 7, /* 107*/
+    2, 3, 5, 6, 0, 1, 4, 7, /* 108*/
+    0, 2, 3, 5, 6, 1, 4, 7, /* 109*/
+    1, 2, 3, 5, 6, 0, 4, 7, /* 110*/
+    0, 1, 2, 3, 5, 6, 4, 7, /* 111*/
+    4, 5, 6, 0, 1, 2, 3, 7, /* 112*/
+    0, 4, 5, 6, 1, 2, 3, 7, /* 113*/
+    1, 4, 5, 6, 0, 2, 3, 7, /* 114*/
+    0, 1, 4, 5, 6, 2, 3, 7, /* 115*/
+    2, 4, 5, 6, 0, 1, 3, 7, /* 116*/
+    0, 2, 4, 5, 6, 1, 3, 7, /* 117*/
+    1, 2, 4, 5, 6, 0, 3, 7, /* 118*/
+    0, 1, 2, 4, 5, 6, 3, 7, /* 119*/
+    3, 4, 5, 6, 0, 1, 2, 7, /* 120*/
+    0, 3, 4, 5, 6, 1, 2, 7, /* 121*/
+    1, 3, 4, 5, 6, 0, 2, 7, /* 122*/
+    0, 1, 3, 4, 5, 6, 2, 7, /* 123*/
+    2, 3, 4, 5, 6, 0, 1, 7, /* 124*/
+    0, 2, 3, 4, 5, 6, 1, 7, /* 125*/
+    1, 2, 3, 4, 5, 6, 0, 7, /* 126*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 127*/
+    7, 0, 1, 2, 3, 4, 5, 6, /* 128*/
+    0, 7, 1, 2, 3, 4, 5, 6, /* 129*/
+    1, 7, 0, 2, 3, 4, 5, 6, /* 130*/
+    0, 1, 7, 2, 3, 4, 5, 6, /* 131*/
+    2, 7, 0, 1, 3, 4, 5, 6, /* 132*/
+    0, 2, 7, 1, 3, 4, 5, 6, /* 133*/
+    1, 2, 7, 0, 3, 4, 5, 6, /* 134*/
+    0, 1, 2, 7, 3, 4, 5, 6, /* 135*/
+    3, 7, 0, 1, 2, 4, 5, 6, /* 136*/
+    0, 3, 7, 1, 2, 4, 5, 6, /* 137*/
+    1, 3, 7, 0, 2, 4, 5, 6, /* 138*/
+    0, 1, 3, 7, 2, 4, 5, 6, /* 139*/
+    2, 3, 7, 0, 1, 4, 5, 6, /* 140*/
+    0, 2, 3, 7, 1, 4, 5, 6, /* 141*/
+    1, 2, 3, 7, 0, 4, 5, 6, /* 142*/
+    0, 1, 2, 3, 7, 4, 5, 6, /* 143*/
+    4, 7, 0, 1, 2, 3, 5, 6, /* 144*/
+    0, 4, 7, 1, 2, 3, 5, 6, /* 145*/
+    1, 4, 7, 0, 2, 3, 5, 6, /* 146*/
+    0, 1, 4, 7, 2, 3, 5, 6, /* 147*/
+    2, 4, 7, 0, 1, 3, 5, 6, /* 148*/
+    0, 2, 4, 7, 1, 3, 5, 6, /* 149*/
+    1, 2, 4, 7, 0, 3, 5, 6, /* 150*/
+    0, 1, 2, 4, 7, 3, 5, 6, /* 151*/
+    3, 4, 7, 0, 1, 2, 5, 6, /* 152*/
+    0, 3, 4, 7, 1, 2, 5, 6, /* 153*/
+    1, 3, 4, 7, 0, 2, 5, 6, /* 154*/
+    0, 1, 3, 4, 7, 2, 5, 6, /* 155*/
+    2, 3, 4, 7, 0, 1, 5, 6, /* 156*/
+    0, 2, 3, 4, 7, 1, 5, 6, /* 157*/
+    1, 2, 3, 4, 7, 0, 5, 6, /* 158*/
+    0, 1, 2, 3, 4, 7, 5, 6, /* 159*/
+    5, 7, 0, 1, 2, 3, 4, 6, /* 160*/
+    0, 5, 7, 1, 2, 3, 4, 6, /* 161*/
+    1, 5, 7, 0, 2, 3, 4, 6, /* 162*/
+    0, 1, 5, 7, 2, 3, 4, 6, /* 163*/
+    2, 5, 7, 0, 1, 3, 4, 6, /* 164*/
+    0, 2, 5, 7, 1, 3, 4, 6, /* 165*/
+    1, 2, 5, 7, 0, 3, 4, 6, /* 166*/
+    0, 1, 2, 5, 7, 3, 4, 6, /* 167*/
+    3, 5, 7, 0, 1, 2, 4, 6, /* 168*/
+    0, 3, 5, 7, 1, 2, 4, 6, /* 169*/
+    1, 3, 5, 7, 0, 2, 4, 6, /* 170*/
+    0, 1, 3, 5, 7, 2, 4, 6, /* 171*/
+    2, 3, 5, 7, 0, 1, 4, 6, /* 172*/
+    0, 2, 3, 5, 7, 1, 4, 6, /* 173*/
+    1, 2, 3, 5, 7, 0, 4, 6, /* 174*/
+    0, 1, 2, 3, 5, 7, 4, 6, /* 175*/
+    4, 5, 7, 0, 1, 2, 3, 6, /* 176*/
+    0, 4, 5, 7, 1, 2, 3, 6, /* 177*/
+    1, 4, 5, 7, 0, 2, 3, 6, /* 178*/
+    0, 1, 4, 5, 7, 2, 3, 6, /* 179*/
+    2, 4, 5, 7, 0, 1, 3, 6, /* 180*/
+    0, 2, 4, 5, 7, 1, 3, 6, /* 181*/
+    1, 2, 4, 5, 7, 0, 3, 6, /* 182*/
+    0, 1, 2, 4, 5, 7, 3, 6, /* 183*/
+    3, 4, 5, 7, 0, 1, 2, 6, /* 184*/
+    0, 3, 4, 5, 7, 1, 2, 6, /* 185*/
+    1, 3, 4, 5, 7, 0, 2, 6, /* 186*/
+    0, 1, 3, 4, 5, 7, 2, 6, /* 187*/
+    2, 3, 4, 5, 7, 0, 1, 6, /* 188*/
+    0, 2, 3, 4, 5, 7, 1, 6, /* 189*/
+    1, 2, 3, 4, 5, 7, 0, 6, /* 190*/
+    0, 1, 2, 3, 4, 5, 7, 6, /* 191*/
+    6, 7, 0, 1, 2, 3, 4, 5, /* 192*/
+    0, 6, 7, 1, 2, 3, 4, 5, /* 193*/
+    1, 6, 7, 0, 2, 3, 4, 5, /* 194*/
+    0, 1, 6, 7, 2, 3, 4, 5, /* 195*/
+    2, 6, 7, 0, 1, 3, 4, 5, /* 196*/
+    0, 2, 6, 7, 1, 3, 4, 5, /* 197*/
+    1, 2, 6, 7, 0, 3, 4, 5, /* 198*/
+    0, 1, 2, 6, 7, 3, 4, 5, /* 199*/
+    3, 6, 7, 0, 1, 2, 4, 5, /* 200*/
+    0, 3, 6, 7, 1, 2, 4, 5, /* 201*/
+    1, 3, 6, 7, 0, 2, 4, 5, /* 202*/
+    0, 1, 3, 6, 7, 2, 4, 5, /* 203*/
+    2, 3, 6, 7, 0, 1, 4, 5, /* 204*/
+    0, 2, 3, 6, 7, 1, 4, 5, /* 205*/
+    1, 2, 3, 6, 7, 0, 4, 5, /* 206*/
+    0, 1, 2, 3, 6, 7, 4, 5, /* 207*/
+    4, 6, 7, 0, 1, 2, 3, 5, /* 208*/
+    0, 4, 6, 7, 1, 2, 3, 5, /* 209*/
+    1, 4, 6, 7, 0, 2, 3, 5, /* 210*/
+    0, 1, 4, 6, 7, 2, 3, 5, /* 211*/
+    2, 4, 6, 7, 0, 1, 3, 5, /* 212*/
+    0, 2, 4, 6, 7, 1, 3, 5, /* 213*/
+    1, 2, 4, 6, 7, 0, 3, 5, /* 214*/
+    0, 1, 2, 4, 6, 7, 3, 5, /* 215*/
+    3, 4, 6, 7, 0, 1, 2, 5, /* 216*/
+    0, 3, 4, 6, 7, 1, 2, 5, /* 217*/
+    1, 3, 4, 6, 7, 0, 2, 5, /* 218*/
+    0, 1, 3, 4, 6, 7, 2, 5, /* 219*/
+    2, 3, 4, 6, 7, 0, 1, 5, /* 220*/
+    0, 2, 3, 4, 6, 7, 1, 5, /* 221*/
+    1, 2, 3, 4, 6, 7, 0, 5, /* 222*/
+    0, 1, 2, 3, 4, 6, 7, 5, /* 223*/
+    5, 6, 7, 0, 1, 2, 3, 4, /* 224*/
+    0, 5, 6, 7, 1, 2, 3, 4, /* 225*/
+    1, 5, 6, 7, 0, 2, 3, 4, /* 226*/
+    0, 1, 5, 6, 7, 2, 3, 4, /* 227*/
+    2, 5, 6, 7, 0, 1, 3, 4, /* 228*/
+    0, 2, 5, 6, 7, 1, 3, 4, /* 229*/
+    1, 2, 5, 6, 7, 0, 3, 4, /* 230*/
+    0, 1, 2, 5, 6, 7, 3, 4, /* 231*/
+    3, 5, 6, 7, 0, 1, 2, 4, /* 232*/
+    0, 3, 5, 6, 7, 1, 2, 4, /* 233*/
+    1, 3, 5, 6, 7, 0, 2, 4, /* 234*/
+    0, 1, 3, 5, 6, 7, 2, 4, /* 235*/
+    2, 3, 5, 6, 7, 0, 1, 4, /* 236*/
+    0, 2, 3, 5, 6, 7, 1, 4, /* 237*/
+    1, 2, 3, 5, 6, 7, 0, 4, /* 238*/
+    0, 1, 2, 3, 5, 6, 7, 4, /* 239*/
+    4, 5, 6, 7, 0, 1, 2, 3, /* 240*/
+    0, 4, 5, 6, 7, 1, 2, 3, /* 241*/
+    1, 4, 5, 6, 7, 0, 2, 3, /* 242*/
+    0, 1, 4, 5, 6, 7, 2, 3, /* 243*/
+    2, 4, 5, 6, 7, 0, 1, 3, /* 244*/
+    0, 2, 4, 5, 6, 7, 1, 3, /* 245*/
+    1, 2, 4, 5, 6, 7, 0, 3, /* 246*/
+    0, 1, 2, 4, 5, 6, 7, 3, /* 247*/
+    3, 4, 5, 6, 7, 0, 1, 2, /* 248*/
+    0, 3, 4, 5, 6, 7, 1, 2, /* 249*/
+    1, 3, 4, 5, 6, 7, 0, 2, /* 250*/
+    0, 1, 3, 4, 5, 6, 7, 2, /* 251*/
+    2, 3, 4, 5, 6, 7, 0, 1, /* 252*/
+    0, 2, 3, 4, 5, 6, 7, 1, /* 253*/
+    1, 2, 3, 4, 5, 6, 7, 0, /* 254*/
+    0, 1, 2, 3, 4, 5, 6, 7, /* 255*/
+};
+
+static inline __m256i _mm256_cmplt_epi32(__m256i a, __m256i b) {
+  return _mm256_cmpgt_epi32(b, a);
+}
+
+
+static void avx_natenodutch_partition_epi32(int32_t *array, const int32_t pivot,
+                                            int &left, int &right) {
+  const __m256i P = _mm256_set1_epi32(pivot);
+  const int valuespervector = sizeof(__m256i) / sizeof(int32_t);
+  if (right - left + 1 <
+      4 * valuespervector) { // not enough space for nate's algo to make sense, falling back
+    scalar_partition(array, pivot, left, right);
+    return;
+  }
+  int readleft = left + valuespervector;
+  int readright = right - valuespervector;
+  int32_t buffer[2 * valuespervector]; // tmp buffer
+  memcpy(buffer, array + left, valuespervector * sizeof(int32_t));
+  memcpy(buffer + valuespervector, array + right - valuespervector + 1,
+         valuespervector * sizeof(int32_t));
+  while (readright - readleft >= valuespervector) {
+    __m256i *nextVectorPtr;
+    if ((readleft - left) > (right - readright)) {
+      nextVectorPtr = (__m256i *)(array + readright - valuespervector + 1);
+      readright -= valuespervector;
+    } else {
+      nextVectorPtr = (__m256i *)(array + readleft);
+      readleft += valuespervector;
+    }
+    __m256i currentVector = _mm256_loadu_si256(nextVectorPtr);
+    /******
+    * we need two comparisons if we are to avoid the Dutch national flag
+    * problem,
+    * Reference: https://en.wikipedia.org/wiki/Dutch_national_flag_problem
+    */
+    int greaterthanpivot =
+        _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(currentVector, P));
+    int lesserthanpivot =
+        _mm256_movemask_ps((__m256)_mm256_cmplt_epi32(currentVector, P));
+    __m256i greaterthanpivot_permvector = _mm256_load_si256(
+        (__m256i *)(reverseshufflemask + 8 * greaterthanpivot));
+    __m256i lesserthanpivot_permvector =
+        _mm256_load_si256((__m256i *)(shufflemask + 8 * lesserthanpivot));
+
+    int count_greaterthanpivot = _mm_popcnt_u32(greaterthanpivot);
+    int count_lesserthanpivot = _mm_popcnt_u32(lesserthanpivot);
+    __m256i greaterthanpivot_vector =
+        _mm256_permutevar8x32_epi32(currentVector, greaterthanpivot_permvector);
+    __m256i lesserthanpivot_vector =
+        _mm256_permutevar8x32_epi32(currentVector, lesserthanpivot_permvector);
+    _mm256_storeu_si256((__m256i *)(array + left), lesserthanpivot_vector);
+
+    _mm256_storeu_si256((__m256i *)(array + right - valuespervector + 1),
+                        greaterthanpivot_vector);
+
+    left += count_lesserthanpivot;
+    right -= count_greaterthanpivot;
+  }
+
+  int howmanyleft = readleft - left;
+  if (howmanyleft > 2 * valuespervector) {
+    howmanyleft = 2 * valuespervector;
+    for (int k = howmanyleft; k < readleft - left; ++k)
+      array[left + k] = pivot;
+  }
+  memcpy(array + left, buffer, howmanyleft * sizeof(int32_t));
+
+  int howmanyright = right - readright;
+  if (howmanyright + howmanyleft > 2 * valuespervector) {
+    howmanyright = 2 * valuespervector - howmanyleft;
+    for (int k = howmanyright; k < right - readright; ++k)
+      array[readright + 1 + k] = pivot;
+  }
+
+  memcpy(array + readright + 1, buffer + howmanyleft,
+         howmanyright * sizeof(int32_t));
+  scalar_partition(array, pivot, left, right);
+}
+
+
+// it is really signed int sorting, but the API seems to expect uint32_t...
+void avx_natenodutch_quicksort(uint32_t *array, int left, int right) {
+
+  int i = left;
+  int j = right;
+  const int32_t pivot = array[(i + j) / 2];
+
+  avx_natenodutch_partition_epi32((int32_t *) array, pivot, i, j);
+
+  if (left < j) {
+    avx_natenodutch_quicksort(array, left, j);
+  }
+
+  if (i < right) {
+    avx_natenodutch_quicksort(array, i, right);
+  }
+}
--- a/simd-sort/avx2-partition.cpp
+++ b/simd-sort/avx2-partition.cpp
@ -0,0 +1,197 @@
+#include <x86intrin.h>
+#include <cstdint>
+
+namespace qs {
+
+    namespace avx2 {
+
+        __m256i FORCE_INLINE bitmask_to_bytemask_epi32(uint8_t bm) {
+
+            const __m256i mask = _mm256_set1_epi32(bm);
+            const __m256i bits = _mm256_setr_epi32(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
+            const __m256i tmp  = _mm256_and_si256(mask, bits);
+
+            return _mm256_cmpeq_epi32(tmp, bits);
+        }
+
+
+        void FORCE_INLINE align_masks(uint8_t& a, uint8_t& b, uint8_t& rem_a, uint8_t& rem_b, __m256i& shuffle_a, __m256i& shuffle_b) {
+
+            assert(a != 0);
+            assert(b != 0);
+
+            uint8_t tmpA = a;
+            uint8_t tmpB = b;
+
+            uint32_t __attribute__((__aligned__(32))) tmpshufa[8];
+            uint32_t __attribute__((__aligned__(32))) tmpshufb[8];
+
+            while (tmpA != 0 && tmpB != 0) {
+                int idx_a = __builtin_ctz(tmpA);
+                int idx_b = __builtin_ctz(tmpB);
+
+                tmpA = tmpA & (tmpA - 1);
+                tmpB = tmpB & (tmpB - 1);
+
+                tmpshufb[idx_a] = idx_b;
+                tmpshufa[idx_b] = idx_a;
+            }
+
+            a = a ^ tmpA;
+            b = b ^ tmpB;
+
+            assert(a != 0);
+            assert(b != 0);
+            assert(_mm_popcnt_u64(a) == _mm_popcnt_u64(b));
+
+            rem_a = tmpA;
+            rem_b = tmpB;
+
+            shuffle_a = _mm256_load_si256((__m256i*)tmpshufa);
+            shuffle_b = _mm256_load_si256((__m256i*)tmpshufb);
+        }
+
+
+        __m256i FORCE_INLINE merge(const __m256i mask, const __m256i a, const __m256i b) {
+
+            return _mm256_or_si256(
+                    _mm256_and_si256(mask, a),
+                    _mm256_andnot_si256(mask, b)
+            );
+        }
+
+
+       void FORCE_INLINE swap_epi32(
+            __m256i& a, __m256i& b,
+            uint8_t mask_a, const __m256i shuffle_a,
+            uint8_t mask_b, const __m256i shuffle_b) {
+
+            const __m256i to_swap_b = _mm256_permutevar8x32_epi32(a, shuffle_a);
+            const __m256i to_swap_a = _mm256_permutevar8x32_epi32(b, shuffle_b);
+            const __m256i ma    = bitmask_to_bytemask_epi32(mask_a);
+            const __m256i mb    = bitmask_to_bytemask_epi32(mask_b);
+
+            a = merge(ma, to_swap_a, a);
+            b = merge(mb, to_swap_b, b);
+        }
+
+
+
+#define _mm256_iszero(vec) (_mm256_testz_si256(vec, vec) != 0)
+
+        void FORCE_INLINE partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) {
+
+            const int N = 8; // the number of items in a register (256/32)
+
+            __m256i L;
+            __m256i R;
+            uint8_t maskL = 0;
+            uint8_t maskR = 0;
+
+            const __m256i pivot = _mm256_set1_epi32(pv);
+
+            int origL = left;
+            int origR = right;
+
+            while (true) {
+
+                if (maskL == 0) {
+                    while (true) {
+                        if (right - (left + N) + 1 < 2*N) {
+                            goto end;
+                        }
+
+                        L = _mm256_loadu_si256((__m256i*)(array + left));
+                        const __m256i bytemask = _mm256_cmpgt_epi32(pivot, L);
+
+                        if (_mm256_testc_ps((__m256)bytemask, (__m256)_mm256_set1_epi32(-1))) {
+                            left += N;
+                        } else {
+                            maskL = ~_mm256_movemask_ps((__m256)bytemask);
+                            break;
+                        }
+                    }
+
+                }
+
+                if (maskR == 0) {
+                    while (true) {
+                        if ((right - N) - left + 1 < 2*N) {
+                            goto end;
+                        }
+
+                        R = _mm256_loadu_si256((__m256i*)(array + right - N + 1));
+                        const __m256i bytemask = _mm256_cmpgt_epi32(pivot, R);
+                        if (_mm256_iszero(bytemask)) {
+                            right -= N;
+                        } else {
+                            maskR = _mm256_movemask_ps((__m256)bytemask);
+                            break;
+                        }
+                    }
+
+                }
+
+                assert(left <= right);
+                assert(maskL != 0);
+                assert(maskR != 0);
+
+                uint8_t mL;
+                uint8_t mR;
+                __m256i shuffleL;
+                __m256i shuffleR;
+
+                align_masks(maskL, maskR, mL, mR, shuffleL, shuffleR);
+                swap_epi32(L, R, maskL, shuffleL, maskR, shuffleR);
+
+                maskL = mL;
+                maskR = mR;
+
+                if (maskL == 0) {
+                    _mm256_storeu_si256((__m256i*)(array + left), L);
+                    left += N;
+                }
+
+                if (maskR == 0) {
+                    _mm256_storeu_si256((__m256i*)(array + right - N + 1), R);
+                    right -= N;
+                }
+
+            } // while
+
+        end:
+
+            assert(!(maskL != 0 && maskR != 0));
+
+            if (maskL != 0) {
+                _mm256_storeu_si256((__m256i*)(array + left), L);
+            } else if (maskR != 0) {
+                _mm256_storeu_si256((__m256i*)(array + right - N + 1), R);
+            }
+
+            if (left < right) {
+                int less    = 0;
+                int greater = 0;
+                const int all = right - left + 1;
+
+                for (int i=left; i <= right; i++) {
+                    less    += int(array[i] < pv);
+                    greater += int(array[i] > pv);
+                }
+
+                if (all == less) {
+                    // all elements in range [left, right] less than pivot
+                    scalar_partition_epi32(array, pv, origL, left);
+                } else if (all == greater) {
+                    // all elements in range [left, right] greater than pivot
+                    scalar_partition_epi32(array, pv, left, origR);
+                } else {
+                    scalar_partition_epi32(array, pv, left, right);
+                }
+            }
+        }
+
+    } // namespace avx2
+
+} // namespace qs
+
--- a/simd-sort/avx2-quicksort.cpp
+++ b/simd-sort/avx2-quicksort.cpp
@ -0,0 +1,32 @@
+#include "avx2-partition.cpp"
+
+namespace qs {
+
+    namespace avx2 {
+
+        void quicksort(uint32_t* array, int left, int right) {
+
+            int i = left;
+            int j = right;
+
+            const uint32_t pivot = array[(i + j)/2];
+            const int AVX2_REGISTER_SIZE = 8; // in 32-bit words
+
+            if (j - i >= 2 * AVX2_REGISTER_SIZE) {
+                qs::avx2::partition_epi32(array, pivot, i, j);
+            } else {
+                scalar_partition_epi32(array, pivot, i, j);
+            }
+
+            if (left < j) {
+                quicksort(array, left, j);
+            }
+
+            if (i < right) {
+                quicksort(array, i, right);
+            }
+        }
+
+    } // namespace avx2
+
+} // namespace qs
--- a/simd-sort/avx512-auxbuffer-partition.cpp
+++ b/simd-sort/avx512-auxbuffer-partition.cpp
@ -0,0 +1,90 @@
+namespace qs {
+
+    namespace avx512 {
+
+        void memset_epi32(uint32_t* array, uint32_t w, size_t n) {
+            
+            const int N = 16;
+            const __m512i word = _mm512_set1_epi32(w);
+
+            for (size_t i=0; i < n/N; i++) {
+                _mm512_storeu_si512(array + i*N, word);
+            }
+
+            for (size_t i=n/N * N; i < n; i++) {
+                array[i] = w;
+            }
+        }
+
+        void memcpy_epi32(uint32_t* dst, uint32_t* src, size_t n) {
+            
+            const int N = 16;
+
+            for (size_t i=0; i < n/N; i++) {
+                _mm512_storeu_si512(dst + i*N, _mm512_loadu_si512(src + i*N));
+            }
+
+            for (size_t i=n/N * N; i < n; i++) {
+                dst[i] = src[i];
+            }
+        }
+
+        // parition array[0..n-1]
+        uint32_t FORCE_INLINE partition_auxbuffer_epi32(uint32_t* array, size_t n, uint32_t pv) {
+
+            const int N = 16;
+            const int AUX_COUNT = 1024; // 4kB
+
+            static uint32_t gt_buf[AUX_COUNT + N];
+
+            size_t lt_count = 0;
+            size_t gt_count = 0;
+
+            const __m512i pivot = _mm512_set1_epi32(pv);
+
+            // 1. copy greater and less values into separate buffers
+            for (size_t i=0; i < n / N; i++) {
+                
+                const __m512i v = _mm512_loadu_si512(array + i*N);
+
+                const __mmask16 lt = _mm512_cmplt_epi32_mask(v, pivot);
+                const __mmask16 gt = _mm512_cmpgt_epi32_mask(v, pivot);
+
+                const __m512i less    = _mm512_maskz_compress_epi32(lt, v);
+                const __m512i greater = _mm512_maskz_compress_epi32(gt, v);
+
+                _mm512_storeu_si512(array  + lt_count, less);
+                _mm512_storeu_si512(gt_buf + gt_count, greater);
+
+                lt_count += _mm_popcnt_u32(lt);
+                gt_count += _mm_popcnt_u32(gt);
+            }
+
+            for (size_t i=0; i < n % N; i++) {
+                
+                const uint32_t v = array[(n/N) * N + i];
+
+                if (v < pv) {
+                    array[lt_count++] = v;
+                } else if (v > pv) {
+                    gt_buf[gt_count++] = v;
+                }
+            }
+
+            const size_t eq_count = n - (lt_count + gt_count);
+
+            // 2. replace array with partially ordered data
+
+            // 2.a. pivots
+            memset_epi32(array + lt_count, pv, eq_count);
+
+            // 2.b. all values greater than pivot
+            memcpy_epi32(array + lt_count + eq_count, gt_buf, gt_count);
+
+            // 3. index before the first pivot
+            return lt_count;
+        }
+
+    } // namespace avx512
+
+} // namespace qa
--- a/simd-sort/avx512-bmi2-partition.cpp
+++ b/simd-sort/avx512-bmi2-partition.cpp
@ -0,0 +1,105 @@
+namespace qs {
+
+    namespace avx512 {
+
+        void FORCE_INLINE bmi2_align_masks(__mmask16& a, __mmask16& b, __mmask16& rem_a, __mmask16& rem_b) {
+
+            uint32_t tmpA = _pext_u32(0xffffffff, a);
+            uint32_t tmpB = _pext_u32(0xffffffff, b);
+
+            uint32_t tmp  = tmpA & tmpB;
+
+            __mmask16 new_a = _pdep_u32(tmp, a);
+            __mmask16 new_b = _pdep_u32(tmp, a);
+
+            rem_a = a ^ new_a;
+            rem_b = b ^ new_b;
+
+            a = new_a;
+            b = new_b;
+
+            assert(a != 0);
+            assert(b != 0);
+            assert(_mm_popcnt_u64(a) == _mm_popcnt_u64(b));
+        }
+
+
+        void FORCE_INLINE bmi2_partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) {
+
+            const int N = 16;
+
+            __m512i L;
+            __m512i R;
+            __mmask16 maskL = 0;
+            __mmask16 maskR = 0;
+            const __m512i pivot = _mm512_set1_epi32(pv);
+
+            while (right - left + 1 >= 2*N) {
+
+                while (maskL == 0) {
+                    if (right - (left + N) + 1 < 2*N) {
+                        goto end;
+                    }
+
+                    L = _mm512_loadu_si512(array + left);
+                    maskL = _mm512_cmpge_epi32_mask(L, pivot);
+                    if (maskL == 0) {
+                        left += N;
+                    }
+                }
+
+                while (maskR == 0) {
+                    if ((right - N) - left + 1 < 2*N) {
+                        goto end;
+                    }
+
+                    R = _mm512_loadu_si512(array + right - N + 1);
+                    maskR = _mm512_cmple_epi32_mask(R, pivot);
+                    if (maskR == 0) {
+                        right -= N;
+                    }
+                }
+
+                assert(left <= right);
+                assert(maskL != 0);
+                assert(maskR != 0);
+                
+                __mmask16 mL;
+                __mmask16 mR;
+
+                bmi2_align_masks(maskL, maskR, mL, mR);
+                swap_epi32(L, R, maskL, maskR);
+
+                maskL = mL;
+                maskR = mR;
+
+                if (maskL == 0) {
+                    _mm512_storeu_si512(array + left, L);
+                    left += N;
+                }
+
+                if (maskR == 0) {
+                    _mm512_storeu_si512(array + right - N + 1, R);
+                    right -= N;
+                }
+
+            } // while
+
+        end:
+
+            if (maskL != 0) {
+                _mm512_storeu_si512(array + left, L);
+            }
+
+            if (maskR != 0) {
+                _mm512_storeu_si512(array + right - N + 1, R); 
+            }
+
+            if (left < right) {
+                partition_epi32(array, pv, left, right);
+            }
+        }
+
+    } // namespace avx512
+
+} // namespace qs
--- a/simd-sort/avx512-partition-register.cpp
+++ b/simd-sort/avx512-partition-register.cpp
@ -0,0 +1,58 @@
+namespace qs {
+
+    namespace avx512 {
+
+        __mmask16 FORCE_INLINE get_range_mask(uint32_t position, uint32_t length) {
+            
+            return (uint32_t(1) << (position + length)) - (uint32_t(1) << position);
+        }
+
+        /*
+            r stores 1 to 16 elements to partition, mask select leading element
+        */
+        void partition_register(__m512i& r, const __m512i pivot, __mmask16 mask, int& left, int& right) {
+
+            // example:
+
+            // r     = [  1,   2,   3, 100, 200, 300,   4,   5,   50,  900,  -1,  -2,  -3,  -4,  -5,  -6] 
+            // mask  = 0x03ff  -- first 10 elements are subject to parition
+            // pivot = packed_dword(50) 
+            
+            const __mmask16 less_mask       = _mm512_mask_cmplt_epi32_mask(mask, r, pivot);
+            const __mmask16 equal_mask      = _mm512_mask_cmpeq_epi32_mask(mask, r, pivot);
+
+            // less    = [1, 2, 3, 4, 5, ... rest are **pivots**]
+            const __m512i less_equal        = _mm512_mask_compress_epi32(pivot, less_mask, r);
+
+            // greater = [100, 200, 300, 900, ... rest are zeros]
+            const __m512i greater           = _mm512_maskz_compress_epi32(~(less_mask | equal_mask) & mask, r);
+
+            // less_cnt = 5
+            // less_equal_cnt = 6
+            const int less_cnt               = _mm_popcnt_u32(less_mask);
+            const int less_equal_cnt         = _mm_popcnt_u32(less_mask | equal_mask);
+
+            const __mmask16 store_less_equal = get_range_mask(0, less_equal_cnt);
+            const __mmask16 store_greater    = ~store_less_equal & mask;
+
+            // merge less or eqaul with input
+            //
+            // r = [  1,   2,   3,   4,   5,  50,   4,   5,  50, 900,  -1,  -2,  -3,  -4,  -5,  -6] 
+            //        ^    ^    ^    ^    ^   ^^
+            //        merged
+            r = _mm512_mask_mov_epi32(r, store_less_equal, less_equal);
+
+            // merge greater than pivot
+            //
+            // r = [  1,   2,   3,   4,   5,  50, 100, 200, 300, 900,  -1,  -2,  -3,  -4,  -5,  -6] 
+            //                                    ^^^  ^^^  ^^^  ^^^
+            //                                    merged
+            r = _mm512_mask_expand_epi32(r, store_greater, greater);
+
+            right = left + less_cnt;
+            left  = left + less_equal_cnt;
+        }
+
+    } // namespace avx512
+
+} // namespace qs
--- a/simd-sort/avx512-partition.cpp
+++ b/simd-sort/avx512-partition.cpp
@ -0,0 +1,129 @@
+//#define USE_AVX512_PIVOT
+
+namespace qs {
+
+    namespace avx512 {
+
+        void FORCE_INLINE align_masks(__mmask16& a, __mmask16& b, __mmask16& rem_a, __mmask16& rem_b) {
+
+            assert(a != 0);
+            assert(b != 0);
+
+            uint16_t tmpA = a;
+            uint16_t tmpB = b;
+                
+            while (tmpA != 0 && tmpB != 0) {
+                tmpA = tmpA & (tmpA - 1);
+                tmpB = tmpB & (tmpB - 1);
+            }
+
+            a = a ^ tmpA;
+            b = b ^ tmpB;
+
+            assert(a != 0);
+            assert(b != 0);
+            assert(_mm_popcnt_u64(a) == _mm_popcnt_u64(b));
+
+            rem_a = tmpA;
+            rem_b = tmpB;
+        }
+
+
+        void FORCE_INLINE partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) {
+
+            const int N = 16;
+
+            __m512i L;
+            __m512i R;
+            __mmask16 maskL = 0;
+            __mmask16 maskR = 0;
+            const __m512i pivot = _mm512_set1_epi32(pv);
+
+            int count = (right - left + 1)/N;
+
+            while (true) {
+
+                while (maskL == 0) {
+                    if (count <= 1) {
+                        goto end;
+                    }
+
+                    L = _mm512_loadu_si512(array + left);
+                    maskL = _mm512_cmpge_epi32_mask(L, pivot);
+                    if (maskL == 0) {
+                        left  += N;
+                        count -= 1;
+                    }
+                }
+
+                while (maskR == 0) {
+                    if (count <= 1) {
+                        goto end;
+                    }
+
+                    R = _mm512_loadu_si512(array + right - N + 1);
+                    maskR = _mm512_cmple_epi32_mask(R, pivot);
+                    if (maskR == 0) {
+                        right -= N;
+                        count -= 1;
+                    }
+                }
+
+                assert(left <= right);
+                assert(maskL != 0);
+                assert(maskR != 0);
+                
+                __mmask16 mL;
+                __mmask16 mR;
+
+                align_masks(maskL, maskR, mL, mR);
+                swap_epi32(L, R, maskL, maskR);
+
+                maskL = mL;
+                maskR = mR;
+
+                if (maskL == 0) {
+                    _mm512_storeu_si512(array + left, L);
+                    left  += N;
+                    count -= 1;
+                }
+
+                if (maskR == 0) {
+                    _mm512_storeu_si512(array + right - N + 1, R);
+                    right -= N;
+                    count -= 1;
+                }
+
+            } // while
+
+        end:
+
+            assert(!(maskL != 0 && maskR != 0));
+
+            if (maskL != 0) {
+                _mm512_storeu_si512(array + left, L);
+            } else if (maskR != 0) {
+                _mm512_storeu_si512(array + right - N + 1, R); 
+            }
+
+            if (left < right) {
+#if USE_AVX512_PIVOT
+                scalar_partition_epi32(array, pv, left, right);
+#else
+                const int size = right - left + 1;
+                if (size > N) {
+                    scalar_partition_epi32(array, pv, left, right);
+                } else {
+                    const int k    = left;
+                    __mmask16 mask = (uint32_t(1) << size) - 1;
+                    __m512i   v    = _mm512_loadu_si512(array + k);
+                    partition_register(v, pivot, mask, left, right);
+                    _mm512_storeu_si512(array + k, v);
+                }
+#endif
+            }
+        }
+
+    } // namespace avx512
+
+} // namespace qa
--- a/simd-sort/avx512-popcnt-partition.cpp
+++ b/simd-sort/avx512-popcnt-partition.cpp
@ -0,0 +1,119 @@
+namespace qs {
+
+    namespace avx512 {
+
+        void FORCE_INLINE popcnt_partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) {
+
+            const int N = 16;
+
+            __m512i L;
+            __m512i R;
+            __mmask16 maskL = 0;
+            __mmask16 maskR = 0;
+            int popcntL;
+            int popcntR;
+
+            const __m512i pivot = _mm512_set1_epi32(pv);
+
+            while (true) {
+
+                while (maskL == 0) {
+                    if (right - (left + N) + 1 < N) {
+                        goto end;
+                    }
+
+                    L = _mm512_loadu_si512(array + left);
+                    maskL = _mm512_cmpge_epi32_mask(L, pivot);
+                    if (maskL == 0) {
+                        left += N;
+                    }
+
+                    popcntL = _mm_popcnt_u32(maskL);
+                }
+
+                while (maskR == 0) {
+                    if ((right - N) - left + 1 < N) {
+                        goto end;
+                    }
+
+                    R = _mm512_loadu_si512(array + right - N + 1);
+                    maskR = _mm512_cmple_epi32_mask(R, pivot);
+                    if (maskR == 0) {
+                        right -= N;
+                    }
+
+                    popcntR = _mm_popcnt_u32(maskR);
+                }
+
+                assert(left <= right);
+                assert(maskL != 0);
+                assert(maskR != 0);
+
+                if (popcntL == popcntR) {
+                    // fast path
+                    swap_epi32(L, R, maskL, maskR);
+                    maskL = maskR = 0;
+
+                    _mm512_storeu_si512(array + left, L);
+                    left += N;
+
+                    _mm512_storeu_si512(array + right - N + 1, R);
+                    right -= N;
+                } else {
+                    if (popcntL < popcntR) {
+                        int n = popcntR - popcntL;
+                        int k = n;
+
+                        __mmask16 tmp = maskR;
+                        while (k--) {
+                            maskR = maskR & (maskR - 1);
+                        }
+
+                        swap_epi32(L, R, maskL, maskR);
+                        maskR = tmp ^ maskR;
+
+                        popcntR = n;
+
+                        _mm512_storeu_si512(array + left, L);
+                        left += N;
+                        maskL = 0;
+                    } else {
+                        int n = popcntL - popcntR;
+                        int k = n;
+
+                        __mmask16 tmp = maskL;
+                        while (k--) {
+                            maskL = maskL & (maskL - 1);
+                        }
+
+                        swap_epi32(L, R, maskL, maskR);
+                        maskL = tmp ^ maskL;
+
+                        popcntL = n;
+
+                        _mm512_storeu_si512(array + right - N + 1, R);
+                        right -= N;
+                        maskR  = 0;
+                    }
+                }
+
+            } // while
+
+        end:
+
+            assert(!(maskL != 0 && maskR != 0));
+
+            if (maskL != 0) {
+                _mm512_storeu_si512(array + left, L);
+            } else if (maskR != 0) {
+                _mm512_storeu_si512(array + right - N + 1, R); 
+            }
+
+            if (left < right) {
+                scalar_partition_epi32(array, pv, left, right);
+            }
+        }
+
+    } // namespace avx512
+
+} // namespace qs
--- a/simd-sort/avx512-quicksort.cpp
+++ b/simd-sort/avx512-quicksort.cpp
@ -0,0 +1,211 @@
+#include "avx512-swap.cpp"
+#include "avx512-partition-register.cpp"
+#include "avx512-partition.cpp"
+#include "avx512-popcnt-partition.cpp"
+#include "avx512-bmi2-partition.cpp"
+#include "avx512-auxbuffer-partition.cpp"
+#include "avx512-sort-register.cpp"
+
+
+namespace qs {
+
+    namespace avx512 {
+
+
+        void dump(const char* name, const __m512i v) {
+            uint32_t tmp[16];
+
+            printf("%-10s = [", name);
+            _mm512_storeu_si512((__m512i*)tmp, v);
+            for (int i=0; i < 16; i++) {
+                if (i > 0) {
+                    putchar(' ');
+                }
+
+                printf("%08x", tmp[i]);
+            }
+
+            printf("]\n");
+        }
+
+        void quicksort(uint32_t* array, int left, int right) {
+
+            int i = left;
+            int j = right;
+
+            const int size = j - i + 1;
+            if (size <= 16) {
+                const __m512i in = _mm512_loadu_si512(array + i);
+                __m512i result;
+
+                switch (size) {
+                    case 16: result = avx512_sort16_epi32(in); break;
+                    case 15: result = avx512_sort15_epi32(in); break;
+                    case 14: result = avx512_sort14_epi32(in); break;
+                    case 13: result = avx512_sort13_epi32(in); break;
+                    case 12: result = avx512_sort12_epi32(in); break;
+                    case 11: result = avx512_sort11_epi32(in); break;
+                    case 10: result = avx512_sort10_epi32(in); break;
+                    case 9: result = avx512_sort9_epi32(in); break;
+                    case 8: result = avx512_sort8_epi32(in); break;
+                    case 7: result = avx512_sort7_epi32(in); break;
+                    case 6: result = avx512_sort6_epi32(in); break;
+                    case 5: result = avx512_sort5_epi32(in); break;
+                    case 4: result = avx512_sort4_epi32(in); break;
+                    case 3: result = avx512_sort3_epi32(in); break;
+                    case 2: {
+                        if (array[i] > array[j]) {
+                            uint32_t t = array[i];
+                            array[i]   = array[j];
+                            array[j]   = t;
+                        }
+                        return;
+                     }
+
+                     case 1:
+                     case 0:
+                        return;
+
+                     default:
+                        assert(false);
+                        break;
+                }
+
+                _mm512_storeu_si512(array + i, result);
+                return;
+            }
+
+            const uint32_t pivot = array[left];
+            const int AVX512_REGISTER_SIZE = 16; // in 32-bit words
+            if (size >= 2 * AVX512_REGISTER_SIZE) {
+                ::qs::avx512::partition_epi32(array, pivot, i, j);
+            } else {
+                scalar_partition_epi32(array, pivot, i, j);
+            }
+
+            if (left < j) {
+                quicksort(array, left, j);
+            }
+
+            if (i < right) {
+                quicksort(array, i, right);
+            }
+        }
+
+
+        void popcnt_quicksort(uint32_t* array, int left, int right) {
+
+            int i = left;
+            int j = right;
+            const int size = j - i + 1;
+            if (size <= 16) {
+                const __m512i in = _mm512_loadu_si512(array + i);
+                __m512i result;
+
+                switch (size) {
+                    case 16: result = avx512_sort16_epi32(in); break;
+                    case 15: result = avx512_sort15_epi32(in); break;
+                    case 14: result = avx512_sort14_epi32(in); break;
+                    case 13: result = avx512_sort13_epi32(in); break;
+                    case 12: result = avx512_sort12_epi32(in); break;
+                    case 11: result = avx512_sort11_epi32(in); break;
+                    case 10: result = avx512_sort10_epi32(in); break;
+                    case 9: result = avx512_sort9_epi32(in); break;
+                    case 8: result = avx512_sort8_epi32(in); break;
+                    case 7: result = avx512_sort7_epi32(in); break;
+                    case 6: result = avx512_sort6_epi32(in); break;
+                    case 5: result = avx512_sort5_epi32(in); break;
+                    case 4: result = avx512_sort4_epi32(in); break;
+                    case 3: result = avx512_sort3_epi32(in); break;
+                    case 2: {
+                        if (array[i] > array[j]) {
+                            uint32_t t = array[i];
+                            array[i]   = array[j];
+                            array[j]   = t;
+                        }
+                        return;
+                     }
+
+                     case 1:
+                     case 0:
+                        return;
+
+                     default:
+                        assert(false);
+                        break;
+                }
+
+                _mm512_storeu_si512(array + i, result);
+                return;
+            }
+
+            const uint32_t pivot = array[(i + j)/2];
+            const int AVX512_REGISTER_SIZE = 16; // in 32-bit words
+            if (size >= 2 * AVX512_REGISTER_SIZE) {
+                ::qs::avx512::popcnt_partition_epi32(array, pivot, i, j);
+            } else {
+                scalar_partition_epi32(array, pivot, i, j);
+            }
+
+            if (left < j) {
+                popcnt_quicksort(array, left, j);
+            }
+
+            if (i < right) {
+                popcnt_quicksort(array, i, right);
+            }
+        }
+
+
+        void bmi2_quicksort(uint32_t* array, int left, int right) {
+
+            int i = left;
+            int j = right;
+
+            const uint32_t pivot = array[left];
+            const int AVX512_REGISTER_SIZE = 16; // in 32-bit words
+
+            if (j - i >= 2 * AVX512_REGISTER_SIZE) {
+                ::qs::avx512::bmi2_partition_epi32(array, pivot, i, j);
+            } else {
+                scalar_partition_epi32(array, pivot, i, j);
+            }
+
+            if (left < j) {
+                bmi2_quicksort(array, left, j);
+            }
+
+            if (i < right) {
+                bmi2_quicksort(array, i, right);
+            }
+        }
+
+
+        void auxbuffer_quicksort(uint32_t* array, int lo, int hi) {
+
+            if (lo >= hi) {
+                return;
+            }
+
+            int p;
+#if 0
+            p = lomuto_partition_epi32(array, lo, hi);
+#else
+            const auto n = (hi - lo + 1);
+            if (n > 1024) {
+                p = lomuto_partition_epi32(array, lo, hi);
+            } else if (n > 8) {
+                const uint32_t pivot = array[(hi + lo)/2];
+                p = ::qs::avx512::partition_auxbuffer_epi32(array + lo, n, pivot) + lo;
+            } else {
+                p = lomuto_partition_epi32(array, lo, hi);
+            }
+#endif
+            auxbuffer_quicksort(array, lo, p-1);
+            auxbuffer_quicksort(array, p+1, hi);
+        }
+
+    } // namespace avx512
+
+} // namespace qs
+
--- a/simd-sort/avx512-sort-register.cpp
+++ b/simd-sort/avx512-sort-register.cpp
@ -0,0 +1,323 @@
+
+//#define POPCNT_LOOKUP
+
+#ifdef POPCNT_LOOKUP
+
+    uint16_t lookup[65536];
+
+    // The lookup table replaces 2 x popcount and 2 x shift with 1 bit-or and 2 x memory fetch
+    void prepare_lookup() {
+        for (int i=0; i < 65536; i++) {
+            lookup[i] = 1 << _mm_popcnt_u32(i);
+        }
+    }
+    #define STEP(unused) { \
+        const __m512i  b    = _mm512_permutexvar_epi32(index, v); \
+        index  = _mm512_add_epi32(index, incr); \
+        const uint16_t lt   = _mm512_mask_cmplt_epi32_mask(sort_mask, v, b); \
+        const uint16_t eq   = _mm512_mask_cmpeq_epi32_mask(sort_mask, v, b); \
+        const uint16_t mask = lookup[lt | eq] - lookup[lt]; \
+        result = _mm512_mask_mov_epi32(result, mask, b); \
+    }
+
+#else
+
+    #define STEP(unused) { \
+        const __m512i  b    = _mm512_permutexvar_epi32(index, v); \
+        index  = _mm512_add_epi32(index, incr); \
+        const uint16_t lt   = _mm_popcnt_u32(_mm512_mask_cmplt_epi32_mask(sort_mask, v, b)); \
+        const uint16_t eq   = _mm_popcnt_u32(_mm512_mask_cmpeq_epi32_mask(sort_mask, v, b)); \
+        const uint16_t mask = (uint32_t(1) << (lt + eq)) - (uint32_t(1) << lt); \
+        result = _mm512_mask_mov_epi32(result, mask, b); \
+    }
+
+#endif
+
+__m512i avx512_sort16_epi32(const __m512i v) {
+
+    __m512i result = v;
+    __m512i index = _mm512_setzero_si512();
+    __m512i incr  = _mm512_set1_epi32(1);
+
+    const uint16_t sort_mask = 0xffff;
+    STEP(0);
+    STEP(1);
+    STEP(2);
+    STEP(3);
+    STEP(4);
+    STEP(5);
+    STEP(6);
+    STEP(7);
+    STEP(8);
+    STEP(9);
+    STEP(10);
+    STEP(11);
+    STEP(12);
+    STEP(13);
+    STEP(14);
+    STEP(15);
+
+    return result;
+}
+
+__m512i avx512_sort15_epi32(const __m512i v) {
+
+    __m512i result = v;
+    __m512i index = _mm512_setzero_si512();
+    __m512i incr  = _mm512_set1_epi32(1);
+
+    const uint16_t sort_mask = 0x7fff;
+    STEP(0);
+    STEP(1);
+    STEP(2);
+    STEP(3);
+    STEP(4);
+    STEP(5);
+    STEP(6);
+    STEP(7);
+    STEP(8);
+    STEP(9);
+    STEP(10);
+    STEP(11);
+    STEP(12);
+    STEP(13);
+    STEP(14);
+
+    return result;
+}
+
+__m512i avx512_sort14_epi32(const __m512i v) {
+
+    __m512i result = v;
+    __m512i index = _mm512_setzero_si512();
+    __m512i incr  = _mm512_set1_epi32(1);
+
+    const uint16_t sort_mask = 0x3fff;
+    STEP(0);
+    STEP(1);
+    STEP(2);
+    STEP(3);
+    STEP(4);
+    STEP(5);
+    STEP(6);
+    STEP(7);
+    STEP(8);
+    STEP(9);
+    STEP(10);
+    STEP(11);
+    STEP(12);
+    STEP(13);
+
+    return result;
+}
+
+__m512i avx512_sort13_epi32(const __m512i v) {
+
+    __m512i result = v;
+    __m512i index = _mm512_setzero_si512();
+    __m512i incr  = _mm512_set1_epi32(1);
+
+    const uint16_t sort_mask = 0x1fff;
+    STEP(0);
+    STEP(1);
+    STEP(2);
+    STEP(3);
+    STEP(4);
+    STEP(5);
+    STEP(6);
+    STEP(7);
+    STEP(8);
+    STEP(9);
+    STEP(10);
+    STEP(11);
+    STEP(12);
+
+    return result;
+}
+
+__m512i avx512_sort12_epi32(const __m512i v) {
+
+    __m512i result = v;
+    __m512i index = _mm512_setzero_si512();
+    __m512i incr  = _mm512_set1_epi32(1);
+
+    const uint16_t sort_mask = 0x0fff;
+    STEP(0);
+    STEP(1);
+    STEP(2);
+    STEP(3);
+    STEP(4);
+    STEP(5);
+    STEP(6);
+    STEP(7);
+    STEP(8);
+    STEP(9);
+    STEP(10);
+    STEP(11);
+
+    return result;
+}
+
+__m512i avx512_sort11_epi32(const __m512i v) {
+
+    __m512i result = v;
+    __m512i index = _mm512_setzero_si512();
+    __m512i incr  = _mm512_set1_epi32(1);
+
+    const uint16_t sort_mask = 0x07ff;
+    STEP(0);
+    STEP(1);
+    STEP(2);
+    STEP(3);
+    STEP(4);
+    STEP(5);
+    STEP(6);
+    STEP(7);
+    STEP(8);
+    STEP(9);
+    STEP(10);
+
+    return result;
+}
+
+__m512i avx512_sort10_epi32(const __m512i v) {
+
+    __m512i result = v;
+    __m512i index = _mm512_setzero_si512();
+    __m512i incr  = _mm512_set1_epi32(1);
+
+    const uint16_t sort_mask = 0x03ff;
+    STEP(0);
+    STEP(1);
+    STEP(2);
+    STEP(3);
+    STEP(4);
+    STEP(5);
+    STEP(6);
+    STEP(7);
+    STEP(8);
+    STEP(9);
+
+    return result;
+}
+
+__m512i avx512_sort9_epi32(const __m512i v) {
+
+    __m512i result = v;
+    __m512i index = _mm512_setzero_si512();
+    __m512i incr  = _mm512_set1_epi32(1);
+
+    const uint16_t sort_mask = 0x01ff;
+    STEP(0);
+    STEP(1);
+    STEP(2);
+    STEP(3);
+    STEP(4);
+    STEP(5);
+    STEP(6);
+    STEP(7);
+    STEP(8);
+
+    return result;
+}
+
+__m512i avx512_sort8_epi32(const __m512i v) {
+
+    __m512i result = v;
+    __m512i index = _mm512_setzero_si512();
+    __m512i incr  = _mm512_set1_epi32(1);
+
+    const uint16_t sort_mask = 0x00ff;
+    STEP(0);
+    STEP(1);
+    STEP(2);
+    STEP(3);
+    STEP(4);
+    STEP(5);
+    STEP(6);
+    STEP(7);
+
+    return result;
+}
+
+__m512i avx512_sort7_epi32(const __m512i v) {
+
+    __m512i result = v;
+    __m512i index = _mm512_setzero_si512();
+    __m512i incr  = _mm512_set1_epi32(1);
+
+    const uint16_t sort_mask = 0x007f;
+    STEP(0);
+    STEP(1);
+    STEP(2);
+    STEP(3);
+    STEP(4);
+    STEP(5);
+    STEP(6);
+
+    return result;
+}
+
+__m512i avx512_sort6_epi32(const __m512i v) {
+
+    __m512i result = v;
+    __m512i index = _mm512_setzero_si512();
+    __m512i incr  = _mm512_set1_epi32(1);
+
+    const uint16_t sort_mask = 0x003f;
+    STEP(0);
+    STEP(1);
+    STEP(2);
+    STEP(3);
+    STEP(4);
+    STEP(5);
+
+    return result;
+}
+
+__m512i avx512_sort5_epi32(const __m512i v) {
+
+    __m512i result = v;
+    __m512i index = _mm512_setzero_si512();
+    __m512i incr  = _mm512_set1_epi32(1);
+
+    const uint16_t sort_mask = 0x001f;
+    STEP(0);
+    STEP(1);
+    STEP(2);
+    STEP(3);
+    STEP(4);
+
+    return result;
+}
+
+__m512i avx512_sort4_epi32(const __m512i v) {
+
+    __m512i result = v;
+    __m512i index = _mm512_setzero_si512();
+    __m512i incr  = _mm512_set1_epi32(1);
+
+    const uint16_t sort_mask = 0x000f;
+    STEP(0);
+    STEP(1);
+    STEP(2);
+    STEP(3);
+
+    return result;
+}
+
+__m512i avx512_sort3_epi32(const __m512i v) {
+
+    __m512i result = v;
+    __m512i index = _mm512_setzero_si512();
+    __m512i incr  = _mm512_set1_epi32(1);
+
+    const uint16_t sort_mask = 0x0007;
+    STEP(0);
+    STEP(1);
+    STEP(2);
+
+    return result;
+}
+
+#undef STEP
--- a/simd-sort/avx512-swap.cpp
+++ b/simd-sort/avx512-swap.cpp
@ -0,0 +1,20 @@
+namespace qs {
+    namespace avx512 {
+
+        void swap_epi32(__m512i& a, __m512i& b, __mmask16 mask_a, __mmask16 mask_b) {
+            
+            assert(mask_a != 0);
+            assert(mask_b != 0);
+            assert(_mm_popcnt_u64(mask_a) == _mm_popcnt_u64(mask_b));
+
+            const __m512i to_swap_b = _mm512_mask_compress_epi32(_mm512_setzero_si512(), mask_a, a);
+            const __m512i to_swap_a = _mm512_mask_compress_epi32(_mm512_setzero_si512(), mask_b, b);
+
+            a = _mm512_mask_expand_epi32(a, mask_a, to_swap_a);
+            b = _mm512_mask_expand_epi32(b, mask_b, to_swap_b);
+        }
+
+
+    } // namespace avx512
+
+} // namespace qa
--- a/simd-sort/cmdline.cpp
+++ b/simd-sort/cmdline.cpp
@ -0,0 +1,28 @@
+#include <cstring>
+#include <string>
+#include <unordered_set>
+
+class CommandLine final {
+    int argc;
+    char** argv;
+
+    std::unordered_set<std::string> arguments;
+
+public:
+    CommandLine(int argc, char* argv[])
+        : argc(argc)
+        , argv(argv) {
+
+        for (int i=1; i < argc; i++) {
+            arguments.insert(argv[i]);
+        }
+    }
+
+    bool has(const std::string& name) const {
+        return arguments.count(name);
+    }
+
+    bool empty() const {
+        return arguments.empty();
+    }
+};
--- a/simd-sort/common.h
+++ b/simd-sort/common.h
@ -0,0 +1,3 @@
+#pragma once
+
+#define FORCE_INLINE inline __attribute__((always_inline))
--- a/simd-sort/gather_results.sh
+++ b/simd-sort/gather_results.sh
@ -0,0 +1,45 @@
+#!/bin/bash
+
+if [[ $1 == "avx2" ]]
+then
+    SPEED=./speed_avx2
+    if [[ ! -x $SPEED ]]
+    then
+        echo "AVX2 test program not found. Run make speed_avx2 to build it."
+        exit 1
+    fi
+else
+    SPEED=./speed
+    if [[ ! -x $SPEED ]]
+    then
+        echo "AVX512 test program not found. Run make speed to build it."
+        exit 1
+    fi
+fi
+
+
+$SPEED    10000 3 asc | tee -a results.txt
+$SPEED   100000 3 asc | tee -a results.txt
+$SPEED  1000000 3 asc | tee -a results.txt
+$SPEED  2000000 3 asc | tee -a results.txt
+$SPEED  5000000 3 asc | tee -a results.txt
+$SPEED 10000000 3 asc | tee -a results.txt
+$SPEED 20000000 3 asc | tee -a results.txt
+
+$SPEED    10000 3 dsc | tee -a results.txt
+$SPEED   100000 3 dsc | tee -a results.txt
+$SPEED  1000000 3 dsc | tee -a results.txt
+$SPEED  2000000 3 dsc | tee -a results.txt
+$SPEED  5000000 3 dsc | tee -a results.txt
+$SPEED 10000000 3 dsc | tee -a results.txt
+$SPEED 20000000 3 dsc | tee -a results.txt
+
+$SPEED    10000 3 rnd | tee -a results.txt
+$SPEED   100000 3 rnd | tee -a results.txt
+$SPEED  1000000 3 rnd | tee -a results.txt
+$SPEED  2000000 3 rnd | tee -a results.txt
+$SPEED  5000000 3 rnd | tee -a results.txt
+$SPEED 10000000 3 rnd | tee -a results.txt
+$SPEED 20000000 3 rnd | tee -a results.txt
+
+echo "results.txt created"
--- a/simd-sort/gettime.cpp
+++ b/simd-sort/gettime.cpp
@ -0,0 +1,8 @@
+#include <time.h>
+#include <sys/time.h>
+
+uint32_t get_time() {
+	struct timeval T;
+	gettimeofday(&T, NULL);
+	return (T.tv_sec * 1000000) + T.tv_usec;
+}
--- a/simd-sort/input_data.cpp
+++ b/simd-sort/input_data.cpp
@ -0,0 +1,106 @@
+class InputData {
+    
+protected:
+    uint32_t*   array;
+    size_t      n;
+
+public:
+    InputData(size_t count)
+        : n(count) {
+
+        assert(n > 0);
+        array = new uint32_t[n];
+    }
+
+    ~InputData() {
+        delete[] array;
+    }
+
+public:
+    uint32_t* pointer() {
+        return array;
+    }
+
+    size_t count() const {
+        return n;
+    }
+
+    size_t size() const {
+        return n * sizeof(uint32_t);
+    }
+};
+
+
+class InputAscending: public InputData {
+
+    using super = InputData;
+
+public:
+    InputAscending(size_t count) : super(count) {
+        for (size_t i=0; i < n; i++) {
+            array[i] = i;
+        }
+    }
+};
+
+
+class InputDescending: public InputData {
+
+    using super = InputData;
+
+public:
+    InputDescending(size_t count) : super(count) {
+        for (size_t i=0; i < n; i++) {
+            array[i] = n - i + 1;
+        }
+    }
+};
+
+class InputRandomFew: public InputData {
+
+    using super = InputData;
+
+public:
+    InputRandomFew(size_t count) : super(count) {
+        for (size_t i=0; i < n; i++) {
+            array[i] = random() % 10;
+        }
+    }
+};
+
+class InputRandom: public InputData {
+
+    using super = InputData;
+
+public:
+    InputRandom(size_t count) : super(count) {
+        for (size_t i=0; i < n; i++) {
+            array[i] = random();
+        }
+    }
+};
+
+class InputRandomUnique: public InputData {
+
+    using super = InputData;
+
+public:
+    InputRandomUnique(size_t count) : super(count) {
+        for (size_t i=0; i < n; i++) {
+            array[i] = i;
+        }
+
+        shuffle();
+    }
+
+private:
+    void shuffle() {
+        for (size_t i=0; i < n; i++) {
+            size_t j = rand() % (n - i);
+
+            const uint32_t t = array[i];
+            array[i] = array[j];
+            array[j] = t;
+        }
+    }
+};
--- a/simd-sort/partition.cpp
+++ b/simd-sort/partition.cpp
@ -0,0 +1,51 @@
+void scalar_partition_epi32(uint32_t* array, const uint32_t pivot, int& left, int& right) {
+
+    while (left <= right) {
+
+        while (array[left] < pivot) {
+            left += 1;
+        }
+
+        while (array[right] > pivot) {
+            right -= 1;
+        }
+
+        if (left <= right) {
+            const uint32_t t = array[left];
+            array[left]      = array[right];
+            array[right]     = t;
+
+            left  += 1;
+            right -= 1;
+        }
+    }
+}
+
+
+int lomuto_partition_epi32(uint32_t* array, int lo, int hi) {
+
+    const uint32_t pivot = array[(lo + hi)/2];
+    const uint32_t hi_value = array[hi];
+
+    array[(lo + hi)/2] = hi_value;
+    array[hi] = pivot;
+
+    int i = lo;
+    for (int j=lo; j < hi; j++) {
+        if (array[j] <= pivot) {
+            const uint32_t t = array[i];
+            array[i] = array[j];
+            array[j] = t;
+            i += 1;
+        }
+    }
+
+    {
+        const uint32_t t = array[i];
+        array[i]  = array[hi];
+        array[hi] = t;
+        i += 1;
+    }
+
+    return i;
+}
--- a/simd-sort/quicksort-all.cpp
+++ b/simd-sort/quicksort-all.cpp
@ -0,0 +1,9 @@
+#include "common.h"
+#include "quicksort.cpp"
+#ifdef HAVE_AVX2_INSTRUCTIONS
+#   include "avx2-quicksort.cpp"
+#endif
+#ifdef HAVE_AVX512F_INSTRUCTIONS
+#   include "avx512-quicksort.cpp"
+#endif
+
--- a/simd-sort/quicksort.cpp
+++ b/simd-sort/quicksort.cpp
@ -0,0 +1,19 @@
+#include "partition.cpp"
+
+void quicksort(uint32_t* array, int left, int right) {
+
+    int i = left;
+    int j = right;
+
+    const uint32_t pivot = array[(i + j)/2];
+
+    scalar_partition_epi32(array, pivot, i, j);
+
+    if (left < j) {
+        quicksort(array, left, j);
+    }
+
+    if (i < right) {
+        quicksort(array, i, right);
+    }
+}
--- a/simd-sort/rdtsc.cpp
+++ b/simd-sort/rdtsc.cpp
@ -0,0 +1,52 @@
+#define RDTSC_START(cycles)                                             \
+    do {                                                                \
+        uint32_t cyc_high, cyc_low;                                     \
+        __asm volatile("cpuid\n"                                        \
+                       "rdtsc\n"                                        \
+                       "mov %%edx, %0\n"                                \
+                       "mov %%eax, %1" :                                \
+                       "=r" (cyc_high),                                 \
+                       "=r"(cyc_low) :                                  \
+                       : /* no read only */                             \
+                       "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */    \
+                       );                                               \
+        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                \
+    } while (0)
+
+#define RDTSC_STOP(cycles)                                              \
+    do {                                                                \
+        uint32_t cyc_high, cyc_low;                                     \
+        __asm volatile("rdtscp\n"                                       \
+                       "mov %%edx, %0\n"                                \
+                       "mov %%eax, %1\n"                                \
+                       "cpuid" :                                        \
+                       "=r"(cyc_high),                                  \
+                       "=r"(cyc_low) :                                  \
+                       /* no read only registers */ :                   \
+                       "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */    \
+                       );                                               \
+        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                \
+    } while (0)
+
+static __attribute__ ((noinline))
+uint64_t rdtsc_overhead_func(uint64_t dummy) {
+    return dummy;
+}
+
+uint64_t global_rdtsc_overhead = (uint64_t) UINT64_MAX;
+
+#define RDTSC_SET_OVERHEAD(test, repeat)			                    \
+  do {								                                    \
+    uint64_t cycles_start, cycles_final, cycles_diff;		            \
+    uint64_t min_diff = UINT64_MAX;				                        \
+    for (int i = 0; i < repeat; i++) {			                        \
+      __asm volatile("" ::: /* pretend to clobber */ "memory");	        \
+      RDTSC_START(cycles_start);				                        \
+      test;							                                    \
+      RDTSC_STOP(cycles_final);                                         \
+      cycles_diff = (cycles_final - cycles_start);		                \
+      if (cycles_diff < min_diff) min_diff = cycles_diff;	            \
+    }								                                    \
+    global_rdtsc_overhead = min_diff;				                    \
+    printf("rdtsc_overhead set to %d\n", (int)global_rdtsc_overhead);   \
+  } while (0)
--- a/simd-sort/results/haswell-i7-4770-gcc5.3.0-avx2.txt
+++ b/simd-sort/results/haswell-i7-4770-gcc5.3.0-avx2.txt
@ -0,0 +1,84 @@
+items count: 10000 (40000 bytes), input ascending
+                     std::sort ... 0.0001 s
+                    quick sort ... 0.0001 s (1.24)
+               AVX2 quick sort ... 0.0001 s (1.48)
+items count: 100000 (400000 bytes), input ascending
+                     std::sort ... 0.0010 s
+                    quick sort ... 0.0009 s (1.17)
+               AVX2 quick sort ... 0.0007 s (1.48)
+items count: 1000000 (4000000 bytes), input ascending
+                     std::sort ... 0.0117 s
+                    quick sort ... 0.0092 s (1.27)
+               AVX2 quick sort ... 0.0066 s (1.77)
+items count: 2000000 (8000000 bytes), input ascending
+                     std::sort ... 0.0244 s
+                    quick sort ... 0.0192 s (1.28)
+               AVX2 quick sort ... 0.0137 s (1.79)
+items count: 5000000 (20000000 bytes), input ascending
+                     std::sort ... 0.0700 s
+                    quick sort ... 0.0535 s (1.31)
+               AVX2 quick sort ... 0.0378 s (1.85)
+items count: 10000000 (40000000 bytes), input ascending
+                     std::sort ... 0.1462 s
+                    quick sort ... 0.1108 s (1.32)
+               AVX2 quick sort ... 0.0782 s (1.87)
+items count: 20000000 (80000000 bytes), input ascending
+                     std::sort ... 0.3049 s
+                    quick sort ... 0.2301 s (1.33)
+               AVX2 quick sort ... 0.1614 s (1.89)
+items count: 10000 (40000 bytes), input descending
+                     std::sort ... 0.0001 s
+                    quick sort ... 0.0001 s (0.92)
+               AVX2 quick sort ... 0.0001 s (1.12)
+items count: 100000 (400000 bytes), input descending
+                     std::sort ... 0.0008 s
+                    quick sort ... 0.0009 s (0.87)
+               AVX2 quick sort ... 0.0007 s (1.10)
+items count: 1000000 (4000000 bytes), input descending
+                     std::sort ... 0.0087 s
+                    quick sort ... 0.0092 s (0.95)
+               AVX2 quick sort ... 0.0066 s (1.32)
+items count: 2000000 (8000000 bytes), input descending
+                     std::sort ... 0.0183 s
+                    quick sort ... 0.0192 s (0.95)
+               AVX2 quick sort ... 0.0138 s (1.33)
+items count: 5000000 (20000000 bytes), input descending
+                     std::sort ... 0.0514 s
+                    quick sort ... 0.0536 s (0.96)
+               AVX2 quick sort ... 0.0378 s (1.36)
+items count: 10000000 (40000000 bytes), input descending
+                     std::sort ... 0.1068 s
+                    quick sort ... 0.1111 s (0.96)
+               AVX2 quick sort ... 0.0783 s (1.36)
+items count: 20000000 (80000000 bytes), input descending
+                     std::sort ... 0.2221 s
+                    quick sort ... 0.2299 s (0.97)
+               AVX2 quick sort ... 0.1613 s (1.38)
+items count: 10000 (40000 bytes), input random
+                     std::sort ... 0.0001 s
+                    quick sort ... 0.0001 s (1.25)
+               AVX2 quick sort ... 0.0001 s (1.53)
+items count: 100000 (400000 bytes), input random
+                     std::sort ... 0.0010 s
+                    quick sort ... 0.0009 s (1.16)
+               AVX2 quick sort ... 0.0007 s (1.48)
+items count: 1000000 (4000000 bytes), input random
+                     std::sort ... 0.0118 s
+                    quick sort ... 0.0091 s (1.29)
+               AVX2 quick sort ... 0.0066 s (1.78)
+items count: 2000000 (8000000 bytes), input random
+                     std::sort ... 0.0246 s
+                    quick sort ... 0.0192 s (1.28)
+               AVX2 quick sort ... 0.0137 s (1.79)
+items count: 5000000 (20000000 bytes), input random
+                     std::sort ... 0.0703 s
+                    quick sort ... 0.0537 s (1.31)
+               AVX2 quick sort ... 0.0378 s (1.86)
+items count: 10000000 (40000000 bytes), input random
+                     std::sort ... 0.1468 s
+                    quick sort ... 0.1106 s (1.33)
+               AVX2 quick sort ... 0.0786 s (1.87)
+items count: 20000000 (80000000 bytes), input random
+                     std::sort ... 0.3058 s
+                    quick sort ... 0.2304 s (1.33)
+               AVX2 quick sort ... 0.1628 s (1.88)
--- a/simd-sort/results/knights-landing-7210-gcc5.3.0-avx512f.txt
+++ b/simd-sort/results/knights-landing-7210-gcc5.3.0-avx512f.txt
@ -0,0 +1,140 @@
+items count: 100000 (400000 bytes), input ascending
+                     std::sort ... 0.0063 s
+                    quick sort ... 0.0067 s (0.93)
+               AVX2 quick sort ... 0.0071 s (0.88)
+             AVX512 quick sort ... 0.0055 s (1.14)
+    AVX512 + popcnt quick sort ... 0.0056 s (1.13)
+      AVX512 + BMI2 quick sort ... 0.0058 s (1.08)
+items count: 1000000 (4000000 bytes), input ascending
+                     std::sort ... 0.0728 s
+                    quick sort ... 0.0735 s (0.99)
+               AVX2 quick sort ... 0.0685 s (1.06)
+             AVX512 quick sort ... 0.0522 s (1.40)
+    AVX512 + popcnt quick sort ... 0.0528 s (1.38)
+      AVX512 + BMI2 quick sort ... 0.0554 s (1.31)
+items count: 2000000 (8000000 bytes), input ascending
+                     std::sort ... 0.1529 s
+                    quick sort ... 0.1523 s (1.00)
+               AVX2 quick sort ... 0.1402 s (1.09)
+             AVX512 quick sort ... 0.1057 s (1.45)
+    AVX512 + popcnt quick sort ... 0.1063 s (1.44)
+      AVX512 + BMI2 quick sort ... 0.1105 s (1.38)
+items count: 5000000 (20000000 bytes), input ascending
+                     std::sort ... 0.4433 s
+                    quick sort ... 0.3935 s (1.13)
+               AVX2 quick sort ... 0.3520 s (1.26)
+             AVX512 quick sort ... 0.2609 s (1.70)
+    AVX512 + popcnt quick sort ... 0.2609 s (1.70)
+      AVX512 + BMI2 quick sort ... 0.2668 s (1.66)
+items count: 10000000 (40000000 bytes), input ascending
+                     std::sort ... 0.9257 s
+                    quick sort ... 0.8108 s (1.14)
+               AVX2 quick sort ... 0.7222 s (1.28)
+             AVX512 quick sort ... 0.5265 s (1.76)
+    AVX512 + popcnt quick sort ... 0.5288 s (1.75)
+      AVX512 + BMI2 quick sort ... 0.5402 s (1.71)
+items count: 20000000 (80000000 bytes), input ascending
+                     std::sort ... 1.9285 s
+                    quick sort ... 1.6725 s (1.15)
+               AVX2 quick sort ... 1.4766 s (1.31)
+             AVX512 quick sort ... 1.0681 s (1.81)
+    AVX512 + popcnt quick sort ... 1.0699 s (1.80)
+      AVX512 + BMI2 quick sort ... 1.0949 s (1.76)
+items count: 10000 (40000 bytes), input descending
+                     std::sort ... 0.0004 s
+                    quick sort ... 0.0006 s (0.73)
+               AVX2 quick sort ... 0.0006 s (0.74)
+             AVX512 quick sort ... 0.0005 s (0.88)
+    AVX512 + popcnt quick sort ... 0.0005 s (0.89)
+      AVX512 + BMI2 quick sort ... 0.0005 s (0.87)
+items count: 100000 (400000 bytes), input descending
+                     std::sort ... 0.0048 s
+                    quick sort ... 0.0067 s (0.71)
+               AVX2 quick sort ... 0.0070 s (0.68)
+             AVX512 quick sort ... 0.0055 s (0.87)
+    AVX512 + popcnt quick sort ... 0.0056 s (0.86)
+      AVX512 + BMI2 quick sort ... 0.0058 s (0.83)
+items count: 1000000 (4000000 bytes), input descending
+                     std::sort ... 0.0551 s
+                    quick sort ... 0.0734 s (0.75)
+               AVX2 quick sort ... 0.0684 s (0.81)
+             AVX512 quick sort ... 0.0524 s (1.05)
+    AVX512 + popcnt quick sort ... 0.0526 s (1.05)
+      AVX512 + BMI2 quick sort ... 0.0547 s (1.01)
+items count: 2000000 (8000000 bytes), input descending
+                     std::sort ... 0.1154 s
+                    quick sort ... 0.1523 s (0.76)
+               AVX2 quick sort ... 0.1407 s (0.82)
+             AVX512 quick sort ... 0.1055 s (1.09)
+    AVX512 + popcnt quick sort ... 0.1065 s (1.08)
+      AVX512 + BMI2 quick sort ... 0.1111 s (1.04)
+items count: 5000000 (20000000 bytes), input descending
+                     std::sort ... 0.3300 s
+                    quick sort ... 0.3932 s (0.84)
+               AVX2 quick sort ... 0.3516 s (0.94)
+             AVX512 quick sort ... 0.2606 s (1.27)
+    AVX512 + popcnt quick sort ... 0.2602 s (1.27)
+      AVX512 + BMI2 quick sort ... 0.2657 s (1.24)
+items count: 10000000 (40000000 bytes), input descending
+                     std::sort ... 0.6890 s
+                    quick sort ... 0.8134 s (0.85)
+               AVX2 quick sort ... 0.7216 s (0.95)
+             AVX512 quick sort ... 0.5273 s (1.31)
+    AVX512 + popcnt quick sort ... 0.5351 s (1.29)
+      AVX512 + BMI2 quick sort ... 0.5392 s (1.28)
+items count: 20000000 (80000000 bytes), input descending
+                     std::sort ... 1.4358 s
+                    quick sort ... 1.6744 s (0.86)
+               AVX2 quick sort ... 1.4772 s (0.97)
+             AVX512 quick sort ... 1.0694 s (1.34)
+    AVX512 + popcnt quick sort ... 1.0675 s (1.34)
+      AVX512 + BMI2 quick sort ... 1.0929 s (1.31)
+items count: 10000 (40000 bytes), input random
+                     std::sort ... 0.0005 s
+                    quick sort ... 0.0006 s (0.98)
+               AVX2 quick sort ... 0.0006 s (0.99)
+             AVX512 quick sort ... 0.0005 s (1.18)
+    AVX512 + popcnt quick sort ... 0.0005 s (1.18)
+      AVX512 + BMI2 quick sort ... 0.0005 s (1.16)
+items count: 100000 (400000 bytes), input random
+                     std::sort ... 0.0063 s
+                    quick sort ... 0.0067 s (0.94)
+               AVX2 quick sort ... 0.0071 s (0.89)
+             AVX512 quick sort ... 0.0055 s (1.14)
+    AVX512 + popcnt quick sort ... 0.0056 s (1.13)
+      AVX512 + BMI2 quick sort ... 0.0058 s (1.08)
+items count: 1000000 (4000000 bytes), input random
+                     std::sort ... 0.0727 s
+                    quick sort ... 0.0742 s (0.98)
+               AVX2 quick sort ... 0.0684 s (1.06)
+             AVX512 quick sort ... 0.0522 s (1.39)
+    AVX512 + popcnt quick sort ... 0.0526 s (1.38)
+      AVX512 + BMI2 quick sort ... 0.0544 s (1.34)
+items count: 2000000 (8000000 bytes), input random
+                     std::sort ... 0.1533 s
+                    quick sort ... 0.1532 s (1.00)
+               AVX2 quick sort ... 0.1403 s (1.09)
+             AVX512 quick sort ... 0.1057 s (1.45)
+    AVX512 + popcnt quick sort ... 0.1066 s (1.44)
+      AVX512 + BMI2 quick sort ... 0.1101 s (1.39)
+items count: 5000000 (20000000 bytes), input random
+                     std::sort ... 0.4436 s
+                    quick sort ... 0.3932 s (1.13)
+               AVX2 quick sort ... 0.3529 s (1.26)
+             AVX512 quick sort ... 0.2622 s (1.69)
+    AVX512 + popcnt quick sort ... 0.2632 s (1.69)
+      AVX512 + BMI2 quick sort ... 0.2657 s (1.67)
+items count: 10000000 (40000000 bytes), input random
+                     std::sort ... 0.9235 s
+                    quick sort ... 0.8115 s (1.14)
+               AVX2 quick sort ... 0.7233 s (1.28)
+             AVX512 quick sort ... 0.5305 s (1.74)
+    AVX512 + popcnt quick sort ... 0.5328 s (1.73)
+      AVX512 + BMI2 quick sort ... 0.5379 s (1.72)
+items count: 20000000 (80000000 bytes), input random
+                     std::sort ... 1.9264 s
+                    quick sort ... 1.6716 s (1.15)
+               AVX2 quick sort ... 1.4898 s (1.29)
+             AVX512 quick sort ... 1.0853 s (1.77)
+    AVX512 + popcnt quick sort ... 1.0849 s (1.78)
+      AVX512 + BMI2 quick sort ... 1.0984 s (1.75)
--- a/simd-sort/results/skylake-i7-6700-gcc5.3.0-avx2.txt
+++ b/simd-sort/results/skylake-i7-6700-gcc5.3.0-avx2.txt
@ -0,0 +1,80 @@
+items count: 100000 (400000 bytes), input ascending
+                     std::sort ... 0.0011 s
+                    quick sort ... 0.0009 s (1.14)
+               AVX2 quick sort ... 0.0007 s (1.59)
+items count: 1000000 (4000000 bytes), input ascending
+                     std::sort ... 0.0122 s
+                    quick sort ... 0.0099 s (1.23)
+               AVX2 quick sort ... 0.0065 s (1.89)
+items count: 2000000 (8000000 bytes), input ascending
+                     std::sort ... 0.0257 s
+                    quick sort ... 0.0204 s (1.26)
+               AVX2 quick sort ... 0.0133 s (1.94)
+items count: 5000000 (20000000 bytes), input ascending
+                     std::sort ... 0.0724 s
+                    quick sort ... 0.0576 s (1.26)
+               AVX2 quick sort ... 0.0376 s (1.93)
+items count: 10000000 (40000000 bytes), input ascending
+                     std::sort ... 0.1516 s
+                    quick sort ... 0.1194 s (1.27)
+               AVX2 quick sort ... 0.0773 s (1.96)
+items count: 20000000 (80000000 bytes), input ascending
+                     std::sort ... 0.3167 s
+                    quick sort ... 0.2480 s (1.28)
+               AVX2 quick sort ... 0.1598 s (1.98)
+items count: 10000 (40000 bytes), input descending
+                     std::sort ... 0.0001 s
+                    quick sort ... 0.0001 s (0.91)
+               AVX2 quick sort ... 0.0001 s (1.15)
+items count: 100000 (400000 bytes), input descending
+                     std::sort ... 0.0009 s
+                    quick sort ... 0.0009 s (0.91)
+               AVX2 quick sort ... 0.0007 s (1.26)
+items count: 1000000 (4000000 bytes), input descending
+                     std::sort ... 0.0095 s
+                    quick sort ... 0.0099 s (0.96)
+               AVX2 quick sort ... 0.0065 s (1.47)
+items count: 2000000 (8000000 bytes), input descending
+                     std::sort ... 0.0196 s
+                    quick sort ... 0.0207 s (0.95)
+               AVX2 quick sort ... 0.0132 s (1.48)
+items count: 5000000 (20000000 bytes), input descending
+                     std::sort ... 0.0539 s
+                    quick sort ... 0.0576 s (0.94)
+               AVX2 quick sort ... 0.0377 s (1.43)
+items count: 10000000 (40000000 bytes), input descending
+                     std::sort ... 0.1130 s
+                    quick sort ... 0.1195 s (0.95)
+               AVX2 quick sort ... 0.0778 s (1.45)
+items count: 20000000 (80000000 bytes), input descending
+                     std::sort ... 0.2353 s
+                    quick sort ... 0.2479 s (0.95)
+               AVX2 quick sort ... 0.1604 s (1.47)
+items count: 10000 (40000 bytes), input random
+                     std::sort ... 0.0001 s
+                    quick sort ... 0.0001 s (1.19)
+               AVX2 quick sort ... 0.0001 s (1.51)
+items count: 100000 (400000 bytes), input random
+                     std::sort ... 0.0011 s
+                    quick sort ... 0.0009 s (1.13)
+               AVX2 quick sort ... 0.0007 s (1.59)
+items count: 1000000 (4000000 bytes), input random
+                     std::sort ... 0.0123 s
+                    quick sort ... 0.0099 s (1.23)
+               AVX2 quick sort ... 0.0064 s (1.91)
+items count: 2000000 (8000000 bytes), input random
+                     std::sort ... 0.0257 s
+                    quick sort ... 0.0207 s (1.24)
+               AVX2 quick sort ... 0.0133 s (1.93)
+items count: 5000000 (20000000 bytes), input random
+                     std::sort ... 0.0726 s
+                    quick sort ... 0.0578 s (1.26)
+               AVX2 quick sort ... 0.0377 s (1.93)
+items count: 10000000 (40000000 bytes), input random
+                     std::sort ... 0.1518 s
+                    quick sort ... 0.1200 s (1.26)
+               AVX2 quick sort ... 0.0778 s (1.95)
+items count: 20000000 (80000000 bytes), input random
+                     std::sort ... 0.3170 s
+                    quick sort ... 0.2490 s (1.27)
+               AVX2 quick sort ... 0.1614 s (1.96)
--- a/simd-sort/runtime_stats.cpp
+++ b/simd-sort/runtime_stats.cpp
@ -0,0 +1,76 @@
+class Histogram {
+    size_t hist[256];
+
+public:
+    Histogram() {
+        reset();
+    }
+
+    void hit(uint8_t v) {
+        hist[v] += 1;
+    }
+
+    void reset() {
+        memset(hist, 0, sizeof(hist));
+    }
+
+    bool empty() const {
+        for (int i=0; i < 256; i++) {
+            if (hist[i] > 0) {
+                return false;
+            }
+        }
+        
+        return true;
+    }
+
+public:
+    void print() const {
+        size_t total = 0;
+        for (int i=0; i < 256; i++) {
+            total += hist[i];
+        }
+
+        if (total == 0) return;
+
+        for (int i=0; i < 256; i++) {
+            if (hist[i] == 0) continue;
+
+            printf("%02x: %5lu (%5.2f%%)\n", i, hist[i], 100.0 * hist[i]/total);
+        }
+    }
+};
+
+class Statistics {
+public:
+    size_t partition_calls;
+    size_t items_processed;
+    size_t scalar__partition_calls;
+    size_t scalar__items_processed;
+    Histogram pvbyte_histogram;
+
+public:
+    Statistics() {
+        reset();
+    }
+
+    void reset() {
+        partition_calls = 0;
+        items_processed = 0;
+        scalar__partition_calls = 0;
+        scalar__items_processed = 0;
+
+        pvbyte_histogram.reset();
+    }
+
+    bool anything_collected() const {
+        return (partition_calls > 0)
+            || (items_processed > 0)
+            || (scalar__partition_calls > 0)
+            || (scalar__items_processed > 0)
+            || (!pvbyte_histogram.empty());
+    }
+};
+
+
+Statistics statistics;
--- a/simd-sort/speed.cpp
+++ b/simd-sort/speed.cpp
@ -0,0 +1,457 @@
+#include <cstdlib>
+#include <cstdio>
+#include <cstdint>
+#include <cstring>
+#include <cassert>
+#include <algorithm>
+#include <memory>
+
+#include "cmdline.cpp"
+#ifdef WITH_RUNTIME_STATS
+#   include "runtime_stats.cpp" // must be included before anything else
+#endif
+#include "input_data.cpp"
+#include "quicksort-all.cpp"
+#include "avx2-altquicksort.h"
+//#include "avx2-nate-quicksort.cpp"
+#include "../magyarsort.h" // mine
+#include "avx2-natenodutch-quicksort.h"
+#define USE_RDTSC // undef to get measurments in seconds
+#ifdef USE_RDTSC
+#   include "rdtsc.cpp"
+#else
+#   include "gettime.cpp"
+#endif
+
+void magyarsort_it(uint32_t* array, int left, int right) {
+	MagyarSort::sort(array + left, right - left);
+}
+
+class PerformanceTest final {
+
+    int         iterations;
+    InputData&  input;
+    uint32_t*   tmp;
+
+public:
+    PerformanceTest(int n, InputData& input)
+        : iterations(n)
+        , input(input) {
+
+        assert(iterations > 0);
+        tmp = new uint32_t[input.count()];
+    }
+
+    ~PerformanceTest() {
+        delete[] tmp;
+    }
+
+public:
+    template <typename SORT_FUNCTION>
+    uint64_t run(SORT_FUNCTION sort) {
+
+        uint64_t time = 0;
+
+        int k = iterations;
+        while (k--) {
+            memcpy(tmp, input.pointer(), input.size());
+
+            uint64_t t1, t2;
+
+#ifdef USE_RDTSC
+            RDTSC_START(t1);
+#else
+            t1 = get_time();
+#endif
+            sort(input.pointer(), 0, input.count() - 1);
+
+#ifdef USE_RDTSC
+            RDTSC_START(t2);
+#else
+            t2 = get_time();
+#endif
+
+            const uint64_t dt = t2 - t1;
+
+            if (time == 0) {
+                time = dt;
+            } else if (dt < time) {
+                time = dt;
+            }
+        }
+
+        return time;
+    }
+};
+
+
+enum class InputType {
+    randomfew,
+    randomuniq,
+    random,
+    ascending,
+    descending,
+};
+
+
+const char* as_string(InputType type) {
+    switch (type) {
+        case InputType::randomfew:
+            return "randomfew";
+
+        case InputType::randomuniq:
+            return "randomuniq";
+
+        case InputType::random:
+            return "random";
+
+        case InputType::ascending:
+            return "ascending";
+
+        case InputType::descending:
+            return "descending";
+
+        default:
+            return "<unknown>";
+    }
+}
+
+void std_qsort_wrapper(uint32_t* array, int left, int right) {
+
+    std::qsort(array + left, right - left + 1, sizeof(uint32_t),  [](const void* a, const void* b)
+    {
+        uint32_t a1 = *static_cast<const uint32_t*>(a);
+        uint32_t a2 = *static_cast<const uint32_t*>(b);
+
+        if(a1 < a2) return -1;
+        if(a1 > a2) return 1;
+        return 0;
+    });
+}
+
+
+void std_stable_sort_wrapper(uint32_t* array, int left, int right) {
+
+    std::stable_sort(array + left, array + right + 1);
+}
+
+
+void std_sort_wrapper(uint32_t* array, int left, int right) {
+
+    std::sort(array + left, array + right + 1);
+}
+
+
+class Flags {
+    public:
+        bool std_sort;
+        bool std_qsort;
+        bool std_stable_sort;
+        bool quicksort;
+        bool avx2;
+        bool magyar;
+        bool avx2_alt;
+        bool avx2_natenodutch;
+        bool avx512;
+        bool avx512_buf;
+        bool avx512_popcnt;
+        bool avx512_bmi;
+
+    public:
+        Flags(const CommandLine& cmd) {
+
+            enable_all(false);
+
+            bool any_set = false;
+            if (cmd.has("-std-sort")) {
+                std_sort = true;
+                any_set = true;
+            }
+
+            if (cmd.has("-std-qsort")) {
+                std_qsort = true;
+                any_set = true;
+            }
+
+            if (cmd.has("-std-stable-sort") || cmd.has("-std-stable")) {
+                std_stable_sort = true;
+                any_set = true;
+            }
+
+            if (cmd.has("-quicksort")) {
+                quicksort = true;
+                any_set = true;
+            }
+
+            if (cmd.has("-avx2")) {
+                avx2 = true;
+                any_set = true;
+            }
+
+            if (cmd.has("-avx2-alt")) {
+                avx2_alt = true;
+                any_set = true;
+            }
+
+            if (cmd.has("magyar")) {
+                magyar = true;
+                avx2_natenodutch = true;
+                any_set = true;
+            }
+
+            if (cmd.has("-avx512")) {
+                avx512 = true;
+                any_set = true;
+            }
+
+            if (cmd.has("-avx512-buf")) {
+                avx512_buf = true;
+                any_set = true;
+            }
+
+            if (cmd.has("-avx512-popcnt")) {
+                avx512_popcnt = true;
+                any_set = true;
+            }
+
+            if (cmd.has("-avx512-bmi")) {
+                avx512_bmi = true;
+                any_set = true;
+            }
+
+            if (!any_set) {
+                enable_all(true);
+            }
+        }
+
+        void enable_all(bool val) {
+            std_sort      = val;
+            std_qsort     = val;
+            std_stable_sort = val;
+            quicksort     = val;
+            avx2          = val;
+            magyar     = val;
+            avx2_alt      = val;
+            avx2_natenodutch = val;
+            avx512        = val;
+            avx512_buf    = val;
+            avx512_popcnt = val;
+            avx512_bmi    = val;
+        }
+};
+
+
+class Test {
+
+    std::unique_ptr<InputData> data;
+    InputType   type;
+    size_t      count;
+    int         iterations;
+    Flags       flags;
+    uint64_t    ref;
+
+public:
+    Test(InputType type, size_t count, int iterations, Flags&& flags)
+        : type(type)
+        , count(count)
+        , iterations(iterations)
+        , flags(std::move(flags)) {
+
+        switch (type) {
+            case InputType::randomfew:
+                data.reset(new InputRandomFew(count));
+                break;
+
+            case InputType::randomuniq:
+                data.reset(new InputRandomUnique(count));
+                break;
+
+            case InputType::random:
+                data.reset(new InputRandom(count));
+                break;
+
+            case InputType::ascending:
+                data.reset(new InputAscending(count));
+                break;
+
+            case InputType::descending:
+                data.reset(new InputDescending(count));
+                break;
+        }
+    }
+
+    void run() {
+
+        printf("items count: %lu (%lu bytes), input %s\n", data->count(), data->size(), as_string(type));
+
+        ref = 0;
+
+        if (flags.std_sort) {
+            measure("std::sort", std_sort_wrapper);
+        }
+
+        if (flags.std_qsort) {
+            measure("std::qsort", std_qsort_wrapper);
+        }
+
+        if (flags.std_stable_sort) {
+            measure("std::stable_sort", std_stable_sort_wrapper);
+        }
+
+        if (flags.std_qsort) {
+            measure("quick sort", quicksort);
+        }
+#ifdef HAVE_AVX2_INSTRUCTIONS
+        if (flags.avx2) {
+            measure("AVX2 quick sort", qs::avx2::quicksort);
+        }
+
+        if (flags.avx2_natenodutch) {
+            measure("AVX2 nate nodutch", avx_natenodutch_quicksort);
+        }
+
+        if (flags.avx2_alt) {
+            measure("AVX2 alt quicksort", wrapped_avx2_pivotonlast_sort);
+        }
+
+        if (flags.magyar) {
+            measure("Magyarsort variant", magyarsort_it);
+        }
+
+#endif
+#ifdef HAVE_AVX512F_INSTRUCTIONS
+        if (flags.avx512) {
+            measure("AVX512 quick sort", qs::avx512::quicksort);
+        }
+
+        if (flags.avx512_buf) {
+            measure("AVX512 quick sort - aux buf", qs::avx512::auxbuffer_quicksort);
+        }
+
+        if (flags.avx512_popcnt) {
+            measure("AVX512 + popcnt quick sort", qs::avx512::popcnt_quicksort);
+        }
+
+        if (flags.avx512_bmi) {
+            measure("AVX512 + BMI2 quick sort", qs::avx512::bmi2_quicksort);
+        }
+#endif
+    }
+
+private:
+    template <typename SORT_FUNCTION>
+    void measure(const char* name, SORT_FUNCTION sort) {
+
+        PerformanceTest test(iterations, *data);
+
+        printf("%30s ... ", name); fflush(stdout);
+#ifdef WITH_RUNTIME_STATS
+        statistics.reset();
+#endif
+        uint64_t time = test.run(sort);
+
+#ifdef USE_RDTSC
+        printf("%10lu cycles", time);
+        if (ref > 0) {
+            printf(" (%0.2f)", ref/double(time));
+        }
+
+#   ifdef WITH_RUNTIME_STATS
+        if (statistics.anything_collected()) {
+            printf("\n");
+            printf("\t\tpartition calls: %lu (+%lu scalar)\n", statistics.partition_calls, statistics.scalar__partition_calls);
+            printf("\t\titems processed: %lu (+%lu by scalar partition)\n", statistics.items_processed, statistics.scalar__items_processed);
+
+            const size_t total_items = statistics.items_processed + statistics.scalar__items_processed;
+
+            if (total_items != 0) {
+                const double cpi = double(time)/total_items;
+                printf("\t\t               : %0.4f cycles/item\n", cpi * iterations);
+            }
+
+            if (!statistics.pvbyte_histogram.empty()) {
+                puts("Histogram for pvbyte:");
+                statistics.pvbyte_histogram.print();
+            }
+        }
+#   endif // WITH_RUNTIME_STATS
+
+#else
+        printf("%0.4f s", time/1000000.0);
+        if (ref > 0) {
+            printf(" (%0.2f)\n", ref/double(time));
+        }
+#endif
+        putchar('\n');
+
+        if (ref == 0) {
+            ref = time;
+        }
+    }
+};
+
+
+// ------------------------------------------------------------
+
+
+void usage() {
+    puts("usage:");
+    puts("speed SIZE ITERATIONS INPUT [options]");
+    puts("");
+    puts("where");
+    puts("* SIZE       - number of 32-bit elements");
+    puts("* ITERATIONS - number of iterations");
+    puts("* INPUT      - one of:");
+    puts("                 ascending (or asc)");
+    puts("                 descending (or dsc, desc)");
+    puts("                 random (or rnd, rand)");
+    puts("                 randomfew");
+    puts("                 randomuniq");
+    puts("options      - optional name of procedure(s) to run");
+}
+
+
+int main(int argc, char* argv[]) {
+
+    if (argc < 4) {
+        usage();
+        return EXIT_FAILURE;
+    }
+
+    int count      = atoi(argv[1]);
+    int iterations = atoi(argv[2]);
+    InputType type;
+
+#define is_keyword(key) (strcmp(argv[3], key) == 0)
+    if (is_keyword("descending") || is_keyword("desc") || is_keyword("dsc")) {
+        type = InputType::descending;
+    } else if (is_keyword("ascending") || is_keyword("asc")) {
+        type = InputType::ascending;
+    } else if (is_keyword("random") || is_keyword("rnd") || is_keyword("rand")) {
+        type = InputType::random;
+    } else if (is_keyword("randomfew")) {
+        type = InputType::randomfew;
+    } else if (is_keyword("randomuniq")) {
+        type = InputType::randomuniq;
+    } else {
+        usage();
+        return EXIT_FAILURE;
+    }
+#undef is_keyword
+
+#ifdef HAVE_AVX512F_INSTRUCTIONS
+    #ifdef POPCNT_LOOKUP
+        prepare_lookup();
+    #endif
+#endif
+
+#ifdef USE_RDTSC
+    RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), iterations);
+#endif
+    CommandLine cmd(argc, argv);
+    Flags flags(cmd);
+    Test test(type, count, iterations, std::move(flags));
+    test.run();
+
+    return EXIT_SUCCESS;
+}
--- a/simd-sort/speed_avx2
+++ b/simd-sort/speed_avx2
--- a/simd-sort/test.cpp
+++ b/simd-sort/test.cpp
@ -0,0 +1,261 @@
+#include <cstdlib>
+#include <cstdio>
+#include <cstdint>
+#include <cstring>
+#include <cassert>
+#include <immintrin.h>
+
+#include "cmdline.cpp"
+#include "input_data.cpp"
+#include "quicksort-all.cpp"
+#include "avx2-altquicksort.h"
+#include "avx2-nate-quicksort.cpp"
+
+
+bool is_sorted(uint32_t* array, size_t n) {
+    assert(n > 0);
+    for (size_t i=1; i < n; i++) {
+        if (array[i - 1] > array[i]) {
+            printf("mismatch at %lu\n", i);
+            return false;
+        }
+    }
+
+    return true;
+}
+
+
+const size_t AVX512_REGISTER_SIZE = 16;
+
+
+class Test {
+
+    bool verbose;
+
+public:
+    Test(bool v = true) : verbose(v) {}
+
+    template <typename SORT_FN>
+    bool run(SORT_FN sort) {
+        const size_t start = 2*AVX512_REGISTER_SIZE;
+        const size_t end   = 256*AVX512_REGISTER_SIZE;
+
+        if (verbose) {
+            putchar('\n');
+        }
+
+        for (size_t size=start; size < end; size += 1) {
+
+            if (verbose) {
+                printf("%lu/%lu\r", size, end);
+                fflush(stdout);
+            }
+
+            InputAscending  asc(size);
+            InputDescending dsc(size);
+            InputRandom     rnd(size);
+            InputRandomFew  rndfew(size);
+
+            if (!test(sort, asc)) {
+                printf("failed for size %lu, intput ascending\n", size);
+                return false;
+            }
+
+            if (!test(sort, dsc)) {
+                printf("failed for size %lu, intput descending\n", size);
+                return false;
+            }
+
+            if (!test(sort, rnd)) {
+                printf("failed for size %lu, intput random\n", size);
+                return false;
+            }
+
+            if (!test(sort, rndfew)) {
+                printf("failed for size %lu, intput random few\n", size);
+                return false;
+            }
+        } // for
+
+        if (verbose) {
+            putchar('\n');
+        }
+
+        return true;
+    }
+
+
+private:
+    template <typename SORT_FN>
+    bool test(SORT_FN sort, InputData& data) {
+        sort(data.pointer(), 0, data.count() - 1);
+
+        return is_sorted(data.pointer(), data.count());
+    }
+};
+
+
+class Flags {
+    public:
+        bool avx2;
+        bool avx2_alt;
+        bool avx2_nate;
+        bool avx512;
+        bool avx512_buf;
+        bool avx512_popcnt;
+        bool avx512_bmi;
+
+    public:
+        Flags(const CommandLine& cmd) {
+
+            enable_all(false);
+
+            bool any_set = false;
+            if (cmd.has("-avx2")) {
+                avx2 = true;
+                any_set = true;
+            }
+
+            if (cmd.has("-avx2-alt")) {
+                avx2_alt = true;
+                any_set = true;
+            }
+
+            if (cmd.has("-avx2-nate")) {
+                avx2_nate = true;
+                any_set = true;
+            }
+
+            if (cmd.has("-avx512")) {
+                avx512 = true;
+                any_set = true;
+            }
+
+            if (cmd.has("-avx512-buf")) {
+                avx512_buf = true;
+                any_set = true;
+            }
+
+            if (cmd.has("-avx512-popcnt")) {
+                avx512_popcnt = true;
+                any_set = true;
+            }
+
+            if (cmd.has("-avx512-bmi")) {
+                avx512_bmi = true;
+                any_set = true;
+            }
+
+            if (!any_set) {
+                enable_all(true);
+            }
+        }
+
+        void enable_all(bool val) {
+            avx2          = val;
+            avx2_nate     = val;
+            avx2_alt      = val;
+            avx512        = val;
+            avx512_buf    = val;
+            avx512_popcnt = val;
+            avx512_bmi    = val;
+        }
+};
+
+
+int main(int argc, char* argv[]) {
+
+    CommandLine cmd(argc, argv);
+
+    puts("Please wait, it might take a while...");
+    puts("");
+
+    bool verbose = cmd.has("-v") || cmd.has("--verbose");
+    Flags flags(cmd);
+
+    Test test(verbose);
+    int ret = EXIT_SUCCESS;
+
+#ifdef HAVE_AVX2_INSTRUCTIONS
+    if (flags.avx2) {
+        printf("AVX2 base version... "); fflush(stdout);
+        if (test.run(qs::avx2::quicksort)) {
+            puts("OK");
+        } else {
+            puts("FAILED");
+            ret = EXIT_FAILURE;
+        }
+    }
+
+    if (flags.avx2_alt) {
+        printf("AVX2 alt version... "); fflush(stdout);
+        if (test.run(wrapped_avx2_pivotonlast_sort)) {
+            puts("OK");
+        } else {
+            puts("FAILED");
+            ret = EXIT_FAILURE;
+        }
+    }
+
+    if (flags.avx2_nate) {
+        printf("AVX2 Nate's variant... "); fflush(stdout);
+        if (test.run(nate::wrapped_avx2_pivotonlast_sort)) {
+            puts("OK");
+        } else {
+            puts("FAILED");
+            ret = EXIT_FAILURE;
+        }
+    }
+#endif
+
+#ifdef HAVE_AVX512F_INSTRUCTIONS
+
+#ifdef POPCNT_LOOKUP
+    prepare_lookup();
+#endif
+
+    if (flags.avx512) {
+        printf("AVX512 base version... "); fflush(stdout);
+        if (test.run(qs::avx512::quicksort)) {
+            puts("OK");
+        } else {
+            puts("FAILED");
+            ret = EXIT_FAILURE;
+        }
+    }
+
+    if (flags.avx512_popcnt) {
+        printf("AVX512 + popcnt version... "); fflush(stdout);
+        if (test.run(qs::avx512::popcnt_quicksort)) {
+            puts("OK");
+        } else {
+            puts("FAILED");
+            ret = EXIT_FAILURE;
+        }
+    }
+
+    if (flags.avx512_buf) {
+        printf("AVX512 with aux buffers... "); fflush(stdout);
+        if (test.run(qs::avx512::auxbuffer_quicksort)) {
+            puts("OK");
+        } else {
+            puts("FAILED");
+            ret = EXIT_FAILURE;
+        }
+    }
+
+#if 0
+    if (flags.avx512_bmi) {
+        printf("AVX512 + bmi2 version ... "); fflush(stdout);
+        if (test.run(qs::avx512::bmi2_quicksort)) {
+            puts("OK");
+        } else {
+            puts("FAILED");
+            ret = EXIT_FAILURE;
+        }
+    }
+#endif
+#endif // HAVE_AVX512F_INSTRUCTIONS
+
+    return ret;
+}