diff --git a/simd-sort/LICENSE b/simd-sort/LICENSE new file mode 100644 index 0000000..a571ebb --- /dev/null +++ b/simd-sort/LICENSE @@ -0,0 +1,25 @@ +Copyright (c) 2016, Wojciech Muła, Daniel Lemire +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/simd-sort/Makefile b/simd-sort/Makefile new file mode 100644 index 0000000..7675676 --- /dev/null +++ b/simd-sort/Makefile @@ -0,0 +1,57 @@ +.SUFFIXES: +.PHONY: all clean + +FLAGS=-std=c++11 -mbmi2 -Wall -pedantic -Wextra +FLAGS_AVX512=$(FLAGS) -mavx512f -DHAVE_AVX512F_INSTRUCTIONS -DHAVE_AVX2_INSTRUCTIONS +FLAGS_AVX2=$(FLAGS) -mavx2 -DHAVE_AVX2_INSTRUCTIONS + +DEPS_SORT=partition.cpp \ + avx2-partition.cpp \ + avx2-quicksort.cpp \ + avx2-altquicksort.h \ + avx2-natenodutch-quicksort.h \ + avx2-nate-quicksort.cpp \ + avx512-swap.cpp \ + avx512-partition.cpp \ + avx512-auxbuffer-partition.cpp \ + avx512-bmi2-partition.cpp \ + avx512-popcnt-partition.cpp \ + avx512-quicksort.cpp \ + avx512-sort-register.cpp \ + avx512-partition-register.cpp \ + quicksort.cpp + +SPEED_DEPS=$(DEPS_SORT) speed.cpp gettime.cpp rdtsc.cpp runtime_stats.cpp +SPEED_FLAGS=-O3 -DNDEBUG + +ALL=test speed test_avx2 speed_avx2 speed_stats speed_avx2_stats + +all: $(ALL) + +test: test.cpp input_data.cpp $(DEPS_SORT) + $(CXX) $(FLAGS_AVX512) -fsanitize=address test.cpp -o $@ + +test_avx2: test.cpp input_data.cpp $(DEPS_SORT) + #$(CXX) $(FLAGS_AVX2) -fsanitize=address test.cpp -o $@ + $(CXX) $(FLAGS_AVX2) test.cpp -o $@ + +speed: $(SPEED_DEPS) + $(CXX) $(FLAGS_AVX512) $(SPEED_FLAGS) speed.cpp -o $@ + +speed_avx2: $(SPEED_DEPS) + $(CXX) $(FLAGS_AVX2) $(SPEED_FLAGS) speed.cpp -o $@ + +speed_stats: $(SPEED_DEPS) + $(CXX) $(FLAGS_AVX512) $(SPEED_FLAGS) -DWITH_RUNTIME_STATS speed.cpp -o $@ + +speed_avx2_stats: $(SPEED_DEPS) + $(CXX) $(FLAGS_AVX2) $(SPEED_FLAGS) -DWITH_RUNTIME_STATS speed.cpp -o $@ + +run: test + sde -cnl -- ./$^ + +run_avx2: test_avx2 + sde -cnl -- ./$^ + +clean: + rm -f $(ALL) diff --git a/simd-sort/README.rst b/simd-sort/README.rst new file mode 100644 index 0000000..9f9c385 --- /dev/null +++ b/simd-sort/README.rst @@ -0,0 +1,48 @@ +================================================================================ + SIMD sorting +================================================================================ + +Overview +-------------------------------------------------- + +This repository contains SIMD versions of quicksort, using **AVX512F** +and **AVX2** instruction sets. The subdirectory ``results`` contains +performance results. + +The code is still unfinished, it may be subject of changes. + +Although programs are written in C++, most procedures are nearly +plain C, except use of namespaces and references in place of +pointers. + + +Building +-------------------------------------------------- + +At least GCC 5.3.0 is needed. Type ``make`` to build AVX512F +and AVX2 versions of two programs: + +* ``test``/``test_avx2`` --- tests if SIMD-ized sorting procedures + work correctly for various inputs. + +* ``speed``/``speed_avx2`` --- compares speed of different procedures, + scalar and SIMD for various inputs. + +* ``speed_stats``/``speed_avx2_stats`` --- similar to ``speed``, but + some procedures collect and print runtime statistics. Note that + this gives overhead, so for real performance tests the basic variant + of the programs should be used. + + +AVX512 +-------------------------------------------------- + +`Intel Software Development Emulator`__ can be used to run AVX512 variants. + +__ https://software.intel.com/en-us/articles/intel-software-development-emulator + + +References +-------------------------------------------------- + +- Fast Sorting Algorithms using AVX-512 on Intel Knights Landing https://arxiv.org/pdf/1704.08579.pdf diff --git a/simd-sort/avx2-altquicksort.h b/simd-sort/avx2-altquicksort.h new file mode 100644 index 0000000..aa23ef0 --- /dev/null +++ b/simd-sort/avx2-altquicksort.h @@ -0,0 +1,461 @@ +#pragma once + +#include +#include + +/** +* This is an alternative approach to SIMD quicksort, implemented by D. Lemire. +* It is not meant to be fast as such, but it can serve as a useful reference. +*/ + +// When defined smaller reverseshufflemask is used. +//#define BYTE_PATTERN + +// When defined histogram for comparison mask is collected +//#define WITH_PVBYTE_HISTOGRAM + +#if WITH_RUNTIME_STATS == 0 && defined(WITH_PVBYTE_HISTOGRAM) +# undef WITH_RUNTIME_STATS +#endif + +// can be replaced with VCOMPRESS on AVX-512 +static +#ifdef BYTE_PATTERN + uint8_t +#else + uint32_t +#endif +reverseshufflemask[256 * 8] __attribute__((aligned(0x100))) = { + 0, 1, 2, 3, 4, 5, 6, 7, /* 0*/ + 1, 2, 3, 4, 5, 6, 7, 0, /* 1*/ + 0, 2, 3, 4, 5, 6, 7, 1, /* 2*/ + 2, 3, 4, 5, 6, 7, 0, 1, /* 3*/ + 0, 1, 3, 4, 5, 6, 7, 2, /* 4*/ + 1, 3, 4, 5, 6, 7, 0, 2, /* 5*/ + 0, 3, 4, 5, 6, 7, 1, 2, /* 6*/ + 3, 4, 5, 6, 7, 0, 1, 2, /* 7*/ + 0, 1, 2, 4, 5, 6, 7, 3, /* 8*/ + 1, 2, 4, 5, 6, 7, 0, 3, /* 9*/ + 0, 2, 4, 5, 6, 7, 1, 3, /* 10*/ + 2, 4, 5, 6, 7, 0, 1, 3, /* 11*/ + 0, 1, 4, 5, 6, 7, 2, 3, /* 12*/ + 1, 4, 5, 6, 7, 0, 2, 3, /* 13*/ + 0, 4, 5, 6, 7, 1, 2, 3, /* 14*/ + 4, 5, 6, 7, 0, 1, 2, 3, /* 15*/ + 0, 1, 2, 3, 5, 6, 7, 4, /* 16*/ + 1, 2, 3, 5, 6, 7, 0, 4, /* 17*/ + 0, 2, 3, 5, 6, 7, 1, 4, /* 18*/ + 2, 3, 5, 6, 7, 0, 1, 4, /* 19*/ + 0, 1, 3, 5, 6, 7, 2, 4, /* 20*/ + 1, 3, 5, 6, 7, 0, 2, 4, /* 21*/ + 0, 3, 5, 6, 7, 1, 2, 4, /* 22*/ + 3, 5, 6, 7, 0, 1, 2, 4, /* 23*/ + 0, 1, 2, 5, 6, 7, 3, 4, /* 24*/ + 1, 2, 5, 6, 7, 0, 3, 4, /* 25*/ + 0, 2, 5, 6, 7, 1, 3, 4, /* 26*/ + 2, 5, 6, 7, 0, 1, 3, 4, /* 27*/ + 0, 1, 5, 6, 7, 2, 3, 4, /* 28*/ + 1, 5, 6, 7, 0, 2, 3, 4, /* 29*/ + 0, 5, 6, 7, 1, 2, 3, 4, /* 30*/ + 5, 6, 7, 0, 1, 2, 3, 4, /* 31*/ + 0, 1, 2, 3, 4, 6, 7, 5, /* 32*/ + 1, 2, 3, 4, 6, 7, 0, 5, /* 33*/ + 0, 2, 3, 4, 6, 7, 1, 5, /* 34*/ + 2, 3, 4, 6, 7, 0, 1, 5, /* 35*/ + 0, 1, 3, 4, 6, 7, 2, 5, /* 36*/ + 1, 3, 4, 6, 7, 0, 2, 5, /* 37*/ + 0, 3, 4, 6, 7, 1, 2, 5, /* 38*/ + 3, 4, 6, 7, 0, 1, 2, 5, /* 39*/ + 0, 1, 2, 4, 6, 7, 3, 5, /* 40*/ + 1, 2, 4, 6, 7, 0, 3, 5, /* 41*/ + 0, 2, 4, 6, 7, 1, 3, 5, /* 42*/ + 2, 4, 6, 7, 0, 1, 3, 5, /* 43*/ + 0, 1, 4, 6, 7, 2, 3, 5, /* 44*/ + 1, 4, 6, 7, 0, 2, 3, 5, /* 45*/ + 0, 4, 6, 7, 1, 2, 3, 5, /* 46*/ + 4, 6, 7, 0, 1, 2, 3, 5, /* 47*/ + 0, 1, 2, 3, 6, 7, 4, 5, /* 48*/ + 1, 2, 3, 6, 7, 0, 4, 5, /* 49*/ + 0, 2, 3, 6, 7, 1, 4, 5, /* 50*/ + 2, 3, 6, 7, 0, 1, 4, 5, /* 51*/ + 0, 1, 3, 6, 7, 2, 4, 5, /* 52*/ + 1, 3, 6, 7, 0, 2, 4, 5, /* 53*/ + 0, 3, 6, 7, 1, 2, 4, 5, /* 54*/ + 3, 6, 7, 0, 1, 2, 4, 5, /* 55*/ + 0, 1, 2, 6, 7, 3, 4, 5, /* 56*/ + 1, 2, 6, 7, 0, 3, 4, 5, /* 57*/ + 0, 2, 6, 7, 1, 3, 4, 5, /* 58*/ + 2, 6, 7, 0, 1, 3, 4, 5, /* 59*/ + 0, 1, 6, 7, 2, 3, 4, 5, /* 60*/ + 1, 6, 7, 0, 2, 3, 4, 5, /* 61*/ + 0, 6, 7, 1, 2, 3, 4, 5, /* 62*/ + 6, 7, 0, 1, 2, 3, 4, 5, /* 63*/ + 0, 1, 2, 3, 4, 5, 7, 6, /* 64*/ + 1, 2, 3, 4, 5, 7, 0, 6, /* 65*/ + 0, 2, 3, 4, 5, 7, 1, 6, /* 66*/ + 2, 3, 4, 5, 7, 0, 1, 6, /* 67*/ + 0, 1, 3, 4, 5, 7, 2, 6, /* 68*/ + 1, 3, 4, 5, 7, 0, 2, 6, /* 69*/ + 0, 3, 4, 5, 7, 1, 2, 6, /* 70*/ + 3, 4, 5, 7, 0, 1, 2, 6, /* 71*/ + 0, 1, 2, 4, 5, 7, 3, 6, /* 72*/ + 1, 2, 4, 5, 7, 0, 3, 6, /* 73*/ + 0, 2, 4, 5, 7, 1, 3, 6, /* 74*/ + 2, 4, 5, 7, 0, 1, 3, 6, /* 75*/ + 0, 1, 4, 5, 7, 2, 3, 6, /* 76*/ + 1, 4, 5, 7, 0, 2, 3, 6, /* 77*/ + 0, 4, 5, 7, 1, 2, 3, 6, /* 78*/ + 4, 5, 7, 0, 1, 2, 3, 6, /* 79*/ + 0, 1, 2, 3, 5, 7, 4, 6, /* 80*/ + 1, 2, 3, 5, 7, 0, 4, 6, /* 81*/ + 0, 2, 3, 5, 7, 1, 4, 6, /* 82*/ + 2, 3, 5, 7, 0, 1, 4, 6, /* 83*/ + 0, 1, 3, 5, 7, 2, 4, 6, /* 84*/ + 1, 3, 5, 7, 0, 2, 4, 6, /* 85*/ + 0, 3, 5, 7, 1, 2, 4, 6, /* 86*/ + 3, 5, 7, 0, 1, 2, 4, 6, /* 87*/ + 0, 1, 2, 5, 7, 3, 4, 6, /* 88*/ + 1, 2, 5, 7, 0, 3, 4, 6, /* 89*/ + 0, 2, 5, 7, 1, 3, 4, 6, /* 90*/ + 2, 5, 7, 0, 1, 3, 4, 6, /* 91*/ + 0, 1, 5, 7, 2, 3, 4, 6, /* 92*/ + 1, 5, 7, 0, 2, 3, 4, 6, /* 93*/ + 0, 5, 7, 1, 2, 3, 4, 6, /* 94*/ + 5, 7, 0, 1, 2, 3, 4, 6, /* 95*/ + 0, 1, 2, 3, 4, 7, 5, 6, /* 96*/ + 1, 2, 3, 4, 7, 0, 5, 6, /* 97*/ + 0, 2, 3, 4, 7, 1, 5, 6, /* 98*/ + 2, 3, 4, 7, 0, 1, 5, 6, /* 99*/ + 0, 1, 3, 4, 7, 2, 5, 6, /* 100*/ + 1, 3, 4, 7, 0, 2, 5, 6, /* 101*/ + 0, 3, 4, 7, 1, 2, 5, 6, /* 102*/ + 3, 4, 7, 0, 1, 2, 5, 6, /* 103*/ + 0, 1, 2, 4, 7, 3, 5, 6, /* 104*/ + 1, 2, 4, 7, 0, 3, 5, 6, /* 105*/ + 0, 2, 4, 7, 1, 3, 5, 6, /* 106*/ + 2, 4, 7, 0, 1, 3, 5, 6, /* 107*/ + 0, 1, 4, 7, 2, 3, 5, 6, /* 108*/ + 1, 4, 7, 0, 2, 3, 5, 6, /* 109*/ + 0, 4, 7, 1, 2, 3, 5, 6, /* 110*/ + 4, 7, 0, 1, 2, 3, 5, 6, /* 111*/ + 0, 1, 2, 3, 7, 4, 5, 6, /* 112*/ + 1, 2, 3, 7, 0, 4, 5, 6, /* 113*/ + 0, 2, 3, 7, 1, 4, 5, 6, /* 114*/ + 2, 3, 7, 0, 1, 4, 5, 6, /* 115*/ + 0, 1, 3, 7, 2, 4, 5, 6, /* 116*/ + 1, 3, 7, 0, 2, 4, 5, 6, /* 117*/ + 0, 3, 7, 1, 2, 4, 5, 6, /* 118*/ + 3, 7, 0, 1, 2, 4, 5, 6, /* 119*/ + 0, 1, 2, 7, 3, 4, 5, 6, /* 120*/ + 1, 2, 7, 0, 3, 4, 5, 6, /* 121*/ + 0, 2, 7, 1, 3, 4, 5, 6, /* 122*/ + 2, 7, 0, 1, 3, 4, 5, 6, /* 123*/ + 0, 1, 7, 2, 3, 4, 5, 6, /* 124*/ + 1, 7, 0, 2, 3, 4, 5, 6, /* 125*/ + 0, 7, 1, 2, 3, 4, 5, 6, /* 126*/ + 7, 0, 1, 2, 3, 4, 5, 6, /* 127*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 128*/ + 1, 2, 3, 4, 5, 6, 0, 7, /* 129*/ + 0, 2, 3, 4, 5, 6, 1, 7, /* 130*/ + 2, 3, 4, 5, 6, 0, 1, 7, /* 131*/ + 0, 1, 3, 4, 5, 6, 2, 7, /* 132*/ + 1, 3, 4, 5, 6, 0, 2, 7, /* 133*/ + 0, 3, 4, 5, 6, 1, 2, 7, /* 134*/ + 3, 4, 5, 6, 0, 1, 2, 7, /* 135*/ + 0, 1, 2, 4, 5, 6, 3, 7, /* 136*/ + 1, 2, 4, 5, 6, 0, 3, 7, /* 137*/ + 0, 2, 4, 5, 6, 1, 3, 7, /* 138*/ + 2, 4, 5, 6, 0, 1, 3, 7, /* 139*/ + 0, 1, 4, 5, 6, 2, 3, 7, /* 140*/ + 1, 4, 5, 6, 0, 2, 3, 7, /* 141*/ + 0, 4, 5, 6, 1, 2, 3, 7, /* 142*/ + 4, 5, 6, 0, 1, 2, 3, 7, /* 143*/ + 0, 1, 2, 3, 5, 6, 4, 7, /* 144*/ + 1, 2, 3, 5, 6, 0, 4, 7, /* 145*/ + 0, 2, 3, 5, 6, 1, 4, 7, /* 146*/ + 2, 3, 5, 6, 0, 1, 4, 7, /* 147*/ + 0, 1, 3, 5, 6, 2, 4, 7, /* 148*/ + 1, 3, 5, 6, 0, 2, 4, 7, /* 149*/ + 0, 3, 5, 6, 1, 2, 4, 7, /* 150*/ + 3, 5, 6, 0, 1, 2, 4, 7, /* 151*/ + 0, 1, 2, 5, 6, 3, 4, 7, /* 152*/ + 1, 2, 5, 6, 0, 3, 4, 7, /* 153*/ + 0, 2, 5, 6, 1, 3, 4, 7, /* 154*/ + 2, 5, 6, 0, 1, 3, 4, 7, /* 155*/ + 0, 1, 5, 6, 2, 3, 4, 7, /* 156*/ + 1, 5, 6, 0, 2, 3, 4, 7, /* 157*/ + 0, 5, 6, 1, 2, 3, 4, 7, /* 158*/ + 5, 6, 0, 1, 2, 3, 4, 7, /* 159*/ + 0, 1, 2, 3, 4, 6, 5, 7, /* 160*/ + 1, 2, 3, 4, 6, 0, 5, 7, /* 161*/ + 0, 2, 3, 4, 6, 1, 5, 7, /* 162*/ + 2, 3, 4, 6, 0, 1, 5, 7, /* 163*/ + 0, 1, 3, 4, 6, 2, 5, 7, /* 164*/ + 1, 3, 4, 6, 0, 2, 5, 7, /* 165*/ + 0, 3, 4, 6, 1, 2, 5, 7, /* 166*/ + 3, 4, 6, 0, 1, 2, 5, 7, /* 167*/ + 0, 1, 2, 4, 6, 3, 5, 7, /* 168*/ + 1, 2, 4, 6, 0, 3, 5, 7, /* 169*/ + 0, 2, 4, 6, 1, 3, 5, 7, /* 170*/ + 2, 4, 6, 0, 1, 3, 5, 7, /* 171*/ + 0, 1, 4, 6, 2, 3, 5, 7, /* 172*/ + 1, 4, 6, 0, 2, 3, 5, 7, /* 173*/ + 0, 4, 6, 1, 2, 3, 5, 7, /* 174*/ + 4, 6, 0, 1, 2, 3, 5, 7, /* 175*/ + 0, 1, 2, 3, 6, 4, 5, 7, /* 176*/ + 1, 2, 3, 6, 0, 4, 5, 7, /* 177*/ + 0, 2, 3, 6, 1, 4, 5, 7, /* 178*/ + 2, 3, 6, 0, 1, 4, 5, 7, /* 179*/ + 0, 1, 3, 6, 2, 4, 5, 7, /* 180*/ + 1, 3, 6, 0, 2, 4, 5, 7, /* 181*/ + 0, 3, 6, 1, 2, 4, 5, 7, /* 182*/ + 3, 6, 0, 1, 2, 4, 5, 7, /* 183*/ + 0, 1, 2, 6, 3, 4, 5, 7, /* 184*/ + 1, 2, 6, 0, 3, 4, 5, 7, /* 185*/ + 0, 2, 6, 1, 3, 4, 5, 7, /* 186*/ + 2, 6, 0, 1, 3, 4, 5, 7, /* 187*/ + 0, 1, 6, 2, 3, 4, 5, 7, /* 188*/ + 1, 6, 0, 2, 3, 4, 5, 7, /* 189*/ + 0, 6, 1, 2, 3, 4, 5, 7, /* 190*/ + 6, 0, 1, 2, 3, 4, 5, 7, /* 191*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 192*/ + 1, 2, 3, 4, 5, 0, 6, 7, /* 193*/ + 0, 2, 3, 4, 5, 1, 6, 7, /* 194*/ + 2, 3, 4, 5, 0, 1, 6, 7, /* 195*/ + 0, 1, 3, 4, 5, 2, 6, 7, /* 196*/ + 1, 3, 4, 5, 0, 2, 6, 7, /* 197*/ + 0, 3, 4, 5, 1, 2, 6, 7, /* 198*/ + 3, 4, 5, 0, 1, 2, 6, 7, /* 199*/ + 0, 1, 2, 4, 5, 3, 6, 7, /* 200*/ + 1, 2, 4, 5, 0, 3, 6, 7, /* 201*/ + 0, 2, 4, 5, 1, 3, 6, 7, /* 202*/ + 2, 4, 5, 0, 1, 3, 6, 7, /* 203*/ + 0, 1, 4, 5, 2, 3, 6, 7, /* 204*/ + 1, 4, 5, 0, 2, 3, 6, 7, /* 205*/ + 0, 4, 5, 1, 2, 3, 6, 7, /* 206*/ + 4, 5, 0, 1, 2, 3, 6, 7, /* 207*/ + 0, 1, 2, 3, 5, 4, 6, 7, /* 208*/ + 1, 2, 3, 5, 0, 4, 6, 7, /* 209*/ + 0, 2, 3, 5, 1, 4, 6, 7, /* 210*/ + 2, 3, 5, 0, 1, 4, 6, 7, /* 211*/ + 0, 1, 3, 5, 2, 4, 6, 7, /* 212*/ + 1, 3, 5, 0, 2, 4, 6, 7, /* 213*/ + 0, 3, 5, 1, 2, 4, 6, 7, /* 214*/ + 3, 5, 0, 1, 2, 4, 6, 7, /* 215*/ + 0, 1, 2, 5, 3, 4, 6, 7, /* 216*/ + 1, 2, 5, 0, 3, 4, 6, 7, /* 217*/ + 0, 2, 5, 1, 3, 4, 6, 7, /* 218*/ + 2, 5, 0, 1, 3, 4, 6, 7, /* 219*/ + 0, 1, 5, 2, 3, 4, 6, 7, /* 220*/ + 1, 5, 0, 2, 3, 4, 6, 7, /* 221*/ + 0, 5, 1, 2, 3, 4, 6, 7, /* 222*/ + 5, 0, 1, 2, 3, 4, 6, 7, /* 223*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 224*/ + 1, 2, 3, 4, 0, 5, 6, 7, /* 225*/ + 0, 2, 3, 4, 1, 5, 6, 7, /* 226*/ + 2, 3, 4, 0, 1, 5, 6, 7, /* 227*/ + 0, 1, 3, 4, 2, 5, 6, 7, /* 228*/ + 1, 3, 4, 0, 2, 5, 6, 7, /* 229*/ + 0, 3, 4, 1, 2, 5, 6, 7, /* 230*/ + 3, 4, 0, 1, 2, 5, 6, 7, /* 231*/ + 0, 1, 2, 4, 3, 5, 6, 7, /* 232*/ + 1, 2, 4, 0, 3, 5, 6, 7, /* 233*/ + 0, 2, 4, 1, 3, 5, 6, 7, /* 234*/ + 2, 4, 0, 1, 3, 5, 6, 7, /* 235*/ + 0, 1, 4, 2, 3, 5, 6, 7, /* 236*/ + 1, 4, 0, 2, 3, 5, 6, 7, /* 237*/ + 0, 4, 1, 2, 3, 5, 6, 7, /* 238*/ + 4, 0, 1, 2, 3, 5, 6, 7, /* 239*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 240*/ + 1, 2, 3, 0, 4, 5, 6, 7, /* 241*/ + 0, 2, 3, 1, 4, 5, 6, 7, /* 242*/ + 2, 3, 0, 1, 4, 5, 6, 7, /* 243*/ + 0, 1, 3, 2, 4, 5, 6, 7, /* 244*/ + 1, 3, 0, 2, 4, 5, 6, 7, /* 245*/ + 0, 3, 1, 2, 4, 5, 6, 7, /* 246*/ + 3, 0, 1, 2, 4, 5, 6, 7, /* 247*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 248*/ + 1, 2, 0, 3, 4, 5, 6, 7, /* 249*/ + 0, 2, 1, 3, 4, 5, 6, 7, /* 250*/ + 2, 0, 1, 3, 4, 5, 6, 7, /* 251*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 252*/ + 1, 0, 2, 3, 4, 5, 6, 7, /* 253*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 254*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 255*/ +}; + + +static FORCE_INLINE __m256i get_permutation_vector(int pvbyte) { +#ifdef BYTE_PATTERN + __m256i shufm; + asm volatile ( + "vpmovzxbd (%1), %0" + : "=X" (shufm) + : "r" (reverseshufflemask + 8 * pvbyte) + ); + + return shufm; +#else + return _mm256_load_si256((__m256i *)(reverseshufflemask + 8 * pvbyte)); +#endif +} + + +static uint32_t avx_pivot_on_last_value(int32_t *array, size_t length) { + /* we run through the data. Anything in [0,boundary) is smaller or equal + * than the pivot, and the value at boundary - 1 is going to be equal to the + * pivot at the end, + * anything in (boundary, i) is greater than the pivot + * stuff in [i,...) is grey + * the function returns the location of the boundary. + */ + if (length <= 1) + return 1; + { // we exchange the last value for the middle value for a better pivot + int32_t ival = array[length / 2]; + int32_t bval = array[length - 1]; + array[length / 2] = bval; + array[length - 1] = ival; + } +#if WITH_RUNTIME_STATS + statistics.partition_calls += 1; + statistics.items_processed += length; +#endif + uint32_t boundary = 0; + uint32_t i = 0; + int32_t pivot = array[length - 1]; // we always pick the pivot at the end + const __m256i P = _mm256_set1_epi32(pivot); + while ( i + 8 + 1 <= length) { + __m256i allgrey = _mm256_lddqu_si256((__m256i *)(array + i)); + int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(allgrey, P)); +#if WITH_RUNTIME_STATS && defined(WITH_PVBYTE_HISTOGRAM) + statistics.pvbyte_histogram.hit(pvbyte); +#endif + if(pvbyte == 0) { // might be frequent + i += 8; //nothing to do + boundary = i; + } else if (pvbyte == 0xFF) { // called once + boundary = i; + i += 8; + break; // exit + } else { + + // hot path + switch (pvbyte) { + // for pvbyte = 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff + // there is no change in order, just advance boundary + // Note: case 0x00 & 0xff are already handled + case 0x80: i += 8 - 1; break; + case 0xc0: i += 8 - 2; break; + case 0xe0: i += 8 - 3; break; + case 0xf0: i += 8 - 4; break; + case 0xf8: i += 8 - 5; break; + case 0xfc: i += 8 - 6; break; + case 0xfe: i += 8 - 7; break; + + default: { + uint32_t cnt = + 8 - _mm_popcnt_u32(pvbyte); // might be faster with table look-up? + __m256i blackthenwhite = _mm256_permutevar8x32_epi32(allgrey, get_permutation_vector(pvbyte)); + _mm256_storeu_si256((__m256i *)(array + i), blackthenwhite); + i += cnt; + } + } // switch + + boundary = i; // this doesn't need updating each and every time + } + } + for (; i + 8 + 1 <= length ;) { + __m256i allgrey = + _mm256_lddqu_si256((__m256i *)(array + i)); // this is all grey + int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(allgrey, P)); + if (pvbyte == 0xFF) { // called once + // nothing to do + } else { + + uint32_t cnt = + 8 - _mm_popcnt_u32(pvbyte); // might be faster with table look-up? + __m256i allwhite = _mm256_lddqu_si256( + (__m256i *)(array + boundary)); // this is all white + // we shuffle allgrey so that the first part is black and the second part + // is white + __m256i blackthenwhite = _mm256_permutevar8x32_epi32(allgrey, get_permutation_vector(pvbyte)); + _mm256_storeu_si256((__m256i *)(array + boundary), blackthenwhite); + _mm256_storeu_si256((__m256i *)(array + i), allwhite); + boundary += cnt; // might be faster with table look-up? + } + i += 8; + } + while (i + 1 < length) { + int32_t ival = array[i]; + if (ival <= pivot) { + int32_t bval = array[boundary]; + array[i] = bval; + array[boundary] = ival; + boundary++; + } + i++; + } + int32_t ival = array[i]; + int32_t bval = array[boundary]; + array[length - 1] = bval; + array[boundary] = ival; + boundary++; + return boundary; +} + +// for fallback +void scalar_partition(int32_t* array, const int32_t pivot, int& left, int& right) { + + while (left <= right) { + while (array[left] < pivot) { + left += 1; + } + while (array[right] > pivot) { + right -= 1; + } + if (left <= right) { + const uint32_t t = array[left]; + array[left] = array[right]; + array[right] = t; + left += 1; + right -= 1; + } + } +} + +//fallback +void scalar_quicksort(int32_t* array, int left, int right) { +#ifdef WITH_RUNTIME_STATS + statistics.scalar__partition_calls += 1; + statistics.scalar__items_processed += right - left + 1; +#endif + int i = left; + int j = right; + const int32_t pivot = array[(i + j)/2]; + scalar_partition(array, pivot, i, j); + if (left < j) { + scalar_quicksort(array, left, j); + } + if (i < right) { + scalar_quicksort(array, i, right); + } +} + +void avx2_pivotonlast_sort(int32_t *array, const uint32_t length) { + uint32_t sep = avx_pivot_on_last_value(array, length); + if(sep == length) { + // we have an ineffective pivot. Let us give up. + if(length > 1) scalar_quicksort(array,0,length - 1); + } else { + if (sep > 2) { + avx2_pivotonlast_sort(array, sep - 1); + } + if (sep + 1 < length) { + avx2_pivotonlast_sort(array + sep, length - sep); + } + } +} +void wrapped_avx2_pivotonlast_sort(uint32_t *array, int left, int right) { + avx2_pivotonlast_sort((int32_t *)array + left, right - left + 1); +} diff --git a/simd-sort/avx2-nate-quicksort.cpp b/simd-sort/avx2-nate-quicksort.cpp new file mode 100644 index 0000000..7d91931 --- /dev/null +++ b/simd-sort/avx2-nate-quicksort.cpp @@ -0,0 +1,436 @@ +#pragma once + +#include +#include + +namespace nate { + +const int AVX2_REGISTER_SIZE = 8; // in 32-bit words + +/* + Algorithm of vectorized partition by Nathan Kurz. + Everything else was borrowed from Daniel's code (avx2-altquicksort.h) +*/ + +static uint32_t reverseshufflemask[256 * 8] __attribute__((aligned(0x100))) = { + 0, 1, 2, 3, 4, 5, 6, 7, /* 0*/ + 1, 2, 3, 4, 5, 6, 7, 0, /* 1*/ + 0, 2, 3, 4, 5, 6, 7, 1, /* 2*/ + 2, 3, 4, 5, 6, 7, 0, 1, /* 3*/ + 0, 1, 3, 4, 5, 6, 7, 2, /* 4*/ + 1, 3, 4, 5, 6, 7, 0, 2, /* 5*/ + 0, 3, 4, 5, 6, 7, 1, 2, /* 6*/ + 3, 4, 5, 6, 7, 0, 1, 2, /* 7*/ + 0, 1, 2, 4, 5, 6, 7, 3, /* 8*/ + 1, 2, 4, 5, 6, 7, 0, 3, /* 9*/ + 0, 2, 4, 5, 6, 7, 1, 3, /* 10*/ + 2, 4, 5, 6, 7, 0, 1, 3, /* 11*/ + 0, 1, 4, 5, 6, 7, 2, 3, /* 12*/ + 1, 4, 5, 6, 7, 0, 2, 3, /* 13*/ + 0, 4, 5, 6, 7, 1, 2, 3, /* 14*/ + 4, 5, 6, 7, 0, 1, 2, 3, /* 15*/ + 0, 1, 2, 3, 5, 6, 7, 4, /* 16*/ + 1, 2, 3, 5, 6, 7, 0, 4, /* 17*/ + 0, 2, 3, 5, 6, 7, 1, 4, /* 18*/ + 2, 3, 5, 6, 7, 0, 1, 4, /* 19*/ + 0, 1, 3, 5, 6, 7, 2, 4, /* 20*/ + 1, 3, 5, 6, 7, 0, 2, 4, /* 21*/ + 0, 3, 5, 6, 7, 1, 2, 4, /* 22*/ + 3, 5, 6, 7, 0, 1, 2, 4, /* 23*/ + 0, 1, 2, 5, 6, 7, 3, 4, /* 24*/ + 1, 2, 5, 6, 7, 0, 3, 4, /* 25*/ + 0, 2, 5, 6, 7, 1, 3, 4, /* 26*/ + 2, 5, 6, 7, 0, 1, 3, 4, /* 27*/ + 0, 1, 5, 6, 7, 2, 3, 4, /* 28*/ + 1, 5, 6, 7, 0, 2, 3, 4, /* 29*/ + 0, 5, 6, 7, 1, 2, 3, 4, /* 30*/ + 5, 6, 7, 0, 1, 2, 3, 4, /* 31*/ + 0, 1, 2, 3, 4, 6, 7, 5, /* 32*/ + 1, 2, 3, 4, 6, 7, 0, 5, /* 33*/ + 0, 2, 3, 4, 6, 7, 1, 5, /* 34*/ + 2, 3, 4, 6, 7, 0, 1, 5, /* 35*/ + 0, 1, 3, 4, 6, 7, 2, 5, /* 36*/ + 1, 3, 4, 6, 7, 0, 2, 5, /* 37*/ + 0, 3, 4, 6, 7, 1, 2, 5, /* 38*/ + 3, 4, 6, 7, 0, 1, 2, 5, /* 39*/ + 0, 1, 2, 4, 6, 7, 3, 5, /* 40*/ + 1, 2, 4, 6, 7, 0, 3, 5, /* 41*/ + 0, 2, 4, 6, 7, 1, 3, 5, /* 42*/ + 2, 4, 6, 7, 0, 1, 3, 5, /* 43*/ + 0, 1, 4, 6, 7, 2, 3, 5, /* 44*/ + 1, 4, 6, 7, 0, 2, 3, 5, /* 45*/ + 0, 4, 6, 7, 1, 2, 3, 5, /* 46*/ + 4, 6, 7, 0, 1, 2, 3, 5, /* 47*/ + 0, 1, 2, 3, 6, 7, 4, 5, /* 48*/ + 1, 2, 3, 6, 7, 0, 4, 5, /* 49*/ + 0, 2, 3, 6, 7, 1, 4, 5, /* 50*/ + 2, 3, 6, 7, 0, 1, 4, 5, /* 51*/ + 0, 1, 3, 6, 7, 2, 4, 5, /* 52*/ + 1, 3, 6, 7, 0, 2, 4, 5, /* 53*/ + 0, 3, 6, 7, 1, 2, 4, 5, /* 54*/ + 3, 6, 7, 0, 1, 2, 4, 5, /* 55*/ + 0, 1, 2, 6, 7, 3, 4, 5, /* 56*/ + 1, 2, 6, 7, 0, 3, 4, 5, /* 57*/ + 0, 2, 6, 7, 1, 3, 4, 5, /* 58*/ + 2, 6, 7, 0, 1, 3, 4, 5, /* 59*/ + 0, 1, 6, 7, 2, 3, 4, 5, /* 60*/ + 1, 6, 7, 0, 2, 3, 4, 5, /* 61*/ + 0, 6, 7, 1, 2, 3, 4, 5, /* 62*/ + 6, 7, 0, 1, 2, 3, 4, 5, /* 63*/ + 0, 1, 2, 3, 4, 5, 7, 6, /* 64*/ + 1, 2, 3, 4, 5, 7, 0, 6, /* 65*/ + 0, 2, 3, 4, 5, 7, 1, 6, /* 66*/ + 2, 3, 4, 5, 7, 0, 1, 6, /* 67*/ + 0, 1, 3, 4, 5, 7, 2, 6, /* 68*/ + 1, 3, 4, 5, 7, 0, 2, 6, /* 69*/ + 0, 3, 4, 5, 7, 1, 2, 6, /* 70*/ + 3, 4, 5, 7, 0, 1, 2, 6, /* 71*/ + 0, 1, 2, 4, 5, 7, 3, 6, /* 72*/ + 1, 2, 4, 5, 7, 0, 3, 6, /* 73*/ + 0, 2, 4, 5, 7, 1, 3, 6, /* 74*/ + 2, 4, 5, 7, 0, 1, 3, 6, /* 75*/ + 0, 1, 4, 5, 7, 2, 3, 6, /* 76*/ + 1, 4, 5, 7, 0, 2, 3, 6, /* 77*/ + 0, 4, 5, 7, 1, 2, 3, 6, /* 78*/ + 4, 5, 7, 0, 1, 2, 3, 6, /* 79*/ + 0, 1, 2, 3, 5, 7, 4, 6, /* 80*/ + 1, 2, 3, 5, 7, 0, 4, 6, /* 81*/ + 0, 2, 3, 5, 7, 1, 4, 6, /* 82*/ + 2, 3, 5, 7, 0, 1, 4, 6, /* 83*/ + 0, 1, 3, 5, 7, 2, 4, 6, /* 84*/ + 1, 3, 5, 7, 0, 2, 4, 6, /* 85*/ + 0, 3, 5, 7, 1, 2, 4, 6, /* 86*/ + 3, 5, 7, 0, 1, 2, 4, 6, /* 87*/ + 0, 1, 2, 5, 7, 3, 4, 6, /* 88*/ + 1, 2, 5, 7, 0, 3, 4, 6, /* 89*/ + 0, 2, 5, 7, 1, 3, 4, 6, /* 90*/ + 2, 5, 7, 0, 1, 3, 4, 6, /* 91*/ + 0, 1, 5, 7, 2, 3, 4, 6, /* 92*/ + 1, 5, 7, 0, 2, 3, 4, 6, /* 93*/ + 0, 5, 7, 1, 2, 3, 4, 6, /* 94*/ + 5, 7, 0, 1, 2, 3, 4, 6, /* 95*/ + 0, 1, 2, 3, 4, 7, 5, 6, /* 96*/ + 1, 2, 3, 4, 7, 0, 5, 6, /* 97*/ + 0, 2, 3, 4, 7, 1, 5, 6, /* 98*/ + 2, 3, 4, 7, 0, 1, 5, 6, /* 99*/ + 0, 1, 3, 4, 7, 2, 5, 6, /* 100*/ + 1, 3, 4, 7, 0, 2, 5, 6, /* 101*/ + 0, 3, 4, 7, 1, 2, 5, 6, /* 102*/ + 3, 4, 7, 0, 1, 2, 5, 6, /* 103*/ + 0, 1, 2, 4, 7, 3, 5, 6, /* 104*/ + 1, 2, 4, 7, 0, 3, 5, 6, /* 105*/ + 0, 2, 4, 7, 1, 3, 5, 6, /* 106*/ + 2, 4, 7, 0, 1, 3, 5, 6, /* 107*/ + 0, 1, 4, 7, 2, 3, 5, 6, /* 108*/ + 1, 4, 7, 0, 2, 3, 5, 6, /* 109*/ + 0, 4, 7, 1, 2, 3, 5, 6, /* 110*/ + 4, 7, 0, 1, 2, 3, 5, 6, /* 111*/ + 0, 1, 2, 3, 7, 4, 5, 6, /* 112*/ + 1, 2, 3, 7, 0, 4, 5, 6, /* 113*/ + 0, 2, 3, 7, 1, 4, 5, 6, /* 114*/ + 2, 3, 7, 0, 1, 4, 5, 6, /* 115*/ + 0, 1, 3, 7, 2, 4, 5, 6, /* 116*/ + 1, 3, 7, 0, 2, 4, 5, 6, /* 117*/ + 0, 3, 7, 1, 2, 4, 5, 6, /* 118*/ + 3, 7, 0, 1, 2, 4, 5, 6, /* 119*/ + 0, 1, 2, 7, 3, 4, 5, 6, /* 120*/ + 1, 2, 7, 0, 3, 4, 5, 6, /* 121*/ + 0, 2, 7, 1, 3, 4, 5, 6, /* 122*/ + 2, 7, 0, 1, 3, 4, 5, 6, /* 123*/ + 0, 1, 7, 2, 3, 4, 5, 6, /* 124*/ + 1, 7, 0, 2, 3, 4, 5, 6, /* 125*/ + 0, 7, 1, 2, 3, 4, 5, 6, /* 126*/ + 7, 0, 1, 2, 3, 4, 5, 6, /* 127*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 128*/ + 1, 2, 3, 4, 5, 6, 0, 7, /* 129*/ + 0, 2, 3, 4, 5, 6, 1, 7, /* 130*/ + 2, 3, 4, 5, 6, 0, 1, 7, /* 131*/ + 0, 1, 3, 4, 5, 6, 2, 7, /* 132*/ + 1, 3, 4, 5, 6, 0, 2, 7, /* 133*/ + 0, 3, 4, 5, 6, 1, 2, 7, /* 134*/ + 3, 4, 5, 6, 0, 1, 2, 7, /* 135*/ + 0, 1, 2, 4, 5, 6, 3, 7, /* 136*/ + 1, 2, 4, 5, 6, 0, 3, 7, /* 137*/ + 0, 2, 4, 5, 6, 1, 3, 7, /* 138*/ + 2, 4, 5, 6, 0, 1, 3, 7, /* 139*/ + 0, 1, 4, 5, 6, 2, 3, 7, /* 140*/ + 1, 4, 5, 6, 0, 2, 3, 7, /* 141*/ + 0, 4, 5, 6, 1, 2, 3, 7, /* 142*/ + 4, 5, 6, 0, 1, 2, 3, 7, /* 143*/ + 0, 1, 2, 3, 5, 6, 4, 7, /* 144*/ + 1, 2, 3, 5, 6, 0, 4, 7, /* 145*/ + 0, 2, 3, 5, 6, 1, 4, 7, /* 146*/ + 2, 3, 5, 6, 0, 1, 4, 7, /* 147*/ + 0, 1, 3, 5, 6, 2, 4, 7, /* 148*/ + 1, 3, 5, 6, 0, 2, 4, 7, /* 149*/ + 0, 3, 5, 6, 1, 2, 4, 7, /* 150*/ + 3, 5, 6, 0, 1, 2, 4, 7, /* 151*/ + 0, 1, 2, 5, 6, 3, 4, 7, /* 152*/ + 1, 2, 5, 6, 0, 3, 4, 7, /* 153*/ + 0, 2, 5, 6, 1, 3, 4, 7, /* 154*/ + 2, 5, 6, 0, 1, 3, 4, 7, /* 155*/ + 0, 1, 5, 6, 2, 3, 4, 7, /* 156*/ + 1, 5, 6, 0, 2, 3, 4, 7, /* 157*/ + 0, 5, 6, 1, 2, 3, 4, 7, /* 158*/ + 5, 6, 0, 1, 2, 3, 4, 7, /* 159*/ + 0, 1, 2, 3, 4, 6, 5, 7, /* 160*/ + 1, 2, 3, 4, 6, 0, 5, 7, /* 161*/ + 0, 2, 3, 4, 6, 1, 5, 7, /* 162*/ + 2, 3, 4, 6, 0, 1, 5, 7, /* 163*/ + 0, 1, 3, 4, 6, 2, 5, 7, /* 164*/ + 1, 3, 4, 6, 0, 2, 5, 7, /* 165*/ + 0, 3, 4, 6, 1, 2, 5, 7, /* 166*/ + 3, 4, 6, 0, 1, 2, 5, 7, /* 167*/ + 0, 1, 2, 4, 6, 3, 5, 7, /* 168*/ + 1, 2, 4, 6, 0, 3, 5, 7, /* 169*/ + 0, 2, 4, 6, 1, 3, 5, 7, /* 170*/ + 2, 4, 6, 0, 1, 3, 5, 7, /* 171*/ + 0, 1, 4, 6, 2, 3, 5, 7, /* 172*/ + 1, 4, 6, 0, 2, 3, 5, 7, /* 173*/ + 0, 4, 6, 1, 2, 3, 5, 7, /* 174*/ + 4, 6, 0, 1, 2, 3, 5, 7, /* 175*/ + 0, 1, 2, 3, 6, 4, 5, 7, /* 176*/ + 1, 2, 3, 6, 0, 4, 5, 7, /* 177*/ + 0, 2, 3, 6, 1, 4, 5, 7, /* 178*/ + 2, 3, 6, 0, 1, 4, 5, 7, /* 179*/ + 0, 1, 3, 6, 2, 4, 5, 7, /* 180*/ + 1, 3, 6, 0, 2, 4, 5, 7, /* 181*/ + 0, 3, 6, 1, 2, 4, 5, 7, /* 182*/ + 3, 6, 0, 1, 2, 4, 5, 7, /* 183*/ + 0, 1, 2, 6, 3, 4, 5, 7, /* 184*/ + 1, 2, 6, 0, 3, 4, 5, 7, /* 185*/ + 0, 2, 6, 1, 3, 4, 5, 7, /* 186*/ + 2, 6, 0, 1, 3, 4, 5, 7, /* 187*/ + 0, 1, 6, 2, 3, 4, 5, 7, /* 188*/ + 1, 6, 0, 2, 3, 4, 5, 7, /* 189*/ + 0, 6, 1, 2, 3, 4, 5, 7, /* 190*/ + 6, 0, 1, 2, 3, 4, 5, 7, /* 191*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 192*/ + 1, 2, 3, 4, 5, 0, 6, 7, /* 193*/ + 0, 2, 3, 4, 5, 1, 6, 7, /* 194*/ + 2, 3, 4, 5, 0, 1, 6, 7, /* 195*/ + 0, 1, 3, 4, 5, 2, 6, 7, /* 196*/ + 1, 3, 4, 5, 0, 2, 6, 7, /* 197*/ + 0, 3, 4, 5, 1, 2, 6, 7, /* 198*/ + 3, 4, 5, 0, 1, 2, 6, 7, /* 199*/ + 0, 1, 2, 4, 5, 3, 6, 7, /* 200*/ + 1, 2, 4, 5, 0, 3, 6, 7, /* 201*/ + 0, 2, 4, 5, 1, 3, 6, 7, /* 202*/ + 2, 4, 5, 0, 1, 3, 6, 7, /* 203*/ + 0, 1, 4, 5, 2, 3, 6, 7, /* 204*/ + 1, 4, 5, 0, 2, 3, 6, 7, /* 205*/ + 0, 4, 5, 1, 2, 3, 6, 7, /* 206*/ + 4, 5, 0, 1, 2, 3, 6, 7, /* 207*/ + 0, 1, 2, 3, 5, 4, 6, 7, /* 208*/ + 1, 2, 3, 5, 0, 4, 6, 7, /* 209*/ + 0, 2, 3, 5, 1, 4, 6, 7, /* 210*/ + 2, 3, 5, 0, 1, 4, 6, 7, /* 211*/ + 0, 1, 3, 5, 2, 4, 6, 7, /* 212*/ + 1, 3, 5, 0, 2, 4, 6, 7, /* 213*/ + 0, 3, 5, 1, 2, 4, 6, 7, /* 214*/ + 3, 5, 0, 1, 2, 4, 6, 7, /* 215*/ + 0, 1, 2, 5, 3, 4, 6, 7, /* 216*/ + 1, 2, 5, 0, 3, 4, 6, 7, /* 217*/ + 0, 2, 5, 1, 3, 4, 6, 7, /* 218*/ + 2, 5, 0, 1, 3, 4, 6, 7, /* 219*/ + 0, 1, 5, 2, 3, 4, 6, 7, /* 220*/ + 1, 5, 0, 2, 3, 4, 6, 7, /* 221*/ + 0, 5, 1, 2, 3, 4, 6, 7, /* 222*/ + 5, 0, 1, 2, 3, 4, 6, 7, /* 223*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 224*/ + 1, 2, 3, 4, 0, 5, 6, 7, /* 225*/ + 0, 2, 3, 4, 1, 5, 6, 7, /* 226*/ + 2, 3, 4, 0, 1, 5, 6, 7, /* 227*/ + 0, 1, 3, 4, 2, 5, 6, 7, /* 228*/ + 1, 3, 4, 0, 2, 5, 6, 7, /* 229*/ + 0, 3, 4, 1, 2, 5, 6, 7, /* 230*/ + 3, 4, 0, 1, 2, 5, 6, 7, /* 231*/ + 0, 1, 2, 4, 3, 5, 6, 7, /* 232*/ + 1, 2, 4, 0, 3, 5, 6, 7, /* 233*/ + 0, 2, 4, 1, 3, 5, 6, 7, /* 234*/ + 2, 4, 0, 1, 3, 5, 6, 7, /* 235*/ + 0, 1, 4, 2, 3, 5, 6, 7, /* 236*/ + 1, 4, 0, 2, 3, 5, 6, 7, /* 237*/ + 0, 4, 1, 2, 3, 5, 6, 7, /* 238*/ + 4, 0, 1, 2, 3, 5, 6, 7, /* 239*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 240*/ + 1, 2, 3, 0, 4, 5, 6, 7, /* 241*/ + 0, 2, 3, 1, 4, 5, 6, 7, /* 242*/ + 2, 3, 0, 1, 4, 5, 6, 7, /* 243*/ + 0, 1, 3, 2, 4, 5, 6, 7, /* 244*/ + 1, 3, 0, 2, 4, 5, 6, 7, /* 245*/ + 0, 3, 1, 2, 4, 5, 6, 7, /* 246*/ + 3, 0, 1, 2, 4, 5, 6, 7, /* 247*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 248*/ + 1, 2, 0, 3, 4, 5, 6, 7, /* 249*/ + 0, 2, 1, 3, 4, 5, 6, 7, /* 250*/ + 2, 0, 1, 3, 4, 5, 6, 7, /* 251*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 252*/ + 1, 0, 2, 3, 4, 5, 6, 7, /* 253*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 254*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 255*/ +}; + + +static uint32_t avx_pivot_on_last_value(int32_t *array, size_t length) { + if (length <= 1) + return 1; + + + { // we exchange the last value for the middle value for a better pivot + int32_t ival = array[length / 2]; + int32_t bval = array[length - 1]; + array[length / 2] = bval; + array[length - 1] = ival; + } + + int wh, rh; + int wt, rt; + + rh = 0; + rt = length - 1; + wh = 0; + wt = length - 1; + + __m256i h0 = _mm256_loadu_si256((__m256i*)(array + rh)); + __m256i h1 = _mm256_loadu_si256((__m256i*)(array + rh + AVX2_REGISTER_SIZE)); + rh += 2*AVX2_REGISTER_SIZE; + + __m256i t0 = _mm256_loadu_si256((__m256i*)(array + rt - AVX2_REGISTER_SIZE)); + __m256i t1 = _mm256_loadu_si256((__m256i*)(array + rt - 2*AVX2_REGISTER_SIZE)); + rt -= 3*AVX2_REGISTER_SIZE; + + __m256i current = _mm256_loadu_si256((__m256i*)(array + rh)); + __m256i next; + rh += AVX2_REGISTER_SIZE; + + const int32_t pivot = array[length - 1]; + const __m256i P = _mm256_set1_epi32(pivot); + + // 1. the main loop + while (wh - wt < 4*AVX2_REGISTER_SIZE) { + const bool which = (rh - wh) < (wt - rt); + + // I believe that a compiler will emit branchless code + __m256i* next_ptr = (__m256i*)(which ? array + rh : array + rt); + const int adv_rh = which ? AVX2_REGISTER_SIZE : 0; + const int adv_rt = which ? 0 : AVX2_REGISTER_SIZE; + // My faith ends here + + next = _mm256_loadu_si256(next_ptr); + + const int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(current, P)); + const uint32_t cnt = 8 - _mm_popcnt_u32(pvbyte); + const __m256i shuf = _mm256_load_si256((__m256i *)(reverseshufflemask + 8 * pvbyte)); + + const __m256i ordered = _mm256_permutevar8x32_epi32(current, shuf); + _mm256_storeu_si256((__m256i*)(array + wh), ordered); + _mm256_storeu_si256((__m256i*)(array + wt - 8), ordered); + + rh += adv_rh; + rt -= adv_rt; + wh += cnt; + wt -= 8 - cnt; + + current = next; + } + + // 2. partition remaining part + while (wh - wt > 4*AVX2_REGISTER_SIZE) { + const int32_t v = array[rh++]; + if (v < pivot) { + array[wh++] = v; + } else { + array[wt--] = v; + } + } + + // 3. partition 4 registers loaded in the beginning + static uint32_t tmp[4*AVX2_REGISTER_SIZE]; + _mm256_storeu_si256((__m256i*)(tmp + 0*AVX2_REGISTER_SIZE), h0); + _mm256_storeu_si256((__m256i*)(tmp + 1*AVX2_REGISTER_SIZE), h1); + _mm256_storeu_si256((__m256i*)(tmp + 2*AVX2_REGISTER_SIZE), t0); + _mm256_storeu_si256((__m256i*)(tmp + 3*AVX2_REGISTER_SIZE), t1); + for (int i=0; i < 4*AVX2_REGISTER_SIZE; i++) { + const int32_t v = array[i]; + if (v < pivot) { + array[wh++] = v; + } else { + array[wt--] = v; + } + } + + { + const int32_t a = array[length - 1]; + const int32_t b = array[wh]; + + array[length - 1] = b; + array[wh] = a; + } + + return wh; +} + +// for fallback +void scalar_partition(int32_t* array, const int32_t pivot, int& left, int& right) { + + while (left <= right) { + while (array[left] < pivot) { + left += 1; + } + while (array[right] > pivot) { + right -= 1; + } + if (left <= right) { + const uint32_t t = array[left]; + array[left] = array[right]; + array[right] = t; + left += 1; + right -= 1; + } + } +} + +//fallback +void scalar_quicksort(int32_t* array, int left, int right) { +#ifdef WITH_RUNTIME_STATS + statistics.scalar__partition_calls += 1; + statistics.scalar__items_processed += right - left + 1; +#endif + int i = left; + int j = right; + const int32_t pivot = array[(i + j)/2]; + scalar_partition(array, pivot, i, j); + if (left < j) { + scalar_quicksort(array, left, j); + } + if (i < right) { + scalar_quicksort(array, i, right); + } +} + +void avx2_pivotonlast_sort(int32_t *array, const uint32_t length) { + uint32_t sep; + + if (length > 8*AVX2_REGISTER_SIZE) { + sep = avx_pivot_on_last_value(array, length); + } else { + sep = lomuto_partition_epi32((uint32_t*)array, 0, length - 1); + } + if(sep == length) { + // we have an ineffective pivot. Let us give up. + if(length > 1) scalar_quicksort(array,0,length - 1); + } else { + if (sep > 2) { + avx2_pivotonlast_sort(array, sep - 1); + } + if (sep + 1 < length) { + avx2_pivotonlast_sort(array + sep, length - sep); + } + } +} +void wrapped_avx2_pivotonlast_sort(uint32_t *array, int left, int right) { + avx2_pivotonlast_sort((int32_t *)array + left, right - left + 1); +} + +} // namespace nate diff --git a/simd-sort/avx2-natenodutch-quicksort.h b/simd-sort/avx2-natenodutch-quicksort.h new file mode 100644 index 0000000..4a816b6 --- /dev/null +++ b/simd-sort/avx2-natenodutch-quicksort.h @@ -0,0 +1,409 @@ +#pragma once + +#include +#include +#include +#include "avx2-altquicksort.h" +/** +* This is an alternative approach to SIMD quicksort, based on a description by +* Nathan Kurz. Daniel Lemire implemented it to avoid the Dutch national flag +* problem as an experiment. This is not meant to be fast, as implemented. It is +* an experiment. +*/ + +/* +I'm going to use "vector" as a unit, but it doesn't have to be the +native vector size. In particular, I think we can read multiple +vector register's worth of data with minimal added time per loop +iteration. + +First, read two vectors from the head of the list, and two from the +tail of the list. Store these, either in registers or written to a +buffer. We're not going to use these until the end. We're just +getting them out of the way so that we have more than one vectors +writeable space at the head and tail. + +wh--rh------------------rt--wt- + +wh = write head +rh = read head +rt = read tail +wt = write tail + +Then read one vector from head, and advance rh accordingly. Call it nextVector. + +Loop: + +Set currentVector = nextVector. + +Set nextVectorPtr to either rh or rt depending on which has the least +"writable space" available. That is, if (rh - wh) < (wt - rt), read +from and advance rh, else read from and update rt. These updates +need to be branchless. + +Read nextVector = *nextVectorPtr. + +Partition currentVector (lookup, shuffle) and write to both wh and wt +(tail offset so end hits wt). +Advance wh and wt based on number of valid entries. Use a second byte +lookup issued as early as possible to determine this count. Our +performance is dependent on the latency of this step. + +----- +*/ +static uint32_t shufflemask[256 * 8] __attribute__((aligned(0x100))) = { + 0, 1, 2, 3, 4, 5, 6, 7, /* 0*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 1*/ + 1, 0, 2, 3, 4, 5, 6, 7, /* 2*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 3*/ + 2, 0, 1, 3, 4, 5, 6, 7, /* 4*/ + 0, 2, 1, 3, 4, 5, 6, 7, /* 5*/ + 1, 2, 0, 3, 4, 5, 6, 7, /* 6*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 7*/ + 3, 0, 1, 2, 4, 5, 6, 7, /* 8*/ + 0, 3, 1, 2, 4, 5, 6, 7, /* 9*/ + 1, 3, 0, 2, 4, 5, 6, 7, /* 10*/ + 0, 1, 3, 2, 4, 5, 6, 7, /* 11*/ + 2, 3, 0, 1, 4, 5, 6, 7, /* 12*/ + 0, 2, 3, 1, 4, 5, 6, 7, /* 13*/ + 1, 2, 3, 0, 4, 5, 6, 7, /* 14*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 15*/ + 4, 0, 1, 2, 3, 5, 6, 7, /* 16*/ + 0, 4, 1, 2, 3, 5, 6, 7, /* 17*/ + 1, 4, 0, 2, 3, 5, 6, 7, /* 18*/ + 0, 1, 4, 2, 3, 5, 6, 7, /* 19*/ + 2, 4, 0, 1, 3, 5, 6, 7, /* 20*/ + 0, 2, 4, 1, 3, 5, 6, 7, /* 21*/ + 1, 2, 4, 0, 3, 5, 6, 7, /* 22*/ + 0, 1, 2, 4, 3, 5, 6, 7, /* 23*/ + 3, 4, 0, 1, 2, 5, 6, 7, /* 24*/ + 0, 3, 4, 1, 2, 5, 6, 7, /* 25*/ + 1, 3, 4, 0, 2, 5, 6, 7, /* 26*/ + 0, 1, 3, 4, 2, 5, 6, 7, /* 27*/ + 2, 3, 4, 0, 1, 5, 6, 7, /* 28*/ + 0, 2, 3, 4, 1, 5, 6, 7, /* 29*/ + 1, 2, 3, 4, 0, 5, 6, 7, /* 30*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 31*/ + 5, 0, 1, 2, 3, 4, 6, 7, /* 32*/ + 0, 5, 1, 2, 3, 4, 6, 7, /* 33*/ + 1, 5, 0, 2, 3, 4, 6, 7, /* 34*/ + 0, 1, 5, 2, 3, 4, 6, 7, /* 35*/ + 2, 5, 0, 1, 3, 4, 6, 7, /* 36*/ + 0, 2, 5, 1, 3, 4, 6, 7, /* 37*/ + 1, 2, 5, 0, 3, 4, 6, 7, /* 38*/ + 0, 1, 2, 5, 3, 4, 6, 7, /* 39*/ + 3, 5, 0, 1, 2, 4, 6, 7, /* 40*/ + 0, 3, 5, 1, 2, 4, 6, 7, /* 41*/ + 1, 3, 5, 0, 2, 4, 6, 7, /* 42*/ + 0, 1, 3, 5, 2, 4, 6, 7, /* 43*/ + 2, 3, 5, 0, 1, 4, 6, 7, /* 44*/ + 0, 2, 3, 5, 1, 4, 6, 7, /* 45*/ + 1, 2, 3, 5, 0, 4, 6, 7, /* 46*/ + 0, 1, 2, 3, 5, 4, 6, 7, /* 47*/ + 4, 5, 0, 1, 2, 3, 6, 7, /* 48*/ + 0, 4, 5, 1, 2, 3, 6, 7, /* 49*/ + 1, 4, 5, 0, 2, 3, 6, 7, /* 50*/ + 0, 1, 4, 5, 2, 3, 6, 7, /* 51*/ + 2, 4, 5, 0, 1, 3, 6, 7, /* 52*/ + 0, 2, 4, 5, 1, 3, 6, 7, /* 53*/ + 1, 2, 4, 5, 0, 3, 6, 7, /* 54*/ + 0, 1, 2, 4, 5, 3, 6, 7, /* 55*/ + 3, 4, 5, 0, 1, 2, 6, 7, /* 56*/ + 0, 3, 4, 5, 1, 2, 6, 7, /* 57*/ + 1, 3, 4, 5, 0, 2, 6, 7, /* 58*/ + 0, 1, 3, 4, 5, 2, 6, 7, /* 59*/ + 2, 3, 4, 5, 0, 1, 6, 7, /* 60*/ + 0, 2, 3, 4, 5, 1, 6, 7, /* 61*/ + 1, 2, 3, 4, 5, 0, 6, 7, /* 62*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 63*/ + 6, 0, 1, 2, 3, 4, 5, 7, /* 64*/ + 0, 6, 1, 2, 3, 4, 5, 7, /* 65*/ + 1, 6, 0, 2, 3, 4, 5, 7, /* 66*/ + 0, 1, 6, 2, 3, 4, 5, 7, /* 67*/ + 2, 6, 0, 1, 3, 4, 5, 7, /* 68*/ + 0, 2, 6, 1, 3, 4, 5, 7, /* 69*/ + 1, 2, 6, 0, 3, 4, 5, 7, /* 70*/ + 0, 1, 2, 6, 3, 4, 5, 7, /* 71*/ + 3, 6, 0, 1, 2, 4, 5, 7, /* 72*/ + 0, 3, 6, 1, 2, 4, 5, 7, /* 73*/ + 1, 3, 6, 0, 2, 4, 5, 7, /* 74*/ + 0, 1, 3, 6, 2, 4, 5, 7, /* 75*/ + 2, 3, 6, 0, 1, 4, 5, 7, /* 76*/ + 0, 2, 3, 6, 1, 4, 5, 7, /* 77*/ + 1, 2, 3, 6, 0, 4, 5, 7, /* 78*/ + 0, 1, 2, 3, 6, 4, 5, 7, /* 79*/ + 4, 6, 0, 1, 2, 3, 5, 7, /* 80*/ + 0, 4, 6, 1, 2, 3, 5, 7, /* 81*/ + 1, 4, 6, 0, 2, 3, 5, 7, /* 82*/ + 0, 1, 4, 6, 2, 3, 5, 7, /* 83*/ + 2, 4, 6, 0, 1, 3, 5, 7, /* 84*/ + 0, 2, 4, 6, 1, 3, 5, 7, /* 85*/ + 1, 2, 4, 6, 0, 3, 5, 7, /* 86*/ + 0, 1, 2, 4, 6, 3, 5, 7, /* 87*/ + 3, 4, 6, 0, 1, 2, 5, 7, /* 88*/ + 0, 3, 4, 6, 1, 2, 5, 7, /* 89*/ + 1, 3, 4, 6, 0, 2, 5, 7, /* 90*/ + 0, 1, 3, 4, 6, 2, 5, 7, /* 91*/ + 2, 3, 4, 6, 0, 1, 5, 7, /* 92*/ + 0, 2, 3, 4, 6, 1, 5, 7, /* 93*/ + 1, 2, 3, 4, 6, 0, 5, 7, /* 94*/ + 0, 1, 2, 3, 4, 6, 5, 7, /* 95*/ + 5, 6, 0, 1, 2, 3, 4, 7, /* 96*/ + 0, 5, 6, 1, 2, 3, 4, 7, /* 97*/ + 1, 5, 6, 0, 2, 3, 4, 7, /* 98*/ + 0, 1, 5, 6, 2, 3, 4, 7, /* 99*/ + 2, 5, 6, 0, 1, 3, 4, 7, /* 100*/ + 0, 2, 5, 6, 1, 3, 4, 7, /* 101*/ + 1, 2, 5, 6, 0, 3, 4, 7, /* 102*/ + 0, 1, 2, 5, 6, 3, 4, 7, /* 103*/ + 3, 5, 6, 0, 1, 2, 4, 7, /* 104*/ + 0, 3, 5, 6, 1, 2, 4, 7, /* 105*/ + 1, 3, 5, 6, 0, 2, 4, 7, /* 106*/ + 0, 1, 3, 5, 6, 2, 4, 7, /* 107*/ + 2, 3, 5, 6, 0, 1, 4, 7, /* 108*/ + 0, 2, 3, 5, 6, 1, 4, 7, /* 109*/ + 1, 2, 3, 5, 6, 0, 4, 7, /* 110*/ + 0, 1, 2, 3, 5, 6, 4, 7, /* 111*/ + 4, 5, 6, 0, 1, 2, 3, 7, /* 112*/ + 0, 4, 5, 6, 1, 2, 3, 7, /* 113*/ + 1, 4, 5, 6, 0, 2, 3, 7, /* 114*/ + 0, 1, 4, 5, 6, 2, 3, 7, /* 115*/ + 2, 4, 5, 6, 0, 1, 3, 7, /* 116*/ + 0, 2, 4, 5, 6, 1, 3, 7, /* 117*/ + 1, 2, 4, 5, 6, 0, 3, 7, /* 118*/ + 0, 1, 2, 4, 5, 6, 3, 7, /* 119*/ + 3, 4, 5, 6, 0, 1, 2, 7, /* 120*/ + 0, 3, 4, 5, 6, 1, 2, 7, /* 121*/ + 1, 3, 4, 5, 6, 0, 2, 7, /* 122*/ + 0, 1, 3, 4, 5, 6, 2, 7, /* 123*/ + 2, 3, 4, 5, 6, 0, 1, 7, /* 124*/ + 0, 2, 3, 4, 5, 6, 1, 7, /* 125*/ + 1, 2, 3, 4, 5, 6, 0, 7, /* 126*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 127*/ + 7, 0, 1, 2, 3, 4, 5, 6, /* 128*/ + 0, 7, 1, 2, 3, 4, 5, 6, /* 129*/ + 1, 7, 0, 2, 3, 4, 5, 6, /* 130*/ + 0, 1, 7, 2, 3, 4, 5, 6, /* 131*/ + 2, 7, 0, 1, 3, 4, 5, 6, /* 132*/ + 0, 2, 7, 1, 3, 4, 5, 6, /* 133*/ + 1, 2, 7, 0, 3, 4, 5, 6, /* 134*/ + 0, 1, 2, 7, 3, 4, 5, 6, /* 135*/ + 3, 7, 0, 1, 2, 4, 5, 6, /* 136*/ + 0, 3, 7, 1, 2, 4, 5, 6, /* 137*/ + 1, 3, 7, 0, 2, 4, 5, 6, /* 138*/ + 0, 1, 3, 7, 2, 4, 5, 6, /* 139*/ + 2, 3, 7, 0, 1, 4, 5, 6, /* 140*/ + 0, 2, 3, 7, 1, 4, 5, 6, /* 141*/ + 1, 2, 3, 7, 0, 4, 5, 6, /* 142*/ + 0, 1, 2, 3, 7, 4, 5, 6, /* 143*/ + 4, 7, 0, 1, 2, 3, 5, 6, /* 144*/ + 0, 4, 7, 1, 2, 3, 5, 6, /* 145*/ + 1, 4, 7, 0, 2, 3, 5, 6, /* 146*/ + 0, 1, 4, 7, 2, 3, 5, 6, /* 147*/ + 2, 4, 7, 0, 1, 3, 5, 6, /* 148*/ + 0, 2, 4, 7, 1, 3, 5, 6, /* 149*/ + 1, 2, 4, 7, 0, 3, 5, 6, /* 150*/ + 0, 1, 2, 4, 7, 3, 5, 6, /* 151*/ + 3, 4, 7, 0, 1, 2, 5, 6, /* 152*/ + 0, 3, 4, 7, 1, 2, 5, 6, /* 153*/ + 1, 3, 4, 7, 0, 2, 5, 6, /* 154*/ + 0, 1, 3, 4, 7, 2, 5, 6, /* 155*/ + 2, 3, 4, 7, 0, 1, 5, 6, /* 156*/ + 0, 2, 3, 4, 7, 1, 5, 6, /* 157*/ + 1, 2, 3, 4, 7, 0, 5, 6, /* 158*/ + 0, 1, 2, 3, 4, 7, 5, 6, /* 159*/ + 5, 7, 0, 1, 2, 3, 4, 6, /* 160*/ + 0, 5, 7, 1, 2, 3, 4, 6, /* 161*/ + 1, 5, 7, 0, 2, 3, 4, 6, /* 162*/ + 0, 1, 5, 7, 2, 3, 4, 6, /* 163*/ + 2, 5, 7, 0, 1, 3, 4, 6, /* 164*/ + 0, 2, 5, 7, 1, 3, 4, 6, /* 165*/ + 1, 2, 5, 7, 0, 3, 4, 6, /* 166*/ + 0, 1, 2, 5, 7, 3, 4, 6, /* 167*/ + 3, 5, 7, 0, 1, 2, 4, 6, /* 168*/ + 0, 3, 5, 7, 1, 2, 4, 6, /* 169*/ + 1, 3, 5, 7, 0, 2, 4, 6, /* 170*/ + 0, 1, 3, 5, 7, 2, 4, 6, /* 171*/ + 2, 3, 5, 7, 0, 1, 4, 6, /* 172*/ + 0, 2, 3, 5, 7, 1, 4, 6, /* 173*/ + 1, 2, 3, 5, 7, 0, 4, 6, /* 174*/ + 0, 1, 2, 3, 5, 7, 4, 6, /* 175*/ + 4, 5, 7, 0, 1, 2, 3, 6, /* 176*/ + 0, 4, 5, 7, 1, 2, 3, 6, /* 177*/ + 1, 4, 5, 7, 0, 2, 3, 6, /* 178*/ + 0, 1, 4, 5, 7, 2, 3, 6, /* 179*/ + 2, 4, 5, 7, 0, 1, 3, 6, /* 180*/ + 0, 2, 4, 5, 7, 1, 3, 6, /* 181*/ + 1, 2, 4, 5, 7, 0, 3, 6, /* 182*/ + 0, 1, 2, 4, 5, 7, 3, 6, /* 183*/ + 3, 4, 5, 7, 0, 1, 2, 6, /* 184*/ + 0, 3, 4, 5, 7, 1, 2, 6, /* 185*/ + 1, 3, 4, 5, 7, 0, 2, 6, /* 186*/ + 0, 1, 3, 4, 5, 7, 2, 6, /* 187*/ + 2, 3, 4, 5, 7, 0, 1, 6, /* 188*/ + 0, 2, 3, 4, 5, 7, 1, 6, /* 189*/ + 1, 2, 3, 4, 5, 7, 0, 6, /* 190*/ + 0, 1, 2, 3, 4, 5, 7, 6, /* 191*/ + 6, 7, 0, 1, 2, 3, 4, 5, /* 192*/ + 0, 6, 7, 1, 2, 3, 4, 5, /* 193*/ + 1, 6, 7, 0, 2, 3, 4, 5, /* 194*/ + 0, 1, 6, 7, 2, 3, 4, 5, /* 195*/ + 2, 6, 7, 0, 1, 3, 4, 5, /* 196*/ + 0, 2, 6, 7, 1, 3, 4, 5, /* 197*/ + 1, 2, 6, 7, 0, 3, 4, 5, /* 198*/ + 0, 1, 2, 6, 7, 3, 4, 5, /* 199*/ + 3, 6, 7, 0, 1, 2, 4, 5, /* 200*/ + 0, 3, 6, 7, 1, 2, 4, 5, /* 201*/ + 1, 3, 6, 7, 0, 2, 4, 5, /* 202*/ + 0, 1, 3, 6, 7, 2, 4, 5, /* 203*/ + 2, 3, 6, 7, 0, 1, 4, 5, /* 204*/ + 0, 2, 3, 6, 7, 1, 4, 5, /* 205*/ + 1, 2, 3, 6, 7, 0, 4, 5, /* 206*/ + 0, 1, 2, 3, 6, 7, 4, 5, /* 207*/ + 4, 6, 7, 0, 1, 2, 3, 5, /* 208*/ + 0, 4, 6, 7, 1, 2, 3, 5, /* 209*/ + 1, 4, 6, 7, 0, 2, 3, 5, /* 210*/ + 0, 1, 4, 6, 7, 2, 3, 5, /* 211*/ + 2, 4, 6, 7, 0, 1, 3, 5, /* 212*/ + 0, 2, 4, 6, 7, 1, 3, 5, /* 213*/ + 1, 2, 4, 6, 7, 0, 3, 5, /* 214*/ + 0, 1, 2, 4, 6, 7, 3, 5, /* 215*/ + 3, 4, 6, 7, 0, 1, 2, 5, /* 216*/ + 0, 3, 4, 6, 7, 1, 2, 5, /* 217*/ + 1, 3, 4, 6, 7, 0, 2, 5, /* 218*/ + 0, 1, 3, 4, 6, 7, 2, 5, /* 219*/ + 2, 3, 4, 6, 7, 0, 1, 5, /* 220*/ + 0, 2, 3, 4, 6, 7, 1, 5, /* 221*/ + 1, 2, 3, 4, 6, 7, 0, 5, /* 222*/ + 0, 1, 2, 3, 4, 6, 7, 5, /* 223*/ + 5, 6, 7, 0, 1, 2, 3, 4, /* 224*/ + 0, 5, 6, 7, 1, 2, 3, 4, /* 225*/ + 1, 5, 6, 7, 0, 2, 3, 4, /* 226*/ + 0, 1, 5, 6, 7, 2, 3, 4, /* 227*/ + 2, 5, 6, 7, 0, 1, 3, 4, /* 228*/ + 0, 2, 5, 6, 7, 1, 3, 4, /* 229*/ + 1, 2, 5, 6, 7, 0, 3, 4, /* 230*/ + 0, 1, 2, 5, 6, 7, 3, 4, /* 231*/ + 3, 5, 6, 7, 0, 1, 2, 4, /* 232*/ + 0, 3, 5, 6, 7, 1, 2, 4, /* 233*/ + 1, 3, 5, 6, 7, 0, 2, 4, /* 234*/ + 0, 1, 3, 5, 6, 7, 2, 4, /* 235*/ + 2, 3, 5, 6, 7, 0, 1, 4, /* 236*/ + 0, 2, 3, 5, 6, 7, 1, 4, /* 237*/ + 1, 2, 3, 5, 6, 7, 0, 4, /* 238*/ + 0, 1, 2, 3, 5, 6, 7, 4, /* 239*/ + 4, 5, 6, 7, 0, 1, 2, 3, /* 240*/ + 0, 4, 5, 6, 7, 1, 2, 3, /* 241*/ + 1, 4, 5, 6, 7, 0, 2, 3, /* 242*/ + 0, 1, 4, 5, 6, 7, 2, 3, /* 243*/ + 2, 4, 5, 6, 7, 0, 1, 3, /* 244*/ + 0, 2, 4, 5, 6, 7, 1, 3, /* 245*/ + 1, 2, 4, 5, 6, 7, 0, 3, /* 246*/ + 0, 1, 2, 4, 5, 6, 7, 3, /* 247*/ + 3, 4, 5, 6, 7, 0, 1, 2, /* 248*/ + 0, 3, 4, 5, 6, 7, 1, 2, /* 249*/ + 1, 3, 4, 5, 6, 7, 0, 2, /* 250*/ + 0, 1, 3, 4, 5, 6, 7, 2, /* 251*/ + 2, 3, 4, 5, 6, 7, 0, 1, /* 252*/ + 0, 2, 3, 4, 5, 6, 7, 1, /* 253*/ + 1, 2, 3, 4, 5, 6, 7, 0, /* 254*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 255*/ +}; + +static inline __m256i _mm256_cmplt_epi32(__m256i a, __m256i b) { + return _mm256_cmpgt_epi32(b, a); +} + + +static void avx_natenodutch_partition_epi32(int32_t *array, const int32_t pivot, + int &left, int &right) { + const __m256i P = _mm256_set1_epi32(pivot); + const int valuespervector = sizeof(__m256i) / sizeof(int32_t); + if (right - left + 1 < + 4 * valuespervector) { // not enough space for nate's algo to make sense, falling back + scalar_partition(array, pivot, left, right); + return; + } + int readleft = left + valuespervector; + int readright = right - valuespervector; + int32_t buffer[2 * valuespervector]; // tmp buffer + memcpy(buffer, array + left, valuespervector * sizeof(int32_t)); + memcpy(buffer + valuespervector, array + right - valuespervector + 1, + valuespervector * sizeof(int32_t)); + while (readright - readleft >= valuespervector) { + __m256i *nextVectorPtr; + if ((readleft - left) > (right - readright)) { + nextVectorPtr = (__m256i *)(array + readright - valuespervector + 1); + readright -= valuespervector; + } else { + nextVectorPtr = (__m256i *)(array + readleft); + readleft += valuespervector; + } + __m256i currentVector = _mm256_loadu_si256(nextVectorPtr); + /****** + * we need two comparisons if we are to avoid the Dutch national flag + * problem, + * Reference: https://en.wikipedia.org/wiki/Dutch_national_flag_problem + */ + int greaterthanpivot = + _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(currentVector, P)); + int lesserthanpivot = + _mm256_movemask_ps((__m256)_mm256_cmplt_epi32(currentVector, P)); + __m256i greaterthanpivot_permvector = _mm256_load_si256( + (__m256i *)(reverseshufflemask + 8 * greaterthanpivot)); + __m256i lesserthanpivot_permvector = + _mm256_load_si256((__m256i *)(shufflemask + 8 * lesserthanpivot)); + + int count_greaterthanpivot = _mm_popcnt_u32(greaterthanpivot); + int count_lesserthanpivot = _mm_popcnt_u32(lesserthanpivot); + __m256i greaterthanpivot_vector = + _mm256_permutevar8x32_epi32(currentVector, greaterthanpivot_permvector); + __m256i lesserthanpivot_vector = + _mm256_permutevar8x32_epi32(currentVector, lesserthanpivot_permvector); + _mm256_storeu_si256((__m256i *)(array + left), lesserthanpivot_vector); + + _mm256_storeu_si256((__m256i *)(array + right - valuespervector + 1), + greaterthanpivot_vector); + + left += count_lesserthanpivot; + right -= count_greaterthanpivot; + } + + int howmanyleft = readleft - left; + if (howmanyleft > 2 * valuespervector) { + howmanyleft = 2 * valuespervector; + for (int k = howmanyleft; k < readleft - left; ++k) + array[left + k] = pivot; + } + memcpy(array + left, buffer, howmanyleft * sizeof(int32_t)); + + int howmanyright = right - readright; + if (howmanyright + howmanyleft > 2 * valuespervector) { + howmanyright = 2 * valuespervector - howmanyleft; + for (int k = howmanyright; k < right - readright; ++k) + array[readright + 1 + k] = pivot; + } + + memcpy(array + readright + 1, buffer + howmanyleft, + howmanyright * sizeof(int32_t)); + scalar_partition(array, pivot, left, right); +} + + +// it is really signed int sorting, but the API seems to expect uint32_t... +void avx_natenodutch_quicksort(uint32_t *array, int left, int right) { + + int i = left; + int j = right; + const int32_t pivot = array[(i + j) / 2]; + + avx_natenodutch_partition_epi32((int32_t *) array, pivot, i, j); + + if (left < j) { + avx_natenodutch_quicksort(array, left, j); + } + + if (i < right) { + avx_natenodutch_quicksort(array, i, right); + } +} diff --git a/simd-sort/avx2-partition.cpp b/simd-sort/avx2-partition.cpp new file mode 100644 index 0000000..08f5cd8 --- /dev/null +++ b/simd-sort/avx2-partition.cpp @@ -0,0 +1,197 @@ +#include +#include + +namespace qs { + + namespace avx2 { + + __m256i FORCE_INLINE bitmask_to_bytemask_epi32(uint8_t bm) { + + const __m256i mask = _mm256_set1_epi32(bm); + const __m256i bits = _mm256_setr_epi32(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80); + const __m256i tmp = _mm256_and_si256(mask, bits); + + return _mm256_cmpeq_epi32(tmp, bits); + } + + + void FORCE_INLINE align_masks(uint8_t& a, uint8_t& b, uint8_t& rem_a, uint8_t& rem_b, __m256i& shuffle_a, __m256i& shuffle_b) { + + assert(a != 0); + assert(b != 0); + + uint8_t tmpA = a; + uint8_t tmpB = b; + + uint32_t __attribute__((__aligned__(32))) tmpshufa[8]; + uint32_t __attribute__((__aligned__(32))) tmpshufb[8]; + + while (tmpA != 0 && tmpB != 0) { + int idx_a = __builtin_ctz(tmpA); + int idx_b = __builtin_ctz(tmpB); + + tmpA = tmpA & (tmpA - 1); + tmpB = tmpB & (tmpB - 1); + + tmpshufb[idx_a] = idx_b; + tmpshufa[idx_b] = idx_a; + } + + a = a ^ tmpA; + b = b ^ tmpB; + + assert(a != 0); + assert(b != 0); + assert(_mm_popcnt_u64(a) == _mm_popcnt_u64(b)); + + rem_a = tmpA; + rem_b = tmpB; + + shuffle_a = _mm256_load_si256((__m256i*)tmpshufa); + shuffle_b = _mm256_load_si256((__m256i*)tmpshufb); + } + + + __m256i FORCE_INLINE merge(const __m256i mask, const __m256i a, const __m256i b) { + + return _mm256_or_si256( + _mm256_and_si256(mask, a), + _mm256_andnot_si256(mask, b) + ); + } + + + void FORCE_INLINE swap_epi32( + __m256i& a, __m256i& b, + uint8_t mask_a, const __m256i shuffle_a, + uint8_t mask_b, const __m256i shuffle_b) { + + const __m256i to_swap_b = _mm256_permutevar8x32_epi32(a, shuffle_a); + const __m256i to_swap_a = _mm256_permutevar8x32_epi32(b, shuffle_b); + const __m256i ma = bitmask_to_bytemask_epi32(mask_a); + const __m256i mb = bitmask_to_bytemask_epi32(mask_b); + + a = merge(ma, to_swap_a, a); + b = merge(mb, to_swap_b, b); + } + + + +#define _mm256_iszero(vec) (_mm256_testz_si256(vec, vec) != 0) + + void FORCE_INLINE partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) { + + const int N = 8; // the number of items in a register (256/32) + + __m256i L; + __m256i R; + uint8_t maskL = 0; + uint8_t maskR = 0; + + const __m256i pivot = _mm256_set1_epi32(pv); + + int origL = left; + int origR = right; + + while (true) { + + if (maskL == 0) { + while (true) { + if (right - (left + N) + 1 < 2*N) { + goto end; + } + + L = _mm256_loadu_si256((__m256i*)(array + left)); + const __m256i bytemask = _mm256_cmpgt_epi32(pivot, L); + + if (_mm256_testc_ps((__m256)bytemask, (__m256)_mm256_set1_epi32(-1))) { + left += N; + } else { + maskL = ~_mm256_movemask_ps((__m256)bytemask); + break; + } + } + + } + + if (maskR == 0) { + while (true) { + if ((right - N) - left + 1 < 2*N) { + goto end; + } + + R = _mm256_loadu_si256((__m256i*)(array + right - N + 1)); + const __m256i bytemask = _mm256_cmpgt_epi32(pivot, R); + if (_mm256_iszero(bytemask)) { + right -= N; + } else { + maskR = _mm256_movemask_ps((__m256)bytemask); + break; + } + } + + } + + assert(left <= right); + assert(maskL != 0); + assert(maskR != 0); + + uint8_t mL; + uint8_t mR; + __m256i shuffleL; + __m256i shuffleR; + + align_masks(maskL, maskR, mL, mR, shuffleL, shuffleR); + swap_epi32(L, R, maskL, shuffleL, maskR, shuffleR); + + maskL = mL; + maskR = mR; + + if (maskL == 0) { + _mm256_storeu_si256((__m256i*)(array + left), L); + left += N; + } + + if (maskR == 0) { + _mm256_storeu_si256((__m256i*)(array + right - N + 1), R); + right -= N; + } + + } // while + + end: + + assert(!(maskL != 0 && maskR != 0)); + + if (maskL != 0) { + _mm256_storeu_si256((__m256i*)(array + left), L); + } else if (maskR != 0) { + _mm256_storeu_si256((__m256i*)(array + right - N + 1), R); + } + + if (left < right) { + int less = 0; + int greater = 0; + const int all = right - left + 1; + + for (int i=left; i <= right; i++) { + less += int(array[i] < pv); + greater += int(array[i] > pv); + } + + if (all == less) { + // all elements in range [left, right] less than pivot + scalar_partition_epi32(array, pv, origL, left); + } else if (all == greater) { + // all elements in range [left, right] greater than pivot + scalar_partition_epi32(array, pv, left, origR); + } else { + scalar_partition_epi32(array, pv, left, right); + } + } + } + + } // namespace avx2 + +} // namespace qs + diff --git a/simd-sort/avx2-quicksort.cpp b/simd-sort/avx2-quicksort.cpp new file mode 100644 index 0000000..3fc3859 --- /dev/null +++ b/simd-sort/avx2-quicksort.cpp @@ -0,0 +1,32 @@ +#include "avx2-partition.cpp" + +namespace qs { + + namespace avx2 { + + void quicksort(uint32_t* array, int left, int right) { + + int i = left; + int j = right; + + const uint32_t pivot = array[(i + j)/2]; + const int AVX2_REGISTER_SIZE = 8; // in 32-bit words + + if (j - i >= 2 * AVX2_REGISTER_SIZE) { + qs::avx2::partition_epi32(array, pivot, i, j); + } else { + scalar_partition_epi32(array, pivot, i, j); + } + + if (left < j) { + quicksort(array, left, j); + } + + if (i < right) { + quicksort(array, i, right); + } + } + + } // namespace avx2 + +} // namespace qs diff --git a/simd-sort/avx512-auxbuffer-partition.cpp b/simd-sort/avx512-auxbuffer-partition.cpp new file mode 100644 index 0000000..29a1391 --- /dev/null +++ b/simd-sort/avx512-auxbuffer-partition.cpp @@ -0,0 +1,90 @@ +namespace qs { + + namespace avx512 { + + void memset_epi32(uint32_t* array, uint32_t w, size_t n) { + + const int N = 16; + const __m512i word = _mm512_set1_epi32(w); + + for (size_t i=0; i < n/N; i++) { + _mm512_storeu_si512(array + i*N, word); + } + + for (size_t i=n/N * N; i < n; i++) { + array[i] = w; + } + } + + void memcpy_epi32(uint32_t* dst, uint32_t* src, size_t n) { + + const int N = 16; + + for (size_t i=0; i < n/N; i++) { + _mm512_storeu_si512(dst + i*N, _mm512_loadu_si512(src + i*N)); + } + + for (size_t i=n/N * N; i < n; i++) { + dst[i] = src[i]; + } + } + + // parition array[0..n-1] + uint32_t FORCE_INLINE partition_auxbuffer_epi32(uint32_t* array, size_t n, uint32_t pv) { + + const int N = 16; + const int AUX_COUNT = 1024; // 4kB + + static uint32_t gt_buf[AUX_COUNT + N]; + + size_t lt_count = 0; + size_t gt_count = 0; + + const __m512i pivot = _mm512_set1_epi32(pv); + + // 1. copy greater and less values into separate buffers + for (size_t i=0; i < n / N; i++) { + + const __m512i v = _mm512_loadu_si512(array + i*N); + + const __mmask16 lt = _mm512_cmplt_epi32_mask(v, pivot); + const __mmask16 gt = _mm512_cmpgt_epi32_mask(v, pivot); + + const __m512i less = _mm512_maskz_compress_epi32(lt, v); + const __m512i greater = _mm512_maskz_compress_epi32(gt, v); + + _mm512_storeu_si512(array + lt_count, less); + _mm512_storeu_si512(gt_buf + gt_count, greater); + + lt_count += _mm_popcnt_u32(lt); + gt_count += _mm_popcnt_u32(gt); + } + + for (size_t i=0; i < n % N; i++) { + + const uint32_t v = array[(n/N) * N + i]; + + if (v < pv) { + array[lt_count++] = v; + } else if (v > pv) { + gt_buf[gt_count++] = v; + } + } + + const size_t eq_count = n - (lt_count + gt_count); + + // 2. replace array with partially ordered data + + // 2.a. pivots + memset_epi32(array + lt_count, pv, eq_count); + + // 2.b. all values greater than pivot + memcpy_epi32(array + lt_count + eq_count, gt_buf, gt_count); + + // 3. index before the first pivot + return lt_count; + } + + } // namespace avx512 + +} // namespace qa diff --git a/simd-sort/avx512-bmi2-partition.cpp b/simd-sort/avx512-bmi2-partition.cpp new file mode 100644 index 0000000..c2dac5d --- /dev/null +++ b/simd-sort/avx512-bmi2-partition.cpp @@ -0,0 +1,105 @@ +namespace qs { + + namespace avx512 { + + void FORCE_INLINE bmi2_align_masks(__mmask16& a, __mmask16& b, __mmask16& rem_a, __mmask16& rem_b) { + + uint32_t tmpA = _pext_u32(0xffffffff, a); + uint32_t tmpB = _pext_u32(0xffffffff, b); + + uint32_t tmp = tmpA & tmpB; + + __mmask16 new_a = _pdep_u32(tmp, a); + __mmask16 new_b = _pdep_u32(tmp, a); + + rem_a = a ^ new_a; + rem_b = b ^ new_b; + + a = new_a; + b = new_b; + + assert(a != 0); + assert(b != 0); + assert(_mm_popcnt_u64(a) == _mm_popcnt_u64(b)); + } + + + void FORCE_INLINE bmi2_partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) { + + const int N = 16; + + __m512i L; + __m512i R; + __mmask16 maskL = 0; + __mmask16 maskR = 0; + const __m512i pivot = _mm512_set1_epi32(pv); + + while (right - left + 1 >= 2*N) { + + while (maskL == 0) { + if (right - (left + N) + 1 < 2*N) { + goto end; + } + + L = _mm512_loadu_si512(array + left); + maskL = _mm512_cmpge_epi32_mask(L, pivot); + if (maskL == 0) { + left += N; + } + } + + while (maskR == 0) { + if ((right - N) - left + 1 < 2*N) { + goto end; + } + + R = _mm512_loadu_si512(array + right - N + 1); + maskR = _mm512_cmple_epi32_mask(R, pivot); + if (maskR == 0) { + right -= N; + } + } + + assert(left <= right); + assert(maskL != 0); + assert(maskR != 0); + + __mmask16 mL; + __mmask16 mR; + + bmi2_align_masks(maskL, maskR, mL, mR); + swap_epi32(L, R, maskL, maskR); + + maskL = mL; + maskR = mR; + + if (maskL == 0) { + _mm512_storeu_si512(array + left, L); + left += N; + } + + if (maskR == 0) { + _mm512_storeu_si512(array + right - N + 1, R); + right -= N; + } + + } // while + + end: + + if (maskL != 0) { + _mm512_storeu_si512(array + left, L); + } + + if (maskR != 0) { + _mm512_storeu_si512(array + right - N + 1, R); + } + + if (left < right) { + partition_epi32(array, pv, left, right); + } + } + + } // namespace avx512 + +} // namespace qs diff --git a/simd-sort/avx512-partition-register.cpp b/simd-sort/avx512-partition-register.cpp new file mode 100644 index 0000000..a52100d --- /dev/null +++ b/simd-sort/avx512-partition-register.cpp @@ -0,0 +1,58 @@ +namespace qs { + + namespace avx512 { + + __mmask16 FORCE_INLINE get_range_mask(uint32_t position, uint32_t length) { + + return (uint32_t(1) << (position + length)) - (uint32_t(1) << position); + } + + /* + r stores 1 to 16 elements to partition, mask select leading element + */ + void partition_register(__m512i& r, const __m512i pivot, __mmask16 mask, int& left, int& right) { + + // example: + + // r = [ 1, 2, 3, 100, 200, 300, 4, 5, 50, 900, -1, -2, -3, -4, -5, -6] + // mask = 0x03ff -- first 10 elements are subject to parition + // pivot = packed_dword(50) + + const __mmask16 less_mask = _mm512_mask_cmplt_epi32_mask(mask, r, pivot); + const __mmask16 equal_mask = _mm512_mask_cmpeq_epi32_mask(mask, r, pivot); + + // less = [1, 2, 3, 4, 5, ... rest are **pivots**] + const __m512i less_equal = _mm512_mask_compress_epi32(pivot, less_mask, r); + + // greater = [100, 200, 300, 900, ... rest are zeros] + const __m512i greater = _mm512_maskz_compress_epi32(~(less_mask | equal_mask) & mask, r); + + // less_cnt = 5 + // less_equal_cnt = 6 + const int less_cnt = _mm_popcnt_u32(less_mask); + const int less_equal_cnt = _mm_popcnt_u32(less_mask | equal_mask); + + const __mmask16 store_less_equal = get_range_mask(0, less_equal_cnt); + const __mmask16 store_greater = ~store_less_equal & mask; + + // merge less or eqaul with input + // + // r = [ 1, 2, 3, 4, 5, 50, 4, 5, 50, 900, -1, -2, -3, -4, -5, -6] + // ^ ^ ^ ^ ^ ^^ + // merged + r = _mm512_mask_mov_epi32(r, store_less_equal, less_equal); + + // merge greater than pivot + // + // r = [ 1, 2, 3, 4, 5, 50, 100, 200, 300, 900, -1, -2, -3, -4, -5, -6] + // ^^^ ^^^ ^^^ ^^^ + // merged + r = _mm512_mask_expand_epi32(r, store_greater, greater); + + right = left + less_cnt; + left = left + less_equal_cnt; + } + + } // namespace avx512 + +} // namespace qs diff --git a/simd-sort/avx512-partition.cpp b/simd-sort/avx512-partition.cpp new file mode 100644 index 0000000..31625ff --- /dev/null +++ b/simd-sort/avx512-partition.cpp @@ -0,0 +1,129 @@ +//#define USE_AVX512_PIVOT + +namespace qs { + + namespace avx512 { + + void FORCE_INLINE align_masks(__mmask16& a, __mmask16& b, __mmask16& rem_a, __mmask16& rem_b) { + + assert(a != 0); + assert(b != 0); + + uint16_t tmpA = a; + uint16_t tmpB = b; + + while (tmpA != 0 && tmpB != 0) { + tmpA = tmpA & (tmpA - 1); + tmpB = tmpB & (tmpB - 1); + } + + a = a ^ tmpA; + b = b ^ tmpB; + + assert(a != 0); + assert(b != 0); + assert(_mm_popcnt_u64(a) == _mm_popcnt_u64(b)); + + rem_a = tmpA; + rem_b = tmpB; + } + + + void FORCE_INLINE partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) { + + const int N = 16; + + __m512i L; + __m512i R; + __mmask16 maskL = 0; + __mmask16 maskR = 0; + const __m512i pivot = _mm512_set1_epi32(pv); + + int count = (right - left + 1)/N; + + while (true) { + + while (maskL == 0) { + if (count <= 1) { + goto end; + } + + L = _mm512_loadu_si512(array + left); + maskL = _mm512_cmpge_epi32_mask(L, pivot); + if (maskL == 0) { + left += N; + count -= 1; + } + } + + while (maskR == 0) { + if (count <= 1) { + goto end; + } + + R = _mm512_loadu_si512(array + right - N + 1); + maskR = _mm512_cmple_epi32_mask(R, pivot); + if (maskR == 0) { + right -= N; + count -= 1; + } + } + + assert(left <= right); + assert(maskL != 0); + assert(maskR != 0); + + __mmask16 mL; + __mmask16 mR; + + align_masks(maskL, maskR, mL, mR); + swap_epi32(L, R, maskL, maskR); + + maskL = mL; + maskR = mR; + + if (maskL == 0) { + _mm512_storeu_si512(array + left, L); + left += N; + count -= 1; + } + + if (maskR == 0) { + _mm512_storeu_si512(array + right - N + 1, R); + right -= N; + count -= 1; + } + + } // while + + end: + + assert(!(maskL != 0 && maskR != 0)); + + if (maskL != 0) { + _mm512_storeu_si512(array + left, L); + } else if (maskR != 0) { + _mm512_storeu_si512(array + right - N + 1, R); + } + + if (left < right) { +#if USE_AVX512_PIVOT + scalar_partition_epi32(array, pv, left, right); +#else + const int size = right - left + 1; + if (size > N) { + scalar_partition_epi32(array, pv, left, right); + } else { + const int k = left; + __mmask16 mask = (uint32_t(1) << size) - 1; + __m512i v = _mm512_loadu_si512(array + k); + partition_register(v, pivot, mask, left, right); + _mm512_storeu_si512(array + k, v); + } +#endif + } + } + + } // namespace avx512 + +} // namespace qa diff --git a/simd-sort/avx512-popcnt-partition.cpp b/simd-sort/avx512-popcnt-partition.cpp new file mode 100644 index 0000000..4319f0c --- /dev/null +++ b/simd-sort/avx512-popcnt-partition.cpp @@ -0,0 +1,119 @@ +namespace qs { + + namespace avx512 { + + void FORCE_INLINE popcnt_partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) { + + const int N = 16; + + __m512i L; + __m512i R; + __mmask16 maskL = 0; + __mmask16 maskR = 0; + int popcntL; + int popcntR; + + const __m512i pivot = _mm512_set1_epi32(pv); + + while (true) { + + while (maskL == 0) { + if (right - (left + N) + 1 < N) { + goto end; + } + + L = _mm512_loadu_si512(array + left); + maskL = _mm512_cmpge_epi32_mask(L, pivot); + if (maskL == 0) { + left += N; + } + + popcntL = _mm_popcnt_u32(maskL); + } + + while (maskR == 0) { + if ((right - N) - left + 1 < N) { + goto end; + } + + R = _mm512_loadu_si512(array + right - N + 1); + maskR = _mm512_cmple_epi32_mask(R, pivot); + if (maskR == 0) { + right -= N; + } + + popcntR = _mm_popcnt_u32(maskR); + } + + assert(left <= right); + assert(maskL != 0); + assert(maskR != 0); + + if (popcntL == popcntR) { + // fast path + swap_epi32(L, R, maskL, maskR); + maskL = maskR = 0; + + _mm512_storeu_si512(array + left, L); + left += N; + + _mm512_storeu_si512(array + right - N + 1, R); + right -= N; + } else { + if (popcntL < popcntR) { + int n = popcntR - popcntL; + int k = n; + + __mmask16 tmp = maskR; + while (k--) { + maskR = maskR & (maskR - 1); + } + + swap_epi32(L, R, maskL, maskR); + maskR = tmp ^ maskR; + + popcntR = n; + + _mm512_storeu_si512(array + left, L); + left += N; + maskL = 0; + } else { + int n = popcntL - popcntR; + int k = n; + + __mmask16 tmp = maskL; + while (k--) { + maskL = maskL & (maskL - 1); + } + + swap_epi32(L, R, maskL, maskR); + maskL = tmp ^ maskL; + + popcntL = n; + + _mm512_storeu_si512(array + right - N + 1, R); + right -= N; + maskR = 0; + } + } + + } // while + + end: + + assert(!(maskL != 0 && maskR != 0)); + + if (maskL != 0) { + _mm512_storeu_si512(array + left, L); + } else if (maskR != 0) { + _mm512_storeu_si512(array + right - N + 1, R); + } + + if (left < right) { + scalar_partition_epi32(array, pv, left, right); + } + } + + } // namespace avx512 + +} // namespace qs diff --git a/simd-sort/avx512-quicksort.cpp b/simd-sort/avx512-quicksort.cpp new file mode 100644 index 0000000..55a0e80 --- /dev/null +++ b/simd-sort/avx512-quicksort.cpp @@ -0,0 +1,211 @@ +#include "avx512-swap.cpp" +#include "avx512-partition-register.cpp" +#include "avx512-partition.cpp" +#include "avx512-popcnt-partition.cpp" +#include "avx512-bmi2-partition.cpp" +#include "avx512-auxbuffer-partition.cpp" +#include "avx512-sort-register.cpp" + + +namespace qs { + + namespace avx512 { + + + void dump(const char* name, const __m512i v) { + uint32_t tmp[16]; + + printf("%-10s = [", name); + _mm512_storeu_si512((__m512i*)tmp, v); + for (int i=0; i < 16; i++) { + if (i > 0) { + putchar(' '); + } + + printf("%08x", tmp[i]); + } + + printf("]\n"); + } + + void quicksort(uint32_t* array, int left, int right) { + + int i = left; + int j = right; + + const int size = j - i + 1; + if (size <= 16) { + const __m512i in = _mm512_loadu_si512(array + i); + __m512i result; + + switch (size) { + case 16: result = avx512_sort16_epi32(in); break; + case 15: result = avx512_sort15_epi32(in); break; + case 14: result = avx512_sort14_epi32(in); break; + case 13: result = avx512_sort13_epi32(in); break; + case 12: result = avx512_sort12_epi32(in); break; + case 11: result = avx512_sort11_epi32(in); break; + case 10: result = avx512_sort10_epi32(in); break; + case 9: result = avx512_sort9_epi32(in); break; + case 8: result = avx512_sort8_epi32(in); break; + case 7: result = avx512_sort7_epi32(in); break; + case 6: result = avx512_sort6_epi32(in); break; + case 5: result = avx512_sort5_epi32(in); break; + case 4: result = avx512_sort4_epi32(in); break; + case 3: result = avx512_sort3_epi32(in); break; + case 2: { + if (array[i] > array[j]) { + uint32_t t = array[i]; + array[i] = array[j]; + array[j] = t; + } + return; + } + + case 1: + case 0: + return; + + default: + assert(false); + break; + } + + _mm512_storeu_si512(array + i, result); + return; + } + + const uint32_t pivot = array[left]; + const int AVX512_REGISTER_SIZE = 16; // in 32-bit words + if (size >= 2 * AVX512_REGISTER_SIZE) { + ::qs::avx512::partition_epi32(array, pivot, i, j); + } else { + scalar_partition_epi32(array, pivot, i, j); + } + + if (left < j) { + quicksort(array, left, j); + } + + if (i < right) { + quicksort(array, i, right); + } + } + + + void popcnt_quicksort(uint32_t* array, int left, int right) { + + int i = left; + int j = right; + const int size = j - i + 1; + if (size <= 16) { + const __m512i in = _mm512_loadu_si512(array + i); + __m512i result; + + switch (size) { + case 16: result = avx512_sort16_epi32(in); break; + case 15: result = avx512_sort15_epi32(in); break; + case 14: result = avx512_sort14_epi32(in); break; + case 13: result = avx512_sort13_epi32(in); break; + case 12: result = avx512_sort12_epi32(in); break; + case 11: result = avx512_sort11_epi32(in); break; + case 10: result = avx512_sort10_epi32(in); break; + case 9: result = avx512_sort9_epi32(in); break; + case 8: result = avx512_sort8_epi32(in); break; + case 7: result = avx512_sort7_epi32(in); break; + case 6: result = avx512_sort6_epi32(in); break; + case 5: result = avx512_sort5_epi32(in); break; + case 4: result = avx512_sort4_epi32(in); break; + case 3: result = avx512_sort3_epi32(in); break; + case 2: { + if (array[i] > array[j]) { + uint32_t t = array[i]; + array[i] = array[j]; + array[j] = t; + } + return; + } + + case 1: + case 0: + return; + + default: + assert(false); + break; + } + + _mm512_storeu_si512(array + i, result); + return; + } + + const uint32_t pivot = array[(i + j)/2]; + const int AVX512_REGISTER_SIZE = 16; // in 32-bit words + if (size >= 2 * AVX512_REGISTER_SIZE) { + ::qs::avx512::popcnt_partition_epi32(array, pivot, i, j); + } else { + scalar_partition_epi32(array, pivot, i, j); + } + + if (left < j) { + popcnt_quicksort(array, left, j); + } + + if (i < right) { + popcnt_quicksort(array, i, right); + } + } + + + void bmi2_quicksort(uint32_t* array, int left, int right) { + + int i = left; + int j = right; + + const uint32_t pivot = array[left]; + const int AVX512_REGISTER_SIZE = 16; // in 32-bit words + + if (j - i >= 2 * AVX512_REGISTER_SIZE) { + ::qs::avx512::bmi2_partition_epi32(array, pivot, i, j); + } else { + scalar_partition_epi32(array, pivot, i, j); + } + + if (left < j) { + bmi2_quicksort(array, left, j); + } + + if (i < right) { + bmi2_quicksort(array, i, right); + } + } + + + void auxbuffer_quicksort(uint32_t* array, int lo, int hi) { + + if (lo >= hi) { + return; + } + + int p; +#if 0 + p = lomuto_partition_epi32(array, lo, hi); +#else + const auto n = (hi - lo + 1); + if (n > 1024) { + p = lomuto_partition_epi32(array, lo, hi); + } else if (n > 8) { + const uint32_t pivot = array[(hi + lo)/2]; + p = ::qs::avx512::partition_auxbuffer_epi32(array + lo, n, pivot) + lo; + } else { + p = lomuto_partition_epi32(array, lo, hi); + } +#endif + auxbuffer_quicksort(array, lo, p-1); + auxbuffer_quicksort(array, p+1, hi); + } + + } // namespace avx512 + +} // namespace qs + diff --git a/simd-sort/avx512-sort-register.cpp b/simd-sort/avx512-sort-register.cpp new file mode 100644 index 0000000..b711954 --- /dev/null +++ b/simd-sort/avx512-sort-register.cpp @@ -0,0 +1,323 @@ + +//#define POPCNT_LOOKUP + +#ifdef POPCNT_LOOKUP + + uint16_t lookup[65536]; + + // The lookup table replaces 2 x popcount and 2 x shift with 1 bit-or and 2 x memory fetch + void prepare_lookup() { + for (int i=0; i < 65536; i++) { + lookup[i] = 1 << _mm_popcnt_u32(i); + } + } + #define STEP(unused) { \ + const __m512i b = _mm512_permutexvar_epi32(index, v); \ + index = _mm512_add_epi32(index, incr); \ + const uint16_t lt = _mm512_mask_cmplt_epi32_mask(sort_mask, v, b); \ + const uint16_t eq = _mm512_mask_cmpeq_epi32_mask(sort_mask, v, b); \ + const uint16_t mask = lookup[lt | eq] - lookup[lt]; \ + result = _mm512_mask_mov_epi32(result, mask, b); \ + } + +#else + + #define STEP(unused) { \ + const __m512i b = _mm512_permutexvar_epi32(index, v); \ + index = _mm512_add_epi32(index, incr); \ + const uint16_t lt = _mm_popcnt_u32(_mm512_mask_cmplt_epi32_mask(sort_mask, v, b)); \ + const uint16_t eq = _mm_popcnt_u32(_mm512_mask_cmpeq_epi32_mask(sort_mask, v, b)); \ + const uint16_t mask = (uint32_t(1) << (lt + eq)) - (uint32_t(1) << lt); \ + result = _mm512_mask_mov_epi32(result, mask, b); \ + } + +#endif + +__m512i avx512_sort16_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0xffff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + STEP(8); + STEP(9); + STEP(10); + STEP(11); + STEP(12); + STEP(13); + STEP(14); + STEP(15); + + return result; +} + +__m512i avx512_sort15_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x7fff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + STEP(8); + STEP(9); + STEP(10); + STEP(11); + STEP(12); + STEP(13); + STEP(14); + + return result; +} + +__m512i avx512_sort14_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x3fff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + STEP(8); + STEP(9); + STEP(10); + STEP(11); + STEP(12); + STEP(13); + + return result; +} + +__m512i avx512_sort13_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x1fff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + STEP(8); + STEP(9); + STEP(10); + STEP(11); + STEP(12); + + return result; +} + +__m512i avx512_sort12_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x0fff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + STEP(8); + STEP(9); + STEP(10); + STEP(11); + + return result; +} + +__m512i avx512_sort11_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x07ff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + STEP(8); + STEP(9); + STEP(10); + + return result; +} + +__m512i avx512_sort10_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x03ff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + STEP(8); + STEP(9); + + return result; +} + +__m512i avx512_sort9_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x01ff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + STEP(8); + + return result; +} + +__m512i avx512_sort8_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x00ff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + + return result; +} + +__m512i avx512_sort7_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x007f; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + + return result; +} + +__m512i avx512_sort6_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x003f; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + + return result; +} + +__m512i avx512_sort5_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x001f; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + + return result; +} + +__m512i avx512_sort4_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x000f; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + + return result; +} + +__m512i avx512_sort3_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x0007; + STEP(0); + STEP(1); + STEP(2); + + return result; +} + +#undef STEP diff --git a/simd-sort/avx512-swap.cpp b/simd-sort/avx512-swap.cpp new file mode 100644 index 0000000..c9d5e80 --- /dev/null +++ b/simd-sort/avx512-swap.cpp @@ -0,0 +1,20 @@ +namespace qs { + namespace avx512 { + + void swap_epi32(__m512i& a, __m512i& b, __mmask16 mask_a, __mmask16 mask_b) { + + assert(mask_a != 0); + assert(mask_b != 0); + assert(_mm_popcnt_u64(mask_a) == _mm_popcnt_u64(mask_b)); + + const __m512i to_swap_b = _mm512_mask_compress_epi32(_mm512_setzero_si512(), mask_a, a); + const __m512i to_swap_a = _mm512_mask_compress_epi32(_mm512_setzero_si512(), mask_b, b); + + a = _mm512_mask_expand_epi32(a, mask_a, to_swap_a); + b = _mm512_mask_expand_epi32(b, mask_b, to_swap_b); + } + + + } // namespace avx512 + +} // namespace qa diff --git a/simd-sort/cmdline.cpp b/simd-sort/cmdline.cpp new file mode 100644 index 0000000..59c6282 --- /dev/null +++ b/simd-sort/cmdline.cpp @@ -0,0 +1,28 @@ +#include +#include +#include + +class CommandLine final { + int argc; + char** argv; + + std::unordered_set arguments; + +public: + CommandLine(int argc, char* argv[]) + : argc(argc) + , argv(argv) { + + for (int i=1; i < argc; i++) { + arguments.insert(argv[i]); + } + } + + bool has(const std::string& name) const { + return arguments.count(name); + } + + bool empty() const { + return arguments.empty(); + } +}; diff --git a/simd-sort/common.h b/simd-sort/common.h new file mode 100644 index 0000000..f2fc23b --- /dev/null +++ b/simd-sort/common.h @@ -0,0 +1,3 @@ +#pragma once + +#define FORCE_INLINE inline __attribute__((always_inline)) diff --git a/simd-sort/gather_results.sh b/simd-sort/gather_results.sh new file mode 100755 index 0000000..b5086d7 --- /dev/null +++ b/simd-sort/gather_results.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +if [[ $1 == "avx2" ]] +then + SPEED=./speed_avx2 + if [[ ! -x $SPEED ]] + then + echo "AVX2 test program not found. Run make speed_avx2 to build it." + exit 1 + fi +else + SPEED=./speed + if [[ ! -x $SPEED ]] + then + echo "AVX512 test program not found. Run make speed to build it." + exit 1 + fi +fi + + +$SPEED 10000 3 asc | tee -a results.txt +$SPEED 100000 3 asc | tee -a results.txt +$SPEED 1000000 3 asc | tee -a results.txt +$SPEED 2000000 3 asc | tee -a results.txt +$SPEED 5000000 3 asc | tee -a results.txt +$SPEED 10000000 3 asc | tee -a results.txt +$SPEED 20000000 3 asc | tee -a results.txt + +$SPEED 10000 3 dsc | tee -a results.txt +$SPEED 100000 3 dsc | tee -a results.txt +$SPEED 1000000 3 dsc | tee -a results.txt +$SPEED 2000000 3 dsc | tee -a results.txt +$SPEED 5000000 3 dsc | tee -a results.txt +$SPEED 10000000 3 dsc | tee -a results.txt +$SPEED 20000000 3 dsc | tee -a results.txt + +$SPEED 10000 3 rnd | tee -a results.txt +$SPEED 100000 3 rnd | tee -a results.txt +$SPEED 1000000 3 rnd | tee -a results.txt +$SPEED 2000000 3 rnd | tee -a results.txt +$SPEED 5000000 3 rnd | tee -a results.txt +$SPEED 10000000 3 rnd | tee -a results.txt +$SPEED 20000000 3 rnd | tee -a results.txt + +echo "results.txt created" diff --git a/simd-sort/gettime.cpp b/simd-sort/gettime.cpp new file mode 100644 index 0000000..5f95269 --- /dev/null +++ b/simd-sort/gettime.cpp @@ -0,0 +1,8 @@ +#include +#include + +uint32_t get_time() { + struct timeval T; + gettimeofday(&T, NULL); + return (T.tv_sec * 1000000) + T.tv_usec; +} diff --git a/simd-sort/input_data.cpp b/simd-sort/input_data.cpp new file mode 100644 index 0000000..7804aaa --- /dev/null +++ b/simd-sort/input_data.cpp @@ -0,0 +1,106 @@ +class InputData { + +protected: + uint32_t* array; + size_t n; + +public: + InputData(size_t count) + : n(count) { + + assert(n > 0); + array = new uint32_t[n]; + } + + ~InputData() { + delete[] array; + } + +public: + uint32_t* pointer() { + return array; + } + + size_t count() const { + return n; + } + + size_t size() const { + return n * sizeof(uint32_t); + } +}; + + +class InputAscending: public InputData { + + using super = InputData; + +public: + InputAscending(size_t count) : super(count) { + for (size_t i=0; i < n; i++) { + array[i] = i; + } + } +}; + + +class InputDescending: public InputData { + + using super = InputData; + +public: + InputDescending(size_t count) : super(count) { + for (size_t i=0; i < n; i++) { + array[i] = n - i + 1; + } + } +}; + +class InputRandomFew: public InputData { + + using super = InputData; + +public: + InputRandomFew(size_t count) : super(count) { + for (size_t i=0; i < n; i++) { + array[i] = random() % 10; + } + } +}; + +class InputRandom: public InputData { + + using super = InputData; + +public: + InputRandom(size_t count) : super(count) { + for (size_t i=0; i < n; i++) { + array[i] = random(); + } + } +}; + +class InputRandomUnique: public InputData { + + using super = InputData; + +public: + InputRandomUnique(size_t count) : super(count) { + for (size_t i=0; i < n; i++) { + array[i] = i; + } + + shuffle(); + } + +private: + void shuffle() { + for (size_t i=0; i < n; i++) { + size_t j = rand() % (n - i); + + const uint32_t t = array[i]; + array[i] = array[j]; + array[j] = t; + } + } +}; diff --git a/simd-sort/partition.cpp b/simd-sort/partition.cpp new file mode 100644 index 0000000..762cc4b --- /dev/null +++ b/simd-sort/partition.cpp @@ -0,0 +1,51 @@ +void scalar_partition_epi32(uint32_t* array, const uint32_t pivot, int& left, int& right) { + + while (left <= right) { + + while (array[left] < pivot) { + left += 1; + } + + while (array[right] > pivot) { + right -= 1; + } + + if (left <= right) { + const uint32_t t = array[left]; + array[left] = array[right]; + array[right] = t; + + left += 1; + right -= 1; + } + } +} + + +int lomuto_partition_epi32(uint32_t* array, int lo, int hi) { + + const uint32_t pivot = array[(lo + hi)/2]; + const uint32_t hi_value = array[hi]; + + array[(lo + hi)/2] = hi_value; + array[hi] = pivot; + + int i = lo; + for (int j=lo; j < hi; j++) { + if (array[j] <= pivot) { + const uint32_t t = array[i]; + array[i] = array[j]; + array[j] = t; + i += 1; + } + } + + { + const uint32_t t = array[i]; + array[i] = array[hi]; + array[hi] = t; + i += 1; + } + + return i; +} diff --git a/simd-sort/quicksort-all.cpp b/simd-sort/quicksort-all.cpp new file mode 100644 index 0000000..4037646 --- /dev/null +++ b/simd-sort/quicksort-all.cpp @@ -0,0 +1,9 @@ +#include "common.h" +#include "quicksort.cpp" +#ifdef HAVE_AVX2_INSTRUCTIONS +# include "avx2-quicksort.cpp" +#endif +#ifdef HAVE_AVX512F_INSTRUCTIONS +# include "avx512-quicksort.cpp" +#endif + diff --git a/simd-sort/quicksort.cpp b/simd-sort/quicksort.cpp new file mode 100644 index 0000000..82aa462 --- /dev/null +++ b/simd-sort/quicksort.cpp @@ -0,0 +1,19 @@ +#include "partition.cpp" + +void quicksort(uint32_t* array, int left, int right) { + + int i = left; + int j = right; + + const uint32_t pivot = array[(i + j)/2]; + + scalar_partition_epi32(array, pivot, i, j); + + if (left < j) { + quicksort(array, left, j); + } + + if (i < right) { + quicksort(array, i, right); + } +} diff --git a/simd-sort/rdtsc.cpp b/simd-sort/rdtsc.cpp new file mode 100644 index 0000000..4228252 --- /dev/null +++ b/simd-sort/rdtsc.cpp @@ -0,0 +1,52 @@ +#define RDTSC_START(cycles) \ + do { \ + uint32_t cyc_high, cyc_low; \ + __asm volatile("cpuid\n" \ + "rdtsc\n" \ + "mov %%edx, %0\n" \ + "mov %%eax, %1" : \ + "=r" (cyc_high), \ + "=r"(cyc_low) : \ + : /* no read only */ \ + "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \ + ); \ + (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ + } while (0) + +#define RDTSC_STOP(cycles) \ + do { \ + uint32_t cyc_high, cyc_low; \ + __asm volatile("rdtscp\n" \ + "mov %%edx, %0\n" \ + "mov %%eax, %1\n" \ + "cpuid" : \ + "=r"(cyc_high), \ + "=r"(cyc_low) : \ + /* no read only registers */ : \ + "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \ + ); \ + (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ + } while (0) + +static __attribute__ ((noinline)) +uint64_t rdtsc_overhead_func(uint64_t dummy) { + return dummy; +} + +uint64_t global_rdtsc_overhead = (uint64_t) UINT64_MAX; + +#define RDTSC_SET_OVERHEAD(test, repeat) \ + do { \ + uint64_t cycles_start, cycles_final, cycles_diff; \ + uint64_t min_diff = UINT64_MAX; \ + for (int i = 0; i < repeat; i++) { \ + __asm volatile("" ::: /* pretend to clobber */ "memory"); \ + RDTSC_START(cycles_start); \ + test; \ + RDTSC_STOP(cycles_final); \ + cycles_diff = (cycles_final - cycles_start); \ + if (cycles_diff < min_diff) min_diff = cycles_diff; \ + } \ + global_rdtsc_overhead = min_diff; \ + printf("rdtsc_overhead set to %d\n", (int)global_rdtsc_overhead); \ + } while (0) diff --git a/simd-sort/results/haswell-i7-4770-gcc5.3.0-avx2.txt b/simd-sort/results/haswell-i7-4770-gcc5.3.0-avx2.txt new file mode 100644 index 0000000..4b2af5c --- /dev/null +++ b/simd-sort/results/haswell-i7-4770-gcc5.3.0-avx2.txt @@ -0,0 +1,84 @@ +items count: 10000 (40000 bytes), input ascending + std::sort ... 0.0001 s + quick sort ... 0.0001 s (1.24) + AVX2 quick sort ... 0.0001 s (1.48) +items count: 100000 (400000 bytes), input ascending + std::sort ... 0.0010 s + quick sort ... 0.0009 s (1.17) + AVX2 quick sort ... 0.0007 s (1.48) +items count: 1000000 (4000000 bytes), input ascending + std::sort ... 0.0117 s + quick sort ... 0.0092 s (1.27) + AVX2 quick sort ... 0.0066 s (1.77) +items count: 2000000 (8000000 bytes), input ascending + std::sort ... 0.0244 s + quick sort ... 0.0192 s (1.28) + AVX2 quick sort ... 0.0137 s (1.79) +items count: 5000000 (20000000 bytes), input ascending + std::sort ... 0.0700 s + quick sort ... 0.0535 s (1.31) + AVX2 quick sort ... 0.0378 s (1.85) +items count: 10000000 (40000000 bytes), input ascending + std::sort ... 0.1462 s + quick sort ... 0.1108 s (1.32) + AVX2 quick sort ... 0.0782 s (1.87) +items count: 20000000 (80000000 bytes), input ascending + std::sort ... 0.3049 s + quick sort ... 0.2301 s (1.33) + AVX2 quick sort ... 0.1614 s (1.89) +items count: 10000 (40000 bytes), input descending + std::sort ... 0.0001 s + quick sort ... 0.0001 s (0.92) + AVX2 quick sort ... 0.0001 s (1.12) +items count: 100000 (400000 bytes), input descending + std::sort ... 0.0008 s + quick sort ... 0.0009 s (0.87) + AVX2 quick sort ... 0.0007 s (1.10) +items count: 1000000 (4000000 bytes), input descending + std::sort ... 0.0087 s + quick sort ... 0.0092 s (0.95) + AVX2 quick sort ... 0.0066 s (1.32) +items count: 2000000 (8000000 bytes), input descending + std::sort ... 0.0183 s + quick sort ... 0.0192 s (0.95) + AVX2 quick sort ... 0.0138 s (1.33) +items count: 5000000 (20000000 bytes), input descending + std::sort ... 0.0514 s + quick sort ... 0.0536 s (0.96) + AVX2 quick sort ... 0.0378 s (1.36) +items count: 10000000 (40000000 bytes), input descending + std::sort ... 0.1068 s + quick sort ... 0.1111 s (0.96) + AVX2 quick sort ... 0.0783 s (1.36) +items count: 20000000 (80000000 bytes), input descending + std::sort ... 0.2221 s + quick sort ... 0.2299 s (0.97) + AVX2 quick sort ... 0.1613 s (1.38) +items count: 10000 (40000 bytes), input random + std::sort ... 0.0001 s + quick sort ... 0.0001 s (1.25) + AVX2 quick sort ... 0.0001 s (1.53) +items count: 100000 (400000 bytes), input random + std::sort ... 0.0010 s + quick sort ... 0.0009 s (1.16) + AVX2 quick sort ... 0.0007 s (1.48) +items count: 1000000 (4000000 bytes), input random + std::sort ... 0.0118 s + quick sort ... 0.0091 s (1.29) + AVX2 quick sort ... 0.0066 s (1.78) +items count: 2000000 (8000000 bytes), input random + std::sort ... 0.0246 s + quick sort ... 0.0192 s (1.28) + AVX2 quick sort ... 0.0137 s (1.79) +items count: 5000000 (20000000 bytes), input random + std::sort ... 0.0703 s + quick sort ... 0.0537 s (1.31) + AVX2 quick sort ... 0.0378 s (1.86) +items count: 10000000 (40000000 bytes), input random + std::sort ... 0.1468 s + quick sort ... 0.1106 s (1.33) + AVX2 quick sort ... 0.0786 s (1.87) +items count: 20000000 (80000000 bytes), input random + std::sort ... 0.3058 s + quick sort ... 0.2304 s (1.33) + AVX2 quick sort ... 0.1628 s (1.88) diff --git a/simd-sort/results/knights-landing-7210-gcc5.3.0-avx512f.txt b/simd-sort/results/knights-landing-7210-gcc5.3.0-avx512f.txt new file mode 100644 index 0000000..b50f9c0 --- /dev/null +++ b/simd-sort/results/knights-landing-7210-gcc5.3.0-avx512f.txt @@ -0,0 +1,140 @@ +items count: 100000 (400000 bytes), input ascending + std::sort ... 0.0063 s + quick sort ... 0.0067 s (0.93) + AVX2 quick sort ... 0.0071 s (0.88) + AVX512 quick sort ... 0.0055 s (1.14) + AVX512 + popcnt quick sort ... 0.0056 s (1.13) + AVX512 + BMI2 quick sort ... 0.0058 s (1.08) +items count: 1000000 (4000000 bytes), input ascending + std::sort ... 0.0728 s + quick sort ... 0.0735 s (0.99) + AVX2 quick sort ... 0.0685 s (1.06) + AVX512 quick sort ... 0.0522 s (1.40) + AVX512 + popcnt quick sort ... 0.0528 s (1.38) + AVX512 + BMI2 quick sort ... 0.0554 s (1.31) +items count: 2000000 (8000000 bytes), input ascending + std::sort ... 0.1529 s + quick sort ... 0.1523 s (1.00) + AVX2 quick sort ... 0.1402 s (1.09) + AVX512 quick sort ... 0.1057 s (1.45) + AVX512 + popcnt quick sort ... 0.1063 s (1.44) + AVX512 + BMI2 quick sort ... 0.1105 s (1.38) +items count: 5000000 (20000000 bytes), input ascending + std::sort ... 0.4433 s + quick sort ... 0.3935 s (1.13) + AVX2 quick sort ... 0.3520 s (1.26) + AVX512 quick sort ... 0.2609 s (1.70) + AVX512 + popcnt quick sort ... 0.2609 s (1.70) + AVX512 + BMI2 quick sort ... 0.2668 s (1.66) +items count: 10000000 (40000000 bytes), input ascending + std::sort ... 0.9257 s + quick sort ... 0.8108 s (1.14) + AVX2 quick sort ... 0.7222 s (1.28) + AVX512 quick sort ... 0.5265 s (1.76) + AVX512 + popcnt quick sort ... 0.5288 s (1.75) + AVX512 + BMI2 quick sort ... 0.5402 s (1.71) +items count: 20000000 (80000000 bytes), input ascending + std::sort ... 1.9285 s + quick sort ... 1.6725 s (1.15) + AVX2 quick sort ... 1.4766 s (1.31) + AVX512 quick sort ... 1.0681 s (1.81) + AVX512 + popcnt quick sort ... 1.0699 s (1.80) + AVX512 + BMI2 quick sort ... 1.0949 s (1.76) +items count: 10000 (40000 bytes), input descending + std::sort ... 0.0004 s + quick sort ... 0.0006 s (0.73) + AVX2 quick sort ... 0.0006 s (0.74) + AVX512 quick sort ... 0.0005 s (0.88) + AVX512 + popcnt quick sort ... 0.0005 s (0.89) + AVX512 + BMI2 quick sort ... 0.0005 s (0.87) +items count: 100000 (400000 bytes), input descending + std::sort ... 0.0048 s + quick sort ... 0.0067 s (0.71) + AVX2 quick sort ... 0.0070 s (0.68) + AVX512 quick sort ... 0.0055 s (0.87) + AVX512 + popcnt quick sort ... 0.0056 s (0.86) + AVX512 + BMI2 quick sort ... 0.0058 s (0.83) +items count: 1000000 (4000000 bytes), input descending + std::sort ... 0.0551 s + quick sort ... 0.0734 s (0.75) + AVX2 quick sort ... 0.0684 s (0.81) + AVX512 quick sort ... 0.0524 s (1.05) + AVX512 + popcnt quick sort ... 0.0526 s (1.05) + AVX512 + BMI2 quick sort ... 0.0547 s (1.01) +items count: 2000000 (8000000 bytes), input descending + std::sort ... 0.1154 s + quick sort ... 0.1523 s (0.76) + AVX2 quick sort ... 0.1407 s (0.82) + AVX512 quick sort ... 0.1055 s (1.09) + AVX512 + popcnt quick sort ... 0.1065 s (1.08) + AVX512 + BMI2 quick sort ... 0.1111 s (1.04) +items count: 5000000 (20000000 bytes), input descending + std::sort ... 0.3300 s + quick sort ... 0.3932 s (0.84) + AVX2 quick sort ... 0.3516 s (0.94) + AVX512 quick sort ... 0.2606 s (1.27) + AVX512 + popcnt quick sort ... 0.2602 s (1.27) + AVX512 + BMI2 quick sort ... 0.2657 s (1.24) +items count: 10000000 (40000000 bytes), input descending + std::sort ... 0.6890 s + quick sort ... 0.8134 s (0.85) + AVX2 quick sort ... 0.7216 s (0.95) + AVX512 quick sort ... 0.5273 s (1.31) + AVX512 + popcnt quick sort ... 0.5351 s (1.29) + AVX512 + BMI2 quick sort ... 0.5392 s (1.28) +items count: 20000000 (80000000 bytes), input descending + std::sort ... 1.4358 s + quick sort ... 1.6744 s (0.86) + AVX2 quick sort ... 1.4772 s (0.97) + AVX512 quick sort ... 1.0694 s (1.34) + AVX512 + popcnt quick sort ... 1.0675 s (1.34) + AVX512 + BMI2 quick sort ... 1.0929 s (1.31) +items count: 10000 (40000 bytes), input random + std::sort ... 0.0005 s + quick sort ... 0.0006 s (0.98) + AVX2 quick sort ... 0.0006 s (0.99) + AVX512 quick sort ... 0.0005 s (1.18) + AVX512 + popcnt quick sort ... 0.0005 s (1.18) + AVX512 + BMI2 quick sort ... 0.0005 s (1.16) +items count: 100000 (400000 bytes), input random + std::sort ... 0.0063 s + quick sort ... 0.0067 s (0.94) + AVX2 quick sort ... 0.0071 s (0.89) + AVX512 quick sort ... 0.0055 s (1.14) + AVX512 + popcnt quick sort ... 0.0056 s (1.13) + AVX512 + BMI2 quick sort ... 0.0058 s (1.08) +items count: 1000000 (4000000 bytes), input random + std::sort ... 0.0727 s + quick sort ... 0.0742 s (0.98) + AVX2 quick sort ... 0.0684 s (1.06) + AVX512 quick sort ... 0.0522 s (1.39) + AVX512 + popcnt quick sort ... 0.0526 s (1.38) + AVX512 + BMI2 quick sort ... 0.0544 s (1.34) +items count: 2000000 (8000000 bytes), input random + std::sort ... 0.1533 s + quick sort ... 0.1532 s (1.00) + AVX2 quick sort ... 0.1403 s (1.09) + AVX512 quick sort ... 0.1057 s (1.45) + AVX512 + popcnt quick sort ... 0.1066 s (1.44) + AVX512 + BMI2 quick sort ... 0.1101 s (1.39) +items count: 5000000 (20000000 bytes), input random + std::sort ... 0.4436 s + quick sort ... 0.3932 s (1.13) + AVX2 quick sort ... 0.3529 s (1.26) + AVX512 quick sort ... 0.2622 s (1.69) + AVX512 + popcnt quick sort ... 0.2632 s (1.69) + AVX512 + BMI2 quick sort ... 0.2657 s (1.67) +items count: 10000000 (40000000 bytes), input random + std::sort ... 0.9235 s + quick sort ... 0.8115 s (1.14) + AVX2 quick sort ... 0.7233 s (1.28) + AVX512 quick sort ... 0.5305 s (1.74) + AVX512 + popcnt quick sort ... 0.5328 s (1.73) + AVX512 + BMI2 quick sort ... 0.5379 s (1.72) +items count: 20000000 (80000000 bytes), input random + std::sort ... 1.9264 s + quick sort ... 1.6716 s (1.15) + AVX2 quick sort ... 1.4898 s (1.29) + AVX512 quick sort ... 1.0853 s (1.77) + AVX512 + popcnt quick sort ... 1.0849 s (1.78) + AVX512 + BMI2 quick sort ... 1.0984 s (1.75) diff --git a/simd-sort/results/skylake-i7-6700-gcc5.3.0-avx2.txt b/simd-sort/results/skylake-i7-6700-gcc5.3.0-avx2.txt new file mode 100644 index 0000000..ee7c7a2 --- /dev/null +++ b/simd-sort/results/skylake-i7-6700-gcc5.3.0-avx2.txt @@ -0,0 +1,80 @@ +items count: 100000 (400000 bytes), input ascending + std::sort ... 0.0011 s + quick sort ... 0.0009 s (1.14) + AVX2 quick sort ... 0.0007 s (1.59) +items count: 1000000 (4000000 bytes), input ascending + std::sort ... 0.0122 s + quick sort ... 0.0099 s (1.23) + AVX2 quick sort ... 0.0065 s (1.89) +items count: 2000000 (8000000 bytes), input ascending + std::sort ... 0.0257 s + quick sort ... 0.0204 s (1.26) + AVX2 quick sort ... 0.0133 s (1.94) +items count: 5000000 (20000000 bytes), input ascending + std::sort ... 0.0724 s + quick sort ... 0.0576 s (1.26) + AVX2 quick sort ... 0.0376 s (1.93) +items count: 10000000 (40000000 bytes), input ascending + std::sort ... 0.1516 s + quick sort ... 0.1194 s (1.27) + AVX2 quick sort ... 0.0773 s (1.96) +items count: 20000000 (80000000 bytes), input ascending + std::sort ... 0.3167 s + quick sort ... 0.2480 s (1.28) + AVX2 quick sort ... 0.1598 s (1.98) +items count: 10000 (40000 bytes), input descending + std::sort ... 0.0001 s + quick sort ... 0.0001 s (0.91) + AVX2 quick sort ... 0.0001 s (1.15) +items count: 100000 (400000 bytes), input descending + std::sort ... 0.0009 s + quick sort ... 0.0009 s (0.91) + AVX2 quick sort ... 0.0007 s (1.26) +items count: 1000000 (4000000 bytes), input descending + std::sort ... 0.0095 s + quick sort ... 0.0099 s (0.96) + AVX2 quick sort ... 0.0065 s (1.47) +items count: 2000000 (8000000 bytes), input descending + std::sort ... 0.0196 s + quick sort ... 0.0207 s (0.95) + AVX2 quick sort ... 0.0132 s (1.48) +items count: 5000000 (20000000 bytes), input descending + std::sort ... 0.0539 s + quick sort ... 0.0576 s (0.94) + AVX2 quick sort ... 0.0377 s (1.43) +items count: 10000000 (40000000 bytes), input descending + std::sort ... 0.1130 s + quick sort ... 0.1195 s (0.95) + AVX2 quick sort ... 0.0778 s (1.45) +items count: 20000000 (80000000 bytes), input descending + std::sort ... 0.2353 s + quick sort ... 0.2479 s (0.95) + AVX2 quick sort ... 0.1604 s (1.47) +items count: 10000 (40000 bytes), input random + std::sort ... 0.0001 s + quick sort ... 0.0001 s (1.19) + AVX2 quick sort ... 0.0001 s (1.51) +items count: 100000 (400000 bytes), input random + std::sort ... 0.0011 s + quick sort ... 0.0009 s (1.13) + AVX2 quick sort ... 0.0007 s (1.59) +items count: 1000000 (4000000 bytes), input random + std::sort ... 0.0123 s + quick sort ... 0.0099 s (1.23) + AVX2 quick sort ... 0.0064 s (1.91) +items count: 2000000 (8000000 bytes), input random + std::sort ... 0.0257 s + quick sort ... 0.0207 s (1.24) + AVX2 quick sort ... 0.0133 s (1.93) +items count: 5000000 (20000000 bytes), input random + std::sort ... 0.0726 s + quick sort ... 0.0578 s (1.26) + AVX2 quick sort ... 0.0377 s (1.93) +items count: 10000000 (40000000 bytes), input random + std::sort ... 0.1518 s + quick sort ... 0.1200 s (1.26) + AVX2 quick sort ... 0.0778 s (1.95) +items count: 20000000 (80000000 bytes), input random + std::sort ... 0.3170 s + quick sort ... 0.2490 s (1.27) + AVX2 quick sort ... 0.1614 s (1.96) diff --git a/simd-sort/runtime_stats.cpp b/simd-sort/runtime_stats.cpp new file mode 100644 index 0000000..2fa5e7b --- /dev/null +++ b/simd-sort/runtime_stats.cpp @@ -0,0 +1,76 @@ +class Histogram { + size_t hist[256]; + +public: + Histogram() { + reset(); + } + + void hit(uint8_t v) { + hist[v] += 1; + } + + void reset() { + memset(hist, 0, sizeof(hist)); + } + + bool empty() const { + for (int i=0; i < 256; i++) { + if (hist[i] > 0) { + return false; + } + } + + return true; + } + +public: + void print() const { + size_t total = 0; + for (int i=0; i < 256; i++) { + total += hist[i]; + } + + if (total == 0) return; + + for (int i=0; i < 256; i++) { + if (hist[i] == 0) continue; + + printf("%02x: %5lu (%5.2f%%)\n", i, hist[i], 100.0 * hist[i]/total); + } + } +}; + +class Statistics { +public: + size_t partition_calls; + size_t items_processed; + size_t scalar__partition_calls; + size_t scalar__items_processed; + Histogram pvbyte_histogram; + +public: + Statistics() { + reset(); + } + + void reset() { + partition_calls = 0; + items_processed = 0; + scalar__partition_calls = 0; + scalar__items_processed = 0; + + pvbyte_histogram.reset(); + } + + bool anything_collected() const { + return (partition_calls > 0) + || (items_processed > 0) + || (scalar__partition_calls > 0) + || (scalar__items_processed > 0) + || (!pvbyte_histogram.empty()); + } +}; + + +Statistics statistics; diff --git a/simd-sort/speed.cpp b/simd-sort/speed.cpp new file mode 100644 index 0000000..aa58089 --- /dev/null +++ b/simd-sort/speed.cpp @@ -0,0 +1,457 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "cmdline.cpp" +#ifdef WITH_RUNTIME_STATS +# include "runtime_stats.cpp" // must be included before anything else +#endif +#include "input_data.cpp" +#include "quicksort-all.cpp" +#include "avx2-altquicksort.h" +//#include "avx2-nate-quicksort.cpp" +#include "../magyarsort.h" // mine +#include "avx2-natenodutch-quicksort.h" +#define USE_RDTSC // undef to get measurments in seconds +#ifdef USE_RDTSC +# include "rdtsc.cpp" +#else +# include "gettime.cpp" +#endif + +void magyarsort_it(uint32_t* array, int left, int right) { + MagyarSort::sort(array + left, right - left); +} + +class PerformanceTest final { + + int iterations; + InputData& input; + uint32_t* tmp; + +public: + PerformanceTest(int n, InputData& input) + : iterations(n) + , input(input) { + + assert(iterations > 0); + tmp = new uint32_t[input.count()]; + } + + ~PerformanceTest() { + delete[] tmp; + } + +public: + template + uint64_t run(SORT_FUNCTION sort) { + + uint64_t time = 0; + + int k = iterations; + while (k--) { + memcpy(tmp, input.pointer(), input.size()); + + uint64_t t1, t2; + +#ifdef USE_RDTSC + RDTSC_START(t1); +#else + t1 = get_time(); +#endif + sort(input.pointer(), 0, input.count() - 1); + +#ifdef USE_RDTSC + RDTSC_START(t2); +#else + t2 = get_time(); +#endif + + const uint64_t dt = t2 - t1; + + if (time == 0) { + time = dt; + } else if (dt < time) { + time = dt; + } + } + + return time; + } +}; + + +enum class InputType { + randomfew, + randomuniq, + random, + ascending, + descending, +}; + + +const char* as_string(InputType type) { + switch (type) { + case InputType::randomfew: + return "randomfew"; + + case InputType::randomuniq: + return "randomuniq"; + + case InputType::random: + return "random"; + + case InputType::ascending: + return "ascending"; + + case InputType::descending: + return "descending"; + + default: + return ""; + } +} + +void std_qsort_wrapper(uint32_t* array, int left, int right) { + + std::qsort(array + left, right - left + 1, sizeof(uint32_t), [](const void* a, const void* b) + { + uint32_t a1 = *static_cast(a); + uint32_t a2 = *static_cast(b); + + if(a1 < a2) return -1; + if(a1 > a2) return 1; + return 0; + }); +} + + +void std_stable_sort_wrapper(uint32_t* array, int left, int right) { + + std::stable_sort(array + left, array + right + 1); +} + + +void std_sort_wrapper(uint32_t* array, int left, int right) { + + std::sort(array + left, array + right + 1); +} + + +class Flags { + public: + bool std_sort; + bool std_qsort; + bool std_stable_sort; + bool quicksort; + bool avx2; + bool magyar; + bool avx2_alt; + bool avx2_natenodutch; + bool avx512; + bool avx512_buf; + bool avx512_popcnt; + bool avx512_bmi; + + public: + Flags(const CommandLine& cmd) { + + enable_all(false); + + bool any_set = false; + if (cmd.has("-std-sort")) { + std_sort = true; + any_set = true; + } + + if (cmd.has("-std-qsort")) { + std_qsort = true; + any_set = true; + } + + if (cmd.has("-std-stable-sort") || cmd.has("-std-stable")) { + std_stable_sort = true; + any_set = true; + } + + if (cmd.has("-quicksort")) { + quicksort = true; + any_set = true; + } + + if (cmd.has("-avx2")) { + avx2 = true; + any_set = true; + } + + if (cmd.has("-avx2-alt")) { + avx2_alt = true; + any_set = true; + } + + if (cmd.has("magyar")) { + magyar = true; + avx2_natenodutch = true; + any_set = true; + } + + if (cmd.has("-avx512")) { + avx512 = true; + any_set = true; + } + + if (cmd.has("-avx512-buf")) { + avx512_buf = true; + any_set = true; + } + + if (cmd.has("-avx512-popcnt")) { + avx512_popcnt = true; + any_set = true; + } + + if (cmd.has("-avx512-bmi")) { + avx512_bmi = true; + any_set = true; + } + + if (!any_set) { + enable_all(true); + } + } + + void enable_all(bool val) { + std_sort = val; + std_qsort = val; + std_stable_sort = val; + quicksort = val; + avx2 = val; + magyar = val; + avx2_alt = val; + avx2_natenodutch = val; + avx512 = val; + avx512_buf = val; + avx512_popcnt = val; + avx512_bmi = val; + } +}; + + +class Test { + + std::unique_ptr data; + InputType type; + size_t count; + int iterations; + Flags flags; + uint64_t ref; + +public: + Test(InputType type, size_t count, int iterations, Flags&& flags) + : type(type) + , count(count) + , iterations(iterations) + , flags(std::move(flags)) { + + switch (type) { + case InputType::randomfew: + data.reset(new InputRandomFew(count)); + break; + + case InputType::randomuniq: + data.reset(new InputRandomUnique(count)); + break; + + case InputType::random: + data.reset(new InputRandom(count)); + break; + + case InputType::ascending: + data.reset(new InputAscending(count)); + break; + + case InputType::descending: + data.reset(new InputDescending(count)); + break; + } + } + + void run() { + + printf("items count: %lu (%lu bytes), input %s\n", data->count(), data->size(), as_string(type)); + + ref = 0; + + if (flags.std_sort) { + measure("std::sort", std_sort_wrapper); + } + + if (flags.std_qsort) { + measure("std::qsort", std_qsort_wrapper); + } + + if (flags.std_stable_sort) { + measure("std::stable_sort", std_stable_sort_wrapper); + } + + if (flags.std_qsort) { + measure("quick sort", quicksort); + } +#ifdef HAVE_AVX2_INSTRUCTIONS + if (flags.avx2) { + measure("AVX2 quick sort", qs::avx2::quicksort); + } + + if (flags.avx2_natenodutch) { + measure("AVX2 nate nodutch", avx_natenodutch_quicksort); + } + + if (flags.avx2_alt) { + measure("AVX2 alt quicksort", wrapped_avx2_pivotonlast_sort); + } + + if (flags.magyar) { + measure("Magyarsort variant", magyarsort_it); + } + +#endif +#ifdef HAVE_AVX512F_INSTRUCTIONS + if (flags.avx512) { + measure("AVX512 quick sort", qs::avx512::quicksort); + } + + if (flags.avx512_buf) { + measure("AVX512 quick sort - aux buf", qs::avx512::auxbuffer_quicksort); + } + + if (flags.avx512_popcnt) { + measure("AVX512 + popcnt quick sort", qs::avx512::popcnt_quicksort); + } + + if (flags.avx512_bmi) { + measure("AVX512 + BMI2 quick sort", qs::avx512::bmi2_quicksort); + } +#endif + } + +private: + template + void measure(const char* name, SORT_FUNCTION sort) { + + PerformanceTest test(iterations, *data); + + printf("%30s ... ", name); fflush(stdout); +#ifdef WITH_RUNTIME_STATS + statistics.reset(); +#endif + uint64_t time = test.run(sort); + +#ifdef USE_RDTSC + printf("%10lu cycles", time); + if (ref > 0) { + printf(" (%0.2f)", ref/double(time)); + } + +# ifdef WITH_RUNTIME_STATS + if (statistics.anything_collected()) { + printf("\n"); + printf("\t\tpartition calls: %lu (+%lu scalar)\n", statistics.partition_calls, statistics.scalar__partition_calls); + printf("\t\titems processed: %lu (+%lu by scalar partition)\n", statistics.items_processed, statistics.scalar__items_processed); + + const size_t total_items = statistics.items_processed + statistics.scalar__items_processed; + + if (total_items != 0) { + const double cpi = double(time)/total_items; + printf("\t\t : %0.4f cycles/item\n", cpi * iterations); + } + + if (!statistics.pvbyte_histogram.empty()) { + puts("Histogram for pvbyte:"); + statistics.pvbyte_histogram.print(); + } + } +# endif // WITH_RUNTIME_STATS + +#else + printf("%0.4f s", time/1000000.0); + if (ref > 0) { + printf(" (%0.2f)\n", ref/double(time)); + } +#endif + putchar('\n'); + + if (ref == 0) { + ref = time; + } + } +}; + + +// ------------------------------------------------------------ + + +void usage() { + puts("usage:"); + puts("speed SIZE ITERATIONS INPUT [options]"); + puts(""); + puts("where"); + puts("* SIZE - number of 32-bit elements"); + puts("* ITERATIONS - number of iterations"); + puts("* INPUT - one of:"); + puts(" ascending (or asc)"); + puts(" descending (or dsc, desc)"); + puts(" random (or rnd, rand)"); + puts(" randomfew"); + puts(" randomuniq"); + puts("options - optional name of procedure(s) to run"); +} + + +int main(int argc, char* argv[]) { + + if (argc < 4) { + usage(); + return EXIT_FAILURE; + } + + int count = atoi(argv[1]); + int iterations = atoi(argv[2]); + InputType type; + +#define is_keyword(key) (strcmp(argv[3], key) == 0) + if (is_keyword("descending") || is_keyword("desc") || is_keyword("dsc")) { + type = InputType::descending; + } else if (is_keyword("ascending") || is_keyword("asc")) { + type = InputType::ascending; + } else if (is_keyword("random") || is_keyword("rnd") || is_keyword("rand")) { + type = InputType::random; + } else if (is_keyword("randomfew")) { + type = InputType::randomfew; + } else if (is_keyword("randomuniq")) { + type = InputType::randomuniq; + } else { + usage(); + return EXIT_FAILURE; + } +#undef is_keyword + +#ifdef HAVE_AVX512F_INSTRUCTIONS + #ifdef POPCNT_LOOKUP + prepare_lookup(); + #endif +#endif + +#ifdef USE_RDTSC + RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), iterations); +#endif + CommandLine cmd(argc, argv); + Flags flags(cmd); + Test test(type, count, iterations, std::move(flags)); + test.run(); + + return EXIT_SUCCESS; +} diff --git a/simd-sort/speed_avx2 b/simd-sort/speed_avx2 new file mode 100755 index 0000000..6bbb97a Binary files /dev/null and b/simd-sort/speed_avx2 differ diff --git a/simd-sort/test.cpp b/simd-sort/test.cpp new file mode 100644 index 0000000..bd68322 --- /dev/null +++ b/simd-sort/test.cpp @@ -0,0 +1,261 @@ +#include +#include +#include +#include +#include +#include + +#include "cmdline.cpp" +#include "input_data.cpp" +#include "quicksort-all.cpp" +#include "avx2-altquicksort.h" +#include "avx2-nate-quicksort.cpp" + + +bool is_sorted(uint32_t* array, size_t n) { + assert(n > 0); + for (size_t i=1; i < n; i++) { + if (array[i - 1] > array[i]) { + printf("mismatch at %lu\n", i); + return false; + } + } + + return true; +} + + +const size_t AVX512_REGISTER_SIZE = 16; + + +class Test { + + bool verbose; + +public: + Test(bool v = true) : verbose(v) {} + + template + bool run(SORT_FN sort) { + const size_t start = 2*AVX512_REGISTER_SIZE; + const size_t end = 256*AVX512_REGISTER_SIZE; + + if (verbose) { + putchar('\n'); + } + + for (size_t size=start; size < end; size += 1) { + + if (verbose) { + printf("%lu/%lu\r", size, end); + fflush(stdout); + } + + InputAscending asc(size); + InputDescending dsc(size); + InputRandom rnd(size); + InputRandomFew rndfew(size); + + if (!test(sort, asc)) { + printf("failed for size %lu, intput ascending\n", size); + return false; + } + + if (!test(sort, dsc)) { + printf("failed for size %lu, intput descending\n", size); + return false; + } + + if (!test(sort, rnd)) { + printf("failed for size %lu, intput random\n", size); + return false; + } + + if (!test(sort, rndfew)) { + printf("failed for size %lu, intput random few\n", size); + return false; + } + } // for + + if (verbose) { + putchar('\n'); + } + + return true; + } + + +private: + template + bool test(SORT_FN sort, InputData& data) { + sort(data.pointer(), 0, data.count() - 1); + + return is_sorted(data.pointer(), data.count()); + } +}; + + +class Flags { + public: + bool avx2; + bool avx2_alt; + bool avx2_nate; + bool avx512; + bool avx512_buf; + bool avx512_popcnt; + bool avx512_bmi; + + public: + Flags(const CommandLine& cmd) { + + enable_all(false); + + bool any_set = false; + if (cmd.has("-avx2")) { + avx2 = true; + any_set = true; + } + + if (cmd.has("-avx2-alt")) { + avx2_alt = true; + any_set = true; + } + + if (cmd.has("-avx2-nate")) { + avx2_nate = true; + any_set = true; + } + + if (cmd.has("-avx512")) { + avx512 = true; + any_set = true; + } + + if (cmd.has("-avx512-buf")) { + avx512_buf = true; + any_set = true; + } + + if (cmd.has("-avx512-popcnt")) { + avx512_popcnt = true; + any_set = true; + } + + if (cmd.has("-avx512-bmi")) { + avx512_bmi = true; + any_set = true; + } + + if (!any_set) { + enable_all(true); + } + } + + void enable_all(bool val) { + avx2 = val; + avx2_nate = val; + avx2_alt = val; + avx512 = val; + avx512_buf = val; + avx512_popcnt = val; + avx512_bmi = val; + } +}; + + +int main(int argc, char* argv[]) { + + CommandLine cmd(argc, argv); + + puts("Please wait, it might take a while..."); + puts(""); + + bool verbose = cmd.has("-v") || cmd.has("--verbose"); + Flags flags(cmd); + + Test test(verbose); + int ret = EXIT_SUCCESS; + +#ifdef HAVE_AVX2_INSTRUCTIONS + if (flags.avx2) { + printf("AVX2 base version... "); fflush(stdout); + if (test.run(qs::avx2::quicksort)) { + puts("OK"); + } else { + puts("FAILED"); + ret = EXIT_FAILURE; + } + } + + if (flags.avx2_alt) { + printf("AVX2 alt version... "); fflush(stdout); + if (test.run(wrapped_avx2_pivotonlast_sort)) { + puts("OK"); + } else { + puts("FAILED"); + ret = EXIT_FAILURE; + } + } + + if (flags.avx2_nate) { + printf("AVX2 Nate's variant... "); fflush(stdout); + if (test.run(nate::wrapped_avx2_pivotonlast_sort)) { + puts("OK"); + } else { + puts("FAILED"); + ret = EXIT_FAILURE; + } + } +#endif + +#ifdef HAVE_AVX512F_INSTRUCTIONS + +#ifdef POPCNT_LOOKUP + prepare_lookup(); +#endif + + if (flags.avx512) { + printf("AVX512 base version... "); fflush(stdout); + if (test.run(qs::avx512::quicksort)) { + puts("OK"); + } else { + puts("FAILED"); + ret = EXIT_FAILURE; + } + } + + if (flags.avx512_popcnt) { + printf("AVX512 + popcnt version... "); fflush(stdout); + if (test.run(qs::avx512::popcnt_quicksort)) { + puts("OK"); + } else { + puts("FAILED"); + ret = EXIT_FAILURE; + } + } + + if (flags.avx512_buf) { + printf("AVX512 with aux buffers... "); fflush(stdout); + if (test.run(qs::avx512::auxbuffer_quicksort)) { + puts("OK"); + } else { + puts("FAILED"); + ret = EXIT_FAILURE; + } + } + +#if 0 + if (flags.avx512_bmi) { + printf("AVX512 + bmi2 version ... "); fflush(stdout); + if (test.run(qs::avx512::bmi2_quicksort)) { + puts("OK"); + } else { + puts("FAILED"); + ret = EXIT_FAILURE; + } + } +#endif +#endif // HAVE_AVX512F_INSTRUCTIONS + + return ret; +}