From 05235e269ff962366639b56a6102ce35adaa33cf Mon Sep 17 00:00:00 2001 From: Richard Thier Date: Tue, 14 Dec 2021 17:30:07 +0100 Subject: [PATCH] added simd-sort - basically the whole repo, but I haxed-in magyarsort as measure --- simd-sort/LICENSE | 25 + simd-sort/Makefile | 57 +++ simd-sort/README.rst | 48 ++ simd-sort/avx2-altquicksort.h | 461 ++++++++++++++++++ simd-sort/avx2-nate-quicksort.cpp | 436 +++++++++++++++++ simd-sort/avx2-natenodutch-quicksort.h | 409 ++++++++++++++++ simd-sort/avx2-partition.cpp | 197 ++++++++ simd-sort/avx2-quicksort.cpp | 32 ++ simd-sort/avx512-auxbuffer-partition.cpp | 90 ++++ simd-sort/avx512-bmi2-partition.cpp | 105 ++++ simd-sort/avx512-partition-register.cpp | 58 +++ simd-sort/avx512-partition.cpp | 129 +++++ simd-sort/avx512-popcnt-partition.cpp | 119 +++++ simd-sort/avx512-quicksort.cpp | 211 ++++++++ simd-sort/avx512-sort-register.cpp | 323 ++++++++++++ simd-sort/avx512-swap.cpp | 20 + simd-sort/cmdline.cpp | 28 ++ simd-sort/common.h | 3 + simd-sort/gather_results.sh | 45 ++ simd-sort/gettime.cpp | 8 + simd-sort/input_data.cpp | 106 ++++ simd-sort/partition.cpp | 51 ++ simd-sort/quicksort-all.cpp | 9 + simd-sort/quicksort.cpp | 19 + simd-sort/rdtsc.cpp | 52 ++ .../results/haswell-i7-4770-gcc5.3.0-avx2.txt | 84 ++++ .../knights-landing-7210-gcc5.3.0-avx512f.txt | 140 ++++++ .../results/skylake-i7-6700-gcc5.3.0-avx2.txt | 80 +++ simd-sort/runtime_stats.cpp | 76 +++ simd-sort/speed.cpp | 457 +++++++++++++++++ simd-sort/speed_avx2 | Bin 0 -> 92216 bytes simd-sort/test.cpp | 261 ++++++++++ 32 files changed, 4139 insertions(+) create mode 100644 simd-sort/LICENSE create mode 100644 simd-sort/Makefile create mode 100644 simd-sort/README.rst create mode 100644 simd-sort/avx2-altquicksort.h create mode 100644 simd-sort/avx2-nate-quicksort.cpp create mode 100644 simd-sort/avx2-natenodutch-quicksort.h create mode 100644 simd-sort/avx2-partition.cpp create mode 100644 simd-sort/avx2-quicksort.cpp create mode 100644 simd-sort/avx512-auxbuffer-partition.cpp create mode 100644 simd-sort/avx512-bmi2-partition.cpp create mode 100644 simd-sort/avx512-partition-register.cpp create mode 100644 simd-sort/avx512-partition.cpp create mode 100644 simd-sort/avx512-popcnt-partition.cpp create mode 100644 simd-sort/avx512-quicksort.cpp create mode 100644 simd-sort/avx512-sort-register.cpp create mode 100644 simd-sort/avx512-swap.cpp create mode 100644 simd-sort/cmdline.cpp create mode 100644 simd-sort/common.h create mode 100755 simd-sort/gather_results.sh create mode 100644 simd-sort/gettime.cpp create mode 100644 simd-sort/input_data.cpp create mode 100644 simd-sort/partition.cpp create mode 100644 simd-sort/quicksort-all.cpp create mode 100644 simd-sort/quicksort.cpp create mode 100644 simd-sort/rdtsc.cpp create mode 100644 simd-sort/results/haswell-i7-4770-gcc5.3.0-avx2.txt create mode 100644 simd-sort/results/knights-landing-7210-gcc5.3.0-avx512f.txt create mode 100644 simd-sort/results/skylake-i7-6700-gcc5.3.0-avx2.txt create mode 100644 simd-sort/runtime_stats.cpp create mode 100644 simd-sort/speed.cpp create mode 100755 simd-sort/speed_avx2 create mode 100644 simd-sort/test.cpp diff --git a/simd-sort/LICENSE b/simd-sort/LICENSE new file mode 100644 index 0000000..a571ebb --- /dev/null +++ b/simd-sort/LICENSE @@ -0,0 +1,25 @@ +Copyright (c) 2016, Wojciech Muła, Daniel Lemire +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/simd-sort/Makefile b/simd-sort/Makefile new file mode 100644 index 0000000..7675676 --- /dev/null +++ b/simd-sort/Makefile @@ -0,0 +1,57 @@ +.SUFFIXES: +.PHONY: all clean + +FLAGS=-std=c++11 -mbmi2 -Wall -pedantic -Wextra +FLAGS_AVX512=$(FLAGS) -mavx512f -DHAVE_AVX512F_INSTRUCTIONS -DHAVE_AVX2_INSTRUCTIONS +FLAGS_AVX2=$(FLAGS) -mavx2 -DHAVE_AVX2_INSTRUCTIONS + +DEPS_SORT=partition.cpp \ + avx2-partition.cpp \ + avx2-quicksort.cpp \ + avx2-altquicksort.h \ + avx2-natenodutch-quicksort.h \ + avx2-nate-quicksort.cpp \ + avx512-swap.cpp \ + avx512-partition.cpp \ + avx512-auxbuffer-partition.cpp \ + avx512-bmi2-partition.cpp \ + avx512-popcnt-partition.cpp \ + avx512-quicksort.cpp \ + avx512-sort-register.cpp \ + avx512-partition-register.cpp \ + quicksort.cpp + +SPEED_DEPS=$(DEPS_SORT) speed.cpp gettime.cpp rdtsc.cpp runtime_stats.cpp +SPEED_FLAGS=-O3 -DNDEBUG + +ALL=test speed test_avx2 speed_avx2 speed_stats speed_avx2_stats + +all: $(ALL) + +test: test.cpp input_data.cpp $(DEPS_SORT) + $(CXX) $(FLAGS_AVX512) -fsanitize=address test.cpp -o $@ + +test_avx2: test.cpp input_data.cpp $(DEPS_SORT) + #$(CXX) $(FLAGS_AVX2) -fsanitize=address test.cpp -o $@ + $(CXX) $(FLAGS_AVX2) test.cpp -o $@ + +speed: $(SPEED_DEPS) + $(CXX) $(FLAGS_AVX512) $(SPEED_FLAGS) speed.cpp -o $@ + +speed_avx2: $(SPEED_DEPS) + $(CXX) $(FLAGS_AVX2) $(SPEED_FLAGS) speed.cpp -o $@ + +speed_stats: $(SPEED_DEPS) + $(CXX) $(FLAGS_AVX512) $(SPEED_FLAGS) -DWITH_RUNTIME_STATS speed.cpp -o $@ + +speed_avx2_stats: $(SPEED_DEPS) + $(CXX) $(FLAGS_AVX2) $(SPEED_FLAGS) -DWITH_RUNTIME_STATS speed.cpp -o $@ + +run: test + sde -cnl -- ./$^ + +run_avx2: test_avx2 + sde -cnl -- ./$^ + +clean: + rm -f $(ALL) diff --git a/simd-sort/README.rst b/simd-sort/README.rst new file mode 100644 index 0000000..9f9c385 --- /dev/null +++ b/simd-sort/README.rst @@ -0,0 +1,48 @@ +================================================================================ + SIMD sorting +================================================================================ + +Overview +-------------------------------------------------- + +This repository contains SIMD versions of quicksort, using **AVX512F** +and **AVX2** instruction sets. The subdirectory ``results`` contains +performance results. + +The code is still unfinished, it may be subject of changes. + +Although programs are written in C++, most procedures are nearly +plain C, except use of namespaces and references in place of +pointers. + + +Building +-------------------------------------------------- + +At least GCC 5.3.0 is needed. Type ``make`` to build AVX512F +and AVX2 versions of two programs: + +* ``test``/``test_avx2`` --- tests if SIMD-ized sorting procedures + work correctly for various inputs. + +* ``speed``/``speed_avx2`` --- compares speed of different procedures, + scalar and SIMD for various inputs. + +* ``speed_stats``/``speed_avx2_stats`` --- similar to ``speed``, but + some procedures collect and print runtime statistics. Note that + this gives overhead, so for real performance tests the basic variant + of the programs should be used. + + +AVX512 +-------------------------------------------------- + +`Intel Software Development Emulator`__ can be used to run AVX512 variants. + +__ https://software.intel.com/en-us/articles/intel-software-development-emulator + + +References +-------------------------------------------------- + +- Fast Sorting Algorithms using AVX-512 on Intel Knights Landing https://arxiv.org/pdf/1704.08579.pdf diff --git a/simd-sort/avx2-altquicksort.h b/simd-sort/avx2-altquicksort.h new file mode 100644 index 0000000..aa23ef0 --- /dev/null +++ b/simd-sort/avx2-altquicksort.h @@ -0,0 +1,461 @@ +#pragma once + +#include +#include + +/** +* This is an alternative approach to SIMD quicksort, implemented by D. Lemire. +* It is not meant to be fast as such, but it can serve as a useful reference. +*/ + +// When defined smaller reverseshufflemask is used. +//#define BYTE_PATTERN + +// When defined histogram for comparison mask is collected +//#define WITH_PVBYTE_HISTOGRAM + +#if WITH_RUNTIME_STATS == 0 && defined(WITH_PVBYTE_HISTOGRAM) +# undef WITH_RUNTIME_STATS +#endif + +// can be replaced with VCOMPRESS on AVX-512 +static +#ifdef BYTE_PATTERN + uint8_t +#else + uint32_t +#endif +reverseshufflemask[256 * 8] __attribute__((aligned(0x100))) = { + 0, 1, 2, 3, 4, 5, 6, 7, /* 0*/ + 1, 2, 3, 4, 5, 6, 7, 0, /* 1*/ + 0, 2, 3, 4, 5, 6, 7, 1, /* 2*/ + 2, 3, 4, 5, 6, 7, 0, 1, /* 3*/ + 0, 1, 3, 4, 5, 6, 7, 2, /* 4*/ + 1, 3, 4, 5, 6, 7, 0, 2, /* 5*/ + 0, 3, 4, 5, 6, 7, 1, 2, /* 6*/ + 3, 4, 5, 6, 7, 0, 1, 2, /* 7*/ + 0, 1, 2, 4, 5, 6, 7, 3, /* 8*/ + 1, 2, 4, 5, 6, 7, 0, 3, /* 9*/ + 0, 2, 4, 5, 6, 7, 1, 3, /* 10*/ + 2, 4, 5, 6, 7, 0, 1, 3, /* 11*/ + 0, 1, 4, 5, 6, 7, 2, 3, /* 12*/ + 1, 4, 5, 6, 7, 0, 2, 3, /* 13*/ + 0, 4, 5, 6, 7, 1, 2, 3, /* 14*/ + 4, 5, 6, 7, 0, 1, 2, 3, /* 15*/ + 0, 1, 2, 3, 5, 6, 7, 4, /* 16*/ + 1, 2, 3, 5, 6, 7, 0, 4, /* 17*/ + 0, 2, 3, 5, 6, 7, 1, 4, /* 18*/ + 2, 3, 5, 6, 7, 0, 1, 4, /* 19*/ + 0, 1, 3, 5, 6, 7, 2, 4, /* 20*/ + 1, 3, 5, 6, 7, 0, 2, 4, /* 21*/ + 0, 3, 5, 6, 7, 1, 2, 4, /* 22*/ + 3, 5, 6, 7, 0, 1, 2, 4, /* 23*/ + 0, 1, 2, 5, 6, 7, 3, 4, /* 24*/ + 1, 2, 5, 6, 7, 0, 3, 4, /* 25*/ + 0, 2, 5, 6, 7, 1, 3, 4, /* 26*/ + 2, 5, 6, 7, 0, 1, 3, 4, /* 27*/ + 0, 1, 5, 6, 7, 2, 3, 4, /* 28*/ + 1, 5, 6, 7, 0, 2, 3, 4, /* 29*/ + 0, 5, 6, 7, 1, 2, 3, 4, /* 30*/ + 5, 6, 7, 0, 1, 2, 3, 4, /* 31*/ + 0, 1, 2, 3, 4, 6, 7, 5, /* 32*/ + 1, 2, 3, 4, 6, 7, 0, 5, /* 33*/ + 0, 2, 3, 4, 6, 7, 1, 5, /* 34*/ + 2, 3, 4, 6, 7, 0, 1, 5, /* 35*/ + 0, 1, 3, 4, 6, 7, 2, 5, /* 36*/ + 1, 3, 4, 6, 7, 0, 2, 5, /* 37*/ + 0, 3, 4, 6, 7, 1, 2, 5, /* 38*/ + 3, 4, 6, 7, 0, 1, 2, 5, /* 39*/ + 0, 1, 2, 4, 6, 7, 3, 5, /* 40*/ + 1, 2, 4, 6, 7, 0, 3, 5, /* 41*/ + 0, 2, 4, 6, 7, 1, 3, 5, /* 42*/ + 2, 4, 6, 7, 0, 1, 3, 5, /* 43*/ + 0, 1, 4, 6, 7, 2, 3, 5, /* 44*/ + 1, 4, 6, 7, 0, 2, 3, 5, /* 45*/ + 0, 4, 6, 7, 1, 2, 3, 5, /* 46*/ + 4, 6, 7, 0, 1, 2, 3, 5, /* 47*/ + 0, 1, 2, 3, 6, 7, 4, 5, /* 48*/ + 1, 2, 3, 6, 7, 0, 4, 5, /* 49*/ + 0, 2, 3, 6, 7, 1, 4, 5, /* 50*/ + 2, 3, 6, 7, 0, 1, 4, 5, /* 51*/ + 0, 1, 3, 6, 7, 2, 4, 5, /* 52*/ + 1, 3, 6, 7, 0, 2, 4, 5, /* 53*/ + 0, 3, 6, 7, 1, 2, 4, 5, /* 54*/ + 3, 6, 7, 0, 1, 2, 4, 5, /* 55*/ + 0, 1, 2, 6, 7, 3, 4, 5, /* 56*/ + 1, 2, 6, 7, 0, 3, 4, 5, /* 57*/ + 0, 2, 6, 7, 1, 3, 4, 5, /* 58*/ + 2, 6, 7, 0, 1, 3, 4, 5, /* 59*/ + 0, 1, 6, 7, 2, 3, 4, 5, /* 60*/ + 1, 6, 7, 0, 2, 3, 4, 5, /* 61*/ + 0, 6, 7, 1, 2, 3, 4, 5, /* 62*/ + 6, 7, 0, 1, 2, 3, 4, 5, /* 63*/ + 0, 1, 2, 3, 4, 5, 7, 6, /* 64*/ + 1, 2, 3, 4, 5, 7, 0, 6, /* 65*/ + 0, 2, 3, 4, 5, 7, 1, 6, /* 66*/ + 2, 3, 4, 5, 7, 0, 1, 6, /* 67*/ + 0, 1, 3, 4, 5, 7, 2, 6, /* 68*/ + 1, 3, 4, 5, 7, 0, 2, 6, /* 69*/ + 0, 3, 4, 5, 7, 1, 2, 6, /* 70*/ + 3, 4, 5, 7, 0, 1, 2, 6, /* 71*/ + 0, 1, 2, 4, 5, 7, 3, 6, /* 72*/ + 1, 2, 4, 5, 7, 0, 3, 6, /* 73*/ + 0, 2, 4, 5, 7, 1, 3, 6, /* 74*/ + 2, 4, 5, 7, 0, 1, 3, 6, /* 75*/ + 0, 1, 4, 5, 7, 2, 3, 6, /* 76*/ + 1, 4, 5, 7, 0, 2, 3, 6, /* 77*/ + 0, 4, 5, 7, 1, 2, 3, 6, /* 78*/ + 4, 5, 7, 0, 1, 2, 3, 6, /* 79*/ + 0, 1, 2, 3, 5, 7, 4, 6, /* 80*/ + 1, 2, 3, 5, 7, 0, 4, 6, /* 81*/ + 0, 2, 3, 5, 7, 1, 4, 6, /* 82*/ + 2, 3, 5, 7, 0, 1, 4, 6, /* 83*/ + 0, 1, 3, 5, 7, 2, 4, 6, /* 84*/ + 1, 3, 5, 7, 0, 2, 4, 6, /* 85*/ + 0, 3, 5, 7, 1, 2, 4, 6, /* 86*/ + 3, 5, 7, 0, 1, 2, 4, 6, /* 87*/ + 0, 1, 2, 5, 7, 3, 4, 6, /* 88*/ + 1, 2, 5, 7, 0, 3, 4, 6, /* 89*/ + 0, 2, 5, 7, 1, 3, 4, 6, /* 90*/ + 2, 5, 7, 0, 1, 3, 4, 6, /* 91*/ + 0, 1, 5, 7, 2, 3, 4, 6, /* 92*/ + 1, 5, 7, 0, 2, 3, 4, 6, /* 93*/ + 0, 5, 7, 1, 2, 3, 4, 6, /* 94*/ + 5, 7, 0, 1, 2, 3, 4, 6, /* 95*/ + 0, 1, 2, 3, 4, 7, 5, 6, /* 96*/ + 1, 2, 3, 4, 7, 0, 5, 6, /* 97*/ + 0, 2, 3, 4, 7, 1, 5, 6, /* 98*/ + 2, 3, 4, 7, 0, 1, 5, 6, /* 99*/ + 0, 1, 3, 4, 7, 2, 5, 6, /* 100*/ + 1, 3, 4, 7, 0, 2, 5, 6, /* 101*/ + 0, 3, 4, 7, 1, 2, 5, 6, /* 102*/ + 3, 4, 7, 0, 1, 2, 5, 6, /* 103*/ + 0, 1, 2, 4, 7, 3, 5, 6, /* 104*/ + 1, 2, 4, 7, 0, 3, 5, 6, /* 105*/ + 0, 2, 4, 7, 1, 3, 5, 6, /* 106*/ + 2, 4, 7, 0, 1, 3, 5, 6, /* 107*/ + 0, 1, 4, 7, 2, 3, 5, 6, /* 108*/ + 1, 4, 7, 0, 2, 3, 5, 6, /* 109*/ + 0, 4, 7, 1, 2, 3, 5, 6, /* 110*/ + 4, 7, 0, 1, 2, 3, 5, 6, /* 111*/ + 0, 1, 2, 3, 7, 4, 5, 6, /* 112*/ + 1, 2, 3, 7, 0, 4, 5, 6, /* 113*/ + 0, 2, 3, 7, 1, 4, 5, 6, /* 114*/ + 2, 3, 7, 0, 1, 4, 5, 6, /* 115*/ + 0, 1, 3, 7, 2, 4, 5, 6, /* 116*/ + 1, 3, 7, 0, 2, 4, 5, 6, /* 117*/ + 0, 3, 7, 1, 2, 4, 5, 6, /* 118*/ + 3, 7, 0, 1, 2, 4, 5, 6, /* 119*/ + 0, 1, 2, 7, 3, 4, 5, 6, /* 120*/ + 1, 2, 7, 0, 3, 4, 5, 6, /* 121*/ + 0, 2, 7, 1, 3, 4, 5, 6, /* 122*/ + 2, 7, 0, 1, 3, 4, 5, 6, /* 123*/ + 0, 1, 7, 2, 3, 4, 5, 6, /* 124*/ + 1, 7, 0, 2, 3, 4, 5, 6, /* 125*/ + 0, 7, 1, 2, 3, 4, 5, 6, /* 126*/ + 7, 0, 1, 2, 3, 4, 5, 6, /* 127*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 128*/ + 1, 2, 3, 4, 5, 6, 0, 7, /* 129*/ + 0, 2, 3, 4, 5, 6, 1, 7, /* 130*/ + 2, 3, 4, 5, 6, 0, 1, 7, /* 131*/ + 0, 1, 3, 4, 5, 6, 2, 7, /* 132*/ + 1, 3, 4, 5, 6, 0, 2, 7, /* 133*/ + 0, 3, 4, 5, 6, 1, 2, 7, /* 134*/ + 3, 4, 5, 6, 0, 1, 2, 7, /* 135*/ + 0, 1, 2, 4, 5, 6, 3, 7, /* 136*/ + 1, 2, 4, 5, 6, 0, 3, 7, /* 137*/ + 0, 2, 4, 5, 6, 1, 3, 7, /* 138*/ + 2, 4, 5, 6, 0, 1, 3, 7, /* 139*/ + 0, 1, 4, 5, 6, 2, 3, 7, /* 140*/ + 1, 4, 5, 6, 0, 2, 3, 7, /* 141*/ + 0, 4, 5, 6, 1, 2, 3, 7, /* 142*/ + 4, 5, 6, 0, 1, 2, 3, 7, /* 143*/ + 0, 1, 2, 3, 5, 6, 4, 7, /* 144*/ + 1, 2, 3, 5, 6, 0, 4, 7, /* 145*/ + 0, 2, 3, 5, 6, 1, 4, 7, /* 146*/ + 2, 3, 5, 6, 0, 1, 4, 7, /* 147*/ + 0, 1, 3, 5, 6, 2, 4, 7, /* 148*/ + 1, 3, 5, 6, 0, 2, 4, 7, /* 149*/ + 0, 3, 5, 6, 1, 2, 4, 7, /* 150*/ + 3, 5, 6, 0, 1, 2, 4, 7, /* 151*/ + 0, 1, 2, 5, 6, 3, 4, 7, /* 152*/ + 1, 2, 5, 6, 0, 3, 4, 7, /* 153*/ + 0, 2, 5, 6, 1, 3, 4, 7, /* 154*/ + 2, 5, 6, 0, 1, 3, 4, 7, /* 155*/ + 0, 1, 5, 6, 2, 3, 4, 7, /* 156*/ + 1, 5, 6, 0, 2, 3, 4, 7, /* 157*/ + 0, 5, 6, 1, 2, 3, 4, 7, /* 158*/ + 5, 6, 0, 1, 2, 3, 4, 7, /* 159*/ + 0, 1, 2, 3, 4, 6, 5, 7, /* 160*/ + 1, 2, 3, 4, 6, 0, 5, 7, /* 161*/ + 0, 2, 3, 4, 6, 1, 5, 7, /* 162*/ + 2, 3, 4, 6, 0, 1, 5, 7, /* 163*/ + 0, 1, 3, 4, 6, 2, 5, 7, /* 164*/ + 1, 3, 4, 6, 0, 2, 5, 7, /* 165*/ + 0, 3, 4, 6, 1, 2, 5, 7, /* 166*/ + 3, 4, 6, 0, 1, 2, 5, 7, /* 167*/ + 0, 1, 2, 4, 6, 3, 5, 7, /* 168*/ + 1, 2, 4, 6, 0, 3, 5, 7, /* 169*/ + 0, 2, 4, 6, 1, 3, 5, 7, /* 170*/ + 2, 4, 6, 0, 1, 3, 5, 7, /* 171*/ + 0, 1, 4, 6, 2, 3, 5, 7, /* 172*/ + 1, 4, 6, 0, 2, 3, 5, 7, /* 173*/ + 0, 4, 6, 1, 2, 3, 5, 7, /* 174*/ + 4, 6, 0, 1, 2, 3, 5, 7, /* 175*/ + 0, 1, 2, 3, 6, 4, 5, 7, /* 176*/ + 1, 2, 3, 6, 0, 4, 5, 7, /* 177*/ + 0, 2, 3, 6, 1, 4, 5, 7, /* 178*/ + 2, 3, 6, 0, 1, 4, 5, 7, /* 179*/ + 0, 1, 3, 6, 2, 4, 5, 7, /* 180*/ + 1, 3, 6, 0, 2, 4, 5, 7, /* 181*/ + 0, 3, 6, 1, 2, 4, 5, 7, /* 182*/ + 3, 6, 0, 1, 2, 4, 5, 7, /* 183*/ + 0, 1, 2, 6, 3, 4, 5, 7, /* 184*/ + 1, 2, 6, 0, 3, 4, 5, 7, /* 185*/ + 0, 2, 6, 1, 3, 4, 5, 7, /* 186*/ + 2, 6, 0, 1, 3, 4, 5, 7, /* 187*/ + 0, 1, 6, 2, 3, 4, 5, 7, /* 188*/ + 1, 6, 0, 2, 3, 4, 5, 7, /* 189*/ + 0, 6, 1, 2, 3, 4, 5, 7, /* 190*/ + 6, 0, 1, 2, 3, 4, 5, 7, /* 191*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 192*/ + 1, 2, 3, 4, 5, 0, 6, 7, /* 193*/ + 0, 2, 3, 4, 5, 1, 6, 7, /* 194*/ + 2, 3, 4, 5, 0, 1, 6, 7, /* 195*/ + 0, 1, 3, 4, 5, 2, 6, 7, /* 196*/ + 1, 3, 4, 5, 0, 2, 6, 7, /* 197*/ + 0, 3, 4, 5, 1, 2, 6, 7, /* 198*/ + 3, 4, 5, 0, 1, 2, 6, 7, /* 199*/ + 0, 1, 2, 4, 5, 3, 6, 7, /* 200*/ + 1, 2, 4, 5, 0, 3, 6, 7, /* 201*/ + 0, 2, 4, 5, 1, 3, 6, 7, /* 202*/ + 2, 4, 5, 0, 1, 3, 6, 7, /* 203*/ + 0, 1, 4, 5, 2, 3, 6, 7, /* 204*/ + 1, 4, 5, 0, 2, 3, 6, 7, /* 205*/ + 0, 4, 5, 1, 2, 3, 6, 7, /* 206*/ + 4, 5, 0, 1, 2, 3, 6, 7, /* 207*/ + 0, 1, 2, 3, 5, 4, 6, 7, /* 208*/ + 1, 2, 3, 5, 0, 4, 6, 7, /* 209*/ + 0, 2, 3, 5, 1, 4, 6, 7, /* 210*/ + 2, 3, 5, 0, 1, 4, 6, 7, /* 211*/ + 0, 1, 3, 5, 2, 4, 6, 7, /* 212*/ + 1, 3, 5, 0, 2, 4, 6, 7, /* 213*/ + 0, 3, 5, 1, 2, 4, 6, 7, /* 214*/ + 3, 5, 0, 1, 2, 4, 6, 7, /* 215*/ + 0, 1, 2, 5, 3, 4, 6, 7, /* 216*/ + 1, 2, 5, 0, 3, 4, 6, 7, /* 217*/ + 0, 2, 5, 1, 3, 4, 6, 7, /* 218*/ + 2, 5, 0, 1, 3, 4, 6, 7, /* 219*/ + 0, 1, 5, 2, 3, 4, 6, 7, /* 220*/ + 1, 5, 0, 2, 3, 4, 6, 7, /* 221*/ + 0, 5, 1, 2, 3, 4, 6, 7, /* 222*/ + 5, 0, 1, 2, 3, 4, 6, 7, /* 223*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 224*/ + 1, 2, 3, 4, 0, 5, 6, 7, /* 225*/ + 0, 2, 3, 4, 1, 5, 6, 7, /* 226*/ + 2, 3, 4, 0, 1, 5, 6, 7, /* 227*/ + 0, 1, 3, 4, 2, 5, 6, 7, /* 228*/ + 1, 3, 4, 0, 2, 5, 6, 7, /* 229*/ + 0, 3, 4, 1, 2, 5, 6, 7, /* 230*/ + 3, 4, 0, 1, 2, 5, 6, 7, /* 231*/ + 0, 1, 2, 4, 3, 5, 6, 7, /* 232*/ + 1, 2, 4, 0, 3, 5, 6, 7, /* 233*/ + 0, 2, 4, 1, 3, 5, 6, 7, /* 234*/ + 2, 4, 0, 1, 3, 5, 6, 7, /* 235*/ + 0, 1, 4, 2, 3, 5, 6, 7, /* 236*/ + 1, 4, 0, 2, 3, 5, 6, 7, /* 237*/ + 0, 4, 1, 2, 3, 5, 6, 7, /* 238*/ + 4, 0, 1, 2, 3, 5, 6, 7, /* 239*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 240*/ + 1, 2, 3, 0, 4, 5, 6, 7, /* 241*/ + 0, 2, 3, 1, 4, 5, 6, 7, /* 242*/ + 2, 3, 0, 1, 4, 5, 6, 7, /* 243*/ + 0, 1, 3, 2, 4, 5, 6, 7, /* 244*/ + 1, 3, 0, 2, 4, 5, 6, 7, /* 245*/ + 0, 3, 1, 2, 4, 5, 6, 7, /* 246*/ + 3, 0, 1, 2, 4, 5, 6, 7, /* 247*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 248*/ + 1, 2, 0, 3, 4, 5, 6, 7, /* 249*/ + 0, 2, 1, 3, 4, 5, 6, 7, /* 250*/ + 2, 0, 1, 3, 4, 5, 6, 7, /* 251*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 252*/ + 1, 0, 2, 3, 4, 5, 6, 7, /* 253*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 254*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 255*/ +}; + + +static FORCE_INLINE __m256i get_permutation_vector(int pvbyte) { +#ifdef BYTE_PATTERN + __m256i shufm; + asm volatile ( + "vpmovzxbd (%1), %0" + : "=X" (shufm) + : "r" (reverseshufflemask + 8 * pvbyte) + ); + + return shufm; +#else + return _mm256_load_si256((__m256i *)(reverseshufflemask + 8 * pvbyte)); +#endif +} + + +static uint32_t avx_pivot_on_last_value(int32_t *array, size_t length) { + /* we run through the data. Anything in [0,boundary) is smaller or equal + * than the pivot, and the value at boundary - 1 is going to be equal to the + * pivot at the end, + * anything in (boundary, i) is greater than the pivot + * stuff in [i,...) is grey + * the function returns the location of the boundary. + */ + if (length <= 1) + return 1; + { // we exchange the last value for the middle value for a better pivot + int32_t ival = array[length / 2]; + int32_t bval = array[length - 1]; + array[length / 2] = bval; + array[length - 1] = ival; + } +#if WITH_RUNTIME_STATS + statistics.partition_calls += 1; + statistics.items_processed += length; +#endif + uint32_t boundary = 0; + uint32_t i = 0; + int32_t pivot = array[length - 1]; // we always pick the pivot at the end + const __m256i P = _mm256_set1_epi32(pivot); + while ( i + 8 + 1 <= length) { + __m256i allgrey = _mm256_lddqu_si256((__m256i *)(array + i)); + int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(allgrey, P)); +#if WITH_RUNTIME_STATS && defined(WITH_PVBYTE_HISTOGRAM) + statistics.pvbyte_histogram.hit(pvbyte); +#endif + if(pvbyte == 0) { // might be frequent + i += 8; //nothing to do + boundary = i; + } else if (pvbyte == 0xFF) { // called once + boundary = i; + i += 8; + break; // exit + } else { + + // hot path + switch (pvbyte) { + // for pvbyte = 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff + // there is no change in order, just advance boundary + // Note: case 0x00 & 0xff are already handled + case 0x80: i += 8 - 1; break; + case 0xc0: i += 8 - 2; break; + case 0xe0: i += 8 - 3; break; + case 0xf0: i += 8 - 4; break; + case 0xf8: i += 8 - 5; break; + case 0xfc: i += 8 - 6; break; + case 0xfe: i += 8 - 7; break; + + default: { + uint32_t cnt = + 8 - _mm_popcnt_u32(pvbyte); // might be faster with table look-up? + __m256i blackthenwhite = _mm256_permutevar8x32_epi32(allgrey, get_permutation_vector(pvbyte)); + _mm256_storeu_si256((__m256i *)(array + i), blackthenwhite); + i += cnt; + } + } // switch + + boundary = i; // this doesn't need updating each and every time + } + } + for (; i + 8 + 1 <= length ;) { + __m256i allgrey = + _mm256_lddqu_si256((__m256i *)(array + i)); // this is all grey + int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(allgrey, P)); + if (pvbyte == 0xFF) { // called once + // nothing to do + } else { + + uint32_t cnt = + 8 - _mm_popcnt_u32(pvbyte); // might be faster with table look-up? + __m256i allwhite = _mm256_lddqu_si256( + (__m256i *)(array + boundary)); // this is all white + // we shuffle allgrey so that the first part is black and the second part + // is white + __m256i blackthenwhite = _mm256_permutevar8x32_epi32(allgrey, get_permutation_vector(pvbyte)); + _mm256_storeu_si256((__m256i *)(array + boundary), blackthenwhite); + _mm256_storeu_si256((__m256i *)(array + i), allwhite); + boundary += cnt; // might be faster with table look-up? + } + i += 8; + } + while (i + 1 < length) { + int32_t ival = array[i]; + if (ival <= pivot) { + int32_t bval = array[boundary]; + array[i] = bval; + array[boundary] = ival; + boundary++; + } + i++; + } + int32_t ival = array[i]; + int32_t bval = array[boundary]; + array[length - 1] = bval; + array[boundary] = ival; + boundary++; + return boundary; +} + +// for fallback +void scalar_partition(int32_t* array, const int32_t pivot, int& left, int& right) { + + while (left <= right) { + while (array[left] < pivot) { + left += 1; + } + while (array[right] > pivot) { + right -= 1; + } + if (left <= right) { + const uint32_t t = array[left]; + array[left] = array[right]; + array[right] = t; + left += 1; + right -= 1; + } + } +} + +//fallback +void scalar_quicksort(int32_t* array, int left, int right) { +#ifdef WITH_RUNTIME_STATS + statistics.scalar__partition_calls += 1; + statistics.scalar__items_processed += right - left + 1; +#endif + int i = left; + int j = right; + const int32_t pivot = array[(i + j)/2]; + scalar_partition(array, pivot, i, j); + if (left < j) { + scalar_quicksort(array, left, j); + } + if (i < right) { + scalar_quicksort(array, i, right); + } +} + +void avx2_pivotonlast_sort(int32_t *array, const uint32_t length) { + uint32_t sep = avx_pivot_on_last_value(array, length); + if(sep == length) { + // we have an ineffective pivot. Let us give up. + if(length > 1) scalar_quicksort(array,0,length - 1); + } else { + if (sep > 2) { + avx2_pivotonlast_sort(array, sep - 1); + } + if (sep + 1 < length) { + avx2_pivotonlast_sort(array + sep, length - sep); + } + } +} +void wrapped_avx2_pivotonlast_sort(uint32_t *array, int left, int right) { + avx2_pivotonlast_sort((int32_t *)array + left, right - left + 1); +} diff --git a/simd-sort/avx2-nate-quicksort.cpp b/simd-sort/avx2-nate-quicksort.cpp new file mode 100644 index 0000000..7d91931 --- /dev/null +++ b/simd-sort/avx2-nate-quicksort.cpp @@ -0,0 +1,436 @@ +#pragma once + +#include +#include + +namespace nate { + +const int AVX2_REGISTER_SIZE = 8; // in 32-bit words + +/* + Algorithm of vectorized partition by Nathan Kurz. + Everything else was borrowed from Daniel's code (avx2-altquicksort.h) +*/ + +static uint32_t reverseshufflemask[256 * 8] __attribute__((aligned(0x100))) = { + 0, 1, 2, 3, 4, 5, 6, 7, /* 0*/ + 1, 2, 3, 4, 5, 6, 7, 0, /* 1*/ + 0, 2, 3, 4, 5, 6, 7, 1, /* 2*/ + 2, 3, 4, 5, 6, 7, 0, 1, /* 3*/ + 0, 1, 3, 4, 5, 6, 7, 2, /* 4*/ + 1, 3, 4, 5, 6, 7, 0, 2, /* 5*/ + 0, 3, 4, 5, 6, 7, 1, 2, /* 6*/ + 3, 4, 5, 6, 7, 0, 1, 2, /* 7*/ + 0, 1, 2, 4, 5, 6, 7, 3, /* 8*/ + 1, 2, 4, 5, 6, 7, 0, 3, /* 9*/ + 0, 2, 4, 5, 6, 7, 1, 3, /* 10*/ + 2, 4, 5, 6, 7, 0, 1, 3, /* 11*/ + 0, 1, 4, 5, 6, 7, 2, 3, /* 12*/ + 1, 4, 5, 6, 7, 0, 2, 3, /* 13*/ + 0, 4, 5, 6, 7, 1, 2, 3, /* 14*/ + 4, 5, 6, 7, 0, 1, 2, 3, /* 15*/ + 0, 1, 2, 3, 5, 6, 7, 4, /* 16*/ + 1, 2, 3, 5, 6, 7, 0, 4, /* 17*/ + 0, 2, 3, 5, 6, 7, 1, 4, /* 18*/ + 2, 3, 5, 6, 7, 0, 1, 4, /* 19*/ + 0, 1, 3, 5, 6, 7, 2, 4, /* 20*/ + 1, 3, 5, 6, 7, 0, 2, 4, /* 21*/ + 0, 3, 5, 6, 7, 1, 2, 4, /* 22*/ + 3, 5, 6, 7, 0, 1, 2, 4, /* 23*/ + 0, 1, 2, 5, 6, 7, 3, 4, /* 24*/ + 1, 2, 5, 6, 7, 0, 3, 4, /* 25*/ + 0, 2, 5, 6, 7, 1, 3, 4, /* 26*/ + 2, 5, 6, 7, 0, 1, 3, 4, /* 27*/ + 0, 1, 5, 6, 7, 2, 3, 4, /* 28*/ + 1, 5, 6, 7, 0, 2, 3, 4, /* 29*/ + 0, 5, 6, 7, 1, 2, 3, 4, /* 30*/ + 5, 6, 7, 0, 1, 2, 3, 4, /* 31*/ + 0, 1, 2, 3, 4, 6, 7, 5, /* 32*/ + 1, 2, 3, 4, 6, 7, 0, 5, /* 33*/ + 0, 2, 3, 4, 6, 7, 1, 5, /* 34*/ + 2, 3, 4, 6, 7, 0, 1, 5, /* 35*/ + 0, 1, 3, 4, 6, 7, 2, 5, /* 36*/ + 1, 3, 4, 6, 7, 0, 2, 5, /* 37*/ + 0, 3, 4, 6, 7, 1, 2, 5, /* 38*/ + 3, 4, 6, 7, 0, 1, 2, 5, /* 39*/ + 0, 1, 2, 4, 6, 7, 3, 5, /* 40*/ + 1, 2, 4, 6, 7, 0, 3, 5, /* 41*/ + 0, 2, 4, 6, 7, 1, 3, 5, /* 42*/ + 2, 4, 6, 7, 0, 1, 3, 5, /* 43*/ + 0, 1, 4, 6, 7, 2, 3, 5, /* 44*/ + 1, 4, 6, 7, 0, 2, 3, 5, /* 45*/ + 0, 4, 6, 7, 1, 2, 3, 5, /* 46*/ + 4, 6, 7, 0, 1, 2, 3, 5, /* 47*/ + 0, 1, 2, 3, 6, 7, 4, 5, /* 48*/ + 1, 2, 3, 6, 7, 0, 4, 5, /* 49*/ + 0, 2, 3, 6, 7, 1, 4, 5, /* 50*/ + 2, 3, 6, 7, 0, 1, 4, 5, /* 51*/ + 0, 1, 3, 6, 7, 2, 4, 5, /* 52*/ + 1, 3, 6, 7, 0, 2, 4, 5, /* 53*/ + 0, 3, 6, 7, 1, 2, 4, 5, /* 54*/ + 3, 6, 7, 0, 1, 2, 4, 5, /* 55*/ + 0, 1, 2, 6, 7, 3, 4, 5, /* 56*/ + 1, 2, 6, 7, 0, 3, 4, 5, /* 57*/ + 0, 2, 6, 7, 1, 3, 4, 5, /* 58*/ + 2, 6, 7, 0, 1, 3, 4, 5, /* 59*/ + 0, 1, 6, 7, 2, 3, 4, 5, /* 60*/ + 1, 6, 7, 0, 2, 3, 4, 5, /* 61*/ + 0, 6, 7, 1, 2, 3, 4, 5, /* 62*/ + 6, 7, 0, 1, 2, 3, 4, 5, /* 63*/ + 0, 1, 2, 3, 4, 5, 7, 6, /* 64*/ + 1, 2, 3, 4, 5, 7, 0, 6, /* 65*/ + 0, 2, 3, 4, 5, 7, 1, 6, /* 66*/ + 2, 3, 4, 5, 7, 0, 1, 6, /* 67*/ + 0, 1, 3, 4, 5, 7, 2, 6, /* 68*/ + 1, 3, 4, 5, 7, 0, 2, 6, /* 69*/ + 0, 3, 4, 5, 7, 1, 2, 6, /* 70*/ + 3, 4, 5, 7, 0, 1, 2, 6, /* 71*/ + 0, 1, 2, 4, 5, 7, 3, 6, /* 72*/ + 1, 2, 4, 5, 7, 0, 3, 6, /* 73*/ + 0, 2, 4, 5, 7, 1, 3, 6, /* 74*/ + 2, 4, 5, 7, 0, 1, 3, 6, /* 75*/ + 0, 1, 4, 5, 7, 2, 3, 6, /* 76*/ + 1, 4, 5, 7, 0, 2, 3, 6, /* 77*/ + 0, 4, 5, 7, 1, 2, 3, 6, /* 78*/ + 4, 5, 7, 0, 1, 2, 3, 6, /* 79*/ + 0, 1, 2, 3, 5, 7, 4, 6, /* 80*/ + 1, 2, 3, 5, 7, 0, 4, 6, /* 81*/ + 0, 2, 3, 5, 7, 1, 4, 6, /* 82*/ + 2, 3, 5, 7, 0, 1, 4, 6, /* 83*/ + 0, 1, 3, 5, 7, 2, 4, 6, /* 84*/ + 1, 3, 5, 7, 0, 2, 4, 6, /* 85*/ + 0, 3, 5, 7, 1, 2, 4, 6, /* 86*/ + 3, 5, 7, 0, 1, 2, 4, 6, /* 87*/ + 0, 1, 2, 5, 7, 3, 4, 6, /* 88*/ + 1, 2, 5, 7, 0, 3, 4, 6, /* 89*/ + 0, 2, 5, 7, 1, 3, 4, 6, /* 90*/ + 2, 5, 7, 0, 1, 3, 4, 6, /* 91*/ + 0, 1, 5, 7, 2, 3, 4, 6, /* 92*/ + 1, 5, 7, 0, 2, 3, 4, 6, /* 93*/ + 0, 5, 7, 1, 2, 3, 4, 6, /* 94*/ + 5, 7, 0, 1, 2, 3, 4, 6, /* 95*/ + 0, 1, 2, 3, 4, 7, 5, 6, /* 96*/ + 1, 2, 3, 4, 7, 0, 5, 6, /* 97*/ + 0, 2, 3, 4, 7, 1, 5, 6, /* 98*/ + 2, 3, 4, 7, 0, 1, 5, 6, /* 99*/ + 0, 1, 3, 4, 7, 2, 5, 6, /* 100*/ + 1, 3, 4, 7, 0, 2, 5, 6, /* 101*/ + 0, 3, 4, 7, 1, 2, 5, 6, /* 102*/ + 3, 4, 7, 0, 1, 2, 5, 6, /* 103*/ + 0, 1, 2, 4, 7, 3, 5, 6, /* 104*/ + 1, 2, 4, 7, 0, 3, 5, 6, /* 105*/ + 0, 2, 4, 7, 1, 3, 5, 6, /* 106*/ + 2, 4, 7, 0, 1, 3, 5, 6, /* 107*/ + 0, 1, 4, 7, 2, 3, 5, 6, /* 108*/ + 1, 4, 7, 0, 2, 3, 5, 6, /* 109*/ + 0, 4, 7, 1, 2, 3, 5, 6, /* 110*/ + 4, 7, 0, 1, 2, 3, 5, 6, /* 111*/ + 0, 1, 2, 3, 7, 4, 5, 6, /* 112*/ + 1, 2, 3, 7, 0, 4, 5, 6, /* 113*/ + 0, 2, 3, 7, 1, 4, 5, 6, /* 114*/ + 2, 3, 7, 0, 1, 4, 5, 6, /* 115*/ + 0, 1, 3, 7, 2, 4, 5, 6, /* 116*/ + 1, 3, 7, 0, 2, 4, 5, 6, /* 117*/ + 0, 3, 7, 1, 2, 4, 5, 6, /* 118*/ + 3, 7, 0, 1, 2, 4, 5, 6, /* 119*/ + 0, 1, 2, 7, 3, 4, 5, 6, /* 120*/ + 1, 2, 7, 0, 3, 4, 5, 6, /* 121*/ + 0, 2, 7, 1, 3, 4, 5, 6, /* 122*/ + 2, 7, 0, 1, 3, 4, 5, 6, /* 123*/ + 0, 1, 7, 2, 3, 4, 5, 6, /* 124*/ + 1, 7, 0, 2, 3, 4, 5, 6, /* 125*/ + 0, 7, 1, 2, 3, 4, 5, 6, /* 126*/ + 7, 0, 1, 2, 3, 4, 5, 6, /* 127*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 128*/ + 1, 2, 3, 4, 5, 6, 0, 7, /* 129*/ + 0, 2, 3, 4, 5, 6, 1, 7, /* 130*/ + 2, 3, 4, 5, 6, 0, 1, 7, /* 131*/ + 0, 1, 3, 4, 5, 6, 2, 7, /* 132*/ + 1, 3, 4, 5, 6, 0, 2, 7, /* 133*/ + 0, 3, 4, 5, 6, 1, 2, 7, /* 134*/ + 3, 4, 5, 6, 0, 1, 2, 7, /* 135*/ + 0, 1, 2, 4, 5, 6, 3, 7, /* 136*/ + 1, 2, 4, 5, 6, 0, 3, 7, /* 137*/ + 0, 2, 4, 5, 6, 1, 3, 7, /* 138*/ + 2, 4, 5, 6, 0, 1, 3, 7, /* 139*/ + 0, 1, 4, 5, 6, 2, 3, 7, /* 140*/ + 1, 4, 5, 6, 0, 2, 3, 7, /* 141*/ + 0, 4, 5, 6, 1, 2, 3, 7, /* 142*/ + 4, 5, 6, 0, 1, 2, 3, 7, /* 143*/ + 0, 1, 2, 3, 5, 6, 4, 7, /* 144*/ + 1, 2, 3, 5, 6, 0, 4, 7, /* 145*/ + 0, 2, 3, 5, 6, 1, 4, 7, /* 146*/ + 2, 3, 5, 6, 0, 1, 4, 7, /* 147*/ + 0, 1, 3, 5, 6, 2, 4, 7, /* 148*/ + 1, 3, 5, 6, 0, 2, 4, 7, /* 149*/ + 0, 3, 5, 6, 1, 2, 4, 7, /* 150*/ + 3, 5, 6, 0, 1, 2, 4, 7, /* 151*/ + 0, 1, 2, 5, 6, 3, 4, 7, /* 152*/ + 1, 2, 5, 6, 0, 3, 4, 7, /* 153*/ + 0, 2, 5, 6, 1, 3, 4, 7, /* 154*/ + 2, 5, 6, 0, 1, 3, 4, 7, /* 155*/ + 0, 1, 5, 6, 2, 3, 4, 7, /* 156*/ + 1, 5, 6, 0, 2, 3, 4, 7, /* 157*/ + 0, 5, 6, 1, 2, 3, 4, 7, /* 158*/ + 5, 6, 0, 1, 2, 3, 4, 7, /* 159*/ + 0, 1, 2, 3, 4, 6, 5, 7, /* 160*/ + 1, 2, 3, 4, 6, 0, 5, 7, /* 161*/ + 0, 2, 3, 4, 6, 1, 5, 7, /* 162*/ + 2, 3, 4, 6, 0, 1, 5, 7, /* 163*/ + 0, 1, 3, 4, 6, 2, 5, 7, /* 164*/ + 1, 3, 4, 6, 0, 2, 5, 7, /* 165*/ + 0, 3, 4, 6, 1, 2, 5, 7, /* 166*/ + 3, 4, 6, 0, 1, 2, 5, 7, /* 167*/ + 0, 1, 2, 4, 6, 3, 5, 7, /* 168*/ + 1, 2, 4, 6, 0, 3, 5, 7, /* 169*/ + 0, 2, 4, 6, 1, 3, 5, 7, /* 170*/ + 2, 4, 6, 0, 1, 3, 5, 7, /* 171*/ + 0, 1, 4, 6, 2, 3, 5, 7, /* 172*/ + 1, 4, 6, 0, 2, 3, 5, 7, /* 173*/ + 0, 4, 6, 1, 2, 3, 5, 7, /* 174*/ + 4, 6, 0, 1, 2, 3, 5, 7, /* 175*/ + 0, 1, 2, 3, 6, 4, 5, 7, /* 176*/ + 1, 2, 3, 6, 0, 4, 5, 7, /* 177*/ + 0, 2, 3, 6, 1, 4, 5, 7, /* 178*/ + 2, 3, 6, 0, 1, 4, 5, 7, /* 179*/ + 0, 1, 3, 6, 2, 4, 5, 7, /* 180*/ + 1, 3, 6, 0, 2, 4, 5, 7, /* 181*/ + 0, 3, 6, 1, 2, 4, 5, 7, /* 182*/ + 3, 6, 0, 1, 2, 4, 5, 7, /* 183*/ + 0, 1, 2, 6, 3, 4, 5, 7, /* 184*/ + 1, 2, 6, 0, 3, 4, 5, 7, /* 185*/ + 0, 2, 6, 1, 3, 4, 5, 7, /* 186*/ + 2, 6, 0, 1, 3, 4, 5, 7, /* 187*/ + 0, 1, 6, 2, 3, 4, 5, 7, /* 188*/ + 1, 6, 0, 2, 3, 4, 5, 7, /* 189*/ + 0, 6, 1, 2, 3, 4, 5, 7, /* 190*/ + 6, 0, 1, 2, 3, 4, 5, 7, /* 191*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 192*/ + 1, 2, 3, 4, 5, 0, 6, 7, /* 193*/ + 0, 2, 3, 4, 5, 1, 6, 7, /* 194*/ + 2, 3, 4, 5, 0, 1, 6, 7, /* 195*/ + 0, 1, 3, 4, 5, 2, 6, 7, /* 196*/ + 1, 3, 4, 5, 0, 2, 6, 7, /* 197*/ + 0, 3, 4, 5, 1, 2, 6, 7, /* 198*/ + 3, 4, 5, 0, 1, 2, 6, 7, /* 199*/ + 0, 1, 2, 4, 5, 3, 6, 7, /* 200*/ + 1, 2, 4, 5, 0, 3, 6, 7, /* 201*/ + 0, 2, 4, 5, 1, 3, 6, 7, /* 202*/ + 2, 4, 5, 0, 1, 3, 6, 7, /* 203*/ + 0, 1, 4, 5, 2, 3, 6, 7, /* 204*/ + 1, 4, 5, 0, 2, 3, 6, 7, /* 205*/ + 0, 4, 5, 1, 2, 3, 6, 7, /* 206*/ + 4, 5, 0, 1, 2, 3, 6, 7, /* 207*/ + 0, 1, 2, 3, 5, 4, 6, 7, /* 208*/ + 1, 2, 3, 5, 0, 4, 6, 7, /* 209*/ + 0, 2, 3, 5, 1, 4, 6, 7, /* 210*/ + 2, 3, 5, 0, 1, 4, 6, 7, /* 211*/ + 0, 1, 3, 5, 2, 4, 6, 7, /* 212*/ + 1, 3, 5, 0, 2, 4, 6, 7, /* 213*/ + 0, 3, 5, 1, 2, 4, 6, 7, /* 214*/ + 3, 5, 0, 1, 2, 4, 6, 7, /* 215*/ + 0, 1, 2, 5, 3, 4, 6, 7, /* 216*/ + 1, 2, 5, 0, 3, 4, 6, 7, /* 217*/ + 0, 2, 5, 1, 3, 4, 6, 7, /* 218*/ + 2, 5, 0, 1, 3, 4, 6, 7, /* 219*/ + 0, 1, 5, 2, 3, 4, 6, 7, /* 220*/ + 1, 5, 0, 2, 3, 4, 6, 7, /* 221*/ + 0, 5, 1, 2, 3, 4, 6, 7, /* 222*/ + 5, 0, 1, 2, 3, 4, 6, 7, /* 223*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 224*/ + 1, 2, 3, 4, 0, 5, 6, 7, /* 225*/ + 0, 2, 3, 4, 1, 5, 6, 7, /* 226*/ + 2, 3, 4, 0, 1, 5, 6, 7, /* 227*/ + 0, 1, 3, 4, 2, 5, 6, 7, /* 228*/ + 1, 3, 4, 0, 2, 5, 6, 7, /* 229*/ + 0, 3, 4, 1, 2, 5, 6, 7, /* 230*/ + 3, 4, 0, 1, 2, 5, 6, 7, /* 231*/ + 0, 1, 2, 4, 3, 5, 6, 7, /* 232*/ + 1, 2, 4, 0, 3, 5, 6, 7, /* 233*/ + 0, 2, 4, 1, 3, 5, 6, 7, /* 234*/ + 2, 4, 0, 1, 3, 5, 6, 7, /* 235*/ + 0, 1, 4, 2, 3, 5, 6, 7, /* 236*/ + 1, 4, 0, 2, 3, 5, 6, 7, /* 237*/ + 0, 4, 1, 2, 3, 5, 6, 7, /* 238*/ + 4, 0, 1, 2, 3, 5, 6, 7, /* 239*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 240*/ + 1, 2, 3, 0, 4, 5, 6, 7, /* 241*/ + 0, 2, 3, 1, 4, 5, 6, 7, /* 242*/ + 2, 3, 0, 1, 4, 5, 6, 7, /* 243*/ + 0, 1, 3, 2, 4, 5, 6, 7, /* 244*/ + 1, 3, 0, 2, 4, 5, 6, 7, /* 245*/ + 0, 3, 1, 2, 4, 5, 6, 7, /* 246*/ + 3, 0, 1, 2, 4, 5, 6, 7, /* 247*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 248*/ + 1, 2, 0, 3, 4, 5, 6, 7, /* 249*/ + 0, 2, 1, 3, 4, 5, 6, 7, /* 250*/ + 2, 0, 1, 3, 4, 5, 6, 7, /* 251*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 252*/ + 1, 0, 2, 3, 4, 5, 6, 7, /* 253*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 254*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 255*/ +}; + + +static uint32_t avx_pivot_on_last_value(int32_t *array, size_t length) { + if (length <= 1) + return 1; + + + { // we exchange the last value for the middle value for a better pivot + int32_t ival = array[length / 2]; + int32_t bval = array[length - 1]; + array[length / 2] = bval; + array[length - 1] = ival; + } + + int wh, rh; + int wt, rt; + + rh = 0; + rt = length - 1; + wh = 0; + wt = length - 1; + + __m256i h0 = _mm256_loadu_si256((__m256i*)(array + rh)); + __m256i h1 = _mm256_loadu_si256((__m256i*)(array + rh + AVX2_REGISTER_SIZE)); + rh += 2*AVX2_REGISTER_SIZE; + + __m256i t0 = _mm256_loadu_si256((__m256i*)(array + rt - AVX2_REGISTER_SIZE)); + __m256i t1 = _mm256_loadu_si256((__m256i*)(array + rt - 2*AVX2_REGISTER_SIZE)); + rt -= 3*AVX2_REGISTER_SIZE; + + __m256i current = _mm256_loadu_si256((__m256i*)(array + rh)); + __m256i next; + rh += AVX2_REGISTER_SIZE; + + const int32_t pivot = array[length - 1]; + const __m256i P = _mm256_set1_epi32(pivot); + + // 1. the main loop + while (wh - wt < 4*AVX2_REGISTER_SIZE) { + const bool which = (rh - wh) < (wt - rt); + + // I believe that a compiler will emit branchless code + __m256i* next_ptr = (__m256i*)(which ? array + rh : array + rt); + const int adv_rh = which ? AVX2_REGISTER_SIZE : 0; + const int adv_rt = which ? 0 : AVX2_REGISTER_SIZE; + // My faith ends here + + next = _mm256_loadu_si256(next_ptr); + + const int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(current, P)); + const uint32_t cnt = 8 - _mm_popcnt_u32(pvbyte); + const __m256i shuf = _mm256_load_si256((__m256i *)(reverseshufflemask + 8 * pvbyte)); + + const __m256i ordered = _mm256_permutevar8x32_epi32(current, shuf); + _mm256_storeu_si256((__m256i*)(array + wh), ordered); + _mm256_storeu_si256((__m256i*)(array + wt - 8), ordered); + + rh += adv_rh; + rt -= adv_rt; + wh += cnt; + wt -= 8 - cnt; + + current = next; + } + + // 2. partition remaining part + while (wh - wt > 4*AVX2_REGISTER_SIZE) { + const int32_t v = array[rh++]; + if (v < pivot) { + array[wh++] = v; + } else { + array[wt--] = v; + } + } + + // 3. partition 4 registers loaded in the beginning + static uint32_t tmp[4*AVX2_REGISTER_SIZE]; + _mm256_storeu_si256((__m256i*)(tmp + 0*AVX2_REGISTER_SIZE), h0); + _mm256_storeu_si256((__m256i*)(tmp + 1*AVX2_REGISTER_SIZE), h1); + _mm256_storeu_si256((__m256i*)(tmp + 2*AVX2_REGISTER_SIZE), t0); + _mm256_storeu_si256((__m256i*)(tmp + 3*AVX2_REGISTER_SIZE), t1); + for (int i=0; i < 4*AVX2_REGISTER_SIZE; i++) { + const int32_t v = array[i]; + if (v < pivot) { + array[wh++] = v; + } else { + array[wt--] = v; + } + } + + { + const int32_t a = array[length - 1]; + const int32_t b = array[wh]; + + array[length - 1] = b; + array[wh] = a; + } + + return wh; +} + +// for fallback +void scalar_partition(int32_t* array, const int32_t pivot, int& left, int& right) { + + while (left <= right) { + while (array[left] < pivot) { + left += 1; + } + while (array[right] > pivot) { + right -= 1; + } + if (left <= right) { + const uint32_t t = array[left]; + array[left] = array[right]; + array[right] = t; + left += 1; + right -= 1; + } + } +} + +//fallback +void scalar_quicksort(int32_t* array, int left, int right) { +#ifdef WITH_RUNTIME_STATS + statistics.scalar__partition_calls += 1; + statistics.scalar__items_processed += right - left + 1; +#endif + int i = left; + int j = right; + const int32_t pivot = array[(i + j)/2]; + scalar_partition(array, pivot, i, j); + if (left < j) { + scalar_quicksort(array, left, j); + } + if (i < right) { + scalar_quicksort(array, i, right); + } +} + +void avx2_pivotonlast_sort(int32_t *array, const uint32_t length) { + uint32_t sep; + + if (length > 8*AVX2_REGISTER_SIZE) { + sep = avx_pivot_on_last_value(array, length); + } else { + sep = lomuto_partition_epi32((uint32_t*)array, 0, length - 1); + } + if(sep == length) { + // we have an ineffective pivot. Let us give up. + if(length > 1) scalar_quicksort(array,0,length - 1); + } else { + if (sep > 2) { + avx2_pivotonlast_sort(array, sep - 1); + } + if (sep + 1 < length) { + avx2_pivotonlast_sort(array + sep, length - sep); + } + } +} +void wrapped_avx2_pivotonlast_sort(uint32_t *array, int left, int right) { + avx2_pivotonlast_sort((int32_t *)array + left, right - left + 1); +} + +} // namespace nate diff --git a/simd-sort/avx2-natenodutch-quicksort.h b/simd-sort/avx2-natenodutch-quicksort.h new file mode 100644 index 0000000..4a816b6 --- /dev/null +++ b/simd-sort/avx2-natenodutch-quicksort.h @@ -0,0 +1,409 @@ +#pragma once + +#include +#include +#include +#include "avx2-altquicksort.h" +/** +* This is an alternative approach to SIMD quicksort, based on a description by +* Nathan Kurz. Daniel Lemire implemented it to avoid the Dutch national flag +* problem as an experiment. This is not meant to be fast, as implemented. It is +* an experiment. +*/ + +/* +I'm going to use "vector" as a unit, but it doesn't have to be the +native vector size. In particular, I think we can read multiple +vector register's worth of data with minimal added time per loop +iteration. + +First, read two vectors from the head of the list, and two from the +tail of the list. Store these, either in registers or written to a +buffer. We're not going to use these until the end. We're just +getting them out of the way so that we have more than one vectors +writeable space at the head and tail. + +wh--rh------------------rt--wt- + +wh = write head +rh = read head +rt = read tail +wt = write tail + +Then read one vector from head, and advance rh accordingly. Call it nextVector. + +Loop: + +Set currentVector = nextVector. + +Set nextVectorPtr to either rh or rt depending on which has the least +"writable space" available. That is, if (rh - wh) < (wt - rt), read +from and advance rh, else read from and update rt. These updates +need to be branchless. + +Read nextVector = *nextVectorPtr. + +Partition currentVector (lookup, shuffle) and write to both wh and wt +(tail offset so end hits wt). +Advance wh and wt based on number of valid entries. Use a second byte +lookup issued as early as possible to determine this count. Our +performance is dependent on the latency of this step. + +----- +*/ +static uint32_t shufflemask[256 * 8] __attribute__((aligned(0x100))) = { + 0, 1, 2, 3, 4, 5, 6, 7, /* 0*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 1*/ + 1, 0, 2, 3, 4, 5, 6, 7, /* 2*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 3*/ + 2, 0, 1, 3, 4, 5, 6, 7, /* 4*/ + 0, 2, 1, 3, 4, 5, 6, 7, /* 5*/ + 1, 2, 0, 3, 4, 5, 6, 7, /* 6*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 7*/ + 3, 0, 1, 2, 4, 5, 6, 7, /* 8*/ + 0, 3, 1, 2, 4, 5, 6, 7, /* 9*/ + 1, 3, 0, 2, 4, 5, 6, 7, /* 10*/ + 0, 1, 3, 2, 4, 5, 6, 7, /* 11*/ + 2, 3, 0, 1, 4, 5, 6, 7, /* 12*/ + 0, 2, 3, 1, 4, 5, 6, 7, /* 13*/ + 1, 2, 3, 0, 4, 5, 6, 7, /* 14*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 15*/ + 4, 0, 1, 2, 3, 5, 6, 7, /* 16*/ + 0, 4, 1, 2, 3, 5, 6, 7, /* 17*/ + 1, 4, 0, 2, 3, 5, 6, 7, /* 18*/ + 0, 1, 4, 2, 3, 5, 6, 7, /* 19*/ + 2, 4, 0, 1, 3, 5, 6, 7, /* 20*/ + 0, 2, 4, 1, 3, 5, 6, 7, /* 21*/ + 1, 2, 4, 0, 3, 5, 6, 7, /* 22*/ + 0, 1, 2, 4, 3, 5, 6, 7, /* 23*/ + 3, 4, 0, 1, 2, 5, 6, 7, /* 24*/ + 0, 3, 4, 1, 2, 5, 6, 7, /* 25*/ + 1, 3, 4, 0, 2, 5, 6, 7, /* 26*/ + 0, 1, 3, 4, 2, 5, 6, 7, /* 27*/ + 2, 3, 4, 0, 1, 5, 6, 7, /* 28*/ + 0, 2, 3, 4, 1, 5, 6, 7, /* 29*/ + 1, 2, 3, 4, 0, 5, 6, 7, /* 30*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 31*/ + 5, 0, 1, 2, 3, 4, 6, 7, /* 32*/ + 0, 5, 1, 2, 3, 4, 6, 7, /* 33*/ + 1, 5, 0, 2, 3, 4, 6, 7, /* 34*/ + 0, 1, 5, 2, 3, 4, 6, 7, /* 35*/ + 2, 5, 0, 1, 3, 4, 6, 7, /* 36*/ + 0, 2, 5, 1, 3, 4, 6, 7, /* 37*/ + 1, 2, 5, 0, 3, 4, 6, 7, /* 38*/ + 0, 1, 2, 5, 3, 4, 6, 7, /* 39*/ + 3, 5, 0, 1, 2, 4, 6, 7, /* 40*/ + 0, 3, 5, 1, 2, 4, 6, 7, /* 41*/ + 1, 3, 5, 0, 2, 4, 6, 7, /* 42*/ + 0, 1, 3, 5, 2, 4, 6, 7, /* 43*/ + 2, 3, 5, 0, 1, 4, 6, 7, /* 44*/ + 0, 2, 3, 5, 1, 4, 6, 7, /* 45*/ + 1, 2, 3, 5, 0, 4, 6, 7, /* 46*/ + 0, 1, 2, 3, 5, 4, 6, 7, /* 47*/ + 4, 5, 0, 1, 2, 3, 6, 7, /* 48*/ + 0, 4, 5, 1, 2, 3, 6, 7, /* 49*/ + 1, 4, 5, 0, 2, 3, 6, 7, /* 50*/ + 0, 1, 4, 5, 2, 3, 6, 7, /* 51*/ + 2, 4, 5, 0, 1, 3, 6, 7, /* 52*/ + 0, 2, 4, 5, 1, 3, 6, 7, /* 53*/ + 1, 2, 4, 5, 0, 3, 6, 7, /* 54*/ + 0, 1, 2, 4, 5, 3, 6, 7, /* 55*/ + 3, 4, 5, 0, 1, 2, 6, 7, /* 56*/ + 0, 3, 4, 5, 1, 2, 6, 7, /* 57*/ + 1, 3, 4, 5, 0, 2, 6, 7, /* 58*/ + 0, 1, 3, 4, 5, 2, 6, 7, /* 59*/ + 2, 3, 4, 5, 0, 1, 6, 7, /* 60*/ + 0, 2, 3, 4, 5, 1, 6, 7, /* 61*/ + 1, 2, 3, 4, 5, 0, 6, 7, /* 62*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 63*/ + 6, 0, 1, 2, 3, 4, 5, 7, /* 64*/ + 0, 6, 1, 2, 3, 4, 5, 7, /* 65*/ + 1, 6, 0, 2, 3, 4, 5, 7, /* 66*/ + 0, 1, 6, 2, 3, 4, 5, 7, /* 67*/ + 2, 6, 0, 1, 3, 4, 5, 7, /* 68*/ + 0, 2, 6, 1, 3, 4, 5, 7, /* 69*/ + 1, 2, 6, 0, 3, 4, 5, 7, /* 70*/ + 0, 1, 2, 6, 3, 4, 5, 7, /* 71*/ + 3, 6, 0, 1, 2, 4, 5, 7, /* 72*/ + 0, 3, 6, 1, 2, 4, 5, 7, /* 73*/ + 1, 3, 6, 0, 2, 4, 5, 7, /* 74*/ + 0, 1, 3, 6, 2, 4, 5, 7, /* 75*/ + 2, 3, 6, 0, 1, 4, 5, 7, /* 76*/ + 0, 2, 3, 6, 1, 4, 5, 7, /* 77*/ + 1, 2, 3, 6, 0, 4, 5, 7, /* 78*/ + 0, 1, 2, 3, 6, 4, 5, 7, /* 79*/ + 4, 6, 0, 1, 2, 3, 5, 7, /* 80*/ + 0, 4, 6, 1, 2, 3, 5, 7, /* 81*/ + 1, 4, 6, 0, 2, 3, 5, 7, /* 82*/ + 0, 1, 4, 6, 2, 3, 5, 7, /* 83*/ + 2, 4, 6, 0, 1, 3, 5, 7, /* 84*/ + 0, 2, 4, 6, 1, 3, 5, 7, /* 85*/ + 1, 2, 4, 6, 0, 3, 5, 7, /* 86*/ + 0, 1, 2, 4, 6, 3, 5, 7, /* 87*/ + 3, 4, 6, 0, 1, 2, 5, 7, /* 88*/ + 0, 3, 4, 6, 1, 2, 5, 7, /* 89*/ + 1, 3, 4, 6, 0, 2, 5, 7, /* 90*/ + 0, 1, 3, 4, 6, 2, 5, 7, /* 91*/ + 2, 3, 4, 6, 0, 1, 5, 7, /* 92*/ + 0, 2, 3, 4, 6, 1, 5, 7, /* 93*/ + 1, 2, 3, 4, 6, 0, 5, 7, /* 94*/ + 0, 1, 2, 3, 4, 6, 5, 7, /* 95*/ + 5, 6, 0, 1, 2, 3, 4, 7, /* 96*/ + 0, 5, 6, 1, 2, 3, 4, 7, /* 97*/ + 1, 5, 6, 0, 2, 3, 4, 7, /* 98*/ + 0, 1, 5, 6, 2, 3, 4, 7, /* 99*/ + 2, 5, 6, 0, 1, 3, 4, 7, /* 100*/ + 0, 2, 5, 6, 1, 3, 4, 7, /* 101*/ + 1, 2, 5, 6, 0, 3, 4, 7, /* 102*/ + 0, 1, 2, 5, 6, 3, 4, 7, /* 103*/ + 3, 5, 6, 0, 1, 2, 4, 7, /* 104*/ + 0, 3, 5, 6, 1, 2, 4, 7, /* 105*/ + 1, 3, 5, 6, 0, 2, 4, 7, /* 106*/ + 0, 1, 3, 5, 6, 2, 4, 7, /* 107*/ + 2, 3, 5, 6, 0, 1, 4, 7, /* 108*/ + 0, 2, 3, 5, 6, 1, 4, 7, /* 109*/ + 1, 2, 3, 5, 6, 0, 4, 7, /* 110*/ + 0, 1, 2, 3, 5, 6, 4, 7, /* 111*/ + 4, 5, 6, 0, 1, 2, 3, 7, /* 112*/ + 0, 4, 5, 6, 1, 2, 3, 7, /* 113*/ + 1, 4, 5, 6, 0, 2, 3, 7, /* 114*/ + 0, 1, 4, 5, 6, 2, 3, 7, /* 115*/ + 2, 4, 5, 6, 0, 1, 3, 7, /* 116*/ + 0, 2, 4, 5, 6, 1, 3, 7, /* 117*/ + 1, 2, 4, 5, 6, 0, 3, 7, /* 118*/ + 0, 1, 2, 4, 5, 6, 3, 7, /* 119*/ + 3, 4, 5, 6, 0, 1, 2, 7, /* 120*/ + 0, 3, 4, 5, 6, 1, 2, 7, /* 121*/ + 1, 3, 4, 5, 6, 0, 2, 7, /* 122*/ + 0, 1, 3, 4, 5, 6, 2, 7, /* 123*/ + 2, 3, 4, 5, 6, 0, 1, 7, /* 124*/ + 0, 2, 3, 4, 5, 6, 1, 7, /* 125*/ + 1, 2, 3, 4, 5, 6, 0, 7, /* 126*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 127*/ + 7, 0, 1, 2, 3, 4, 5, 6, /* 128*/ + 0, 7, 1, 2, 3, 4, 5, 6, /* 129*/ + 1, 7, 0, 2, 3, 4, 5, 6, /* 130*/ + 0, 1, 7, 2, 3, 4, 5, 6, /* 131*/ + 2, 7, 0, 1, 3, 4, 5, 6, /* 132*/ + 0, 2, 7, 1, 3, 4, 5, 6, /* 133*/ + 1, 2, 7, 0, 3, 4, 5, 6, /* 134*/ + 0, 1, 2, 7, 3, 4, 5, 6, /* 135*/ + 3, 7, 0, 1, 2, 4, 5, 6, /* 136*/ + 0, 3, 7, 1, 2, 4, 5, 6, /* 137*/ + 1, 3, 7, 0, 2, 4, 5, 6, /* 138*/ + 0, 1, 3, 7, 2, 4, 5, 6, /* 139*/ + 2, 3, 7, 0, 1, 4, 5, 6, /* 140*/ + 0, 2, 3, 7, 1, 4, 5, 6, /* 141*/ + 1, 2, 3, 7, 0, 4, 5, 6, /* 142*/ + 0, 1, 2, 3, 7, 4, 5, 6, /* 143*/ + 4, 7, 0, 1, 2, 3, 5, 6, /* 144*/ + 0, 4, 7, 1, 2, 3, 5, 6, /* 145*/ + 1, 4, 7, 0, 2, 3, 5, 6, /* 146*/ + 0, 1, 4, 7, 2, 3, 5, 6, /* 147*/ + 2, 4, 7, 0, 1, 3, 5, 6, /* 148*/ + 0, 2, 4, 7, 1, 3, 5, 6, /* 149*/ + 1, 2, 4, 7, 0, 3, 5, 6, /* 150*/ + 0, 1, 2, 4, 7, 3, 5, 6, /* 151*/ + 3, 4, 7, 0, 1, 2, 5, 6, /* 152*/ + 0, 3, 4, 7, 1, 2, 5, 6, /* 153*/ + 1, 3, 4, 7, 0, 2, 5, 6, /* 154*/ + 0, 1, 3, 4, 7, 2, 5, 6, /* 155*/ + 2, 3, 4, 7, 0, 1, 5, 6, /* 156*/ + 0, 2, 3, 4, 7, 1, 5, 6, /* 157*/ + 1, 2, 3, 4, 7, 0, 5, 6, /* 158*/ + 0, 1, 2, 3, 4, 7, 5, 6, /* 159*/ + 5, 7, 0, 1, 2, 3, 4, 6, /* 160*/ + 0, 5, 7, 1, 2, 3, 4, 6, /* 161*/ + 1, 5, 7, 0, 2, 3, 4, 6, /* 162*/ + 0, 1, 5, 7, 2, 3, 4, 6, /* 163*/ + 2, 5, 7, 0, 1, 3, 4, 6, /* 164*/ + 0, 2, 5, 7, 1, 3, 4, 6, /* 165*/ + 1, 2, 5, 7, 0, 3, 4, 6, /* 166*/ + 0, 1, 2, 5, 7, 3, 4, 6, /* 167*/ + 3, 5, 7, 0, 1, 2, 4, 6, /* 168*/ + 0, 3, 5, 7, 1, 2, 4, 6, /* 169*/ + 1, 3, 5, 7, 0, 2, 4, 6, /* 170*/ + 0, 1, 3, 5, 7, 2, 4, 6, /* 171*/ + 2, 3, 5, 7, 0, 1, 4, 6, /* 172*/ + 0, 2, 3, 5, 7, 1, 4, 6, /* 173*/ + 1, 2, 3, 5, 7, 0, 4, 6, /* 174*/ + 0, 1, 2, 3, 5, 7, 4, 6, /* 175*/ + 4, 5, 7, 0, 1, 2, 3, 6, /* 176*/ + 0, 4, 5, 7, 1, 2, 3, 6, /* 177*/ + 1, 4, 5, 7, 0, 2, 3, 6, /* 178*/ + 0, 1, 4, 5, 7, 2, 3, 6, /* 179*/ + 2, 4, 5, 7, 0, 1, 3, 6, /* 180*/ + 0, 2, 4, 5, 7, 1, 3, 6, /* 181*/ + 1, 2, 4, 5, 7, 0, 3, 6, /* 182*/ + 0, 1, 2, 4, 5, 7, 3, 6, /* 183*/ + 3, 4, 5, 7, 0, 1, 2, 6, /* 184*/ + 0, 3, 4, 5, 7, 1, 2, 6, /* 185*/ + 1, 3, 4, 5, 7, 0, 2, 6, /* 186*/ + 0, 1, 3, 4, 5, 7, 2, 6, /* 187*/ + 2, 3, 4, 5, 7, 0, 1, 6, /* 188*/ + 0, 2, 3, 4, 5, 7, 1, 6, /* 189*/ + 1, 2, 3, 4, 5, 7, 0, 6, /* 190*/ + 0, 1, 2, 3, 4, 5, 7, 6, /* 191*/ + 6, 7, 0, 1, 2, 3, 4, 5, /* 192*/ + 0, 6, 7, 1, 2, 3, 4, 5, /* 193*/ + 1, 6, 7, 0, 2, 3, 4, 5, /* 194*/ + 0, 1, 6, 7, 2, 3, 4, 5, /* 195*/ + 2, 6, 7, 0, 1, 3, 4, 5, /* 196*/ + 0, 2, 6, 7, 1, 3, 4, 5, /* 197*/ + 1, 2, 6, 7, 0, 3, 4, 5, /* 198*/ + 0, 1, 2, 6, 7, 3, 4, 5, /* 199*/ + 3, 6, 7, 0, 1, 2, 4, 5, /* 200*/ + 0, 3, 6, 7, 1, 2, 4, 5, /* 201*/ + 1, 3, 6, 7, 0, 2, 4, 5, /* 202*/ + 0, 1, 3, 6, 7, 2, 4, 5, /* 203*/ + 2, 3, 6, 7, 0, 1, 4, 5, /* 204*/ + 0, 2, 3, 6, 7, 1, 4, 5, /* 205*/ + 1, 2, 3, 6, 7, 0, 4, 5, /* 206*/ + 0, 1, 2, 3, 6, 7, 4, 5, /* 207*/ + 4, 6, 7, 0, 1, 2, 3, 5, /* 208*/ + 0, 4, 6, 7, 1, 2, 3, 5, /* 209*/ + 1, 4, 6, 7, 0, 2, 3, 5, /* 210*/ + 0, 1, 4, 6, 7, 2, 3, 5, /* 211*/ + 2, 4, 6, 7, 0, 1, 3, 5, /* 212*/ + 0, 2, 4, 6, 7, 1, 3, 5, /* 213*/ + 1, 2, 4, 6, 7, 0, 3, 5, /* 214*/ + 0, 1, 2, 4, 6, 7, 3, 5, /* 215*/ + 3, 4, 6, 7, 0, 1, 2, 5, /* 216*/ + 0, 3, 4, 6, 7, 1, 2, 5, /* 217*/ + 1, 3, 4, 6, 7, 0, 2, 5, /* 218*/ + 0, 1, 3, 4, 6, 7, 2, 5, /* 219*/ + 2, 3, 4, 6, 7, 0, 1, 5, /* 220*/ + 0, 2, 3, 4, 6, 7, 1, 5, /* 221*/ + 1, 2, 3, 4, 6, 7, 0, 5, /* 222*/ + 0, 1, 2, 3, 4, 6, 7, 5, /* 223*/ + 5, 6, 7, 0, 1, 2, 3, 4, /* 224*/ + 0, 5, 6, 7, 1, 2, 3, 4, /* 225*/ + 1, 5, 6, 7, 0, 2, 3, 4, /* 226*/ + 0, 1, 5, 6, 7, 2, 3, 4, /* 227*/ + 2, 5, 6, 7, 0, 1, 3, 4, /* 228*/ + 0, 2, 5, 6, 7, 1, 3, 4, /* 229*/ + 1, 2, 5, 6, 7, 0, 3, 4, /* 230*/ + 0, 1, 2, 5, 6, 7, 3, 4, /* 231*/ + 3, 5, 6, 7, 0, 1, 2, 4, /* 232*/ + 0, 3, 5, 6, 7, 1, 2, 4, /* 233*/ + 1, 3, 5, 6, 7, 0, 2, 4, /* 234*/ + 0, 1, 3, 5, 6, 7, 2, 4, /* 235*/ + 2, 3, 5, 6, 7, 0, 1, 4, /* 236*/ + 0, 2, 3, 5, 6, 7, 1, 4, /* 237*/ + 1, 2, 3, 5, 6, 7, 0, 4, /* 238*/ + 0, 1, 2, 3, 5, 6, 7, 4, /* 239*/ + 4, 5, 6, 7, 0, 1, 2, 3, /* 240*/ + 0, 4, 5, 6, 7, 1, 2, 3, /* 241*/ + 1, 4, 5, 6, 7, 0, 2, 3, /* 242*/ + 0, 1, 4, 5, 6, 7, 2, 3, /* 243*/ + 2, 4, 5, 6, 7, 0, 1, 3, /* 244*/ + 0, 2, 4, 5, 6, 7, 1, 3, /* 245*/ + 1, 2, 4, 5, 6, 7, 0, 3, /* 246*/ + 0, 1, 2, 4, 5, 6, 7, 3, /* 247*/ + 3, 4, 5, 6, 7, 0, 1, 2, /* 248*/ + 0, 3, 4, 5, 6, 7, 1, 2, /* 249*/ + 1, 3, 4, 5, 6, 7, 0, 2, /* 250*/ + 0, 1, 3, 4, 5, 6, 7, 2, /* 251*/ + 2, 3, 4, 5, 6, 7, 0, 1, /* 252*/ + 0, 2, 3, 4, 5, 6, 7, 1, /* 253*/ + 1, 2, 3, 4, 5, 6, 7, 0, /* 254*/ + 0, 1, 2, 3, 4, 5, 6, 7, /* 255*/ +}; + +static inline __m256i _mm256_cmplt_epi32(__m256i a, __m256i b) { + return _mm256_cmpgt_epi32(b, a); +} + + +static void avx_natenodutch_partition_epi32(int32_t *array, const int32_t pivot, + int &left, int &right) { + const __m256i P = _mm256_set1_epi32(pivot); + const int valuespervector = sizeof(__m256i) / sizeof(int32_t); + if (right - left + 1 < + 4 * valuespervector) { // not enough space for nate's algo to make sense, falling back + scalar_partition(array, pivot, left, right); + return; + } + int readleft = left + valuespervector; + int readright = right - valuespervector; + int32_t buffer[2 * valuespervector]; // tmp buffer + memcpy(buffer, array + left, valuespervector * sizeof(int32_t)); + memcpy(buffer + valuespervector, array + right - valuespervector + 1, + valuespervector * sizeof(int32_t)); + while (readright - readleft >= valuespervector) { + __m256i *nextVectorPtr; + if ((readleft - left) > (right - readright)) { + nextVectorPtr = (__m256i *)(array + readright - valuespervector + 1); + readright -= valuespervector; + } else { + nextVectorPtr = (__m256i *)(array + readleft); + readleft += valuespervector; + } + __m256i currentVector = _mm256_loadu_si256(nextVectorPtr); + /****** + * we need two comparisons if we are to avoid the Dutch national flag + * problem, + * Reference: https://en.wikipedia.org/wiki/Dutch_national_flag_problem + */ + int greaterthanpivot = + _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(currentVector, P)); + int lesserthanpivot = + _mm256_movemask_ps((__m256)_mm256_cmplt_epi32(currentVector, P)); + __m256i greaterthanpivot_permvector = _mm256_load_si256( + (__m256i *)(reverseshufflemask + 8 * greaterthanpivot)); + __m256i lesserthanpivot_permvector = + _mm256_load_si256((__m256i *)(shufflemask + 8 * lesserthanpivot)); + + int count_greaterthanpivot = _mm_popcnt_u32(greaterthanpivot); + int count_lesserthanpivot = _mm_popcnt_u32(lesserthanpivot); + __m256i greaterthanpivot_vector = + _mm256_permutevar8x32_epi32(currentVector, greaterthanpivot_permvector); + __m256i lesserthanpivot_vector = + _mm256_permutevar8x32_epi32(currentVector, lesserthanpivot_permvector); + _mm256_storeu_si256((__m256i *)(array + left), lesserthanpivot_vector); + + _mm256_storeu_si256((__m256i *)(array + right - valuespervector + 1), + greaterthanpivot_vector); + + left += count_lesserthanpivot; + right -= count_greaterthanpivot; + } + + int howmanyleft = readleft - left; + if (howmanyleft > 2 * valuespervector) { + howmanyleft = 2 * valuespervector; + for (int k = howmanyleft; k < readleft - left; ++k) + array[left + k] = pivot; + } + memcpy(array + left, buffer, howmanyleft * sizeof(int32_t)); + + int howmanyright = right - readright; + if (howmanyright + howmanyleft > 2 * valuespervector) { + howmanyright = 2 * valuespervector - howmanyleft; + for (int k = howmanyright; k < right - readright; ++k) + array[readright + 1 + k] = pivot; + } + + memcpy(array + readright + 1, buffer + howmanyleft, + howmanyright * sizeof(int32_t)); + scalar_partition(array, pivot, left, right); +} + + +// it is really signed int sorting, but the API seems to expect uint32_t... +void avx_natenodutch_quicksort(uint32_t *array, int left, int right) { + + int i = left; + int j = right; + const int32_t pivot = array[(i + j) / 2]; + + avx_natenodutch_partition_epi32((int32_t *) array, pivot, i, j); + + if (left < j) { + avx_natenodutch_quicksort(array, left, j); + } + + if (i < right) { + avx_natenodutch_quicksort(array, i, right); + } +} diff --git a/simd-sort/avx2-partition.cpp b/simd-sort/avx2-partition.cpp new file mode 100644 index 0000000..08f5cd8 --- /dev/null +++ b/simd-sort/avx2-partition.cpp @@ -0,0 +1,197 @@ +#include +#include + +namespace qs { + + namespace avx2 { + + __m256i FORCE_INLINE bitmask_to_bytemask_epi32(uint8_t bm) { + + const __m256i mask = _mm256_set1_epi32(bm); + const __m256i bits = _mm256_setr_epi32(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80); + const __m256i tmp = _mm256_and_si256(mask, bits); + + return _mm256_cmpeq_epi32(tmp, bits); + } + + + void FORCE_INLINE align_masks(uint8_t& a, uint8_t& b, uint8_t& rem_a, uint8_t& rem_b, __m256i& shuffle_a, __m256i& shuffle_b) { + + assert(a != 0); + assert(b != 0); + + uint8_t tmpA = a; + uint8_t tmpB = b; + + uint32_t __attribute__((__aligned__(32))) tmpshufa[8]; + uint32_t __attribute__((__aligned__(32))) tmpshufb[8]; + + while (tmpA != 0 && tmpB != 0) { + int idx_a = __builtin_ctz(tmpA); + int idx_b = __builtin_ctz(tmpB); + + tmpA = tmpA & (tmpA - 1); + tmpB = tmpB & (tmpB - 1); + + tmpshufb[idx_a] = idx_b; + tmpshufa[idx_b] = idx_a; + } + + a = a ^ tmpA; + b = b ^ tmpB; + + assert(a != 0); + assert(b != 0); + assert(_mm_popcnt_u64(a) == _mm_popcnt_u64(b)); + + rem_a = tmpA; + rem_b = tmpB; + + shuffle_a = _mm256_load_si256((__m256i*)tmpshufa); + shuffle_b = _mm256_load_si256((__m256i*)tmpshufb); + } + + + __m256i FORCE_INLINE merge(const __m256i mask, const __m256i a, const __m256i b) { + + return _mm256_or_si256( + _mm256_and_si256(mask, a), + _mm256_andnot_si256(mask, b) + ); + } + + + void FORCE_INLINE swap_epi32( + __m256i& a, __m256i& b, + uint8_t mask_a, const __m256i shuffle_a, + uint8_t mask_b, const __m256i shuffle_b) { + + const __m256i to_swap_b = _mm256_permutevar8x32_epi32(a, shuffle_a); + const __m256i to_swap_a = _mm256_permutevar8x32_epi32(b, shuffle_b); + const __m256i ma = bitmask_to_bytemask_epi32(mask_a); + const __m256i mb = bitmask_to_bytemask_epi32(mask_b); + + a = merge(ma, to_swap_a, a); + b = merge(mb, to_swap_b, b); + } + + + +#define _mm256_iszero(vec) (_mm256_testz_si256(vec, vec) != 0) + + void FORCE_INLINE partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) { + + const int N = 8; // the number of items in a register (256/32) + + __m256i L; + __m256i R; + uint8_t maskL = 0; + uint8_t maskR = 0; + + const __m256i pivot = _mm256_set1_epi32(pv); + + int origL = left; + int origR = right; + + while (true) { + + if (maskL == 0) { + while (true) { + if (right - (left + N) + 1 < 2*N) { + goto end; + } + + L = _mm256_loadu_si256((__m256i*)(array + left)); + const __m256i bytemask = _mm256_cmpgt_epi32(pivot, L); + + if (_mm256_testc_ps((__m256)bytemask, (__m256)_mm256_set1_epi32(-1))) { + left += N; + } else { + maskL = ~_mm256_movemask_ps((__m256)bytemask); + break; + } + } + + } + + if (maskR == 0) { + while (true) { + if ((right - N) - left + 1 < 2*N) { + goto end; + } + + R = _mm256_loadu_si256((__m256i*)(array + right - N + 1)); + const __m256i bytemask = _mm256_cmpgt_epi32(pivot, R); + if (_mm256_iszero(bytemask)) { + right -= N; + } else { + maskR = _mm256_movemask_ps((__m256)bytemask); + break; + } + } + + } + + assert(left <= right); + assert(maskL != 0); + assert(maskR != 0); + + uint8_t mL; + uint8_t mR; + __m256i shuffleL; + __m256i shuffleR; + + align_masks(maskL, maskR, mL, mR, shuffleL, shuffleR); + swap_epi32(L, R, maskL, shuffleL, maskR, shuffleR); + + maskL = mL; + maskR = mR; + + if (maskL == 0) { + _mm256_storeu_si256((__m256i*)(array + left), L); + left += N; + } + + if (maskR == 0) { + _mm256_storeu_si256((__m256i*)(array + right - N + 1), R); + right -= N; + } + + } // while + + end: + + assert(!(maskL != 0 && maskR != 0)); + + if (maskL != 0) { + _mm256_storeu_si256((__m256i*)(array + left), L); + } else if (maskR != 0) { + _mm256_storeu_si256((__m256i*)(array + right - N + 1), R); + } + + if (left < right) { + int less = 0; + int greater = 0; + const int all = right - left + 1; + + for (int i=left; i <= right; i++) { + less += int(array[i] < pv); + greater += int(array[i] > pv); + } + + if (all == less) { + // all elements in range [left, right] less than pivot + scalar_partition_epi32(array, pv, origL, left); + } else if (all == greater) { + // all elements in range [left, right] greater than pivot + scalar_partition_epi32(array, pv, left, origR); + } else { + scalar_partition_epi32(array, pv, left, right); + } + } + } + + } // namespace avx2 + +} // namespace qs + diff --git a/simd-sort/avx2-quicksort.cpp b/simd-sort/avx2-quicksort.cpp new file mode 100644 index 0000000..3fc3859 --- /dev/null +++ b/simd-sort/avx2-quicksort.cpp @@ -0,0 +1,32 @@ +#include "avx2-partition.cpp" + +namespace qs { + + namespace avx2 { + + void quicksort(uint32_t* array, int left, int right) { + + int i = left; + int j = right; + + const uint32_t pivot = array[(i + j)/2]; + const int AVX2_REGISTER_SIZE = 8; // in 32-bit words + + if (j - i >= 2 * AVX2_REGISTER_SIZE) { + qs::avx2::partition_epi32(array, pivot, i, j); + } else { + scalar_partition_epi32(array, pivot, i, j); + } + + if (left < j) { + quicksort(array, left, j); + } + + if (i < right) { + quicksort(array, i, right); + } + } + + } // namespace avx2 + +} // namespace qs diff --git a/simd-sort/avx512-auxbuffer-partition.cpp b/simd-sort/avx512-auxbuffer-partition.cpp new file mode 100644 index 0000000..29a1391 --- /dev/null +++ b/simd-sort/avx512-auxbuffer-partition.cpp @@ -0,0 +1,90 @@ +namespace qs { + + namespace avx512 { + + void memset_epi32(uint32_t* array, uint32_t w, size_t n) { + + const int N = 16; + const __m512i word = _mm512_set1_epi32(w); + + for (size_t i=0; i < n/N; i++) { + _mm512_storeu_si512(array + i*N, word); + } + + for (size_t i=n/N * N; i < n; i++) { + array[i] = w; + } + } + + void memcpy_epi32(uint32_t* dst, uint32_t* src, size_t n) { + + const int N = 16; + + for (size_t i=0; i < n/N; i++) { + _mm512_storeu_si512(dst + i*N, _mm512_loadu_si512(src + i*N)); + } + + for (size_t i=n/N * N; i < n; i++) { + dst[i] = src[i]; + } + } + + // parition array[0..n-1] + uint32_t FORCE_INLINE partition_auxbuffer_epi32(uint32_t* array, size_t n, uint32_t pv) { + + const int N = 16; + const int AUX_COUNT = 1024; // 4kB + + static uint32_t gt_buf[AUX_COUNT + N]; + + size_t lt_count = 0; + size_t gt_count = 0; + + const __m512i pivot = _mm512_set1_epi32(pv); + + // 1. copy greater and less values into separate buffers + for (size_t i=0; i < n / N; i++) { + + const __m512i v = _mm512_loadu_si512(array + i*N); + + const __mmask16 lt = _mm512_cmplt_epi32_mask(v, pivot); + const __mmask16 gt = _mm512_cmpgt_epi32_mask(v, pivot); + + const __m512i less = _mm512_maskz_compress_epi32(lt, v); + const __m512i greater = _mm512_maskz_compress_epi32(gt, v); + + _mm512_storeu_si512(array + lt_count, less); + _mm512_storeu_si512(gt_buf + gt_count, greater); + + lt_count += _mm_popcnt_u32(lt); + gt_count += _mm_popcnt_u32(gt); + } + + for (size_t i=0; i < n % N; i++) { + + const uint32_t v = array[(n/N) * N + i]; + + if (v < pv) { + array[lt_count++] = v; + } else if (v > pv) { + gt_buf[gt_count++] = v; + } + } + + const size_t eq_count = n - (lt_count + gt_count); + + // 2. replace array with partially ordered data + + // 2.a. pivots + memset_epi32(array + lt_count, pv, eq_count); + + // 2.b. all values greater than pivot + memcpy_epi32(array + lt_count + eq_count, gt_buf, gt_count); + + // 3. index before the first pivot + return lt_count; + } + + } // namespace avx512 + +} // namespace qa diff --git a/simd-sort/avx512-bmi2-partition.cpp b/simd-sort/avx512-bmi2-partition.cpp new file mode 100644 index 0000000..c2dac5d --- /dev/null +++ b/simd-sort/avx512-bmi2-partition.cpp @@ -0,0 +1,105 @@ +namespace qs { + + namespace avx512 { + + void FORCE_INLINE bmi2_align_masks(__mmask16& a, __mmask16& b, __mmask16& rem_a, __mmask16& rem_b) { + + uint32_t tmpA = _pext_u32(0xffffffff, a); + uint32_t tmpB = _pext_u32(0xffffffff, b); + + uint32_t tmp = tmpA & tmpB; + + __mmask16 new_a = _pdep_u32(tmp, a); + __mmask16 new_b = _pdep_u32(tmp, a); + + rem_a = a ^ new_a; + rem_b = b ^ new_b; + + a = new_a; + b = new_b; + + assert(a != 0); + assert(b != 0); + assert(_mm_popcnt_u64(a) == _mm_popcnt_u64(b)); + } + + + void FORCE_INLINE bmi2_partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) { + + const int N = 16; + + __m512i L; + __m512i R; + __mmask16 maskL = 0; + __mmask16 maskR = 0; + const __m512i pivot = _mm512_set1_epi32(pv); + + while (right - left + 1 >= 2*N) { + + while (maskL == 0) { + if (right - (left + N) + 1 < 2*N) { + goto end; + } + + L = _mm512_loadu_si512(array + left); + maskL = _mm512_cmpge_epi32_mask(L, pivot); + if (maskL == 0) { + left += N; + } + } + + while (maskR == 0) { + if ((right - N) - left + 1 < 2*N) { + goto end; + } + + R = _mm512_loadu_si512(array + right - N + 1); + maskR = _mm512_cmple_epi32_mask(R, pivot); + if (maskR == 0) { + right -= N; + } + } + + assert(left <= right); + assert(maskL != 0); + assert(maskR != 0); + + __mmask16 mL; + __mmask16 mR; + + bmi2_align_masks(maskL, maskR, mL, mR); + swap_epi32(L, R, maskL, maskR); + + maskL = mL; + maskR = mR; + + if (maskL == 0) { + _mm512_storeu_si512(array + left, L); + left += N; + } + + if (maskR == 0) { + _mm512_storeu_si512(array + right - N + 1, R); + right -= N; + } + + } // while + + end: + + if (maskL != 0) { + _mm512_storeu_si512(array + left, L); + } + + if (maskR != 0) { + _mm512_storeu_si512(array + right - N + 1, R); + } + + if (left < right) { + partition_epi32(array, pv, left, right); + } + } + + } // namespace avx512 + +} // namespace qs diff --git a/simd-sort/avx512-partition-register.cpp b/simd-sort/avx512-partition-register.cpp new file mode 100644 index 0000000..a52100d --- /dev/null +++ b/simd-sort/avx512-partition-register.cpp @@ -0,0 +1,58 @@ +namespace qs { + + namespace avx512 { + + __mmask16 FORCE_INLINE get_range_mask(uint32_t position, uint32_t length) { + + return (uint32_t(1) << (position + length)) - (uint32_t(1) << position); + } + + /* + r stores 1 to 16 elements to partition, mask select leading element + */ + void partition_register(__m512i& r, const __m512i pivot, __mmask16 mask, int& left, int& right) { + + // example: + + // r = [ 1, 2, 3, 100, 200, 300, 4, 5, 50, 900, -1, -2, -3, -4, -5, -6] + // mask = 0x03ff -- first 10 elements are subject to parition + // pivot = packed_dword(50) + + const __mmask16 less_mask = _mm512_mask_cmplt_epi32_mask(mask, r, pivot); + const __mmask16 equal_mask = _mm512_mask_cmpeq_epi32_mask(mask, r, pivot); + + // less = [1, 2, 3, 4, 5, ... rest are **pivots**] + const __m512i less_equal = _mm512_mask_compress_epi32(pivot, less_mask, r); + + // greater = [100, 200, 300, 900, ... rest are zeros] + const __m512i greater = _mm512_maskz_compress_epi32(~(less_mask | equal_mask) & mask, r); + + // less_cnt = 5 + // less_equal_cnt = 6 + const int less_cnt = _mm_popcnt_u32(less_mask); + const int less_equal_cnt = _mm_popcnt_u32(less_mask | equal_mask); + + const __mmask16 store_less_equal = get_range_mask(0, less_equal_cnt); + const __mmask16 store_greater = ~store_less_equal & mask; + + // merge less or eqaul with input + // + // r = [ 1, 2, 3, 4, 5, 50, 4, 5, 50, 900, -1, -2, -3, -4, -5, -6] + // ^ ^ ^ ^ ^ ^^ + // merged + r = _mm512_mask_mov_epi32(r, store_less_equal, less_equal); + + // merge greater than pivot + // + // r = [ 1, 2, 3, 4, 5, 50, 100, 200, 300, 900, -1, -2, -3, -4, -5, -6] + // ^^^ ^^^ ^^^ ^^^ + // merged + r = _mm512_mask_expand_epi32(r, store_greater, greater); + + right = left + less_cnt; + left = left + less_equal_cnt; + } + + } // namespace avx512 + +} // namespace qs diff --git a/simd-sort/avx512-partition.cpp b/simd-sort/avx512-partition.cpp new file mode 100644 index 0000000..31625ff --- /dev/null +++ b/simd-sort/avx512-partition.cpp @@ -0,0 +1,129 @@ +//#define USE_AVX512_PIVOT + +namespace qs { + + namespace avx512 { + + void FORCE_INLINE align_masks(__mmask16& a, __mmask16& b, __mmask16& rem_a, __mmask16& rem_b) { + + assert(a != 0); + assert(b != 0); + + uint16_t tmpA = a; + uint16_t tmpB = b; + + while (tmpA != 0 && tmpB != 0) { + tmpA = tmpA & (tmpA - 1); + tmpB = tmpB & (tmpB - 1); + } + + a = a ^ tmpA; + b = b ^ tmpB; + + assert(a != 0); + assert(b != 0); + assert(_mm_popcnt_u64(a) == _mm_popcnt_u64(b)); + + rem_a = tmpA; + rem_b = tmpB; + } + + + void FORCE_INLINE partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) { + + const int N = 16; + + __m512i L; + __m512i R; + __mmask16 maskL = 0; + __mmask16 maskR = 0; + const __m512i pivot = _mm512_set1_epi32(pv); + + int count = (right - left + 1)/N; + + while (true) { + + while (maskL == 0) { + if (count <= 1) { + goto end; + } + + L = _mm512_loadu_si512(array + left); + maskL = _mm512_cmpge_epi32_mask(L, pivot); + if (maskL == 0) { + left += N; + count -= 1; + } + } + + while (maskR == 0) { + if (count <= 1) { + goto end; + } + + R = _mm512_loadu_si512(array + right - N + 1); + maskR = _mm512_cmple_epi32_mask(R, pivot); + if (maskR == 0) { + right -= N; + count -= 1; + } + } + + assert(left <= right); + assert(maskL != 0); + assert(maskR != 0); + + __mmask16 mL; + __mmask16 mR; + + align_masks(maskL, maskR, mL, mR); + swap_epi32(L, R, maskL, maskR); + + maskL = mL; + maskR = mR; + + if (maskL == 0) { + _mm512_storeu_si512(array + left, L); + left += N; + count -= 1; + } + + if (maskR == 0) { + _mm512_storeu_si512(array + right - N + 1, R); + right -= N; + count -= 1; + } + + } // while + + end: + + assert(!(maskL != 0 && maskR != 0)); + + if (maskL != 0) { + _mm512_storeu_si512(array + left, L); + } else if (maskR != 0) { + _mm512_storeu_si512(array + right - N + 1, R); + } + + if (left < right) { +#if USE_AVX512_PIVOT + scalar_partition_epi32(array, pv, left, right); +#else + const int size = right - left + 1; + if (size > N) { + scalar_partition_epi32(array, pv, left, right); + } else { + const int k = left; + __mmask16 mask = (uint32_t(1) << size) - 1; + __m512i v = _mm512_loadu_si512(array + k); + partition_register(v, pivot, mask, left, right); + _mm512_storeu_si512(array + k, v); + } +#endif + } + } + + } // namespace avx512 + +} // namespace qa diff --git a/simd-sort/avx512-popcnt-partition.cpp b/simd-sort/avx512-popcnt-partition.cpp new file mode 100644 index 0000000..4319f0c --- /dev/null +++ b/simd-sort/avx512-popcnt-partition.cpp @@ -0,0 +1,119 @@ +namespace qs { + + namespace avx512 { + + void FORCE_INLINE popcnt_partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) { + + const int N = 16; + + __m512i L; + __m512i R; + __mmask16 maskL = 0; + __mmask16 maskR = 0; + int popcntL; + int popcntR; + + const __m512i pivot = _mm512_set1_epi32(pv); + + while (true) { + + while (maskL == 0) { + if (right - (left + N) + 1 < N) { + goto end; + } + + L = _mm512_loadu_si512(array + left); + maskL = _mm512_cmpge_epi32_mask(L, pivot); + if (maskL == 0) { + left += N; + } + + popcntL = _mm_popcnt_u32(maskL); + } + + while (maskR == 0) { + if ((right - N) - left + 1 < N) { + goto end; + } + + R = _mm512_loadu_si512(array + right - N + 1); + maskR = _mm512_cmple_epi32_mask(R, pivot); + if (maskR == 0) { + right -= N; + } + + popcntR = _mm_popcnt_u32(maskR); + } + + assert(left <= right); + assert(maskL != 0); + assert(maskR != 0); + + if (popcntL == popcntR) { + // fast path + swap_epi32(L, R, maskL, maskR); + maskL = maskR = 0; + + _mm512_storeu_si512(array + left, L); + left += N; + + _mm512_storeu_si512(array + right - N + 1, R); + right -= N; + } else { + if (popcntL < popcntR) { + int n = popcntR - popcntL; + int k = n; + + __mmask16 tmp = maskR; + while (k--) { + maskR = maskR & (maskR - 1); + } + + swap_epi32(L, R, maskL, maskR); + maskR = tmp ^ maskR; + + popcntR = n; + + _mm512_storeu_si512(array + left, L); + left += N; + maskL = 0; + } else { + int n = popcntL - popcntR; + int k = n; + + __mmask16 tmp = maskL; + while (k--) { + maskL = maskL & (maskL - 1); + } + + swap_epi32(L, R, maskL, maskR); + maskL = tmp ^ maskL; + + popcntL = n; + + _mm512_storeu_si512(array + right - N + 1, R); + right -= N; + maskR = 0; + } + } + + } // while + + end: + + assert(!(maskL != 0 && maskR != 0)); + + if (maskL != 0) { + _mm512_storeu_si512(array + left, L); + } else if (maskR != 0) { + _mm512_storeu_si512(array + right - N + 1, R); + } + + if (left < right) { + scalar_partition_epi32(array, pv, left, right); + } + } + + } // namespace avx512 + +} // namespace qs diff --git a/simd-sort/avx512-quicksort.cpp b/simd-sort/avx512-quicksort.cpp new file mode 100644 index 0000000..55a0e80 --- /dev/null +++ b/simd-sort/avx512-quicksort.cpp @@ -0,0 +1,211 @@ +#include "avx512-swap.cpp" +#include "avx512-partition-register.cpp" +#include "avx512-partition.cpp" +#include "avx512-popcnt-partition.cpp" +#include "avx512-bmi2-partition.cpp" +#include "avx512-auxbuffer-partition.cpp" +#include "avx512-sort-register.cpp" + + +namespace qs { + + namespace avx512 { + + + void dump(const char* name, const __m512i v) { + uint32_t tmp[16]; + + printf("%-10s = [", name); + _mm512_storeu_si512((__m512i*)tmp, v); + for (int i=0; i < 16; i++) { + if (i > 0) { + putchar(' '); + } + + printf("%08x", tmp[i]); + } + + printf("]\n"); + } + + void quicksort(uint32_t* array, int left, int right) { + + int i = left; + int j = right; + + const int size = j - i + 1; + if (size <= 16) { + const __m512i in = _mm512_loadu_si512(array + i); + __m512i result; + + switch (size) { + case 16: result = avx512_sort16_epi32(in); break; + case 15: result = avx512_sort15_epi32(in); break; + case 14: result = avx512_sort14_epi32(in); break; + case 13: result = avx512_sort13_epi32(in); break; + case 12: result = avx512_sort12_epi32(in); break; + case 11: result = avx512_sort11_epi32(in); break; + case 10: result = avx512_sort10_epi32(in); break; + case 9: result = avx512_sort9_epi32(in); break; + case 8: result = avx512_sort8_epi32(in); break; + case 7: result = avx512_sort7_epi32(in); break; + case 6: result = avx512_sort6_epi32(in); break; + case 5: result = avx512_sort5_epi32(in); break; + case 4: result = avx512_sort4_epi32(in); break; + case 3: result = avx512_sort3_epi32(in); break; + case 2: { + if (array[i] > array[j]) { + uint32_t t = array[i]; + array[i] = array[j]; + array[j] = t; + } + return; + } + + case 1: + case 0: + return; + + default: + assert(false); + break; + } + + _mm512_storeu_si512(array + i, result); + return; + } + + const uint32_t pivot = array[left]; + const int AVX512_REGISTER_SIZE = 16; // in 32-bit words + if (size >= 2 * AVX512_REGISTER_SIZE) { + ::qs::avx512::partition_epi32(array, pivot, i, j); + } else { + scalar_partition_epi32(array, pivot, i, j); + } + + if (left < j) { + quicksort(array, left, j); + } + + if (i < right) { + quicksort(array, i, right); + } + } + + + void popcnt_quicksort(uint32_t* array, int left, int right) { + + int i = left; + int j = right; + const int size = j - i + 1; + if (size <= 16) { + const __m512i in = _mm512_loadu_si512(array + i); + __m512i result; + + switch (size) { + case 16: result = avx512_sort16_epi32(in); break; + case 15: result = avx512_sort15_epi32(in); break; + case 14: result = avx512_sort14_epi32(in); break; + case 13: result = avx512_sort13_epi32(in); break; + case 12: result = avx512_sort12_epi32(in); break; + case 11: result = avx512_sort11_epi32(in); break; + case 10: result = avx512_sort10_epi32(in); break; + case 9: result = avx512_sort9_epi32(in); break; + case 8: result = avx512_sort8_epi32(in); break; + case 7: result = avx512_sort7_epi32(in); break; + case 6: result = avx512_sort6_epi32(in); break; + case 5: result = avx512_sort5_epi32(in); break; + case 4: result = avx512_sort4_epi32(in); break; + case 3: result = avx512_sort3_epi32(in); break; + case 2: { + if (array[i] > array[j]) { + uint32_t t = array[i]; + array[i] = array[j]; + array[j] = t; + } + return; + } + + case 1: + case 0: + return; + + default: + assert(false); + break; + } + + _mm512_storeu_si512(array + i, result); + return; + } + + const uint32_t pivot = array[(i + j)/2]; + const int AVX512_REGISTER_SIZE = 16; // in 32-bit words + if (size >= 2 * AVX512_REGISTER_SIZE) { + ::qs::avx512::popcnt_partition_epi32(array, pivot, i, j); + } else { + scalar_partition_epi32(array, pivot, i, j); + } + + if (left < j) { + popcnt_quicksort(array, left, j); + } + + if (i < right) { + popcnt_quicksort(array, i, right); + } + } + + + void bmi2_quicksort(uint32_t* array, int left, int right) { + + int i = left; + int j = right; + + const uint32_t pivot = array[left]; + const int AVX512_REGISTER_SIZE = 16; // in 32-bit words + + if (j - i >= 2 * AVX512_REGISTER_SIZE) { + ::qs::avx512::bmi2_partition_epi32(array, pivot, i, j); + } else { + scalar_partition_epi32(array, pivot, i, j); + } + + if (left < j) { + bmi2_quicksort(array, left, j); + } + + if (i < right) { + bmi2_quicksort(array, i, right); + } + } + + + void auxbuffer_quicksort(uint32_t* array, int lo, int hi) { + + if (lo >= hi) { + return; + } + + int p; +#if 0 + p = lomuto_partition_epi32(array, lo, hi); +#else + const auto n = (hi - lo + 1); + if (n > 1024) { + p = lomuto_partition_epi32(array, lo, hi); + } else if (n > 8) { + const uint32_t pivot = array[(hi + lo)/2]; + p = ::qs::avx512::partition_auxbuffer_epi32(array + lo, n, pivot) + lo; + } else { + p = lomuto_partition_epi32(array, lo, hi); + } +#endif + auxbuffer_quicksort(array, lo, p-1); + auxbuffer_quicksort(array, p+1, hi); + } + + } // namespace avx512 + +} // namespace qs + diff --git a/simd-sort/avx512-sort-register.cpp b/simd-sort/avx512-sort-register.cpp new file mode 100644 index 0000000..b711954 --- /dev/null +++ b/simd-sort/avx512-sort-register.cpp @@ -0,0 +1,323 @@ + +//#define POPCNT_LOOKUP + +#ifdef POPCNT_LOOKUP + + uint16_t lookup[65536]; + + // The lookup table replaces 2 x popcount and 2 x shift with 1 bit-or and 2 x memory fetch + void prepare_lookup() { + for (int i=0; i < 65536; i++) { + lookup[i] = 1 << _mm_popcnt_u32(i); + } + } + #define STEP(unused) { \ + const __m512i b = _mm512_permutexvar_epi32(index, v); \ + index = _mm512_add_epi32(index, incr); \ + const uint16_t lt = _mm512_mask_cmplt_epi32_mask(sort_mask, v, b); \ + const uint16_t eq = _mm512_mask_cmpeq_epi32_mask(sort_mask, v, b); \ + const uint16_t mask = lookup[lt | eq] - lookup[lt]; \ + result = _mm512_mask_mov_epi32(result, mask, b); \ + } + +#else + + #define STEP(unused) { \ + const __m512i b = _mm512_permutexvar_epi32(index, v); \ + index = _mm512_add_epi32(index, incr); \ + const uint16_t lt = _mm_popcnt_u32(_mm512_mask_cmplt_epi32_mask(sort_mask, v, b)); \ + const uint16_t eq = _mm_popcnt_u32(_mm512_mask_cmpeq_epi32_mask(sort_mask, v, b)); \ + const uint16_t mask = (uint32_t(1) << (lt + eq)) - (uint32_t(1) << lt); \ + result = _mm512_mask_mov_epi32(result, mask, b); \ + } + +#endif + +__m512i avx512_sort16_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0xffff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + STEP(8); + STEP(9); + STEP(10); + STEP(11); + STEP(12); + STEP(13); + STEP(14); + STEP(15); + + return result; +} + +__m512i avx512_sort15_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x7fff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + STEP(8); + STEP(9); + STEP(10); + STEP(11); + STEP(12); + STEP(13); + STEP(14); + + return result; +} + +__m512i avx512_sort14_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x3fff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + STEP(8); + STEP(9); + STEP(10); + STEP(11); + STEP(12); + STEP(13); + + return result; +} + +__m512i avx512_sort13_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x1fff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + STEP(8); + STEP(9); + STEP(10); + STEP(11); + STEP(12); + + return result; +} + +__m512i avx512_sort12_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x0fff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + STEP(8); + STEP(9); + STEP(10); + STEP(11); + + return result; +} + +__m512i avx512_sort11_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x07ff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + STEP(8); + STEP(9); + STEP(10); + + return result; +} + +__m512i avx512_sort10_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x03ff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + STEP(8); + STEP(9); + + return result; +} + +__m512i avx512_sort9_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x01ff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + STEP(8); + + return result; +} + +__m512i avx512_sort8_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x00ff; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + STEP(7); + + return result; +} + +__m512i avx512_sort7_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x007f; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + STEP(6); + + return result; +} + +__m512i avx512_sort6_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x003f; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + STEP(5); + + return result; +} + +__m512i avx512_sort5_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x001f; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + STEP(4); + + return result; +} + +__m512i avx512_sort4_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x000f; + STEP(0); + STEP(1); + STEP(2); + STEP(3); + + return result; +} + +__m512i avx512_sort3_epi32(const __m512i v) { + + __m512i result = v; + __m512i index = _mm512_setzero_si512(); + __m512i incr = _mm512_set1_epi32(1); + + const uint16_t sort_mask = 0x0007; + STEP(0); + STEP(1); + STEP(2); + + return result; +} + +#undef STEP diff --git a/simd-sort/avx512-swap.cpp b/simd-sort/avx512-swap.cpp new file mode 100644 index 0000000..c9d5e80 --- /dev/null +++ b/simd-sort/avx512-swap.cpp @@ -0,0 +1,20 @@ +namespace qs { + namespace avx512 { + + void swap_epi32(__m512i& a, __m512i& b, __mmask16 mask_a, __mmask16 mask_b) { + + assert(mask_a != 0); + assert(mask_b != 0); + assert(_mm_popcnt_u64(mask_a) == _mm_popcnt_u64(mask_b)); + + const __m512i to_swap_b = _mm512_mask_compress_epi32(_mm512_setzero_si512(), mask_a, a); + const __m512i to_swap_a = _mm512_mask_compress_epi32(_mm512_setzero_si512(), mask_b, b); + + a = _mm512_mask_expand_epi32(a, mask_a, to_swap_a); + b = _mm512_mask_expand_epi32(b, mask_b, to_swap_b); + } + + + } // namespace avx512 + +} // namespace qa diff --git a/simd-sort/cmdline.cpp b/simd-sort/cmdline.cpp new file mode 100644 index 0000000..59c6282 --- /dev/null +++ b/simd-sort/cmdline.cpp @@ -0,0 +1,28 @@ +#include +#include +#include + +class CommandLine final { + int argc; + char** argv; + + std::unordered_set arguments; + +public: + CommandLine(int argc, char* argv[]) + : argc(argc) + , argv(argv) { + + for (int i=1; i < argc; i++) { + arguments.insert(argv[i]); + } + } + + bool has(const std::string& name) const { + return arguments.count(name); + } + + bool empty() const { + return arguments.empty(); + } +}; diff --git a/simd-sort/common.h b/simd-sort/common.h new file mode 100644 index 0000000..f2fc23b --- /dev/null +++ b/simd-sort/common.h @@ -0,0 +1,3 @@ +#pragma once + +#define FORCE_INLINE inline __attribute__((always_inline)) diff --git a/simd-sort/gather_results.sh b/simd-sort/gather_results.sh new file mode 100755 index 0000000..b5086d7 --- /dev/null +++ b/simd-sort/gather_results.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +if [[ $1 == "avx2" ]] +then + SPEED=./speed_avx2 + if [[ ! -x $SPEED ]] + then + echo "AVX2 test program not found. Run make speed_avx2 to build it." + exit 1 + fi +else + SPEED=./speed + if [[ ! -x $SPEED ]] + then + echo "AVX512 test program not found. Run make speed to build it." + exit 1 + fi +fi + + +$SPEED 10000 3 asc | tee -a results.txt +$SPEED 100000 3 asc | tee -a results.txt +$SPEED 1000000 3 asc | tee -a results.txt +$SPEED 2000000 3 asc | tee -a results.txt +$SPEED 5000000 3 asc | tee -a results.txt +$SPEED 10000000 3 asc | tee -a results.txt +$SPEED 20000000 3 asc | tee -a results.txt + +$SPEED 10000 3 dsc | tee -a results.txt +$SPEED 100000 3 dsc | tee -a results.txt +$SPEED 1000000 3 dsc | tee -a results.txt +$SPEED 2000000 3 dsc | tee -a results.txt +$SPEED 5000000 3 dsc | tee -a results.txt +$SPEED 10000000 3 dsc | tee -a results.txt +$SPEED 20000000 3 dsc | tee -a results.txt + +$SPEED 10000 3 rnd | tee -a results.txt +$SPEED 100000 3 rnd | tee -a results.txt +$SPEED 1000000 3 rnd | tee -a results.txt +$SPEED 2000000 3 rnd | tee -a results.txt +$SPEED 5000000 3 rnd | tee -a results.txt +$SPEED 10000000 3 rnd | tee -a results.txt +$SPEED 20000000 3 rnd | tee -a results.txt + +echo "results.txt created" diff --git a/simd-sort/gettime.cpp b/simd-sort/gettime.cpp new file mode 100644 index 0000000..5f95269 --- /dev/null +++ b/simd-sort/gettime.cpp @@ -0,0 +1,8 @@ +#include +#include + +uint32_t get_time() { + struct timeval T; + gettimeofday(&T, NULL); + return (T.tv_sec * 1000000) + T.tv_usec; +} diff --git a/simd-sort/input_data.cpp b/simd-sort/input_data.cpp new file mode 100644 index 0000000..7804aaa --- /dev/null +++ b/simd-sort/input_data.cpp @@ -0,0 +1,106 @@ +class InputData { + +protected: + uint32_t* array; + size_t n; + +public: + InputData(size_t count) + : n(count) { + + assert(n > 0); + array = new uint32_t[n]; + } + + ~InputData() { + delete[] array; + } + +public: + uint32_t* pointer() { + return array; + } + + size_t count() const { + return n; + } + + size_t size() const { + return n * sizeof(uint32_t); + } +}; + + +class InputAscending: public InputData { + + using super = InputData; + +public: + InputAscending(size_t count) : super(count) { + for (size_t i=0; i < n; i++) { + array[i] = i; + } + } +}; + + +class InputDescending: public InputData { + + using super = InputData; + +public: + InputDescending(size_t count) : super(count) { + for (size_t i=0; i < n; i++) { + array[i] = n - i + 1; + } + } +}; + +class InputRandomFew: public InputData { + + using super = InputData; + +public: + InputRandomFew(size_t count) : super(count) { + for (size_t i=0; i < n; i++) { + array[i] = random() % 10; + } + } +}; + +class InputRandom: public InputData { + + using super = InputData; + +public: + InputRandom(size_t count) : super(count) { + for (size_t i=0; i < n; i++) { + array[i] = random(); + } + } +}; + +class InputRandomUnique: public InputData { + + using super = InputData; + +public: + InputRandomUnique(size_t count) : super(count) { + for (size_t i=0; i < n; i++) { + array[i] = i; + } + + shuffle(); + } + +private: + void shuffle() { + for (size_t i=0; i < n; i++) { + size_t j = rand() % (n - i); + + const uint32_t t = array[i]; + array[i] = array[j]; + array[j] = t; + } + } +}; diff --git a/simd-sort/partition.cpp b/simd-sort/partition.cpp new file mode 100644 index 0000000..762cc4b --- /dev/null +++ b/simd-sort/partition.cpp @@ -0,0 +1,51 @@ +void scalar_partition_epi32(uint32_t* array, const uint32_t pivot, int& left, int& right) { + + while (left <= right) { + + while (array[left] < pivot) { + left += 1; + } + + while (array[right] > pivot) { + right -= 1; + } + + if (left <= right) { + const uint32_t t = array[left]; + array[left] = array[right]; + array[right] = t; + + left += 1; + right -= 1; + } + } +} + + +int lomuto_partition_epi32(uint32_t* array, int lo, int hi) { + + const uint32_t pivot = array[(lo + hi)/2]; + const uint32_t hi_value = array[hi]; + + array[(lo + hi)/2] = hi_value; + array[hi] = pivot; + + int i = lo; + for (int j=lo; j < hi; j++) { + if (array[j] <= pivot) { + const uint32_t t = array[i]; + array[i] = array[j]; + array[j] = t; + i += 1; + } + } + + { + const uint32_t t = array[i]; + array[i] = array[hi]; + array[hi] = t; + i += 1; + } + + return i; +} diff --git a/simd-sort/quicksort-all.cpp b/simd-sort/quicksort-all.cpp new file mode 100644 index 0000000..4037646 --- /dev/null +++ b/simd-sort/quicksort-all.cpp @@ -0,0 +1,9 @@ +#include "common.h" +#include "quicksort.cpp" +#ifdef HAVE_AVX2_INSTRUCTIONS +# include "avx2-quicksort.cpp" +#endif +#ifdef HAVE_AVX512F_INSTRUCTIONS +# include "avx512-quicksort.cpp" +#endif + diff --git a/simd-sort/quicksort.cpp b/simd-sort/quicksort.cpp new file mode 100644 index 0000000..82aa462 --- /dev/null +++ b/simd-sort/quicksort.cpp @@ -0,0 +1,19 @@ +#include "partition.cpp" + +void quicksort(uint32_t* array, int left, int right) { + + int i = left; + int j = right; + + const uint32_t pivot = array[(i + j)/2]; + + scalar_partition_epi32(array, pivot, i, j); + + if (left < j) { + quicksort(array, left, j); + } + + if (i < right) { + quicksort(array, i, right); + } +} diff --git a/simd-sort/rdtsc.cpp b/simd-sort/rdtsc.cpp new file mode 100644 index 0000000..4228252 --- /dev/null +++ b/simd-sort/rdtsc.cpp @@ -0,0 +1,52 @@ +#define RDTSC_START(cycles) \ + do { \ + uint32_t cyc_high, cyc_low; \ + __asm volatile("cpuid\n" \ + "rdtsc\n" \ + "mov %%edx, %0\n" \ + "mov %%eax, %1" : \ + "=r" (cyc_high), \ + "=r"(cyc_low) : \ + : /* no read only */ \ + "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \ + ); \ + (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ + } while (0) + +#define RDTSC_STOP(cycles) \ + do { \ + uint32_t cyc_high, cyc_low; \ + __asm volatile("rdtscp\n" \ + "mov %%edx, %0\n" \ + "mov %%eax, %1\n" \ + "cpuid" : \ + "=r"(cyc_high), \ + "=r"(cyc_low) : \ + /* no read only registers */ : \ + "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \ + ); \ + (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ + } while (0) + +static __attribute__ ((noinline)) +uint64_t rdtsc_overhead_func(uint64_t dummy) { + return dummy; +} + +uint64_t global_rdtsc_overhead = (uint64_t) UINT64_MAX; + +#define RDTSC_SET_OVERHEAD(test, repeat) \ + do { \ + uint64_t cycles_start, cycles_final, cycles_diff; \ + uint64_t min_diff = UINT64_MAX; \ + for (int i = 0; i < repeat; i++) { \ + __asm volatile("" ::: /* pretend to clobber */ "memory"); \ + RDTSC_START(cycles_start); \ + test; \ + RDTSC_STOP(cycles_final); \ + cycles_diff = (cycles_final - cycles_start); \ + if (cycles_diff < min_diff) min_diff = cycles_diff; \ + } \ + global_rdtsc_overhead = min_diff; \ + printf("rdtsc_overhead set to %d\n", (int)global_rdtsc_overhead); \ + } while (0) diff --git a/simd-sort/results/haswell-i7-4770-gcc5.3.0-avx2.txt b/simd-sort/results/haswell-i7-4770-gcc5.3.0-avx2.txt new file mode 100644 index 0000000..4b2af5c --- /dev/null +++ b/simd-sort/results/haswell-i7-4770-gcc5.3.0-avx2.txt @@ -0,0 +1,84 @@ +items count: 10000 (40000 bytes), input ascending + std::sort ... 0.0001 s + quick sort ... 0.0001 s (1.24) + AVX2 quick sort ... 0.0001 s (1.48) +items count: 100000 (400000 bytes), input ascending + std::sort ... 0.0010 s + quick sort ... 0.0009 s (1.17) + AVX2 quick sort ... 0.0007 s (1.48) +items count: 1000000 (4000000 bytes), input ascending + std::sort ... 0.0117 s + quick sort ... 0.0092 s (1.27) + AVX2 quick sort ... 0.0066 s (1.77) +items count: 2000000 (8000000 bytes), input ascending + std::sort ... 0.0244 s + quick sort ... 0.0192 s (1.28) + AVX2 quick sort ... 0.0137 s (1.79) +items count: 5000000 (20000000 bytes), input ascending + std::sort ... 0.0700 s + quick sort ... 0.0535 s (1.31) + AVX2 quick sort ... 0.0378 s (1.85) +items count: 10000000 (40000000 bytes), input ascending + std::sort ... 0.1462 s + quick sort ... 0.1108 s (1.32) + AVX2 quick sort ... 0.0782 s (1.87) +items count: 20000000 (80000000 bytes), input ascending + std::sort ... 0.3049 s + quick sort ... 0.2301 s (1.33) + AVX2 quick sort ... 0.1614 s (1.89) +items count: 10000 (40000 bytes), input descending + std::sort ... 0.0001 s + quick sort ... 0.0001 s (0.92) + AVX2 quick sort ... 0.0001 s (1.12) +items count: 100000 (400000 bytes), input descending + std::sort ... 0.0008 s + quick sort ... 0.0009 s (0.87) + AVX2 quick sort ... 0.0007 s (1.10) +items count: 1000000 (4000000 bytes), input descending + std::sort ... 0.0087 s + quick sort ... 0.0092 s (0.95) + AVX2 quick sort ... 0.0066 s (1.32) +items count: 2000000 (8000000 bytes), input descending + std::sort ... 0.0183 s + quick sort ... 0.0192 s (0.95) + AVX2 quick sort ... 0.0138 s (1.33) +items count: 5000000 (20000000 bytes), input descending + std::sort ... 0.0514 s + quick sort ... 0.0536 s (0.96) + AVX2 quick sort ... 0.0378 s (1.36) +items count: 10000000 (40000000 bytes), input descending + std::sort ... 0.1068 s + quick sort ... 0.1111 s (0.96) + AVX2 quick sort ... 0.0783 s (1.36) +items count: 20000000 (80000000 bytes), input descending + std::sort ... 0.2221 s + quick sort ... 0.2299 s (0.97) + AVX2 quick sort ... 0.1613 s (1.38) +items count: 10000 (40000 bytes), input random + std::sort ... 0.0001 s + quick sort ... 0.0001 s (1.25) + AVX2 quick sort ... 0.0001 s (1.53) +items count: 100000 (400000 bytes), input random + std::sort ... 0.0010 s + quick sort ... 0.0009 s (1.16) + AVX2 quick sort ... 0.0007 s (1.48) +items count: 1000000 (4000000 bytes), input random + std::sort ... 0.0118 s + quick sort ... 0.0091 s (1.29) + AVX2 quick sort ... 0.0066 s (1.78) +items count: 2000000 (8000000 bytes), input random + std::sort ... 0.0246 s + quick sort ... 0.0192 s (1.28) + AVX2 quick sort ... 0.0137 s (1.79) +items count: 5000000 (20000000 bytes), input random + std::sort ... 0.0703 s + quick sort ... 0.0537 s (1.31) + AVX2 quick sort ... 0.0378 s (1.86) +items count: 10000000 (40000000 bytes), input random + std::sort ... 0.1468 s + quick sort ... 0.1106 s (1.33) + AVX2 quick sort ... 0.0786 s (1.87) +items count: 20000000 (80000000 bytes), input random + std::sort ... 0.3058 s + quick sort ... 0.2304 s (1.33) + AVX2 quick sort ... 0.1628 s (1.88) diff --git a/simd-sort/results/knights-landing-7210-gcc5.3.0-avx512f.txt b/simd-sort/results/knights-landing-7210-gcc5.3.0-avx512f.txt new file mode 100644 index 0000000..b50f9c0 --- /dev/null +++ b/simd-sort/results/knights-landing-7210-gcc5.3.0-avx512f.txt @@ -0,0 +1,140 @@ +items count: 100000 (400000 bytes), input ascending + std::sort ... 0.0063 s + quick sort ... 0.0067 s (0.93) + AVX2 quick sort ... 0.0071 s (0.88) + AVX512 quick sort ... 0.0055 s (1.14) + AVX512 + popcnt quick sort ... 0.0056 s (1.13) + AVX512 + BMI2 quick sort ... 0.0058 s (1.08) +items count: 1000000 (4000000 bytes), input ascending + std::sort ... 0.0728 s + quick sort ... 0.0735 s (0.99) + AVX2 quick sort ... 0.0685 s (1.06) + AVX512 quick sort ... 0.0522 s (1.40) + AVX512 + popcnt quick sort ... 0.0528 s (1.38) + AVX512 + BMI2 quick sort ... 0.0554 s (1.31) +items count: 2000000 (8000000 bytes), input ascending + std::sort ... 0.1529 s + quick sort ... 0.1523 s (1.00) + AVX2 quick sort ... 0.1402 s (1.09) + AVX512 quick sort ... 0.1057 s (1.45) + AVX512 + popcnt quick sort ... 0.1063 s (1.44) + AVX512 + BMI2 quick sort ... 0.1105 s (1.38) +items count: 5000000 (20000000 bytes), input ascending + std::sort ... 0.4433 s + quick sort ... 0.3935 s (1.13) + AVX2 quick sort ... 0.3520 s (1.26) + AVX512 quick sort ... 0.2609 s (1.70) + AVX512 + popcnt quick sort ... 0.2609 s (1.70) + AVX512 + BMI2 quick sort ... 0.2668 s (1.66) +items count: 10000000 (40000000 bytes), input ascending + std::sort ... 0.9257 s + quick sort ... 0.8108 s (1.14) + AVX2 quick sort ... 0.7222 s (1.28) + AVX512 quick sort ... 0.5265 s (1.76) + AVX512 + popcnt quick sort ... 0.5288 s (1.75) + AVX512 + BMI2 quick sort ... 0.5402 s (1.71) +items count: 20000000 (80000000 bytes), input ascending + std::sort ... 1.9285 s + quick sort ... 1.6725 s (1.15) + AVX2 quick sort ... 1.4766 s (1.31) + AVX512 quick sort ... 1.0681 s (1.81) + AVX512 + popcnt quick sort ... 1.0699 s (1.80) + AVX512 + BMI2 quick sort ... 1.0949 s (1.76) +items count: 10000 (40000 bytes), input descending + std::sort ... 0.0004 s + quick sort ... 0.0006 s (0.73) + AVX2 quick sort ... 0.0006 s (0.74) + AVX512 quick sort ... 0.0005 s (0.88) + AVX512 + popcnt quick sort ... 0.0005 s (0.89) + AVX512 + BMI2 quick sort ... 0.0005 s (0.87) +items count: 100000 (400000 bytes), input descending + std::sort ... 0.0048 s + quick sort ... 0.0067 s (0.71) + AVX2 quick sort ... 0.0070 s (0.68) + AVX512 quick sort ... 0.0055 s (0.87) + AVX512 + popcnt quick sort ... 0.0056 s (0.86) + AVX512 + BMI2 quick sort ... 0.0058 s (0.83) +items count: 1000000 (4000000 bytes), input descending + std::sort ... 0.0551 s + quick sort ... 0.0734 s (0.75) + AVX2 quick sort ... 0.0684 s (0.81) + AVX512 quick sort ... 0.0524 s (1.05) + AVX512 + popcnt quick sort ... 0.0526 s (1.05) + AVX512 + BMI2 quick sort ... 0.0547 s (1.01) +items count: 2000000 (8000000 bytes), input descending + std::sort ... 0.1154 s + quick sort ... 0.1523 s (0.76) + AVX2 quick sort ... 0.1407 s (0.82) + AVX512 quick sort ... 0.1055 s (1.09) + AVX512 + popcnt quick sort ... 0.1065 s (1.08) + AVX512 + BMI2 quick sort ... 0.1111 s (1.04) +items count: 5000000 (20000000 bytes), input descending + std::sort ... 0.3300 s + quick sort ... 0.3932 s (0.84) + AVX2 quick sort ... 0.3516 s (0.94) + AVX512 quick sort ... 0.2606 s (1.27) + AVX512 + popcnt quick sort ... 0.2602 s (1.27) + AVX512 + BMI2 quick sort ... 0.2657 s (1.24) +items count: 10000000 (40000000 bytes), input descending + std::sort ... 0.6890 s + quick sort ... 0.8134 s (0.85) + AVX2 quick sort ... 0.7216 s (0.95) + AVX512 quick sort ... 0.5273 s (1.31) + AVX512 + popcnt quick sort ... 0.5351 s (1.29) + AVX512 + BMI2 quick sort ... 0.5392 s (1.28) +items count: 20000000 (80000000 bytes), input descending + std::sort ... 1.4358 s + quick sort ... 1.6744 s (0.86) + AVX2 quick sort ... 1.4772 s (0.97) + AVX512 quick sort ... 1.0694 s (1.34) + AVX512 + popcnt quick sort ... 1.0675 s (1.34) + AVX512 + BMI2 quick sort ... 1.0929 s (1.31) +items count: 10000 (40000 bytes), input random + std::sort ... 0.0005 s + quick sort ... 0.0006 s (0.98) + AVX2 quick sort ... 0.0006 s (0.99) + AVX512 quick sort ... 0.0005 s (1.18) + AVX512 + popcnt quick sort ... 0.0005 s (1.18) + AVX512 + BMI2 quick sort ... 0.0005 s (1.16) +items count: 100000 (400000 bytes), input random + std::sort ... 0.0063 s + quick sort ... 0.0067 s (0.94) + AVX2 quick sort ... 0.0071 s (0.89) + AVX512 quick sort ... 0.0055 s (1.14) + AVX512 + popcnt quick sort ... 0.0056 s (1.13) + AVX512 + BMI2 quick sort ... 0.0058 s (1.08) +items count: 1000000 (4000000 bytes), input random + std::sort ... 0.0727 s + quick sort ... 0.0742 s (0.98) + AVX2 quick sort ... 0.0684 s (1.06) + AVX512 quick sort ... 0.0522 s (1.39) + AVX512 + popcnt quick sort ... 0.0526 s (1.38) + AVX512 + BMI2 quick sort ... 0.0544 s (1.34) +items count: 2000000 (8000000 bytes), input random + std::sort ... 0.1533 s + quick sort ... 0.1532 s (1.00) + AVX2 quick sort ... 0.1403 s (1.09) + AVX512 quick sort ... 0.1057 s (1.45) + AVX512 + popcnt quick sort ... 0.1066 s (1.44) + AVX512 + BMI2 quick sort ... 0.1101 s (1.39) +items count: 5000000 (20000000 bytes), input random + std::sort ... 0.4436 s + quick sort ... 0.3932 s (1.13) + AVX2 quick sort ... 0.3529 s (1.26) + AVX512 quick sort ... 0.2622 s (1.69) + AVX512 + popcnt quick sort ... 0.2632 s (1.69) + AVX512 + BMI2 quick sort ... 0.2657 s (1.67) +items count: 10000000 (40000000 bytes), input random + std::sort ... 0.9235 s + quick sort ... 0.8115 s (1.14) + AVX2 quick sort ... 0.7233 s (1.28) + AVX512 quick sort ... 0.5305 s (1.74) + AVX512 + popcnt quick sort ... 0.5328 s (1.73) + AVX512 + BMI2 quick sort ... 0.5379 s (1.72) +items count: 20000000 (80000000 bytes), input random + std::sort ... 1.9264 s + quick sort ... 1.6716 s (1.15) + AVX2 quick sort ... 1.4898 s (1.29) + AVX512 quick sort ... 1.0853 s (1.77) + AVX512 + popcnt quick sort ... 1.0849 s (1.78) + AVX512 + BMI2 quick sort ... 1.0984 s (1.75) diff --git a/simd-sort/results/skylake-i7-6700-gcc5.3.0-avx2.txt b/simd-sort/results/skylake-i7-6700-gcc5.3.0-avx2.txt new file mode 100644 index 0000000..ee7c7a2 --- /dev/null +++ b/simd-sort/results/skylake-i7-6700-gcc5.3.0-avx2.txt @@ -0,0 +1,80 @@ +items count: 100000 (400000 bytes), input ascending + std::sort ... 0.0011 s + quick sort ... 0.0009 s (1.14) + AVX2 quick sort ... 0.0007 s (1.59) +items count: 1000000 (4000000 bytes), input ascending + std::sort ... 0.0122 s + quick sort ... 0.0099 s (1.23) + AVX2 quick sort ... 0.0065 s (1.89) +items count: 2000000 (8000000 bytes), input ascending + std::sort ... 0.0257 s + quick sort ... 0.0204 s (1.26) + AVX2 quick sort ... 0.0133 s (1.94) +items count: 5000000 (20000000 bytes), input ascending + std::sort ... 0.0724 s + quick sort ... 0.0576 s (1.26) + AVX2 quick sort ... 0.0376 s (1.93) +items count: 10000000 (40000000 bytes), input ascending + std::sort ... 0.1516 s + quick sort ... 0.1194 s (1.27) + AVX2 quick sort ... 0.0773 s (1.96) +items count: 20000000 (80000000 bytes), input ascending + std::sort ... 0.3167 s + quick sort ... 0.2480 s (1.28) + AVX2 quick sort ... 0.1598 s (1.98) +items count: 10000 (40000 bytes), input descending + std::sort ... 0.0001 s + quick sort ... 0.0001 s (0.91) + AVX2 quick sort ... 0.0001 s (1.15) +items count: 100000 (400000 bytes), input descending + std::sort ... 0.0009 s + quick sort ... 0.0009 s (0.91) + AVX2 quick sort ... 0.0007 s (1.26) +items count: 1000000 (4000000 bytes), input descending + std::sort ... 0.0095 s + quick sort ... 0.0099 s (0.96) + AVX2 quick sort ... 0.0065 s (1.47) +items count: 2000000 (8000000 bytes), input descending + std::sort ... 0.0196 s + quick sort ... 0.0207 s (0.95) + AVX2 quick sort ... 0.0132 s (1.48) +items count: 5000000 (20000000 bytes), input descending + std::sort ... 0.0539 s + quick sort ... 0.0576 s (0.94) + AVX2 quick sort ... 0.0377 s (1.43) +items count: 10000000 (40000000 bytes), input descending + std::sort ... 0.1130 s + quick sort ... 0.1195 s (0.95) + AVX2 quick sort ... 0.0778 s (1.45) +items count: 20000000 (80000000 bytes), input descending + std::sort ... 0.2353 s + quick sort ... 0.2479 s (0.95) + AVX2 quick sort ... 0.1604 s (1.47) +items count: 10000 (40000 bytes), input random + std::sort ... 0.0001 s + quick sort ... 0.0001 s (1.19) + AVX2 quick sort ... 0.0001 s (1.51) +items count: 100000 (400000 bytes), input random + std::sort ... 0.0011 s + quick sort ... 0.0009 s (1.13) + AVX2 quick sort ... 0.0007 s (1.59) +items count: 1000000 (4000000 bytes), input random + std::sort ... 0.0123 s + quick sort ... 0.0099 s (1.23) + AVX2 quick sort ... 0.0064 s (1.91) +items count: 2000000 (8000000 bytes), input random + std::sort ... 0.0257 s + quick sort ... 0.0207 s (1.24) + AVX2 quick sort ... 0.0133 s (1.93) +items count: 5000000 (20000000 bytes), input random + std::sort ... 0.0726 s + quick sort ... 0.0578 s (1.26) + AVX2 quick sort ... 0.0377 s (1.93) +items count: 10000000 (40000000 bytes), input random + std::sort ... 0.1518 s + quick sort ... 0.1200 s (1.26) + AVX2 quick sort ... 0.0778 s (1.95) +items count: 20000000 (80000000 bytes), input random + std::sort ... 0.3170 s + quick sort ... 0.2490 s (1.27) + AVX2 quick sort ... 0.1614 s (1.96) diff --git a/simd-sort/runtime_stats.cpp b/simd-sort/runtime_stats.cpp new file mode 100644 index 0000000..2fa5e7b --- /dev/null +++ b/simd-sort/runtime_stats.cpp @@ -0,0 +1,76 @@ +class Histogram { + size_t hist[256]; + +public: + Histogram() { + reset(); + } + + void hit(uint8_t v) { + hist[v] += 1; + } + + void reset() { + memset(hist, 0, sizeof(hist)); + } + + bool empty() const { + for (int i=0; i < 256; i++) { + if (hist[i] > 0) { + return false; + } + } + + return true; + } + +public: + void print() const { + size_t total = 0; + for (int i=0; i < 256; i++) { + total += hist[i]; + } + + if (total == 0) return; + + for (int i=0; i < 256; i++) { + if (hist[i] == 0) continue; + + printf("%02x: %5lu (%5.2f%%)\n", i, hist[i], 100.0 * hist[i]/total); + } + } +}; + +class Statistics { +public: + size_t partition_calls; + size_t items_processed; + size_t scalar__partition_calls; + size_t scalar__items_processed; + Histogram pvbyte_histogram; + +public: + Statistics() { + reset(); + } + + void reset() { + partition_calls = 0; + items_processed = 0; + scalar__partition_calls = 0; + scalar__items_processed = 0; + + pvbyte_histogram.reset(); + } + + bool anything_collected() const { + return (partition_calls > 0) + || (items_processed > 0) + || (scalar__partition_calls > 0) + || (scalar__items_processed > 0) + || (!pvbyte_histogram.empty()); + } +}; + + +Statistics statistics; diff --git a/simd-sort/speed.cpp b/simd-sort/speed.cpp new file mode 100644 index 0000000..aa58089 --- /dev/null +++ b/simd-sort/speed.cpp @@ -0,0 +1,457 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "cmdline.cpp" +#ifdef WITH_RUNTIME_STATS +# include "runtime_stats.cpp" // must be included before anything else +#endif +#include "input_data.cpp" +#include "quicksort-all.cpp" +#include "avx2-altquicksort.h" +//#include "avx2-nate-quicksort.cpp" +#include "../magyarsort.h" // mine +#include "avx2-natenodutch-quicksort.h" +#define USE_RDTSC // undef to get measurments in seconds +#ifdef USE_RDTSC +# include "rdtsc.cpp" +#else +# include "gettime.cpp" +#endif + +void magyarsort_it(uint32_t* array, int left, int right) { + MagyarSort::sort(array + left, right - left); +} + +class PerformanceTest final { + + int iterations; + InputData& input; + uint32_t* tmp; + +public: + PerformanceTest(int n, InputData& input) + : iterations(n) + , input(input) { + + assert(iterations > 0); + tmp = new uint32_t[input.count()]; + } + + ~PerformanceTest() { + delete[] tmp; + } + +public: + template + uint64_t run(SORT_FUNCTION sort) { + + uint64_t time = 0; + + int k = iterations; + while (k--) { + memcpy(tmp, input.pointer(), input.size()); + + uint64_t t1, t2; + +#ifdef USE_RDTSC + RDTSC_START(t1); +#else + t1 = get_time(); +#endif + sort(input.pointer(), 0, input.count() - 1); + +#ifdef USE_RDTSC + RDTSC_START(t2); +#else + t2 = get_time(); +#endif + + const uint64_t dt = t2 - t1; + + if (time == 0) { + time = dt; + } else if (dt < time) { + time = dt; + } + } + + return time; + } +}; + + +enum class InputType { + randomfew, + randomuniq, + random, + ascending, + descending, +}; + + +const char* as_string(InputType type) { + switch (type) { + case InputType::randomfew: + return "randomfew"; + + case InputType::randomuniq: + return "randomuniq"; + + case InputType::random: + return "random"; + + case InputType::ascending: + return "ascending"; + + case InputType::descending: + return "descending"; + + default: + return ""; + } +} + +void std_qsort_wrapper(uint32_t* array, int left, int right) { + + std::qsort(array + left, right - left + 1, sizeof(uint32_t), [](const void* a, const void* b) + { + uint32_t a1 = *static_cast(a); + uint32_t a2 = *static_cast(b); + + if(a1 < a2) return -1; + if(a1 > a2) return 1; + return 0; + }); +} + + +void std_stable_sort_wrapper(uint32_t* array, int left, int right) { + + std::stable_sort(array + left, array + right + 1); +} + + +void std_sort_wrapper(uint32_t* array, int left, int right) { + + std::sort(array + left, array + right + 1); +} + + +class Flags { + public: + bool std_sort; + bool std_qsort; + bool std_stable_sort; + bool quicksort; + bool avx2; + bool magyar; + bool avx2_alt; + bool avx2_natenodutch; + bool avx512; + bool avx512_buf; + bool avx512_popcnt; + bool avx512_bmi; + + public: + Flags(const CommandLine& cmd) { + + enable_all(false); + + bool any_set = false; + if (cmd.has("-std-sort")) { + std_sort = true; + any_set = true; + } + + if (cmd.has("-std-qsort")) { + std_qsort = true; + any_set = true; + } + + if (cmd.has("-std-stable-sort") || cmd.has("-std-stable")) { + std_stable_sort = true; + any_set = true; + } + + if (cmd.has("-quicksort")) { + quicksort = true; + any_set = true; + } + + if (cmd.has("-avx2")) { + avx2 = true; + any_set = true; + } + + if (cmd.has("-avx2-alt")) { + avx2_alt = true; + any_set = true; + } + + if (cmd.has("magyar")) { + magyar = true; + avx2_natenodutch = true; + any_set = true; + } + + if (cmd.has("-avx512")) { + avx512 = true; + any_set = true; + } + + if (cmd.has("-avx512-buf")) { + avx512_buf = true; + any_set = true; + } + + if (cmd.has("-avx512-popcnt")) { + avx512_popcnt = true; + any_set = true; + } + + if (cmd.has("-avx512-bmi")) { + avx512_bmi = true; + any_set = true; + } + + if (!any_set) { + enable_all(true); + } + } + + void enable_all(bool val) { + std_sort = val; + std_qsort = val; + std_stable_sort = val; + quicksort = val; + avx2 = val; + magyar = val; + avx2_alt = val; + avx2_natenodutch = val; + avx512 = val; + avx512_buf = val; + avx512_popcnt = val; + avx512_bmi = val; + } +}; + + +class Test { + + std::unique_ptr data; + InputType type; + size_t count; + int iterations; + Flags flags; + uint64_t ref; + +public: + Test(InputType type, size_t count, int iterations, Flags&& flags) + : type(type) + , count(count) + , iterations(iterations) + , flags(std::move(flags)) { + + switch (type) { + case InputType::randomfew: + data.reset(new InputRandomFew(count)); + break; + + case InputType::randomuniq: + data.reset(new InputRandomUnique(count)); + break; + + case InputType::random: + data.reset(new InputRandom(count)); + break; + + case InputType::ascending: + data.reset(new InputAscending(count)); + break; + + case InputType::descending: + data.reset(new InputDescending(count)); + break; + } + } + + void run() { + + printf("items count: %lu (%lu bytes), input %s\n", data->count(), data->size(), as_string(type)); + + ref = 0; + + if (flags.std_sort) { + measure("std::sort", std_sort_wrapper); + } + + if (flags.std_qsort) { + measure("std::qsort", std_qsort_wrapper); + } + + if (flags.std_stable_sort) { + measure("std::stable_sort", std_stable_sort_wrapper); + } + + if (flags.std_qsort) { + measure("quick sort", quicksort); + } +#ifdef HAVE_AVX2_INSTRUCTIONS + if (flags.avx2) { + measure("AVX2 quick sort", qs::avx2::quicksort); + } + + if (flags.avx2_natenodutch) { + measure("AVX2 nate nodutch", avx_natenodutch_quicksort); + } + + if (flags.avx2_alt) { + measure("AVX2 alt quicksort", wrapped_avx2_pivotonlast_sort); + } + + if (flags.magyar) { + measure("Magyarsort variant", magyarsort_it); + } + +#endif +#ifdef HAVE_AVX512F_INSTRUCTIONS + if (flags.avx512) { + measure("AVX512 quick sort", qs::avx512::quicksort); + } + + if (flags.avx512_buf) { + measure("AVX512 quick sort - aux buf", qs::avx512::auxbuffer_quicksort); + } + + if (flags.avx512_popcnt) { + measure("AVX512 + popcnt quick sort", qs::avx512::popcnt_quicksort); + } + + if (flags.avx512_bmi) { + measure("AVX512 + BMI2 quick sort", qs::avx512::bmi2_quicksort); + } +#endif + } + +private: + template + void measure(const char* name, SORT_FUNCTION sort) { + + PerformanceTest test(iterations, *data); + + printf("%30s ... ", name); fflush(stdout); +#ifdef WITH_RUNTIME_STATS + statistics.reset(); +#endif + uint64_t time = test.run(sort); + +#ifdef USE_RDTSC + printf("%10lu cycles", time); + if (ref > 0) { + printf(" (%0.2f)", ref/double(time)); + } + +# ifdef WITH_RUNTIME_STATS + if (statistics.anything_collected()) { + printf("\n"); + printf("\t\tpartition calls: %lu (+%lu scalar)\n", statistics.partition_calls, statistics.scalar__partition_calls); + printf("\t\titems processed: %lu (+%lu by scalar partition)\n", statistics.items_processed, statistics.scalar__items_processed); + + const size_t total_items = statistics.items_processed + statistics.scalar__items_processed; + + if (total_items != 0) { + const double cpi = double(time)/total_items; + printf("\t\t : %0.4f cycles/item\n", cpi * iterations); + } + + if (!statistics.pvbyte_histogram.empty()) { + puts("Histogram for pvbyte:"); + statistics.pvbyte_histogram.print(); + } + } +# endif // WITH_RUNTIME_STATS + +#else + printf("%0.4f s", time/1000000.0); + if (ref > 0) { + printf(" (%0.2f)\n", ref/double(time)); + } +#endif + putchar('\n'); + + if (ref == 0) { + ref = time; + } + } +}; + + +// ------------------------------------------------------------ + + +void usage() { + puts("usage:"); + puts("speed SIZE ITERATIONS INPUT [options]"); + puts(""); + puts("where"); + puts("* SIZE - number of 32-bit elements"); + puts("* ITERATIONS - number of iterations"); + puts("* INPUT - one of:"); + puts(" ascending (or asc)"); + puts(" descending (or dsc, desc)"); + puts(" random (or rnd, rand)"); + puts(" randomfew"); + puts(" randomuniq"); + puts("options - optional name of procedure(s) to run"); +} + + +int main(int argc, char* argv[]) { + + if (argc < 4) { + usage(); + return EXIT_FAILURE; + } + + int count = atoi(argv[1]); + int iterations = atoi(argv[2]); + InputType type; + +#define is_keyword(key) (strcmp(argv[3], key) == 0) + if (is_keyword("descending") || is_keyword("desc") || is_keyword("dsc")) { + type = InputType::descending; + } else if (is_keyword("ascending") || is_keyword("asc")) { + type = InputType::ascending; + } else if (is_keyword("random") || is_keyword("rnd") || is_keyword("rand")) { + type = InputType::random; + } else if (is_keyword("randomfew")) { + type = InputType::randomfew; + } else if (is_keyword("randomuniq")) { + type = InputType::randomuniq; + } else { + usage(); + return EXIT_FAILURE; + } +#undef is_keyword + +#ifdef HAVE_AVX512F_INSTRUCTIONS + #ifdef POPCNT_LOOKUP + prepare_lookup(); + #endif +#endif + +#ifdef USE_RDTSC + RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), iterations); +#endif + CommandLine cmd(argc, argv); + Flags flags(cmd); + Test test(type, count, iterations, std::move(flags)); + test.run(); + + return EXIT_SUCCESS; +} diff --git a/simd-sort/speed_avx2 b/simd-sort/speed_avx2 new file mode 100755 index 0000000000000000000000000000000000000000..6bbb97af765ae7642175b092dcfdf2809fbc5db4 GIT binary patch literal 92216 zcmeFadwf$>);E6Arcj{tRIFG>gs4GV1*8Z{xJZL3oJfKlxro<-r3hMDY!fcZO_PFW z!a+vS@fn>N$MKoh8J!u%85Qp>Z2=LnAR>ZFLDYnDDT+`kkoUXxK07@PY18TN_j&$! zpPqiwz4l(0z4qGob#}6s<+{ctm`sV1{v=8_NhZJzR-TZfr#I$PDt~gM;ZmxUBV8_8 zq%KG@nGwlocugJ^K`!T2RT7>=;G#;L@4z9CHTmm?S1s>ly1}WYpr~FdOe6x z&tAH&XwH2%485*s=D?yk#omPj7Y@H+;0;3um6Z${!c-u=2IE6*as0$7Qo?eng3<)P zNiwncB}k(uL64KY=^@&5e5ksJ|3KM6CD0{p)cCM=dfcNgOq@_M@D-w=a*#}-p@-7w zBOX%!YJ7->?8C}^c^XU0L_F)VRec5?&%YO7%;$cj_KJa#Avjk0lgKDmdQL0!b6b(W zqLuUskaI;0eJ*Q7{7KhjFNwUzX@S}E5%t>|fMMb5LW&|lJu{FAMuf7?p>zgv;N zua)$!t>}4QE9t#kNq?vneF|DhKi*3Eovq0KODpt$Xr)}#&Mo*vKf79y)325EIj!ge z@|f~-TcID&3jN_$@|^_wSp8@SI9IKUT37fTiTJi@U^%+J4nZb@-| znP*0+CqG}xcTRTa&nzr0ynjxar?7Og`{tsO;=;)@?kg(fxY(FGCwXoxF7eDREm3Kq`DzYm!f=NHWI z6wC(c%%VI(+`n*P{=CA{vXbH%MRPoh^5+jG+zi4&+K}Nbq?MM=Sd?E}xFEl%u=sw@ zY@(S6ML;yO&@*FB(U8IUd8Kpa7Uq{0&Yn>=JAYnD(VT)s*+cW);3}M{C+E(cOS+;I z#E+7ZdJD6&vxnR_qijwAY*jj^_ObEs{!S6wj0%Dk~{P9kY_nnnM-! za3PV;Ety|PLWnItA2TA^j@sGW8FPvmYsp**$q-p^?X@&H-ayg05$OH`a8N9p(hGR{ z_?vIezkbkQX}rrh_U8N{gN8<+>}GJtpkX|D+B6Uk9W*3cx_R0(`&egw_Mqzr4R3+V zZW$e7Gmsj8?{Hu{pnol!A0Ve-jmb z=)t@%JqJ*XPkC1@>LE!zq2f{p$$An?%mir=6&8I~_9xoGx3Xh^@@kltWOzrv8-Xh^pj(j|RdrQ9zwq>FKt)^oIO5|0>%DV^3? z;>k$k`jhS);uwn|9VU-FnTB-Tlngi6kZxSp4L77y-r~tIq?7H$qZraf9S|Hl(LSQnCIwq@y86o`r_=)JUo%EjOgghV;h`>1++FkhK4p zBq`+;o`i6%;;T-sV#cV*X@1Zz32ST+hzJucA!U|Qe z_!f$j>ngO4#Wzr#TvnkMSp02@ldCGUhQ(i_IJu}o%US%d6eo9CXda8dNO5vig{HIk zZz)bLs!$$_KS^i;ti< zxsXC@SbPY@$#oQ3&f?cloLoksc`SYf#mQ9^n$F_Cpg6gRLU}CSi{j)O3gxhPcZ!os zC^VSG6DUruppb>dn~ow*F3nIHi~m4za-oJK7C%IBauJ0Z&W6KEV6oMrl!raPsA*gf zd>?jGw8hrx!Robu)MLR_FcC2IA-Ugj@JROC?NjYj?31TVQXW10G7@Gg0dxOM0E+6j z(p%5$3ST%;UQGuY)U!%pX|hxO>20dN?32N-p@B27BrW?SOHovZH97bp65RoBhN2BS z@gB*kva8iD^;GZ$Aj&J`@)KaTe?U#Fe6PHEJKAP$_J@jZTUhQV$uG*;cD2^7Zjeo< z)z9r}mHeWjHaaWoEV=65Ty=*6$la~h)or&ct3t7DROImuc3-6_*S6KWLD5EceP5DX zn#Y=#U7@JM-h(lNjb~sN?QUzjdK7B7)I-6ku!c)Lsi^yc5B`=+R0N|HOtzwIyB+F& zhkA}|HCW>|d-;zDg@sQA?g8^C*F}%uq9xYtxzK)kZjkJm+Hx6S0umkE~ma6Xuxcv{CC0H(ac+r$0=3MYl^ zzy|L{9_v~36r%PI*f4T6@u45uzD}@&V2oB=@5t&;TaU zc$a;|scyelQ4c7-gAI9;U0Tw~e^9}DE@Q!H1x(?Kwj7eA6Khc8`v;kTOxi<%tD!M_bJUT|HY)n*48G`8uA3$I zb4ZflC;gU>_JUXPUOz?gjUEC@wDE5}J(a-dsfRHZUt~d2umC!4OmczHbWc@t5vyi3 z!!%x#ywLri)%6Q>v2}eL(XjlR3i*u+dF%!w$1Ie0YPfu-LV&DQDuZJb zQ?0VSQBgO*m`+>6k~_iV($e3gjO~G54%?Q}PaJBuyY8ui{wZ%lwzt8hR#5pBUwwk2 z?olcaLCPLwdy`_T^{!Mjhm;VQyz&gGPg$j`43)qrv(~s&=tz=ud%+gpiZRTo?wo`s zQ>WNY$^NU+78Uhp#kNcKQ@bQp)#J{r6G~R2l6Bf?I)f@tP%2L&`wdFg7R9z*_WvG) zyuKB6lalp`VmmEs(@@un@3cwwKZ-F+(b6$2P&o8LjI;9U-VD9L;?gGIGd`@8?~s4L z!DZWKm!GP1ByW-Z>(K?AzQzQ3#osy9MP8ADoSf?E>>at;70${~qUZuO=Ss{M;$KQ{ zr?0|Pb8cA*V$00E72gJn+qT)ePf@WRIMC?QQl2I?CTiv$F4H!27{&Iv>@P=>Q*Cl) z?Qm;%hMm5j6XuU_X?>njOk1G%Q`MCZmn7CSmHYz0vP)~4mL<3_qWTV;0Sy!9G;MdA zcDqbpJ8e5WBkh4SN1%7Enx^)4YQHoghox8B19@ggV5-^at4zpM^UUg0v&;6$QW@#V zj=;p^TvbU{rzR^}pLdl1Qcj&FbNGKlHc~e$S=*cKy6R;bC$w}@8TJjszPG_BF3o%) zY>OdIlHBShXVxjD{6l&5PDpT4tGcbur8&dStdnlrF1L1XU5@F?a4mrtUQv48He@j>dK`h=IiNy`VbV!1ZY`0#h00f zD88exdC0q0QMW0+?;4%z56)A2oRtR>ZbheYn`)7u*pAEo=b@`p{m7a1qg#6rRzI6C z{}$yG`ptI06 zif|$W+mD{H_P{UC*aP#<*wu&d$vflJZbMaUUz!C(BM^;F--ZMr8-Z+e0g(p82_Q}& z5udyh%Be4$TAwGVKG0>iqe?uhuUs`5 z7W3{8&0#}c4kp8&Lo)$Wqg8iPcla^-O3o=oIbteu_5k zjFR;Ob!1=DP4bHIs9wd_m@KatO%Qr+<*@{GMP%HVA+NX%i7s^mChE#ViLh-FjXTuO z67g?97j&-yfR>rND832{4L{#tY*Ku6jgiKwrCg|(esJ0_DV{}DlgY9+@eINV^M_(d zlAxHX6jKALbud2YV#}__2icOKGm18fw{p{I#nc2jo*@BKV1#N@6BNyE#wx~hdBAQC zJZx4MnbomoB*IJ^Jt+Zua^T@)by2c97E6;8q1#{ymo|V4)5_2hufIhzUrV$jo!XQ$ zSkOSZ;L4xE;c=?x+B2cc!QcoKU5I)(WcKt?v`5T}`ZduMfX1O-8w&F_Q2z-M>yIkx zDcC7%Um!g(kkUm>NmSFxEKsr0TOV443IJK?A*u*8RrQPPiqP>E^)JR^tS5bkGZl5R zHN#DdiTSJxZl%8Le*iGlHaXREZnXxLlpy;ZNORl1m96_p+AINwCvk}MlP* zs-?;Pmk^tvP417T*MtJ7h&nN~s$WwvR@uPpR)d%j-RdbP=15mogWD9O5;(Uv-lFwc zq0ckW-$FCc_9my6@|%FoHnOw;s=KsB5!LHl>S9)UF5_A;}YtX_z{Vy0mcD%L@2ZM_ful$Jl#j~a92)vHhk_9<*-@tJ)xdfXI6 zD=||=CbLoeN3tr`Nf@t|nyoY>IekY=PHmJw(6#!TMZGteYEIAFyuar3(n?$H{Moh! z`7v4n+kPmWSzWnkR|;dHE}_;(^tWx9FH?UEOh9p~%(m^NKcU9Vwq5hn*!YZ0cznT6 zimj?_uzEd_AyrKd^soogt@ZCg{tLE-vK+fQ)T(Z?s|rTTbmZl-n(Kd!)xL{rWnZut zYqZ!k)bY|{?M}7nxe^T{)E`K@YPPDF$H2!0%sUgBh;)})Mb+j~zjtY~sfA$e>fChsvWr8P z!EB|n-I4Eb2JbR1v~LquiYZpbRw+O67FDD+;3ou6XYCNef^#BYVOkvt>a+$t& zt7qI*N8Qn8wl2HQjrnK+tgYvZ)sI>2%5w{PegiRb z`8=TOS7A{=?H1L9F|yLBe(aKO4NlNf{-FT0o%W1SY#(|1)L*anFJFzWwPLIH^d*8z z0#hV=psTHR!2z|a6Xl^?fb|}5s$1PzXNZQYd@eXzR~fwhP%{}yRy}lAR1B1Rnmbjf zH<1C|oFvw#SaaH^Iw}$H+pdbVpzk=Bk^|*ljz@Zs7LA z3Qtih*+d>RBfaz}=e!(@-c6#sRgwM-UF*Lx>~S0c4D&mj>WB6~chb{A?T@;itKbVP zx0Hg)u_K56Vg5ofe%aLowGuOq$>v#kru4Y8z!XSd=~OFhyUH%NV>93^z_!7GaY{AW zX>tvkuuU-Aepql68Q5)`7yllDF}eI9kgTSGTg483 z0r+36{}6R1sSVoD-sn>5@ybv?({ZKXhwZ1`TAy`F0ZMQ^HX=_1Y-+Yr&|p_b+G>}K z@*Tnen6fTw0~QEw&9%m?x8J}8w$Dp@IklndP!=qr)!yK7U@POR#CrKd?>>6~Gj=** z1KrgTcC`m`r*ciUW0i@aSN&{~T}!{lp?<2?RtEc-YSly7Vm)gQr1;5>wojMMRtlW_3J2Sc&%X=hf$0K+6)nf?t1{V+FKDoxF6%Qvb3A`W^fOfM za2jDz5Zmd}2HT#+FWJ@MphkjS&9;5I^jW)Y$KuOvUo9Iz)lF3mJwdzPNLRnhfz>(M z>`0i`_SMpzc6CpPU_)CVy(9ZI^(6X(%I6vM2<+M(J^TXNMa`Maft&u0@NCKGeF$Ut zqktM#O(cs-dfp9}K+Tepq@9T0tfddqjpZO~Ij}~A!OVVNlPs_P9lAu#>Eb$|9O{qR z;pziRrX9>and3hx`;Q{y9Di8$e}x?g#P5d4FYGF&gdZI$mPubn+P6b~W1~a6J?zjX zhI8aMKKU_Mez)RB`P~m4mw)aqs2!`i)=o!Taj1?}IS$Jx7f48_H2;85yodP zGMH@5slIRS4W9zM+d3VyWAW#t&yMP`g*WV_=-!$mdsz~I`MYYq-Mg>m+xcJD9NG6_ zb>-Z>J8F*9e^~SFzKZI~!o7QIzOAo7F}x{C!AHKTWVU%(Ma3!ahYC;%w)hUQMAxc= zRN8Vj6Bc~r(8e~_5765I?SRayvQO50HLI2k0Buq~%aNblhz6Tn4inMV)uC=wKg!zX zJD0lb6DEHE%w^jnubv8|UEN{N+LNn|HQ9Yl*mMnry`fOe{$nW6Jxz7!3#%MzB`mua z`(14jmhuxVO6B)Sis_U36w&{YodwJC-k8Kt(5Bj&duE}qvvyY7`#0^W`L=0~V*A9q zr}~>Ewml}27CQH-Z+~OW{<1yE4lU&pic`c)Ky|?q+g_|mmVRDcIfJJsw;%v&7Y#^$ z6IMRC>ZkUsk8-t1(EsPe`Ngh+z0_P>rZ2Fn`BI|YTKcN$12aY*s0q#AS+jrsm(`VX z_HBpgeP7gkf&3wQCjpfAPKe%K6Dr%AT(ke&4$~(QohaS3&7p4h?LT8Tedr*sgAc9G zU9dejd+#m!u6}$VeXYY*YnOjhC>Psr-{%r~;<9PEfypUm5JwXvfWU#SdLt1e zM-vdYuQCz7n8ldUT`4KpY1uX-hf0TOn{6`_-XQyB)TrIo2&GV^7(X!1EJiK53gCF% zPvgv5toKrKF{*2>6m#H4Eb>+igY_cyr+yO1Rx~FYZyFXfIBXSVmrc-IYtQg;12X}p z1jQGk0SCy^qZo01%ZD3>+LezutDDDXJN!4R7SqDi_Lcm^W&lcn`=#M-HS6xy1JD)w z657WaHnBB}^Y++JeM%j02JgZ&zOHZNeZro#yFPQ)8fy3G7ujJn{O#21(eN+ckt46& z1Ray%-|^o~nQNnx)h)g-Yxn33$=Ec(D!iwKn&8*e2aH^Q*wZc7zgI4={+aeyG zhfx`!F68|z*g`zWFjRvm$}c~*53Bdk&$Lft1C#Aj`H8MfMWfL%xTn|y+05wnz;IjR z{Hem8n*%ms??#O84K%*P+>!D9L(~U1x2*x=JH(UY#jXw(W4x_C&}?teR2XrgBIx2! zX*p7Jc3FxQ{~T5eZ6JqW<$+Qs-T4<7A0zP&3NbQY!^6>6P>8|233~vf6Ke@rsUBAk zXBvDRx`kvc&TFC(2V-sMJ(m7O_J`Tw?31Af$VBQG|uMWmw-VX=8n}_&-?REbh?yM*YFpWRh3x$B4w2 zIxcPW@RJxprn6iU2|Bw!Eq(8Tna?4x*r38Ce?%+Of_d` zZK*kvSBY04yGDnBl>Gq=XQR&nTxK1;k5X3OffR@KIK6GP`%d>=kaBB5%4)aixIMgo z)*8$bKRV>~n=9)Qm1nCfmz1QeR2v*wo9(9R>I2>zDp=m|p1~PdJKvA?sD1|pw29ab zZ-$ey^n%dK5Z~N?)Xka_c4{6?3Z&XTS$xs0q^jLlZOX+v2(`Ce?MCeLzK@OvQD$WIVjcsN($d6$)E7{*+_1l%zuXFFCsG~OMWuq$T8u}9rbeL2fw6{N&eX@CS4GqQET$TMHnuoeA)U&y~SaKJf z#L5!ymqxvb^hC%JI+vU9a$XnIZF)PfGD32?YE=DO()UR;d;G59WFQfh#hYc z`{yAUF0Fsim+P_RIGhak4IF`r_B#oSg6xXmCtpYEQ3+g+DUYd<9DEb$@bY;8c7ry3cS9S&+`xbpDz-SAeT{wP)xX8E%~?>X-P)L|EkZ}Z zOM+1=$Wdv5JNjyTO44dhFSFn?JB^Z)ku*IG{|2j@9a)V?#~X9}pJ^q!nbzsP#;)>e z8xrsE7iHXPEIO+8L?JR57aW3|5dM|u4Z^lq_W z2aP^#*mz3OF1l{JgcqLc8~I>@;bSu<{yvXkA8YzK(o)SUoCVePtP^aZwBjJs585of z0cYcNKh^tJc}G_D#@6J!8ipBSaNa(XQyEOu=Aj9U#a84yY(+NKcVq2}-UXn$^dfhkZKD@k z_dR&)rj1Rr+dlQ&ivt2#yBylR4Zfon_|7HF$Gey<~utM1c2XhXM1Ru~j785rkS03}W5?TMyRAv~UkQOw*pcHZU-mmRa)K3>uXrgYqTUr8qaiQRV?N`Jig zC?=i7zgAN*e6b`ot=@2;fRx~Ske(;|{pGbb6Ej|jdORvwh?O#00cDHrFvT4-47aBH4f1*OS+!;aU;#buR zbQRSbLo?8M^zqa7vApUsM2#f~y^jIju6_~nuuToR;Wj$NVnM9_SEy}g!RI(mz+ND; zS0^DCCJvO?zMf_>`v+`3+mpRxJv)~50cHd+g=`S2*3t{UZ6FU0*#jNc*r&1?I;kyVkmW;qZ^jD|lP{3XT)V{%-x~wGmz|eC^cM zGOUudo9u-12?^v+^wo`#{r`dj@c!df^DGYG#myBQN|gOy%M5Fl{lDcLL5t=__AjitK;5C*$ZT`)}Y-s_g%PjuH79 zWw=&2gm+30aOeWre-(#%$^HX48wa`zaXNuR>9YS$4qYVsdvmC_?B9Xu5p);J{?#1H zko~uC=n~oAg+srP{TsS5x=UsMQV#Wz{UbPZne0D_x`LFxc-zIHezL!qLl)V84Tmn5 z{omolALy=-{V#IpO4)xehpe*y7aZy@`*-1_An2}={ZDe}YS}-LLz%L_JBPAl|HdRn zca7|Sj6>JT{+l>7K=wDGH9*Qh+5ZNI2Fd_3E&9dy~U|IZv6BKz;- z(DkyvFNcQ8{yi9_KsQYGKf|FLaQJ{jH_HB04h@(6Td>9iolW+y;Lr%!KbAuyWk0Q= zfgL6L-{#P0&%=n{(-5(oShcxYzgQ&oqeTqOz0dnJ7mVBr@V|KmyV?%zVaF$Tj4h>OY-t^1OYaz4M#tFt zbd1f?F*a+**fKlDHlSl{gFD7Hv}0_;JH|GuV{AAZ)&b|wct^+Bl#a2vI>wgQF}BGa zW1H47w!1sVHoaqPGdspMyJKuc9b=olo(?9p-$g!<;X7nDf;RbFS+!=f6A5xxT}k_$qh@ z>$9T6oYfuX+}vT#tsUm9?J(!g4s-7AFz1&Y=B(>5=fMth20P4oxWk+c9p*gIVa~=5 za?*aWycS=;a%z6|3OsuUPST!qQ|x;jaKecxNl6^8B(9^=OL!@stRz0(NUyaM*EHf6 zOcZq?zDl;PkzU@a9HBAWOJ!drT6D4mAG+xk<%L2rk{n<#>F6!;?OS-hB^4-!RNXvTMV1}nBr zo*qtZ>KXjh$iLpg87%dXc=s>Pn+IG^osr+D#5cw_;hqh=@W;u)E_T&bj`AHcdF$9$ zwHgE$I|NY8Uxg#DCpm;e0JAuRLjWTKfD?YRa2Z}MD$W2>*@i>pWR?jI)`NPw1t28wt5 z)Id#Yy04R>(;l7eqq8@3jt-q;Vdvb?Iafxl3|d_`utkOcC{}!I1>w@hgbQ*+2!54lW2*$jexglmLz~#cZDNCMVh^^7t!oqeWt-UDZDMz}iLGrDyR}X1<~Fg_ zZDK3h#J<}mc72=Jf47NU*CzJWHnA_aiG8U}>P5j;znQa|@2FKXV9ULXvgUCJj;t{pf+H)HLvUn$#obqMWW5@3WIe(mII?cx z5FA;Ta0rg9V8oI2X2g-TBI3xJ!XY@atQ>+P>&J*AYeU45_3MZuYkI_ybsdM`$T}Bs zWPKcQWIfLzII1?2#&079D*ZjH+NORk@bH#1V@(c(t;x^H{!@j=MWrO-$Wc) z^y_xy$imN@a%i+?GIM5K9hm$Ieiq7$FKbl>)9|CTineMKok?!?XW3PJOFJ<6Uu+rQ zNk+d-)oGB<7IM_o*%vzdLg#4NIWLG8Kb`YJ=gNURc#k-VPdn>omYbinhFM`>gemBeQi; zx}3GXh3z9Fr)E2(k!!OZWaQv%2N}6J+d)Ro&eq9*i4>N+p7HEPj?dPmMMO<&9~n78 z+aZlyq3s|ehiE&<$Sv9qGIEZ#P6kY*u;e9;XE$<`wk|CqYIys|$Z6URY2-R>2N^j~ z+d)Qd)OL`OGqrUxU?PPjuWCHIkz=)WX%SHw?IR;6YdfTotF;|ut@we%bPYS8&;HU0PyJ4!f~`%~e8P2~_guyx#H3J>gS9D)bd&mnkVr*a4$ z*h~(=1KY$srtrXS&pL-4@zv!C>PLi+JScwk@RUQ~Es=W_@iScOCI!0Nx|0}t$B?mLAC_I(b) z1G|Pp@W9^3Asi(f${~1QyKo2|*xlTV3J>gGIRp>vVh+IrJCQ>;N@(E_Jg`4-FDg8+ z)f|Ec_8AW0C?VZZi1Okn;Rp`lC}9eR;DOZ-6~Y6%j(b*dl<+YQ;V9u`4&f-_RUE=m z!ba{zg$H&Ehj5fo_q*aKVKGmG2R4U8I7--yL-4>JnJg|>*2p-rwIRp>v z01n|Oq5j)J@W9fEag+l`318Gbu%3m8kM>N&|C>BkyypKK{HjOTDma6E2OK{eR^9A; z<%bD7X=qgIv`=T-5Wbeqe$hE5bdCx1D|4M=Lg$zuzI5@|8gAsOL6ebzt~3kpO3@;Y#jXBIQTVj@T=nB zACH6g$H6oIVXNg|`#QM~+Z3_(b#fiXAtAS691?OF#vvhhVH^^26~-YUH(?wS;U8?n zF617JBZXXpaY)E57>9&hf^kU59T&$PE~WMEL*OunW2W;z%LaUmOy0`^6z4 zmtPzba`(j{Ay;1<5_0p!Arbz)Hta&~y*N_HwHJqk+3TN zBKW^Y#s8b)knvYEc?HeqnxxQn1lJ6}J~Z zP093I^K{|HfQ#*uaVxLZ2e)b-PIlrNCVxGlvo_!w3`Kj;j7w4#6?a+xgkO4gp> zX^h*7uLf5ZjC9%#dqz3}OC-n0k)=BWxUg8u#ch+H%Rc&!EZH;azRAz&zDZ)Wfz?}s zt93m#_PhJI2a!BuR$O%9u%5v!YCiIw&053wH|qCQ;{KUqPG7ZIO$DQOYt$tj{P+89 zM?Bx#-(`ikioPG;tpAm|{x<0EpnGR)%uZ0hv5GUu6cFY&dRIlLJ6mOm~J)5#LdRe%EM!vro*_X!nqx{I5gr)x&eQp zdc!@+SK+b_I~o_e2s-5{n@#rRl8crug1#eioAMqf?g+@jCGh8P-Rj*|3%dv=>s#Ej zYQGINz%GYA=FU3hf;tu#?vls|{-zHaI~p6J`$6ZzwGtT$$}$)kvO8qbtwfzk)D{Id zb7TZh;Bo_EL0Z$aJJ^*gZd(&^rlIsZ6x$(r)jG6Tr@B`O+)d@)il0Dmsb4v932)X> z7aAJwp#6bK|33HzZr+}tr4M!5c6ugJ6P+m9TfmL)ftjtg&GY+H1GTGe2{+Wxb+(>s zT($$=eW<13Vu-WnS5f1$tGIR^M8JFM=w6$*3EJh#!Yw~^l~PNVkPVsRatJ@PrP^we zS6z+TilI2ndXzFI#c@`hZUa4w1=$46JQU@@n+IaERl{{pE#7woccn_uU+=7nuik9i zQr1W9!HAzjW`VSTn_S=R}&wK6bD1m(I#F38CG%n}XUG?%948 zAj0~0D6ZvUG}wt(1XR&k2Warj`W*IScV#r^=fvgF>er!cCdDDj4<0a3{brr2-_o-G z&~?G=q9BaKbjf;F(-gWQ8hz+Tg+_a|7FSw~L93mHnMP4PxCR;b0kM{+Uo-F22Hd2m zi_u>y!wCxN1aJmX4pz|U2}nclTZ()>!L7}#rcMQYs1v$poD#mqCBwl|R!N7G-MEaM zD)Us>gVv00Xra58GwDj2j9||*RFl=S4K0lIRJV%TaE4+^@mO)YY-5JJ`tPVu7os^X zb*5Eut0(wAtwE=DxAkECb^5rx&vP;Q&UzY|UATWR>jzwE)Ft@pe42qa znRC@5>p{2ejAvAC;JVzAh?H)%2X4o7Fdh@iXR_}b^nBD))Oh?EjmKFe0KDF6yw2nI zfoI51POB$|JR|IZe)f^~NTnYOwQ%*KZyM7I?VyETwoglssh5yelohUAMcwsz2k+mcr6XN| zA}bp>Gg$uYE@)9_xLyYnLdvC)fkLVLCLtI?s>}BC;+<3zgPrOTQ4^WGCfM!YFdvhO zJFrBZaJyru_0{xoX>)^$cF)23!Qwte+#`+pIHcFdcWCmc4|A}Om+FINXzxKX*yvm~ zJ9-aNxu>8yXnx-%`yA-|G3IwZKk~uoh_}jxyUeqKOiflFP@dI?!i~TxrO1jGrH{il zG;SvI`uQ0bZ?F=nMwhh1Ju8*#D8G9qL@v)6cB^I7ZGj)RO~a^9nLj zL|?I-(W6BNe>UbJ2ot57u1hbHe+qD<(q@rhr7D8h4c;Zq+SWtwvU<6_Hg)WN*9l8eN z3qbXf&>IXo7n*|RjP|du2T#&^(5$ZqzvJt{u2>J^7Wfet8T-{yGC){exw(j!q(s@|wX64gdy$dV6|AvvI;38bbh2&n8TwdYC7mEEKAsuri zpB|~C{eNcT1bWV?=Ak0g4}DH=Ul>G;x2H$u8<$pSeuu^YTESgiKi!2D;Dt0uX^Ss| z7VJLrXCVgd4|7^Iy7jl7k-34PR>w%MRJy%+iRiHXFn@@?K5V|vTwkBnMDokUy`z(u z>HJhUDn(XZSgz~ub4--c43oPbc>N%P&w*h1~+hc?QideEf0#^c6h*RAi{2kV! zY)udjz6e@3z7!Tn!Q$hPJF5{gz(AM72j_zuVaf`qqC0RS{HEe~H}hO*h>+11ROs z3VoN0n#}k<_CQXux*(Zde5u}^rrw&Nj<@>GndOx$*^*+HdJ-*Z`e5Aahi{MIt}gRy zd^rp~_`Z@8-S1Z@!-~bxs>y%mW1nz@#lKIt2H(g(CCHXic zo2!k(y8yZcZUo3NaMI0W4Y-D`|7%0oJ++r@L8w$7N<`X4UjwG=&`#2Ambe8?F>TS? zE3nv4;EuSjc79Xy)6PTH-+0H>Sm^)9*)?b9RqV#U5Am<4vgYHa3S60%gOygTce6`v zP<(Zb_LCcN@Z$+IY}|x}=}pPPwr(n>(x!y@H(_mIs@{LPxPf%T)PAb^n-8zpfQzhm z_FO{B`My6>)3~wnaa^YMiBxkO;+vM4A5~0^itU`|CI=Qb+R$>cLe>T~{ZW$UwC$ST z9~Zwt;{@Eywm2JWmyPo;1Sv=+XD1uSmn)+#3*+;2&&WKZ z-rMma3ic*bkI$y&K$rhrb~k~#wdXOISvk_bKvNyD^I5X4h|3O-^!ICNrty>pH zl~-(~bXXNuB%8`B-a&%u$W9Ku24L2j-~<>Xw%x9PW)ZtN;83LYip!#%@7+I(Kdj|G zKysSf|8qo$@eK{zdFlvjmyKg+fryPfhWIBHQ-zXMIaW=19}|_yJrc7}=@wVuT9U|QjJip#?wI1(VE$jU%%KiL$|H0G$ z{5qe1fuX+Vo2jmG!DwWBdgTGKFNXH)9nQcgtMaT<-F%BS^dIi5pd*lqm5#}IssR=C zn?Rb1t80^8w$Oqtp}SBkY{KLb*sr@FL_Pc#7JUT3w%nZEQUl5Cs1~@CU(XDOex;H3k0+{t7{J`{y^tPErCTX}I^#ds)3rAD=#*gN_K% zA5mDo*K@g|ex(GK;CJkEu~)wbj-5^3Jw)(VPT<|=#PT?qERlT@hC<6ouc1LR=mdci ztG8O`CWkdhNd(jp;Ie)1=}8Ni&`c1~d`FS0X7$$CZ>N!a3wc18yY11#v{XIsnpbUn zj(TJDH^;B2z|iBs8gkS43^Z5?p#=ejnG|jskGXL97(|Eb$vG5az#pINJ7)4P+0@LX3*gS$V37XDUBHDNhrC>>)gT=Gi>JD5@@jTWndSMi&CCH`N zoKvd8evuK!30K}hJ`}n%vHV6rcKMBT>*JFlGLViz$YGtf(VPHr9_zHXyUvmEqi&0> zdBLaeM@0wbW@PW61?8!q(BMBK#hog=Trj&5$i;o8=2|QmJri633+*d$b0t`*?_|2m zD`bB*ibfkWwFZ}LDz=Yg{{p1A16C^(_Vi+eB!4sHV=VT33Ca&(E-Cmq<*v=kP&QGX z$h4Yz6m5s{l#ymDeu@zVadJIi9D>kBmql;4JqMamiP^qfwD9<8RP)x4Z$eh}En181x-6Rt#L z<2`I@FqK@slZ4tn^6qwG^O7ccYbk#(?(b$9W70szUtmb6FBkhiY_XBHlu09>gNnLWyNELeM^#J8zN(QHE15Y787alzystCGDV7@o!3 z9vUv%5!xV`cRE(Lv`jw0@}NX`9`ZtuvH=SX6W6W#2a(|Qv=qc6E4b=*c(TOEmm zq~zfHOvYpk&REfH!d4}CH%sxN0lKV%b8!Llrd$iGis|vBH@*IsYm~snih2PmO;MBS z?&>kPm>iZ2+@J*Pnbhz!^DZSY9K)Lz`|TyLhfPu478j^}Hc9rcf~*N@%0?&LHV|0} z8aQ7TTZd!vwNT#b?(7{db#4Y^xm2{9h|18HZRWV0YEGA+&(Wp1aJz(VHP|l~nH5@C zn?h$wRRTFw)u+4}Zq*9a*cH;I(38wuIcR&eA$p^Qa^_GuX^%NP)I`-RmK|UxQN6aEVt6=REn8qJWr&k{rf_ zK243wlT4_ANOG!QqK~CfbWjtgJc(qAuSUM4GQ3r}!-FpnKl1t`%`5mWR8{>GAgX_< zdAOzfV%exe{^p<9+(2s^yyghZPY$L*nvcpEO^e<;%O0sh#Ze{jvR?2sH_B&xmOeRaD;Gz4% z;qnUaKaC8{8ryH@Q-f1IgUxqw8g0JOvjgV-G~QDyolT99?U-Czku^ur?zZM_8lTK6 z3;SIx*47U!^UNGrR#K{9e90l6Jd9J*JTyEhI2kS>c83()PgO<3&a)WWsh{O2wtcAK zzd%9zMzhsqXYadoaSxT1*xxSo&?LYziHM%%fNOgilZ_MWB>PS&ST1DuX(`*yWIL=T zUAPOAj|JvO2ghiPiluFx3$A_m`v%hhf*tT;t0kDkDjrRnCK3!T$I#xBUg*y)f=)Hr zIv8^vO2I~Ic;czlnKbhRY30I-c?s6jR7{J@MoZfbGpLvjYeI85AM(yAco5}$tCe!P z&MW8dgm81qau({s6Gb_jjfPU@4V*E5A+nx4gqaN+O)}dQDL7Gxds^33uae%nICHbE zkT7K6jH2k;fSmIxx)CcdW`TY!rB>kV09l}mF4ew%Mrn)sB~|CnxPK9<7|JqTaa^Bt zohk)sUB{&Ulg(O@=JS9qHAzSXc~{d1!?1N?%A!fmLY+=VB^pdIm79XvHRx}bf;S8C zhgyp7p^Hz}#lL@_ms-Iw-qzQXWrt-C88{1W%V{h>-umyH#ni{QFi8GgQt%t{I8p0= zpr!m*^0`sE>GIzyj%xFUEL_&RS?@wQrp+svSHQBseN2?ay;5+w$YOlUEEGM9?s^t) z-#52~+0o-7rJ$~*ci=ncL=}PV*JedHg)9vnICw8FLJuq*FkTq$`u>h4Jzw?07K?)A z;KuC82Mx~**izIJ%X$5MMR;)v9j2jFcNRwJ@H|Fno;7UwH2NzaLkgjSp|4FvbUCH~ zOn6VDUPFHTYo6b!k<0$$C{1u62t(fk68j5LEc}QybPx6fo?LMEGZIIOVjsT;IR8nx zybIGR`fi zlVBn^nSa+l)va!$`R_BnH)8Kw@rIw;Mvlu?xwOb>t5|lw6E9Y=J{YH{pDI~D!l&Ws z!CrNK61)SF=*4_GThpO&kaHg|vw}H5&{isy5uT(eeISzDT6!NeGK}1{G1{4}a@k#O z^^j8jp$BhT@gntml8sk_nxt5AH-N*xXsrGVFQ5rQ%&0 zI8FxN0m7+nsMqxN#4^?yo3js{zz9m~OfkR04|$g6*5STo$#5^8!@v+2kNwd_Bm1IJ z6__96e#JF7O!4na(f*t3Cl|}c(f3?NO>bT@=3*_KdlX;06K#A+A8gHX+15-h&%k`R ze92{!T)quM{PHDz0ogFZEni{*bSGwvsmA(#x!6m6J|bk;`i#LH^x*lF1n z!E<*|iy!@ghOTu0RJr5;2;mBP2VTPUf!kt@J1{QU9k_|QMcOS|dO3|1%o&aD6)XXZ z{Gyy~S9hu#X=$IV;=TDni%ShVyPa@g`P_2>r%-p%61xE>U@Ge@O1DP5Rdd>E;Qx2Z z<35AmNs)6m_=>E3@J6A=8=&z=8$Aq)LdQFqj=7l_={z^ck5{3lYC^Z~sNP?HO$B+Z z9#<)4|LuFLE5EtsrE#gF}u?gHNRFryT_l(7%{5)17QXnhjgL)P#u$&|citgACw z$$~$DiW%SU!x+(qQS0b8)ySy-n#PQJ3YCHUmLb4<5vcnLr3XeAA-~`gkjhLtA6&wu zyq$8pqS=R=t-6gCJZW@DAUT*0))uD4j*-^Tur6%e?L_uTyOtLY9$DG}`}(+jKl=r< zZ`19haKygd z1MDq~+@k&JCcblF2Tc4pH*x7D%*4OJ`yYde8?Xki`SA9W)%!ovO-$ED)KuOcuCA<# znD}P!auZKu?87cWDR~pVnA(`Z#MC*1Lr|E|>tbGP!>*UXKFqEScn1i(hBKI5-=#pe z>vx0+jQ&3=?79WHGrK+tC52rh_1EV3a4T5j+P6FWcSQ6b4H3G1dqHRPxf_|fktQr* zjBVM3H)7cu-GslvjV9c_jgzUg!`_0sgxmK`GI+FocOa9{geEB459^LL?E4<}V$8m4 zNUk=_$F5FcKDVUXb=ZRKF%)0QavnVfIS229d}e3*xkF*+w%hM9u*b45JMM=;0+ZKP z^hO$v3mSskk->jzXu13mOf)!L_;PQ3iOP;yk?tGt8r0CJf5ddw(&%qdO4PVf3^Cm3 zKE^)Gu~Ha)ELCE((eFVC*~ICeiP5|bqqA=#)3|&o$<>Bkf{f@zwU%`7BHpfbllNfx zj^4TgCO-z4nfw{F4`K55jn5viwlIC{@%eXk|GCY*K4?80TC-(%#NM~v*23Nol66}Q zBH5G@HHe%`7xuo8u@Bpc8Jsur9b^cyx9mTK1qd2R@EB;=g2jKwDVhZfv9y@mVrhXX zh^;NYz=DRg@mGIIT47TSo4B$Rj(iuPMh)`Y9Dw--&A`FCmE*82M5-;fcpDJ|V% zx8ILH?SN_Z^@X<=8=QZ8YYXH4={)1kiDBGv;N|1QU5tHLAD=MpFQ{#`H13TkOz6oI zSXR(_{SPwM$CD9b?MUmnl;V!&xR>HX^*Aj7L(*9?bkBzfaVbNfI|Q-+pzfq>5Bd(6 zly0@v`>U?mT=V0!4@1|nS(eo!max7$6Lmyo`yr*<2Ab11)cicHBGibfV%8d64u(O4 zoKJN*uOVNtALVke@Dg(N>T(_dQL}%rO}(x-9W5`0UiFPJz}&em}R+4?|+;y)#rS^lm&9KfJe@c}d>)S{KnB3(f`_V;UvBrzcH85?dGj zDEDo`8El?|eEf&}o5A3v^nb(*m6q=(Iqm z1v)M8|Dpvn%Vzh z?+A;8{{y7b8O1Y8=1Mcl3JQy7&MCfMnpqeDM|q1MEG}74e3RrYn{j{P2x-CW!qP(N z8jEvc-jvA}_6)R?6c<`bW{r?|-0+`^o>jP@HI%n_&O^YPuaY}!kLyy z&O380&dIsA;g#yG6DR4q-c>TsGpD4u>>kiv!)dst11-hgx%U;8q6n7jhYY-Lj>l41 zR5-V=*i%L_=|Op$#Tn`6cnV8rFtLozP;?GPjB3d&DYYP$6`QWvjDrUanU%#U zaf%6~oXIoezM{f>mSCScZHR@P{;@R2i)VNWq1Q~Wr(ia=vY?nzXRGfD}B z=B0CH&_NMt`hjq`c0ayYb0i#o0iWgg=tYVdgyTH)4io+{$&3%)3(&PXk_Dd}e3s*b zXL+0t(a=|G*+a7Efwz!7B$J-jLdut()l{94^J@5oW+`xE0}Syg55ValBSbLqQu6 z=1_Q7IJ_2N296lKjBp{sg9wxF35N&3nLQkb+2|KagYTox5l;I6c!UcPHX_8=Ys1U& zW}y*bCJa2hCLI1Z!pA=hhf`3##*gs5G~lOUta+B`5Y{4`y&HPt9ahE{&>P|Gec^C5 z!i9L5K)?37`&c-k`bG`Knq_XyQH^Rrqgu`$^vxn+z z9zJW4)(v?`xv6O@5^hP&@Fgbvy?bh!JvDi3kA(3I7y$J1_$)+Su7FJ>JJH_Uvy1XX z;%c*&w5p5Z>zZg!><)b=&4>JUw%JIAW;}kuH38L-vGuo|=JrgptRi${77}&^zD^ycz8@QpXOe<6JE%cU6~L zQOiy<)v~<@<+%>@Gw}vW=s|f_BTbszzC80ldII$8K!2L}Tzb12`$VF1wb`j9ja$_P zd9E_?mX+AoR5f<%*pJ6<8~cfUBh(m|(k0E;bw$FeE?UxR^AqU)%oYlOyD=W%^&5MJ z|F zysHzP)RxiyRwXIamKENXa}p-TY6@hVI>^}xIfEj7Xk5=O%)WWliBQr-x^;Pfr|e7* z=egKu(Dk`5=19_|VoV_0QUCup#*_1 zE#bM;WEvA)NMDb!`DsdLZQns{AH4}}e@Ue6J9}Wj;N1ZI1*8j69?MqDMc_Bz1zlKu zqu<0H6UL`jm=fQ^p&wnFA8tUvz@*gS^ITupsZOU$KuQmCFV`1!gdOp2-| zM~{M*^+xhCVW;OI&yA4l7d5tGJXz5tm$%1tro>;GW9p}ne$(+;Nb)|%d={xoCJ(yl z?VHKN2uEXePQnaGL&|wWb;8Wl;kTTZ4*8Gc^Cj}wjP^1VeSqtn8?hezNrK+JS}Dj?TW5e^s8SjO57n@ghlgkCRs3HQgeqpOD_ zrtTQ~XG3fT>5RI%3-3A2r;sztC#wG0+^W~tyoBdkk(c;7wcGP+aw4@0>I*a$R9}yu z!AE(_C)&!3&Ew-!>r9Dxu_tmn$)qum9>bWHR2Ui4z>TrI%c>;JygKm-G9l~V2-BcL z7DDr{$kOQM_zb`rYTCEqF!9yRdfgVefOlZJ*59m zA%YZQdpCs=s#1t=OA66#Pa#G;;pc{#uJI=(9qcBZNKC5jF1_0&>9g+A@4EgiA=J&Z zs~bgLC!rs8PukNxvh*jt=u>#dA7BeD;k4;5eWlH2(AAk&B)rmB`WNZ+ao01ZfA*CE z#Q)#j{Rz+ZmFh_=t>>>3{?S*elL0TsRPkb8>62c7zwPBu_+4M=m5TsZUF1vnTVLs= zzBfY2u&MVqCLDKKo?vM-OMgm8YA{RhQEtGP`yuRnIxWy?fldo_TAPg_N?p4CFA>Q?13wrJ@`@;d)DjtK6+f>e>xW~p`=Fg!Qc`5w?^{!;vshuJr?pY z;L%@8NF#K5+QZXBK6QG8f0f6Y-$zMlTtB^75d`h$>Cv;3u(4#%JiVYIeYS20p?9)w zNNm;FBl5=)SXLe+GSYK;{Bd2;upZadm+-2E(GSt_r=#$?`2X=>vtD1(8Q);xDdY5T zsvgeL!})r+LJyzO!$0fc8+y1=4|nO|cY1hI54&70v8T5lUZsaO=;1g$oT`Vj^l-i& zuF%70^zhGm_=X;C)WcnR_?;e})Wa@U==tm6ReH!%|M&b)?q&MlQ&>Yu(NEc9Ncvv~ zQq9wNNn;fazce<}c-$2qdfLK^R?|^_;-iP;w3a{7w1yYDk&M>(1x+kGrO_j3VwIn0 zTEkPikU@IJ;ukcr@RTO_i7pnupoxX2G=T>glix@aEuZ4_7-^#GpJ-acQ@W5L^oZsc zG_mlMCisbtY!S^bXky_hP2d5>x(U6HRM)N*6MWdK&o!O)Na634Wpz zLMNjPBcAx9X^eP+h3-Z=WBCP5EIg$NJmn>HHS!y2qVW_r@)IqU#mH}@iN;ghh)481 z{%9Jbe1fC%Cwa#FjWj}*zzcnhGNSpTX^eP+jq(v?^(Ev}Jer2&3B1rJT86+Ae=Hh- zr;wglG-CcIn%3}?E@VXODfk6VEIg$Nexfs$L+}flSa?bkcz`kajWp5nDQ?W4WQn>I z`3o8$$A}j?2^q2YjWp4CiVOJwnVpUDDa}YjvW$44lTn5dPyEp|Mm)hncOxB*38MUh zCKjI31Rh{SS0lfXMwCV1g*}WqP+0_C$P{^yEF)g%B=AIU#8W()#)v1_$SY`#`5S4X z@f0`WDL*Qckw2Qoh!^&V*1^ahO+)eoUg#68gTRwMv1kOILZj`Cd5QWLG|_ShFLV(y zV(}YkqVW_L@&Ph=qAVhRBMr$i;)PB|8AiO2X_R5a6Kv!a@~A#U{zjT;ywJxegYq)+ zN7ESb1RLce%4}fdkES7c0x$H5mLc#$UbGB>r_iXoQJyHjktP~1bTP^hc?te#8Y7-y zqkN$+l}GSL(~vxY7y3lY5O^UkT86+=7|knWiu|K#jCi3-v<%8m@W-MNcnYKC3!Mdj zEEr_iXoQJ%=(NE3}0x)^1MyaazVjS)|< zQNGZZ$|LxrX-J;H3w@$x2)vLNEkocbjOG5P6gTn{t+6b`Z={LFQ{0Fr z`e^=W8Y7gQLZ4_I1fKMXMI-PO zM)L}Kk$*Ie5l?V5p7JD}1%E6Wfu}GQKk*3uSTq7pVJtk7BmAw=kbL??+ukTo$PqNr zvVpg+4|+@d!M@v1kOI!f0MWPwj%9Xc{A);AlMY zP`LzuEE<6qwuq&J;EzQ^G6^qqiKT$4{IO^Rp2F62 zpfb@Di-u$pUg#1_hTw0FhVb->#VdG#j!7f%6t*UV^rojZ8p6}3HGblwr!^YF)2B5& z@zN8IrY(G1I4-^Q{NwQwo*oKg*?`K`8jkn~Pfsk~_T(-21#NUYAi1sKM7|V~oM>L5 zcPt$72)ux!`H4^DAB`g(fhRZ`FLaJ&7os=fqvOVYOmdC1MjY`Go*to#5l?)CkA)*1 zffq38YUC$Ak$*Ifcm$r{XuQyw_@i+|Z^Rqp#{7v-9GG1fJk%ywI8Wqj5xU#2e#O-y(nF6ZunKMjY`7Ji*a;k}vSlIHEV=McgQ# zcmzMu8gWE##8cde7iAOtMjX*bDIOpDb zdcM9>9qhcOVdT}}^X&XifZ95ZzAqroBaMl;;;zG} z5w3F^GxgM|{uHA|IL}jDF;kynm zYccXI&TEUSZPHte+!Rl3T(fLHb+-NVCk&@koac&1xMHS0#mKj~uC0G+Z2cA^-{QQs zxY{SZ#mG%@Yg7ExNH4xH@-5D5#v`13i(8D`6qikU@~xhHVdSPbZNe3+)e9r94zIIw z{HaUFpM64%hj5- z$hSDJ8IN$aMS6>oo8q!bPrmg}zA$oAoHpT#)#`-)c1W7N=e@Y3wa- zF>=)&`In6`YwWF_d|~9$pT%htu2?NDjJ!JB+F3vORxgd(p5oLqCbd1qg^{PWSpTxC z?Wvv`)vdlv^{TrLmsaga{j2_Y+Ha?m?>epGuESKP^yH>-QKxpL81=$=p5ls``V=GI z;`ZA1D|M=$byw*IM+e{wBGzQuWMakWi)i;VidpSXwHBwI*OsF`Q~eer-{QQsxZ0=qEJkjMTbtsiMtbpuk#BKcGalh; zi}V&FH^pU>o_y<{d|~9KIBmietJMo5uMV%X^S(=+`rp<`jE8WZr?_Iax=epM2||d|~9K zIBmk!Hmes#ULBs=+5glieyh>=S)6*sL|pMoZ!vOHTsFq6vA6!o7e+4qS)4ZEiq+!6 z$g9Jxo%NG%_0p*ADNa3OQrlBp7JU>!&|q+Jq}ss~1LI9d7NcpM0yAMr}`V>KU`zp5nsDQ(LTm+12(`PmStUU#5E1 zU587ncBKAQztvKs@k@2&Tb$Qct9Yfi7`Z7fo8lth`X^r)xhYPYaK&o%!pN(`t)2Z! zzST=Zj7N&o#+ZmJR;w3AUL7tw_2j1hsZrf^IQ6Q#4wqK#Nc~f9G4gHv79%&sX;Zz5 z)#`h9ule6vp{NRapO`<)G|D)ay-jWpGPo!Et1v_%po;Y5bWB)7I z&!C_0kNvFo6uml!k5}hrkD#Bw`lV2Xe{VNVAHpX6F4~#o0Ed51KgzSPpT@kEe-Gjh z-*tWlZ%w)v-xs9uz2^OhbzFWJcGi3M&x4KcH0SX?-^ExL$@^w*MgHl?|74(N6&@tc z@NLO&1OuAmk6(ekuHXLH+p!*7fyaG;$KOZ$Xfa$XSHZu}@$_4#*zUP|95caz9`dH+D=Cgq}3Ptt@bNv6=9Q#*bKXVG> zoAm!{u>UCRSu(G_Imgf91L%iS4^H0?7Q@Njg9*v_8*#q97=Z>}pjFpzo8JCFws0vB zKN|UyIs9LXd~~)yUWokPf}QoI^XKDukyu^5pYe9g&kqI%J_iI&zZC7KqvJ;O{}0jr zW}H7-FOOh8a~-|;{y_Zc$p5!F_5M%v(}?R%^Ye%CUNGNpUk(oZ`|xqf7ovOrSA(7T z7I<{MJ)GOWzYMz{-#CuKJ~oH{>FEDn?0=sh0-}3!_^-^dzZd<$eIs4h2k)f)z1hP_ zJdgQ)Q*_7W2j|!~=GZ%P?7!{p*xxP&-#+&SPG6eC|6AA@XS(mb68+qbe)QhC|DNOL z`*0uOczxp4P;VKE>^;%`)umwYc}{S8&f6o&6svRW-8uGOg`M-~?%m*o`7C&R7WRfe zoBi{bbNqa5j{PyNclgg<3x48uo-M<~dKdmC*qIxE$CI8P&h6hF*w1jD{6M&a=>GMK zo)159-TB-e5PlN&hIJ6nUx2-?y?-73Tzn}IeLe)7{c)P5e|_1x-Gf8d zxpU{7bHmXjKj&7F1kNFw*~Bp&T)Ta1%WW;V&2w%mcUvvj>`lf69{JtP_04M0{e0_! zbK_x!{AKXiZg08Gmj0_+kB5|buIO~+u&9P(w(Ho>X4PsrHz>!urMum$_J)U*+dkac zDaVZQErz(c!Iz-t4tGjrKpN_r{CV#TrdLA&#Qqp29VxH^FkOL0iVte$?ryf7|?w=8#wA09i*5$ z&NPSKds~e>ZsA&fVO%0r;iR1GVIYt`u$VlD5ns#mS;Nisd?BQ!+eF{46**MYllq!dsj%eRRTGkJ1h?XQZ8uVH%nCW42GL`A)k! zPWD03J3zPhyIE&kO)B&WR%92sBX@ffE|+THI(_7goM7E{hiW?w#o?W- zGaQUKO}ZShI)>Z9GIzyzTpT&1Y8)SBJ7Z)ybltC z9M-zSCv(?+G8&(}awBtBKD2)6+Ug3h8F6>kfWAY*Lifbl^~;ym-1VzhH?f?zE?r)0 za|c1z#fwj@tzKSv_F1>sT*^jBuUPE_lWR3w&2!9kh{Zc&@;guNo-8JN&SzA_{xld| zgp*5`S6$v*gbZ8BxvTwRce2uI-*_s|SB8TDGC;2N4oYD8;}guPV(fg*M6REWUayzA zvWvy$C)RoFnS0~m?V0en^CXe1&$wR6+r5nq?6lm-@&zBFUp%Y0#(Kq`nCE%RCX-z4 zfSGsp5KJ{LdevmL)7~ubr`^VE#2Nm!p~jZjEvur}Uv{fq3}>%8YQtJSe~4L84LxnM zTG~UWM9F!VUvSrkU5Bu-4H7}}O87eOS_|$<+-p48^{AkYaSzjETvFji!+x)G={Uu>JBF*<^yjOXuR)Fw_S8y(eDpCw`RK5-E6hu zn2a!EddwK=%298zwXwf{qldKyKdn}OI5@0^?gWK{hu$sRHkQwU>lDa&X(mJju^^D5 z^B2mQF+>*6et9rs!rgFY6I$X7X>aTg+PL|2d!0!(CB*4os*c<(-b2yb zxCMva?hkj7wzM3NhvSW>I-!!d)(|RhI^TtpbP4mIi{Zh_^4-P94Kq3YhIvrl#ska2 zZnZZvPn)e}jN_rw%&~?U~Fp#<_lPyW={OL;t|SKw8gF zmgwPlE+GnNvAYkDRy&dN_JaK1bvKuj`|wQgH`kZ8%1MQ%gX4XF(7PV)6u4b(UBkUy z4}O?1eC*p|cIV7JeQ+C_mAg?+4hINmWd+Yk@q^OaeV$lm*6MMIsn))cOompD9-ej& zpBvddKIlbcz~<5oupa@(Tis0O#D~+_gye96=PZ1tr6PlM{E0&vwW7yeFL|GYB@gexp?0_zDv$N;kCjOUe=r(4G<7Y ztO#H3=?u@X<#?1e{l{*66mIf!RXMJXjA{F@*YBQ1y70ilgX>v-FmIC1=dP^TJvu-Y zp{&LM;pZnj7|mQbVB@l1P!P{X{VHqnsXS{|BibAf{o`%3+>6itdtGRC1*aW6 zD%|OmqY59UIs2LwR$cHEJ{$y3I;S^Q4aYvrV$kcr%McafPt!*cYV>aKfrTmKYrgFG z4ovt4C*wCb@oVdB1Hbd|P^tV6!5X+aXZd6Jhp(2W`cc%Yya>!5`kqL($8loyccWfq z{5n>2{Ps0_tR{N>Ua7J%OFyB12Kav>hlAR$ycQ}Sjr#aC+%!MB+!i>&SEI|NF49 zU*$h*;DSfP|K&H7Ge2snR5BLSkA1eO{S5~jzYgNAX#VCk{bh3Uf z(JN1jN_>qwbENVAY1FI#vo|6=mM@ z{sw-}v4_S_wpVas^`H1gKve$Z|5N|Jf=-|Bh=J+%+PnW7{4^S&)ZeuK{{sNNI@UCs z{$4G88n3QZ{ui|P%6O{R@8d7W`o#mqAG%ig4z!O=jN8QYciDSUpETcdqbpzVXK2oS z>Gk{eSE63?H*PD_^`m}xcT)fB4~80FkN-!=qqZ`fD{htVOZ3OTiw8dshhpkQI+Y(t z_4+;YR1n^;jX@p~e`j$`UuwU8_qqQ{AgY~`t6p8VKqGTOhFD6U{&o<&1`Vma*4l|u zUo>L<7kP8Q@gw;6e*9DY+W)vb6gy6*$49P*nx5t4GKcDy=?qTl^e=xnFn=}?B$wI$ E0q(1;EC2ui literal 0 HcmV?d00001 diff --git a/simd-sort/test.cpp b/simd-sort/test.cpp new file mode 100644 index 0000000..bd68322 --- /dev/null +++ b/simd-sort/test.cpp @@ -0,0 +1,261 @@ +#include +#include +#include +#include +#include +#include + +#include "cmdline.cpp" +#include "input_data.cpp" +#include "quicksort-all.cpp" +#include "avx2-altquicksort.h" +#include "avx2-nate-quicksort.cpp" + + +bool is_sorted(uint32_t* array, size_t n) { + assert(n > 0); + for (size_t i=1; i < n; i++) { + if (array[i - 1] > array[i]) { + printf("mismatch at %lu\n", i); + return false; + } + } + + return true; +} + + +const size_t AVX512_REGISTER_SIZE = 16; + + +class Test { + + bool verbose; + +public: + Test(bool v = true) : verbose(v) {} + + template + bool run(SORT_FN sort) { + const size_t start = 2*AVX512_REGISTER_SIZE; + const size_t end = 256*AVX512_REGISTER_SIZE; + + if (verbose) { + putchar('\n'); + } + + for (size_t size=start; size < end; size += 1) { + + if (verbose) { + printf("%lu/%lu\r", size, end); + fflush(stdout); + } + + InputAscending asc(size); + InputDescending dsc(size); + InputRandom rnd(size); + InputRandomFew rndfew(size); + + if (!test(sort, asc)) { + printf("failed for size %lu, intput ascending\n", size); + return false; + } + + if (!test(sort, dsc)) { + printf("failed for size %lu, intput descending\n", size); + return false; + } + + if (!test(sort, rnd)) { + printf("failed for size %lu, intput random\n", size); + return false; + } + + if (!test(sort, rndfew)) { + printf("failed for size %lu, intput random few\n", size); + return false; + } + } // for + + if (verbose) { + putchar('\n'); + } + + return true; + } + + +private: + template + bool test(SORT_FN sort, InputData& data) { + sort(data.pointer(), 0, data.count() - 1); + + return is_sorted(data.pointer(), data.count()); + } +}; + + +class Flags { + public: + bool avx2; + bool avx2_alt; + bool avx2_nate; + bool avx512; + bool avx512_buf; + bool avx512_popcnt; + bool avx512_bmi; + + public: + Flags(const CommandLine& cmd) { + + enable_all(false); + + bool any_set = false; + if (cmd.has("-avx2")) { + avx2 = true; + any_set = true; + } + + if (cmd.has("-avx2-alt")) { + avx2_alt = true; + any_set = true; + } + + if (cmd.has("-avx2-nate")) { + avx2_nate = true; + any_set = true; + } + + if (cmd.has("-avx512")) { + avx512 = true; + any_set = true; + } + + if (cmd.has("-avx512-buf")) { + avx512_buf = true; + any_set = true; + } + + if (cmd.has("-avx512-popcnt")) { + avx512_popcnt = true; + any_set = true; + } + + if (cmd.has("-avx512-bmi")) { + avx512_bmi = true; + any_set = true; + } + + if (!any_set) { + enable_all(true); + } + } + + void enable_all(bool val) { + avx2 = val; + avx2_nate = val; + avx2_alt = val; + avx512 = val; + avx512_buf = val; + avx512_popcnt = val; + avx512_bmi = val; + } +}; + + +int main(int argc, char* argv[]) { + + CommandLine cmd(argc, argv); + + puts("Please wait, it might take a while..."); + puts(""); + + bool verbose = cmd.has("-v") || cmd.has("--verbose"); + Flags flags(cmd); + + Test test(verbose); + int ret = EXIT_SUCCESS; + +#ifdef HAVE_AVX2_INSTRUCTIONS + if (flags.avx2) { + printf("AVX2 base version... "); fflush(stdout); + if (test.run(qs::avx2::quicksort)) { + puts("OK"); + } else { + puts("FAILED"); + ret = EXIT_FAILURE; + } + } + + if (flags.avx2_alt) { + printf("AVX2 alt version... "); fflush(stdout); + if (test.run(wrapped_avx2_pivotonlast_sort)) { + puts("OK"); + } else { + puts("FAILED"); + ret = EXIT_FAILURE; + } + } + + if (flags.avx2_nate) { + printf("AVX2 Nate's variant... "); fflush(stdout); + if (test.run(nate::wrapped_avx2_pivotonlast_sort)) { + puts("OK"); + } else { + puts("FAILED"); + ret = EXIT_FAILURE; + } + } +#endif + +#ifdef HAVE_AVX512F_INSTRUCTIONS + +#ifdef POPCNT_LOOKUP + prepare_lookup(); +#endif + + if (flags.avx512) { + printf("AVX512 base version... "); fflush(stdout); + if (test.run(qs::avx512::quicksort)) { + puts("OK"); + } else { + puts("FAILED"); + ret = EXIT_FAILURE; + } + } + + if (flags.avx512_popcnt) { + printf("AVX512 + popcnt version... "); fflush(stdout); + if (test.run(qs::avx512::popcnt_quicksort)) { + puts("OK"); + } else { + puts("FAILED"); + ret = EXIT_FAILURE; + } + } + + if (flags.avx512_buf) { + printf("AVX512 with aux buffers... "); fflush(stdout); + if (test.run(qs::avx512::auxbuffer_quicksort)) { + puts("OK"); + } else { + puts("FAILED"); + ret = EXIT_FAILURE; + } + } + +#if 0 + if (flags.avx512_bmi) { + printf("AVX512 + bmi2 version ... "); fflush(stdout); + if (test.run(qs::avx512::bmi2_quicksort)) { + puts("OK"); + } else { + puts("FAILED"); + ret = EXIT_FAILURE; + } + } +#endif +#endif // HAVE_AVX512F_INSTRUCTIONS + + return ret; +}