added simd-sort - basically the whole repo, but I haxed-in magyarsort as measure
This commit is contained in:
parent
c4ed2994ea
commit
05235e269f
25
simd-sort/LICENSE
Normal file
25
simd-sort/LICENSE
Normal file
@ -0,0 +1,25 @@
|
||||
Copyright (c) 2016, Wojciech Muła, Daniel Lemire
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
57
simd-sort/Makefile
Normal file
57
simd-sort/Makefile
Normal file
@ -0,0 +1,57 @@
|
||||
.SUFFIXES:
|
||||
.PHONY: all clean
|
||||
|
||||
FLAGS=-std=c++11 -mbmi2 -Wall -pedantic -Wextra
|
||||
FLAGS_AVX512=$(FLAGS) -mavx512f -DHAVE_AVX512F_INSTRUCTIONS -DHAVE_AVX2_INSTRUCTIONS
|
||||
FLAGS_AVX2=$(FLAGS) -mavx2 -DHAVE_AVX2_INSTRUCTIONS
|
||||
|
||||
DEPS_SORT=partition.cpp \
|
||||
avx2-partition.cpp \
|
||||
avx2-quicksort.cpp \
|
||||
avx2-altquicksort.h \
|
||||
avx2-natenodutch-quicksort.h \
|
||||
avx2-nate-quicksort.cpp \
|
||||
avx512-swap.cpp \
|
||||
avx512-partition.cpp \
|
||||
avx512-auxbuffer-partition.cpp \
|
||||
avx512-bmi2-partition.cpp \
|
||||
avx512-popcnt-partition.cpp \
|
||||
avx512-quicksort.cpp \
|
||||
avx512-sort-register.cpp \
|
||||
avx512-partition-register.cpp \
|
||||
quicksort.cpp
|
||||
|
||||
SPEED_DEPS=$(DEPS_SORT) speed.cpp gettime.cpp rdtsc.cpp runtime_stats.cpp
|
||||
SPEED_FLAGS=-O3 -DNDEBUG
|
||||
|
||||
ALL=test speed test_avx2 speed_avx2 speed_stats speed_avx2_stats
|
||||
|
||||
all: $(ALL)
|
||||
|
||||
test: test.cpp input_data.cpp $(DEPS_SORT)
|
||||
$(CXX) $(FLAGS_AVX512) -fsanitize=address test.cpp -o $@
|
||||
|
||||
test_avx2: test.cpp input_data.cpp $(DEPS_SORT)
|
||||
#$(CXX) $(FLAGS_AVX2) -fsanitize=address test.cpp -o $@
|
||||
$(CXX) $(FLAGS_AVX2) test.cpp -o $@
|
||||
|
||||
speed: $(SPEED_DEPS)
|
||||
$(CXX) $(FLAGS_AVX512) $(SPEED_FLAGS) speed.cpp -o $@
|
||||
|
||||
speed_avx2: $(SPEED_DEPS)
|
||||
$(CXX) $(FLAGS_AVX2) $(SPEED_FLAGS) speed.cpp -o $@
|
||||
|
||||
speed_stats: $(SPEED_DEPS)
|
||||
$(CXX) $(FLAGS_AVX512) $(SPEED_FLAGS) -DWITH_RUNTIME_STATS speed.cpp -o $@
|
||||
|
||||
speed_avx2_stats: $(SPEED_DEPS)
|
||||
$(CXX) $(FLAGS_AVX2) $(SPEED_FLAGS) -DWITH_RUNTIME_STATS speed.cpp -o $@
|
||||
|
||||
run: test
|
||||
sde -cnl -- ./$^
|
||||
|
||||
run_avx2: test_avx2
|
||||
sde -cnl -- ./$^
|
||||
|
||||
clean:
|
||||
rm -f $(ALL)
|
||||
48
simd-sort/README.rst
Normal file
48
simd-sort/README.rst
Normal file
@ -0,0 +1,48 @@
|
||||
================================================================================
|
||||
SIMD sorting
|
||||
================================================================================
|
||||
|
||||
Overview
|
||||
--------------------------------------------------
|
||||
|
||||
This repository contains SIMD versions of quicksort, using **AVX512F**
|
||||
and **AVX2** instruction sets. The subdirectory ``results`` contains
|
||||
performance results.
|
||||
|
||||
The code is still unfinished, it may be subject of changes.
|
||||
|
||||
Although programs are written in C++, most procedures are nearly
|
||||
plain C, except use of namespaces and references in place of
|
||||
pointers.
|
||||
|
||||
|
||||
Building
|
||||
--------------------------------------------------
|
||||
|
||||
At least GCC 5.3.0 is needed. Type ``make`` to build AVX512F
|
||||
and AVX2 versions of two programs:
|
||||
|
||||
* ``test``/``test_avx2`` --- tests if SIMD-ized sorting procedures
|
||||
work correctly for various inputs.
|
||||
|
||||
* ``speed``/``speed_avx2`` --- compares speed of different procedures,
|
||||
scalar and SIMD for various inputs.
|
||||
|
||||
* ``speed_stats``/``speed_avx2_stats`` --- similar to ``speed``, but
|
||||
some procedures collect and print runtime statistics. Note that
|
||||
this gives overhead, so for real performance tests the basic variant
|
||||
of the programs should be used.
|
||||
|
||||
|
||||
AVX512
|
||||
--------------------------------------------------
|
||||
|
||||
`Intel Software Development Emulator`__ can be used to run AVX512 variants.
|
||||
|
||||
__ https://software.intel.com/en-us/articles/intel-software-development-emulator
|
||||
|
||||
|
||||
References
|
||||
--------------------------------------------------
|
||||
|
||||
- Fast Sorting Algorithms using AVX-512 on Intel Knights Landing https://arxiv.org/pdf/1704.08579.pdf
|
||||
461
simd-sort/avx2-altquicksort.h
Normal file
461
simd-sort/avx2-altquicksort.h
Normal file
@ -0,0 +1,461 @@
|
||||
#pragma once
|
||||
|
||||
#include <x86intrin.h>
|
||||
#include <cstdint>
|
||||
|
||||
/**
|
||||
* This is an alternative approach to SIMD quicksort, implemented by D. Lemire.
|
||||
* It is not meant to be fast as such, but it can serve as a useful reference.
|
||||
*/
|
||||
|
||||
// When defined smaller reverseshufflemask is used.
|
||||
//#define BYTE_PATTERN
|
||||
|
||||
// When defined histogram for comparison mask is collected
|
||||
//#define WITH_PVBYTE_HISTOGRAM
|
||||
|
||||
#if WITH_RUNTIME_STATS == 0 && defined(WITH_PVBYTE_HISTOGRAM)
|
||||
# undef WITH_RUNTIME_STATS
|
||||
#endif
|
||||
|
||||
// can be replaced with VCOMPRESS on AVX-512
|
||||
static
|
||||
#ifdef BYTE_PATTERN
|
||||
uint8_t
|
||||
#else
|
||||
uint32_t
|
||||
#endif
|
||||
reverseshufflemask[256 * 8] __attribute__((aligned(0x100))) = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 0*/
|
||||
1, 2, 3, 4, 5, 6, 7, 0, /* 1*/
|
||||
0, 2, 3, 4, 5, 6, 7, 1, /* 2*/
|
||||
2, 3, 4, 5, 6, 7, 0, 1, /* 3*/
|
||||
0, 1, 3, 4, 5, 6, 7, 2, /* 4*/
|
||||
1, 3, 4, 5, 6, 7, 0, 2, /* 5*/
|
||||
0, 3, 4, 5, 6, 7, 1, 2, /* 6*/
|
||||
3, 4, 5, 6, 7, 0, 1, 2, /* 7*/
|
||||
0, 1, 2, 4, 5, 6, 7, 3, /* 8*/
|
||||
1, 2, 4, 5, 6, 7, 0, 3, /* 9*/
|
||||
0, 2, 4, 5, 6, 7, 1, 3, /* 10*/
|
||||
2, 4, 5, 6, 7, 0, 1, 3, /* 11*/
|
||||
0, 1, 4, 5, 6, 7, 2, 3, /* 12*/
|
||||
1, 4, 5, 6, 7, 0, 2, 3, /* 13*/
|
||||
0, 4, 5, 6, 7, 1, 2, 3, /* 14*/
|
||||
4, 5, 6, 7, 0, 1, 2, 3, /* 15*/
|
||||
0, 1, 2, 3, 5, 6, 7, 4, /* 16*/
|
||||
1, 2, 3, 5, 6, 7, 0, 4, /* 17*/
|
||||
0, 2, 3, 5, 6, 7, 1, 4, /* 18*/
|
||||
2, 3, 5, 6, 7, 0, 1, 4, /* 19*/
|
||||
0, 1, 3, 5, 6, 7, 2, 4, /* 20*/
|
||||
1, 3, 5, 6, 7, 0, 2, 4, /* 21*/
|
||||
0, 3, 5, 6, 7, 1, 2, 4, /* 22*/
|
||||
3, 5, 6, 7, 0, 1, 2, 4, /* 23*/
|
||||
0, 1, 2, 5, 6, 7, 3, 4, /* 24*/
|
||||
1, 2, 5, 6, 7, 0, 3, 4, /* 25*/
|
||||
0, 2, 5, 6, 7, 1, 3, 4, /* 26*/
|
||||
2, 5, 6, 7, 0, 1, 3, 4, /* 27*/
|
||||
0, 1, 5, 6, 7, 2, 3, 4, /* 28*/
|
||||
1, 5, 6, 7, 0, 2, 3, 4, /* 29*/
|
||||
0, 5, 6, 7, 1, 2, 3, 4, /* 30*/
|
||||
5, 6, 7, 0, 1, 2, 3, 4, /* 31*/
|
||||
0, 1, 2, 3, 4, 6, 7, 5, /* 32*/
|
||||
1, 2, 3, 4, 6, 7, 0, 5, /* 33*/
|
||||
0, 2, 3, 4, 6, 7, 1, 5, /* 34*/
|
||||
2, 3, 4, 6, 7, 0, 1, 5, /* 35*/
|
||||
0, 1, 3, 4, 6, 7, 2, 5, /* 36*/
|
||||
1, 3, 4, 6, 7, 0, 2, 5, /* 37*/
|
||||
0, 3, 4, 6, 7, 1, 2, 5, /* 38*/
|
||||
3, 4, 6, 7, 0, 1, 2, 5, /* 39*/
|
||||
0, 1, 2, 4, 6, 7, 3, 5, /* 40*/
|
||||
1, 2, 4, 6, 7, 0, 3, 5, /* 41*/
|
||||
0, 2, 4, 6, 7, 1, 3, 5, /* 42*/
|
||||
2, 4, 6, 7, 0, 1, 3, 5, /* 43*/
|
||||
0, 1, 4, 6, 7, 2, 3, 5, /* 44*/
|
||||
1, 4, 6, 7, 0, 2, 3, 5, /* 45*/
|
||||
0, 4, 6, 7, 1, 2, 3, 5, /* 46*/
|
||||
4, 6, 7, 0, 1, 2, 3, 5, /* 47*/
|
||||
0, 1, 2, 3, 6, 7, 4, 5, /* 48*/
|
||||
1, 2, 3, 6, 7, 0, 4, 5, /* 49*/
|
||||
0, 2, 3, 6, 7, 1, 4, 5, /* 50*/
|
||||
2, 3, 6, 7, 0, 1, 4, 5, /* 51*/
|
||||
0, 1, 3, 6, 7, 2, 4, 5, /* 52*/
|
||||
1, 3, 6, 7, 0, 2, 4, 5, /* 53*/
|
||||
0, 3, 6, 7, 1, 2, 4, 5, /* 54*/
|
||||
3, 6, 7, 0, 1, 2, 4, 5, /* 55*/
|
||||
0, 1, 2, 6, 7, 3, 4, 5, /* 56*/
|
||||
1, 2, 6, 7, 0, 3, 4, 5, /* 57*/
|
||||
0, 2, 6, 7, 1, 3, 4, 5, /* 58*/
|
||||
2, 6, 7, 0, 1, 3, 4, 5, /* 59*/
|
||||
0, 1, 6, 7, 2, 3, 4, 5, /* 60*/
|
||||
1, 6, 7, 0, 2, 3, 4, 5, /* 61*/
|
||||
0, 6, 7, 1, 2, 3, 4, 5, /* 62*/
|
||||
6, 7, 0, 1, 2, 3, 4, 5, /* 63*/
|
||||
0, 1, 2, 3, 4, 5, 7, 6, /* 64*/
|
||||
1, 2, 3, 4, 5, 7, 0, 6, /* 65*/
|
||||
0, 2, 3, 4, 5, 7, 1, 6, /* 66*/
|
||||
2, 3, 4, 5, 7, 0, 1, 6, /* 67*/
|
||||
0, 1, 3, 4, 5, 7, 2, 6, /* 68*/
|
||||
1, 3, 4, 5, 7, 0, 2, 6, /* 69*/
|
||||
0, 3, 4, 5, 7, 1, 2, 6, /* 70*/
|
||||
3, 4, 5, 7, 0, 1, 2, 6, /* 71*/
|
||||
0, 1, 2, 4, 5, 7, 3, 6, /* 72*/
|
||||
1, 2, 4, 5, 7, 0, 3, 6, /* 73*/
|
||||
0, 2, 4, 5, 7, 1, 3, 6, /* 74*/
|
||||
2, 4, 5, 7, 0, 1, 3, 6, /* 75*/
|
||||
0, 1, 4, 5, 7, 2, 3, 6, /* 76*/
|
||||
1, 4, 5, 7, 0, 2, 3, 6, /* 77*/
|
||||
0, 4, 5, 7, 1, 2, 3, 6, /* 78*/
|
||||
4, 5, 7, 0, 1, 2, 3, 6, /* 79*/
|
||||
0, 1, 2, 3, 5, 7, 4, 6, /* 80*/
|
||||
1, 2, 3, 5, 7, 0, 4, 6, /* 81*/
|
||||
0, 2, 3, 5, 7, 1, 4, 6, /* 82*/
|
||||
2, 3, 5, 7, 0, 1, 4, 6, /* 83*/
|
||||
0, 1, 3, 5, 7, 2, 4, 6, /* 84*/
|
||||
1, 3, 5, 7, 0, 2, 4, 6, /* 85*/
|
||||
0, 3, 5, 7, 1, 2, 4, 6, /* 86*/
|
||||
3, 5, 7, 0, 1, 2, 4, 6, /* 87*/
|
||||
0, 1, 2, 5, 7, 3, 4, 6, /* 88*/
|
||||
1, 2, 5, 7, 0, 3, 4, 6, /* 89*/
|
||||
0, 2, 5, 7, 1, 3, 4, 6, /* 90*/
|
||||
2, 5, 7, 0, 1, 3, 4, 6, /* 91*/
|
||||
0, 1, 5, 7, 2, 3, 4, 6, /* 92*/
|
||||
1, 5, 7, 0, 2, 3, 4, 6, /* 93*/
|
||||
0, 5, 7, 1, 2, 3, 4, 6, /* 94*/
|
||||
5, 7, 0, 1, 2, 3, 4, 6, /* 95*/
|
||||
0, 1, 2, 3, 4, 7, 5, 6, /* 96*/
|
||||
1, 2, 3, 4, 7, 0, 5, 6, /* 97*/
|
||||
0, 2, 3, 4, 7, 1, 5, 6, /* 98*/
|
||||
2, 3, 4, 7, 0, 1, 5, 6, /* 99*/
|
||||
0, 1, 3, 4, 7, 2, 5, 6, /* 100*/
|
||||
1, 3, 4, 7, 0, 2, 5, 6, /* 101*/
|
||||
0, 3, 4, 7, 1, 2, 5, 6, /* 102*/
|
||||
3, 4, 7, 0, 1, 2, 5, 6, /* 103*/
|
||||
0, 1, 2, 4, 7, 3, 5, 6, /* 104*/
|
||||
1, 2, 4, 7, 0, 3, 5, 6, /* 105*/
|
||||
0, 2, 4, 7, 1, 3, 5, 6, /* 106*/
|
||||
2, 4, 7, 0, 1, 3, 5, 6, /* 107*/
|
||||
0, 1, 4, 7, 2, 3, 5, 6, /* 108*/
|
||||
1, 4, 7, 0, 2, 3, 5, 6, /* 109*/
|
||||
0, 4, 7, 1, 2, 3, 5, 6, /* 110*/
|
||||
4, 7, 0, 1, 2, 3, 5, 6, /* 111*/
|
||||
0, 1, 2, 3, 7, 4, 5, 6, /* 112*/
|
||||
1, 2, 3, 7, 0, 4, 5, 6, /* 113*/
|
||||
0, 2, 3, 7, 1, 4, 5, 6, /* 114*/
|
||||
2, 3, 7, 0, 1, 4, 5, 6, /* 115*/
|
||||
0, 1, 3, 7, 2, 4, 5, 6, /* 116*/
|
||||
1, 3, 7, 0, 2, 4, 5, 6, /* 117*/
|
||||
0, 3, 7, 1, 2, 4, 5, 6, /* 118*/
|
||||
3, 7, 0, 1, 2, 4, 5, 6, /* 119*/
|
||||
0, 1, 2, 7, 3, 4, 5, 6, /* 120*/
|
||||
1, 2, 7, 0, 3, 4, 5, 6, /* 121*/
|
||||
0, 2, 7, 1, 3, 4, 5, 6, /* 122*/
|
||||
2, 7, 0, 1, 3, 4, 5, 6, /* 123*/
|
||||
0, 1, 7, 2, 3, 4, 5, 6, /* 124*/
|
||||
1, 7, 0, 2, 3, 4, 5, 6, /* 125*/
|
||||
0, 7, 1, 2, 3, 4, 5, 6, /* 126*/
|
||||
7, 0, 1, 2, 3, 4, 5, 6, /* 127*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 128*/
|
||||
1, 2, 3, 4, 5, 6, 0, 7, /* 129*/
|
||||
0, 2, 3, 4, 5, 6, 1, 7, /* 130*/
|
||||
2, 3, 4, 5, 6, 0, 1, 7, /* 131*/
|
||||
0, 1, 3, 4, 5, 6, 2, 7, /* 132*/
|
||||
1, 3, 4, 5, 6, 0, 2, 7, /* 133*/
|
||||
0, 3, 4, 5, 6, 1, 2, 7, /* 134*/
|
||||
3, 4, 5, 6, 0, 1, 2, 7, /* 135*/
|
||||
0, 1, 2, 4, 5, 6, 3, 7, /* 136*/
|
||||
1, 2, 4, 5, 6, 0, 3, 7, /* 137*/
|
||||
0, 2, 4, 5, 6, 1, 3, 7, /* 138*/
|
||||
2, 4, 5, 6, 0, 1, 3, 7, /* 139*/
|
||||
0, 1, 4, 5, 6, 2, 3, 7, /* 140*/
|
||||
1, 4, 5, 6, 0, 2, 3, 7, /* 141*/
|
||||
0, 4, 5, 6, 1, 2, 3, 7, /* 142*/
|
||||
4, 5, 6, 0, 1, 2, 3, 7, /* 143*/
|
||||
0, 1, 2, 3, 5, 6, 4, 7, /* 144*/
|
||||
1, 2, 3, 5, 6, 0, 4, 7, /* 145*/
|
||||
0, 2, 3, 5, 6, 1, 4, 7, /* 146*/
|
||||
2, 3, 5, 6, 0, 1, 4, 7, /* 147*/
|
||||
0, 1, 3, 5, 6, 2, 4, 7, /* 148*/
|
||||
1, 3, 5, 6, 0, 2, 4, 7, /* 149*/
|
||||
0, 3, 5, 6, 1, 2, 4, 7, /* 150*/
|
||||
3, 5, 6, 0, 1, 2, 4, 7, /* 151*/
|
||||
0, 1, 2, 5, 6, 3, 4, 7, /* 152*/
|
||||
1, 2, 5, 6, 0, 3, 4, 7, /* 153*/
|
||||
0, 2, 5, 6, 1, 3, 4, 7, /* 154*/
|
||||
2, 5, 6, 0, 1, 3, 4, 7, /* 155*/
|
||||
0, 1, 5, 6, 2, 3, 4, 7, /* 156*/
|
||||
1, 5, 6, 0, 2, 3, 4, 7, /* 157*/
|
||||
0, 5, 6, 1, 2, 3, 4, 7, /* 158*/
|
||||
5, 6, 0, 1, 2, 3, 4, 7, /* 159*/
|
||||
0, 1, 2, 3, 4, 6, 5, 7, /* 160*/
|
||||
1, 2, 3, 4, 6, 0, 5, 7, /* 161*/
|
||||
0, 2, 3, 4, 6, 1, 5, 7, /* 162*/
|
||||
2, 3, 4, 6, 0, 1, 5, 7, /* 163*/
|
||||
0, 1, 3, 4, 6, 2, 5, 7, /* 164*/
|
||||
1, 3, 4, 6, 0, 2, 5, 7, /* 165*/
|
||||
0, 3, 4, 6, 1, 2, 5, 7, /* 166*/
|
||||
3, 4, 6, 0, 1, 2, 5, 7, /* 167*/
|
||||
0, 1, 2, 4, 6, 3, 5, 7, /* 168*/
|
||||
1, 2, 4, 6, 0, 3, 5, 7, /* 169*/
|
||||
0, 2, 4, 6, 1, 3, 5, 7, /* 170*/
|
||||
2, 4, 6, 0, 1, 3, 5, 7, /* 171*/
|
||||
0, 1, 4, 6, 2, 3, 5, 7, /* 172*/
|
||||
1, 4, 6, 0, 2, 3, 5, 7, /* 173*/
|
||||
0, 4, 6, 1, 2, 3, 5, 7, /* 174*/
|
||||
4, 6, 0, 1, 2, 3, 5, 7, /* 175*/
|
||||
0, 1, 2, 3, 6, 4, 5, 7, /* 176*/
|
||||
1, 2, 3, 6, 0, 4, 5, 7, /* 177*/
|
||||
0, 2, 3, 6, 1, 4, 5, 7, /* 178*/
|
||||
2, 3, 6, 0, 1, 4, 5, 7, /* 179*/
|
||||
0, 1, 3, 6, 2, 4, 5, 7, /* 180*/
|
||||
1, 3, 6, 0, 2, 4, 5, 7, /* 181*/
|
||||
0, 3, 6, 1, 2, 4, 5, 7, /* 182*/
|
||||
3, 6, 0, 1, 2, 4, 5, 7, /* 183*/
|
||||
0, 1, 2, 6, 3, 4, 5, 7, /* 184*/
|
||||
1, 2, 6, 0, 3, 4, 5, 7, /* 185*/
|
||||
0, 2, 6, 1, 3, 4, 5, 7, /* 186*/
|
||||
2, 6, 0, 1, 3, 4, 5, 7, /* 187*/
|
||||
0, 1, 6, 2, 3, 4, 5, 7, /* 188*/
|
||||
1, 6, 0, 2, 3, 4, 5, 7, /* 189*/
|
||||
0, 6, 1, 2, 3, 4, 5, 7, /* 190*/
|
||||
6, 0, 1, 2, 3, 4, 5, 7, /* 191*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 192*/
|
||||
1, 2, 3, 4, 5, 0, 6, 7, /* 193*/
|
||||
0, 2, 3, 4, 5, 1, 6, 7, /* 194*/
|
||||
2, 3, 4, 5, 0, 1, 6, 7, /* 195*/
|
||||
0, 1, 3, 4, 5, 2, 6, 7, /* 196*/
|
||||
1, 3, 4, 5, 0, 2, 6, 7, /* 197*/
|
||||
0, 3, 4, 5, 1, 2, 6, 7, /* 198*/
|
||||
3, 4, 5, 0, 1, 2, 6, 7, /* 199*/
|
||||
0, 1, 2, 4, 5, 3, 6, 7, /* 200*/
|
||||
1, 2, 4, 5, 0, 3, 6, 7, /* 201*/
|
||||
0, 2, 4, 5, 1, 3, 6, 7, /* 202*/
|
||||
2, 4, 5, 0, 1, 3, 6, 7, /* 203*/
|
||||
0, 1, 4, 5, 2, 3, 6, 7, /* 204*/
|
||||
1, 4, 5, 0, 2, 3, 6, 7, /* 205*/
|
||||
0, 4, 5, 1, 2, 3, 6, 7, /* 206*/
|
||||
4, 5, 0, 1, 2, 3, 6, 7, /* 207*/
|
||||
0, 1, 2, 3, 5, 4, 6, 7, /* 208*/
|
||||
1, 2, 3, 5, 0, 4, 6, 7, /* 209*/
|
||||
0, 2, 3, 5, 1, 4, 6, 7, /* 210*/
|
||||
2, 3, 5, 0, 1, 4, 6, 7, /* 211*/
|
||||
0, 1, 3, 5, 2, 4, 6, 7, /* 212*/
|
||||
1, 3, 5, 0, 2, 4, 6, 7, /* 213*/
|
||||
0, 3, 5, 1, 2, 4, 6, 7, /* 214*/
|
||||
3, 5, 0, 1, 2, 4, 6, 7, /* 215*/
|
||||
0, 1, 2, 5, 3, 4, 6, 7, /* 216*/
|
||||
1, 2, 5, 0, 3, 4, 6, 7, /* 217*/
|
||||
0, 2, 5, 1, 3, 4, 6, 7, /* 218*/
|
||||
2, 5, 0, 1, 3, 4, 6, 7, /* 219*/
|
||||
0, 1, 5, 2, 3, 4, 6, 7, /* 220*/
|
||||
1, 5, 0, 2, 3, 4, 6, 7, /* 221*/
|
||||
0, 5, 1, 2, 3, 4, 6, 7, /* 222*/
|
||||
5, 0, 1, 2, 3, 4, 6, 7, /* 223*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 224*/
|
||||
1, 2, 3, 4, 0, 5, 6, 7, /* 225*/
|
||||
0, 2, 3, 4, 1, 5, 6, 7, /* 226*/
|
||||
2, 3, 4, 0, 1, 5, 6, 7, /* 227*/
|
||||
0, 1, 3, 4, 2, 5, 6, 7, /* 228*/
|
||||
1, 3, 4, 0, 2, 5, 6, 7, /* 229*/
|
||||
0, 3, 4, 1, 2, 5, 6, 7, /* 230*/
|
||||
3, 4, 0, 1, 2, 5, 6, 7, /* 231*/
|
||||
0, 1, 2, 4, 3, 5, 6, 7, /* 232*/
|
||||
1, 2, 4, 0, 3, 5, 6, 7, /* 233*/
|
||||
0, 2, 4, 1, 3, 5, 6, 7, /* 234*/
|
||||
2, 4, 0, 1, 3, 5, 6, 7, /* 235*/
|
||||
0, 1, 4, 2, 3, 5, 6, 7, /* 236*/
|
||||
1, 4, 0, 2, 3, 5, 6, 7, /* 237*/
|
||||
0, 4, 1, 2, 3, 5, 6, 7, /* 238*/
|
||||
4, 0, 1, 2, 3, 5, 6, 7, /* 239*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 240*/
|
||||
1, 2, 3, 0, 4, 5, 6, 7, /* 241*/
|
||||
0, 2, 3, 1, 4, 5, 6, 7, /* 242*/
|
||||
2, 3, 0, 1, 4, 5, 6, 7, /* 243*/
|
||||
0, 1, 3, 2, 4, 5, 6, 7, /* 244*/
|
||||
1, 3, 0, 2, 4, 5, 6, 7, /* 245*/
|
||||
0, 3, 1, 2, 4, 5, 6, 7, /* 246*/
|
||||
3, 0, 1, 2, 4, 5, 6, 7, /* 247*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 248*/
|
||||
1, 2, 0, 3, 4, 5, 6, 7, /* 249*/
|
||||
0, 2, 1, 3, 4, 5, 6, 7, /* 250*/
|
||||
2, 0, 1, 3, 4, 5, 6, 7, /* 251*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 252*/
|
||||
1, 0, 2, 3, 4, 5, 6, 7, /* 253*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 254*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 255*/
|
||||
};
|
||||
|
||||
|
||||
static FORCE_INLINE __m256i get_permutation_vector(int pvbyte) {
|
||||
#ifdef BYTE_PATTERN
|
||||
__m256i shufm;
|
||||
asm volatile (
|
||||
"vpmovzxbd (%1), %0"
|
||||
: "=X" (shufm)
|
||||
: "r" (reverseshufflemask + 8 * pvbyte)
|
||||
);
|
||||
|
||||
return shufm;
|
||||
#else
|
||||
return _mm256_load_si256((__m256i *)(reverseshufflemask + 8 * pvbyte));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static uint32_t avx_pivot_on_last_value(int32_t *array, size_t length) {
|
||||
/* we run through the data. Anything in [0,boundary) is smaller or equal
|
||||
* than the pivot, and the value at boundary - 1 is going to be equal to the
|
||||
* pivot at the end,
|
||||
* anything in (boundary, i) is greater than the pivot
|
||||
* stuff in [i,...) is grey
|
||||
* the function returns the location of the boundary.
|
||||
*/
|
||||
if (length <= 1)
|
||||
return 1;
|
||||
{ // we exchange the last value for the middle value for a better pivot
|
||||
int32_t ival = array[length / 2];
|
||||
int32_t bval = array[length - 1];
|
||||
array[length / 2] = bval;
|
||||
array[length - 1] = ival;
|
||||
}
|
||||
#if WITH_RUNTIME_STATS
|
||||
statistics.partition_calls += 1;
|
||||
statistics.items_processed += length;
|
||||
#endif
|
||||
uint32_t boundary = 0;
|
||||
uint32_t i = 0;
|
||||
int32_t pivot = array[length - 1]; // we always pick the pivot at the end
|
||||
const __m256i P = _mm256_set1_epi32(pivot);
|
||||
while ( i + 8 + 1 <= length) {
|
||||
__m256i allgrey = _mm256_lddqu_si256((__m256i *)(array + i));
|
||||
int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(allgrey, P));
|
||||
#if WITH_RUNTIME_STATS && defined(WITH_PVBYTE_HISTOGRAM)
|
||||
statistics.pvbyte_histogram.hit(pvbyte);
|
||||
#endif
|
||||
if(pvbyte == 0) { // might be frequent
|
||||
i += 8; //nothing to do
|
||||
boundary = i;
|
||||
} else if (pvbyte == 0xFF) { // called once
|
||||
boundary = i;
|
||||
i += 8;
|
||||
break; // exit
|
||||
} else {
|
||||
|
||||
// hot path
|
||||
switch (pvbyte) {
|
||||
// for pvbyte = 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff
|
||||
// there is no change in order, just advance boundary
|
||||
// Note: case 0x00 & 0xff are already handled
|
||||
case 0x80: i += 8 - 1; break;
|
||||
case 0xc0: i += 8 - 2; break;
|
||||
case 0xe0: i += 8 - 3; break;
|
||||
case 0xf0: i += 8 - 4; break;
|
||||
case 0xf8: i += 8 - 5; break;
|
||||
case 0xfc: i += 8 - 6; break;
|
||||
case 0xfe: i += 8 - 7; break;
|
||||
|
||||
default: {
|
||||
uint32_t cnt =
|
||||
8 - _mm_popcnt_u32(pvbyte); // might be faster with table look-up?
|
||||
__m256i blackthenwhite = _mm256_permutevar8x32_epi32(allgrey, get_permutation_vector(pvbyte));
|
||||
_mm256_storeu_si256((__m256i *)(array + i), blackthenwhite);
|
||||
i += cnt;
|
||||
}
|
||||
} // switch
|
||||
|
||||
boundary = i; // this doesn't need updating each and every time
|
||||
}
|
||||
}
|
||||
for (; i + 8 + 1 <= length ;) {
|
||||
__m256i allgrey =
|
||||
_mm256_lddqu_si256((__m256i *)(array + i)); // this is all grey
|
||||
int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(allgrey, P));
|
||||
if (pvbyte == 0xFF) { // called once
|
||||
// nothing to do
|
||||
} else {
|
||||
|
||||
uint32_t cnt =
|
||||
8 - _mm_popcnt_u32(pvbyte); // might be faster with table look-up?
|
||||
__m256i allwhite = _mm256_lddqu_si256(
|
||||
(__m256i *)(array + boundary)); // this is all white
|
||||
// we shuffle allgrey so that the first part is black and the second part
|
||||
// is white
|
||||
__m256i blackthenwhite = _mm256_permutevar8x32_epi32(allgrey, get_permutation_vector(pvbyte));
|
||||
_mm256_storeu_si256((__m256i *)(array + boundary), blackthenwhite);
|
||||
_mm256_storeu_si256((__m256i *)(array + i), allwhite);
|
||||
boundary += cnt; // might be faster with table look-up?
|
||||
}
|
||||
i += 8;
|
||||
}
|
||||
while (i + 1 < length) {
|
||||
int32_t ival = array[i];
|
||||
if (ival <= pivot) {
|
||||
int32_t bval = array[boundary];
|
||||
array[i] = bval;
|
||||
array[boundary] = ival;
|
||||
boundary++;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
int32_t ival = array[i];
|
||||
int32_t bval = array[boundary];
|
||||
array[length - 1] = bval;
|
||||
array[boundary] = ival;
|
||||
boundary++;
|
||||
return boundary;
|
||||
}
|
||||
|
||||
// for fallback
|
||||
void scalar_partition(int32_t* array, const int32_t pivot, int& left, int& right) {
|
||||
|
||||
while (left <= right) {
|
||||
while (array[left] < pivot) {
|
||||
left += 1;
|
||||
}
|
||||
while (array[right] > pivot) {
|
||||
right -= 1;
|
||||
}
|
||||
if (left <= right) {
|
||||
const uint32_t t = array[left];
|
||||
array[left] = array[right];
|
||||
array[right] = t;
|
||||
left += 1;
|
||||
right -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//fallback
|
||||
void scalar_quicksort(int32_t* array, int left, int right) {
|
||||
#ifdef WITH_RUNTIME_STATS
|
||||
statistics.scalar__partition_calls += 1;
|
||||
statistics.scalar__items_processed += right - left + 1;
|
||||
#endif
|
||||
int i = left;
|
||||
int j = right;
|
||||
const int32_t pivot = array[(i + j)/2];
|
||||
scalar_partition(array, pivot, i, j);
|
||||
if (left < j) {
|
||||
scalar_quicksort(array, left, j);
|
||||
}
|
||||
if (i < right) {
|
||||
scalar_quicksort(array, i, right);
|
||||
}
|
||||
}
|
||||
|
||||
void avx2_pivotonlast_sort(int32_t *array, const uint32_t length) {
|
||||
uint32_t sep = avx_pivot_on_last_value(array, length);
|
||||
if(sep == length) {
|
||||
// we have an ineffective pivot. Let us give up.
|
||||
if(length > 1) scalar_quicksort(array,0,length - 1);
|
||||
} else {
|
||||
if (sep > 2) {
|
||||
avx2_pivotonlast_sort(array, sep - 1);
|
||||
}
|
||||
if (sep + 1 < length) {
|
||||
avx2_pivotonlast_sort(array + sep, length - sep);
|
||||
}
|
||||
}
|
||||
}
|
||||
void wrapped_avx2_pivotonlast_sort(uint32_t *array, int left, int right) {
|
||||
avx2_pivotonlast_sort((int32_t *)array + left, right - left + 1);
|
||||
}
|
||||
436
simd-sort/avx2-nate-quicksort.cpp
Normal file
436
simd-sort/avx2-nate-quicksort.cpp
Normal file
@ -0,0 +1,436 @@
|
||||
#pragma once
|
||||
|
||||
#include <x86intrin.h>
|
||||
#include <cstdint>
|
||||
|
||||
namespace nate {
|
||||
|
||||
const int AVX2_REGISTER_SIZE = 8; // in 32-bit words
|
||||
|
||||
/*
|
||||
Algorithm of vectorized partition by Nathan Kurz.
|
||||
Everything else was borrowed from Daniel's code (avx2-altquicksort.h)
|
||||
*/
|
||||
|
||||
static uint32_t reverseshufflemask[256 * 8] __attribute__((aligned(0x100))) = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 0*/
|
||||
1, 2, 3, 4, 5, 6, 7, 0, /* 1*/
|
||||
0, 2, 3, 4, 5, 6, 7, 1, /* 2*/
|
||||
2, 3, 4, 5, 6, 7, 0, 1, /* 3*/
|
||||
0, 1, 3, 4, 5, 6, 7, 2, /* 4*/
|
||||
1, 3, 4, 5, 6, 7, 0, 2, /* 5*/
|
||||
0, 3, 4, 5, 6, 7, 1, 2, /* 6*/
|
||||
3, 4, 5, 6, 7, 0, 1, 2, /* 7*/
|
||||
0, 1, 2, 4, 5, 6, 7, 3, /* 8*/
|
||||
1, 2, 4, 5, 6, 7, 0, 3, /* 9*/
|
||||
0, 2, 4, 5, 6, 7, 1, 3, /* 10*/
|
||||
2, 4, 5, 6, 7, 0, 1, 3, /* 11*/
|
||||
0, 1, 4, 5, 6, 7, 2, 3, /* 12*/
|
||||
1, 4, 5, 6, 7, 0, 2, 3, /* 13*/
|
||||
0, 4, 5, 6, 7, 1, 2, 3, /* 14*/
|
||||
4, 5, 6, 7, 0, 1, 2, 3, /* 15*/
|
||||
0, 1, 2, 3, 5, 6, 7, 4, /* 16*/
|
||||
1, 2, 3, 5, 6, 7, 0, 4, /* 17*/
|
||||
0, 2, 3, 5, 6, 7, 1, 4, /* 18*/
|
||||
2, 3, 5, 6, 7, 0, 1, 4, /* 19*/
|
||||
0, 1, 3, 5, 6, 7, 2, 4, /* 20*/
|
||||
1, 3, 5, 6, 7, 0, 2, 4, /* 21*/
|
||||
0, 3, 5, 6, 7, 1, 2, 4, /* 22*/
|
||||
3, 5, 6, 7, 0, 1, 2, 4, /* 23*/
|
||||
0, 1, 2, 5, 6, 7, 3, 4, /* 24*/
|
||||
1, 2, 5, 6, 7, 0, 3, 4, /* 25*/
|
||||
0, 2, 5, 6, 7, 1, 3, 4, /* 26*/
|
||||
2, 5, 6, 7, 0, 1, 3, 4, /* 27*/
|
||||
0, 1, 5, 6, 7, 2, 3, 4, /* 28*/
|
||||
1, 5, 6, 7, 0, 2, 3, 4, /* 29*/
|
||||
0, 5, 6, 7, 1, 2, 3, 4, /* 30*/
|
||||
5, 6, 7, 0, 1, 2, 3, 4, /* 31*/
|
||||
0, 1, 2, 3, 4, 6, 7, 5, /* 32*/
|
||||
1, 2, 3, 4, 6, 7, 0, 5, /* 33*/
|
||||
0, 2, 3, 4, 6, 7, 1, 5, /* 34*/
|
||||
2, 3, 4, 6, 7, 0, 1, 5, /* 35*/
|
||||
0, 1, 3, 4, 6, 7, 2, 5, /* 36*/
|
||||
1, 3, 4, 6, 7, 0, 2, 5, /* 37*/
|
||||
0, 3, 4, 6, 7, 1, 2, 5, /* 38*/
|
||||
3, 4, 6, 7, 0, 1, 2, 5, /* 39*/
|
||||
0, 1, 2, 4, 6, 7, 3, 5, /* 40*/
|
||||
1, 2, 4, 6, 7, 0, 3, 5, /* 41*/
|
||||
0, 2, 4, 6, 7, 1, 3, 5, /* 42*/
|
||||
2, 4, 6, 7, 0, 1, 3, 5, /* 43*/
|
||||
0, 1, 4, 6, 7, 2, 3, 5, /* 44*/
|
||||
1, 4, 6, 7, 0, 2, 3, 5, /* 45*/
|
||||
0, 4, 6, 7, 1, 2, 3, 5, /* 46*/
|
||||
4, 6, 7, 0, 1, 2, 3, 5, /* 47*/
|
||||
0, 1, 2, 3, 6, 7, 4, 5, /* 48*/
|
||||
1, 2, 3, 6, 7, 0, 4, 5, /* 49*/
|
||||
0, 2, 3, 6, 7, 1, 4, 5, /* 50*/
|
||||
2, 3, 6, 7, 0, 1, 4, 5, /* 51*/
|
||||
0, 1, 3, 6, 7, 2, 4, 5, /* 52*/
|
||||
1, 3, 6, 7, 0, 2, 4, 5, /* 53*/
|
||||
0, 3, 6, 7, 1, 2, 4, 5, /* 54*/
|
||||
3, 6, 7, 0, 1, 2, 4, 5, /* 55*/
|
||||
0, 1, 2, 6, 7, 3, 4, 5, /* 56*/
|
||||
1, 2, 6, 7, 0, 3, 4, 5, /* 57*/
|
||||
0, 2, 6, 7, 1, 3, 4, 5, /* 58*/
|
||||
2, 6, 7, 0, 1, 3, 4, 5, /* 59*/
|
||||
0, 1, 6, 7, 2, 3, 4, 5, /* 60*/
|
||||
1, 6, 7, 0, 2, 3, 4, 5, /* 61*/
|
||||
0, 6, 7, 1, 2, 3, 4, 5, /* 62*/
|
||||
6, 7, 0, 1, 2, 3, 4, 5, /* 63*/
|
||||
0, 1, 2, 3, 4, 5, 7, 6, /* 64*/
|
||||
1, 2, 3, 4, 5, 7, 0, 6, /* 65*/
|
||||
0, 2, 3, 4, 5, 7, 1, 6, /* 66*/
|
||||
2, 3, 4, 5, 7, 0, 1, 6, /* 67*/
|
||||
0, 1, 3, 4, 5, 7, 2, 6, /* 68*/
|
||||
1, 3, 4, 5, 7, 0, 2, 6, /* 69*/
|
||||
0, 3, 4, 5, 7, 1, 2, 6, /* 70*/
|
||||
3, 4, 5, 7, 0, 1, 2, 6, /* 71*/
|
||||
0, 1, 2, 4, 5, 7, 3, 6, /* 72*/
|
||||
1, 2, 4, 5, 7, 0, 3, 6, /* 73*/
|
||||
0, 2, 4, 5, 7, 1, 3, 6, /* 74*/
|
||||
2, 4, 5, 7, 0, 1, 3, 6, /* 75*/
|
||||
0, 1, 4, 5, 7, 2, 3, 6, /* 76*/
|
||||
1, 4, 5, 7, 0, 2, 3, 6, /* 77*/
|
||||
0, 4, 5, 7, 1, 2, 3, 6, /* 78*/
|
||||
4, 5, 7, 0, 1, 2, 3, 6, /* 79*/
|
||||
0, 1, 2, 3, 5, 7, 4, 6, /* 80*/
|
||||
1, 2, 3, 5, 7, 0, 4, 6, /* 81*/
|
||||
0, 2, 3, 5, 7, 1, 4, 6, /* 82*/
|
||||
2, 3, 5, 7, 0, 1, 4, 6, /* 83*/
|
||||
0, 1, 3, 5, 7, 2, 4, 6, /* 84*/
|
||||
1, 3, 5, 7, 0, 2, 4, 6, /* 85*/
|
||||
0, 3, 5, 7, 1, 2, 4, 6, /* 86*/
|
||||
3, 5, 7, 0, 1, 2, 4, 6, /* 87*/
|
||||
0, 1, 2, 5, 7, 3, 4, 6, /* 88*/
|
||||
1, 2, 5, 7, 0, 3, 4, 6, /* 89*/
|
||||
0, 2, 5, 7, 1, 3, 4, 6, /* 90*/
|
||||
2, 5, 7, 0, 1, 3, 4, 6, /* 91*/
|
||||
0, 1, 5, 7, 2, 3, 4, 6, /* 92*/
|
||||
1, 5, 7, 0, 2, 3, 4, 6, /* 93*/
|
||||
0, 5, 7, 1, 2, 3, 4, 6, /* 94*/
|
||||
5, 7, 0, 1, 2, 3, 4, 6, /* 95*/
|
||||
0, 1, 2, 3, 4, 7, 5, 6, /* 96*/
|
||||
1, 2, 3, 4, 7, 0, 5, 6, /* 97*/
|
||||
0, 2, 3, 4, 7, 1, 5, 6, /* 98*/
|
||||
2, 3, 4, 7, 0, 1, 5, 6, /* 99*/
|
||||
0, 1, 3, 4, 7, 2, 5, 6, /* 100*/
|
||||
1, 3, 4, 7, 0, 2, 5, 6, /* 101*/
|
||||
0, 3, 4, 7, 1, 2, 5, 6, /* 102*/
|
||||
3, 4, 7, 0, 1, 2, 5, 6, /* 103*/
|
||||
0, 1, 2, 4, 7, 3, 5, 6, /* 104*/
|
||||
1, 2, 4, 7, 0, 3, 5, 6, /* 105*/
|
||||
0, 2, 4, 7, 1, 3, 5, 6, /* 106*/
|
||||
2, 4, 7, 0, 1, 3, 5, 6, /* 107*/
|
||||
0, 1, 4, 7, 2, 3, 5, 6, /* 108*/
|
||||
1, 4, 7, 0, 2, 3, 5, 6, /* 109*/
|
||||
0, 4, 7, 1, 2, 3, 5, 6, /* 110*/
|
||||
4, 7, 0, 1, 2, 3, 5, 6, /* 111*/
|
||||
0, 1, 2, 3, 7, 4, 5, 6, /* 112*/
|
||||
1, 2, 3, 7, 0, 4, 5, 6, /* 113*/
|
||||
0, 2, 3, 7, 1, 4, 5, 6, /* 114*/
|
||||
2, 3, 7, 0, 1, 4, 5, 6, /* 115*/
|
||||
0, 1, 3, 7, 2, 4, 5, 6, /* 116*/
|
||||
1, 3, 7, 0, 2, 4, 5, 6, /* 117*/
|
||||
0, 3, 7, 1, 2, 4, 5, 6, /* 118*/
|
||||
3, 7, 0, 1, 2, 4, 5, 6, /* 119*/
|
||||
0, 1, 2, 7, 3, 4, 5, 6, /* 120*/
|
||||
1, 2, 7, 0, 3, 4, 5, 6, /* 121*/
|
||||
0, 2, 7, 1, 3, 4, 5, 6, /* 122*/
|
||||
2, 7, 0, 1, 3, 4, 5, 6, /* 123*/
|
||||
0, 1, 7, 2, 3, 4, 5, 6, /* 124*/
|
||||
1, 7, 0, 2, 3, 4, 5, 6, /* 125*/
|
||||
0, 7, 1, 2, 3, 4, 5, 6, /* 126*/
|
||||
7, 0, 1, 2, 3, 4, 5, 6, /* 127*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 128*/
|
||||
1, 2, 3, 4, 5, 6, 0, 7, /* 129*/
|
||||
0, 2, 3, 4, 5, 6, 1, 7, /* 130*/
|
||||
2, 3, 4, 5, 6, 0, 1, 7, /* 131*/
|
||||
0, 1, 3, 4, 5, 6, 2, 7, /* 132*/
|
||||
1, 3, 4, 5, 6, 0, 2, 7, /* 133*/
|
||||
0, 3, 4, 5, 6, 1, 2, 7, /* 134*/
|
||||
3, 4, 5, 6, 0, 1, 2, 7, /* 135*/
|
||||
0, 1, 2, 4, 5, 6, 3, 7, /* 136*/
|
||||
1, 2, 4, 5, 6, 0, 3, 7, /* 137*/
|
||||
0, 2, 4, 5, 6, 1, 3, 7, /* 138*/
|
||||
2, 4, 5, 6, 0, 1, 3, 7, /* 139*/
|
||||
0, 1, 4, 5, 6, 2, 3, 7, /* 140*/
|
||||
1, 4, 5, 6, 0, 2, 3, 7, /* 141*/
|
||||
0, 4, 5, 6, 1, 2, 3, 7, /* 142*/
|
||||
4, 5, 6, 0, 1, 2, 3, 7, /* 143*/
|
||||
0, 1, 2, 3, 5, 6, 4, 7, /* 144*/
|
||||
1, 2, 3, 5, 6, 0, 4, 7, /* 145*/
|
||||
0, 2, 3, 5, 6, 1, 4, 7, /* 146*/
|
||||
2, 3, 5, 6, 0, 1, 4, 7, /* 147*/
|
||||
0, 1, 3, 5, 6, 2, 4, 7, /* 148*/
|
||||
1, 3, 5, 6, 0, 2, 4, 7, /* 149*/
|
||||
0, 3, 5, 6, 1, 2, 4, 7, /* 150*/
|
||||
3, 5, 6, 0, 1, 2, 4, 7, /* 151*/
|
||||
0, 1, 2, 5, 6, 3, 4, 7, /* 152*/
|
||||
1, 2, 5, 6, 0, 3, 4, 7, /* 153*/
|
||||
0, 2, 5, 6, 1, 3, 4, 7, /* 154*/
|
||||
2, 5, 6, 0, 1, 3, 4, 7, /* 155*/
|
||||
0, 1, 5, 6, 2, 3, 4, 7, /* 156*/
|
||||
1, 5, 6, 0, 2, 3, 4, 7, /* 157*/
|
||||
0, 5, 6, 1, 2, 3, 4, 7, /* 158*/
|
||||
5, 6, 0, 1, 2, 3, 4, 7, /* 159*/
|
||||
0, 1, 2, 3, 4, 6, 5, 7, /* 160*/
|
||||
1, 2, 3, 4, 6, 0, 5, 7, /* 161*/
|
||||
0, 2, 3, 4, 6, 1, 5, 7, /* 162*/
|
||||
2, 3, 4, 6, 0, 1, 5, 7, /* 163*/
|
||||
0, 1, 3, 4, 6, 2, 5, 7, /* 164*/
|
||||
1, 3, 4, 6, 0, 2, 5, 7, /* 165*/
|
||||
0, 3, 4, 6, 1, 2, 5, 7, /* 166*/
|
||||
3, 4, 6, 0, 1, 2, 5, 7, /* 167*/
|
||||
0, 1, 2, 4, 6, 3, 5, 7, /* 168*/
|
||||
1, 2, 4, 6, 0, 3, 5, 7, /* 169*/
|
||||
0, 2, 4, 6, 1, 3, 5, 7, /* 170*/
|
||||
2, 4, 6, 0, 1, 3, 5, 7, /* 171*/
|
||||
0, 1, 4, 6, 2, 3, 5, 7, /* 172*/
|
||||
1, 4, 6, 0, 2, 3, 5, 7, /* 173*/
|
||||
0, 4, 6, 1, 2, 3, 5, 7, /* 174*/
|
||||
4, 6, 0, 1, 2, 3, 5, 7, /* 175*/
|
||||
0, 1, 2, 3, 6, 4, 5, 7, /* 176*/
|
||||
1, 2, 3, 6, 0, 4, 5, 7, /* 177*/
|
||||
0, 2, 3, 6, 1, 4, 5, 7, /* 178*/
|
||||
2, 3, 6, 0, 1, 4, 5, 7, /* 179*/
|
||||
0, 1, 3, 6, 2, 4, 5, 7, /* 180*/
|
||||
1, 3, 6, 0, 2, 4, 5, 7, /* 181*/
|
||||
0, 3, 6, 1, 2, 4, 5, 7, /* 182*/
|
||||
3, 6, 0, 1, 2, 4, 5, 7, /* 183*/
|
||||
0, 1, 2, 6, 3, 4, 5, 7, /* 184*/
|
||||
1, 2, 6, 0, 3, 4, 5, 7, /* 185*/
|
||||
0, 2, 6, 1, 3, 4, 5, 7, /* 186*/
|
||||
2, 6, 0, 1, 3, 4, 5, 7, /* 187*/
|
||||
0, 1, 6, 2, 3, 4, 5, 7, /* 188*/
|
||||
1, 6, 0, 2, 3, 4, 5, 7, /* 189*/
|
||||
0, 6, 1, 2, 3, 4, 5, 7, /* 190*/
|
||||
6, 0, 1, 2, 3, 4, 5, 7, /* 191*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 192*/
|
||||
1, 2, 3, 4, 5, 0, 6, 7, /* 193*/
|
||||
0, 2, 3, 4, 5, 1, 6, 7, /* 194*/
|
||||
2, 3, 4, 5, 0, 1, 6, 7, /* 195*/
|
||||
0, 1, 3, 4, 5, 2, 6, 7, /* 196*/
|
||||
1, 3, 4, 5, 0, 2, 6, 7, /* 197*/
|
||||
0, 3, 4, 5, 1, 2, 6, 7, /* 198*/
|
||||
3, 4, 5, 0, 1, 2, 6, 7, /* 199*/
|
||||
0, 1, 2, 4, 5, 3, 6, 7, /* 200*/
|
||||
1, 2, 4, 5, 0, 3, 6, 7, /* 201*/
|
||||
0, 2, 4, 5, 1, 3, 6, 7, /* 202*/
|
||||
2, 4, 5, 0, 1, 3, 6, 7, /* 203*/
|
||||
0, 1, 4, 5, 2, 3, 6, 7, /* 204*/
|
||||
1, 4, 5, 0, 2, 3, 6, 7, /* 205*/
|
||||
0, 4, 5, 1, 2, 3, 6, 7, /* 206*/
|
||||
4, 5, 0, 1, 2, 3, 6, 7, /* 207*/
|
||||
0, 1, 2, 3, 5, 4, 6, 7, /* 208*/
|
||||
1, 2, 3, 5, 0, 4, 6, 7, /* 209*/
|
||||
0, 2, 3, 5, 1, 4, 6, 7, /* 210*/
|
||||
2, 3, 5, 0, 1, 4, 6, 7, /* 211*/
|
||||
0, 1, 3, 5, 2, 4, 6, 7, /* 212*/
|
||||
1, 3, 5, 0, 2, 4, 6, 7, /* 213*/
|
||||
0, 3, 5, 1, 2, 4, 6, 7, /* 214*/
|
||||
3, 5, 0, 1, 2, 4, 6, 7, /* 215*/
|
||||
0, 1, 2, 5, 3, 4, 6, 7, /* 216*/
|
||||
1, 2, 5, 0, 3, 4, 6, 7, /* 217*/
|
||||
0, 2, 5, 1, 3, 4, 6, 7, /* 218*/
|
||||
2, 5, 0, 1, 3, 4, 6, 7, /* 219*/
|
||||
0, 1, 5, 2, 3, 4, 6, 7, /* 220*/
|
||||
1, 5, 0, 2, 3, 4, 6, 7, /* 221*/
|
||||
0, 5, 1, 2, 3, 4, 6, 7, /* 222*/
|
||||
5, 0, 1, 2, 3, 4, 6, 7, /* 223*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 224*/
|
||||
1, 2, 3, 4, 0, 5, 6, 7, /* 225*/
|
||||
0, 2, 3, 4, 1, 5, 6, 7, /* 226*/
|
||||
2, 3, 4, 0, 1, 5, 6, 7, /* 227*/
|
||||
0, 1, 3, 4, 2, 5, 6, 7, /* 228*/
|
||||
1, 3, 4, 0, 2, 5, 6, 7, /* 229*/
|
||||
0, 3, 4, 1, 2, 5, 6, 7, /* 230*/
|
||||
3, 4, 0, 1, 2, 5, 6, 7, /* 231*/
|
||||
0, 1, 2, 4, 3, 5, 6, 7, /* 232*/
|
||||
1, 2, 4, 0, 3, 5, 6, 7, /* 233*/
|
||||
0, 2, 4, 1, 3, 5, 6, 7, /* 234*/
|
||||
2, 4, 0, 1, 3, 5, 6, 7, /* 235*/
|
||||
0, 1, 4, 2, 3, 5, 6, 7, /* 236*/
|
||||
1, 4, 0, 2, 3, 5, 6, 7, /* 237*/
|
||||
0, 4, 1, 2, 3, 5, 6, 7, /* 238*/
|
||||
4, 0, 1, 2, 3, 5, 6, 7, /* 239*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 240*/
|
||||
1, 2, 3, 0, 4, 5, 6, 7, /* 241*/
|
||||
0, 2, 3, 1, 4, 5, 6, 7, /* 242*/
|
||||
2, 3, 0, 1, 4, 5, 6, 7, /* 243*/
|
||||
0, 1, 3, 2, 4, 5, 6, 7, /* 244*/
|
||||
1, 3, 0, 2, 4, 5, 6, 7, /* 245*/
|
||||
0, 3, 1, 2, 4, 5, 6, 7, /* 246*/
|
||||
3, 0, 1, 2, 4, 5, 6, 7, /* 247*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 248*/
|
||||
1, 2, 0, 3, 4, 5, 6, 7, /* 249*/
|
||||
0, 2, 1, 3, 4, 5, 6, 7, /* 250*/
|
||||
2, 0, 1, 3, 4, 5, 6, 7, /* 251*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 252*/
|
||||
1, 0, 2, 3, 4, 5, 6, 7, /* 253*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 254*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 255*/
|
||||
};
|
||||
|
||||
|
||||
static uint32_t avx_pivot_on_last_value(int32_t *array, size_t length) {
|
||||
if (length <= 1)
|
||||
return 1;
|
||||
|
||||
|
||||
{ // we exchange the last value for the middle value for a better pivot
|
||||
int32_t ival = array[length / 2];
|
||||
int32_t bval = array[length - 1];
|
||||
array[length / 2] = bval;
|
||||
array[length - 1] = ival;
|
||||
}
|
||||
|
||||
int wh, rh;
|
||||
int wt, rt;
|
||||
|
||||
rh = 0;
|
||||
rt = length - 1;
|
||||
wh = 0;
|
||||
wt = length - 1;
|
||||
|
||||
__m256i h0 = _mm256_loadu_si256((__m256i*)(array + rh));
|
||||
__m256i h1 = _mm256_loadu_si256((__m256i*)(array + rh + AVX2_REGISTER_SIZE));
|
||||
rh += 2*AVX2_REGISTER_SIZE;
|
||||
|
||||
__m256i t0 = _mm256_loadu_si256((__m256i*)(array + rt - AVX2_REGISTER_SIZE));
|
||||
__m256i t1 = _mm256_loadu_si256((__m256i*)(array + rt - 2*AVX2_REGISTER_SIZE));
|
||||
rt -= 3*AVX2_REGISTER_SIZE;
|
||||
|
||||
__m256i current = _mm256_loadu_si256((__m256i*)(array + rh));
|
||||
__m256i next;
|
||||
rh += AVX2_REGISTER_SIZE;
|
||||
|
||||
const int32_t pivot = array[length - 1];
|
||||
const __m256i P = _mm256_set1_epi32(pivot);
|
||||
|
||||
// 1. the main loop
|
||||
while (wh - wt < 4*AVX2_REGISTER_SIZE) {
|
||||
const bool which = (rh - wh) < (wt - rt);
|
||||
|
||||
// I believe that a compiler will emit branchless code
|
||||
__m256i* next_ptr = (__m256i*)(which ? array + rh : array + rt);
|
||||
const int adv_rh = which ? AVX2_REGISTER_SIZE : 0;
|
||||
const int adv_rt = which ? 0 : AVX2_REGISTER_SIZE;
|
||||
// My faith ends here
|
||||
|
||||
next = _mm256_loadu_si256(next_ptr);
|
||||
|
||||
const int pvbyte = _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(current, P));
|
||||
const uint32_t cnt = 8 - _mm_popcnt_u32(pvbyte);
|
||||
const __m256i shuf = _mm256_load_si256((__m256i *)(reverseshufflemask + 8 * pvbyte));
|
||||
|
||||
const __m256i ordered = _mm256_permutevar8x32_epi32(current, shuf);
|
||||
_mm256_storeu_si256((__m256i*)(array + wh), ordered);
|
||||
_mm256_storeu_si256((__m256i*)(array + wt - 8), ordered);
|
||||
|
||||
rh += adv_rh;
|
||||
rt -= adv_rt;
|
||||
wh += cnt;
|
||||
wt -= 8 - cnt;
|
||||
|
||||
current = next;
|
||||
}
|
||||
|
||||
// 2. partition remaining part
|
||||
while (wh - wt > 4*AVX2_REGISTER_SIZE) {
|
||||
const int32_t v = array[rh++];
|
||||
if (v < pivot) {
|
||||
array[wh++] = v;
|
||||
} else {
|
||||
array[wt--] = v;
|
||||
}
|
||||
}
|
||||
|
||||
// 3. partition 4 registers loaded in the beginning
|
||||
static uint32_t tmp[4*AVX2_REGISTER_SIZE];
|
||||
_mm256_storeu_si256((__m256i*)(tmp + 0*AVX2_REGISTER_SIZE), h0);
|
||||
_mm256_storeu_si256((__m256i*)(tmp + 1*AVX2_REGISTER_SIZE), h1);
|
||||
_mm256_storeu_si256((__m256i*)(tmp + 2*AVX2_REGISTER_SIZE), t0);
|
||||
_mm256_storeu_si256((__m256i*)(tmp + 3*AVX2_REGISTER_SIZE), t1);
|
||||
for (int i=0; i < 4*AVX2_REGISTER_SIZE; i++) {
|
||||
const int32_t v = array[i];
|
||||
if (v < pivot) {
|
||||
array[wh++] = v;
|
||||
} else {
|
||||
array[wt--] = v;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
const int32_t a = array[length - 1];
|
||||
const int32_t b = array[wh];
|
||||
|
||||
array[length - 1] = b;
|
||||
array[wh] = a;
|
||||
}
|
||||
|
||||
return wh;
|
||||
}
|
||||
|
||||
// for fallback
|
||||
void scalar_partition(int32_t* array, const int32_t pivot, int& left, int& right) {
|
||||
|
||||
while (left <= right) {
|
||||
while (array[left] < pivot) {
|
||||
left += 1;
|
||||
}
|
||||
while (array[right] > pivot) {
|
||||
right -= 1;
|
||||
}
|
||||
if (left <= right) {
|
||||
const uint32_t t = array[left];
|
||||
array[left] = array[right];
|
||||
array[right] = t;
|
||||
left += 1;
|
||||
right -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//fallback
|
||||
void scalar_quicksort(int32_t* array, int left, int right) {
|
||||
#ifdef WITH_RUNTIME_STATS
|
||||
statistics.scalar__partition_calls += 1;
|
||||
statistics.scalar__items_processed += right - left + 1;
|
||||
#endif
|
||||
int i = left;
|
||||
int j = right;
|
||||
const int32_t pivot = array[(i + j)/2];
|
||||
scalar_partition(array, pivot, i, j);
|
||||
if (left < j) {
|
||||
scalar_quicksort(array, left, j);
|
||||
}
|
||||
if (i < right) {
|
||||
scalar_quicksort(array, i, right);
|
||||
}
|
||||
}
|
||||
|
||||
void avx2_pivotonlast_sort(int32_t *array, const uint32_t length) {
|
||||
uint32_t sep;
|
||||
|
||||
if (length > 8*AVX2_REGISTER_SIZE) {
|
||||
sep = avx_pivot_on_last_value(array, length);
|
||||
} else {
|
||||
sep = lomuto_partition_epi32((uint32_t*)array, 0, length - 1);
|
||||
}
|
||||
if(sep == length) {
|
||||
// we have an ineffective pivot. Let us give up.
|
||||
if(length > 1) scalar_quicksort(array,0,length - 1);
|
||||
} else {
|
||||
if (sep > 2) {
|
||||
avx2_pivotonlast_sort(array, sep - 1);
|
||||
}
|
||||
if (sep + 1 < length) {
|
||||
avx2_pivotonlast_sort(array + sep, length - sep);
|
||||
}
|
||||
}
|
||||
}
|
||||
void wrapped_avx2_pivotonlast_sort(uint32_t *array, int left, int right) {
|
||||
avx2_pivotonlast_sort((int32_t *)array + left, right - left + 1);
|
||||
}
|
||||
|
||||
} // namespace nate
|
||||
409
simd-sort/avx2-natenodutch-quicksort.h
Normal file
409
simd-sort/avx2-natenodutch-quicksort.h
Normal file
@ -0,0 +1,409 @@
|
||||
#pragma once
|
||||
|
||||
#include <x86intrin.h>
|
||||
#include <cstdint>
|
||||
#include <memory.h>
|
||||
#include "avx2-altquicksort.h"
|
||||
/**
|
||||
* This is an alternative approach to SIMD quicksort, based on a description by
|
||||
* Nathan Kurz. Daniel Lemire implemented it to avoid the Dutch national flag
|
||||
* problem as an experiment. This is not meant to be fast, as implemented. It is
|
||||
* an experiment.
|
||||
*/
|
||||
|
||||
/*
|
||||
I'm going to use "vector" as a unit, but it doesn't have to be the
|
||||
native vector size. In particular, I think we can read multiple
|
||||
vector register's worth of data with minimal added time per loop
|
||||
iteration.
|
||||
|
||||
First, read two vectors from the head of the list, and two from the
|
||||
tail of the list. Store these, either in registers or written to a
|
||||
buffer. We're not going to use these until the end. We're just
|
||||
getting them out of the way so that we have more than one vectors
|
||||
writeable space at the head and tail.
|
||||
|
||||
wh--rh------------------rt--wt-
|
||||
|
||||
wh = write head
|
||||
rh = read head
|
||||
rt = read tail
|
||||
wt = write tail
|
||||
|
||||
Then read one vector from head, and advance rh accordingly. Call it nextVector.
|
||||
|
||||
Loop:
|
||||
|
||||
Set currentVector = nextVector.
|
||||
|
||||
Set nextVectorPtr to either rh or rt depending on which has the least
|
||||
"writable space" available. That is, if (rh - wh) < (wt - rt), read
|
||||
from and advance rh, else read from and update rt. These updates
|
||||
need to be branchless.
|
||||
|
||||
Read nextVector = *nextVectorPtr.
|
||||
|
||||
Partition currentVector (lookup, shuffle) and write to both wh and wt
|
||||
(tail offset so end hits wt).
|
||||
Advance wh and wt based on number of valid entries. Use a second byte
|
||||
lookup issued as early as possible to determine this count. Our
|
||||
performance is dependent on the latency of this step.
|
||||
|
||||
-----
|
||||
*/
|
||||
static uint32_t shufflemask[256 * 8] __attribute__((aligned(0x100))) = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 0*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 1*/
|
||||
1, 0, 2, 3, 4, 5, 6, 7, /* 2*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 3*/
|
||||
2, 0, 1, 3, 4, 5, 6, 7, /* 4*/
|
||||
0, 2, 1, 3, 4, 5, 6, 7, /* 5*/
|
||||
1, 2, 0, 3, 4, 5, 6, 7, /* 6*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 7*/
|
||||
3, 0, 1, 2, 4, 5, 6, 7, /* 8*/
|
||||
0, 3, 1, 2, 4, 5, 6, 7, /* 9*/
|
||||
1, 3, 0, 2, 4, 5, 6, 7, /* 10*/
|
||||
0, 1, 3, 2, 4, 5, 6, 7, /* 11*/
|
||||
2, 3, 0, 1, 4, 5, 6, 7, /* 12*/
|
||||
0, 2, 3, 1, 4, 5, 6, 7, /* 13*/
|
||||
1, 2, 3, 0, 4, 5, 6, 7, /* 14*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 15*/
|
||||
4, 0, 1, 2, 3, 5, 6, 7, /* 16*/
|
||||
0, 4, 1, 2, 3, 5, 6, 7, /* 17*/
|
||||
1, 4, 0, 2, 3, 5, 6, 7, /* 18*/
|
||||
0, 1, 4, 2, 3, 5, 6, 7, /* 19*/
|
||||
2, 4, 0, 1, 3, 5, 6, 7, /* 20*/
|
||||
0, 2, 4, 1, 3, 5, 6, 7, /* 21*/
|
||||
1, 2, 4, 0, 3, 5, 6, 7, /* 22*/
|
||||
0, 1, 2, 4, 3, 5, 6, 7, /* 23*/
|
||||
3, 4, 0, 1, 2, 5, 6, 7, /* 24*/
|
||||
0, 3, 4, 1, 2, 5, 6, 7, /* 25*/
|
||||
1, 3, 4, 0, 2, 5, 6, 7, /* 26*/
|
||||
0, 1, 3, 4, 2, 5, 6, 7, /* 27*/
|
||||
2, 3, 4, 0, 1, 5, 6, 7, /* 28*/
|
||||
0, 2, 3, 4, 1, 5, 6, 7, /* 29*/
|
||||
1, 2, 3, 4, 0, 5, 6, 7, /* 30*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 31*/
|
||||
5, 0, 1, 2, 3, 4, 6, 7, /* 32*/
|
||||
0, 5, 1, 2, 3, 4, 6, 7, /* 33*/
|
||||
1, 5, 0, 2, 3, 4, 6, 7, /* 34*/
|
||||
0, 1, 5, 2, 3, 4, 6, 7, /* 35*/
|
||||
2, 5, 0, 1, 3, 4, 6, 7, /* 36*/
|
||||
0, 2, 5, 1, 3, 4, 6, 7, /* 37*/
|
||||
1, 2, 5, 0, 3, 4, 6, 7, /* 38*/
|
||||
0, 1, 2, 5, 3, 4, 6, 7, /* 39*/
|
||||
3, 5, 0, 1, 2, 4, 6, 7, /* 40*/
|
||||
0, 3, 5, 1, 2, 4, 6, 7, /* 41*/
|
||||
1, 3, 5, 0, 2, 4, 6, 7, /* 42*/
|
||||
0, 1, 3, 5, 2, 4, 6, 7, /* 43*/
|
||||
2, 3, 5, 0, 1, 4, 6, 7, /* 44*/
|
||||
0, 2, 3, 5, 1, 4, 6, 7, /* 45*/
|
||||
1, 2, 3, 5, 0, 4, 6, 7, /* 46*/
|
||||
0, 1, 2, 3, 5, 4, 6, 7, /* 47*/
|
||||
4, 5, 0, 1, 2, 3, 6, 7, /* 48*/
|
||||
0, 4, 5, 1, 2, 3, 6, 7, /* 49*/
|
||||
1, 4, 5, 0, 2, 3, 6, 7, /* 50*/
|
||||
0, 1, 4, 5, 2, 3, 6, 7, /* 51*/
|
||||
2, 4, 5, 0, 1, 3, 6, 7, /* 52*/
|
||||
0, 2, 4, 5, 1, 3, 6, 7, /* 53*/
|
||||
1, 2, 4, 5, 0, 3, 6, 7, /* 54*/
|
||||
0, 1, 2, 4, 5, 3, 6, 7, /* 55*/
|
||||
3, 4, 5, 0, 1, 2, 6, 7, /* 56*/
|
||||
0, 3, 4, 5, 1, 2, 6, 7, /* 57*/
|
||||
1, 3, 4, 5, 0, 2, 6, 7, /* 58*/
|
||||
0, 1, 3, 4, 5, 2, 6, 7, /* 59*/
|
||||
2, 3, 4, 5, 0, 1, 6, 7, /* 60*/
|
||||
0, 2, 3, 4, 5, 1, 6, 7, /* 61*/
|
||||
1, 2, 3, 4, 5, 0, 6, 7, /* 62*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 63*/
|
||||
6, 0, 1, 2, 3, 4, 5, 7, /* 64*/
|
||||
0, 6, 1, 2, 3, 4, 5, 7, /* 65*/
|
||||
1, 6, 0, 2, 3, 4, 5, 7, /* 66*/
|
||||
0, 1, 6, 2, 3, 4, 5, 7, /* 67*/
|
||||
2, 6, 0, 1, 3, 4, 5, 7, /* 68*/
|
||||
0, 2, 6, 1, 3, 4, 5, 7, /* 69*/
|
||||
1, 2, 6, 0, 3, 4, 5, 7, /* 70*/
|
||||
0, 1, 2, 6, 3, 4, 5, 7, /* 71*/
|
||||
3, 6, 0, 1, 2, 4, 5, 7, /* 72*/
|
||||
0, 3, 6, 1, 2, 4, 5, 7, /* 73*/
|
||||
1, 3, 6, 0, 2, 4, 5, 7, /* 74*/
|
||||
0, 1, 3, 6, 2, 4, 5, 7, /* 75*/
|
||||
2, 3, 6, 0, 1, 4, 5, 7, /* 76*/
|
||||
0, 2, 3, 6, 1, 4, 5, 7, /* 77*/
|
||||
1, 2, 3, 6, 0, 4, 5, 7, /* 78*/
|
||||
0, 1, 2, 3, 6, 4, 5, 7, /* 79*/
|
||||
4, 6, 0, 1, 2, 3, 5, 7, /* 80*/
|
||||
0, 4, 6, 1, 2, 3, 5, 7, /* 81*/
|
||||
1, 4, 6, 0, 2, 3, 5, 7, /* 82*/
|
||||
0, 1, 4, 6, 2, 3, 5, 7, /* 83*/
|
||||
2, 4, 6, 0, 1, 3, 5, 7, /* 84*/
|
||||
0, 2, 4, 6, 1, 3, 5, 7, /* 85*/
|
||||
1, 2, 4, 6, 0, 3, 5, 7, /* 86*/
|
||||
0, 1, 2, 4, 6, 3, 5, 7, /* 87*/
|
||||
3, 4, 6, 0, 1, 2, 5, 7, /* 88*/
|
||||
0, 3, 4, 6, 1, 2, 5, 7, /* 89*/
|
||||
1, 3, 4, 6, 0, 2, 5, 7, /* 90*/
|
||||
0, 1, 3, 4, 6, 2, 5, 7, /* 91*/
|
||||
2, 3, 4, 6, 0, 1, 5, 7, /* 92*/
|
||||
0, 2, 3, 4, 6, 1, 5, 7, /* 93*/
|
||||
1, 2, 3, 4, 6, 0, 5, 7, /* 94*/
|
||||
0, 1, 2, 3, 4, 6, 5, 7, /* 95*/
|
||||
5, 6, 0, 1, 2, 3, 4, 7, /* 96*/
|
||||
0, 5, 6, 1, 2, 3, 4, 7, /* 97*/
|
||||
1, 5, 6, 0, 2, 3, 4, 7, /* 98*/
|
||||
0, 1, 5, 6, 2, 3, 4, 7, /* 99*/
|
||||
2, 5, 6, 0, 1, 3, 4, 7, /* 100*/
|
||||
0, 2, 5, 6, 1, 3, 4, 7, /* 101*/
|
||||
1, 2, 5, 6, 0, 3, 4, 7, /* 102*/
|
||||
0, 1, 2, 5, 6, 3, 4, 7, /* 103*/
|
||||
3, 5, 6, 0, 1, 2, 4, 7, /* 104*/
|
||||
0, 3, 5, 6, 1, 2, 4, 7, /* 105*/
|
||||
1, 3, 5, 6, 0, 2, 4, 7, /* 106*/
|
||||
0, 1, 3, 5, 6, 2, 4, 7, /* 107*/
|
||||
2, 3, 5, 6, 0, 1, 4, 7, /* 108*/
|
||||
0, 2, 3, 5, 6, 1, 4, 7, /* 109*/
|
||||
1, 2, 3, 5, 6, 0, 4, 7, /* 110*/
|
||||
0, 1, 2, 3, 5, 6, 4, 7, /* 111*/
|
||||
4, 5, 6, 0, 1, 2, 3, 7, /* 112*/
|
||||
0, 4, 5, 6, 1, 2, 3, 7, /* 113*/
|
||||
1, 4, 5, 6, 0, 2, 3, 7, /* 114*/
|
||||
0, 1, 4, 5, 6, 2, 3, 7, /* 115*/
|
||||
2, 4, 5, 6, 0, 1, 3, 7, /* 116*/
|
||||
0, 2, 4, 5, 6, 1, 3, 7, /* 117*/
|
||||
1, 2, 4, 5, 6, 0, 3, 7, /* 118*/
|
||||
0, 1, 2, 4, 5, 6, 3, 7, /* 119*/
|
||||
3, 4, 5, 6, 0, 1, 2, 7, /* 120*/
|
||||
0, 3, 4, 5, 6, 1, 2, 7, /* 121*/
|
||||
1, 3, 4, 5, 6, 0, 2, 7, /* 122*/
|
||||
0, 1, 3, 4, 5, 6, 2, 7, /* 123*/
|
||||
2, 3, 4, 5, 6, 0, 1, 7, /* 124*/
|
||||
0, 2, 3, 4, 5, 6, 1, 7, /* 125*/
|
||||
1, 2, 3, 4, 5, 6, 0, 7, /* 126*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 127*/
|
||||
7, 0, 1, 2, 3, 4, 5, 6, /* 128*/
|
||||
0, 7, 1, 2, 3, 4, 5, 6, /* 129*/
|
||||
1, 7, 0, 2, 3, 4, 5, 6, /* 130*/
|
||||
0, 1, 7, 2, 3, 4, 5, 6, /* 131*/
|
||||
2, 7, 0, 1, 3, 4, 5, 6, /* 132*/
|
||||
0, 2, 7, 1, 3, 4, 5, 6, /* 133*/
|
||||
1, 2, 7, 0, 3, 4, 5, 6, /* 134*/
|
||||
0, 1, 2, 7, 3, 4, 5, 6, /* 135*/
|
||||
3, 7, 0, 1, 2, 4, 5, 6, /* 136*/
|
||||
0, 3, 7, 1, 2, 4, 5, 6, /* 137*/
|
||||
1, 3, 7, 0, 2, 4, 5, 6, /* 138*/
|
||||
0, 1, 3, 7, 2, 4, 5, 6, /* 139*/
|
||||
2, 3, 7, 0, 1, 4, 5, 6, /* 140*/
|
||||
0, 2, 3, 7, 1, 4, 5, 6, /* 141*/
|
||||
1, 2, 3, 7, 0, 4, 5, 6, /* 142*/
|
||||
0, 1, 2, 3, 7, 4, 5, 6, /* 143*/
|
||||
4, 7, 0, 1, 2, 3, 5, 6, /* 144*/
|
||||
0, 4, 7, 1, 2, 3, 5, 6, /* 145*/
|
||||
1, 4, 7, 0, 2, 3, 5, 6, /* 146*/
|
||||
0, 1, 4, 7, 2, 3, 5, 6, /* 147*/
|
||||
2, 4, 7, 0, 1, 3, 5, 6, /* 148*/
|
||||
0, 2, 4, 7, 1, 3, 5, 6, /* 149*/
|
||||
1, 2, 4, 7, 0, 3, 5, 6, /* 150*/
|
||||
0, 1, 2, 4, 7, 3, 5, 6, /* 151*/
|
||||
3, 4, 7, 0, 1, 2, 5, 6, /* 152*/
|
||||
0, 3, 4, 7, 1, 2, 5, 6, /* 153*/
|
||||
1, 3, 4, 7, 0, 2, 5, 6, /* 154*/
|
||||
0, 1, 3, 4, 7, 2, 5, 6, /* 155*/
|
||||
2, 3, 4, 7, 0, 1, 5, 6, /* 156*/
|
||||
0, 2, 3, 4, 7, 1, 5, 6, /* 157*/
|
||||
1, 2, 3, 4, 7, 0, 5, 6, /* 158*/
|
||||
0, 1, 2, 3, 4, 7, 5, 6, /* 159*/
|
||||
5, 7, 0, 1, 2, 3, 4, 6, /* 160*/
|
||||
0, 5, 7, 1, 2, 3, 4, 6, /* 161*/
|
||||
1, 5, 7, 0, 2, 3, 4, 6, /* 162*/
|
||||
0, 1, 5, 7, 2, 3, 4, 6, /* 163*/
|
||||
2, 5, 7, 0, 1, 3, 4, 6, /* 164*/
|
||||
0, 2, 5, 7, 1, 3, 4, 6, /* 165*/
|
||||
1, 2, 5, 7, 0, 3, 4, 6, /* 166*/
|
||||
0, 1, 2, 5, 7, 3, 4, 6, /* 167*/
|
||||
3, 5, 7, 0, 1, 2, 4, 6, /* 168*/
|
||||
0, 3, 5, 7, 1, 2, 4, 6, /* 169*/
|
||||
1, 3, 5, 7, 0, 2, 4, 6, /* 170*/
|
||||
0, 1, 3, 5, 7, 2, 4, 6, /* 171*/
|
||||
2, 3, 5, 7, 0, 1, 4, 6, /* 172*/
|
||||
0, 2, 3, 5, 7, 1, 4, 6, /* 173*/
|
||||
1, 2, 3, 5, 7, 0, 4, 6, /* 174*/
|
||||
0, 1, 2, 3, 5, 7, 4, 6, /* 175*/
|
||||
4, 5, 7, 0, 1, 2, 3, 6, /* 176*/
|
||||
0, 4, 5, 7, 1, 2, 3, 6, /* 177*/
|
||||
1, 4, 5, 7, 0, 2, 3, 6, /* 178*/
|
||||
0, 1, 4, 5, 7, 2, 3, 6, /* 179*/
|
||||
2, 4, 5, 7, 0, 1, 3, 6, /* 180*/
|
||||
0, 2, 4, 5, 7, 1, 3, 6, /* 181*/
|
||||
1, 2, 4, 5, 7, 0, 3, 6, /* 182*/
|
||||
0, 1, 2, 4, 5, 7, 3, 6, /* 183*/
|
||||
3, 4, 5, 7, 0, 1, 2, 6, /* 184*/
|
||||
0, 3, 4, 5, 7, 1, 2, 6, /* 185*/
|
||||
1, 3, 4, 5, 7, 0, 2, 6, /* 186*/
|
||||
0, 1, 3, 4, 5, 7, 2, 6, /* 187*/
|
||||
2, 3, 4, 5, 7, 0, 1, 6, /* 188*/
|
||||
0, 2, 3, 4, 5, 7, 1, 6, /* 189*/
|
||||
1, 2, 3, 4, 5, 7, 0, 6, /* 190*/
|
||||
0, 1, 2, 3, 4, 5, 7, 6, /* 191*/
|
||||
6, 7, 0, 1, 2, 3, 4, 5, /* 192*/
|
||||
0, 6, 7, 1, 2, 3, 4, 5, /* 193*/
|
||||
1, 6, 7, 0, 2, 3, 4, 5, /* 194*/
|
||||
0, 1, 6, 7, 2, 3, 4, 5, /* 195*/
|
||||
2, 6, 7, 0, 1, 3, 4, 5, /* 196*/
|
||||
0, 2, 6, 7, 1, 3, 4, 5, /* 197*/
|
||||
1, 2, 6, 7, 0, 3, 4, 5, /* 198*/
|
||||
0, 1, 2, 6, 7, 3, 4, 5, /* 199*/
|
||||
3, 6, 7, 0, 1, 2, 4, 5, /* 200*/
|
||||
0, 3, 6, 7, 1, 2, 4, 5, /* 201*/
|
||||
1, 3, 6, 7, 0, 2, 4, 5, /* 202*/
|
||||
0, 1, 3, 6, 7, 2, 4, 5, /* 203*/
|
||||
2, 3, 6, 7, 0, 1, 4, 5, /* 204*/
|
||||
0, 2, 3, 6, 7, 1, 4, 5, /* 205*/
|
||||
1, 2, 3, 6, 7, 0, 4, 5, /* 206*/
|
||||
0, 1, 2, 3, 6, 7, 4, 5, /* 207*/
|
||||
4, 6, 7, 0, 1, 2, 3, 5, /* 208*/
|
||||
0, 4, 6, 7, 1, 2, 3, 5, /* 209*/
|
||||
1, 4, 6, 7, 0, 2, 3, 5, /* 210*/
|
||||
0, 1, 4, 6, 7, 2, 3, 5, /* 211*/
|
||||
2, 4, 6, 7, 0, 1, 3, 5, /* 212*/
|
||||
0, 2, 4, 6, 7, 1, 3, 5, /* 213*/
|
||||
1, 2, 4, 6, 7, 0, 3, 5, /* 214*/
|
||||
0, 1, 2, 4, 6, 7, 3, 5, /* 215*/
|
||||
3, 4, 6, 7, 0, 1, 2, 5, /* 216*/
|
||||
0, 3, 4, 6, 7, 1, 2, 5, /* 217*/
|
||||
1, 3, 4, 6, 7, 0, 2, 5, /* 218*/
|
||||
0, 1, 3, 4, 6, 7, 2, 5, /* 219*/
|
||||
2, 3, 4, 6, 7, 0, 1, 5, /* 220*/
|
||||
0, 2, 3, 4, 6, 7, 1, 5, /* 221*/
|
||||
1, 2, 3, 4, 6, 7, 0, 5, /* 222*/
|
||||
0, 1, 2, 3, 4, 6, 7, 5, /* 223*/
|
||||
5, 6, 7, 0, 1, 2, 3, 4, /* 224*/
|
||||
0, 5, 6, 7, 1, 2, 3, 4, /* 225*/
|
||||
1, 5, 6, 7, 0, 2, 3, 4, /* 226*/
|
||||
0, 1, 5, 6, 7, 2, 3, 4, /* 227*/
|
||||
2, 5, 6, 7, 0, 1, 3, 4, /* 228*/
|
||||
0, 2, 5, 6, 7, 1, 3, 4, /* 229*/
|
||||
1, 2, 5, 6, 7, 0, 3, 4, /* 230*/
|
||||
0, 1, 2, 5, 6, 7, 3, 4, /* 231*/
|
||||
3, 5, 6, 7, 0, 1, 2, 4, /* 232*/
|
||||
0, 3, 5, 6, 7, 1, 2, 4, /* 233*/
|
||||
1, 3, 5, 6, 7, 0, 2, 4, /* 234*/
|
||||
0, 1, 3, 5, 6, 7, 2, 4, /* 235*/
|
||||
2, 3, 5, 6, 7, 0, 1, 4, /* 236*/
|
||||
0, 2, 3, 5, 6, 7, 1, 4, /* 237*/
|
||||
1, 2, 3, 5, 6, 7, 0, 4, /* 238*/
|
||||
0, 1, 2, 3, 5, 6, 7, 4, /* 239*/
|
||||
4, 5, 6, 7, 0, 1, 2, 3, /* 240*/
|
||||
0, 4, 5, 6, 7, 1, 2, 3, /* 241*/
|
||||
1, 4, 5, 6, 7, 0, 2, 3, /* 242*/
|
||||
0, 1, 4, 5, 6, 7, 2, 3, /* 243*/
|
||||
2, 4, 5, 6, 7, 0, 1, 3, /* 244*/
|
||||
0, 2, 4, 5, 6, 7, 1, 3, /* 245*/
|
||||
1, 2, 4, 5, 6, 7, 0, 3, /* 246*/
|
||||
0, 1, 2, 4, 5, 6, 7, 3, /* 247*/
|
||||
3, 4, 5, 6, 7, 0, 1, 2, /* 248*/
|
||||
0, 3, 4, 5, 6, 7, 1, 2, /* 249*/
|
||||
1, 3, 4, 5, 6, 7, 0, 2, /* 250*/
|
||||
0, 1, 3, 4, 5, 6, 7, 2, /* 251*/
|
||||
2, 3, 4, 5, 6, 7, 0, 1, /* 252*/
|
||||
0, 2, 3, 4, 5, 6, 7, 1, /* 253*/
|
||||
1, 2, 3, 4, 5, 6, 7, 0, /* 254*/
|
||||
0, 1, 2, 3, 4, 5, 6, 7, /* 255*/
|
||||
};
|
||||
|
||||
static inline __m256i _mm256_cmplt_epi32(__m256i a, __m256i b) {
|
||||
return _mm256_cmpgt_epi32(b, a);
|
||||
}
|
||||
|
||||
|
||||
static void avx_natenodutch_partition_epi32(int32_t *array, const int32_t pivot,
|
||||
int &left, int &right) {
|
||||
const __m256i P = _mm256_set1_epi32(pivot);
|
||||
const int valuespervector = sizeof(__m256i) / sizeof(int32_t);
|
||||
if (right - left + 1 <
|
||||
4 * valuespervector) { // not enough space for nate's algo to make sense, falling back
|
||||
scalar_partition(array, pivot, left, right);
|
||||
return;
|
||||
}
|
||||
int readleft = left + valuespervector;
|
||||
int readright = right - valuespervector;
|
||||
int32_t buffer[2 * valuespervector]; // tmp buffer
|
||||
memcpy(buffer, array + left, valuespervector * sizeof(int32_t));
|
||||
memcpy(buffer + valuespervector, array + right - valuespervector + 1,
|
||||
valuespervector * sizeof(int32_t));
|
||||
while (readright - readleft >= valuespervector) {
|
||||
__m256i *nextVectorPtr;
|
||||
if ((readleft - left) > (right - readright)) {
|
||||
nextVectorPtr = (__m256i *)(array + readright - valuespervector + 1);
|
||||
readright -= valuespervector;
|
||||
} else {
|
||||
nextVectorPtr = (__m256i *)(array + readleft);
|
||||
readleft += valuespervector;
|
||||
}
|
||||
__m256i currentVector = _mm256_loadu_si256(nextVectorPtr);
|
||||
/******
|
||||
* we need two comparisons if we are to avoid the Dutch national flag
|
||||
* problem,
|
||||
* Reference: https://en.wikipedia.org/wiki/Dutch_national_flag_problem
|
||||
*/
|
||||
int greaterthanpivot =
|
||||
_mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(currentVector, P));
|
||||
int lesserthanpivot =
|
||||
_mm256_movemask_ps((__m256)_mm256_cmplt_epi32(currentVector, P));
|
||||
__m256i greaterthanpivot_permvector = _mm256_load_si256(
|
||||
(__m256i *)(reverseshufflemask + 8 * greaterthanpivot));
|
||||
__m256i lesserthanpivot_permvector =
|
||||
_mm256_load_si256((__m256i *)(shufflemask + 8 * lesserthanpivot));
|
||||
|
||||
int count_greaterthanpivot = _mm_popcnt_u32(greaterthanpivot);
|
||||
int count_lesserthanpivot = _mm_popcnt_u32(lesserthanpivot);
|
||||
__m256i greaterthanpivot_vector =
|
||||
_mm256_permutevar8x32_epi32(currentVector, greaterthanpivot_permvector);
|
||||
__m256i lesserthanpivot_vector =
|
||||
_mm256_permutevar8x32_epi32(currentVector, lesserthanpivot_permvector);
|
||||
_mm256_storeu_si256((__m256i *)(array + left), lesserthanpivot_vector);
|
||||
|
||||
_mm256_storeu_si256((__m256i *)(array + right - valuespervector + 1),
|
||||
greaterthanpivot_vector);
|
||||
|
||||
left += count_lesserthanpivot;
|
||||
right -= count_greaterthanpivot;
|
||||
}
|
||||
|
||||
int howmanyleft = readleft - left;
|
||||
if (howmanyleft > 2 * valuespervector) {
|
||||
howmanyleft = 2 * valuespervector;
|
||||
for (int k = howmanyleft; k < readleft - left; ++k)
|
||||
array[left + k] = pivot;
|
||||
}
|
||||
memcpy(array + left, buffer, howmanyleft * sizeof(int32_t));
|
||||
|
||||
int howmanyright = right - readright;
|
||||
if (howmanyright + howmanyleft > 2 * valuespervector) {
|
||||
howmanyright = 2 * valuespervector - howmanyleft;
|
||||
for (int k = howmanyright; k < right - readright; ++k)
|
||||
array[readright + 1 + k] = pivot;
|
||||
}
|
||||
|
||||
memcpy(array + readright + 1, buffer + howmanyleft,
|
||||
howmanyright * sizeof(int32_t));
|
||||
scalar_partition(array, pivot, left, right);
|
||||
}
|
||||
|
||||
|
||||
// it is really signed int sorting, but the API seems to expect uint32_t...
|
||||
void avx_natenodutch_quicksort(uint32_t *array, int left, int right) {
|
||||
|
||||
int i = left;
|
||||
int j = right;
|
||||
const int32_t pivot = array[(i + j) / 2];
|
||||
|
||||
avx_natenodutch_partition_epi32((int32_t *) array, pivot, i, j);
|
||||
|
||||
if (left < j) {
|
||||
avx_natenodutch_quicksort(array, left, j);
|
||||
}
|
||||
|
||||
if (i < right) {
|
||||
avx_natenodutch_quicksort(array, i, right);
|
||||
}
|
||||
}
|
||||
197
simd-sort/avx2-partition.cpp
Normal file
197
simd-sort/avx2-partition.cpp
Normal file
@ -0,0 +1,197 @@
|
||||
#include <x86intrin.h>
|
||||
#include <cstdint>
|
||||
|
||||
namespace qs {
|
||||
|
||||
namespace avx2 {
|
||||
|
||||
__m256i FORCE_INLINE bitmask_to_bytemask_epi32(uint8_t bm) {
|
||||
|
||||
const __m256i mask = _mm256_set1_epi32(bm);
|
||||
const __m256i bits = _mm256_setr_epi32(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
|
||||
const __m256i tmp = _mm256_and_si256(mask, bits);
|
||||
|
||||
return _mm256_cmpeq_epi32(tmp, bits);
|
||||
}
|
||||
|
||||
|
||||
void FORCE_INLINE align_masks(uint8_t& a, uint8_t& b, uint8_t& rem_a, uint8_t& rem_b, __m256i& shuffle_a, __m256i& shuffle_b) {
|
||||
|
||||
assert(a != 0);
|
||||
assert(b != 0);
|
||||
|
||||
uint8_t tmpA = a;
|
||||
uint8_t tmpB = b;
|
||||
|
||||
uint32_t __attribute__((__aligned__(32))) tmpshufa[8];
|
||||
uint32_t __attribute__((__aligned__(32))) tmpshufb[8];
|
||||
|
||||
while (tmpA != 0 && tmpB != 0) {
|
||||
int idx_a = __builtin_ctz(tmpA);
|
||||
int idx_b = __builtin_ctz(tmpB);
|
||||
|
||||
tmpA = tmpA & (tmpA - 1);
|
||||
tmpB = tmpB & (tmpB - 1);
|
||||
|
||||
tmpshufb[idx_a] = idx_b;
|
||||
tmpshufa[idx_b] = idx_a;
|
||||
}
|
||||
|
||||
a = a ^ tmpA;
|
||||
b = b ^ tmpB;
|
||||
|
||||
assert(a != 0);
|
||||
assert(b != 0);
|
||||
assert(_mm_popcnt_u64(a) == _mm_popcnt_u64(b));
|
||||
|
||||
rem_a = tmpA;
|
||||
rem_b = tmpB;
|
||||
|
||||
shuffle_a = _mm256_load_si256((__m256i*)tmpshufa);
|
||||
shuffle_b = _mm256_load_si256((__m256i*)tmpshufb);
|
||||
}
|
||||
|
||||
|
||||
__m256i FORCE_INLINE merge(const __m256i mask, const __m256i a, const __m256i b) {
|
||||
|
||||
return _mm256_or_si256(
|
||||
_mm256_and_si256(mask, a),
|
||||
_mm256_andnot_si256(mask, b)
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
void FORCE_INLINE swap_epi32(
|
||||
__m256i& a, __m256i& b,
|
||||
uint8_t mask_a, const __m256i shuffle_a,
|
||||
uint8_t mask_b, const __m256i shuffle_b) {
|
||||
|
||||
const __m256i to_swap_b = _mm256_permutevar8x32_epi32(a, shuffle_a);
|
||||
const __m256i to_swap_a = _mm256_permutevar8x32_epi32(b, shuffle_b);
|
||||
const __m256i ma = bitmask_to_bytemask_epi32(mask_a);
|
||||
const __m256i mb = bitmask_to_bytemask_epi32(mask_b);
|
||||
|
||||
a = merge(ma, to_swap_a, a);
|
||||
b = merge(mb, to_swap_b, b);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#define _mm256_iszero(vec) (_mm256_testz_si256(vec, vec) != 0)
|
||||
|
||||
void FORCE_INLINE partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) {
|
||||
|
||||
const int N = 8; // the number of items in a register (256/32)
|
||||
|
||||
__m256i L;
|
||||
__m256i R;
|
||||
uint8_t maskL = 0;
|
||||
uint8_t maskR = 0;
|
||||
|
||||
const __m256i pivot = _mm256_set1_epi32(pv);
|
||||
|
||||
int origL = left;
|
||||
int origR = right;
|
||||
|
||||
while (true) {
|
||||
|
||||
if (maskL == 0) {
|
||||
while (true) {
|
||||
if (right - (left + N) + 1 < 2*N) {
|
||||
goto end;
|
||||
}
|
||||
|
||||
L = _mm256_loadu_si256((__m256i*)(array + left));
|
||||
const __m256i bytemask = _mm256_cmpgt_epi32(pivot, L);
|
||||
|
||||
if (_mm256_testc_ps((__m256)bytemask, (__m256)_mm256_set1_epi32(-1))) {
|
||||
left += N;
|
||||
} else {
|
||||
maskL = ~_mm256_movemask_ps((__m256)bytemask);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (maskR == 0) {
|
||||
while (true) {
|
||||
if ((right - N) - left + 1 < 2*N) {
|
||||
goto end;
|
||||
}
|
||||
|
||||
R = _mm256_loadu_si256((__m256i*)(array + right - N + 1));
|
||||
const __m256i bytemask = _mm256_cmpgt_epi32(pivot, R);
|
||||
if (_mm256_iszero(bytemask)) {
|
||||
right -= N;
|
||||
} else {
|
||||
maskR = _mm256_movemask_ps((__m256)bytemask);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
assert(left <= right);
|
||||
assert(maskL != 0);
|
||||
assert(maskR != 0);
|
||||
|
||||
uint8_t mL;
|
||||
uint8_t mR;
|
||||
__m256i shuffleL;
|
||||
__m256i shuffleR;
|
||||
|
||||
align_masks(maskL, maskR, mL, mR, shuffleL, shuffleR);
|
||||
swap_epi32(L, R, maskL, shuffleL, maskR, shuffleR);
|
||||
|
||||
maskL = mL;
|
||||
maskR = mR;
|
||||
|
||||
if (maskL == 0) {
|
||||
_mm256_storeu_si256((__m256i*)(array + left), L);
|
||||
left += N;
|
||||
}
|
||||
|
||||
if (maskR == 0) {
|
||||
_mm256_storeu_si256((__m256i*)(array + right - N + 1), R);
|
||||
right -= N;
|
||||
}
|
||||
|
||||
} // while
|
||||
|
||||
end:
|
||||
|
||||
assert(!(maskL != 0 && maskR != 0));
|
||||
|
||||
if (maskL != 0) {
|
||||
_mm256_storeu_si256((__m256i*)(array + left), L);
|
||||
} else if (maskR != 0) {
|
||||
_mm256_storeu_si256((__m256i*)(array + right - N + 1), R);
|
||||
}
|
||||
|
||||
if (left < right) {
|
||||
int less = 0;
|
||||
int greater = 0;
|
||||
const int all = right - left + 1;
|
||||
|
||||
for (int i=left; i <= right; i++) {
|
||||
less += int(array[i] < pv);
|
||||
greater += int(array[i] > pv);
|
||||
}
|
||||
|
||||
if (all == less) {
|
||||
// all elements in range [left, right] less than pivot
|
||||
scalar_partition_epi32(array, pv, origL, left);
|
||||
} else if (all == greater) {
|
||||
// all elements in range [left, right] greater than pivot
|
||||
scalar_partition_epi32(array, pv, left, origR);
|
||||
} else {
|
||||
scalar_partition_epi32(array, pv, left, right);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace avx2
|
||||
|
||||
} // namespace qs
|
||||
|
||||
32
simd-sort/avx2-quicksort.cpp
Normal file
32
simd-sort/avx2-quicksort.cpp
Normal file
@ -0,0 +1,32 @@
|
||||
#include "avx2-partition.cpp"
|
||||
|
||||
namespace qs {
|
||||
|
||||
namespace avx2 {
|
||||
|
||||
void quicksort(uint32_t* array, int left, int right) {
|
||||
|
||||
int i = left;
|
||||
int j = right;
|
||||
|
||||
const uint32_t pivot = array[(i + j)/2];
|
||||
const int AVX2_REGISTER_SIZE = 8; // in 32-bit words
|
||||
|
||||
if (j - i >= 2 * AVX2_REGISTER_SIZE) {
|
||||
qs::avx2::partition_epi32(array, pivot, i, j);
|
||||
} else {
|
||||
scalar_partition_epi32(array, pivot, i, j);
|
||||
}
|
||||
|
||||
if (left < j) {
|
||||
quicksort(array, left, j);
|
||||
}
|
||||
|
||||
if (i < right) {
|
||||
quicksort(array, i, right);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace avx2
|
||||
|
||||
} // namespace qs
|
||||
90
simd-sort/avx512-auxbuffer-partition.cpp
Normal file
90
simd-sort/avx512-auxbuffer-partition.cpp
Normal file
@ -0,0 +1,90 @@
|
||||
namespace qs {
|
||||
|
||||
namespace avx512 {
|
||||
|
||||
void memset_epi32(uint32_t* array, uint32_t w, size_t n) {
|
||||
|
||||
const int N = 16;
|
||||
const __m512i word = _mm512_set1_epi32(w);
|
||||
|
||||
for (size_t i=0; i < n/N; i++) {
|
||||
_mm512_storeu_si512(array + i*N, word);
|
||||
}
|
||||
|
||||
for (size_t i=n/N * N; i < n; i++) {
|
||||
array[i] = w;
|
||||
}
|
||||
}
|
||||
|
||||
void memcpy_epi32(uint32_t* dst, uint32_t* src, size_t n) {
|
||||
|
||||
const int N = 16;
|
||||
|
||||
for (size_t i=0; i < n/N; i++) {
|
||||
_mm512_storeu_si512(dst + i*N, _mm512_loadu_si512(src + i*N));
|
||||
}
|
||||
|
||||
for (size_t i=n/N * N; i < n; i++) {
|
||||
dst[i] = src[i];
|
||||
}
|
||||
}
|
||||
|
||||
// parition array[0..n-1]
|
||||
uint32_t FORCE_INLINE partition_auxbuffer_epi32(uint32_t* array, size_t n, uint32_t pv) {
|
||||
|
||||
const int N = 16;
|
||||
const int AUX_COUNT = 1024; // 4kB
|
||||
|
||||
static uint32_t gt_buf[AUX_COUNT + N];
|
||||
|
||||
size_t lt_count = 0;
|
||||
size_t gt_count = 0;
|
||||
|
||||
const __m512i pivot = _mm512_set1_epi32(pv);
|
||||
|
||||
// 1. copy greater and less values into separate buffers
|
||||
for (size_t i=0; i < n / N; i++) {
|
||||
|
||||
const __m512i v = _mm512_loadu_si512(array + i*N);
|
||||
|
||||
const __mmask16 lt = _mm512_cmplt_epi32_mask(v, pivot);
|
||||
const __mmask16 gt = _mm512_cmpgt_epi32_mask(v, pivot);
|
||||
|
||||
const __m512i less = _mm512_maskz_compress_epi32(lt, v);
|
||||
const __m512i greater = _mm512_maskz_compress_epi32(gt, v);
|
||||
|
||||
_mm512_storeu_si512(array + lt_count, less);
|
||||
_mm512_storeu_si512(gt_buf + gt_count, greater);
|
||||
|
||||
lt_count += _mm_popcnt_u32(lt);
|
||||
gt_count += _mm_popcnt_u32(gt);
|
||||
}
|
||||
|
||||
for (size_t i=0; i < n % N; i++) {
|
||||
|
||||
const uint32_t v = array[(n/N) * N + i];
|
||||
|
||||
if (v < pv) {
|
||||
array[lt_count++] = v;
|
||||
} else if (v > pv) {
|
||||
gt_buf[gt_count++] = v;
|
||||
}
|
||||
}
|
||||
|
||||
const size_t eq_count = n - (lt_count + gt_count);
|
||||
|
||||
// 2. replace array with partially ordered data
|
||||
|
||||
// 2.a. pivots
|
||||
memset_epi32(array + lt_count, pv, eq_count);
|
||||
|
||||
// 2.b. all values greater than pivot
|
||||
memcpy_epi32(array + lt_count + eq_count, gt_buf, gt_count);
|
||||
|
||||
// 3. index before the first pivot
|
||||
return lt_count;
|
||||
}
|
||||
|
||||
} // namespace avx512
|
||||
|
||||
} // namespace qa
|
||||
105
simd-sort/avx512-bmi2-partition.cpp
Normal file
105
simd-sort/avx512-bmi2-partition.cpp
Normal file
@ -0,0 +1,105 @@
|
||||
namespace qs {
|
||||
|
||||
namespace avx512 {
|
||||
|
||||
void FORCE_INLINE bmi2_align_masks(__mmask16& a, __mmask16& b, __mmask16& rem_a, __mmask16& rem_b) {
|
||||
|
||||
uint32_t tmpA = _pext_u32(0xffffffff, a);
|
||||
uint32_t tmpB = _pext_u32(0xffffffff, b);
|
||||
|
||||
uint32_t tmp = tmpA & tmpB;
|
||||
|
||||
__mmask16 new_a = _pdep_u32(tmp, a);
|
||||
__mmask16 new_b = _pdep_u32(tmp, a);
|
||||
|
||||
rem_a = a ^ new_a;
|
||||
rem_b = b ^ new_b;
|
||||
|
||||
a = new_a;
|
||||
b = new_b;
|
||||
|
||||
assert(a != 0);
|
||||
assert(b != 0);
|
||||
assert(_mm_popcnt_u64(a) == _mm_popcnt_u64(b));
|
||||
}
|
||||
|
||||
|
||||
void FORCE_INLINE bmi2_partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) {
|
||||
|
||||
const int N = 16;
|
||||
|
||||
__m512i L;
|
||||
__m512i R;
|
||||
__mmask16 maskL = 0;
|
||||
__mmask16 maskR = 0;
|
||||
const __m512i pivot = _mm512_set1_epi32(pv);
|
||||
|
||||
while (right - left + 1 >= 2*N) {
|
||||
|
||||
while (maskL == 0) {
|
||||
if (right - (left + N) + 1 < 2*N) {
|
||||
goto end;
|
||||
}
|
||||
|
||||
L = _mm512_loadu_si512(array + left);
|
||||
maskL = _mm512_cmpge_epi32_mask(L, pivot);
|
||||
if (maskL == 0) {
|
||||
left += N;
|
||||
}
|
||||
}
|
||||
|
||||
while (maskR == 0) {
|
||||
if ((right - N) - left + 1 < 2*N) {
|
||||
goto end;
|
||||
}
|
||||
|
||||
R = _mm512_loadu_si512(array + right - N + 1);
|
||||
maskR = _mm512_cmple_epi32_mask(R, pivot);
|
||||
if (maskR == 0) {
|
||||
right -= N;
|
||||
}
|
||||
}
|
||||
|
||||
assert(left <= right);
|
||||
assert(maskL != 0);
|
||||
assert(maskR != 0);
|
||||
|
||||
__mmask16 mL;
|
||||
__mmask16 mR;
|
||||
|
||||
bmi2_align_masks(maskL, maskR, mL, mR);
|
||||
swap_epi32(L, R, maskL, maskR);
|
||||
|
||||
maskL = mL;
|
||||
maskR = mR;
|
||||
|
||||
if (maskL == 0) {
|
||||
_mm512_storeu_si512(array + left, L);
|
||||
left += N;
|
||||
}
|
||||
|
||||
if (maskR == 0) {
|
||||
_mm512_storeu_si512(array + right - N + 1, R);
|
||||
right -= N;
|
||||
}
|
||||
|
||||
} // while
|
||||
|
||||
end:
|
||||
|
||||
if (maskL != 0) {
|
||||
_mm512_storeu_si512(array + left, L);
|
||||
}
|
||||
|
||||
if (maskR != 0) {
|
||||
_mm512_storeu_si512(array + right - N + 1, R);
|
||||
}
|
||||
|
||||
if (left < right) {
|
||||
partition_epi32(array, pv, left, right);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace avx512
|
||||
|
||||
} // namespace qs
|
||||
58
simd-sort/avx512-partition-register.cpp
Normal file
58
simd-sort/avx512-partition-register.cpp
Normal file
@ -0,0 +1,58 @@
|
||||
namespace qs {
|
||||
|
||||
namespace avx512 {
|
||||
|
||||
__mmask16 FORCE_INLINE get_range_mask(uint32_t position, uint32_t length) {
|
||||
|
||||
return (uint32_t(1) << (position + length)) - (uint32_t(1) << position);
|
||||
}
|
||||
|
||||
/*
|
||||
r stores 1 to 16 elements to partition, mask select leading element
|
||||
*/
|
||||
void partition_register(__m512i& r, const __m512i pivot, __mmask16 mask, int& left, int& right) {
|
||||
|
||||
// example:
|
||||
|
||||
// r = [ 1, 2, 3, 100, 200, 300, 4, 5, 50, 900, -1, -2, -3, -4, -5, -6]
|
||||
// mask = 0x03ff -- first 10 elements are subject to parition
|
||||
// pivot = packed_dword(50)
|
||||
|
||||
const __mmask16 less_mask = _mm512_mask_cmplt_epi32_mask(mask, r, pivot);
|
||||
const __mmask16 equal_mask = _mm512_mask_cmpeq_epi32_mask(mask, r, pivot);
|
||||
|
||||
// less = [1, 2, 3, 4, 5, ... rest are **pivots**]
|
||||
const __m512i less_equal = _mm512_mask_compress_epi32(pivot, less_mask, r);
|
||||
|
||||
// greater = [100, 200, 300, 900, ... rest are zeros]
|
||||
const __m512i greater = _mm512_maskz_compress_epi32(~(less_mask | equal_mask) & mask, r);
|
||||
|
||||
// less_cnt = 5
|
||||
// less_equal_cnt = 6
|
||||
const int less_cnt = _mm_popcnt_u32(less_mask);
|
||||
const int less_equal_cnt = _mm_popcnt_u32(less_mask | equal_mask);
|
||||
|
||||
const __mmask16 store_less_equal = get_range_mask(0, less_equal_cnt);
|
||||
const __mmask16 store_greater = ~store_less_equal & mask;
|
||||
|
||||
// merge less or eqaul with input
|
||||
//
|
||||
// r = [ 1, 2, 3, 4, 5, 50, 4, 5, 50, 900, -1, -2, -3, -4, -5, -6]
|
||||
// ^ ^ ^ ^ ^ ^^
|
||||
// merged
|
||||
r = _mm512_mask_mov_epi32(r, store_less_equal, less_equal);
|
||||
|
||||
// merge greater than pivot
|
||||
//
|
||||
// r = [ 1, 2, 3, 4, 5, 50, 100, 200, 300, 900, -1, -2, -3, -4, -5, -6]
|
||||
// ^^^ ^^^ ^^^ ^^^
|
||||
// merged
|
||||
r = _mm512_mask_expand_epi32(r, store_greater, greater);
|
||||
|
||||
right = left + less_cnt;
|
||||
left = left + less_equal_cnt;
|
||||
}
|
||||
|
||||
} // namespace avx512
|
||||
|
||||
} // namespace qs
|
||||
129
simd-sort/avx512-partition.cpp
Normal file
129
simd-sort/avx512-partition.cpp
Normal file
@ -0,0 +1,129 @@
|
||||
//#define USE_AVX512_PIVOT
|
||||
|
||||
namespace qs {
|
||||
|
||||
namespace avx512 {
|
||||
|
||||
void FORCE_INLINE align_masks(__mmask16& a, __mmask16& b, __mmask16& rem_a, __mmask16& rem_b) {
|
||||
|
||||
assert(a != 0);
|
||||
assert(b != 0);
|
||||
|
||||
uint16_t tmpA = a;
|
||||
uint16_t tmpB = b;
|
||||
|
||||
while (tmpA != 0 && tmpB != 0) {
|
||||
tmpA = tmpA & (tmpA - 1);
|
||||
tmpB = tmpB & (tmpB - 1);
|
||||
}
|
||||
|
||||
a = a ^ tmpA;
|
||||
b = b ^ tmpB;
|
||||
|
||||
assert(a != 0);
|
||||
assert(b != 0);
|
||||
assert(_mm_popcnt_u64(a) == _mm_popcnt_u64(b));
|
||||
|
||||
rem_a = tmpA;
|
||||
rem_b = tmpB;
|
||||
}
|
||||
|
||||
|
||||
void FORCE_INLINE partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) {
|
||||
|
||||
const int N = 16;
|
||||
|
||||
__m512i L;
|
||||
__m512i R;
|
||||
__mmask16 maskL = 0;
|
||||
__mmask16 maskR = 0;
|
||||
const __m512i pivot = _mm512_set1_epi32(pv);
|
||||
|
||||
int count = (right - left + 1)/N;
|
||||
|
||||
while (true) {
|
||||
|
||||
while (maskL == 0) {
|
||||
if (count <= 1) {
|
||||
goto end;
|
||||
}
|
||||
|
||||
L = _mm512_loadu_si512(array + left);
|
||||
maskL = _mm512_cmpge_epi32_mask(L, pivot);
|
||||
if (maskL == 0) {
|
||||
left += N;
|
||||
count -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
while (maskR == 0) {
|
||||
if (count <= 1) {
|
||||
goto end;
|
||||
}
|
||||
|
||||
R = _mm512_loadu_si512(array + right - N + 1);
|
||||
maskR = _mm512_cmple_epi32_mask(R, pivot);
|
||||
if (maskR == 0) {
|
||||
right -= N;
|
||||
count -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
assert(left <= right);
|
||||
assert(maskL != 0);
|
||||
assert(maskR != 0);
|
||||
|
||||
__mmask16 mL;
|
||||
__mmask16 mR;
|
||||
|
||||
align_masks(maskL, maskR, mL, mR);
|
||||
swap_epi32(L, R, maskL, maskR);
|
||||
|
||||
maskL = mL;
|
||||
maskR = mR;
|
||||
|
||||
if (maskL == 0) {
|
||||
_mm512_storeu_si512(array + left, L);
|
||||
left += N;
|
||||
count -= 1;
|
||||
}
|
||||
|
||||
if (maskR == 0) {
|
||||
_mm512_storeu_si512(array + right - N + 1, R);
|
||||
right -= N;
|
||||
count -= 1;
|
||||
}
|
||||
|
||||
} // while
|
||||
|
||||
end:
|
||||
|
||||
assert(!(maskL != 0 && maskR != 0));
|
||||
|
||||
if (maskL != 0) {
|
||||
_mm512_storeu_si512(array + left, L);
|
||||
} else if (maskR != 0) {
|
||||
_mm512_storeu_si512(array + right - N + 1, R);
|
||||
}
|
||||
|
||||
if (left < right) {
|
||||
#if USE_AVX512_PIVOT
|
||||
scalar_partition_epi32(array, pv, left, right);
|
||||
#else
|
||||
const int size = right - left + 1;
|
||||
if (size > N) {
|
||||
scalar_partition_epi32(array, pv, left, right);
|
||||
} else {
|
||||
const int k = left;
|
||||
__mmask16 mask = (uint32_t(1) << size) - 1;
|
||||
__m512i v = _mm512_loadu_si512(array + k);
|
||||
partition_register(v, pivot, mask, left, right);
|
||||
_mm512_storeu_si512(array + k, v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace avx512
|
||||
|
||||
} // namespace qa
|
||||
119
simd-sort/avx512-popcnt-partition.cpp
Normal file
119
simd-sort/avx512-popcnt-partition.cpp
Normal file
@ -0,0 +1,119 @@
|
||||
namespace qs {
|
||||
|
||||
namespace avx512 {
|
||||
|
||||
void FORCE_INLINE popcnt_partition_epi32(uint32_t* array, uint32_t pv, int& left, int& right) {
|
||||
|
||||
const int N = 16;
|
||||
|
||||
__m512i L;
|
||||
__m512i R;
|
||||
__mmask16 maskL = 0;
|
||||
__mmask16 maskR = 0;
|
||||
int popcntL;
|
||||
int popcntR;
|
||||
|
||||
const __m512i pivot = _mm512_set1_epi32(pv);
|
||||
|
||||
while (true) {
|
||||
|
||||
while (maskL == 0) {
|
||||
if (right - (left + N) + 1 < N) {
|
||||
goto end;
|
||||
}
|
||||
|
||||
L = _mm512_loadu_si512(array + left);
|
||||
maskL = _mm512_cmpge_epi32_mask(L, pivot);
|
||||
if (maskL == 0) {
|
||||
left += N;
|
||||
}
|
||||
|
||||
popcntL = _mm_popcnt_u32(maskL);
|
||||
}
|
||||
|
||||
while (maskR == 0) {
|
||||
if ((right - N) - left + 1 < N) {
|
||||
goto end;
|
||||
}
|
||||
|
||||
R = _mm512_loadu_si512(array + right - N + 1);
|
||||
maskR = _mm512_cmple_epi32_mask(R, pivot);
|
||||
if (maskR == 0) {
|
||||
right -= N;
|
||||
}
|
||||
|
||||
popcntR = _mm_popcnt_u32(maskR);
|
||||
}
|
||||
|
||||
assert(left <= right);
|
||||
assert(maskL != 0);
|
||||
assert(maskR != 0);
|
||||
|
||||
if (popcntL == popcntR) {
|
||||
// fast path
|
||||
swap_epi32(L, R, maskL, maskR);
|
||||
maskL = maskR = 0;
|
||||
|
||||
_mm512_storeu_si512(array + left, L);
|
||||
left += N;
|
||||
|
||||
_mm512_storeu_si512(array + right - N + 1, R);
|
||||
right -= N;
|
||||
} else {
|
||||
if (popcntL < popcntR) {
|
||||
int n = popcntR - popcntL;
|
||||
int k = n;
|
||||
|
||||
__mmask16 tmp = maskR;
|
||||
while (k--) {
|
||||
maskR = maskR & (maskR - 1);
|
||||
}
|
||||
|
||||
swap_epi32(L, R, maskL, maskR);
|
||||
maskR = tmp ^ maskR;
|
||||
|
||||
popcntR = n;
|
||||
|
||||
_mm512_storeu_si512(array + left, L);
|
||||
left += N;
|
||||
maskL = 0;
|
||||
} else {
|
||||
int n = popcntL - popcntR;
|
||||
int k = n;
|
||||
|
||||
__mmask16 tmp = maskL;
|
||||
while (k--) {
|
||||
maskL = maskL & (maskL - 1);
|
||||
}
|
||||
|
||||
swap_epi32(L, R, maskL, maskR);
|
||||
maskL = tmp ^ maskL;
|
||||
|
||||
popcntL = n;
|
||||
|
||||
_mm512_storeu_si512(array + right - N + 1, R);
|
||||
right -= N;
|
||||
maskR = 0;
|
||||
}
|
||||
}
|
||||
|
||||
} // while
|
||||
|
||||
end:
|
||||
|
||||
assert(!(maskL != 0 && maskR != 0));
|
||||
|
||||
if (maskL != 0) {
|
||||
_mm512_storeu_si512(array + left, L);
|
||||
} else if (maskR != 0) {
|
||||
_mm512_storeu_si512(array + right - N + 1, R);
|
||||
}
|
||||
|
||||
if (left < right) {
|
||||
scalar_partition_epi32(array, pv, left, right);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace avx512
|
||||
|
||||
} // namespace qs
|
||||
211
simd-sort/avx512-quicksort.cpp
Normal file
211
simd-sort/avx512-quicksort.cpp
Normal file
@ -0,0 +1,211 @@
|
||||
#include "avx512-swap.cpp"
|
||||
#include "avx512-partition-register.cpp"
|
||||
#include "avx512-partition.cpp"
|
||||
#include "avx512-popcnt-partition.cpp"
|
||||
#include "avx512-bmi2-partition.cpp"
|
||||
#include "avx512-auxbuffer-partition.cpp"
|
||||
#include "avx512-sort-register.cpp"
|
||||
|
||||
|
||||
namespace qs {
|
||||
|
||||
namespace avx512 {
|
||||
|
||||
|
||||
void dump(const char* name, const __m512i v) {
|
||||
uint32_t tmp[16];
|
||||
|
||||
printf("%-10s = [", name);
|
||||
_mm512_storeu_si512((__m512i*)tmp, v);
|
||||
for (int i=0; i < 16; i++) {
|
||||
if (i > 0) {
|
||||
putchar(' ');
|
||||
}
|
||||
|
||||
printf("%08x", tmp[i]);
|
||||
}
|
||||
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
void quicksort(uint32_t* array, int left, int right) {
|
||||
|
||||
int i = left;
|
||||
int j = right;
|
||||
|
||||
const int size = j - i + 1;
|
||||
if (size <= 16) {
|
||||
const __m512i in = _mm512_loadu_si512(array + i);
|
||||
__m512i result;
|
||||
|
||||
switch (size) {
|
||||
case 16: result = avx512_sort16_epi32(in); break;
|
||||
case 15: result = avx512_sort15_epi32(in); break;
|
||||
case 14: result = avx512_sort14_epi32(in); break;
|
||||
case 13: result = avx512_sort13_epi32(in); break;
|
||||
case 12: result = avx512_sort12_epi32(in); break;
|
||||
case 11: result = avx512_sort11_epi32(in); break;
|
||||
case 10: result = avx512_sort10_epi32(in); break;
|
||||
case 9: result = avx512_sort9_epi32(in); break;
|
||||
case 8: result = avx512_sort8_epi32(in); break;
|
||||
case 7: result = avx512_sort7_epi32(in); break;
|
||||
case 6: result = avx512_sort6_epi32(in); break;
|
||||
case 5: result = avx512_sort5_epi32(in); break;
|
||||
case 4: result = avx512_sort4_epi32(in); break;
|
||||
case 3: result = avx512_sort3_epi32(in); break;
|
||||
case 2: {
|
||||
if (array[i] > array[j]) {
|
||||
uint32_t t = array[i];
|
||||
array[i] = array[j];
|
||||
array[j] = t;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
case 1:
|
||||
case 0:
|
||||
return;
|
||||
|
||||
default:
|
||||
assert(false);
|
||||
break;
|
||||
}
|
||||
|
||||
_mm512_storeu_si512(array + i, result);
|
||||
return;
|
||||
}
|
||||
|
||||
const uint32_t pivot = array[left];
|
||||
const int AVX512_REGISTER_SIZE = 16; // in 32-bit words
|
||||
if (size >= 2 * AVX512_REGISTER_SIZE) {
|
||||
::qs::avx512::partition_epi32(array, pivot, i, j);
|
||||
} else {
|
||||
scalar_partition_epi32(array, pivot, i, j);
|
||||
}
|
||||
|
||||
if (left < j) {
|
||||
quicksort(array, left, j);
|
||||
}
|
||||
|
||||
if (i < right) {
|
||||
quicksort(array, i, right);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void popcnt_quicksort(uint32_t* array, int left, int right) {
|
||||
|
||||
int i = left;
|
||||
int j = right;
|
||||
const int size = j - i + 1;
|
||||
if (size <= 16) {
|
||||
const __m512i in = _mm512_loadu_si512(array + i);
|
||||
__m512i result;
|
||||
|
||||
switch (size) {
|
||||
case 16: result = avx512_sort16_epi32(in); break;
|
||||
case 15: result = avx512_sort15_epi32(in); break;
|
||||
case 14: result = avx512_sort14_epi32(in); break;
|
||||
case 13: result = avx512_sort13_epi32(in); break;
|
||||
case 12: result = avx512_sort12_epi32(in); break;
|
||||
case 11: result = avx512_sort11_epi32(in); break;
|
||||
case 10: result = avx512_sort10_epi32(in); break;
|
||||
case 9: result = avx512_sort9_epi32(in); break;
|
||||
case 8: result = avx512_sort8_epi32(in); break;
|
||||
case 7: result = avx512_sort7_epi32(in); break;
|
||||
case 6: result = avx512_sort6_epi32(in); break;
|
||||
case 5: result = avx512_sort5_epi32(in); break;
|
||||
case 4: result = avx512_sort4_epi32(in); break;
|
||||
case 3: result = avx512_sort3_epi32(in); break;
|
||||
case 2: {
|
||||
if (array[i] > array[j]) {
|
||||
uint32_t t = array[i];
|
||||
array[i] = array[j];
|
||||
array[j] = t;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
case 1:
|
||||
case 0:
|
||||
return;
|
||||
|
||||
default:
|
||||
assert(false);
|
||||
break;
|
||||
}
|
||||
|
||||
_mm512_storeu_si512(array + i, result);
|
||||
return;
|
||||
}
|
||||
|
||||
const uint32_t pivot = array[(i + j)/2];
|
||||
const int AVX512_REGISTER_SIZE = 16; // in 32-bit words
|
||||
if (size >= 2 * AVX512_REGISTER_SIZE) {
|
||||
::qs::avx512::popcnt_partition_epi32(array, pivot, i, j);
|
||||
} else {
|
||||
scalar_partition_epi32(array, pivot, i, j);
|
||||
}
|
||||
|
||||
if (left < j) {
|
||||
popcnt_quicksort(array, left, j);
|
||||
}
|
||||
|
||||
if (i < right) {
|
||||
popcnt_quicksort(array, i, right);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void bmi2_quicksort(uint32_t* array, int left, int right) {
|
||||
|
||||
int i = left;
|
||||
int j = right;
|
||||
|
||||
const uint32_t pivot = array[left];
|
||||
const int AVX512_REGISTER_SIZE = 16; // in 32-bit words
|
||||
|
||||
if (j - i >= 2 * AVX512_REGISTER_SIZE) {
|
||||
::qs::avx512::bmi2_partition_epi32(array, pivot, i, j);
|
||||
} else {
|
||||
scalar_partition_epi32(array, pivot, i, j);
|
||||
}
|
||||
|
||||
if (left < j) {
|
||||
bmi2_quicksort(array, left, j);
|
||||
}
|
||||
|
||||
if (i < right) {
|
||||
bmi2_quicksort(array, i, right);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void auxbuffer_quicksort(uint32_t* array, int lo, int hi) {
|
||||
|
||||
if (lo >= hi) {
|
||||
return;
|
||||
}
|
||||
|
||||
int p;
|
||||
#if 0
|
||||
p = lomuto_partition_epi32(array, lo, hi);
|
||||
#else
|
||||
const auto n = (hi - lo + 1);
|
||||
if (n > 1024) {
|
||||
p = lomuto_partition_epi32(array, lo, hi);
|
||||
} else if (n > 8) {
|
||||
const uint32_t pivot = array[(hi + lo)/2];
|
||||
p = ::qs::avx512::partition_auxbuffer_epi32(array + lo, n, pivot) + lo;
|
||||
} else {
|
||||
p = lomuto_partition_epi32(array, lo, hi);
|
||||
}
|
||||
#endif
|
||||
auxbuffer_quicksort(array, lo, p-1);
|
||||
auxbuffer_quicksort(array, p+1, hi);
|
||||
}
|
||||
|
||||
} // namespace avx512
|
||||
|
||||
} // namespace qs
|
||||
|
||||
323
simd-sort/avx512-sort-register.cpp
Normal file
323
simd-sort/avx512-sort-register.cpp
Normal file
@ -0,0 +1,323 @@
|
||||
|
||||
//#define POPCNT_LOOKUP
|
||||
|
||||
#ifdef POPCNT_LOOKUP
|
||||
|
||||
uint16_t lookup[65536];
|
||||
|
||||
// The lookup table replaces 2 x popcount and 2 x shift with 1 bit-or and 2 x memory fetch
|
||||
void prepare_lookup() {
|
||||
for (int i=0; i < 65536; i++) {
|
||||
lookup[i] = 1 << _mm_popcnt_u32(i);
|
||||
}
|
||||
}
|
||||
#define STEP(unused) { \
|
||||
const __m512i b = _mm512_permutexvar_epi32(index, v); \
|
||||
index = _mm512_add_epi32(index, incr); \
|
||||
const uint16_t lt = _mm512_mask_cmplt_epi32_mask(sort_mask, v, b); \
|
||||
const uint16_t eq = _mm512_mask_cmpeq_epi32_mask(sort_mask, v, b); \
|
||||
const uint16_t mask = lookup[lt | eq] - lookup[lt]; \
|
||||
result = _mm512_mask_mov_epi32(result, mask, b); \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define STEP(unused) { \
|
||||
const __m512i b = _mm512_permutexvar_epi32(index, v); \
|
||||
index = _mm512_add_epi32(index, incr); \
|
||||
const uint16_t lt = _mm_popcnt_u32(_mm512_mask_cmplt_epi32_mask(sort_mask, v, b)); \
|
||||
const uint16_t eq = _mm_popcnt_u32(_mm512_mask_cmpeq_epi32_mask(sort_mask, v, b)); \
|
||||
const uint16_t mask = (uint32_t(1) << (lt + eq)) - (uint32_t(1) << lt); \
|
||||
result = _mm512_mask_mov_epi32(result, mask, b); \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
__m512i avx512_sort16_epi32(const __m512i v) {
|
||||
|
||||
__m512i result = v;
|
||||
__m512i index = _mm512_setzero_si512();
|
||||
__m512i incr = _mm512_set1_epi32(1);
|
||||
|
||||
const uint16_t sort_mask = 0xffff;
|
||||
STEP(0);
|
||||
STEP(1);
|
||||
STEP(2);
|
||||
STEP(3);
|
||||
STEP(4);
|
||||
STEP(5);
|
||||
STEP(6);
|
||||
STEP(7);
|
||||
STEP(8);
|
||||
STEP(9);
|
||||
STEP(10);
|
||||
STEP(11);
|
||||
STEP(12);
|
||||
STEP(13);
|
||||
STEP(14);
|
||||
STEP(15);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
__m512i avx512_sort15_epi32(const __m512i v) {
|
||||
|
||||
__m512i result = v;
|
||||
__m512i index = _mm512_setzero_si512();
|
||||
__m512i incr = _mm512_set1_epi32(1);
|
||||
|
||||
const uint16_t sort_mask = 0x7fff;
|
||||
STEP(0);
|
||||
STEP(1);
|
||||
STEP(2);
|
||||
STEP(3);
|
||||
STEP(4);
|
||||
STEP(5);
|
||||
STEP(6);
|
||||
STEP(7);
|
||||
STEP(8);
|
||||
STEP(9);
|
||||
STEP(10);
|
||||
STEP(11);
|
||||
STEP(12);
|
||||
STEP(13);
|
||||
STEP(14);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
__m512i avx512_sort14_epi32(const __m512i v) {
|
||||
|
||||
__m512i result = v;
|
||||
__m512i index = _mm512_setzero_si512();
|
||||
__m512i incr = _mm512_set1_epi32(1);
|
||||
|
||||
const uint16_t sort_mask = 0x3fff;
|
||||
STEP(0);
|
||||
STEP(1);
|
||||
STEP(2);
|
||||
STEP(3);
|
||||
STEP(4);
|
||||
STEP(5);
|
||||
STEP(6);
|
||||
STEP(7);
|
||||
STEP(8);
|
||||
STEP(9);
|
||||
STEP(10);
|
||||
STEP(11);
|
||||
STEP(12);
|
||||
STEP(13);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
__m512i avx512_sort13_epi32(const __m512i v) {
|
||||
|
||||
__m512i result = v;
|
||||
__m512i index = _mm512_setzero_si512();
|
||||
__m512i incr = _mm512_set1_epi32(1);
|
||||
|
||||
const uint16_t sort_mask = 0x1fff;
|
||||
STEP(0);
|
||||
STEP(1);
|
||||
STEP(2);
|
||||
STEP(3);
|
||||
STEP(4);
|
||||
STEP(5);
|
||||
STEP(6);
|
||||
STEP(7);
|
||||
STEP(8);
|
||||
STEP(9);
|
||||
STEP(10);
|
||||
STEP(11);
|
||||
STEP(12);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
__m512i avx512_sort12_epi32(const __m512i v) {
|
||||
|
||||
__m512i result = v;
|
||||
__m512i index = _mm512_setzero_si512();
|
||||
__m512i incr = _mm512_set1_epi32(1);
|
||||
|
||||
const uint16_t sort_mask = 0x0fff;
|
||||
STEP(0);
|
||||
STEP(1);
|
||||
STEP(2);
|
||||
STEP(3);
|
||||
STEP(4);
|
||||
STEP(5);
|
||||
STEP(6);
|
||||
STEP(7);
|
||||
STEP(8);
|
||||
STEP(9);
|
||||
STEP(10);
|
||||
STEP(11);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
__m512i avx512_sort11_epi32(const __m512i v) {
|
||||
|
||||
__m512i result = v;
|
||||
__m512i index = _mm512_setzero_si512();
|
||||
__m512i incr = _mm512_set1_epi32(1);
|
||||
|
||||
const uint16_t sort_mask = 0x07ff;
|
||||
STEP(0);
|
||||
STEP(1);
|
||||
STEP(2);
|
||||
STEP(3);
|
||||
STEP(4);
|
||||
STEP(5);
|
||||
STEP(6);
|
||||
STEP(7);
|
||||
STEP(8);
|
||||
STEP(9);
|
||||
STEP(10);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
__m512i avx512_sort10_epi32(const __m512i v) {
|
||||
|
||||
__m512i result = v;
|
||||
__m512i index = _mm512_setzero_si512();
|
||||
__m512i incr = _mm512_set1_epi32(1);
|
||||
|
||||
const uint16_t sort_mask = 0x03ff;
|
||||
STEP(0);
|
||||
STEP(1);
|
||||
STEP(2);
|
||||
STEP(3);
|
||||
STEP(4);
|
||||
STEP(5);
|
||||
STEP(6);
|
||||
STEP(7);
|
||||
STEP(8);
|
||||
STEP(9);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
__m512i avx512_sort9_epi32(const __m512i v) {
|
||||
|
||||
__m512i result = v;
|
||||
__m512i index = _mm512_setzero_si512();
|
||||
__m512i incr = _mm512_set1_epi32(1);
|
||||
|
||||
const uint16_t sort_mask = 0x01ff;
|
||||
STEP(0);
|
||||
STEP(1);
|
||||
STEP(2);
|
||||
STEP(3);
|
||||
STEP(4);
|
||||
STEP(5);
|
||||
STEP(6);
|
||||
STEP(7);
|
||||
STEP(8);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
__m512i avx512_sort8_epi32(const __m512i v) {
|
||||
|
||||
__m512i result = v;
|
||||
__m512i index = _mm512_setzero_si512();
|
||||
__m512i incr = _mm512_set1_epi32(1);
|
||||
|
||||
const uint16_t sort_mask = 0x00ff;
|
||||
STEP(0);
|
||||
STEP(1);
|
||||
STEP(2);
|
||||
STEP(3);
|
||||
STEP(4);
|
||||
STEP(5);
|
||||
STEP(6);
|
||||
STEP(7);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
__m512i avx512_sort7_epi32(const __m512i v) {
|
||||
|
||||
__m512i result = v;
|
||||
__m512i index = _mm512_setzero_si512();
|
||||
__m512i incr = _mm512_set1_epi32(1);
|
||||
|
||||
const uint16_t sort_mask = 0x007f;
|
||||
STEP(0);
|
||||
STEP(1);
|
||||
STEP(2);
|
||||
STEP(3);
|
||||
STEP(4);
|
||||
STEP(5);
|
||||
STEP(6);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
__m512i avx512_sort6_epi32(const __m512i v) {
|
||||
|
||||
__m512i result = v;
|
||||
__m512i index = _mm512_setzero_si512();
|
||||
__m512i incr = _mm512_set1_epi32(1);
|
||||
|
||||
const uint16_t sort_mask = 0x003f;
|
||||
STEP(0);
|
||||
STEP(1);
|
||||
STEP(2);
|
||||
STEP(3);
|
||||
STEP(4);
|
||||
STEP(5);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
__m512i avx512_sort5_epi32(const __m512i v) {
|
||||
|
||||
__m512i result = v;
|
||||
__m512i index = _mm512_setzero_si512();
|
||||
__m512i incr = _mm512_set1_epi32(1);
|
||||
|
||||
const uint16_t sort_mask = 0x001f;
|
||||
STEP(0);
|
||||
STEP(1);
|
||||
STEP(2);
|
||||
STEP(3);
|
||||
STEP(4);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
__m512i avx512_sort4_epi32(const __m512i v) {
|
||||
|
||||
__m512i result = v;
|
||||
__m512i index = _mm512_setzero_si512();
|
||||
__m512i incr = _mm512_set1_epi32(1);
|
||||
|
||||
const uint16_t sort_mask = 0x000f;
|
||||
STEP(0);
|
||||
STEP(1);
|
||||
STEP(2);
|
||||
STEP(3);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
__m512i avx512_sort3_epi32(const __m512i v) {
|
||||
|
||||
__m512i result = v;
|
||||
__m512i index = _mm512_setzero_si512();
|
||||
__m512i incr = _mm512_set1_epi32(1);
|
||||
|
||||
const uint16_t sort_mask = 0x0007;
|
||||
STEP(0);
|
||||
STEP(1);
|
||||
STEP(2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
#undef STEP
|
||||
20
simd-sort/avx512-swap.cpp
Normal file
20
simd-sort/avx512-swap.cpp
Normal file
@ -0,0 +1,20 @@
|
||||
namespace qs {
|
||||
namespace avx512 {
|
||||
|
||||
void swap_epi32(__m512i& a, __m512i& b, __mmask16 mask_a, __mmask16 mask_b) {
|
||||
|
||||
assert(mask_a != 0);
|
||||
assert(mask_b != 0);
|
||||
assert(_mm_popcnt_u64(mask_a) == _mm_popcnt_u64(mask_b));
|
||||
|
||||
const __m512i to_swap_b = _mm512_mask_compress_epi32(_mm512_setzero_si512(), mask_a, a);
|
||||
const __m512i to_swap_a = _mm512_mask_compress_epi32(_mm512_setzero_si512(), mask_b, b);
|
||||
|
||||
a = _mm512_mask_expand_epi32(a, mask_a, to_swap_a);
|
||||
b = _mm512_mask_expand_epi32(b, mask_b, to_swap_b);
|
||||
}
|
||||
|
||||
|
||||
} // namespace avx512
|
||||
|
||||
} // namespace qa
|
||||
28
simd-sort/cmdline.cpp
Normal file
28
simd-sort/cmdline.cpp
Normal file
@ -0,0 +1,28 @@
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
|
||||
class CommandLine final {
|
||||
int argc;
|
||||
char** argv;
|
||||
|
||||
std::unordered_set<std::string> arguments;
|
||||
|
||||
public:
|
||||
CommandLine(int argc, char* argv[])
|
||||
: argc(argc)
|
||||
, argv(argv) {
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
arguments.insert(argv[i]);
|
||||
}
|
||||
}
|
||||
|
||||
bool has(const std::string& name) const {
|
||||
return arguments.count(name);
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
return arguments.empty();
|
||||
}
|
||||
};
|
||||
3
simd-sort/common.h
Normal file
3
simd-sort/common.h
Normal file
@ -0,0 +1,3 @@
|
||||
#pragma once
|
||||
|
||||
#define FORCE_INLINE inline __attribute__((always_inline))
|
||||
45
simd-sort/gather_results.sh
Executable file
45
simd-sort/gather_results.sh
Executable file
@ -0,0 +1,45 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [[ $1 == "avx2" ]]
|
||||
then
|
||||
SPEED=./speed_avx2
|
||||
if [[ ! -x $SPEED ]]
|
||||
then
|
||||
echo "AVX2 test program not found. Run make speed_avx2 to build it."
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
SPEED=./speed
|
||||
if [[ ! -x $SPEED ]]
|
||||
then
|
||||
echo "AVX512 test program not found. Run make speed to build it."
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
|
||||
$SPEED 10000 3 asc | tee -a results.txt
|
||||
$SPEED 100000 3 asc | tee -a results.txt
|
||||
$SPEED 1000000 3 asc | tee -a results.txt
|
||||
$SPEED 2000000 3 asc | tee -a results.txt
|
||||
$SPEED 5000000 3 asc | tee -a results.txt
|
||||
$SPEED 10000000 3 asc | tee -a results.txt
|
||||
$SPEED 20000000 3 asc | tee -a results.txt
|
||||
|
||||
$SPEED 10000 3 dsc | tee -a results.txt
|
||||
$SPEED 100000 3 dsc | tee -a results.txt
|
||||
$SPEED 1000000 3 dsc | tee -a results.txt
|
||||
$SPEED 2000000 3 dsc | tee -a results.txt
|
||||
$SPEED 5000000 3 dsc | tee -a results.txt
|
||||
$SPEED 10000000 3 dsc | tee -a results.txt
|
||||
$SPEED 20000000 3 dsc | tee -a results.txt
|
||||
|
||||
$SPEED 10000 3 rnd | tee -a results.txt
|
||||
$SPEED 100000 3 rnd | tee -a results.txt
|
||||
$SPEED 1000000 3 rnd | tee -a results.txt
|
||||
$SPEED 2000000 3 rnd | tee -a results.txt
|
||||
$SPEED 5000000 3 rnd | tee -a results.txt
|
||||
$SPEED 10000000 3 rnd | tee -a results.txt
|
||||
$SPEED 20000000 3 rnd | tee -a results.txt
|
||||
|
||||
echo "results.txt created"
|
||||
8
simd-sort/gettime.cpp
Normal file
8
simd-sort/gettime.cpp
Normal file
@ -0,0 +1,8 @@
|
||||
#include <time.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
uint32_t get_time() {
|
||||
struct timeval T;
|
||||
gettimeofday(&T, NULL);
|
||||
return (T.tv_sec * 1000000) + T.tv_usec;
|
||||
}
|
||||
106
simd-sort/input_data.cpp
Normal file
106
simd-sort/input_data.cpp
Normal file
@ -0,0 +1,106 @@
|
||||
class InputData {
|
||||
|
||||
protected:
|
||||
uint32_t* array;
|
||||
size_t n;
|
||||
|
||||
public:
|
||||
InputData(size_t count)
|
||||
: n(count) {
|
||||
|
||||
assert(n > 0);
|
||||
array = new uint32_t[n];
|
||||
}
|
||||
|
||||
~InputData() {
|
||||
delete[] array;
|
||||
}
|
||||
|
||||
public:
|
||||
uint32_t* pointer() {
|
||||
return array;
|
||||
}
|
||||
|
||||
size_t count() const {
|
||||
return n;
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
return n * sizeof(uint32_t);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
class InputAscending: public InputData {
|
||||
|
||||
using super = InputData;
|
||||
|
||||
public:
|
||||
InputAscending(size_t count) : super(count) {
|
||||
for (size_t i=0; i < n; i++) {
|
||||
array[i] = i;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
class InputDescending: public InputData {
|
||||
|
||||
using super = InputData;
|
||||
|
||||
public:
|
||||
InputDescending(size_t count) : super(count) {
|
||||
for (size_t i=0; i < n; i++) {
|
||||
array[i] = n - i + 1;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class InputRandomFew: public InputData {
|
||||
|
||||
using super = InputData;
|
||||
|
||||
public:
|
||||
InputRandomFew(size_t count) : super(count) {
|
||||
for (size_t i=0; i < n; i++) {
|
||||
array[i] = random() % 10;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class InputRandom: public InputData {
|
||||
|
||||
using super = InputData;
|
||||
|
||||
public:
|
||||
InputRandom(size_t count) : super(count) {
|
||||
for (size_t i=0; i < n; i++) {
|
||||
array[i] = random();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class InputRandomUnique: public InputData {
|
||||
|
||||
using super = InputData;
|
||||
|
||||
public:
|
||||
InputRandomUnique(size_t count) : super(count) {
|
||||
for (size_t i=0; i < n; i++) {
|
||||
array[i] = i;
|
||||
}
|
||||
|
||||
shuffle();
|
||||
}
|
||||
|
||||
private:
|
||||
void shuffle() {
|
||||
for (size_t i=0; i < n; i++) {
|
||||
size_t j = rand() % (n - i);
|
||||
|
||||
const uint32_t t = array[i];
|
||||
array[i] = array[j];
|
||||
array[j] = t;
|
||||
}
|
||||
}
|
||||
};
|
||||
51
simd-sort/partition.cpp
Normal file
51
simd-sort/partition.cpp
Normal file
@ -0,0 +1,51 @@
|
||||
void scalar_partition_epi32(uint32_t* array, const uint32_t pivot, int& left, int& right) {
|
||||
|
||||
while (left <= right) {
|
||||
|
||||
while (array[left] < pivot) {
|
||||
left += 1;
|
||||
}
|
||||
|
||||
while (array[right] > pivot) {
|
||||
right -= 1;
|
||||
}
|
||||
|
||||
if (left <= right) {
|
||||
const uint32_t t = array[left];
|
||||
array[left] = array[right];
|
||||
array[right] = t;
|
||||
|
||||
left += 1;
|
||||
right -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int lomuto_partition_epi32(uint32_t* array, int lo, int hi) {
|
||||
|
||||
const uint32_t pivot = array[(lo + hi)/2];
|
||||
const uint32_t hi_value = array[hi];
|
||||
|
||||
array[(lo + hi)/2] = hi_value;
|
||||
array[hi] = pivot;
|
||||
|
||||
int i = lo;
|
||||
for (int j=lo; j < hi; j++) {
|
||||
if (array[j] <= pivot) {
|
||||
const uint32_t t = array[i];
|
||||
array[i] = array[j];
|
||||
array[j] = t;
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
const uint32_t t = array[i];
|
||||
array[i] = array[hi];
|
||||
array[hi] = t;
|
||||
i += 1;
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
9
simd-sort/quicksort-all.cpp
Normal file
9
simd-sort/quicksort-all.cpp
Normal file
@ -0,0 +1,9 @@
|
||||
#include "common.h"
|
||||
#include "quicksort.cpp"
|
||||
#ifdef HAVE_AVX2_INSTRUCTIONS
|
||||
# include "avx2-quicksort.cpp"
|
||||
#endif
|
||||
#ifdef HAVE_AVX512F_INSTRUCTIONS
|
||||
# include "avx512-quicksort.cpp"
|
||||
#endif
|
||||
|
||||
19
simd-sort/quicksort.cpp
Normal file
19
simd-sort/quicksort.cpp
Normal file
@ -0,0 +1,19 @@
|
||||
#include "partition.cpp"
|
||||
|
||||
void quicksort(uint32_t* array, int left, int right) {
|
||||
|
||||
int i = left;
|
||||
int j = right;
|
||||
|
||||
const uint32_t pivot = array[(i + j)/2];
|
||||
|
||||
scalar_partition_epi32(array, pivot, i, j);
|
||||
|
||||
if (left < j) {
|
||||
quicksort(array, left, j);
|
||||
}
|
||||
|
||||
if (i < right) {
|
||||
quicksort(array, i, right);
|
||||
}
|
||||
}
|
||||
52
simd-sort/rdtsc.cpp
Normal file
52
simd-sort/rdtsc.cpp
Normal file
@ -0,0 +1,52 @@
|
||||
#define RDTSC_START(cycles) \
|
||||
do { \
|
||||
uint32_t cyc_high, cyc_low; \
|
||||
__asm volatile("cpuid\n" \
|
||||
"rdtsc\n" \
|
||||
"mov %%edx, %0\n" \
|
||||
"mov %%eax, %1" : \
|
||||
"=r" (cyc_high), \
|
||||
"=r"(cyc_low) : \
|
||||
: /* no read only */ \
|
||||
"%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \
|
||||
); \
|
||||
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
|
||||
} while (0)
|
||||
|
||||
#define RDTSC_STOP(cycles) \
|
||||
do { \
|
||||
uint32_t cyc_high, cyc_low; \
|
||||
__asm volatile("rdtscp\n" \
|
||||
"mov %%edx, %0\n" \
|
||||
"mov %%eax, %1\n" \
|
||||
"cpuid" : \
|
||||
"=r"(cyc_high), \
|
||||
"=r"(cyc_low) : \
|
||||
/* no read only registers */ : \
|
||||
"%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \
|
||||
); \
|
||||
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
|
||||
} while (0)
|
||||
|
||||
static __attribute__ ((noinline))
|
||||
uint64_t rdtsc_overhead_func(uint64_t dummy) {
|
||||
return dummy;
|
||||
}
|
||||
|
||||
uint64_t global_rdtsc_overhead = (uint64_t) UINT64_MAX;
|
||||
|
||||
#define RDTSC_SET_OVERHEAD(test, repeat) \
|
||||
do { \
|
||||
uint64_t cycles_start, cycles_final, cycles_diff; \
|
||||
uint64_t min_diff = UINT64_MAX; \
|
||||
for (int i = 0; i < repeat; i++) { \
|
||||
__asm volatile("" ::: /* pretend to clobber */ "memory"); \
|
||||
RDTSC_START(cycles_start); \
|
||||
test; \
|
||||
RDTSC_STOP(cycles_final); \
|
||||
cycles_diff = (cycles_final - cycles_start); \
|
||||
if (cycles_diff < min_diff) min_diff = cycles_diff; \
|
||||
} \
|
||||
global_rdtsc_overhead = min_diff; \
|
||||
printf("rdtsc_overhead set to %d\n", (int)global_rdtsc_overhead); \
|
||||
} while (0)
|
||||
84
simd-sort/results/haswell-i7-4770-gcc5.3.0-avx2.txt
Normal file
84
simd-sort/results/haswell-i7-4770-gcc5.3.0-avx2.txt
Normal file
@ -0,0 +1,84 @@
|
||||
items count: 10000 (40000 bytes), input ascending
|
||||
std::sort ... 0.0001 s
|
||||
quick sort ... 0.0001 s (1.24)
|
||||
AVX2 quick sort ... 0.0001 s (1.48)
|
||||
items count: 100000 (400000 bytes), input ascending
|
||||
std::sort ... 0.0010 s
|
||||
quick sort ... 0.0009 s (1.17)
|
||||
AVX2 quick sort ... 0.0007 s (1.48)
|
||||
items count: 1000000 (4000000 bytes), input ascending
|
||||
std::sort ... 0.0117 s
|
||||
quick sort ... 0.0092 s (1.27)
|
||||
AVX2 quick sort ... 0.0066 s (1.77)
|
||||
items count: 2000000 (8000000 bytes), input ascending
|
||||
std::sort ... 0.0244 s
|
||||
quick sort ... 0.0192 s (1.28)
|
||||
AVX2 quick sort ... 0.0137 s (1.79)
|
||||
items count: 5000000 (20000000 bytes), input ascending
|
||||
std::sort ... 0.0700 s
|
||||
quick sort ... 0.0535 s (1.31)
|
||||
AVX2 quick sort ... 0.0378 s (1.85)
|
||||
items count: 10000000 (40000000 bytes), input ascending
|
||||
std::sort ... 0.1462 s
|
||||
quick sort ... 0.1108 s (1.32)
|
||||
AVX2 quick sort ... 0.0782 s (1.87)
|
||||
items count: 20000000 (80000000 bytes), input ascending
|
||||
std::sort ... 0.3049 s
|
||||
quick sort ... 0.2301 s (1.33)
|
||||
AVX2 quick sort ... 0.1614 s (1.89)
|
||||
items count: 10000 (40000 bytes), input descending
|
||||
std::sort ... 0.0001 s
|
||||
quick sort ... 0.0001 s (0.92)
|
||||
AVX2 quick sort ... 0.0001 s (1.12)
|
||||
items count: 100000 (400000 bytes), input descending
|
||||
std::sort ... 0.0008 s
|
||||
quick sort ... 0.0009 s (0.87)
|
||||
AVX2 quick sort ... 0.0007 s (1.10)
|
||||
items count: 1000000 (4000000 bytes), input descending
|
||||
std::sort ... 0.0087 s
|
||||
quick sort ... 0.0092 s (0.95)
|
||||
AVX2 quick sort ... 0.0066 s (1.32)
|
||||
items count: 2000000 (8000000 bytes), input descending
|
||||
std::sort ... 0.0183 s
|
||||
quick sort ... 0.0192 s (0.95)
|
||||
AVX2 quick sort ... 0.0138 s (1.33)
|
||||
items count: 5000000 (20000000 bytes), input descending
|
||||
std::sort ... 0.0514 s
|
||||
quick sort ... 0.0536 s (0.96)
|
||||
AVX2 quick sort ... 0.0378 s (1.36)
|
||||
items count: 10000000 (40000000 bytes), input descending
|
||||
std::sort ... 0.1068 s
|
||||
quick sort ... 0.1111 s (0.96)
|
||||
AVX2 quick sort ... 0.0783 s (1.36)
|
||||
items count: 20000000 (80000000 bytes), input descending
|
||||
std::sort ... 0.2221 s
|
||||
quick sort ... 0.2299 s (0.97)
|
||||
AVX2 quick sort ... 0.1613 s (1.38)
|
||||
items count: 10000 (40000 bytes), input random
|
||||
std::sort ... 0.0001 s
|
||||
quick sort ... 0.0001 s (1.25)
|
||||
AVX2 quick sort ... 0.0001 s (1.53)
|
||||
items count: 100000 (400000 bytes), input random
|
||||
std::sort ... 0.0010 s
|
||||
quick sort ... 0.0009 s (1.16)
|
||||
AVX2 quick sort ... 0.0007 s (1.48)
|
||||
items count: 1000000 (4000000 bytes), input random
|
||||
std::sort ... 0.0118 s
|
||||
quick sort ... 0.0091 s (1.29)
|
||||
AVX2 quick sort ... 0.0066 s (1.78)
|
||||
items count: 2000000 (8000000 bytes), input random
|
||||
std::sort ... 0.0246 s
|
||||
quick sort ... 0.0192 s (1.28)
|
||||
AVX2 quick sort ... 0.0137 s (1.79)
|
||||
items count: 5000000 (20000000 bytes), input random
|
||||
std::sort ... 0.0703 s
|
||||
quick sort ... 0.0537 s (1.31)
|
||||
AVX2 quick sort ... 0.0378 s (1.86)
|
||||
items count: 10000000 (40000000 bytes), input random
|
||||
std::sort ... 0.1468 s
|
||||
quick sort ... 0.1106 s (1.33)
|
||||
AVX2 quick sort ... 0.0786 s (1.87)
|
||||
items count: 20000000 (80000000 bytes), input random
|
||||
std::sort ... 0.3058 s
|
||||
quick sort ... 0.2304 s (1.33)
|
||||
AVX2 quick sort ... 0.1628 s (1.88)
|
||||
140
simd-sort/results/knights-landing-7210-gcc5.3.0-avx512f.txt
Normal file
140
simd-sort/results/knights-landing-7210-gcc5.3.0-avx512f.txt
Normal file
@ -0,0 +1,140 @@
|
||||
items count: 100000 (400000 bytes), input ascending
|
||||
std::sort ... 0.0063 s
|
||||
quick sort ... 0.0067 s (0.93)
|
||||
AVX2 quick sort ... 0.0071 s (0.88)
|
||||
AVX512 quick sort ... 0.0055 s (1.14)
|
||||
AVX512 + popcnt quick sort ... 0.0056 s (1.13)
|
||||
AVX512 + BMI2 quick sort ... 0.0058 s (1.08)
|
||||
items count: 1000000 (4000000 bytes), input ascending
|
||||
std::sort ... 0.0728 s
|
||||
quick sort ... 0.0735 s (0.99)
|
||||
AVX2 quick sort ... 0.0685 s (1.06)
|
||||
AVX512 quick sort ... 0.0522 s (1.40)
|
||||
AVX512 + popcnt quick sort ... 0.0528 s (1.38)
|
||||
AVX512 + BMI2 quick sort ... 0.0554 s (1.31)
|
||||
items count: 2000000 (8000000 bytes), input ascending
|
||||
std::sort ... 0.1529 s
|
||||
quick sort ... 0.1523 s (1.00)
|
||||
AVX2 quick sort ... 0.1402 s (1.09)
|
||||
AVX512 quick sort ... 0.1057 s (1.45)
|
||||
AVX512 + popcnt quick sort ... 0.1063 s (1.44)
|
||||
AVX512 + BMI2 quick sort ... 0.1105 s (1.38)
|
||||
items count: 5000000 (20000000 bytes), input ascending
|
||||
std::sort ... 0.4433 s
|
||||
quick sort ... 0.3935 s (1.13)
|
||||
AVX2 quick sort ... 0.3520 s (1.26)
|
||||
AVX512 quick sort ... 0.2609 s (1.70)
|
||||
AVX512 + popcnt quick sort ... 0.2609 s (1.70)
|
||||
AVX512 + BMI2 quick sort ... 0.2668 s (1.66)
|
||||
items count: 10000000 (40000000 bytes), input ascending
|
||||
std::sort ... 0.9257 s
|
||||
quick sort ... 0.8108 s (1.14)
|
||||
AVX2 quick sort ... 0.7222 s (1.28)
|
||||
AVX512 quick sort ... 0.5265 s (1.76)
|
||||
AVX512 + popcnt quick sort ... 0.5288 s (1.75)
|
||||
AVX512 + BMI2 quick sort ... 0.5402 s (1.71)
|
||||
items count: 20000000 (80000000 bytes), input ascending
|
||||
std::sort ... 1.9285 s
|
||||
quick sort ... 1.6725 s (1.15)
|
||||
AVX2 quick sort ... 1.4766 s (1.31)
|
||||
AVX512 quick sort ... 1.0681 s (1.81)
|
||||
AVX512 + popcnt quick sort ... 1.0699 s (1.80)
|
||||
AVX512 + BMI2 quick sort ... 1.0949 s (1.76)
|
||||
items count: 10000 (40000 bytes), input descending
|
||||
std::sort ... 0.0004 s
|
||||
quick sort ... 0.0006 s (0.73)
|
||||
AVX2 quick sort ... 0.0006 s (0.74)
|
||||
AVX512 quick sort ... 0.0005 s (0.88)
|
||||
AVX512 + popcnt quick sort ... 0.0005 s (0.89)
|
||||
AVX512 + BMI2 quick sort ... 0.0005 s (0.87)
|
||||
items count: 100000 (400000 bytes), input descending
|
||||
std::sort ... 0.0048 s
|
||||
quick sort ... 0.0067 s (0.71)
|
||||
AVX2 quick sort ... 0.0070 s (0.68)
|
||||
AVX512 quick sort ... 0.0055 s (0.87)
|
||||
AVX512 + popcnt quick sort ... 0.0056 s (0.86)
|
||||
AVX512 + BMI2 quick sort ... 0.0058 s (0.83)
|
||||
items count: 1000000 (4000000 bytes), input descending
|
||||
std::sort ... 0.0551 s
|
||||
quick sort ... 0.0734 s (0.75)
|
||||
AVX2 quick sort ... 0.0684 s (0.81)
|
||||
AVX512 quick sort ... 0.0524 s (1.05)
|
||||
AVX512 + popcnt quick sort ... 0.0526 s (1.05)
|
||||
AVX512 + BMI2 quick sort ... 0.0547 s (1.01)
|
||||
items count: 2000000 (8000000 bytes), input descending
|
||||
std::sort ... 0.1154 s
|
||||
quick sort ... 0.1523 s (0.76)
|
||||
AVX2 quick sort ... 0.1407 s (0.82)
|
||||
AVX512 quick sort ... 0.1055 s (1.09)
|
||||
AVX512 + popcnt quick sort ... 0.1065 s (1.08)
|
||||
AVX512 + BMI2 quick sort ... 0.1111 s (1.04)
|
||||
items count: 5000000 (20000000 bytes), input descending
|
||||
std::sort ... 0.3300 s
|
||||
quick sort ... 0.3932 s (0.84)
|
||||
AVX2 quick sort ... 0.3516 s (0.94)
|
||||
AVX512 quick sort ... 0.2606 s (1.27)
|
||||
AVX512 + popcnt quick sort ... 0.2602 s (1.27)
|
||||
AVX512 + BMI2 quick sort ... 0.2657 s (1.24)
|
||||
items count: 10000000 (40000000 bytes), input descending
|
||||
std::sort ... 0.6890 s
|
||||
quick sort ... 0.8134 s (0.85)
|
||||
AVX2 quick sort ... 0.7216 s (0.95)
|
||||
AVX512 quick sort ... 0.5273 s (1.31)
|
||||
AVX512 + popcnt quick sort ... 0.5351 s (1.29)
|
||||
AVX512 + BMI2 quick sort ... 0.5392 s (1.28)
|
||||
items count: 20000000 (80000000 bytes), input descending
|
||||
std::sort ... 1.4358 s
|
||||
quick sort ... 1.6744 s (0.86)
|
||||
AVX2 quick sort ... 1.4772 s (0.97)
|
||||
AVX512 quick sort ... 1.0694 s (1.34)
|
||||
AVX512 + popcnt quick sort ... 1.0675 s (1.34)
|
||||
AVX512 + BMI2 quick sort ... 1.0929 s (1.31)
|
||||
items count: 10000 (40000 bytes), input random
|
||||
std::sort ... 0.0005 s
|
||||
quick sort ... 0.0006 s (0.98)
|
||||
AVX2 quick sort ... 0.0006 s (0.99)
|
||||
AVX512 quick sort ... 0.0005 s (1.18)
|
||||
AVX512 + popcnt quick sort ... 0.0005 s (1.18)
|
||||
AVX512 + BMI2 quick sort ... 0.0005 s (1.16)
|
||||
items count: 100000 (400000 bytes), input random
|
||||
std::sort ... 0.0063 s
|
||||
quick sort ... 0.0067 s (0.94)
|
||||
AVX2 quick sort ... 0.0071 s (0.89)
|
||||
AVX512 quick sort ... 0.0055 s (1.14)
|
||||
AVX512 + popcnt quick sort ... 0.0056 s (1.13)
|
||||
AVX512 + BMI2 quick sort ... 0.0058 s (1.08)
|
||||
items count: 1000000 (4000000 bytes), input random
|
||||
std::sort ... 0.0727 s
|
||||
quick sort ... 0.0742 s (0.98)
|
||||
AVX2 quick sort ... 0.0684 s (1.06)
|
||||
AVX512 quick sort ... 0.0522 s (1.39)
|
||||
AVX512 + popcnt quick sort ... 0.0526 s (1.38)
|
||||
AVX512 + BMI2 quick sort ... 0.0544 s (1.34)
|
||||
items count: 2000000 (8000000 bytes), input random
|
||||
std::sort ... 0.1533 s
|
||||
quick sort ... 0.1532 s (1.00)
|
||||
AVX2 quick sort ... 0.1403 s (1.09)
|
||||
AVX512 quick sort ... 0.1057 s (1.45)
|
||||
AVX512 + popcnt quick sort ... 0.1066 s (1.44)
|
||||
AVX512 + BMI2 quick sort ... 0.1101 s (1.39)
|
||||
items count: 5000000 (20000000 bytes), input random
|
||||
std::sort ... 0.4436 s
|
||||
quick sort ... 0.3932 s (1.13)
|
||||
AVX2 quick sort ... 0.3529 s (1.26)
|
||||
AVX512 quick sort ... 0.2622 s (1.69)
|
||||
AVX512 + popcnt quick sort ... 0.2632 s (1.69)
|
||||
AVX512 + BMI2 quick sort ... 0.2657 s (1.67)
|
||||
items count: 10000000 (40000000 bytes), input random
|
||||
std::sort ... 0.9235 s
|
||||
quick sort ... 0.8115 s (1.14)
|
||||
AVX2 quick sort ... 0.7233 s (1.28)
|
||||
AVX512 quick sort ... 0.5305 s (1.74)
|
||||
AVX512 + popcnt quick sort ... 0.5328 s (1.73)
|
||||
AVX512 + BMI2 quick sort ... 0.5379 s (1.72)
|
||||
items count: 20000000 (80000000 bytes), input random
|
||||
std::sort ... 1.9264 s
|
||||
quick sort ... 1.6716 s (1.15)
|
||||
AVX2 quick sort ... 1.4898 s (1.29)
|
||||
AVX512 quick sort ... 1.0853 s (1.77)
|
||||
AVX512 + popcnt quick sort ... 1.0849 s (1.78)
|
||||
AVX512 + BMI2 quick sort ... 1.0984 s (1.75)
|
||||
80
simd-sort/results/skylake-i7-6700-gcc5.3.0-avx2.txt
Normal file
80
simd-sort/results/skylake-i7-6700-gcc5.3.0-avx2.txt
Normal file
@ -0,0 +1,80 @@
|
||||
items count: 100000 (400000 bytes), input ascending
|
||||
std::sort ... 0.0011 s
|
||||
quick sort ... 0.0009 s (1.14)
|
||||
AVX2 quick sort ... 0.0007 s (1.59)
|
||||
items count: 1000000 (4000000 bytes), input ascending
|
||||
std::sort ... 0.0122 s
|
||||
quick sort ... 0.0099 s (1.23)
|
||||
AVX2 quick sort ... 0.0065 s (1.89)
|
||||
items count: 2000000 (8000000 bytes), input ascending
|
||||
std::sort ... 0.0257 s
|
||||
quick sort ... 0.0204 s (1.26)
|
||||
AVX2 quick sort ... 0.0133 s (1.94)
|
||||
items count: 5000000 (20000000 bytes), input ascending
|
||||
std::sort ... 0.0724 s
|
||||
quick sort ... 0.0576 s (1.26)
|
||||
AVX2 quick sort ... 0.0376 s (1.93)
|
||||
items count: 10000000 (40000000 bytes), input ascending
|
||||
std::sort ... 0.1516 s
|
||||
quick sort ... 0.1194 s (1.27)
|
||||
AVX2 quick sort ... 0.0773 s (1.96)
|
||||
items count: 20000000 (80000000 bytes), input ascending
|
||||
std::sort ... 0.3167 s
|
||||
quick sort ... 0.2480 s (1.28)
|
||||
AVX2 quick sort ... 0.1598 s (1.98)
|
||||
items count: 10000 (40000 bytes), input descending
|
||||
std::sort ... 0.0001 s
|
||||
quick sort ... 0.0001 s (0.91)
|
||||
AVX2 quick sort ... 0.0001 s (1.15)
|
||||
items count: 100000 (400000 bytes), input descending
|
||||
std::sort ... 0.0009 s
|
||||
quick sort ... 0.0009 s (0.91)
|
||||
AVX2 quick sort ... 0.0007 s (1.26)
|
||||
items count: 1000000 (4000000 bytes), input descending
|
||||
std::sort ... 0.0095 s
|
||||
quick sort ... 0.0099 s (0.96)
|
||||
AVX2 quick sort ... 0.0065 s (1.47)
|
||||
items count: 2000000 (8000000 bytes), input descending
|
||||
std::sort ... 0.0196 s
|
||||
quick sort ... 0.0207 s (0.95)
|
||||
AVX2 quick sort ... 0.0132 s (1.48)
|
||||
items count: 5000000 (20000000 bytes), input descending
|
||||
std::sort ... 0.0539 s
|
||||
quick sort ... 0.0576 s (0.94)
|
||||
AVX2 quick sort ... 0.0377 s (1.43)
|
||||
items count: 10000000 (40000000 bytes), input descending
|
||||
std::sort ... 0.1130 s
|
||||
quick sort ... 0.1195 s (0.95)
|
||||
AVX2 quick sort ... 0.0778 s (1.45)
|
||||
items count: 20000000 (80000000 bytes), input descending
|
||||
std::sort ... 0.2353 s
|
||||
quick sort ... 0.2479 s (0.95)
|
||||
AVX2 quick sort ... 0.1604 s (1.47)
|
||||
items count: 10000 (40000 bytes), input random
|
||||
std::sort ... 0.0001 s
|
||||
quick sort ... 0.0001 s (1.19)
|
||||
AVX2 quick sort ... 0.0001 s (1.51)
|
||||
items count: 100000 (400000 bytes), input random
|
||||
std::sort ... 0.0011 s
|
||||
quick sort ... 0.0009 s (1.13)
|
||||
AVX2 quick sort ... 0.0007 s (1.59)
|
||||
items count: 1000000 (4000000 bytes), input random
|
||||
std::sort ... 0.0123 s
|
||||
quick sort ... 0.0099 s (1.23)
|
||||
AVX2 quick sort ... 0.0064 s (1.91)
|
||||
items count: 2000000 (8000000 bytes), input random
|
||||
std::sort ... 0.0257 s
|
||||
quick sort ... 0.0207 s (1.24)
|
||||
AVX2 quick sort ... 0.0133 s (1.93)
|
||||
items count: 5000000 (20000000 bytes), input random
|
||||
std::sort ... 0.0726 s
|
||||
quick sort ... 0.0578 s (1.26)
|
||||
AVX2 quick sort ... 0.0377 s (1.93)
|
||||
items count: 10000000 (40000000 bytes), input random
|
||||
std::sort ... 0.1518 s
|
||||
quick sort ... 0.1200 s (1.26)
|
||||
AVX2 quick sort ... 0.0778 s (1.95)
|
||||
items count: 20000000 (80000000 bytes), input random
|
||||
std::sort ... 0.3170 s
|
||||
quick sort ... 0.2490 s (1.27)
|
||||
AVX2 quick sort ... 0.1614 s (1.96)
|
||||
76
simd-sort/runtime_stats.cpp
Normal file
76
simd-sort/runtime_stats.cpp
Normal file
@ -0,0 +1,76 @@
|
||||
class Histogram {
|
||||
size_t hist[256];
|
||||
|
||||
public:
|
||||
Histogram() {
|
||||
reset();
|
||||
}
|
||||
|
||||
void hit(uint8_t v) {
|
||||
hist[v] += 1;
|
||||
}
|
||||
|
||||
void reset() {
|
||||
memset(hist, 0, sizeof(hist));
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
for (int i=0; i < 256; i++) {
|
||||
if (hist[i] > 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public:
|
||||
void print() const {
|
||||
size_t total = 0;
|
||||
for (int i=0; i < 256; i++) {
|
||||
total += hist[i];
|
||||
}
|
||||
|
||||
if (total == 0) return;
|
||||
|
||||
for (int i=0; i < 256; i++) {
|
||||
if (hist[i] == 0) continue;
|
||||
|
||||
printf("%02x: %5lu (%5.2f%%)\n", i, hist[i], 100.0 * hist[i]/total);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class Statistics {
|
||||
public:
|
||||
size_t partition_calls;
|
||||
size_t items_processed;
|
||||
size_t scalar__partition_calls;
|
||||
size_t scalar__items_processed;
|
||||
Histogram pvbyte_histogram;
|
||||
|
||||
public:
|
||||
Statistics() {
|
||||
reset();
|
||||
}
|
||||
|
||||
void reset() {
|
||||
partition_calls = 0;
|
||||
items_processed = 0;
|
||||
scalar__partition_calls = 0;
|
||||
scalar__items_processed = 0;
|
||||
|
||||
pvbyte_histogram.reset();
|
||||
}
|
||||
|
||||
bool anything_collected() const {
|
||||
return (partition_calls > 0)
|
||||
|| (items_processed > 0)
|
||||
|| (scalar__partition_calls > 0)
|
||||
|| (scalar__items_processed > 0)
|
||||
|| (!pvbyte_histogram.empty());
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
Statistics statistics;
|
||||
457
simd-sort/speed.cpp
Normal file
457
simd-sort/speed.cpp
Normal file
@ -0,0 +1,457 @@
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
|
||||
#include "cmdline.cpp"
|
||||
#ifdef WITH_RUNTIME_STATS
|
||||
# include "runtime_stats.cpp" // must be included before anything else
|
||||
#endif
|
||||
#include "input_data.cpp"
|
||||
#include "quicksort-all.cpp"
|
||||
#include "avx2-altquicksort.h"
|
||||
//#include "avx2-nate-quicksort.cpp"
|
||||
#include "../magyarsort.h" // mine
|
||||
#include "avx2-natenodutch-quicksort.h"
|
||||
#define USE_RDTSC // undef to get measurments in seconds
|
||||
#ifdef USE_RDTSC
|
||||
# include "rdtsc.cpp"
|
||||
#else
|
||||
# include "gettime.cpp"
|
||||
#endif
|
||||
|
||||
void magyarsort_it(uint32_t* array, int left, int right) {
|
||||
MagyarSort::sort(array + left, right - left);
|
||||
}
|
||||
|
||||
class PerformanceTest final {
|
||||
|
||||
int iterations;
|
||||
InputData& input;
|
||||
uint32_t* tmp;
|
||||
|
||||
public:
|
||||
PerformanceTest(int n, InputData& input)
|
||||
: iterations(n)
|
||||
, input(input) {
|
||||
|
||||
assert(iterations > 0);
|
||||
tmp = new uint32_t[input.count()];
|
||||
}
|
||||
|
||||
~PerformanceTest() {
|
||||
delete[] tmp;
|
||||
}
|
||||
|
||||
public:
|
||||
template <typename SORT_FUNCTION>
|
||||
uint64_t run(SORT_FUNCTION sort) {
|
||||
|
||||
uint64_t time = 0;
|
||||
|
||||
int k = iterations;
|
||||
while (k--) {
|
||||
memcpy(tmp, input.pointer(), input.size());
|
||||
|
||||
uint64_t t1, t2;
|
||||
|
||||
#ifdef USE_RDTSC
|
||||
RDTSC_START(t1);
|
||||
#else
|
||||
t1 = get_time();
|
||||
#endif
|
||||
sort(input.pointer(), 0, input.count() - 1);
|
||||
|
||||
#ifdef USE_RDTSC
|
||||
RDTSC_START(t2);
|
||||
#else
|
||||
t2 = get_time();
|
||||
#endif
|
||||
|
||||
const uint64_t dt = t2 - t1;
|
||||
|
||||
if (time == 0) {
|
||||
time = dt;
|
||||
} else if (dt < time) {
|
||||
time = dt;
|
||||
}
|
||||
}
|
||||
|
||||
return time;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
enum class InputType {
|
||||
randomfew,
|
||||
randomuniq,
|
||||
random,
|
||||
ascending,
|
||||
descending,
|
||||
};
|
||||
|
||||
|
||||
const char* as_string(InputType type) {
|
||||
switch (type) {
|
||||
case InputType::randomfew:
|
||||
return "randomfew";
|
||||
|
||||
case InputType::randomuniq:
|
||||
return "randomuniq";
|
||||
|
||||
case InputType::random:
|
||||
return "random";
|
||||
|
||||
case InputType::ascending:
|
||||
return "ascending";
|
||||
|
||||
case InputType::descending:
|
||||
return "descending";
|
||||
|
||||
default:
|
||||
return "<unknown>";
|
||||
}
|
||||
}
|
||||
|
||||
void std_qsort_wrapper(uint32_t* array, int left, int right) {
|
||||
|
||||
std::qsort(array + left, right - left + 1, sizeof(uint32_t), [](const void* a, const void* b)
|
||||
{
|
||||
uint32_t a1 = *static_cast<const uint32_t*>(a);
|
||||
uint32_t a2 = *static_cast<const uint32_t*>(b);
|
||||
|
||||
if(a1 < a2) return -1;
|
||||
if(a1 > a2) return 1;
|
||||
return 0;
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
void std_stable_sort_wrapper(uint32_t* array, int left, int right) {
|
||||
|
||||
std::stable_sort(array + left, array + right + 1);
|
||||
}
|
||||
|
||||
|
||||
void std_sort_wrapper(uint32_t* array, int left, int right) {
|
||||
|
||||
std::sort(array + left, array + right + 1);
|
||||
}
|
||||
|
||||
|
||||
class Flags {
|
||||
public:
|
||||
bool std_sort;
|
||||
bool std_qsort;
|
||||
bool std_stable_sort;
|
||||
bool quicksort;
|
||||
bool avx2;
|
||||
bool magyar;
|
||||
bool avx2_alt;
|
||||
bool avx2_natenodutch;
|
||||
bool avx512;
|
||||
bool avx512_buf;
|
||||
bool avx512_popcnt;
|
||||
bool avx512_bmi;
|
||||
|
||||
public:
|
||||
Flags(const CommandLine& cmd) {
|
||||
|
||||
enable_all(false);
|
||||
|
||||
bool any_set = false;
|
||||
if (cmd.has("-std-sort")) {
|
||||
std_sort = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (cmd.has("-std-qsort")) {
|
||||
std_qsort = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (cmd.has("-std-stable-sort") || cmd.has("-std-stable")) {
|
||||
std_stable_sort = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (cmd.has("-quicksort")) {
|
||||
quicksort = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (cmd.has("-avx2")) {
|
||||
avx2 = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (cmd.has("-avx2-alt")) {
|
||||
avx2_alt = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (cmd.has("magyar")) {
|
||||
magyar = true;
|
||||
avx2_natenodutch = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (cmd.has("-avx512")) {
|
||||
avx512 = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (cmd.has("-avx512-buf")) {
|
||||
avx512_buf = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (cmd.has("-avx512-popcnt")) {
|
||||
avx512_popcnt = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (cmd.has("-avx512-bmi")) {
|
||||
avx512_bmi = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (!any_set) {
|
||||
enable_all(true);
|
||||
}
|
||||
}
|
||||
|
||||
void enable_all(bool val) {
|
||||
std_sort = val;
|
||||
std_qsort = val;
|
||||
std_stable_sort = val;
|
||||
quicksort = val;
|
||||
avx2 = val;
|
||||
magyar = val;
|
||||
avx2_alt = val;
|
||||
avx2_natenodutch = val;
|
||||
avx512 = val;
|
||||
avx512_buf = val;
|
||||
avx512_popcnt = val;
|
||||
avx512_bmi = val;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
class Test {
|
||||
|
||||
std::unique_ptr<InputData> data;
|
||||
InputType type;
|
||||
size_t count;
|
||||
int iterations;
|
||||
Flags flags;
|
||||
uint64_t ref;
|
||||
|
||||
public:
|
||||
Test(InputType type, size_t count, int iterations, Flags&& flags)
|
||||
: type(type)
|
||||
, count(count)
|
||||
, iterations(iterations)
|
||||
, flags(std::move(flags)) {
|
||||
|
||||
switch (type) {
|
||||
case InputType::randomfew:
|
||||
data.reset(new InputRandomFew(count));
|
||||
break;
|
||||
|
||||
case InputType::randomuniq:
|
||||
data.reset(new InputRandomUnique(count));
|
||||
break;
|
||||
|
||||
case InputType::random:
|
||||
data.reset(new InputRandom(count));
|
||||
break;
|
||||
|
||||
case InputType::ascending:
|
||||
data.reset(new InputAscending(count));
|
||||
break;
|
||||
|
||||
case InputType::descending:
|
||||
data.reset(new InputDescending(count));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void run() {
|
||||
|
||||
printf("items count: %lu (%lu bytes), input %s\n", data->count(), data->size(), as_string(type));
|
||||
|
||||
ref = 0;
|
||||
|
||||
if (flags.std_sort) {
|
||||
measure("std::sort", std_sort_wrapper);
|
||||
}
|
||||
|
||||
if (flags.std_qsort) {
|
||||
measure("std::qsort", std_qsort_wrapper);
|
||||
}
|
||||
|
||||
if (flags.std_stable_sort) {
|
||||
measure("std::stable_sort", std_stable_sort_wrapper);
|
||||
}
|
||||
|
||||
if (flags.std_qsort) {
|
||||
measure("quick sort", quicksort);
|
||||
}
|
||||
#ifdef HAVE_AVX2_INSTRUCTIONS
|
||||
if (flags.avx2) {
|
||||
measure("AVX2 quick sort", qs::avx2::quicksort);
|
||||
}
|
||||
|
||||
if (flags.avx2_natenodutch) {
|
||||
measure("AVX2 nate nodutch", avx_natenodutch_quicksort);
|
||||
}
|
||||
|
||||
if (flags.avx2_alt) {
|
||||
measure("AVX2 alt quicksort", wrapped_avx2_pivotonlast_sort);
|
||||
}
|
||||
|
||||
if (flags.magyar) {
|
||||
measure("Magyarsort variant", magyarsort_it);
|
||||
}
|
||||
|
||||
#endif
|
||||
#ifdef HAVE_AVX512F_INSTRUCTIONS
|
||||
if (flags.avx512) {
|
||||
measure("AVX512 quick sort", qs::avx512::quicksort);
|
||||
}
|
||||
|
||||
if (flags.avx512_buf) {
|
||||
measure("AVX512 quick sort - aux buf", qs::avx512::auxbuffer_quicksort);
|
||||
}
|
||||
|
||||
if (flags.avx512_popcnt) {
|
||||
measure("AVX512 + popcnt quick sort", qs::avx512::popcnt_quicksort);
|
||||
}
|
||||
|
||||
if (flags.avx512_bmi) {
|
||||
measure("AVX512 + BMI2 quick sort", qs::avx512::bmi2_quicksort);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename SORT_FUNCTION>
|
||||
void measure(const char* name, SORT_FUNCTION sort) {
|
||||
|
||||
PerformanceTest test(iterations, *data);
|
||||
|
||||
printf("%30s ... ", name); fflush(stdout);
|
||||
#ifdef WITH_RUNTIME_STATS
|
||||
statistics.reset();
|
||||
#endif
|
||||
uint64_t time = test.run(sort);
|
||||
|
||||
#ifdef USE_RDTSC
|
||||
printf("%10lu cycles", time);
|
||||
if (ref > 0) {
|
||||
printf(" (%0.2f)", ref/double(time));
|
||||
}
|
||||
|
||||
# ifdef WITH_RUNTIME_STATS
|
||||
if (statistics.anything_collected()) {
|
||||
printf("\n");
|
||||
printf("\t\tpartition calls: %lu (+%lu scalar)\n", statistics.partition_calls, statistics.scalar__partition_calls);
|
||||
printf("\t\titems processed: %lu (+%lu by scalar partition)\n", statistics.items_processed, statistics.scalar__items_processed);
|
||||
|
||||
const size_t total_items = statistics.items_processed + statistics.scalar__items_processed;
|
||||
|
||||
if (total_items != 0) {
|
||||
const double cpi = double(time)/total_items;
|
||||
printf("\t\t : %0.4f cycles/item\n", cpi * iterations);
|
||||
}
|
||||
|
||||
if (!statistics.pvbyte_histogram.empty()) {
|
||||
puts("Histogram for pvbyte:");
|
||||
statistics.pvbyte_histogram.print();
|
||||
}
|
||||
}
|
||||
# endif // WITH_RUNTIME_STATS
|
||||
|
||||
#else
|
||||
printf("%0.4f s", time/1000000.0);
|
||||
if (ref > 0) {
|
||||
printf(" (%0.2f)\n", ref/double(time));
|
||||
}
|
||||
#endif
|
||||
putchar('\n');
|
||||
|
||||
if (ref == 0) {
|
||||
ref = time;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// ------------------------------------------------------------
|
||||
|
||||
|
||||
void usage() {
|
||||
puts("usage:");
|
||||
puts("speed SIZE ITERATIONS INPUT [options]");
|
||||
puts("");
|
||||
puts("where");
|
||||
puts("* SIZE - number of 32-bit elements");
|
||||
puts("* ITERATIONS - number of iterations");
|
||||
puts("* INPUT - one of:");
|
||||
puts(" ascending (or asc)");
|
||||
puts(" descending (or dsc, desc)");
|
||||
puts(" random (or rnd, rand)");
|
||||
puts(" randomfew");
|
||||
puts(" randomuniq");
|
||||
puts("options - optional name of procedure(s) to run");
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
|
||||
if (argc < 4) {
|
||||
usage();
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
int count = atoi(argv[1]);
|
||||
int iterations = atoi(argv[2]);
|
||||
InputType type;
|
||||
|
||||
#define is_keyword(key) (strcmp(argv[3], key) == 0)
|
||||
if (is_keyword("descending") || is_keyword("desc") || is_keyword("dsc")) {
|
||||
type = InputType::descending;
|
||||
} else if (is_keyword("ascending") || is_keyword("asc")) {
|
||||
type = InputType::ascending;
|
||||
} else if (is_keyword("random") || is_keyword("rnd") || is_keyword("rand")) {
|
||||
type = InputType::random;
|
||||
} else if (is_keyword("randomfew")) {
|
||||
type = InputType::randomfew;
|
||||
} else if (is_keyword("randomuniq")) {
|
||||
type = InputType::randomuniq;
|
||||
} else {
|
||||
usage();
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
#undef is_keyword
|
||||
|
||||
#ifdef HAVE_AVX512F_INSTRUCTIONS
|
||||
#ifdef POPCNT_LOOKUP
|
||||
prepare_lookup();
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef USE_RDTSC
|
||||
RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), iterations);
|
||||
#endif
|
||||
CommandLine cmd(argc, argv);
|
||||
Flags flags(cmd);
|
||||
Test test(type, count, iterations, std::move(flags));
|
||||
test.run();
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
BIN
simd-sort/speed_avx2
Executable file
BIN
simd-sort/speed_avx2
Executable file
Binary file not shown.
261
simd-sort/test.cpp
Normal file
261
simd-sort/test.cpp
Normal file
@ -0,0 +1,261 @@
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
#include <immintrin.h>
|
||||
|
||||
#include "cmdline.cpp"
|
||||
#include "input_data.cpp"
|
||||
#include "quicksort-all.cpp"
|
||||
#include "avx2-altquicksort.h"
|
||||
#include "avx2-nate-quicksort.cpp"
|
||||
|
||||
|
||||
bool is_sorted(uint32_t* array, size_t n) {
|
||||
assert(n > 0);
|
||||
for (size_t i=1; i < n; i++) {
|
||||
if (array[i - 1] > array[i]) {
|
||||
printf("mismatch at %lu\n", i);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
const size_t AVX512_REGISTER_SIZE = 16;
|
||||
|
||||
|
||||
class Test {
|
||||
|
||||
bool verbose;
|
||||
|
||||
public:
|
||||
Test(bool v = true) : verbose(v) {}
|
||||
|
||||
template <typename SORT_FN>
|
||||
bool run(SORT_FN sort) {
|
||||
const size_t start = 2*AVX512_REGISTER_SIZE;
|
||||
const size_t end = 256*AVX512_REGISTER_SIZE;
|
||||
|
||||
if (verbose) {
|
||||
putchar('\n');
|
||||
}
|
||||
|
||||
for (size_t size=start; size < end; size += 1) {
|
||||
|
||||
if (verbose) {
|
||||
printf("%lu/%lu\r", size, end);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
InputAscending asc(size);
|
||||
InputDescending dsc(size);
|
||||
InputRandom rnd(size);
|
||||
InputRandomFew rndfew(size);
|
||||
|
||||
if (!test(sort, asc)) {
|
||||
printf("failed for size %lu, intput ascending\n", size);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!test(sort, dsc)) {
|
||||
printf("failed for size %lu, intput descending\n", size);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!test(sort, rnd)) {
|
||||
printf("failed for size %lu, intput random\n", size);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!test(sort, rndfew)) {
|
||||
printf("failed for size %lu, intput random few\n", size);
|
||||
return false;
|
||||
}
|
||||
} // for
|
||||
|
||||
if (verbose) {
|
||||
putchar('\n');
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
template <typename SORT_FN>
|
||||
bool test(SORT_FN sort, InputData& data) {
|
||||
sort(data.pointer(), 0, data.count() - 1);
|
||||
|
||||
return is_sorted(data.pointer(), data.count());
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
class Flags {
|
||||
public:
|
||||
bool avx2;
|
||||
bool avx2_alt;
|
||||
bool avx2_nate;
|
||||
bool avx512;
|
||||
bool avx512_buf;
|
||||
bool avx512_popcnt;
|
||||
bool avx512_bmi;
|
||||
|
||||
public:
|
||||
Flags(const CommandLine& cmd) {
|
||||
|
||||
enable_all(false);
|
||||
|
||||
bool any_set = false;
|
||||
if (cmd.has("-avx2")) {
|
||||
avx2 = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (cmd.has("-avx2-alt")) {
|
||||
avx2_alt = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (cmd.has("-avx2-nate")) {
|
||||
avx2_nate = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (cmd.has("-avx512")) {
|
||||
avx512 = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (cmd.has("-avx512-buf")) {
|
||||
avx512_buf = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (cmd.has("-avx512-popcnt")) {
|
||||
avx512_popcnt = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (cmd.has("-avx512-bmi")) {
|
||||
avx512_bmi = true;
|
||||
any_set = true;
|
||||
}
|
||||
|
||||
if (!any_set) {
|
||||
enable_all(true);
|
||||
}
|
||||
}
|
||||
|
||||
void enable_all(bool val) {
|
||||
avx2 = val;
|
||||
avx2_nate = val;
|
||||
avx2_alt = val;
|
||||
avx512 = val;
|
||||
avx512_buf = val;
|
||||
avx512_popcnt = val;
|
||||
avx512_bmi = val;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
|
||||
CommandLine cmd(argc, argv);
|
||||
|
||||
puts("Please wait, it might take a while...");
|
||||
puts("");
|
||||
|
||||
bool verbose = cmd.has("-v") || cmd.has("--verbose");
|
||||
Flags flags(cmd);
|
||||
|
||||
Test test(verbose);
|
||||
int ret = EXIT_SUCCESS;
|
||||
|
||||
#ifdef HAVE_AVX2_INSTRUCTIONS
|
||||
if (flags.avx2) {
|
||||
printf("AVX2 base version... "); fflush(stdout);
|
||||
if (test.run(qs::avx2::quicksort)) {
|
||||
puts("OK");
|
||||
} else {
|
||||
puts("FAILED");
|
||||
ret = EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
if (flags.avx2_alt) {
|
||||
printf("AVX2 alt version... "); fflush(stdout);
|
||||
if (test.run(wrapped_avx2_pivotonlast_sort)) {
|
||||
puts("OK");
|
||||
} else {
|
||||
puts("FAILED");
|
||||
ret = EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
if (flags.avx2_nate) {
|
||||
printf("AVX2 Nate's variant... "); fflush(stdout);
|
||||
if (test.run(nate::wrapped_avx2_pivotonlast_sort)) {
|
||||
puts("OK");
|
||||
} else {
|
||||
puts("FAILED");
|
||||
ret = EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_AVX512F_INSTRUCTIONS
|
||||
|
||||
#ifdef POPCNT_LOOKUP
|
||||
prepare_lookup();
|
||||
#endif
|
||||
|
||||
if (flags.avx512) {
|
||||
printf("AVX512 base version... "); fflush(stdout);
|
||||
if (test.run(qs::avx512::quicksort)) {
|
||||
puts("OK");
|
||||
} else {
|
||||
puts("FAILED");
|
||||
ret = EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
if (flags.avx512_popcnt) {
|
||||
printf("AVX512 + popcnt version... "); fflush(stdout);
|
||||
if (test.run(qs::avx512::popcnt_quicksort)) {
|
||||
puts("OK");
|
||||
} else {
|
||||
puts("FAILED");
|
||||
ret = EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
if (flags.avx512_buf) {
|
||||
printf("AVX512 with aux buffers... "); fflush(stdout);
|
||||
if (test.run(qs::avx512::auxbuffer_quicksort)) {
|
||||
puts("OK");
|
||||
} else {
|
||||
puts("FAILED");
|
||||
ret = EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
if (flags.avx512_bmi) {
|
||||
printf("AVX512 + bmi2 version ... "); fflush(stdout);
|
||||
if (test.run(qs::avx512::bmi2_quicksort)) {
|
||||
puts("OK");
|
||||
} else {
|
||||
puts("FAILED");
|
||||
ret = EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif // HAVE_AVX512F_INSTRUCTIONS
|
||||
|
||||
return ret;
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user