212 lines
6.8 KiB
C++
212 lines
6.8 KiB
C++
#include "avx512-swap.cpp"
|
|
#include "avx512-partition-register.cpp"
|
|
#include "avx512-partition.cpp"
|
|
#include "avx512-popcnt-partition.cpp"
|
|
#include "avx512-bmi2-partition.cpp"
|
|
#include "avx512-auxbuffer-partition.cpp"
|
|
#include "avx512-sort-register.cpp"
|
|
|
|
|
|
namespace qs {
|
|
|
|
namespace avx512 {
|
|
|
|
|
|
void dump(const char* name, const __m512i v) {
|
|
uint32_t tmp[16];
|
|
|
|
printf("%-10s = [", name);
|
|
_mm512_storeu_si512((__m512i*)tmp, v);
|
|
for (int i=0; i < 16; i++) {
|
|
if (i > 0) {
|
|
putchar(' ');
|
|
}
|
|
|
|
printf("%08x", tmp[i]);
|
|
}
|
|
|
|
printf("]\n");
|
|
}
|
|
|
|
void quicksort(uint32_t* array, int left, int right) {
|
|
|
|
int i = left;
|
|
int j = right;
|
|
|
|
const int size = j - i + 1;
|
|
if (size <= 16) {
|
|
const __m512i in = _mm512_loadu_si512(array + i);
|
|
__m512i result;
|
|
|
|
switch (size) {
|
|
case 16: result = avx512_sort16_epi32(in); break;
|
|
case 15: result = avx512_sort15_epi32(in); break;
|
|
case 14: result = avx512_sort14_epi32(in); break;
|
|
case 13: result = avx512_sort13_epi32(in); break;
|
|
case 12: result = avx512_sort12_epi32(in); break;
|
|
case 11: result = avx512_sort11_epi32(in); break;
|
|
case 10: result = avx512_sort10_epi32(in); break;
|
|
case 9: result = avx512_sort9_epi32(in); break;
|
|
case 8: result = avx512_sort8_epi32(in); break;
|
|
case 7: result = avx512_sort7_epi32(in); break;
|
|
case 6: result = avx512_sort6_epi32(in); break;
|
|
case 5: result = avx512_sort5_epi32(in); break;
|
|
case 4: result = avx512_sort4_epi32(in); break;
|
|
case 3: result = avx512_sort3_epi32(in); break;
|
|
case 2: {
|
|
if (array[i] > array[j]) {
|
|
uint32_t t = array[i];
|
|
array[i] = array[j];
|
|
array[j] = t;
|
|
}
|
|
return;
|
|
}
|
|
|
|
case 1:
|
|
case 0:
|
|
return;
|
|
|
|
default:
|
|
assert(false);
|
|
break;
|
|
}
|
|
|
|
_mm512_storeu_si512(array + i, result);
|
|
return;
|
|
}
|
|
|
|
const uint32_t pivot = array[left];
|
|
const int AVX512_REGISTER_SIZE = 16; // in 32-bit words
|
|
if (size >= 2 * AVX512_REGISTER_SIZE) {
|
|
::qs::avx512::partition_epi32(array, pivot, i, j);
|
|
} else {
|
|
scalar_partition_epi32(array, pivot, i, j);
|
|
}
|
|
|
|
if (left < j) {
|
|
quicksort(array, left, j);
|
|
}
|
|
|
|
if (i < right) {
|
|
quicksort(array, i, right);
|
|
}
|
|
}
|
|
|
|
|
|
void popcnt_quicksort(uint32_t* array, int left, int right) {
|
|
|
|
int i = left;
|
|
int j = right;
|
|
const int size = j - i + 1;
|
|
if (size <= 16) {
|
|
const __m512i in = _mm512_loadu_si512(array + i);
|
|
__m512i result;
|
|
|
|
switch (size) {
|
|
case 16: result = avx512_sort16_epi32(in); break;
|
|
case 15: result = avx512_sort15_epi32(in); break;
|
|
case 14: result = avx512_sort14_epi32(in); break;
|
|
case 13: result = avx512_sort13_epi32(in); break;
|
|
case 12: result = avx512_sort12_epi32(in); break;
|
|
case 11: result = avx512_sort11_epi32(in); break;
|
|
case 10: result = avx512_sort10_epi32(in); break;
|
|
case 9: result = avx512_sort9_epi32(in); break;
|
|
case 8: result = avx512_sort8_epi32(in); break;
|
|
case 7: result = avx512_sort7_epi32(in); break;
|
|
case 6: result = avx512_sort6_epi32(in); break;
|
|
case 5: result = avx512_sort5_epi32(in); break;
|
|
case 4: result = avx512_sort4_epi32(in); break;
|
|
case 3: result = avx512_sort3_epi32(in); break;
|
|
case 2: {
|
|
if (array[i] > array[j]) {
|
|
uint32_t t = array[i];
|
|
array[i] = array[j];
|
|
array[j] = t;
|
|
}
|
|
return;
|
|
}
|
|
|
|
case 1:
|
|
case 0:
|
|
return;
|
|
|
|
default:
|
|
assert(false);
|
|
break;
|
|
}
|
|
|
|
_mm512_storeu_si512(array + i, result);
|
|
return;
|
|
}
|
|
|
|
const uint32_t pivot = array[(i + j)/2];
|
|
const int AVX512_REGISTER_SIZE = 16; // in 32-bit words
|
|
if (size >= 2 * AVX512_REGISTER_SIZE) {
|
|
::qs::avx512::popcnt_partition_epi32(array, pivot, i, j);
|
|
} else {
|
|
scalar_partition_epi32(array, pivot, i, j);
|
|
}
|
|
|
|
if (left < j) {
|
|
popcnt_quicksort(array, left, j);
|
|
}
|
|
|
|
if (i < right) {
|
|
popcnt_quicksort(array, i, right);
|
|
}
|
|
}
|
|
|
|
|
|
void bmi2_quicksort(uint32_t* array, int left, int right) {
|
|
|
|
int i = left;
|
|
int j = right;
|
|
|
|
const uint32_t pivot = array[left];
|
|
const int AVX512_REGISTER_SIZE = 16; // in 32-bit words
|
|
|
|
if (j - i >= 2 * AVX512_REGISTER_SIZE) {
|
|
::qs::avx512::bmi2_partition_epi32(array, pivot, i, j);
|
|
} else {
|
|
scalar_partition_epi32(array, pivot, i, j);
|
|
}
|
|
|
|
if (left < j) {
|
|
bmi2_quicksort(array, left, j);
|
|
}
|
|
|
|
if (i < right) {
|
|
bmi2_quicksort(array, i, right);
|
|
}
|
|
}
|
|
|
|
|
|
void auxbuffer_quicksort(uint32_t* array, int lo, int hi) {
|
|
|
|
if (lo >= hi) {
|
|
return;
|
|
}
|
|
|
|
int p;
|
|
#if 0
|
|
p = lomuto_partition_epi32(array, lo, hi);
|
|
#else
|
|
const auto n = (hi - lo + 1);
|
|
if (n > 1024) {
|
|
p = lomuto_partition_epi32(array, lo, hi);
|
|
} else if (n > 8) {
|
|
const uint32_t pivot = array[(hi + lo)/2];
|
|
p = ::qs::avx512::partition_auxbuffer_epi32(array + lo, n, pivot) + lo;
|
|
} else {
|
|
p = lomuto_partition_epi32(array, lo, hi);
|
|
}
|
|
#endif
|
|
auxbuffer_quicksort(array, lo, p-1);
|
|
auxbuffer_quicksort(array, p+1, hi);
|
|
}
|
|
|
|
} // namespace avx512
|
|
|
|
} // namespace qs
|
|
|