#include "avx512-swap.cpp" #include "avx512-partition-register.cpp" #include "avx512-partition.cpp" #include "avx512-popcnt-partition.cpp" #include "avx512-bmi2-partition.cpp" #include "avx512-auxbuffer-partition.cpp" #include "avx512-sort-register.cpp" namespace qs { namespace avx512 { void dump(const char* name, const __m512i v) { uint32_t tmp[16]; printf("%-10s = [", name); _mm512_storeu_si512((__m512i*)tmp, v); for (int i=0; i < 16; i++) { if (i > 0) { putchar(' '); } printf("%08x", tmp[i]); } printf("]\n"); } void quicksort(uint32_t* array, int left, int right) { int i = left; int j = right; const int size = j - i + 1; if (size <= 16) { const __m512i in = _mm512_loadu_si512(array + i); __m512i result; switch (size) { case 16: result = avx512_sort16_epi32(in); break; case 15: result = avx512_sort15_epi32(in); break; case 14: result = avx512_sort14_epi32(in); break; case 13: result = avx512_sort13_epi32(in); break; case 12: result = avx512_sort12_epi32(in); break; case 11: result = avx512_sort11_epi32(in); break; case 10: result = avx512_sort10_epi32(in); break; case 9: result = avx512_sort9_epi32(in); break; case 8: result = avx512_sort8_epi32(in); break; case 7: result = avx512_sort7_epi32(in); break; case 6: result = avx512_sort6_epi32(in); break; case 5: result = avx512_sort5_epi32(in); break; case 4: result = avx512_sort4_epi32(in); break; case 3: result = avx512_sort3_epi32(in); break; case 2: { if (array[i] > array[j]) { uint32_t t = array[i]; array[i] = array[j]; array[j] = t; } return; } case 1: case 0: return; default: assert(false); break; } _mm512_storeu_si512(array + i, result); return; } const uint32_t pivot = array[left]; const int AVX512_REGISTER_SIZE = 16; // in 32-bit words if (size >= 2 * AVX512_REGISTER_SIZE) { ::qs::avx512::partition_epi32(array, pivot, i, j); } else { scalar_partition_epi32(array, pivot, i, j); } if (left < j) { quicksort(array, left, j); } if (i < right) { quicksort(array, i, right); } } void popcnt_quicksort(uint32_t* array, int left, int right) { int i = left; int j = right; const int size = j - i + 1; if (size <= 16) { const __m512i in = _mm512_loadu_si512(array + i); __m512i result; switch (size) { case 16: result = avx512_sort16_epi32(in); break; case 15: result = avx512_sort15_epi32(in); break; case 14: result = avx512_sort14_epi32(in); break; case 13: result = avx512_sort13_epi32(in); break; case 12: result = avx512_sort12_epi32(in); break; case 11: result = avx512_sort11_epi32(in); break; case 10: result = avx512_sort10_epi32(in); break; case 9: result = avx512_sort9_epi32(in); break; case 8: result = avx512_sort8_epi32(in); break; case 7: result = avx512_sort7_epi32(in); break; case 6: result = avx512_sort6_epi32(in); break; case 5: result = avx512_sort5_epi32(in); break; case 4: result = avx512_sort4_epi32(in); break; case 3: result = avx512_sort3_epi32(in); break; case 2: { if (array[i] > array[j]) { uint32_t t = array[i]; array[i] = array[j]; array[j] = t; } return; } case 1: case 0: return; default: assert(false); break; } _mm512_storeu_si512(array + i, result); return; } const uint32_t pivot = array[(i + j)/2]; const int AVX512_REGISTER_SIZE = 16; // in 32-bit words if (size >= 2 * AVX512_REGISTER_SIZE) { ::qs::avx512::popcnt_partition_epi32(array, pivot, i, j); } else { scalar_partition_epi32(array, pivot, i, j); } if (left < j) { popcnt_quicksort(array, left, j); } if (i < right) { popcnt_quicksort(array, i, right); } } void bmi2_quicksort(uint32_t* array, int left, int right) { int i = left; int j = right; const uint32_t pivot = array[left]; const int AVX512_REGISTER_SIZE = 16; // in 32-bit words if (j - i >= 2 * AVX512_REGISTER_SIZE) { ::qs::avx512::bmi2_partition_epi32(array, pivot, i, j); } else { scalar_partition_epi32(array, pivot, i, j); } if (left < j) { bmi2_quicksort(array, left, j); } if (i < right) { bmi2_quicksort(array, i, right); } } void auxbuffer_quicksort(uint32_t* array, int lo, int hi) { if (lo >= hi) { return; } int p; #if 0 p = lomuto_partition_epi32(array, lo, hi); #else const auto n = (hi - lo + 1); if (n > 1024) { p = lomuto_partition_epi32(array, lo, hi); } else if (n > 8) { const uint32_t pivot = array[(hi + lo)/2]; p = ::qs::avx512::partition_auxbuffer_epi32(array + lo, n, pivot) + lo; } else { p = lomuto_partition_epi32(array, lo, hi); } #endif auxbuffer_quicksort(array, lo, p-1); auxbuffer_quicksort(array, p+1, hi); } } // namespace avx512 } // namespace qs