91 lines
2.6 KiB
C++
91 lines
2.6 KiB
C++
|
|
namespace qs {
|
||
|
|
|
||
|
|
namespace avx512 {
|
||
|
|
|
||
|
|
void memset_epi32(uint32_t* array, uint32_t w, size_t n) {
|
||
|
|
|
||
|
|
const int N = 16;
|
||
|
|
const __m512i word = _mm512_set1_epi32(w);
|
||
|
|
|
||
|
|
for (size_t i=0; i < n/N; i++) {
|
||
|
|
_mm512_storeu_si512(array + i*N, word);
|
||
|
|
}
|
||
|
|
|
||
|
|
for (size_t i=n/N * N; i < n; i++) {
|
||
|
|
array[i] = w;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
void memcpy_epi32(uint32_t* dst, uint32_t* src, size_t n) {
|
||
|
|
|
||
|
|
const int N = 16;
|
||
|
|
|
||
|
|
for (size_t i=0; i < n/N; i++) {
|
||
|
|
_mm512_storeu_si512(dst + i*N, _mm512_loadu_si512(src + i*N));
|
||
|
|
}
|
||
|
|
|
||
|
|
for (size_t i=n/N * N; i < n; i++) {
|
||
|
|
dst[i] = src[i];
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// parition array[0..n-1]
|
||
|
|
uint32_t FORCE_INLINE partition_auxbuffer_epi32(uint32_t* array, size_t n, uint32_t pv) {
|
||
|
|
|
||
|
|
const int N = 16;
|
||
|
|
const int AUX_COUNT = 1024; // 4kB
|
||
|
|
|
||
|
|
static uint32_t gt_buf[AUX_COUNT + N];
|
||
|
|
|
||
|
|
size_t lt_count = 0;
|
||
|
|
size_t gt_count = 0;
|
||
|
|
|
||
|
|
const __m512i pivot = _mm512_set1_epi32(pv);
|
||
|
|
|
||
|
|
// 1. copy greater and less values into separate buffers
|
||
|
|
for (size_t i=0; i < n / N; i++) {
|
||
|
|
|
||
|
|
const __m512i v = _mm512_loadu_si512(array + i*N);
|
||
|
|
|
||
|
|
const __mmask16 lt = _mm512_cmplt_epi32_mask(v, pivot);
|
||
|
|
const __mmask16 gt = _mm512_cmpgt_epi32_mask(v, pivot);
|
||
|
|
|
||
|
|
const __m512i less = _mm512_maskz_compress_epi32(lt, v);
|
||
|
|
const __m512i greater = _mm512_maskz_compress_epi32(gt, v);
|
||
|
|
|
||
|
|
_mm512_storeu_si512(array + lt_count, less);
|
||
|
|
_mm512_storeu_si512(gt_buf + gt_count, greater);
|
||
|
|
|
||
|
|
lt_count += _mm_popcnt_u32(lt);
|
||
|
|
gt_count += _mm_popcnt_u32(gt);
|
||
|
|
}
|
||
|
|
|
||
|
|
for (size_t i=0; i < n % N; i++) {
|
||
|
|
|
||
|
|
const uint32_t v = array[(n/N) * N + i];
|
||
|
|
|
||
|
|
if (v < pv) {
|
||
|
|
array[lt_count++] = v;
|
||
|
|
} else if (v > pv) {
|
||
|
|
gt_buf[gt_count++] = v;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
const size_t eq_count = n - (lt_count + gt_count);
|
||
|
|
|
||
|
|
// 2. replace array with partially ordered data
|
||
|
|
|
||
|
|
// 2.a. pivots
|
||
|
|
memset_epi32(array + lt_count, pv, eq_count);
|
||
|
|
|
||
|
|
// 2.b. all values greater than pivot
|
||
|
|
memcpy_epi32(array + lt_count + eq_count, gt_buf, gt_count);
|
||
|
|
|
||
|
|
// 3. index before the first pivot
|
||
|
|
return lt_count;
|
||
|
|
}
|
||
|
|
|
||
|
|
} // namespace avx512
|
||
|
|
|
||
|
|
} // namespace qs
|