#include <immintrin.h>
#include <cstddef>
#include <cstdint>

// FORCE_INLINE is assumed to be defined elsewhere in the project;
// provide a portable fallback so the snippet compiles standalone.
#ifndef FORCE_INLINE
#define FORCE_INLINE inline
#endif

namespace qs {
namespace avx512 {

// Fill array[0..n-1] with the word w, 16 lanes at a time.
void memset_epi32(uint32_t* array, uint32_t w, size_t n) {
    const int N = 16;
    const __m512i word = _mm512_set1_epi32(w);
    for (size_t i = 0; i < n / N; i++) {
        _mm512_storeu_si512(array + i * N, word);
    }
    // Scalar tail for the last n % N elements.
    for (size_t i = n / N * N; i < n; i++) {
        array[i] = w;
    }
}

// Copy src[0..n-1] to dst[0..n-1], 16 lanes at a time.
void memcpy_epi32(uint32_t* dst, const uint32_t* src, size_t n) {
    const int N = 16;
    for (size_t i = 0; i < n / N; i++) {
        _mm512_storeu_si512(dst + i * N, _mm512_loadu_si512(src + i * N));
    }
    // Scalar tail for the last n % N elements.
    for (size_t i = n / N * N; i < n; i++) {
        dst[i] = src[i];
    }
}

// Three-way partition of array[0..n-1] around the pivot value pv.
// Values less than pv are compressed to the front of array in place;
// values greater than pv are staged in a static side buffer and
// copied back after the run of pivots. Requires n <= AUX_COUNT, and
// the static buffer makes the function non-reentrant.
uint32_t FORCE_INLINE partition_auxbuffer_epi32(uint32_t* array, size_t n, uint32_t pv) {
    const int N = 16;
    const int AUX_COUNT = 1024; // 4 kB
    // N lanes of slack: the compressed stores always write a full vector.
    static uint32_t gt_buf[AUX_COUNT + N];
    size_t lt_count = 0;
    size_t gt_count = 0;
    const __m512i pivot = _mm512_set1_epi32(pv);

    // 1. compress less-than values to the front of array and
    //    greater-than values into gt_buf. The in-place store is safe
    //    because lt_count never runs ahead of the read cursor i*N,
    //    and each chunk is loaded before the store of that iteration.
    for (size_t i = 0; i < n / N; i++) {
        const __m512i v = _mm512_loadu_si512(array + i * N);
        const __mmask16 lt = _mm512_cmplt_epi32_mask(v, pivot);
        const __mmask16 gt = _mm512_cmpgt_epi32_mask(v, pivot);
        const __m512i less = _mm512_maskz_compress_epi32(lt, v);
        const __m512i greater = _mm512_maskz_compress_epi32(gt, v);
        _mm512_storeu_si512(array + lt_count, less);
        _mm512_storeu_si512(gt_buf + gt_count, greater);
        lt_count += _mm_popcnt_u32(lt);
        gt_count += _mm_popcnt_u32(gt);
    }
    // Scalar tail for the last n % N elements.
    for (size_t i = 0; i < n % N; i++) {
        const uint32_t v = array[(n / N) * N + i];
        if (v < pv) {
            array[lt_count++] = v;
        } else if (v > pv) {
            gt_buf[gt_count++] = v;
        }
    }
    const size_t eq_count = n - (lt_count + gt_count);

    // 2. replace array with partially ordered data
    // 2.a. pivots
    memset_epi32(array + lt_count, pv, eq_count);
    // 2.b. all values greater than pivot
    memcpy_epi32(array + lt_count + eq_count, gt_buf, gt_count);

    // 3. return the index of the first pivot slot
    //    (== the count of values smaller than the pivot)
    return lt_count;
}

} // namespace avx512
} // namespace qs
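
// Usage sketch (an illustrative addition, not part of the library):
// a tiny harness that partitions a buffer of random words around a
// pivot and asserts the three-way layout [less | equal | greater].
// The element count, RNG seed, and pivot choice are assumptions made
// here for demonstration; n must not exceed AUX_COUNT (1024), the
// capacity of the static side buffer. Build for an AVX-512F target,
// e.g. with -mavx512f.
#include <cassert>
#include <cstdio>
#include <random>
#include <vector>

int main() {
    const size_t n = 1000; // must satisfy n <= AUX_COUNT
    std::vector<uint32_t> data(n);
    std::mt19937 rng(42);
    std::uniform_int_distribution<uint32_t> dist(0, 255); // narrow range forces duplicates
    for (uint32_t& v : data) v = dist(rng);

    const uint32_t pv = data[n / 2]; // illustrative pivot choice
    const uint32_t first_pivot = qs::avx512::partition_auxbuffer_epi32(data.data(), n, pv);

    // Check the partition invariant: a less-than prefix up to
    // first_pivot, then a run of pivots, then only greater values.
    size_t i = 0;
    for (; i < first_pivot; i++) assert(data[i] < pv);
    for (; i < n && data[i] == pv; i++) {}
    for (; i < n; i++) assert(data[i] > pv);

    std::printf("partitioned %zu words; %u below pivot %u\n", n, first_pivot, pv);
    return 0;
}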