Merge branch 'tmp' into ilp-radix-1

2021-12-19 22:53:09 +01:00 · 2021-12-19 22:53:09 +01:00 · d858f39708
commit d858f39708
parent efa2c7bc26 c77e592a84
2 changed files with 76 additions and 15 deletions
--- a/magyarsort.h
+++ b/magyarsort.h
@ -23,6 +23,10 @@
 #include <vector>
 #include <algorithm> // std::swap
 #ifndef NO_MLOCK
 #include <sys/mman.h> // mlock & munlock
 #endif // !NO_MLOCK
 namespace MagyarSort {
 	/* CONFIG */
@ -111,7 +115,7 @@ namespace MagyarSort {
 		//#pragma GCC unroll 4
 		for(; i < size - 64; i += 64) {
 			// Prefetch for read level-1 cache
-			//__builtin_prefetch(&arr[i + (1 * 16)], 0/*r*/, 2/*L2 or L3 cache likely*/);
+			//__builtin_prefetch(&arr[i + (1 * 16)], 0, 2); // r, L2 or L3 cache
 			__builtin_prefetch(&arr[i + (1 * 16)]);
 			// Creates no object, struct is empty
 			OccurenceMagic<DIGITS - 1, COUNTER_TYP>(arr, i, radicsOut);
@ -290,16 +294,16 @@ namespace MagyarSort {
 	 *
 	 * Rem.: If you use the VectorGiverWithReuse, please remember to Gc() it from time to time!
 	 *
-	 * Beware: GC needs to happen on all threads that use us!
+	 * Beware: GC needs to happen on all threads that use us if you want to GC!
 	 *
 	 * @param arr The array to sort. Result will be in the same array - as sorted.
 	 * @param size The length of the array - should fit in the COUNTER_TYP.
-	 * @param COUNTER_TYP OPTIONAL: When set this type will be the counter type.
+	 * @param COUNTER_TYP OPTIONAL: When set this type will be the counter type. For most cases uint32_t is enough.
 	 * @param REUSE OPTIONAL: When true, we reuse the array instead of always gettin' and releasin' from da heap.
 	 * @param GC OPTIONAL: When true, we garbage collect memory from previous sorts if REUSE is true.
 	 * @param GC_WITHOUT_SORT OPTIONAL: When true, we "just GC" but do not sort in case of GC is true.
 	 */
-	template<typename COUNTER_TYP = size_t, bool REUSE = false, bool GC = false, bool GC_WITHOUT_SORT = false>
+	template<typename COUNTER_TYP = uint32_t, bool REUSE = false, bool GC = false, bool GC_WITHOUT_SORT = false>
 	inline void __attribute__((always_inline)) sort_impl(uint32_t arr[], COUNTER_TYP size) noexcept {
 		// Most funny optimization is this multiply here :-)
 		//
@ -308,6 +312,10 @@ namespace MagyarSort {
 		// optimize the first call for sort when we REUSE the array so size is fine!
 		static thread_local std::vector<uint32_t> arc(size * REUSE);
 #ifndef NO_MLOCK
    		mlock(arr, size * sizeof(uint32_t));
 #endif // !NO_MLOCK
 		// "Garbage-collection"
 		if(GC) {
 			arc = std::vector<uint32_t>();
@ -321,6 +329,9 @@ namespace MagyarSort {
 		// Holds "digit" occurences, prefix sums, whatevers
 		// First "DIGIT_RANGE" elem is for MSB "DIGITS", last is for LSB
 		static thread_local COUNTER_TYP radics[DIGITS * DIGIT_RANGE];
 #ifndef NO_MLOCK
    		mlock(radics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP));
 #endif // !NO_MLOCK
 		// Write prefetchin'
 		//__builtin_prefetch(&radicsOut[..], 1);
 		PrefetchMagic<DIGITS * DIGIT_RANGE, (64/sizeof(COUNTER_TYP)), COUNTER_TYP, 1/*w*/> pm(radics);
@ -346,9 +357,6 @@ namespace MagyarSort {
 		// Regular radix sort needs a copy, see: https://www.youtube.com/watch?v=ujb2CIWE8zY
 		// But instead of the below, we do a trickery...
 		//
 		//std::vector<uint32_t> arc(size);
 		//auto arc = VectorGiver::Give(size); // "auto" is needed for this to perform well with some givers!
 		//
 		// Rem.: The branch is optimized out in compile time!
 		if(REUSE) {
 			arc.resize(size);
@ -357,6 +365,9 @@ namespace MagyarSort {
 			// We must regain memory of previous!
 			arc = std::move(std::vector<uint32_t>(size));
 		}
 #ifndef NO_MLOCK
    		mlock(&arc[0], size * sizeof(uint32_t));
 #endif // !NO_MLOCK
 		uint32_t *from = arr;
 		uint32_t *to = &arc[0];
@ -371,6 +382,11 @@ namespace MagyarSort {
 		if(swapped) { // <- in reality this is what we want because of last swap happened anyways!
 			memcpy(arr, to, size);
 		}
 #ifndef NO_MLOCK
    		munlock(radics, (DIGITS * DIGIT_RANGE) * sizeof(COUNTER_TYP));
 		munlock(&arc[0], size * sizeof(uint32_t));
    		munlock(arr, size * sizeof(uint32_t));
 #endif // !NO_MLOCK
 	}
 	/**
@ -425,10 +441,10 @@ namespace MagyarSort {
 	 *
 	 * @param arr The array to sort. Result will be in the same array - as sorted.
 	 * @param size The length of the array.
-	 * @param COUNTER_TYP OPTIONAL: It is best kepts as size_t, but for smaller arrays, you can use uint32_t.
+	 * @param COUNTER_TYP: Should be size_t for HUGE arrays, but for regular arrays you can use uint32_t. It is deduced automatically from the type of size.
 	 */
-	template<typename COUNTER_TYP = size_t>
+	template<typename COUNTER_TYP>
-	inline void __attribute__((always_inline)) sort_no_reuse(uint32_t arr[], size_t size) noexcept {
+	inline void __attribute__((always_inline)) sort_no_reuse(uint32_t arr[], COUNTER_TYP size) noexcept {
 		// We use the heap once per every call...
 		// This is safer and we do not need garbage collecting
 		MagyarSort::sort_impl<COUNTER_TYP>(arr, size);
@ -446,10 +462,10 @@ namespace MagyarSort {
 	 *
 	 * @param arr The array to sort. Result will be in the same array - as sorted.
 	 * @param size The length of the array.
-	 * @param COUNTER_TYP OPTIONAL: It is best kepts as size_t, but for smaller arrays, you can use uint32_t.
+	 * @param COUNTER_TYP: Should be size_t for HUGE arrays, but for regular arrays you can use uint32_t. It is deduced automatically from the type of size.
 	 */
-	template<typename COUNTER_TYP = size_t>
+	template<typename COUNTER_TYP>
-	inline void sort(uint32_t arr[], size_t size) noexcept {
+	inline void sort(uint32_t arr[], COUNTER_TYP size) noexcept {
 #ifdef MAGYAR_SORT_DEFAULT_REUSE
 		MagyarSort::sort_reuse<COUNTER_TYP>(arr, size);
 #else
--- a/ypsu.cpp
+++ b/ypsu.cpp
@ -11,6 +11,7 @@
  #include <string>
  #include <vector>
  #include <numeric>
  #include <sys/mman.h> // mlock & munlock
  #include "ska_sort.hpp"
@ -260,6 +261,47 @@
    free(buf);
  }
  // frewr - four rewrites.
  //
  // Sorts arr[0..n) ascending with a least-significant-byte-first radix sort.
  // One counting pass builds the histograms of all four bytes at once, then
  // four scatter passes ping-pong the data between arr and a malloc'ed
  // scratch buffer. Every scatter stores the value rotated right by 8 bits,
  // so the next pass always finds its digit in the low byte; after four
  // rotations each value is back to its original bit pattern, and because the
  // number of passes is even the sorted data ends up in arr again.
  //
  // arr: values to sort in place.
  // n:   number of elements in arr; fewer than two elements is a no-op.
  //
  // Terminates the process via abort() if the scratch buffer cannot be
  // allocated.
  void frewr(uint32_t *arr, int n) {
    // Nothing to reorder (this also keeps a non-positive n away from malloc).
    if (n < 2) return;
    // Compute the byte count in size_t: `n * 4` evaluated in int overflows
    // once n reaches 2^29.
    const size_t nbytes = static_cast<size_t>(n) * sizeof(uint32_t);
    uint32_t *tmpbuf = static_cast<uint32_t *>(malloc(nbytes));
    if (tmpbuf == nullptr) {
      // Without scratch space the scatter passes would write through null.
      perror("frewr: malloc");
      abort();
    }
    // Best effort only: the return value is deliberately ignored.
    mlock(tmpbuf, nbytes);
    // btoffsets[p] first holds the histogram of the byte consumed by pass p
    // (p == 3 is the lowest byte, p == 0 the highest).
    int btoffsets[4][256] = {};
    #pragma GCC unroll 64
    for (int i = n - 1; i >= 0; i--) {
      uint32_t a = arr[i];
      btoffsets[3][a & 0xff]++;
      btoffsets[2][a >> 8 & 0xff]++;
      btoffsets[1][a >> 16 & 0xff]++;
      btoffsets[0][a >> 24 & 0xff]++;
    }
    // Turn each histogram into "index of the last slot of bucket i" by
    // walking the buckets from the top of the output down.
    int btend[4] = {n - 1, n - 1, n - 1, n - 1};
    #pragma GCC unroll 16
    for (int i = 255; i >= 0; i--) {
    #pragma GCC unroll 4
      for (int pass = 3; pass >= 0; pass--) {
        int nbtend = btend[pass] - btoffsets[pass][i];
        btoffsets[pass][i] = btend[pass];
        btend[pass] = nbtend;
      }
    }
    uint32_t *src = arr, *dst = tmpbuf;
    #pragma GCC unroll 4
    for (int pass = 3; pass >= 0; pass--) {
      int *off = btoffsets[pass];
      // Walk the input backwards and fill every bucket from its end
      // downwards, which keeps equal digits in their previous order.
      #pragma GCC unroll 64
      for (int i = n - 1; i >= 0; i--) {
        uint32_t v = src[i];
        // Store rotated right by one byte so the next digit becomes the low byte.
        dst[off[v & 0xff]--] = v >> 8 | v << 24;
        // Cache hint for a slot this bucket will write shortly; never dereferenced.
        __builtin_prefetch(&dst[off[v & 0xff] - 2]);
      }
      uint32_t *tmp = src;
      src = dst;
      dst = tmp;
    }
    munlock(tmpbuf, nbytes);
    free(tmpbuf);
  }
  void vsort(uint32_t *a, int n) {
    thread_local std::vector<uint32_t> bts[256];
    #pragma GCC unroll 4
@ -349,8 +391,8 @@
  }
  int main(void) {
-    //int n = 100000000;
+    int n = 100000000;
-    int n = 40000000;
+    //int n = 10000000;
    for (auto inputtype : inputtypes) {
      printf("%10s", inputtype.c_str());
      fflush(stdout);
@ -390,6 +432,9 @@
      w = v;
      measure(inputtype, "4rot", [&] { fourrots(&w[0], w.size()); });
      assert(w == expected);
      w = v;
      measure(inputtype, "frewr", [&] { frewr(&w[0], w.size()); });
      assert(w == expected);
      /*
      w = v;
      measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); });