From 22d6631e246d9de41f6be881b60fbe58abee978f Mon Sep 17 00:00:00 2001
From: Richard Thier
Date: Sun, 9 Apr 2023 17:20:58 +0200
Subject: [PATCH] added thiersort idea, measure magyar_bucket 1&2

---
 gptsort.h   |  73 ++++++++++++++--
 thiersort.h | 236 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 ypsu.cpp    |   8 +-
 3 files changed, 307 insertions(+), 10 deletions(-)
 create mode 100644 thiersort.h

diff --git a/gptsort.h b/gptsort.h
index d969f58..c70bae1 100644
--- a/gptsort.h
+++ b/gptsort.h
@@ -1,6 +1,7 @@
 #include <cmath>
 #include <vector>
 #include <algorithm>
+#include "magyarsort.h"
 
 // ChatGPT and I did this space partitioning bucket sort
 void gpt_bucket_sort(uint32_t* array, int n) {
@@ -39,13 +40,14 @@ void gpt_bucket_sort(uint32_t* array, int n) {
 }
 
 // Further optimizations (no chatGPT)
-void my_bucket_sort(uint32_t* array, int n) {
+void magyar_bucket_sort(uint32_t* array, int n) {
     // Calculate the number of buckets to use
     int num_buckets = std::sqrt(n);
 
     // Create a vector of buckets
     std::vector<std::vector<uint32_t>> buckets(num_buckets);
 
+    // O(n)
     // Calculate the range of values that each bucket can hold
     auto mm = std::minmax_element(array, array + n);
     uint32_t min_value = *mm.first;
@@ -54,23 +56,80 @@ void my_bucket_sort(uint32_t* array, int n) {
     uint32_t bucket_size = range / num_buckets + 1;
 
     // Distribute the elements of the array into the buckets
-    for (int i = 0; i < n; i++) {
+    for (int i = 0; i < n; ++i) {
         // Calculate the bucket index for this element
         // using the range of values and the bucket size as the divisor
        int bucket_index = (array[i] - min_value) / bucket_size;
        buckets[bucket_index].push_back(array[i]);
     }
 
+    // sqrt(n) buckets * O(sqrt(n)*log(sqrt(n))) each = O(n*log(sqrt(n))) for std::sort; linear for magyarsort, but with less memory use per bucket!
     // Sort the elements in each bucket using std::sort
-    for (int i = 0; i < num_buckets; i++) {
-        std::sort(buckets[i].begin(), buckets[i].end());
+    for (int i = 0; i < num_buckets; ++i) {
+        if(buckets[i].size() >= 96) { // what to choose here is pretty arbitrary
+            MagyarSort::sort(&(buckets[i][0]), buckets[i].size());
+        } else {
+            std::sort(buckets[i].begin(), buckets[i].end());
+        }
     }
 
     // Concatenate the buckets to get the sorted array
     int k = 0;
-    for (int i = 0; i < num_buckets; i++) {
-        for (int j = 0; j < buckets[i].size(); j++) {
-            array[k++] = buckets[i][j];
+    for (int i = 0; i < num_buckets; ++i) {
+        for (int j = 0; j < buckets[i].size(); ++j) {
+            array[k++] = buckets[i][j];
         }
     }
 }
+
+/** Simplified magyarbucket: fixed, power-of-two bucket size */
+void magyar_bucket_sort2(uint32_t* array, int n) {
+    // ensure the bucket size is a power of two (POT)
+    int bucketSize = 65536;
+
+    // O(n)
+    // Calculate the range of values that each bucket can hold
+    auto mm = std::minmax_element(array, array + n);
+    uint32_t min = *mm.first;
+    uint32_t max = *mm.second;
+    uint32_t range = max - min + 1;
+
+    // Calculate the number of buckets from the bucket size:
+    // the largest bucket index is (range - 1) / bucketSize, so
+    // numBuckets = range / bucketSize + 1 buckets are always enough.
+    uint32_t numBuckets = range / bucketSize + 1;
+
+    // Create a vector of buckets
+    std::vector<std::vector<uint32_t>> buckets(numBuckets);
+
+
+    // Distribute the elements of the array into the buckets
+    for (int i = 0; i < n; ++i) {
+        // Calculate the bucket index for this element
+        // using the range of values and the bucket size as the divisor
+        int bucket_index = (array[i] - min) / bucketSize; // likely a bit shift, bucketSize being POT
+        buckets[bucket_index].push_back(array[i]);
+    }
+
+    // each bucket holds ~n/numBuckets elements: O(n*log(n/numBuckets)) total for std::sort; linear for magyarsort, but with less memory use per bucket!
+    // Sort the elements in each bucket using std::sort
+    for (int i = 0; i < numBuckets; ++i) {
+        if(buckets[i].size() >= 96) { // what to choose here is pretty arbitrary
+            MagyarSort::sort(&(buckets[i][0]), buckets[i].size());
+        } else {
+            std::sort(buckets[i].begin(), buckets[i].end());
+        }
+    }
+
+    // Concatenate the buckets to get the sorted array
+    int k = 0;
+    for (int i = 0; i < numBuckets; ++i) {
+        for (int j = 0; j < buckets[i].size(); ++j) {
+            array[k++] = buckets[i][j];
+        }
+    }
+}
diff --git a/thiersort.h b/thiersort.h
new file mode 100644
index 0000000..59c2f02
--- /dev/null
+++ b/thiersort.h
@@ -0,0 +1,236 @@
+#ifndef THIER_SORT_H
+#define THIER_SORT_H
+/*
+ * This sort alg. is a two-step bucket sort algorithm.
+ * - Step 1 creates top-down pre-sorted buckets with a float radix (allocates)
+ * - Step 2 uses M quicksort algorithms with source-to-destination backwrite.
+ *
+ * What is "float radix" sort?
+ * - We convert each integer key to a 32 bit float value
+ * - We take that and create an 8 bit ((*): or 16 bit) approximator
+ * - This is what we radix sort on as if it were an 8 bit char (**)
+ * - In both cases, those are just right-shifted floats (LSB bits lost;
+ *   a scalar sketch of this bucketing step is given after the patch)
+ *
+ * For calculating occurrence counts, we need a pass over the whole data set.
+ * Then we need another pass - into a new array - to do the radix step, or
+ * if we do the 16 bit float variant we need two such passes (new + old array).
+ *
+ * For random data, doing fewer passes is likely best, so we do the
+ * 8 bit version as the first implementation; this leaves more data in the
+ * buckets for the "Step 2" quicksort part - which is not std::sort or
+ * something, but a variant we write that puts the data back into the original
+ * array so no back-copy will be necessary. For the (*) variant, with
+ * 16 bit floats, we do not need the back-copy and can quicksort in place!
+ *
+ * To avoid costly float conversions, we can do SSE2 here (or neon)!
+ *
+ * These are useful for us (see the SSE2 sketch after the patch):
+ *
+ * __m128i _mm_load_si128 (__m128i const* mem_addr)
+ * #include <emmintrin.h>
+ * Instruction: movdqa xmm, m128 [Latency: 6, Throughput: 0.33 - 0.5]
+ * CPUID Flags: SSE2
+ * Description
+ * Load 128-bits of integer data from memory into dst. mem_addr must be
+ * aligned on a 16-byte boundary or a general-protection exception may be
+ * generated.
+ *
+ * __m128i _mm_add_epi8 (__m128i a, __m128i b)
+ * #include <emmintrin.h>
+ * Instruction: paddb xmm, xmm [Latency: 1, Throughput: 0.33]
+ * CPUID Flags: SSE2
+ * Description
+ * Add packed 8-bit integers in a and b, and store the results in dst.
+ *
+ * __m128i _mm_and_si128 (__m128i a, __m128i b)
+ * #include <emmintrin.h>
+ * Instruction: pand xmm, xmm [Latency: 1, Throughput: 0.33]
+ * CPUID Flags: SSE2
+ * Description
+ * Compute the bitwise AND of 128 bits (representing integer data)
+ * in a and b, and store the result in dst.
+ *
+ * __m128i _mm_set_epi8 (
+ *     char e15, char e14, char e13, char e12,
+ *     char e11, char e10, char e9, char e8,
+ *     char e7, char e6, char e5, char e4,
+ *     char e3, char e2, char e1, char e0)
+ * #include <emmintrin.h>
+ * Instruction: Sequence [XXX: slow - best outside loop]
+ * CPUID Flags: SSE2
+ * Description
+ * Set packed 8-bit integers in dst with the supplied values.
+ *
+ * __m128 _mm_cvtepi32_ps (__m128i a)
+ * #include <emmintrin.h>
+ * Instruction: cvtdq2ps xmm, xmm [Latency: 4, Throughput: 0.5]
+ * CPUID Flags: SSE2
+ * Description
+ * Convert packed signed 32-bit integers in a to packed single-precision
+ * (32-bit) floating-point elements, and store the results in dst.
+ *
+ * __m128i _mm_castps_si128 (__m128 a)
+ * #include <emmintrin.h>
+ * CPUID Flags: SSE2
+ * Description
+ * Cast vector of type __m128 to type __m128i. This intrinsic is only
+ * used for compilation and does not generate any instructions, thus it
+ * has zero latency.
+ *
+ * void _mm_store_si128 (__m128i* mem_addr, __m128i a)
+ * #include <emmintrin.h>
+ * Instruction: movdqa m128, xmm [Latency: 1 - 5, Throughput: 0.5 - 1]
+ * CPUID Flags: SSE2
+ * Description
+ * Store 128-bits of integer data from a into memory. mem_addr must be
+ * aligned on a 16-byte boundary or a general-protection exception may
+ * be generated.
+ *
+ * void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_addr)
+ * #include <emmintrin.h>
+ * Instruction: maskmovdqu xmm, xmm [Latency: 6, Throughput: 1 - XXX: NT!]
+ * CPUID Flags: SSE2
+ * Description
+ * Conditionally store 8-bit integer elements from a into memory using
+ * mask (elements are not stored when the highest bit is not set in the
+ * corresponding element) and a non-temporal memory hint. mem_addr does
+ * not need to be aligned on any particular boundary.
+ *
+ * int _mm_extract_epi16 (__m128i a, int imm8)
+ * #include <emmintrin.h>
+ * Instruction: pextrw r32, xmm, imm8 [Latency: 4, Throughput: 1]
+ * CPUID Flags: SSE2
+ * Description
+ * Extract a 16-bit integer from a, selected with imm8,
+ * and store the result in the lower element of dst.
+ * Operation
+ * dst[15:0] := (a[127:0] >> (imm8[2:0] * 16))[15:0]
+ * dst[31:16] := 0
+ *
+ * void _mm_store_si128 (__m128i* mem_addr, __m128i a)
+ * #include <emmintrin.h>
+ * Instruction: movdqa m128, xmm [Latency: 1 - 5, Throughput: 0.5 - 1]
+ * CPUID Flags: SSE2
+ * Description
+ * Store 128-bits of integer data from a into memory. mem_addr must be
+ * aligned on a 16-byte boundary or a general-protection exception
+ * may be generated.
+ *
+ * __m128i _mm_srli_epi32 (__m128i a, int imm8)
+ * #include <emmintrin.h>
+ * Instruction: psrld xmm, imm8 [Latency: 1, Throughput: 0.5]
+ * CPUID Flags: SSE2
+ * Description
+ * Shift packed 32-bit integers in a right by imm8 while shifting in
+ * zeros, and store the results in dst.
+ *
+ * See also:
+ * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
+ * https://www.laruence.com/x86/MASKMOVDQU.html
+ * https://blogs.fau.de/hager/archives/2103
+ *
+ * For ARM / Neon:
+ * https://github.com/DLTcollab/sse2neon
+ *
+ * Data layout:
+ *
+ * [int32_t key][int32_t i][int32_t key][int32_t i]...
+ *
+ * That is, we extract keys and "pointer"-like offsets into an
+ * array of the "real" elements. So you sort "short pointers".
+ *
+ * We might create a version that just sorts bare integers,
+ * but that is only good to show off speed compared to other algorithms.
+ *
+ * Main alg:
+ * - We first process data to get 16 byte alignment (maybe 64 byte align?)
+ * - We process the input in 64 bytes / loop with 3x SSE 128 bit reads + 1x normal
+ * - We process the remaining part of the array
+ *
+ * The middle part should be skipped if there are fewer than 64 bytes this way...
+ *
+ * (pre-loop)
+ * pat1 = set(1,1,1,1,0,0,0,0,1,1,1,1...)
+ * pat2 = set(0,0,0,0,1,1,1,1,0,0,0,0...)
+ *
+ * (loop)
+ * a = LOAD4int[+0]
+ * b = LOAD4int[+4]
+ * c = LOAD4int[+8]
+ * <+d some int code>
+ *
+ * pata1 = a & pat1;
+ * pata2 = a & pat2;
+ * patb1 = b & pat1;
+ * patb2 = b & pat2;
+ * patc1 = c & pat1;
+ * patc2 = c & pat2;
+ * <+d some int code>
+ *
+ * rsa = pata1 >> 24;
+ * rsb = patb1 >> 24;
+ * rsc = patc1 >> 24;
+ * <+d some int code>
+ *
+ * resa = rsa + pata2;
+ * resb = rsb + patb2;
+ * resc = rsc + patc2;
+ * <+d some int code>
+ *
+ * store(resa, resb, resc) -> tmp
+ *
+ *
+ *
+ * The tmp should be automatically 16 byte aligned, being allocated on the stack!
+ *
+ * The integer code that processes tmp depends on what phase we are in: in
+ * the phase where we count occurrences, we do not need the pat[x]2 parts!
+ * However, the reorganize-phase passes need them to do the copying around!
+ *
+ * This is how we do 64 bytes / iteration and it should pipeline well!
+ *
+ * However the "integer" code that processes tmp is the bottleneck, because
+ * that part has to process 16 counts when occurrence counting, likely
+ * with a 4-wide pipeline in (at least) 4 steps, and more in the other
+ * phase where we are not counting but moving elements where they belong.
+ * XXX: Now that I think about it more... the pata2-like values are not needed
+ * and only cause issues? I mean we could (should) copy the key/index directly
+ * from the source!
+ *
+ * In the 8 bit float version, I guess we only have bits from the exponent..
+ * In the 16 bit version, with an extra pass, we have a few bits from the mantissa..
+ *
+ * I still think that in the random case the fewer passes of the 8 bit float
+ * version are worth it!
+ *
+ * String sorting:
+ * - First create the [key,index] 64 bit data for the strings to sort.
+ * - The key should be the first 4 characters (padded with zeroes if there
+ *   are not enough characters) - see the key-packing sketch after the patch.
+ * - Do the sorting, with a comparator using both the first 4 chars AND strcmp.
+ *
+ * ^^I actually think this gives a pretty fast string comparison because it
+ * does give a lot of high integer values even for short strings (8 bit works)!
+ *
+ * XXX: We can do approximately 2x (1.5x?) the speed for an integer-only sort!
+ *
+ * Unsigned/signed:
+ * - Custom trickery is needed when processing tmp (both occurrence and move)
+ * - It should assign a different bucket based on the topmost bit for the sign modes!
+ * - Modes: msb_early, msb_late
+ * - These two handle: integer, unsigned int, float32
+ *
+ * XXX: So basically a good sorting algo for float-keyed sorting!
+ *
+ * Number of passes (8 bit): 3.5
+ * - 1x Occurrence counting pass
+ * - 1x 8 bit float radix pass
+ * - 1.5x quicksort pass [more elements]
+ *
+ * Number of passes (16 bit): 4.2
+ * - 1x Occurrence counting pass
+ * - 2x 8 bit float radix pass
+ * - 1.2x quicksort pass [fewer elements]
+ *
+ * Rem.: Basically these are regular, pipelined radix passes, with only
+ * the float conversions AND shifts vectorized with SSE somewhat...
+ */
+
+#endif /* THIER_SORT_H */
diff --git a/ypsu.cpp b/ypsu.cpp
index 31e13d5..d070adf 100644
--- a/ypsu.cpp
+++ b/ypsu.cpp
@@ -415,8 +415,8 @@ void measure_single(int n) {
 }
 
 int main(void) {
-  //int n = 100000000;
-  int n = 10000000;
+  int n = 100000000;
+  //int n = 10000000;
   //int n = 100; // Uncomment this for profiling and alg!
@@ -467,7 +467,9 @@ int main(void) {
    w = v;*/
    measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); });
    assert(w == expected);
-   measure(inputtype, "mybuck", [&] { my_bucket_sort(&w[0], w.size()); });
+   measure(inputtype, "magbuck", [&] { magyar_bucket_sort(&w[0], w.size()); });
+   assert(w == expected);
+   measure(inputtype, "magbuck2", [&] { magyar_bucket_sort2(&w[0], w.size()); });
    assert(w == expected);
    /*
    w = v;
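
What follows is a minimal, standalone scalar sketch of the 8 bit "float radix" bucketing step
described in the thiersort.h notes above (no SSE2, no [key,index] layout). It is not the planned
implementation: the names float8_key and thier_sort_scalar_sketch are made up for illustration,
std::sort stands in for the back-writing quicksort pass of Step 2, and the msb_early/msb_late
sign handling is sidestepped by converting the unsigned key directly.

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <vector>

// 8 bit approximator: the top 8 bits of the key's float representation.
// For non-negative keys the float bit pattern is monotone in the value,
// so the approximator is order preserving (just very coarse).
static inline uint32_t float8_key(uint32_t key) {
    float f = static_cast<float>(key);      // unsigned conversion, no sign issues here
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));   // reinterpret the float's bit pattern
    return bits >> 24;                      // sign bit + top 7 exponent bits
}

void thier_sort_scalar_sketch(uint32_t* array, int n) {
    const int kBuckets = 256;

    // Pass 1: occurrence counting on the approximator.
    std::vector<int> count(kBuckets, 0);
    for (int i = 0; i < n; ++i) count[float8_key(array[i])]++;

    // Exclusive prefix sums give each bucket's start offset.
    std::vector<int> offset(kBuckets, 0);
    for (int b = 1; b < kBuckets; ++b) offset[b] = offset[b - 1] + count[b - 1];

    // Pass 2: radix scatter into a new array (the "allocates" step).
    std::vector<uint32_t> tmp(n);
    std::vector<int> cursor = offset;
    for (int i = 0; i < n; ++i) tmp[cursor[float8_key(array[i])]++] = array[i];

    // Step 2 stand-in: buckets are already ordered relative to each other,
    // so finishing each bucket independently finishes the whole array.
    for (int b = 0; b < kBuckets; ++b)
        std::sort(tmp.begin() + offset[b], tmp.begin() + offset[b] + count[b]);

    std::memcpy(array, tmp.data(), (size_t)n * sizeof(uint32_t));
}

Because the 8 bit approximator is monotone in the key, only the inside of each bucket still needs
sorting afterwards - exactly the property the quicksort step relies on.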
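
Next, a hedged SSE2 sketch of the key-conversion chain named in the intrinsics list above
(movdqa -> cvtdq2ps -> cast -> psrld). It is not the 64 bytes / iteration loop from the notes;
it only shows how four approximators can be produced per 128 bit load and consumed by the
occurrence-counting phase. It assumes a 16 byte aligned input, a key count that is a multiple
of four, and keys below 2^31 so the signed cvtdq2ps conversion stays non-negative (the
msb_early/msb_late modes from the notes would handle the rest).

#include <emmintrin.h>
#include <cstdint>

// Four 8 bit approximators per 16 byte load.
// keys must be 16 byte aligned here (movdqa), see _mm_load_si128 above.
static inline __m128i float8_keys_x4(const uint32_t* keys) {
    __m128i k  = _mm_load_si128(reinterpret_cast<const __m128i*>(keys));
    __m128  f  = _mm_cvtepi32_ps(k);   // signed conversion: assumes keys < 2^31
    __m128i fb = _mm_castps_si128(f);  // reuse the float bit patterns, no instruction emitted
    return _mm_srli_epi32(fb, 24);     // keep only the top 8 bits of each lane
}

// Occurrence-counting phase using the helper above.
// n must be a multiple of 4; count must have 256 entries.
void count_occurrences_sse2(const uint32_t* keys, int n, int* count) {
    alignas(16) uint32_t approx[4];
    for (int i = 0; i < n; i += 4) {
        _mm_store_si128(reinterpret_cast<__m128i*>(approx), float8_keys_x4(keys + i));
        count[approx[0]]++;
        count[approx[1]]++;
        count[approx[2]]++;
        count[approx[3]]++;
    }
}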
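
Finally, a small sketch of the [key,index] packing for the string sorting idea in the notes.
The struct and function names are placeholders and std::sort again plays the role of the real
sorter; the point is only the comparator shape: the big-endian packed 4 character prefix is
compared first, and strcmp is used only when the prefixes tie.

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

struct KeyIndex {
    uint32_t key;   // first 4 characters, zero-padded, packed big-endian
    uint32_t index; // offset of the "real" string in the input array
};

// Pack the first 4 characters so that integer comparison of the key
// matches lexicographic comparison of the 4 character prefix.
static uint32_t prefix_key(const std::string& s) {
    uint32_t k = 0;
    for (size_t i = 0; i < 4; ++i) {
        unsigned char c = (i < s.size()) ? (unsigned char)s[i] : 0; // extra zeroes
        k = (k << 8) | c;
    }
    return k;
}

void sort_strings_by_prefix_key(std::vector<std::string>& strings) {
    std::vector<KeyIndex> items(strings.size());
    for (uint32_t i = 0; i < strings.size(); ++i)
        items[i] = KeyIndex{ prefix_key(strings[i]), i };

    // Cheap prefix compare first; full strcmp only when the prefixes are equal.
    std::sort(items.begin(), items.end(), [&](const KeyIndex& a, const KeyIndex& b) {
        if (a.key != b.key) return a.key < b.key;
        return std::strcmp(strings[a.index].c_str(), strings[b.index].c_str()) < 0;
    });

    // Rebuild the array in sorted order via the "short pointers".
    std::vector<std::string> sorted(strings.size());
    for (size_t i = 0; i < items.size(); ++i) sorted[i] = strings[items[i].index];
    strings.swap(sorted);
}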