added thiersort idea, measure magyar_bucket 1&2
This commit is contained in:
parent
50b1997d5c
commit
22d6631e24
73
gptsort.h
73
gptsort.h
@ -1,6 +1,7 @@
|
|||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include "magyarsort.h"
|
||||||
|
|
||||||
// ChatGPT and me did this space partitioning bucket sort
|
// ChatGPT and me did this space partitioning bucket sort
|
||||||
void gpt_bucket_sort(uint32_t* array, int n) {
|
void gpt_bucket_sort(uint32_t* array, int n) {
|
||||||
@ -39,13 +40,14 @@ void gpt_bucket_sort(uint32_t* array, int n) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Further optimizations (no chatGPT)
|
// Further optimizations (no chatGPT)
|
||||||
void my_bucket_sort(uint32_t* array, int n) {
|
void magyar_bucket_sort(uint32_t* array, int n) {
|
||||||
// Calculate the number of buckets to use
|
// Calculate the number of buckets to use
|
||||||
int num_buckets = std::sqrt(n);
|
int num_buckets = std::sqrt(n);
|
||||||
|
|
||||||
// Create a vector of buckets
|
// Create a vector of buckets
|
||||||
std::vector<std::vector<uint32_t>> buckets(num_buckets);
|
std::vector<std::vector<uint32_t>> buckets(num_buckets);
|
||||||
|
|
||||||
|
// O(n)
|
||||||
// Calculate the range of values that each bucket can hold
|
// Calculate the range of values that each bucket can hold
|
||||||
auto mm = std::minmax_element(array, array + n);
|
auto mm = std::minmax_element(array, array + n);
|
||||||
uint32_t min_value = *mm.first;
|
uint32_t min_value = *mm.first;
|
||||||
@ -54,23 +56,80 @@ void my_bucket_sort(uint32_t* array, int n) {
|
|||||||
uint32_t bucket_size = range / num_buckets + 1;
|
uint32_t bucket_size = range / num_buckets + 1;
|
||||||
|
|
||||||
// Distribute the elements of the array into the buckets
|
// Distribute the elements of the array into the buckets
|
||||||
for (int i = 0; i < n; i++) {
|
for (int i = 0; i < n; ++i) {
|
||||||
// Calculate the bucket index for this element
|
// Calculate the bucket index for this element
|
||||||
// using the range of values and the bucket size as the divisor
|
// using the range of values and the bucket size as the divisor
|
||||||
int bucket_index = (array[i] - min_value) / bucket_size;
|
int bucket_index = (array[i] - min_value) / bucket_size;
|
||||||
buckets[bucket_index].push_back(array[i]);
|
buckets[bucket_index].push_back(array[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// sqrt(n) * (sqrt(n)*log(sqrt(n))) = n*log(sqrt(n)) for std::sort and linear for magyarsort but less mem use!
|
||||||
// Sort the elements in each bucket using std::sort
|
// Sort the elements in each bucket using std::sort
|
||||||
for (int i = 0; i < num_buckets; i++) {
|
for (int i = 0; i < num_buckets; ++i) {
|
||||||
std::sort(buckets[i].begin(), buckets[i].end());
|
if(buckets[i].size() >= 96) { // what to choose here is pretty random
|
||||||
|
MagyarSort::sort<uint32_t>(&(buckets[i][0]), buckets[i].size());
|
||||||
|
} else {
|
||||||
|
std::sort(buckets[i].begin(), buckets[i].end());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Concatenate the buckets to get the sorted array
|
// Concatenate the buckets to get the sorted array
|
||||||
int k = 0;
|
int k = 0;
|
||||||
for (int i = 0; i < num_buckets; i++) {
|
for (int i = 0; i < num_buckets; ++i) {
|
||||||
for (int j = 0; j < buckets[i].size(); j++) {
|
for (int j = 0; j < buckets[i].size(); ++j) {
|
||||||
array[k++] = buckets[i][j];
|
array[++k] = buckets[i][j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Simplify magyarbucket */
|
||||||
|
void magyar_bucket_sort2(uint32_t* array, int n) {
|
||||||
|
// ensure bucket size as POT
|
||||||
|
int bucketSize = 65536;
|
||||||
|
|
||||||
|
// O(n)
|
||||||
|
// Calculate the range of values that each bucket can hold
|
||||||
|
auto mm = std::minmax_element(array, array + n);
|
||||||
|
uint32_t min = *mm.first;
|
||||||
|
uint32_t max = *mm.second;
|
||||||
|
uint32_t range = max - min + 1;
|
||||||
|
|
||||||
|
// Calculate number of buckets from size
|
||||||
|
// bucketSize = (range / numBuckets) + 1
|
||||||
|
// so:
|
||||||
|
// bucketSize + 1 = range / numBuckets
|
||||||
|
// numBuckets * (bucketSize + 1) = range
|
||||||
|
// so:
|
||||||
|
// numBuckets = range / (bucketSize + 1)
|
||||||
|
uint32_t numBuckets = range / bucketSize + 1;
|
||||||
|
|
||||||
|
// Create a vector of buckets
|
||||||
|
std::vector<std::vector<uint32_t>> buckets(numBuckets);
|
||||||
|
|
||||||
|
|
||||||
|
// Distribute the elements of the array into the buckets
|
||||||
|
for (int i = 0; i < n; ++i) {
|
||||||
|
// Calculate the bucket index for this element
|
||||||
|
// using the range of values and the bucket size as the divisor
|
||||||
|
int bucket_index = (array[i] - min) / bucketSize; // bitshift likely
|
||||||
|
buckets[bucket_index].push_back(array[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// sqrt(n) * (sqrt(n)*log(sqrt(n))) = n*log(sqrt(n)) for std::sort and linear for magyarsort but less mem use!
|
||||||
|
// Sort the elements in each bucket using std::sort
|
||||||
|
for (int i = 0; i < numBuckets; ++i) {
|
||||||
|
if(buckets[i].size() >= 96) { // what to choose here is pretty random
|
||||||
|
MagyarSort::sort<uint32_t>(&(buckets[i][0]), buckets[i].size());
|
||||||
|
} else {
|
||||||
|
std::sort(buckets[i].begin(), buckets[i].end());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Concatenate the buckets to get the sorted array
|
||||||
|
int k = 0;
|
||||||
|
for (int i = 0; i < numBuckets; ++i) {
|
||||||
|
for (int j = 0; j < buckets[i].size(); ++j) {
|
||||||
|
array[++k] = buckets[i][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
236
thiersort.h
Normal file
236
thiersort.h
Normal file
@ -0,0 +1,236 @@
|
|||||||
|
#ifndef THIER_SORT_H
|
||||||
|
#define THIER_SORT_H
|
||||||
|
/*
|
||||||
|
* This sort alg. is a two step bucket sort algorithm.
|
||||||
|
* - Step 1 creates top-down pre-sorted buckets with a float radix (allocates)
|
||||||
|
* - Step 2 uses M quicksort algorithms with source-to-destination backwrite.
|
||||||
|
*
|
||||||
|
* What is "float radix" sort?
|
||||||
|
* - We convert each integer key to a 32 bit float value
|
||||||
|
* - We take that and create 8 bit ((*): or 16 bit) approxmator
|
||||||
|
* - This is what we radix sort on as if it would be 8 bit char (**)
|
||||||
|
* - In both cases, those are just right-shifted floats (LSB bits lost)
|
||||||
|
*
|
||||||
|
* For calculating occurence counts, we need a pass on the whole data set.
|
||||||
|
* Then we need an other pass - into a new array - to do radix step, or
|
||||||
|
* if we do 16 bit float variant we need two steps (new + old array).
|
||||||
|
*
|
||||||
|
* For random data, likely the best is to do less passes, so we do the
|
||||||
|
* 8 bit version as first implementation, this gives more data in the
|
||||||
|
* buckets for the "Step 2" quicksort part - which is not std::sort or
|
||||||
|
* something, but we write a variant that puts back data in original
|
||||||
|
* array so no back copy will be necessary. For the (*) variant, with
|
||||||
|
* 16 bit floats, we do not need back-copy and can do quicksort in place!
|
||||||
|
*
|
||||||
|
* To avoid costly float conversions, we can do SSE2 here (or neon)!
|
||||||
|
*
|
||||||
|
* These are useful for us:
|
||||||
|
*
|
||||||
|
* __m128i _mm_load_si128 (__m128i const* mem_addr)
|
||||||
|
* #include <emmintrin.h>
|
||||||
|
* Instruction: movdqa xmm, m128 [Latency: 6, Throughput: 0.33 - 0.5]
|
||||||
|
* CPUID Flags: SSE2
|
||||||
|
* Description
|
||||||
|
* Load 128-bits of integer data from memory into dst. mem_addr must be
|
||||||
|
* aligned on a 16-byte boundary or a general-protection exception may be
|
||||||
|
* generated.
|
||||||
|
*
|
||||||
|
* __m128i _mm_add_epi8 (__m128i a, __m128i b)
|
||||||
|
* #include <emmintrin.h>
|
||||||
|
* Instruction: paddb xmm, xmm [Latency: 1, Throughput: 0.33]
|
||||||
|
* CPUID Flags: SSE2
|
||||||
|
* Description
|
||||||
|
* Add packed 8-bit integers in a and b, and store the results in dst.
|
||||||
|
*
|
||||||
|
* __m128i _mm_and_si128 (__m128i a, __m128i b)
|
||||||
|
* #include <emmintrin.h>
|
||||||
|
* Instruction: pand xmm, xmm [Latency: 1, Throughput: 0.33]
|
||||||
|
* CPUID Flags: SSE2
|
||||||
|
* Description
|
||||||
|
* Compute the bitwise AND of 128 bits (representing integer data)
|
||||||
|
* in a and b, and store the result in dst.
|
||||||
|
*
|
||||||
|
* __m128i _mm_set_epi8 (
|
||||||
|
* char e15, char e14, char e13, char e12,
|
||||||
|
* char e11, char e10, char e9, char e8,
|
||||||
|
* char e7, char e6, char e5, char e4,
|
||||||
|
* char e3, char e2, char e1, char e0)
|
||||||
|
* #include <emmintrin.h>
|
||||||
|
* Instruction: Sequence [XXX: slow - best outside loop]
|
||||||
|
* CPUID Flags: SSE2
|
||||||
|
* Description
|
||||||
|
* Set packed 8-bit integers in dst with the supplied values.
|
||||||
|
*
|
||||||
|
* __m128 _mm_cvtepi32_ps (__m128i a)
|
||||||
|
* #include <emmintrin.h>
|
||||||
|
* Instruction: cvtdq2ps xmm, xmm [Latency: 4, Throughput: 0.5]
|
||||||
|
* CPUID Flags: SSE2
|
||||||
|
* Description
|
||||||
|
* Convert packed signed 32-bit integers in a to packed single-precision
|
||||||
|
* (32-bit) floating-point elements, and store the results in dst.
|
||||||
|
*
|
||||||
|
* __m128i _mm_castps_si128 (__m128 a)
|
||||||
|
* #include <emmintrin.h>
|
||||||
|
* CPUID Flags: SSE2
|
||||||
|
* Description
|
||||||
|
* Cast vector of type __m128 to type __m128i. This intrinsic is only
|
||||||
|
* used for compilation and does not generate any instructions, thus it
|
||||||
|
* has zero latency.
|
||||||
|
*
|
||||||
|
* void _mm_store_si128 (__m128i* mem_addr, __m128i a)
|
||||||
|
* #include <emmintrin.h>
|
||||||
|
* Instruction: movdqa m128, xmm [Latency: 1 - 5, Throughput: 0.5 - 1]
|
||||||
|
* CPUID Flags: SSE2
|
||||||
|
* Description
|
||||||
|
* Store 128-bits of integer data from a into memory. mem_addr must be
|
||||||
|
* aligned on a 16-byte boundary or a general-protection exception may
|
||||||
|
* be generated.
|
||||||
|
*
|
||||||
|
* void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_addr)
|
||||||
|
* #include <emmintrin.h>
|
||||||
|
* Instruction: maskmovdqu xmm, xmm [Latency: 6, Throughput: 1 - XXX: NT!]
|
||||||
|
* CPUID Flags: SSE2
|
||||||
|
* Description
|
||||||
|
* Conditionally store 8-bit integer elements from a into memory using
|
||||||
|
* mask (elements are not stored when the highest bit is not set in the
|
||||||
|
* corresponding element) and a non-temporal memory hint. mem_addr does
|
||||||
|
* not need to be aligned on any particular boundary.
|
||||||
|
*
|
||||||
|
* int _mm_extract_epi16 (__m128i a, int imm8)
|
||||||
|
* #include <emmintrin.h>
|
||||||
|
* Instruction: pextrw r32, xmm, imm8 [Latency: 4, Throughput: 1]
|
||||||
|
* CPUID Flags: SSE2
|
||||||
|
* Description
|
||||||
|
* Extract a 16-bit integer from a, selected with imm8,
|
||||||
|
* and store the result in the lower element of dst.
|
||||||
|
* Operation
|
||||||
|
* dst[15:0] := (a[127:0] >> (imm8[2:0] * 16))[15:0]
|
||||||
|
* dst[31:16] := 0
|
||||||
|
*
|
||||||
|
* void _mm_store_si128 (__m128i* mem_addr, __m128i a)
|
||||||
|
* #include <emmintrin.h>
|
||||||
|
* Instruction: movdqa m128, xmm [Latency: 1 - 5, Throughput: 0.5 - 1]
|
||||||
|
* CPUID Flags: SSE2
|
||||||
|
* Description
|
||||||
|
* Store 128-bits of integer data from a into memory. mem_addr must be
|
||||||
|
* aligned on a 16-byte boundary or a general-protection exception
|
||||||
|
* may be generated.
|
||||||
|
*
|
||||||
|
* __m128i _mm_srli_epi32 (__m128i a, int imm8)
|
||||||
|
* #include <emmintrin.h>
|
||||||
|
* Instruction: psrld xmm, imm8 [Latency: 1, Throughput: 0.5]
|
||||||
|
* CPUID Flags: SSE2
|
||||||
|
* Description
|
||||||
|
* Shift packed 32-bit integers in a right by imm8 while shifting in
|
||||||
|
* zeros, and store the results in dst.
|
||||||
|
*
|
||||||
|
* See also:
|
||||||
|
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
|
||||||
|
* https://www.laruence.com/x86/MASKMOVDQU.html
|
||||||
|
* https://blogs.fau.de/hager/archives/2103
|
||||||
|
*
|
||||||
|
* For ARM / Neon:
|
||||||
|
* https://github.com/DLTcollab/sse2neon
|
||||||
|
*
|
||||||
|
* Data layout:
|
||||||
|
*
|
||||||
|
* [int32_t key][int32_t i][int32_t key][int32_t i]...
|
||||||
|
*
|
||||||
|
* That is, we extract keys and "pointer" like offsets into an
|
||||||
|
* array of the "real" elements. So you sort "short pointers".
|
||||||
|
*
|
||||||
|
* We might create a version that just sorts bare integers,
|
||||||
|
* but that is only good to show off speed compared to algs.
|
||||||
|
*
|
||||||
|
* Main alg:
|
||||||
|
* - We first process data to have the 16 byte alignment (maybe 64 byte align?)
|
||||||
|
* - We process the input in 64 byte / loop with 3x SSE 128 bit read + 1xnormal
|
||||||
|
* - We process remaining part of the array
|
||||||
|
*
|
||||||
|
* Middle part should be skipped if there is less than 64 bytes this way...
|
||||||
|
*
|
||||||
|
* (pre-loop)
|
||||||
|
* pat1 = set(1,1,1,1,0,0,0,0,1,1,1,1...)
|
||||||
|
* pat2 = set(0,0,0,0,1,1,1,1,0,0,0,0...)
|
||||||
|
*
|
||||||
|
* (loop)
|
||||||
|
* a = LOAD4int[+0]
|
||||||
|
* b = LOAD4int[+4]
|
||||||
|
* c = LOAD4int[+8]
|
||||||
|
* <+d some int code>
|
||||||
|
*
|
||||||
|
* pata1 = a & pat1;
|
||||||
|
* pata2 = a & pat2;
|
||||||
|
* patb1 = b & pat1;
|
||||||
|
* patb2 = b & pat2;
|
||||||
|
* patc1 = c & pat1;
|
||||||
|
* patc2 = c & pat2;
|
||||||
|
* <+d some int code>
|
||||||
|
*
|
||||||
|
* rsa = pata1 >> 24;
|
||||||
|
* rsb = patb1 >> 24;
|
||||||
|
* rsc = patc1 >> 24;
|
||||||
|
* <+d some int code>
|
||||||
|
*
|
||||||
|
* resa = rsa + pata2;
|
||||||
|
* resb = rsb + patb2;
|
||||||
|
* resc = rsc + patc2;
|
||||||
|
* <+d some int code>
|
||||||
|
*
|
||||||
|
* store(resa, resb, resc) -> tmp
|
||||||
|
*
|
||||||
|
* <integer code to process(tmp)>
|
||||||
|
*
|
||||||
|
* The tmp should be automatically 16 bit aligned being on-stack allocated!
|
||||||
|
*
|
||||||
|
* The integer code that process tmp, depends on what phase we are in, in
|
||||||
|
* the phase where we count occurences, we do not need the pat[x]2 parts!
|
||||||
|
* However the reorganize phase passes need it to do the copy around!
|
||||||
|
*
|
||||||
|
* This is how we do 64 byte / iteration and should be pipelined well!
|
||||||
|
*
|
||||||
|
* However the "integer" code to process tmp is the bottleneck, because
|
||||||
|
* that part has to process 16 counts when occurence counting likely
|
||||||
|
* with a 4-wide pipeline in (at least) 4 steps and more in the other
|
||||||
|
* phase where we are not counting but moving elements where they belong.
|
||||||
|
* XXX: Now that I think more... pata2 likes are not needed as only make
|
||||||
|
* issues? I mean we could (should) copy key/index directly from source!
|
||||||
|
*
|
||||||
|
* In the 8 bit float version, I guess we only have bits from exponent..
|
||||||
|
* In the 16 bit version, with an extra pass, we have few bits from mantissa..
|
||||||
|
*
|
||||||
|
* I still think, in the random case it worths the less passes by 8 bit float!
|
||||||
|
*
|
||||||
|
* String sorting:
|
||||||
|
* - First create the [key,index] 64 bit data for strings to sort.
|
||||||
|
* - Key should be first 4 character (extra zeroes if there is no enough char)
|
||||||
|
* - Do the sorting, with comparator using both the first 4 char AND the strcmp
|
||||||
|
*
|
||||||
|
* ^^I actually think this gives a pretty fast string comparison because it do
|
||||||
|
* give a lot of higher integer values even for short strings (8 bit works)!
|
||||||
|
*
|
||||||
|
* XXX: We can do approcimately 2x (1,5x?) the speed for integer-only sort!
|
||||||
|
*
|
||||||
|
* Unsigned/signed:
|
||||||
|
* - Custom trickery is needed when processing tmp (both occurence and move)
|
||||||
|
* - It should assign different bucket based on topmost bit for sign modes!
|
||||||
|
* - Modes: msb_early, msb_late
|
||||||
|
* - These two handles: integer, unsigned int, float32
|
||||||
|
*
|
||||||
|
* XXX: So basically a good sorting algo, for float keyes sorting!
|
||||||
|
*
|
||||||
|
* Number of passes (8 bit): 3.5
|
||||||
|
* - 1x Occurence counting pass
|
||||||
|
* - 1x 8 bit float radix pass
|
||||||
|
* - 1.5x quicksort pass [more elements]
|
||||||
|
*
|
||||||
|
* Number of passes (16 bit): 4.2
|
||||||
|
* - 1x Occurence counting pass
|
||||||
|
* - 2x 8 bit float radix pass
|
||||||
|
* - 1.2x quicksort pass [less elements]
|
||||||
|
*
|
||||||
|
* Rem.: Basically these are regular, pipelined radix passes, with only
|
||||||
|
* the float conversions AND shifts vectorized with SSE somewhat...
|
||||||
|
*/
|
||||||
|
|
||||||
|
#endif /* THIER_SORT_H */
|
||||||
8
ypsu.cpp
8
ypsu.cpp
@ -415,8 +415,8 @@ void measure_single(int n) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int main(void) {
|
int main(void) {
|
||||||
//int n = 100000000;
|
int n = 100000000;
|
||||||
int n = 10000000;
|
//int n = 10000000;
|
||||||
//int n = 100;
|
//int n = 100;
|
||||||
|
|
||||||
// Uncomment this for profiling and alg!
|
// Uncomment this for profiling and alg!
|
||||||
@ -467,7 +467,9 @@ int main(void) {
|
|||||||
w = v;*/
|
w = v;*/
|
||||||
measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); });
|
measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); });
|
||||||
assert(w == expected);
|
assert(w == expected);
|
||||||
measure(inputtype, "mybuck", [&] { my_bucket_sort(&w[0], w.size()); });
|
measure(inputtype, "magbuck", [&] { magyar_bucket_sort(&w[0], w.size()); });
|
||||||
|
assert(w == expected);
|
||||||
|
measure(inputtype, "magbuck2", [&] { magyar_bucket_sort2(&w[0], w.size()); });
|
||||||
assert(w == expected);
|
assert(w == expected);
|
||||||
/*
|
/*
|
||||||
w = v;
|
w = v;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user