Implemented ILP and cache optimized simple radix variant - surprisingly good already!
This commit is contained in:
parent
4199393153
commit
68684f7fb0
106
magyarsort.h
106
magyarsort.h
@ -13,7 +13,12 @@
|
|||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <cstring> // memset
|
#include <cstring> // memset
|
||||||
|
|
||||||
|
// TODO: Only for the regular radix I guess
|
||||||
|
#include <vector>
|
||||||
|
#include <algorithm> // std::swap
|
||||||
|
|
||||||
namespace MagyarSort {
|
namespace MagyarSort {
|
||||||
|
/* CONFIG */
|
||||||
|
|
||||||
// Only change these if you know what you are doing
|
// Only change these if you know what you are doing
|
||||||
// I use these because I want to see if nibbles are
|
// I use these because I want to see if nibbles are
|
||||||
@ -23,9 +28,38 @@ namespace MagyarSort {
|
|||||||
// - DIGIT_RANGE and BITS_PER_DIGIT should correspond
|
// - DIGIT_RANGE and BITS_PER_DIGIT should correspond
|
||||||
// - DIGITS should also correspond with the uint32_t
|
// - DIGITS should also correspond with the uint32_t
|
||||||
// - and DIGIT_RANGE should be 2^n value (16 or 256)
|
// - and DIGIT_RANGE should be 2^n value (16 or 256)
|
||||||
|
#ifdef MAGYAR_SORT_NIBBLE
|
||||||
|
// Per-nibble digits sorting
|
||||||
static constexpr int DIGITS = 8; // "helyiérték"
|
static constexpr int DIGITS = 8; // "helyiérték"
|
||||||
static constexpr int BITS_PER_DIGIT = 4; // "bit / helyiérték"
|
static constexpr int BITS_PER_DIGIT = 4; // "bit / helyiérték"
|
||||||
static constexpr int DIGIT_RANGE = 16; // "helyiérték állapottér"
|
static constexpr int DIGIT_RANGE = 16; // "helyiérték állapottér"
|
||||||
|
#else
|
||||||
|
// Per-byte digits sorting
|
||||||
|
static constexpr int DIGITS = 4; // "helyiérték"
|
||||||
|
static constexpr int BITS_PER_DIGIT = 8; // "bit / helyiérték"
|
||||||
|
static constexpr int DIGIT_RANGE = 256; // "helyiérték állapottér"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* DEBUG */
|
||||||
|
|
||||||
|
/**
 * Debug helper: prints the first `size` elements of `arr` to stdout as
 * lowercase hex values, each followed by ", ", and then a newline.
 *
 * Declared inline because it is defined in a header; without it every
 * translation unit including the header would emit its own definition.
 *
 * @param arr  elements to print (not modified)
 * @param size number of elements to print; 0 prints only the newline
 */
inline void debugArr(uint32_t *arr, size_t size) {
    // size_t index matches the type of `size` - no signed/unsigned comparison
    for(size_t i = 0; i < size; ++i) {
        // "%x" expects an unsigned int, which uint32_t is not guaranteed to be
        printf("%x, ", static_cast<unsigned int>(arr[i]));
    }
    printf("\n");
}
|
||||||
|
|
||||||
|
void debugRadics(size_t *radics) {
|
||||||
|
for(size_t j = 0; j < DIGITS; ++j) {
|
||||||
|
printf("d%d: ", j);
|
||||||
|
for(size_t i = 0; i < DIGIT_RANGE; ++i) {
|
||||||
|
printf("%d,", radics[i + DIGIT_RANGE*j]);
|
||||||
|
}
|
||||||
|
printf("\n\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* HELPERS */
|
||||||
|
|
||||||
template<int DIGIT_CHOICE>
|
template<int DIGIT_CHOICE>
|
||||||
static inline uint32_t getDigit(uint32_t num) noexcept {
|
static inline uint32_t getDigit(uint32_t num) noexcept {
|
||||||
@ -73,6 +107,13 @@ namespace MagyarSort {
|
|||||||
inline PrefixMagic(size_t *radics, size_t *prev, int i) noexcept {}
|
inline PrefixMagic(size_t *radics, size_t *prev, int i) noexcept {}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/** Gets REFERENCE to the given digit from the radix-array that has more than one digits */
|
||||||
|
template<int DIGIT>
|
||||||
|
static inline size_t &rGet(size_t *radics, size_t i) noexcept {
|
||||||
|
static constexpr int DSTART = (DIGIT * DIGIT_RANGE);
|
||||||
|
return radics[DSTART + i];
|
||||||
|
}
|
||||||
|
|
||||||
static inline void calcPrefixSums(size_t *radics) noexcept {
|
static inline void calcPrefixSums(size_t *radics) noexcept {
|
||||||
static thread_local size_t prev[DIGITS];
|
static thread_local size_t prev[DIGITS];
|
||||||
memset(prev, 0, sizeof(prev));
|
memset(prev, 0, sizeof(prev));
|
||||||
@ -83,15 +124,40 @@ namespace MagyarSort {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void debugIt(size_t *radics) {
|
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
|
||||||
for(size_t j = 0; j < DIGITS; ++j) {
|
template<int DIGIT>
|
||||||
printf("d%d: ", j);
|
struct RadixMagic : public RadixMagic<DIGIT - 1> {
|
||||||
for(size_t i = 0; i < DIGIT_RANGE; ++i) {
|
inline RadixMagic(size_t *radics, uint32_t *&from, uint32_t *&to, size_t size) noexcept // BEWARE: "*&" needed to swap pointers..
|
||||||
printf("%d,", radics[i + DIGIT_RANGE*j]);
|
: RadixMagic<DIGIT - 1>(radics, from, to, size) {
|
||||||
}
|
// DEBUG
|
||||||
printf("\n\n");
|
//printf("%d before: ", DIGIT);
|
||||||
|
//debugArr(from, size);
|
||||||
|
|
||||||
|
for(size_t i = size; i > 0; --i) { // right-to-left to ensure already sorted digits order we keep for iterations
|
||||||
|
// Get num and its new offset / location
|
||||||
|
auto num = from[i - 1];
|
||||||
|
auto digVal = getDigit<DIGIT>(num);
|
||||||
|
auto offset = (--rGet<DIGIT>(radics, digVal));
|
||||||
|
|
||||||
|
// Add to the proper target location
|
||||||
|
to[offset] = num;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// DEBUG
|
||||||
|
//printf("%d after: ", DIGIT);
|
||||||
|
//debugArr(to, size);
|
||||||
|
|
||||||
|
// Only swaps pointers :-)
|
||||||
|
std::swap(from, to);
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
/** Ends template recursion */
|
||||||
|
template<>
|
||||||
|
struct RadixMagic<-1> {
|
||||||
|
inline RadixMagic(size_t *radics, uint32_t *&from, uint32_t *&to, size_t size) noexcept { }
|
||||||
|
};
|
||||||
|
|
||||||
|
/* SORT */
|
||||||
|
|
||||||
/** Sort the given array (in-place sorting) with the given size */
|
/** Sort the given array (in-place sorting) with the given size */
|
||||||
inline void sort(uint32_t arr[], size_t size) noexcept {
|
inline void sort(uint32_t arr[], size_t size) noexcept {
|
||||||
@ -103,12 +169,34 @@ namespace MagyarSort {
|
|||||||
// Calculate occurences of digits
|
// Calculate occurences of digits
|
||||||
countOccurences(arr, size, radics);
|
countOccurences(arr, size, radics);
|
||||||
|
|
||||||
debugIt(radics);
|
//debugRadics(radics);
|
||||||
|
|
||||||
// Calculate prefix sums
|
// Calculate prefix sums
|
||||||
calcPrefixSums(radics);
|
calcPrefixSums(radics);
|
||||||
|
|
||||||
debugIt(radics);
|
//debugRadics(radics);
|
||||||
|
|
||||||
|
/* Regular (old) radix sort with small twist */
|
||||||
|
|
||||||
|
// Regular radix sort - I just changed occurrence counting and prefix summing to have more ILP
|
||||||
|
// But because my approach does not use that, I want to keep this version in a branch for a
|
||||||
|
// regular radix sort using better ILP just to see how it is doing if I wrote those "Magic"
|
||||||
|
// above already anyways...
|
||||||
|
|
||||||
|
// Regular radix sort needs a copy, see: https://www.youtube.com/watch?v=ujb2CIWE8zY
|
||||||
|
std::vector<uint32_t> arc(size);
|
||||||
|
|
||||||
|
uint32_t *from = arr;
|
||||||
|
uint32_t *to = &arc[0];
|
||||||
|
|
||||||
|
RadixMagic<DIGITS - 1>(radics, from, to, size);
|
||||||
|
|
||||||
|
// With another API we could spare this copy if we could delete the original arr and return a ptr or something...
|
||||||
|
// I am fine with this... this is not my main idea anyways, just little ILP tweak to regular radix sort
|
||||||
|
//if(to != arr) { // <- logically, but bad they are already swapped here!!! BEWARE
|
||||||
|
if(from != arr) { // <- the last pass already swapped the pointers, so the sorted data is behind "from"
    // memcpy takes a BYTE count: scale the element count by the element size,
    // otherwise only the first size / sizeof(uint32_t) elements are copied back
    memcpy(arr, from, size * sizeof(uint32_t));
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
2
makefile
2
makefile
@ -2,7 +2,7 @@ debug: test.cpp magyarsort.h
|
|||||||
g++ test.cpp -g -std=c++14 -o test.out
|
g++ test.cpp -g -std=c++14 -o test.out
|
||||||
|
|
||||||
release: test.cpp magyarsort.h
|
release: test.cpp magyarsort.h
|
||||||
g++ test.cpp -o -std=c++14 -O2 test.out
|
g++ test.cpp -std=c++14 -O2 -o test.out
|
||||||
|
|
||||||
clean: test.out
|
clean: test.out
|
||||||
rm test.out
|
rm test.out
|
||||||
|
|||||||
119
test.cpp
119
test.cpp
@ -1,15 +1,128 @@
|
|||||||
/* LICENCE: CC3 - look it up, you need to mention me but that is all */
|
/* LICENCE: CC3 - look it up, you need to mention me but that is all */
|
||||||
|
|
||||||
|
/* CONFIG */
|
||||||
|
|
||||||
|
// Uncomment next line to follow Creel: https://www.youtube.com/watch?v=ujb2CIWE8zY
|
||||||
|
// #define CREEL // Ignores SORT_WIDTH (fixed 16-element input) and sets MAGYAR_SORT_NIBBLE!
|
||||||
|
|
||||||
|
// Number of input elements to generate - unused when CREEL is defined!
|
||||||
|
#define SORT_WIDTH 40000
|
||||||
|
// Uncomment this to use nibbles as digits and not bytes - CREEL defines this anyways
|
||||||
|
//#define MAGYAR_SORT_NIBBLE
|
||||||
|
|
||||||
|
// Uncomment if you want to see output before / after sorts (debugging for example)
|
||||||
|
//#define PRINT_OUTPUT
|
||||||
|
|
||||||
|
/* Includes */
|
||||||
|
|
||||||
|
#include <cstring>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <cstdlib> // std::rand | rand
|
||||||
|
#include <vector>
|
||||||
|
#include <chrono>
|
||||||
|
#include <algorithm> // std::sort
|
||||||
#include "magyarsort.h"
|
#include "magyarsort.h"
|
||||||
|
|
||||||
|
/* Input generation and prerequisites */
|
||||||
|
|
||||||
|
#ifdef CREEL
|
||||||
|
#define MAGYAR_SORT_NIBBLE
|
||||||
|
#define PRINT_OUTPUT
|
||||||
|
/**
 * Returns the fixed 16-element input used in CREEL mode.
 *
 * Homage to https://www.youtube.com/watch?v=ujb2CIWE8zY - every value is a
 * three-nibble hex number, so with nibble digits the values stay readable
 * in hex through every step and in the debugger.
 */
static inline std::vector<uint32_t> GenerateInput() {
    static constexpr uint32_t CreelHex[16] = {
        0x277, 0x806, 0x681, 0x462,
        0x787, 0x163, 0x284, 0x166,
        0x905, 0x518, 0x263, 0x395,
        0x988, 0x307, 0x779, 0x721
    };

    // Build the vector straight from the table
    return std::vector<uint32_t>(std::begin(CreelHex), std::end(CreelHex));
}
|
||||||
|
#else
|
||||||
|
// Randomized values, no overrides
|
||||||
|
static inline std::vector<uint32_t> GenerateInput() {
|
||||||
|
std::vector<uint32_t> ret;
|
||||||
|
ret.resize(SORT_WIDTH);
|
||||||
|
|
||||||
|
for(size_t ek = 0; ek < SORT_WIDTH; ++ek) {
|
||||||
|
ret[ek] = (uint32_t)std::rand();
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Test entry point */
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
uint32_t smallArr[16] = { 0xFF, 0xFFFFFFFF, 0xAA000000, 10, 20, 200, 1234513, 1, 0, 65535, 1024*1024, 1026*16, 7, 8, 1, 0};
|
/* Input */
|
||||||
|
std::vector<uint32_t> in1 = GenerateInput();;
|
||||||
|
std::vector<uint32_t> in2 = in1; // copy
|
||||||
|
|
||||||
MagyarSort::sort(smallArr, 16);
|
uint32_t *arr1 = &(in1[0]);
|
||||||
|
|
||||||
// TODO: check, etc.
|
#ifdef PRINT_OUTPUT
|
||||||
|
printf("Inp: ");
|
||||||
|
MagyarSort::debugArr(arr1, in1.size());
|
||||||
|
#endif // PRINT_OUTPUT
|
||||||
|
|
||||||
|
/* Our sort */
|
||||||
|
auto ourBegin = std::chrono::high_resolution_clock::now();
|
||||||
|
MagyarSort::sort(arr1, in1.size());
|
||||||
|
auto ourEnd = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
#ifdef PRINT_OUTPUT
|
||||||
|
printf("Our: ");
|
||||||
|
MagyarSort::debugArr(arr1, in1.size());
|
||||||
|
#endif // PRINT_OUTPUT
|
||||||
|
|
||||||
|
/* std::sort */
|
||||||
|
auto stdBegin = std::chrono::high_resolution_clock::now();
|
||||||
|
std::sort(std::begin(in2), std::end(in2));
|
||||||
|
auto stdEnd = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
#ifdef PRINT_OUTPUT
|
||||||
|
printf("std: ");
|
||||||
|
MagyarSort::debugArr(&in2[0], in2.size());
|
||||||
|
#endif // PRINT_OUTPUT
|
||||||
|
|
||||||
|
/* Check against std - the real test */
|
||||||
|
|
||||||
|
bool good = true;
|
||||||
|
for(size_t i = 0; good && (i < in1.size()); ++i) {
|
||||||
|
good &= (in1[i] == in2[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("Results:\n\n");
|
||||||
|
printf("- Sorted %zu elements", in1.size());
|
||||||
|
if(good) printf("- Same result as std::sort!\n");
|
||||||
|
else printf("- Differs from std::sort! Error!\n");
|
||||||
|
printf("\n");
|
||||||
|
auto stdElapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(stdEnd - stdBegin);
|
||||||
|
auto ourElapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(ourEnd - ourBegin);
|
||||||
|
printf("Time (std sort): %.3f ms.\n", stdElapsed.count() * 1e-6);
|
||||||
|
printf("Time (our sort): %.3f ms.\n", ourElapsed.count() * 1e-6);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user