Implemented ILP and cache optimized simple radix variant - surprisingly good already!

This commit is contained in:
Richard Thier 2021-03-13 15:51:24 +01:00
parent 4199393153
commit 68684f7fb0
3 changed files with 214 additions and 13 deletions

View File

@ -13,7 +13,12 @@
#include <cstdint>
#include <cstring> // memset
// TODO: Only for the regular radix I guess
#include <vector>
#include <algorithm> // std::swap
namespace MagyarSort {
/* CONFIG */
// Only change these if you know what you are doing
// I use these because I want to see if nibbles are
@ -23,9 +28,38 @@ namespace MagyarSort {
// - DIGIT_RANGE and BITS_PER_DIGIT should correspond
// - DIGITS should also correspond with the uint32_t
// - and DIGIT_RANGE should be 2^n value (16 or 256)
#ifdef MAGYAR_SORT_NIBBLE
// Per-nibble digits sorting
static constexpr int BITS_PER_DIGIT = 4; // bits per digit
#else
// Per-byte digits sorting
static constexpr int BITS_PER_DIGIT = 8; // bits per digit
#endif
// The remaining values are derived from BITS_PER_DIGIT so they can never
// get out of sync with it (they used to be maintained by hand).
static constexpr int DIGITS = 32 / BITS_PER_DIGIT;      // number of digits in a uint32_t key
static constexpr int DIGIT_RANGE = 1 << BITS_PER_DIGIT; // number of distinct values of one digit
static_assert(DIGITS * BITS_PER_DIGIT == 32, "BITS_PER_DIGIT must evenly divide the 32 bit key width");
/* DEBUG */
/**
 * Debug helper: prints all elements of the array in hexadecimal, separated
 * by ", ", followed by a newline.
 *
 * Marked inline so that including this header from several translation
 * units does not produce multiple definitions.
 *
 * @param arr  Pointer to the first element (only read).
 * @param size Number of elements to print.
 */
inline void debugArr(uint32_t *arr, size_t size) {
    // size_t index: avoids the signed/unsigned comparison against size
    for(size_t i = 0; i < size; ++i) {
        // Cast so the argument type matches %x even where uint32_t is not unsigned int
        printf("%x, ", static_cast<unsigned int>(arr[i]));
    }
    printf("\n");
}
/**
 * Debug helper: prints the radix table digit by digit.
 *
 * For every digit j in [0, DIGITS) one line "d<j>: " is printed, followed by
 * the DIGIT_RANGE values stored at radics[DIGIT_RANGE*j .. DIGIT_RANGE*(j+1)).
 *
 * Marked inline so that including this header from several translation
 * units does not produce multiple definitions.
 *
 * @param radics Table of DIGITS * DIGIT_RANGE values (only read).
 */
inline void debugRadics(size_t *radics) {
    for(size_t j = 0; j < DIGITS; ++j) {
        // %zu is the matching conversion for size_t (%d is undefined behaviour here)
        printf("d%zu: ", j);
        for(size_t i = 0; i < DIGIT_RANGE; ++i) {
            printf("%zu,", radics[i + DIGIT_RANGE*j]);
        }
        printf("\n\n");
    }
}
/* HELPERS */
template<int DIGIT_CHOICE>
static inline uint32_t getDigit(uint32_t num) noexcept {
@ -73,6 +107,13 @@ namespace MagyarSort {
inline PrefixMagic(size_t *radics, size_t *prev, int i) noexcept {}
};
/**
 * Returns a REFERENCE to entry `i` inside the part of the radix table that
 * belongs to the compile-time chosen digit.
 *
 * The table stores DIGIT_RANGE consecutive entries per digit, so the entries
 * of digit DIGIT start at index DIGIT * DIGIT_RANGE.
 *
 * @tparam DIGIT  Which digit's entries to address.
 * @param  radics The radix table holding the entries of all digits.
 * @param  i      Index within the chosen digit's entries.
 * @return Mutable reference to the addressed entry.
 */
template<int DIGIT>
static inline size_t &rGet(size_t *radics, size_t i) noexcept {
    // Skip over the entries that belong to the digits before this one
    size_t *const digitEntries = radics + (DIGIT * DIGIT_RANGE);
    return digitEntries[i];
}
static inline void calcPrefixSums(size_t *radics) noexcept {
static thread_local size_t prev[DIGITS];
memset(prev, 0, sizeof(prev));
@ -83,15 +124,40 @@ namespace MagyarSort {
}
}
void debugIt(size_t *radics) {
for(size_t j = 0; j < DIGITS; ++j) {
printf("d%d: ", j);
for(size_t i = 0; i < DIGIT_RANGE; ++i) {
printf("%d,", radics[i + DIGIT_RANGE*j]);
/**
 * Recursive functor: constructing RadixMagic<DIGIT> first constructs its base
 * RadixMagic<DIGIT - 1> (i.e. handles all lower-numbered digits) and then runs
 * the scatter pass for DIGIT itself. No class should be generated I think
 * (compiler should be smart).
 *
 * Expects `radics` to already contain the per-digit prefix sums (sort() calls
 * calcPrefixSums before using this). Each entry is pre-decremented to obtain
 * the target index of the next element carrying that digit value.
 *
 * @param radics Radix table, modified (entries are decremented) by the pass.
 * @param from   Source buffer of this pass; swapped with `to` afterwards.
 * @param to     Target buffer of this pass; swapped with `from` afterwards.
 * @param size   Number of elements in both buffers.
 */
template<int DIGIT>
struct RadixMagic : public RadixMagic<DIGIT - 1> {
    inline RadixMagic(size_t *radics, uint32_t *&from, uint32_t *&to, size_t size) noexcept // BEWARE: "*&" needed to swap pointers..
        : RadixMagic<DIGIT - 1>(radics, from, to, size) {
        // DEBUG
        //printf("%d before: ", DIGIT);
        //debugArr(from, size);

        for(size_t i = size; i > 0; --i) { // right-to-left so the order established by earlier passes is kept
            // Get num and its new offset / location
            auto num = from[i - 1];
            auto digVal = getDigit<DIGIT>(num);
            auto offset = (--rGet<DIGIT>(radics, digVal));

            // Add to the proper target location
            to[offset] = num;
        }

        // DEBUG
        //printf("%d after: ", DIGIT);
        //debugArr(to, size);

        // Only swaps pointers :-) - the freshly written buffer becomes the next pass' source
        std::swap(from, to);
    }
};
/** Terminates the template recursion: the base below digit 0 performs no pass at all */
template<>
struct RadixMagic<-1> {
    // Takes the same argument list as the primary template's constructor; nothing is used
    inline RadixMagic(size_t *, uint32_t *&, uint32_t *&, size_t) noexcept {}
};
/* SORT */
/** Sort the given array (in-place sorting) with the given size */
inline void sort(uint32_t arr[], size_t size) noexcept {
@ -103,12 +169,34 @@ namespace MagyarSort {
// Calculate occurrences of digits
countOccurences(arr, size, radics);
//debugRadics(radics);

// Calculate prefix sums
calcPrefixSums(radics);
//debugRadics(radics);

/* Regular (old) radix sort with small twist */

// Regular radix sort - I just changed occurrence counting and prefix summing to have more ILP
// But because my approach does not use that, I want to keep this version in a branch for a
// regular radix sort using better ILP just to see how it is doing if I wrote those "Magic"
// above already anyways...

// Regular radix sort needs a copy, see: https://www.youtube.com/watch?v=ujb2CIWE8zY
// NOTE(review): this allocation can throw inside a noexcept function (=> std::terminate).
std::vector<uint32_t> arc(size);
uint32_t *from = arr;
uint32_t *to = arc.data(); // data() is well defined for size == 0 too, unlike &arc[0]
RadixMagic<DIGITS - 1>(radics, from, to, size);

// With an other API we could spare this copy if we can delete original arr and return ptr or something...
// I am fine with this... this is not my main idea anyways, just little ILP tweak to regular radix sort
//if(to != arr) { // <- logically, but bad they are already swapped here!!! BEWARE
if(from != arr) { // <- in reality this is what we want because of last swap happened anyways!
    // memcpy counts BYTES, not elements: copy the whole array, not just its first `size` bytes
    memcpy(arr, from, size * sizeof(uint32_t));
}
}
};

View File

@ -2,7 +2,7 @@ debug: test.cpp magyarsort.h
g++ test.cpp -g -std=c++14 -o test.out
# Optimized build; "-o" must be directly followed by the output file name
release: test.cpp magyarsort.h
	g++ test.cpp -std=c++14 -O2 -o test.out
clean: test.out
rm test.out

119
test.cpp
View File

@ -1,15 +1,128 @@
/* LICENCE: CC3 - look it up, you need to mention me but that is all */
/* CONFIG */
// Uncomment next line to follow Creel: https://www.youtube.com/watch?v=ujb2CIWE8zY
// #define CREEL // Uses the fixed 16-element input instead of SORT_WIDTH random values and sets MAGYAR_SORT_NIBBLE!
// Number of input elements to generate - unused when CREEL is defined!
#define SORT_WIDTH 40000
// Uncomment this to use nibbles as digits and not bytes - CREEL defines this anyways
//#define MAGYAR_SORT_NIBBLE
// Uncomment if you want to see output before / after sorts (debugging for example)
//#define PRINT_OUTPUT
/* Includes */
#include <cstring>
#include <cstdint>
#include <cstdio>
#include <cstdlib> // std::rand | rand
#include <vector>
#include <chrono>
#include <algorithm> // std::sort
#include "magyarsort.h"
/* Input generation and prerequisites */
#ifdef CREEL
#define MAGYAR_SORT_NIBBLE
#define PRINT_OUTPUT
/**
 * Builds the fixed 16-element input used in the CREEL configuration.
 *
 * NOTE(review): MAGYAR_SORT_NIBBLE is defined just above this function, i.e.
 * after "magyarsort.h" was already included, so it presumably does not affect
 * the digit configuration chosen inside the header - confirm.
 *
 * @return A vector holding a copy of the 16 constants below, in table order.
 */
static inline std::vector<uint32_t> GenerateInput() {
    static constexpr uint32_t CreelHex[16] = {
        // Homage to https://www.youtube.com/watch?v=ujb2CIWE8zY haha
        // When doing nibbles these are visible all throughout all the
        // steps and these will be easily readable in debugger in hex!
        0x277, 0x806, 0x681, 0x462,
        0x787, 0x163, 0x284, 0x166,
        0x905, 0x518, 0x263, 0x395,
        0x988, 0x307, 0x779, 0x721
    };

    // Build the result directly from the table's element range
    return std::vector<uint32_t>(CreelHex, CreelHex + 16);
}
#else
// Randomized values, no overrides
/**
 * Builds SORT_WIDTH input elements, each taken from one std::rand() call
 * (so no value exceeds RAND_MAX).
 *
 * @return The generated elements in generation order.
 */
static inline std::vector<uint32_t> GenerateInput() {
    std::vector<uint32_t> ret(SORT_WIDTH);
    for(uint32_t &slot : ret) {
        slot = static_cast<uint32_t>(std::rand());
    }
    return ret;
}
#endif
/* Test entry point */
/**
 * Sorts the same generated input once with MagyarSort::sort and once with
 * std::sort, compares the two results element by element and prints the
 * verdict together with both run times.
 *
 * @return 0 when both sorts produced the same result, 1 otherwise.
 */
int main() {
    uint32_t smallArr[16] = { 0xFF, 0xFFFFFFFF, 0xAA000000, 10, 20, 200, 1234513, 1, 0, 65535, 1024*1024, 1026*16, 7, 8, 1, 0};

    /* Input */
    std::vector<uint32_t> in1 = GenerateInput();
    std::vector<uint32_t> in2 = in1; // copy

    MagyarSort::sort(smallArr, 16);
    uint32_t *arr1 = in1.data();
    // TODO: check, etc.

#ifdef PRINT_OUTPUT
    printf("Inp: ");
    MagyarSort::debugArr(arr1, in1.size());
#endif // PRINT_OUTPUT

    /* Our sort */
    auto ourBegin = std::chrono::high_resolution_clock::now();
    MagyarSort::sort(arr1, in1.size());
    auto ourEnd = std::chrono::high_resolution_clock::now();

#ifdef PRINT_OUTPUT
    printf("Our: ");
    MagyarSort::debugArr(arr1, in1.size());
#endif // PRINT_OUTPUT

    /* std::sort */
    auto stdBegin = std::chrono::high_resolution_clock::now();
    std::sort(std::begin(in2), std::end(in2));
    auto stdEnd = std::chrono::high_resolution_clock::now();

#ifdef PRINT_OUTPUT
    printf("std: ");
    MagyarSort::debugArr(&in2[0], in2.size());
#endif // PRINT_OUTPUT

    /* Check against std - the real test */
    bool good = true;
    for(size_t i = 0; good && (i < in1.size()); ++i) {
        good &= (in1[i] == in2[i]);
    }

    printf("Results:\n\n");
    // The trailing newline keeps the verdict below on its own line
    printf("- Sorted %zu elements\n", in1.size());
    if(good) printf("- Same result as std::sort!\n");
    else printf("- Differs from std::sort! Error!\n");
    printf("\n");

    auto stdElapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(stdEnd - stdBegin);
    auto ourElapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(ourEnd - ourBegin);
    printf("Time (std sort): %.3f ms.\n", stdElapsed.count() * 1e-6);
    printf("Time (our sort): %.3f ms.\n", ourElapsed.count() * 1e-6);

    // Report a mismatch through the exit code as well, so scripts can detect a failed run
    return good ? 0 : 1;
}