Implemented ILP and cache optimized simple radix variant - surprisingly good already!
This commit is contained in:
parent
4199393153
commit
68684f7fb0
106
magyarsort.h
106
magyarsort.h
@ -13,7 +13,12 @@
|
||||
#include <cstdint>
|
||||
#include <cstring> // memset
|
||||
|
||||
// TODO: Only for the regular radix I guess
|
||||
#include <vector>
|
||||
#include <algorithm> // std::swap
|
||||
|
||||
namespace MagyarSort {
|
||||
/* CONFIG */
|
||||
|
||||
// Only change these if you know what you are doing
|
||||
// I use these because I want to see if nibbles are
|
||||
@ -23,9 +28,38 @@ namespace MagyarSort {
|
||||
// - DIGIT_RANGE and BITS_PER_DIGIT should correspond
|
||||
// - DIGITS should also correspond with the uint32_t
|
||||
// - and DIGIT_RANGE should be 2^n value (16 or 256)
|
||||
// Only BITS_PER_DIGIT is chosen by hand; the other two constants are derived from it
// so the three values can never drift out of sync with each other.
#ifdef MAGYAR_SORT_NIBBLE
// Per-nibble digits sorting
static constexpr int BITS_PER_DIGIT = 4; // bits per digit
#else
// Per-byte digits sorting
static constexpr int BITS_PER_DIGIT = 8; // bits per digit
#endif
static constexpr int DIGITS = 32 / BITS_PER_DIGIT;      // number of digits in a uint32_t key
static constexpr int DIGIT_RANGE = 1 << BITS_PER_DIGIT; // number of distinct values of one digit

static_assert(DIGITS * BITS_PER_DIGIT == 32, "digits must tile the 32-bit key exactly");
|
||||
|
||||
/* DEBUG */
|
||||
|
||||
/**
 * Debug helper: prints the first `size` elements of `arr` to stdout as
 * hexadecimal values, each followed by ", ", then a terminating newline.
 *
 * Declared inline because it is defined in a header.
 */
inline void debugArr(uint32_t *arr, size_t size) {
    // size_t index: the bound is a size_t, a signed int index would mix signedness
    for(size_t i = 0; i < size; ++i) {
        // %x expects unsigned int - make the argument type explicit
        printf("%x, ", static_cast<unsigned int>(arr[i]));
    }
    printf("\n");
}
|
||||
|
||||
void debugRadics(size_t *radics) {
|
||||
for(size_t j = 0; j < DIGITS; ++j) {
|
||||
printf("d%d: ", j);
|
||||
for(size_t i = 0; i < DIGIT_RANGE; ++i) {
|
||||
printf("%d,", radics[i + DIGIT_RANGE*j]);
|
||||
}
|
||||
printf("\n\n");
|
||||
}
|
||||
}
|
||||
|
||||
/* HELPERS */
|
||||
|
||||
template<int DIGIT_CHOICE>
|
||||
static inline uint32_t getDigit(uint32_t num) noexcept {
|
||||
@ -73,6 +107,13 @@ namespace MagyarSort {
|
||||
inline PrefixMagic(size_t *radics, size_t *prev, int i) noexcept {}
|
||||
};
|
||||
|
||||
/** Gets REFERENCE to the given digit from the radix-array that has more than one digits */
|
||||
template<int DIGIT>
|
||||
static inline size_t &rGet(size_t *radics, size_t i) noexcept {
|
||||
static constexpr int DSTART = (DIGIT * DIGIT_RANGE);
|
||||
return radics[DSTART + i];
|
||||
}
|
||||
|
||||
static inline void calcPrefixSums(size_t *radics) noexcept {
|
||||
static thread_local size_t prev[DIGITS];
|
||||
memset(prev, 0, sizeof(prev));
|
||||
@ -83,15 +124,40 @@ namespace MagyarSort {
|
||||
}
|
||||
}
|
||||
|
||||
void debugIt(size_t *radics) {
|
||||
for(size_t j = 0; j < DIGITS; ++j) {
|
||||
printf("d%d: ", j);
|
||||
for(size_t i = 0; i < DIGIT_RANGE; ++i) {
|
||||
printf("%d,", radics[i + DIGIT_RANGE*j]);
|
||||
/** Recursive Functor: no class should be generated I think (compiler should be smart) */
|
||||
template<int DIGIT>
|
||||
struct RadixMagic : public RadixMagic<DIGIT - 1> {
|
||||
inline RadixMagic(size_t *radics, uint32_t *&from, uint32_t *&to, size_t size) noexcept // BEWARE: "*&" needed to swap pointers..
|
||||
: RadixMagic<DIGIT - 1>(radics, from, to, size) {
|
||||
// DEBUG
|
||||
//printf("%d before: ", DIGIT);
|
||||
//debugArr(from, size);
|
||||
|
||||
for(size_t i = size; i > 0; --i) { // right-to-left to ensure already sorted digits order we keep for iterations
|
||||
// Get num and its new offset / location
|
||||
auto num = from[i - 1];
|
||||
auto digVal = getDigit<DIGIT>(num);
|
||||
auto offset = (--rGet<DIGIT>(radics, digVal));
|
||||
|
||||
// Add to the proper target location
|
||||
to[offset] = num;
|
||||
}
|
||||
printf("\n\n");
|
||||
|
||||
// DEBUG
|
||||
//printf("%d after: ", DIGIT);
|
||||
//debugArr(to, size);
|
||||
|
||||
// Only swaps pointers :-)
|
||||
std::swap(from, to);
|
||||
}
|
||||
}
|
||||
};
|
||||
/** Ends template recursion */
|
||||
template<>
|
||||
struct RadixMagic<-1> {
|
||||
inline RadixMagic(size_t *radics, uint32_t *&from, uint32_t *&to, size_t size) noexcept { }
|
||||
};
|
||||
|
||||
/* SORT */
|
||||
|
||||
/** Sort the given array (in-place sorting) with the given size */
|
||||
inline void sort(uint32_t arr[], size_t size) noexcept {
|
||||
@ -103,12 +169,34 @@ namespace MagyarSort {
|
||||
// Calculate occurrences of digits
|
||||
countOccurences(arr, size, radics);
|
||||
|
||||
debugIt(radics);
|
||||
//debugRadics(radics);
|
||||
|
||||
// Calculate prefix sums
|
||||
calcPrefixSums(radics);
|
||||
|
||||
debugIt(radics);
|
||||
//debugRadics(radics);
|
||||
|
||||
/* Regular (old) radix sort with small twist */
|
||||
|
||||
// Regular radix sort - I just changed occurrence counting and prefix summing to have more ILP
|
||||
// But because my approach does not use that, I want to keep this version in a branch for a
|
||||
// regular radix sort using better ILP just to see how it is doing if I wrote those "Magic"
|
||||
// above already anyways...
|
||||
|
||||
// Regular radix sort needs a copy, see: https://www.youtube.com/watch?v=ujb2CIWE8zY
|
||||
std::vector<uint32_t> arc(size);
|
||||
|
||||
uint32_t *from = arr;
|
||||
uint32_t *to = &arc[0];
|
||||
|
||||
RadixMagic<DIGITS - 1>(radics, from, to, size);
|
||||
|
||||
// With an other API we could spare this copy if we can delete original arr and return ptr or something...
|
||||
// I am fine with this... this is not my main idea anyways, just little ILP tweak to regular radix sort
|
||||
//if(to != arr) { // <- logically, but bad they are already swapped here!!! BEWARE
|
||||
if(from != arr) { // <- in reality this is what we want because of last swap happened anyways!
|
||||
memcpy(arr, from, size);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
2
makefile
2
makefile
@ -2,7 +2,7 @@ debug: test.cpp magyarsort.h
|
||||
g++ test.cpp -g -std=c++14 -o test.out
|
||||
|
||||
# -o must be followed directly by the output file name, so it goes last
release: test.cpp magyarsort.h
	g++ test.cpp -std=c++14 -O2 -o test.out
|
||||
|
||||
clean: test.out
|
||||
rm test.out
|
||||
|
||||
119
test.cpp
119
test.cpp
@ -1,15 +1,128 @@
|
||||
/* LICENCE: CC3 - look it up, you need to mention me but that is all */
|
||||
|
||||
/* CONFIG */
|
||||
|
||||
// Uncomment next line to follow Creel: https://www.youtube.com/watch?v=ujb2CIWE8zY
|
||||
// #define CREEL // Uses the 16 fixed inputs (SORT_WIDTH is ignored) and sets MAGYAR_SORT_NIBBLE!
|
||||
|
||||
// Number of input elements to generate - unused when CREEL is defined!
|
||||
#define SORT_WIDTH 40000
|
||||
// Uncomment this to use nibbles as digits and not bytes - CREEL defines this anyways
|
||||
//#define MAGYAR_SORT_NIBBLE
|
||||
|
||||
// Uncomment if you want to see output before / after sorts (debugging for example)
|
||||
//#define PRINT_OUTPUT
|
||||
|
||||
/* Includes */
|
||||
|
||||
#include <cstring>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib> // std::rand | rand
|
||||
#include <vector>
|
||||
#include <chrono>
|
||||
#include <algorithm> // std::sort
|
||||
#include "magyarsort.h"
|
||||
|
||||
/* Input generation and prerequisites */
|
||||
|
||||
#ifdef CREEL
|
||||
// NOTE(review): this define comes after `#include "magyarsort.h"` above, and the header
// picks its digit configuration with `#ifdef MAGYAR_SORT_NIBBLE` at include time - so it
// presumably has no effect on the header here. TODO confirm; if so, move it before the include.
#define MAGYAR_SORT_NIBBLE
|
||||
#define PRINT_OUTPUT
|
||||
/** Returns the fixed 16-element input used when CREEL is defined */
static inline std::vector<uint32_t> GenerateInput() {
    // Homage to https://www.youtube.com/watch?v=ujb2CIWE8zY haha
    // Each value has three hex digits, so with nibble digits they stay
    // easy to follow through every step and easy to read in a debugger.
    static constexpr uint32_t kCreelHex[16] = {
        0x277, 0x806, 0x681, 0x462,
        0x787, 0x163, 0x284, 0x166,
        0x905, 0x518, 0x263, 0x395,
        0x988, 0x307, 0x779, 0x721
    };

    // Build the vector straight from the table
    return std::vector<uint32_t>(std::begin(kCreelHex), std::end(kCreelHex));
}
|
||||
#else
|
||||
// Randomized values, no overrides
|
||||
static inline std::vector<uint32_t> GenerateInput() {
|
||||
std::vector<uint32_t> ret;
|
||||
ret.resize(SORT_WIDTH);
|
||||
|
||||
for(size_t ek = 0; ek < SORT_WIDTH; ++ek) {
|
||||
ret[ek] = (uint32_t)std::rand();
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Test entry point */
|
||||
|
||||
int main() {
|
||||
uint32_t smallArr[16] = { 0xFF, 0xFFFFFFFF, 0xAA000000, 10, 20, 200, 1234513, 1, 0, 65535, 1024*1024, 1026*16, 7, 8, 1, 0};
|
||||
/* Input */
|
||||
std::vector<uint32_t> in1 = GenerateInput();;
|
||||
std::vector<uint32_t> in2 = in1; // copy
|
||||
|
||||
MagyarSort::sort(smallArr, 16);
|
||||
uint32_t *arr1 = &(in1[0]);
|
||||
|
||||
// TODO: check, etc.
|
||||
#ifdef PRINT_OUTPUT
|
||||
printf("Inp: ");
|
||||
MagyarSort::debugArr(arr1, in1.size());
|
||||
#endif // PRINT_OUTPUT
|
||||
|
||||
/* Our sort */
|
||||
auto ourBegin = std::chrono::high_resolution_clock::now();
|
||||
MagyarSort::sort(arr1, in1.size());
|
||||
auto ourEnd = std::chrono::high_resolution_clock::now();
|
||||
|
||||
#ifdef PRINT_OUTPUT
|
||||
printf("Our: ");
|
||||
MagyarSort::debugArr(arr1, in1.size());
|
||||
#endif // PRINT_OUTPUT
|
||||
|
||||
/* std::sort */
|
||||
auto stdBegin = std::chrono::high_resolution_clock::now();
|
||||
std::sort(std::begin(in2), std::end(in2));
|
||||
auto stdEnd = std::chrono::high_resolution_clock::now();
|
||||
|
||||
#ifdef PRINT_OUTPUT
|
||||
printf("std: ");
|
||||
MagyarSort::debugArr(&in2[0], in2.size());
|
||||
#endif // PRINT_OUTPUT
|
||||
|
||||
/* Check against std - the real test */
|
||||
|
||||
bool good = true;
|
||||
for(size_t i = 0; good && (i < in1.size()); ++i) {
|
||||
good &= (in1[i] == in2[i]);
|
||||
}
|
||||
|
||||
printf("Results:\n\n");
|
||||
printf("- Sorted %zu elements", in1.size());
|
||||
if(good) printf("- Same result as std::sort!\n");
|
||||
else printf("- Differs from std::sort! Error!\n");
|
||||
printf("\n");
|
||||
auto stdElapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(stdEnd - stdBegin);
|
||||
auto ourElapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(ourEnd - ourBegin);
|
||||
printf("Time (std sort): %.3f ms.\n", stdElapsed.count() * 1e-6);
|
||||
printf("Time (our sort): %.3f ms.\n", ourElapsed.count() * 1e-6);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user