#include // std::pair #include #include #include #include #include #include #include #include #include #include #include #include #include #include // mlock & munlock #include "ska_sort.hpp" #include "gptsort.h" #include "thiersort.h" #include "thiersort2.h" #include "qsort/qsort.h" #include "qsort/zssort.h" #include "qsort/schwab_sort.h" #include "qsort/chatgpt_qs.h" #include "threepass.h" #include "thiersort3.h" // #define MAGYAR_SORT_DEFAULT_REUSE #include "magyarsort.h" #include "space_partitioning_sort/spsort.h" std::map results; std::map worst; void measure(const std::string &inputtype, const std::string &name, std::function f) { auto begin = std::chrono::high_resolution_clock::now(); f(); auto dur = std::chrono::high_resolution_clock::now() - begin; double seconds = dur / std::chrono::milliseconds(1) / 1000.0; results[name] = seconds; worst[name] = std::max(worst[name], seconds); } std::vector inputtypes = { "constant", "asc", "desc", "ascasc", "ascdesc", "descasc", "descdesc", "smallrange", "rand", }; std::vector geninput(const std::string &type, int n) { std::vector v(n); if (type == "constant") { int c = rand(); for (int i = 0; i < n; i++) { v[i] = c; } } else if (type == "asc") { for (int i = 0; i < n; i++) { v[i] = i; } } else if (type == "desc") { for (int i = 0; i < n; i++) { v[i] = n - i; } } else if (type == "ascasc") { for (int i = 0; i < n / 2; i++) { v[i] = i; v[i + n / 2] = i; } } else if (type == "ascdesc") { for (int i = 0; i < n / 2; i++) { v[i] = i; v[i + n / 2] = n - i; } } else if (type == "descasc") { for (int i = 0; i < n / 2; i++) { v[i] = n - i; v[i + n / 2] = i; } } else if (type == "descdesc") { for (int i = 0; i < n / 2; i++) { v[i] = n - i; v[i + n / 2] = n - i; } } else if (type == "smallrange") { int c = rand() / 2; for (int i = 0; i < n; i++) { v[i] = c + rand() % 100; } } else if (type == "rand") { for (int i = 0; i < n; i++) { v[i] = rand(); } } return v; } void twopass(uint32_t *a, int n) { assert(n * int64_t(sizeof(a[0])) <= 
INT_MAX); // alloc helper buffers. int sz = n * sizeof(a[0]); std::vector bucketdata(1 << 16); uint32_t *buf = (uint32_t *)malloc(sz); assert(buf != NULL); // pass 1: sort by lower 16 bits. for (int i = 0; i < n; i++) bucketdata[a[i] & 0xffff]++; int offset = 0; for (int i = 0; i < 1 << 16; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xffff]++] = a[i]; // pass 2: sort by upper 16 bits. memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0])); for (int i = 0; i < n; i++) bucketdata[buf[i] >> 16]++; offset = 0; for (int i = 0; i < 1 << 16; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 16]++] = buf[i]; free(buf); } /** "Standardly" written inplace recursive quicksort */ static inline void do_qsort(uint32_t *a, int n) noexcept { assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); quicksort(a, 0, n - 1); } static inline void do_qsr3(uint32_t *a, int n) noexcept { assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); rpivotstate state; quicksort_rand3(a, 0, n - 1, &state); } /** Quicksort with fast random pivoting */ static inline void do_qsr(uint32_t *a, int n) noexcept { assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); rpivotstate state; quicksort_rand(a, 0, n - 1, &state); } /** Zsolti's quicksort version with at most O(log(n)) memuse because loop instead half of the recursions */ static inline void do_zsssort(uint32_t *a, int n) noexcept { assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); zssort(a, 0, n - 1); } /** Fastrandomized zss */ static inline void do_zsr(uint32_t *a, int n) noexcept { assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); rpivotstate state; zssort_rand(a, 0, n - 1, &state); } /** Fastrandomized zss3 */ static inline void do_zsr3(uint32_t *a, int n) noexcept { assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); rpivotstate state; zssort_rand3(a, 0, n - 1, &state); } /** Fastrandomized zss3 single-pass threewayed */ 
static inline void do_zsr3_sp(uint32_t *a, int n) noexcept { assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); rpivotstate state; zssort_rand3_sp(a, 0, n - 1, &state); } /** Fastrandomized zss3 single-pass threewayed */ static inline void do_zsr3_sp2(uint32_t *a, int n) noexcept { assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); rpivotstate state; zssort_rand3_sp2(a, 0, n - 1, &state); } /** Fastrandomized zss with const input check */ static inline void do_zsrc(uint32_t *a, int n) noexcept { assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); rpivotstate state; zsrc(a, 0, n - 1, &state); } /** meanqs */ static inline void do_meanqs(uint32_t *a, int n) noexcept { assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); rpivotstate state; meanqs(a, 0, n - 1, &state); } /** neoqs */ static inline void do_neoqs(uint32_t *a, int n) noexcept { assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); rpivotstate state; neoqs(a, 0, n - 1, &state); } /** schwab */ static inline void do_schwab(uint32_t *a, int n) noexcept { assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); uint32_t junk; sch_rand_state state = schwab_rand_state(junk); schwab_sort(a, 0, n - 1, &state); } /** thier2 */ static inline void do_thier2(uint32_t *a, int n) noexcept { assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); uint32_t junk; sch_rand_state state = schwab_rand_state(junk); std::vector tmp(n); thiersort2(a, &(tmp[0]), n, &state); } /** thier3 */ static inline void do_thier3(uint32_t *a, int n) noexcept { assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); std::vector tmp(n); thiersort3(a, &(tmp[0]), n); } /** 3+1 pass bottom-up radix */ static inline void do_threepass(uint32_t *a, int n) noexcept { threepass(a, n); } // mormord — Today at 2:27 AM // 1 2 2 2 3 // // 0 1 2 3 4 // |1|2 2 3 2 // 1|2|2 3 2 // 1|3|2 2 2 // 1|2|2 2 3 // 1|2|2 2 3 // 1 2|2|2 3 // ^ // Pivot // // állítás: pivottól balra helyükön vannak az elemek rendezettségük szerint // // Kezdés Indexek = Prefix összeg - 1 (utolsó helyek az elemeknek) // // Ha pivot új 
helyének meghatározot index > pivot_index (|.| helye) // swap // --index // különben // ++pivot_index template static inline bool morbittop(uint32_t elem) noexcept { return (elem >> (8 * j)) & 0x80; // Only top bit } template static inline uint32_t morgrab(uint32_t elem) noexcept { return (elem >> (8 * j)) & 0x7f; // Only 7-bit! } /** * Count occurences AND divides array into two partitions by its topmost bit. * - Similar to quicksort partitioning, but changed accordingly. * - MSB 0 bit values will come first partition. * * @param a The array to partition and occurence count. * @param n The length of the array. * @param radics1 A 128-sized array for occurence counting the bottom partition. * @param radics2 A 128-sized array for occurence counting the top partition. * @param DIGIT The digit in question (for a morgrab(..) call) * @returns The partition bounds are: [0..first) and [second..n) with logical means to mark empty partitions. */ template static inline std::pair oc_bit_partition( uint32_t *a, uint32_t n, uint32_t *radics1, uint32_t *radics2) noexcept { // See Hoare's OG quicksort why int64_t i = 0; int64_t j = n - 1; while(true) { // Move past well-placed ones // And occurence count them // Rem.: In quicksort usually a do-while loop while ((i < j) && !morbittop(a[i])) { ++radics1[morgrab(a[i])]; ++i; } while ((i < j) && morbittop(a[j])) { ++radics2[morgrab(a[j])]; --j; } // If the indices crossed, return // Rem.: Not >= to ensure occ. counts! See also: (*) if(i > j) return std::make_pair(i, j + 1); // Check for swap if(i < j) { // Swap // No need occurence count here as above loops will handle! uint32_t tmp = a[i]; a[i] = a[j]; a[j] = tmp; } else { // i == j case: count occurence properly for the one. 
if(!morbittop(a[j])) { ++radics1[morgrab(a[i])]; ++i; } else { ++radics2[morgrab(a[j])]; --j; } } } } template static inline void mormord_sort_impl(uint32_t *a, int n) noexcept { /* Preparation */ uint32_t radics1[128] = {0}; uint32_t radics2[128] = {0}; /* [from, to) index: only where prefix sums change - usually nonfull */ uint32_t real_radics1[128 * 2] = {0}; uint32_t real_radics2[128 * 2] = {0}; // Count occurences and partition by topmost bit std::pair boundz = oc_bit_partition(a, n, radics1, radics2); /* Prefix sum + real radics calc O(256) */ /* Radics: */ /* fr: {10, 20, 10, 0, 5, 15,...} */ /* to: {10, 30, 40, 40, 45, 60,..} */ /* Real radics: */ /* to: {[0, 10], [10, 30], [30, 40], [40, 45], [45, 60]} */ /* 0. 1. 2. 4. 5. */ /* (because radix value 3 is not found in input) */ uint32_t prev1 = 0; uint32_t reali1 = 0; uint32_t prev2 = 0; uint32_t reali2 = 0; #pragma GCC unroll 16 for(int i = 0; i < 128; ++i) { // Hopefully we get more ILP out of this // Also I tried branchless before adding // ILP here and it slowed things, so first // let us try it with branch prediction! if(radics1[i] != 0) { radics1[i] += prev1; real_radics1[reali1] = prev1; real_radics1[reali1 + 1] = radics1[i]; prev1 = radics1[i]; reali1 += 2; } else { radics1[i] += prev1; prev1 = radics1[i]; } if(radics2[i] != 0) { radics2[i] += prev2; real_radics2[reali2] = prev2; real_radics2[reali2 + 1] = radics2[i]; prev2 = radics2[i]; reali2 += 2; } else { radics2[i] += prev2; prev2 = radics2[i]; } } // Inplace swap, with added ILP / branchless opt. // Without it its data dependent like crazy... uint32_t pivoti1 = 0; uint32_t pivoti2 = boundz.second; while((pivoti1 < boundz.first) && (pivoti2 < n)) { /* Pivot 1 */ uint32_t radixval1 = morgrab(a[pivoti1]); uint32_t targeti1 = --radics1[radixval1]; // dec index (!) 
// Bitmask: true -> 11.....1; false -> 00.....0 uint32_t mask1 = ~((targeti1 > pivoti1) - 1); // Branchless swap (using bitmask) uint32_t delta1 = (a[pivoti1] ^ a[targeti1]) & mask1; a[pivoti1] = a[pivoti1] ^ delta1; a[targeti1] = a[targeti1] ^ delta1; // "else" branch pivoti1 += !mask1; radics1[radixval1] += !mask1; // undec index (!) /* Pivot 2 */ uint32_t radixval2 = morgrab(a[pivoti2]); uint32_t targeti2 = boundz.second + (--radics2[radixval2]); // dec index (!) // Bitmask: true -> 11.....1; false -> 00.....0 uint32_t mask2 = ~((targeti2 > pivoti2) - 1); // Branchless swap (using bitmask) uint32_t delta2 = (a[pivoti2] ^ a[targeti2]) & mask2; a[pivoti2] = a[pivoti2] ^ delta2; a[targeti2] = a[targeti2] ^ delta2; // "else" branch pivoti2 += !mask2; radics2[radixval2] += !mask2; // undec index (!) } // Finish pivot1 if there are still elements.. while(pivoti1 < boundz.first) { /* Pivot 1+ */ uint32_t radixval1 = morgrab(a[pivoti1]); uint32_t targeti1 = --radics1[radixval1]; // dec index (!) // Bitmask: true -> 11.....1; false -> 00.....0 uint32_t mask1 = ~((targeti1 > pivoti1) - 1); // Branchless swap (using bitmask) uint32_t delta1 = (a[pivoti1] ^ a[targeti1]) & mask1; a[pivoti1] = a[pivoti1] ^ delta1; a[targeti1] = a[targeti1] ^ delta1; // "else" branch pivoti1 += !mask1; radics1[radixval1] += !mask1; // undec index (!) } // Finish pivot2 if there are still elements.. while(pivoti2 < n) { /* Pivot 2+ */ uint32_t radixval2 = morgrab(a[pivoti2]); uint32_t targeti2 = boundz.second + (--radics2[radixval2]); // dec index (!) // Bitmask: true -> 11.....1; false -> 00.....0 uint32_t mask2 = ~((targeti2 > pivoti2) - 1); // Branchless swap (using bitmask) uint32_t delta2 = (a[pivoti2] ^ a[targeti2]) & mask2; a[pivoti2] = a[pivoti2] ^ delta2; a[targeti2] = a[targeti2] ^ delta2; // "else" branch pivoti2 += !mask2; radics2[radixval2] += !mask2; // undec index (!) 
} // Possible recursions if constexpr (j != 0) { /* Partition 1 recursions */ for(int i = 0; i < reali1; i += 2) { /* inclusive */ uint32_t from = real_radics1[i]; /* non-inclusive */ uint32_t to = real_radics1[i + 1]; mormord_sort_impl(&a[from], (to - (from))); } /* Partition 2 recursions */ for(int i = 0; i < reali2; i += 2) { /* inclusive */ uint32_t from = real_radics2[i]; /* non-inclusive */ uint32_t to = real_radics2[i + 1]; mormord_sort_impl(&a[from], (to - (from))); } } } static inline void mormord_sort(uint32_t *a, int n) noexcept { assert(n * uint32_t(sizeof(a[0])) <= INT_MAX); mormord_sort_impl<3>(a, n); } void fourpass(uint32_t *a, int n) { assert(n * int64_t(sizeof(a[0])) <= INT_MAX); // alloc helper buffers. int sz = n * sizeof(a[0]); std::vector bucketdata(1 << 8); uint32_t *buf = (uint32_t *)malloc(sz); assert(buf != NULL); uint32_t *src = a, *dst = buf; uintptr_t swapmask = (uintptr_t)a ^ (uintptr_t)buf; for (int shift = 0; shift < 32; shift += 8) { memset(&bucketdata[0], 0, bucketdata.size() * sizeof(bucketdata[0])); for (int i = 0; i < n; i++) bucketdata[src[i] >> shift & 0xff]++; int offset = 0; for (int i = 0; i < 1 << 8; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } for (int i = 0; i < n; i++) { dst[bucketdata[src[i] >> shift & 0xff]++] = src[i]; } src = (uint32_t *)((uintptr_t)src ^ swapmask); dst = (uint32_t *)((uintptr_t)dst ^ swapmask); } free(buf); } /** Only werks für das fourpassu! */ void my_memset(int *v) { memset(v, 0, (1 << 8) * sizeof(int)); } // hand-unrolled fourpass. void fourpassu(uint32_t *a, int n) { assert(n * int64_t(sizeof(a[0])) <= INT_MAX); // alloc helper buffers. int sz = n * sizeof(a[0]); static thread_local int bucketdata[1 << 8]; my_memset(bucketdata); uint32_t *buf = (uint32_t *)malloc(sz); assert(buf != NULL); // pass 1: sort by lower 8 bits. 
#pragma GCC unroll 32 for (int i = 0; i < n; i++) bucketdata[a[i] & 0xff]++; int offset = 0; #pragma GCC unroll 8 for (int i = 0; i < 1 << 8; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xff]++] = a[i]; // pass 2: sort by 2nd 8 bits. my_memset(bucketdata); #pragma GCC unroll 32 for (int i = 0; i < n; i++) bucketdata[buf[i] >> 8 & 0xff]++; offset = 0; #pragma GCC unroll 8 for (int i = 0; i < 1 << 8; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } #pragma GCC unroll 64 for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 8 & 0xff]++] = buf[i]; // pass 3: sort by 3rd 8 bits. my_memset(bucketdata); #pragma GCC unroll 32 for (int i = 0; i < n; i++) bucketdata[a[i] >> 16 & 0xff]++; offset = 0; #pragma GCC unroll 8 for (int i = 0; i < 1 << 8; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } #pragma GCC unroll 64 for (int i = 0; i < n; i++) buf[bucketdata[a[i] >> 16 & 0xff]++] = a[i]; // pass 4: sort by 4th 8 bits. my_memset(bucketdata); #pragma GCC unroll 32 for (int i = 0; i < n; i++) bucketdata[buf[i] >> 24 & 0xff]++; offset = 0; #pragma GCC unroll 8 for (int i = 0; i < 1 << 8; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } #pragma GCC unroll 32 for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i]; free(buf); } static inline uint32_t byterotate(uint32_t x) { return (x >> 8) | (x << 24); } void fourrots(uint32_t *arr, int n) { assert(n * int64_t(sizeof(arr[0])) <= INT_MAX); assert(n % 4 == 0); // alloc helper buffers. 
int sz = n * sizeof(arr[0]); std::vector bucketdata(1 << 8); int *btd = &bucketdata[0]; uint32_t *buf = (uint32_t *)malloc(sz); assert(buf != NULL); uint32_t *src = arr, *dst = buf; uintptr_t swapmask = (uintptr_t)arr ^ (uintptr_t)buf; uint32_t a, b, c, d; uint32_t abt, bbt, cbt, dbt; for (int shift = 0; shift < 32; shift += 8) { memset(btd, 0, bucketdata.size() * sizeof(bucketdata[0])); for (int i = 0; i < n; i += 4) { a = src[i]; b = src[i + 1]; c = src[i + 2]; d = src[i + 3]; abt = a & 0xff; bbt = b & 0xff; cbt = c & 0xff; dbt = d & 0xff; btd[abt]++; btd[bbt]++; btd[cbt]++; btd[dbt]++; } int offset = 0; for (int i = 0; i < 1 << 8; i++) { int d = bucketdata[i]; bucketdata[i] = offset; offset += d; } for (int i = 0; i < n; i += 4) { a = src[i]; b = src[i + 1]; c = src[i + 2]; d = src[i + 3]; abt = a & 0xff; bbt = b & 0xff; cbt = c & 0xff; dbt = d & 0xff; dst[btd[abt]++] = byterotate(a); dst[btd[bbt]++] = byterotate(b); dst[btd[cbt]++] = byterotate(c); dst[btd[dbt]++] = byterotate(d); } src = (uint32_t *)((uintptr_t)src ^ swapmask); dst = (uint32_t *)((uintptr_t)dst ^ swapmask); } free(buf); } // frewr - four rewrites. 
/**
 * Four pass LSD radix sort that counts all four digits in a single sweep,
 * fills the buckets from the back and stores every element rotated right by
 * one byte, so each pass reads its digit from the lowest byte.
 *
 * @param arr The array to sort in place.
 * @param n The length of the array.
 */
void frewr(uint32_t *arr, int n) {
	if (n <= 0) return; // nothing to sort, and malloc(0) may return NULL

	// Size computed in size_t so that large n cannot overflow an int.
	const size_t bytes = size_t(n) * sizeof(arr[0]);
	uint32_t *tmpbuf = (uint32_t *)malloc(bytes);
	assert(tmpbuf != NULL);
	// The result is not checked: the sort works the same whether or not the
	// lock succeeds.
	mlock(tmpbuf, bytes);

	// btoffsets[pass] belongs to the pass with the same number below:
	// pass 3 runs first and uses the lowest byte, pass 0 the highest.
	int btoffsets[4][256] = {};
	#pragma GCC unroll 64
	for (int i = n - 1; i >= 0; i--) {
		const uint32_t a = arr[i];
		btoffsets[3][a & 0xff]++;
		btoffsets[2][a >> 8 & 0xff]++;
		btoffsets[1][a >> 16 & 0xff]++;
		btoffsets[0][a >> 24 & 0xff]++;
	}

	// Turn the counts into the index of the LAST slot of every bucket.
	int btend[4] = {n - 1, n - 1, n - 1, n - 1};
	#pragma GCC unroll 16
	for (int i = 255; i >= 0; i--) {
		#pragma GCC unroll 4
		for (int pass = 3; pass >= 0; pass--) {
			const int nbtend = btend[pass] - btoffsets[pass][i];
			btoffsets[pass][i] = btend[pass];
			btend[pass] = nbtend;
		}
	}

	uint32_t *src = arr, *dst = tmpbuf;
	#pragma GCC unroll 4
	for (int pass = 3; pass >= 0; pass--) {
		int *off = btoffsets[pass];
		#pragma GCC unroll 64
		for (int i = n - 1; i >= 0; i--) {
			const uint32_t v = src[i];
			// Walking backwards while filling from the back keeps it stable.
			dst[off[v & 0xff]--] = v >> 8 | v << 24;
			__builtin_prefetch(&dst[off[v & 0xff] - 2]);
		}
		uint32_t *tmp = src;
		src = dst;
		dst = tmp;
	}
	// After four passes the data is back in arr with its original bit layout.

	munlock(tmpbuf, bytes);
	free(tmpbuf);
}

/**
 * Four pass LSD radix sort that appends into 256 per-thread vectors and
 * copies them back after every pass.
 *
 * @param a The array to sort in place.
 * @param n The length of the array.
 */
void vsort(uint32_t *a, int n) {
	thread_local std::vector<uint32_t> bts[256];
	#pragma GCC unroll 4
	for (int shift = 0; shift < 32; shift += 8) {
		#pragma GCC unroll 64
		for (int i = 0; i < n; i++) bts[a[i] >> shift & 0xff].push_back(a[i]);
		#pragma GCC unroll 64
		for (int bt = 0, k = 0; bt < 256; bt++) {
			std::vector<uint32_t> &bucket = bts[bt];
			// Skip empty buckets: indexing element 0 of an empty vector is
			// undefined, and there is nothing to copy anyway.
			if (bucket.empty()) continue;
			memcpy(a + k, bucket.data(), bucket.size() * sizeof(a[0]));
			k += bucket.size();
			bucket.clear(); // keeps the capacity for the next pass / call
		}
	}
}

/**
 * Four pass LSD radix sort whose buckets are linked lists of fixed-size pages
 * taken from one shared pool; pages of the input buckets return to the pool
 * as soon as they have been read.
 *
 * @param a The array to sort in place.
 * @param n The length of the array.
 */
void pagedsort(uint32_t *a, int n) {
	enum { pagesize = 1024, bucketcount = 256 };
	// Worst case number of pages in use at once: every bucket of the input
	// and of the output set owns one (possibly empty) tail page, the full
	// pages of both sets together hold at most n + 1 elements' worth, and a
	// new output page is taken BEFORE the input page read in the same step
	// is given back. The old (n + pagesize - 1) / pagesize + 512 ran out of
	// pages e.g. for 1024 equal keys.
	const int pagecount = n / pagesize + 2 * bucketcount + 2;
	uint32_t *pd = (uint32_t *)malloc(size_t(pagecount) * pagesize * sizeof(a[0]));
	assert(pd != NULL);
	std::vector<int> freelist(pagecount);
	std::vector<int> next(pagecount); // next[p]: page following p in its bucket
	std::iota(std::begin(freelist), std::end(freelist), 0);
	struct bucket {
		int len;                // number of elements stored
		int headpage, lastpage; // first page and page currently written to
	};
	bucket bts[2 * bucketcount]; // two bucket sets: one read, one written

	// initial scatter.
	for (int bt = 0; bt < bucketcount; bt++) {
		const int p = freelist.back();
		freelist.pop_back();
		bts[bt] = {0, p, p};
	}
	for (int i = 0; i < n; i++) {
		bucket *bt = &bts[a[i] & 0xff];
		pd[bt->lastpage * pagesize + bt->len++ % pagesize] = a[i];
		if (bt->len % pagesize == 0) {
			// Page just became full: chain a fresh one.
			const int p = freelist.back();
			freelist.pop_back();
			next[bt->lastpage] = p;
			bt->lastpage = p;
		}
	}

	// intermediate level scatters.
	int ibase = 0, obase = bucketcount;
	for (int shift = 8; shift < 32; shift += 8) {
		for (int bt = 0; bt < bucketcount; bt++) {
			const int p = freelist.back();
			freelist.pop_back();
			bts[obase + bt] = {0, p, p};
		}
		for (int ibti = 0; ibti < bucketcount; ibti++) {
			struct bucket *ibt = &bts[ibase + ibti];
			int page = ibt->headpage;
			for (int i = 0; i < ibt->len; i++) {
				const uint32_t v = pd[page * pagesize + i % pagesize];
				struct bucket *obt = &bts[obase + (v >> shift & 0xff)];
				pd[obt->lastpage * pagesize + obt->len++ % pagesize] = v;
				if (obt->len % pagesize == 0) {
					const int p = freelist.back();
					freelist.pop_back();
					next[obt->lastpage] = p;
					obt->lastpage = p;
				}
				if (i % pagesize == pagesize - 1) {
					// Input page fully consumed: recycle it and move on.
					freelist.push_back(page);
					page = next[page];
				}
			}
			// The tail page is never released inside the loop above.
			freelist.push_back(ibt->lastpage);
		}
		// Swap the roles of the two bucket sets.
		ibase = bucketcount - ibase;
		obase = bucketcount - obase;
	}

	// the final gather.
	int k = 0;
	for (int ibti = 0; ibti < bucketcount; ibti++) {
		struct bucket *ibt = &bts[ibase + ibti];
		int page = ibt->headpage;
		for (int i = 0; i < ibt->len; i++) {
			a[k++] = pd[page * pagesize + i % pagesize];
			if (i % pagesize == pagesize - 1) {
				page = next[page];
			}
		}
	}
	free(pd);
}

// I measured this being faster than std::sort which is giga-lol wtf...
void thier_quicksort(uint32_t *arr, int n) { // Prepare: O(n) tselem *tarr = thiersort_prepare_array( arr, // union tskey (askey)(void *elem), [] (void *elem) { tskey k; k.u = *((uint32_t *)elem); return k; }, 4, // elemsize, n, // length, malloc); /* for(uint32_t i = 0; i < n; ++i) { printf("In: %d @%d\n", tarr[i].key.u, tarr[i].i); } */ // Quicksort by me ts_quicksort_inplace( tarr, 0, // from n, // to ts_lt_uint, nullptr); /* for(uint32_t i = 0; i < n; ++i) { printf("Out: %d @%d\n", tarr[i].key.u, tarr[i].i); } */ // Apply: O(n) uint32_t tmp[1]; // needed for elem swaps thiersort_apply( tarr, arr, n, 4, // elemsize tmp, free); } void thiersort_uintkey8(uint32_t *arr, int n) { // Prepare: O(n) tselem *tarr = thiersort_prepare_array( arr, // union tskey (askey)(void *elem), [] (void *elem) { tskey k; k.u = *((uint32_t *)elem); return k; }, 4, // elemsize, n, // length, malloc); /* for(uint32_t i = 0; i < n; ++i) { printf("In: %d @%d\n", tarr[i].key.u, tarr[i].i); } */ // Sort: O(n*loglogn on amortized on random input): thiersort8_uintkey( tarr, n, malloc, free); /* for(uint32_t i = 0; i < n; ++i) { printf("Out: %d @%d\n", tarr[i].key.u, tarr[i].i); } */ // Apply: O(n) uint32_t tmp[1]; // needed for elem swaps thiersort_apply( tarr, arr, n, 4, // elemsize tmp, free); } // to measure / profile a single variant void measure_single(int n) { for (auto inputtype : inputtypes) { printf("%10s", inputtype.c_str()); fflush(stdout); std::vector v(n); v = geninput(inputtype, n); //measure(inputtype, "sp", [&] { spsort(&v[0], v.size()); }); //measure(inputtype, "magyar", [&] { MagyarSort::sort(&v[0], v.size()); }); //measure(inputtype, "thier2", [&] { do_thier2(&v[0], v.size()); }); measure(inputtype, "threep", [&] { do_threepass(&v[0], v.size()); }); for (auto r : results) printf("%9.3fs", r.second); puts(""); } puts(""); printf("%10s", "worst"); for (auto w : worst) printf("%9.3fs", w.second); puts(""); printf("%10s", ""); for (auto w : worst) printf("%10s", 
w.first.c_str()); puts(""); } int main(int argc, char **argv) { //int n = 100000000; //int n = 10000000; int n = 5000000; //int n = 1000000; //int n = 100000; //int n = 20001; //int n = 20000; //int n = 1000; //int n = 200; //int n = 170; //int n = 100; //int n = 180; //int n = 20; if(argc > 1) { const char* arg = argv[1]; n = atoi(arg); } printf("Sorting %d elements:\n\n", n); // Uncomment this for profiling and alg! //measure_single(n); //return 0; for (auto inputtype : inputtypes) { printf("%10s", inputtype.c_str()); // fflush(stdout); std::vector v(n), w(n), expected(n); v = geninput(inputtype, n); measure(inputtype, "copy", [&] { w = v; }); w = v; measure(inputtype, "std", [&] { std::sort(std::begin(w), std::end(w)); }); expected = w; /* w = v; measure(inputtype, "ska", [&] { ska_sort(std::begin(w), std::end(w)); }); */ w = v; measure(inputtype, "ska_copy", [&] { std::vector buf(w.size()); if (ska_sort_copy(std::begin(w), std::end(w), std::begin(buf))) { w.swap(buf); } }); w = v; measure(inputtype, "magyar", [&] { MagyarSort::sort(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "gptbuck", [&] { gpt_bucket_sort(&w[0], w.size()); }); assert(w == expected); /* w = v; measure(inputtype, "mormord", [&] { mormord_sort(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "2pass", [&] { twopass(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "4pass", [&] { fourpass(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "psort", [&] { pagedsort(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "4pasu", [&] { fourpassu(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "4rot", [&] { fourrots(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "sp", [&] { spsort(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "gpt_qsort", [&] { gpt_quicksort(w); }); assert(w == expected); w = v; measure(inputtype, "qsort", [&] 
{ do_qsort(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "zssort", [&] { do_zsssort(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "qsr", [&] { do_qsr(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "zsr", [&] { do_zsr(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "qsr3", [&] { do_qsr3(&w[0], w.size()); }); assert(w == expected); */ /* w = v; measure(inputtype, "zsrc", [&] { do_zsrc(&w[0], w.size()); }); assert(w == expected); */ /* w = v; measure(inputtype, "meanqs", [&] { do_meanqs(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "zsr3", [&] { do_zsr3(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "zsr3_sp", [&] { do_zsr3_sp(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "zsr3_sp2", [&] { do_zsr3_sp2(&w[0], w.size()); }); assert(w == expected); */ /* // TODO: This is buggy! See valgrind! w = v; measure(inputtype, "neoqs", [&] { do_neoqs(&w[0], w.size()); }); assert(w == expected); */ w = v; measure(inputtype, "schwab", [&] { do_schwab(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "thier2", [&] { do_thier2(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "thier3", [&] { do_thier3(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "threep", [&] { do_threepass(&w[0], w.size()); }); assert(w == expected); /* w = v; measure(inputtype, "magbuck", [&] { magyar_bucket_sort(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "magbuck2", [&] { magyar_bucket_sort2(&w[0], w.size()); }); assert(w == expected); w = v; w = {10, 20, 20}; measure(inputtype, "qsmine", [&] { thier_quicksort(&w[0], w.size()); }); if(w != expected) { for(uint32_t i = 0; i < n; ++i) { // assert(w[i] == expected[i]); if(w[i] != expected[i]) { fprintf(stderr, "Difference at %d: %d != %d\n", i, w[i], expected[i]); } } } assert(w == expected); w = v; 
measure(inputtype, "thier", [&] { thiersort_uintkey8(&w[0], w.size()); }); if(w != expected) { for(uint32_t i = 0; i < n; ++i) { // assert(w[i] == expected[i]); if(w[i] != expected[i]) { fprintf(stderr, "Difference at %d: %d != %d\n", i, w[i], expected[i]); } } } assert(w == expected); */ /* w = v; measure(inputtype, "frewr", [&] { frewr(&w[0], w.size()); }); assert(w == expected); w = v; measure(inputtype, "vsort", [&] { vsort(&w[0], w.size()); }); assert(w == expected); */ for (auto r : results) printf("%9.3fs", r.second); puts(""); } puts(""); printf("%10s", "worst"); for (auto w : worst) printf("%9.3fs", w.second); puts(""); printf("%10s", ""); for (auto w : worst) printf("%10s", w.first.c_str()); puts(""); return 0; }