/**
 * Resets every element of v to zero while keeping its size.
 *
 * Only the elements are overwritten; the vector is neither shrunk nor
 * regrown, so a buffer that is reused between calls keeps its storage.
 *
 * @param v  vector whose elements are set to 0.
 */
void my_memset(std::vector<int> &v) {
  std::fill(v.begin(), v.end(), 0);
}

/**
 * Hand-unrolled four-pass LSD radix sort.
 *
 * Sorts the n values of a into ascending order. Each pass is a counting sort
 * on one byte (bits 0-7, 8-15, 16-23, 24-31): count the bucket sizes, turn
 * the counts into start offsets, then scatter. The passes ping-pong between
 * a and a scratch buffer, so after the fourth pass the result is back in a.
 *
 * @param a  array to sort in place; not dereferenced when n <= 1.
 * @param n  number of elements; n * sizeof(uint32_t) must fit in an int.
 */
void fourpassu(uint32_t *a, int n) {
  // Nothing to reorder. Returning early also keeps malloc(0), which is
  // allowed to return NULL, from tripping the allocation assert below.
  if (n <= 1) return;
  assert(n * int64_t(sizeof(a[0])) <= INT_MAX);

  // Histogram / running-offset table, kept between calls so it is allocated
  // only once. thread_local (not static) so that concurrent callers do not
  // share and corrupt each other's counters.
  thread_local std::vector<int> bucketdata(1 << 8);
  my_memset(bucketdata);

  // alloc helper buffer.
  const size_t sz = size_t(n) * sizeof(a[0]);
  uint32_t *buf = static_cast<uint32_t *>(malloc(sz));
  assert(buf != nullptr);

  // pass 1: sort by lower 8 bits (a -> buf).
  #pragma GCC unroll 32
  for (int i = 0; i < n; i++) bucketdata[a[i] & 0xff]++;
  int offset = 0;
  #pragma GCC unroll 8
  for (int i = 0; i < 1 << 8; i++) {
    // Replace each count with the index where that bucket starts.
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  for (int i = 0; i < n; i++) buf[bucketdata[a[i] & 0xff]++] = a[i];

  // pass 2: sort by 2nd 8 bits (buf -> a).
  my_memset(bucketdata);
  #pragma GCC unroll 32
  for (int i = 0; i < n; i++) bucketdata[buf[i] >> 8 & 0xff]++;
  offset = 0;
  #pragma GCC unroll 8
  for (int i = 0; i < 1 << 8; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  #pragma GCC unroll 64
  for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 8 & 0xff]++] = buf[i];

  // pass 3: sort by 3rd 8 bits (a -> buf).
  my_memset(bucketdata);
  #pragma GCC unroll 32
  for (int i = 0; i < n; i++) bucketdata[a[i] >> 16 & 0xff]++;
  offset = 0;
  #pragma GCC unroll 8
  for (int i = 0; i < 1 << 8; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  #pragma GCC unroll 64
  for (int i = 0; i < n; i++) buf[bucketdata[a[i] >> 16 & 0xff]++] = a[i];

  // pass 4: sort by 4th 8 bits (buf -> a).
  my_memset(bucketdata);
  #pragma GCC unroll 32
  for (int i = 0; i < n; i++) bucketdata[buf[i] >> 24 & 0xff]++;
  offset = 0;
  #pragma GCC unroll 8
  for (int i = 0; i < 1 << 8; i++) {
    int d = bucketdata[i];
    bucketdata[i] = offset;
    offset += d;
  }
  #pragma GCC unroll 32
  for (int i = 0; i < n; i++) a[bucketdata[buf[i] >> 24 & 0xff]++] = buf[i];

  free(buf);
}