diff --git a/randominus.h b/randominus.h index 67709ae..1708045 100644 --- a/randominus.h +++ b/randominus.h @@ -3,6 +3,7 @@ /* To randomize an array - hopefully as fast as possible */ #include "fastrand/fastrand.h" +#define RD_REL_WIDTH 16 /** swap */ static void inline rd_swap(uint32_t *a, uint32_t *b) { @@ -24,23 +25,24 @@ static inline void randominus(uint32_t *a, int n, uint32_t seed) { } /** Go over the array and randomly swap stuff - hand unrolled with ILP random get! */ - for(int i = 0; i < (n - 8); i += 8) { - uint32_t to0 = fastmodlike(lcg_ilp(&rsi, A), n); - uint32_t to1 = fastmodlike(lcg_ilp(&rsi, B), n); - uint32_t to2 = fastmodlike(lcg_ilp(&rsi, C), n); - uint32_t to3 = fastmodlike(lcg_ilp(&rsi, D), n); - uint32_t to4 = fastmodlike(lcg_ilp(&rsi, E), n); - uint32_t to5 = fastmodlike(lcg_ilp(&rsi, F), n); - uint32_t to6 = fastmodlike(lcg_ilp(&rsi, G), n); - uint32_t to7 = fastmodlike(lcg_ilp(&rsi, H), n); - rd_swap(&a[i], &a[to0]); - rd_swap(&a[i + 1], &a[to1]); - rd_swap(&a[i + 2], &a[to2]); - rd_swap(&a[i + 3], &a[to3]); - rd_swap(&a[i + 4], &a[to4]); - rd_swap(&a[i + 5], &a[to5]); - rd_swap(&a[i + 6], &a[to6]); - rd_swap(&a[i + 7], &a[to7]); + #pragma GCC unroll 16 + for(int i = (RD_REL_WIDTH/2 + 1); i < (n - 8 - (RD_REL_WIDTH/2 + 1)); i += 8) { + uint32_t to = fastmodlike(lcg_ilp(&rsi, A), n); + uint32_t relto1 = fastmodlike(lcg_ilp(&rsi, B), RD_REL_WIDTH) - (RD_REL_WIDTH / 2); + uint32_t relto2 = fastmodlike(lcg_ilp(&rsi, C), RD_REL_WIDTH) - (RD_REL_WIDTH / 2); + uint32_t relto3 = fastmodlike(lcg_ilp(&rsi, D), RD_REL_WIDTH) - (RD_REL_WIDTH / 2); + uint32_t relto4 = fastmodlike(lcg_ilp(&rsi, E), RD_REL_WIDTH) - (RD_REL_WIDTH / 2); + uint32_t relto5 = fastmodlike(lcg_ilp(&rsi, F), RD_REL_WIDTH) - (RD_REL_WIDTH / 2); + uint32_t relto6 = fastmodlike(lcg_ilp(&rsi, G), RD_REL_WIDTH) - (RD_REL_WIDTH / 2); + uint32_t relto7 = fastmodlike(lcg_ilp(&rsi, H), RD_REL_WIDTH) - (RD_REL_WIDTH / 2); + rd_swap(&a[i], &a[to]); + rd_swap(&a[i + 1], &a[i + relto1]); + rd_swap(&a[i + 2], &a[i + relto2]); + rd_swap(&a[i + 3], &a[i + relto3]); + rd_swap(&a[i + 4], &a[i + relto4]); + rd_swap(&a[i + 5], &a[i + relto5]); + rd_swap(&a[i + 6], &a[i + relto6]); + rd_swap(&a[i + 7], &a[i + relto7]); } }