From d823a77bf64a725e6db169fb952d28997ec3263e Mon Sep 17 00:00:00 2001 From: Richard Thier Date: Tue, 1 Apr 2025 22:31:53 +0200 Subject: [PATCH] added ILP optimized version --- fastrand.h | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ perf.cpp | 14 ++++++++- 2 files changed, 100 insertions(+), 1 deletion(-) diff --git a/fastrand.h b/fastrand.h index 9d9e9df..ef99d67 100644 --- a/fastrand.h +++ b/fastrand.h @@ -4,6 +4,8 @@ #define FAST_RAND_H #include +#include + #ifndef NO_CSTDLIB #include #endif /* NO_CSTDLIB */ @@ -22,16 +24,64 @@ /* Currently a single integer is enough */ typedef uint32_t rand_state; +/* ILP variant: eight independent 32-bit LCG states (a..h), one per stream */ +struct rand_ilp_state { + uint32_t a; + uint32_t b; + uint32_t c; + uint32_t d; + uint32_t e; + uint32_t f; + uint32_t g; + uint32_t h; +}; +typedef struct rand_ilp_state rand_ilp_state; + /** Creates a random number generator state with given seed */ static inline rand_state init_rand_with(uint32_t seed) { return seed; } +static inline rand_ilp_state init_rand_ilp_with( + uint32_t seed1, + uint32_t seed2, + uint32_t seed3, + uint32_t seed4, + uint32_t seed5, + uint32_t seed6, + uint32_t seed7, + uint32_t seed8) { + rand_ilp_state ret; + ret.a = seed1; + ret.b = seed2; + ret.c = seed3; + ret.d = seed4; + ret.e = seed5; + ret.f = seed6; + ret.g = seed7; + ret.h = seed8; + return ret; +} + #ifndef NO_CSTDLIB /** Creates a random number generator state with arc4random() which does not need seeding as it uses system etropy */ static inline rand_state init_rand() { return arc4random(); } + +/** Creates an ILP random number generator state, filling each of the eight streams from arc4random() which does not need seeding as it uses system entropy */ +static inline rand_ilp_state init_rand_ilp() { + rand_ilp_state ret; + ret.a = arc4random(); + ret.b = arc4random(); + ret.c = arc4random(); + ret.d = arc4random(); + ret.e = arc4random(); + ret.f = arc4random(); + ret.g = arc4random(); + ret.h = arc4random(); + return ret; +} #endif /* NO_CSTDLIB */ // 32-bit 
LCG @@ -40,6 +90,43 @@ static inline uint32_t lcg(rand_state *state) { return *state; } +#define RAND_ILP_MAX 7 +enum RAND_ILP { + A = 0, B = 1, C = 2, D = 3, + E = 4, F = 5, G = 6, H = RAND_ILP_MAX +}; +typedef enum RAND_ILP RAND_ILP; + +// 32-bit LCG with more states (`which` selects the stream to advance and return) - might be faster when called from a loop, see perf.cpp +static inline uint32_t lcg_ilp(rand_ilp_state *state, RAND_ILP which) { + if(which == A) { + state->a = state->a * 1664525u + 1013904223u; + return state->a; + } else if(which == B) { + state->b = state->b * 1664525u + 1013904223u; + return state->b; + } else if(which == C) { + state->c = state->c * 1664525u + 1013904223u; + return state->c; + } else if(which == D) { + state->d = state->d * 1664525u + 1013904223u; + return state->d; + } else if(which == E) { + state->e = state->e * 1664525u + 1013904223u; + return state->e; + } else if(which == F) { + state->f = state->f * 1664525u + 1013904223u; + return state->f; + } else if(which == G) { + state->g = state->g * 1664525u + 1013904223u; + return state->g; + } else if(which == H) { + state->h = state->h * 1664525u + 1013904223u; + return state->h; + } + assert(0); return 0; /* invalid selector: trap in debug builds, still return a value when NDEBUG disables assert */ +} + /** Pick a "reasonably random" number in [0, until-1] without modulus */ static inline uint32_t rand_until(rand_state *restrict state, uint32_t until) { uint32_t rand = lcg(state); diff --git a/perf.cpp b/perf.cpp index bd23af0..9d43242 100644 --- a/perf.cpp +++ b/perf.cpp @@ -9,7 +9,7 @@ // #define M 10000000 // M >= N #define M 19999999 // M >= N #define FROM 100 -#define TO 576 +#define TO 576 // [FROM, TO) uint32_t res[M] = { 0 }; @@ -19,6 +19,7 @@ int main() { // Init srand((unsigned int)time(NULL)); rand_state rs = init_rand(); + rand_ilp_state rs_ilp = init_rand_ilp(); printf("Full range generation perf - %d number of cases:\n", N); @@ -45,15 +46,26 @@ int main() { auto t3 = std::chrono::high_resolution_clock::now(); + // lcg4 + #pragma GCC unroll 4 + for (int i = 0; i < N; ++i) { + // res[i] += lcg_ilp(&rs_ilp, 
(RAND_ILP)(i % (RAND_ILP_MAX + 1))); + res[i] += lcg_ilp(&rs_ilp, (RAND_ILP)(i % 4)); + } + + auto t31 = std::chrono::high_resolution_clock::now(); + // results 1 auto arc4_elapsed = std::chrono::duration_cast(t1 - t0); auto rand_elapsed = std::chrono::duration_cast(t2 - t1); auto lcg_elapsed = std::chrono::duration_cast(t3 - t2); + auto lcg4_elapsed = std::chrono::duration_cast(t31 - t3); printf("Time (arc4): %.3f ms.\n", arc4_elapsed.count() * 1e-6); printf("Time (rand): %.3f ms.\n", rand_elapsed.count() * 1e-6); printf("Time (lcg): %.3f ms.\n", lcg_elapsed.count() * 1e-6); + printf("Time (lcg4): %.3f ms.\n", lcg4_elapsed.count() * 1e-6); printf("Modulo VS nomod perf for rand_between (both LCG) - %d number of cases:\n", M);