added ILP optimized version

This commit is contained in:
Richard Thier 2025-04-01 22:31:53 +02:00
parent 5fe3db5428
commit d823a77bf6
2 changed files with 100 additions and 1 deletions

View File

@ -4,6 +4,8 @@
#define FAST_RAND_H #define FAST_RAND_H
#include <stdint.h> #include <stdint.h>
#include <assert.h>
#ifndef NO_CSTDLIB #ifndef NO_CSTDLIB
#include <stdlib.h> #include <stdlib.h>
#endif /* NO_CSTDLIB */ #endif /* NO_CSTDLIB */
@ -22,16 +24,64 @@
/* Currently a single integer is enough */ /* Currently a single integer is enough */
typedef uint32_t rand_state; typedef uint32_t rand_state;
/**
 * State of the ILP-friendly generator: eight separate 32-bit LCG states.
 *
 * Each field (a..h) is advanced on its own by lcg_ilp(), so consecutive
 * calls that pick different fields do not depend on each other's result.
 */
typedef struct rand_ilp_state {
	uint32_t a;
	uint32_t b;
	uint32_t c;
	uint32_t d;
	uint32_t e;
	uint32_t f;
	uint32_t g;
	uint32_t h;
} rand_ilp_state;
/** Creates a random number generator state with given seed */ /** Creates a random number generator state with given seed */
static inline rand_state init_rand_with(uint32_t seed) { static inline rand_state init_rand_with(uint32_t seed) {
return seed; return seed;
} }
/**
 * Creates an ILP random number generator state from eight explicit seeds.
 *
 * Every stream gets its own seed: seed1 -> a, seed2 -> b, ... seed8 -> h.
 * Streams that start from the same seed produce the same sequence, so pass
 * distinct seeds if the streams should differ.
 *
 * @param seed1 Initial value of stream a
 * @param seed2 Initial value of stream b
 * @param seed3 Initial value of stream c
 * @param seed4 Initial value of stream d
 * @param seed5 Initial value of stream e
 * @param seed6 Initial value of stream f
 * @param seed7 Initial value of stream g
 * @param seed8 Initial value of stream h
 * @returns The initialized state
 */
static inline rand_ilp_state init_rand_ilp_with(
		uint32_t seed1,
		uint32_t seed2,
		uint32_t seed3,
		uint32_t seed4,
		uint32_t seed5,
		uint32_t seed6,
		uint32_t seed7,
		uint32_t seed8) {
	rand_ilp_state ret;
	ret.a = seed1;
	ret.b = seed2;
	ret.c = seed3;
	ret.d = seed4;
	/* Streams e..h take seeds 5..8 (they used to be copies of seeds 1..4) */
	ret.e = seed5;
	ret.f = seed6;
	ret.g = seed7;
	ret.h = seed8;
	return ret;
}
#ifndef NO_CSTDLIB #ifndef NO_CSTDLIB
/** Creates a random number generator state with arc4random() which does not need seeding as it uses system entropy */ /** Creates a random number generator state with arc4random() which does not need seeding as it uses system entropy */
static inline rand_state init_rand() { static inline rand_state init_rand() {
return arc4random(); return arc4random();
} }
/**
 * Creates an ILP random number generator state, seeding each of the eight
 * streams with its own arc4random() value (no manual seeding required).
 *
 * @returns The initialized state
 */
static inline rand_ilp_state init_rand_ilp() {
	/* One draw per stream, taken in field order a..h */
	const uint32_t seed_a = arc4random();
	const uint32_t seed_b = arc4random();
	const uint32_t seed_c = arc4random();
	const uint32_t seed_d = arc4random();
	const uint32_t seed_e = arc4random();
	const uint32_t seed_f = arc4random();
	const uint32_t seed_g = arc4random();
	const uint32_t seed_h = arc4random();

	rand_ilp_state fresh;
	fresh.a = seed_a;
	fresh.b = seed_b;
	fresh.c = seed_c;
	fresh.d = seed_d;
	fresh.e = seed_e;
	fresh.f = seed_f;
	fresh.g = seed_g;
	fresh.h = seed_h;
	return fresh;
}
#endif /* NO_CSTDLIB */ #endif /* NO_CSTDLIB */
// 32-bit LCG // 32-bit LCG
@ -40,6 +90,43 @@ static inline uint32_t lcg(rand_state *state) {
return *state; return *state;
} }
/* Largest valid RAND_ILP value; selectors run from 0 to RAND_ILP_MAX */
#define RAND_ILP_MAX 7

/** Selector naming which field of rand_ilp_state (a..h) lcg_ilp() should advance */
typedef enum RAND_ILP {
	A = 0,
	B,
	C,
	D,
	E,
	F,
	G,
	H = RAND_ILP_MAX
} RAND_ILP;
/**
 * 32-bit LCG with more states - might be faster when called from a loop, see perf.cpp
 *
 * Advances exactly one of the eight streams in the state by one LCG step
 * (x = x * 1664525 + 1013904223, wrapping at 32 bits) and returns its new
 * value. The other seven streams are left untouched.
 *
 * @param state The generator state to update
 * @param which The stream to advance (A..H)
 * @returns The new value of the selected stream. An invalid selector trips
 *          the assert; when asserts are compiled out (NDEBUG) it returns 0
 *          and changes nothing.
 */
static inline uint32_t lcg_ilp(rand_ilp_state *state, RAND_ILP which) {
	uint32_t *stream;

	switch(which) {
	case A: stream = &state->a; break;
	case B: stream = &state->b; break;
	case C: stream = &state->c; break;
	case D: stream = &state->d; break;
	/* E steps its own value - it used to be recomputed from stream a */
	case E: stream = &state->e; break;
	case F: stream = &state->f; break;
	case G: stream = &state->g; break;
	case H: stream = &state->h; break;
	default:
		assert(0 && "lcg_ilp: invalid stream selector");
		/* Defined result even with NDEBUG instead of falling off the end */
		return 0;
	}

	*stream = *stream * 1664525u + 1013904223u;
	return *stream;
}
/** Pick a "reasonably random" number in [0, until-1] without modulus */ /** Pick a "reasonably random" number in [0, until-1] without modulus */
static inline uint32_t rand_until(rand_state *restrict state, uint32_t until) { static inline uint32_t rand_until(rand_state *restrict state, uint32_t until) {
uint32_t rand = lcg(state); uint32_t rand = lcg(state);

View File

@ -9,7 +9,7 @@
// #define M 10000000 // M >= N // #define M 10000000 // M >= N
#define M 19999999 // M >= N #define M 19999999 // M >= N
#define FROM 100 #define FROM 100
#define TO 576 #define TO 576 // [FROM, TO)
uint32_t res[M] = { 0 }; uint32_t res[M] = { 0 };
@ -19,6 +19,7 @@ int main() {
// Init // Init
srand((unsigned int)time(NULL)); srand((unsigned int)time(NULL));
rand_state rs = init_rand(); rand_state rs = init_rand();
rand_ilp_state rs_ilp = init_rand_ilp();
printf("Full range generation perf - %d number of cases:\n", N); printf("Full range generation perf - %d number of cases:\n", N);
@ -45,15 +46,26 @@ int main() {
auto t3 = std::chrono::high_resolution_clock::now(); auto t3 = std::chrono::high_resolution_clock::now();
// lcg4
#pragma GCC unroll 4
for (int i = 0; i < N; ++i) {
// res[i] += lcg_ilp(&rs_ilp, (RAND_ILP)(i % (RAND_ILP_MAX + 1)));
res[i] += lcg_ilp(&rs_ilp, (RAND_ILP)(i % 4));
}
auto t31 = std::chrono::high_resolution_clock::now();
// results 1 // results 1
auto arc4_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0); auto arc4_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0);
auto rand_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1); auto rand_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1);
auto lcg_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t3 - t2); auto lcg_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t3 - t2);
auto lcg4_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t31 - t3);
printf("Time (arc4): %.3f ms.\n", arc4_elapsed.count() * 1e-6); printf("Time (arc4): %.3f ms.\n", arc4_elapsed.count() * 1e-6);
printf("Time (rand): %.3f ms.\n", rand_elapsed.count() * 1e-6); printf("Time (rand): %.3f ms.\n", rand_elapsed.count() * 1e-6);
printf("Time (lcg): %.3f ms.\n", lcg_elapsed.count() * 1e-6); printf("Time (lcg): %.3f ms.\n", lcg_elapsed.count() * 1e-6);
printf("Time (lcg4): %.3f ms.\n", lcg4_elapsed.count() * 1e-6);
printf("Modulo VS nomod perf for rand_between (both LCG) - %d number of cases:\n", M); printf("Modulo VS nomod perf for rand_between (both LCG) - %d number of cases:\n", M);