Compare commits
10 Commits
563ec5eedd
...
42943f4678
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
42943f4678 | ||
|
|
a6dd3f93cd | ||
|
|
40da271e34 | ||
|
|
1a3e87e076 | ||
|
|
d823a77bf6 | ||
|
|
5fe3db5428 | ||
|
|
b1d62443f6 | ||
|
|
ef915efdc1 | ||
|
|
0a5204c1cc | ||
|
|
bd6c1e2b18 |
117
fastrand.h
117
fastrand.h
@ -4,38 +4,149 @@
|
|||||||
#define FAST_RAND_H
|
#define FAST_RAND_H
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
#ifndef NO_CSTDLIB
|
#ifndef NO_CSTDLIB
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#endif /* NO_CSTDLIB */
|
#endif /* NO_CSTDLIB */
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
// C++-specific logic
|
||||||
|
#if defined(__GNUC__) || defined(__clang__)
|
||||||
|
#define restrict __restrict__ // GCC/Clang
|
||||||
|
#elif defined(_MSC_VER)
|
||||||
|
#define restrict __restrict // MSVC
|
||||||
|
#else
|
||||||
|
#error "Compiler not supported for 'restrict' keyword in C++"
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
/* Currently a single integer is enough */
|
/* Currently a single integer is enough */
|
||||||
typedef uint32_t rand_state;
|
typedef uint32_t rand_state;
|
||||||
|
|
||||||
|
/*
 * State for the instruction-level-parallel (ILP) generator: eight independent
 * 32-bit LCG states ("lanes"). Each lane is advanced on its own, selected by
 * the matching RAND_ILP value (A -> a, B -> b, ... H -> h).
 */
typedef struct rand_ilp_state {
    uint32_t a; /* lane A */
    uint32_t b; /* lane B */
    uint32_t c; /* lane C */
    uint32_t d; /* lane D */
    uint32_t e; /* lane E */
    uint32_t f; /* lane F */
    uint32_t g; /* lane G */
    uint32_t h; /* lane H */
} rand_ilp_state;
|
||||||
|
|
||||||
/** Creates a random number generator state with given seed */
|
/** Creates a random number generator state with given seed */
|
||||||
static inline rand_state init_rand_with(uint32_t seed) {
|
static inline rand_state init_rand_with(uint32_t seed) {
|
||||||
return seed;
|
return seed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Creates an eight-lane generator state from explicit seeds.
 * seed1..seed8 are stored, in order, into lanes a..h.
 */
static inline rand_ilp_state init_rand_ilp_with(
        uint32_t seed1, uint32_t seed2, uint32_t seed3, uint32_t seed4,
        uint32_t seed5, uint32_t seed6, uint32_t seed7, uint32_t seed8) {
    /* Positional initializers follow the declaration order a, b, ..., h */
    const rand_ilp_state seeded = {
        seed1, seed2, seed3, seed4,
        seed5, seed6, seed7, seed8
    };
    return seeded;
}
|
||||||
|
|
||||||
#ifndef NO_CSTDLIB
|
#ifndef NO_CSTDLIB
|
||||||
/** Creates a random number generator state with arc4random(), which does not need seeding as it uses system entropy */
|
/** Creates a random number generator state with arc4random(), which does not need seeding as it uses system entropy */
|
||||||
static inline rand_state init_rand() {
|
static inline rand_state init_rand() {
|
||||||
return arc4random();
|
return arc4random();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Creates a random number generator state with arc4random() which does not need seeding as it uses system etropy */
|
||||||
|
static inline rand_ilp_state init_rand_ilp() {
|
||||||
|
rand_ilp_state ret;
|
||||||
|
ret.a = arc4random();
|
||||||
|
ret.b = arc4random();
|
||||||
|
ret.c = arc4random();
|
||||||
|
ret.d = arc4random();
|
||||||
|
ret.e = arc4random();
|
||||||
|
ret.f = arc4random();
|
||||||
|
ret.g = arc4random();
|
||||||
|
ret.h = arc4random();
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
#endif /* NO_CSTDLIB */
|
#endif /* NO_CSTDLIB */
|
||||||
|
|
||||||
// 32-bit LCG
|
// 32-bit LCG
|
||||||
static inline uint32_t lcg(uint32_t *state) {
|
static inline uint32_t lcg(rand_state *state) {
|
||||||
*state = *state * 1664525u + 1013904223u;
|
*state = *state * 1664525u + 1013904223u;
|
||||||
return *state;
|
return *state;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Largest valid lane selector; there are RAND_ILP_MAX + 1 lanes in total */
#define RAND_ILP_MAX 7

/*
 * Lane selector for lcg_ilp() / lcg_ilp2().
 * Values are consecutive, starting at 0 and ending at RAND_ILP_MAX.
 */
typedef enum RAND_ILP {
    A,               /* 0 */
    B,               /* 1 */
    C,               /* 2 */
    D,               /* 3 */
    E,               /* 4 */
    F,               /* 5 */
    G,               /* 6 */
    H = RAND_ILP_MAX /* 7 */
} RAND_ILP;
|
||||||
|
|
||||||
|
// 32-bit LCG with more states - might be faster when called from a loop, see perf.cpp
|
||||||
|
static inline uint32_t lcg_ilp(rand_ilp_state *state, RAND_ILP which) {
|
||||||
|
if(which == A) {
|
||||||
|
state->a = state->a * 1664525u + 1013904223u;
|
||||||
|
return state->a;
|
||||||
|
} else if(which == B) {
|
||||||
|
state->b = state->b * 1664525u + 1013904223u;
|
||||||
|
return state->b;
|
||||||
|
} else if(which == C) {
|
||||||
|
state->c = state->c * 1664525u + 1013904223u;
|
||||||
|
return state->c;
|
||||||
|
} else if(which == D) {
|
||||||
|
state->d = state->d * 1664525u + 1013904223u;
|
||||||
|
return state->d;
|
||||||
|
} else if(which == E) {
|
||||||
|
state->e = state->e * 1664525u + 1013904223u;
|
||||||
|
return state->e;
|
||||||
|
} else if(which == F) {
|
||||||
|
state->f = state->f * 1664525u + 1013904223u;
|
||||||
|
return state->f;
|
||||||
|
} else if(which == G) {
|
||||||
|
state->g = state->g * 1664525u + 1013904223u;
|
||||||
|
return state->g;
|
||||||
|
} else if(which == H) {
|
||||||
|
state->h = state->h * 1664525u + 1013904223u;
|
||||||
|
return state->h;
|
||||||
|
}
|
||||||
|
assert(0);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Slower for me than lcg_ilp because that gets optimized out in unrolled loop! */
|
||||||
|
static inline uint32_t lcg_ilp2(rand_ilp_state *state, RAND_ILP which)
|
||||||
|
{
|
||||||
|
uint32_t *s = &(state->a) + which;
|
||||||
|
*s = *s * 1664525u + 1013904223u;
|
||||||
|
return *s;
|
||||||
|
}
|
||||||
|
|
||||||
/** Pick a "reasonably random" number in [0, until-1] without modulus */
|
/** Pick a "reasonably random" number in [0, until-1] without modulus */
|
||||||
static inline uint32_t rand_until(uint32_t *state, uint32_t until) {
|
static inline uint32_t rand_until(rand_state *restrict state, uint32_t until) {
|
||||||
uint32_t rand = lcg(state);
|
uint32_t rand = lcg(state);
|
||||||
// Multiply by "until", take the upper 32 bits of the 64-bit result
|
// Multiply by "until", take the upper 32 bits of the 64-bit result
|
||||||
return (uint32_t)(((uint64_t)rand * until) >> 32);
|
return (uint32_t)(((uint64_t)rand * until) >> 32);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Maps `num` into [0, m) without a modulus: multiplies the two 32-bit values
 * into a 64-bit product and keeps the upper 32 bits.
 *
 * @param num The 32-bit input value
 * @param m   The exclusive upper bound of the result (m == 0 yields 0)
 * @returns (num * m) >> 32, computed in 64 bits
 */
static inline uint32_t fastmodlike(uint32_t num, uint32_t m) {
    const uint64_t product = (uint64_t) num * m;
    return (uint32_t)(product >> 32);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Pick a "reasonably random" number in [from, to) without modulus.
|
* Pick a "reasonably random" number in [from, to) without modulus.
|
||||||
*
|
*
|
||||||
@ -44,7 +155,7 @@ static inline uint32_t rand_until(uint32_t *state, uint32_t until) {
|
|||||||
* @param to The biggest possible value + 1
|
* @param to The biggest possible value + 1
|
||||||
* @returns A value in [from, to) interval
|
* @returns A value in [from, to) interval
|
||||||
*/
|
*/
|
||||||
static inline uint32_t rand_between(uint32_t *state, uint32_t from, uint32_t to) {
|
static inline uint32_t rand_between(rand_state *restrict state, uint32_t from, uint32_t to) {
|
||||||
return from + rand_until(state, to - from);
|
return from + rand_until(state, to - from);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
4
makefile
4
makefile
@ -2,3 +2,7 @@ debug:
|
|||||||
gcc main.c -g -o main
|
gcc main.c -g -o main
|
||||||
release:
|
release:
|
||||||
gcc main.c -O2 -o main
|
gcc main.c -O2 -o main
|
||||||
|
perf:
|
||||||
|
g++ perf.cpp -O2 -o perftest; ./perftest
|
||||||
|
perf-debug:
|
||||||
|
g++ perf.cpp -g -o perftest; gdb ./perftest
|
||||||
|
|||||||
155
perf.cpp
Normal file
155
perf.cpp
Normal file
@ -0,0 +1,155 @@
|
|||||||
|
#include <cstdio>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <chrono>
|
||||||
|
#include <cassert>
|
||||||
|
#include <random>
|
||||||
|
#include "fastrand.h"
|
||||||
|
|
||||||
|
#define N 10000000
|
||||||
|
// #define N 19999999
|
||||||
|
// #define M 10000000 // M >= N
|
||||||
|
#define M 19999999 // M >= N
|
||||||
|
/*
|
||||||
|
#define FROM 100
|
||||||
|
#define TO 576 // [FROM, TO)
|
||||||
|
*/
|
||||||
|
|
||||||
|
uint32_t res[M] = { 0 };
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
assert(M >= N); // M >= N
|
||||||
|
|
||||||
|
// Init
|
||||||
|
srand((unsigned int)time(NULL));
|
||||||
|
rand_state rs = init_rand();
|
||||||
|
rand_ilp_state rs_ilp = init_rand_ilp();
|
||||||
|
// C++ engines
|
||||||
|
std::linear_congruential_engine<uint32_t, 1664525u, 1013904223u, 0> lce;
|
||||||
|
std::mt19937 mte;
|
||||||
|
std::minstd_rand lce_def;
|
||||||
|
|
||||||
|
// Generate FROM,TO as random, because otherwise compiler optimizes out IDIV of the '%' operator!
|
||||||
|
uint32_t FROM = (uint32_t) rand();
|
||||||
|
uint32_t TO = (uint32_t) rand();
|
||||||
|
|
||||||
|
printf("Full range generation perf - %d number of cases:\n", N);
|
||||||
|
|
||||||
|
auto t0 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
// arc4
|
||||||
|
for (int i = 0; i < N; ++i) {
|
||||||
|
res[i] += arc4random();
|
||||||
|
}
|
||||||
|
|
||||||
|
auto t1 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
// rand
|
||||||
|
for (int i = 0; i < N; ++i) {
|
||||||
|
res[i] += rand();
|
||||||
|
}
|
||||||
|
|
||||||
|
auto t2 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
// C++ LCG
|
||||||
|
for (int i = 0; i < N; ++i) {
|
||||||
|
res[i] += lce_def();
|
||||||
|
}
|
||||||
|
|
||||||
|
auto t21 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
// C++ LCG - my parameters
|
||||||
|
for (int i = 0; i < N; ++i) {
|
||||||
|
res[i] += lce();
|
||||||
|
}
|
||||||
|
|
||||||
|
auto t211 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
// C++ MT
|
||||||
|
for (int i = 0; i < N; ++i) {
|
||||||
|
res[i] += mte();
|
||||||
|
}
|
||||||
|
|
||||||
|
auto t22 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
// lcg
|
||||||
|
for (int i = 0; i < N; ++i) {
|
||||||
|
res[i] += lcg(&rs);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto t3 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
// lcg4
|
||||||
|
#pragma GCC unroll 4
|
||||||
|
for (int i = 0; i < N; ++i) {
|
||||||
|
// res[i] += lcg_ilp(&rs_ilp, (RAND_ILP)(i % (RAND_ILP_MAX + 1)));
|
||||||
|
res[i] += lcg_ilp(&rs_ilp, (RAND_ILP)(i % 4));
|
||||||
|
}
|
||||||
|
|
||||||
|
auto t31 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
// results 1
|
||||||
|
|
||||||
|
auto arc4_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0);
|
||||||
|
auto rand_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1);
|
||||||
|
auto lce_def_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t21 - t2);
|
||||||
|
auto lce_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t211 - t21);
|
||||||
|
auto mt_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t22 - t21);
|
||||||
|
auto lcg_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t3 - t22);
|
||||||
|
auto lcg4_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t31 - t3);
|
||||||
|
|
||||||
|
printf("Time (arc4): %.3f ms.\n", arc4_elapsed.count() * 1e-6);
|
||||||
|
printf("Time (rand): %.3f ms.\n", rand_elapsed.count() * 1e-6);
|
||||||
|
printf("Time (C++ lcg): %.3f ms.\n", lce_def_elapsed.count() * 1e-6);
|
||||||
|
printf("Time (C++ lcg my parameters): %.3f ms.\n", lce_elapsed.count() * 1e-6);
|
||||||
|
printf("Time (C++ mersenne twister 32bit): %.3f ms.\n", mt_elapsed.count() * 1e-6);
|
||||||
|
printf("Time (lcg): %.3f ms.\n", lcg_elapsed.count() * 1e-6);
|
||||||
|
printf("Time (lcg4): %.3f ms.\n", lcg4_elapsed.count() * 1e-6);
|
||||||
|
|
||||||
|
printf("Modulo VS nomod perf for rand_between (both LCG) - %d number of cases:\n", M);
|
||||||
|
|
||||||
|
|
||||||
|
auto t4 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
// rand + modulo
|
||||||
|
for (int i = 0; i < M; ++i) {
|
||||||
|
res[i] += FROM + (rand() % (TO - FROM));
|
||||||
|
}
|
||||||
|
|
||||||
|
auto t5 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
// lcg + modulo
|
||||||
|
for (int i = 0; i < M; ++i) {
|
||||||
|
res[i] += FROM + (lcg(&rs) % (TO - FROM));
|
||||||
|
}
|
||||||
|
|
||||||
|
auto t6 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
// rand_between (also LCG, but no modulus)
|
||||||
|
for (int i = 0; i < M; ++i) {
|
||||||
|
res[i] += rand_between(&rs, FROM, TO);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto t7 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
|
||||||
|
// results 2
|
||||||
|
|
||||||
|
auto randmod_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t5 - t4);
|
||||||
|
auto mod_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t6 - t5);
|
||||||
|
auto between_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t7 - t6);
|
||||||
|
|
||||||
|
uint32_t choice = rand_between(&rs, FROM, TO);
|
||||||
|
printf("rand + modulo [%u, %u): %.3f ms.\n", FROM, TO, randmod_elapsed.count() * 1e-6);
|
||||||
|
printf("lcg + modulo [%u, %u): %.3f ms.\n", FROM, TO, mod_elapsed.count() * 1e-6);
|
||||||
|
printf("rand_between [%u, %u): %.3f ms.\n", FROM, TO, between_elapsed.count() * 1e-6);
|
||||||
|
|
||||||
|
// checksum - avoids optimizing out above loops
|
||||||
|
|
||||||
|
uint32_t sum = 0;
|
||||||
|
for(int i = 0; i < M; ++i) {
|
||||||
|
sum += res[i];
|
||||||
|
}
|
||||||
|
printf("Checksum: 0x%x\n", sum);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user