Much better perf tests: no more summing inside the timed loops, since that serialized the opcodes

This commit is contained in:
Richard Thier 2025-04-01 20:25:53 +02:00
parent b1d62443f6
commit 5fe3db5428
2 changed files with 38 additions and 18 deletions

View File

@ -4,3 +4,5 @@ release:
gcc main.c -O2 -o main gcc main.c -O2 -o main
perf: perf:
g++ perf.cpp -O2 -o perftest; ./perftest g++ perf.cpp -O2 -o perftest; ./perftest
perf-debug:
g++ perf.cpp -g -o perftest; gdb ./perftest

View File

@ -5,82 +5,100 @@
#include "fastrand.h" #include "fastrand.h"
#define N 10000000 #define N 10000000
#define M 99999999 // M > N // #define N 19999999
// #define M 10000000 // M >= N
#define M 19999999 // M >= N
#define FROM 100 #define FROM 100
#define TO 576 #define TO 576
uint32_t res[M] = { 0 };
int main() { int main() {
assert(M > N); // M > N assert(M >= N); // M >= N
// Init // Init
srand((unsigned int)time(NULL)); srand((unsigned int)time(NULL));
rand_state rs = init_rand(); rand_state rs = init_rand();
uint32_t sum = 0; // to avoid compiler optimizing out stuff
printf("Full range generation perf - %d number of cases:\n", N); printf("Full range generation perf - %d number of cases:\n", N);
auto t0 = std::chrono::high_resolution_clock::now(); auto t0 = std::chrono::high_resolution_clock::now();
// rand // arc4
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
sum += rand(); res[i] += arc4random();
} }
auto t1 = std::chrono::high_resolution_clock::now(); auto t1 = std::chrono::high_resolution_clock::now();
// arc4 // rand
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
sum += arc4random(); res[i] += rand();
} }
auto t2 = std::chrono::high_resolution_clock::now(); auto t2 = std::chrono::high_resolution_clock::now();
// lcg // lcg
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
sum += lcg(&rs); res[i] += lcg(&rs);
} }
auto t3 = std::chrono::high_resolution_clock::now(); auto t3 = std::chrono::high_resolution_clock::now();
// results 1 // results 1
auto rand_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0); auto arc4_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0);
auto arc4_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1); auto rand_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1);
auto lcg_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t3 - t2); auto lcg_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t3 - t2);
printf("Time (rand): %.3f ms.\n", rand_elapsed.count() * 1e-6);
printf("Time (arc4): %.3f ms.\n", arc4_elapsed.count() * 1e-6); printf("Time (arc4): %.3f ms.\n", arc4_elapsed.count() * 1e-6);
printf("Time (rand): %.3f ms.\n", rand_elapsed.count() * 1e-6);
printf("Time (lcg): %.3f ms.\n", lcg_elapsed.count() * 1e-6); printf("Time (lcg): %.3f ms.\n", lcg_elapsed.count() * 1e-6);
printf("Modulo VS nomod perf for rand_between (both LCG) - %d number of cases:\n", M); printf("Modulo VS nomod perf for rand_between (both LCG) - %d number of cases:\n", M);
auto t4 = std::chrono::high_resolution_clock::now(); auto t4 = std::chrono::high_resolution_clock::now();
// lcg + modulo // rand + modulo
for (int i = 0; i < M; ++i) { for (int i = 0; i < M; ++i) {
sum += FROM + (lcg(&rs) % (TO - FROM)); res[i] += FROM + (rand() % (TO - FROM));
} }
auto t5 = std::chrono::high_resolution_clock::now(); auto t5 = std::chrono::high_resolution_clock::now();
// rand_between (also LCG, but no modulus) // lcg + modulo
for (int i = 0; i < M; ++i) { for (int i = 0; i < M; ++i) {
sum += rand_between(&rs, FROM, TO); res[i] += FROM + (lcg(&rs) % (TO - FROM));
} }
auto t6 = std::chrono::high_resolution_clock::now(); auto t6 = std::chrono::high_resolution_clock::now();
// rand_between (also LCG, but no modulus)
for (int i = 0; i < M; ++i) {
res[i] += rand_between(&rs, FROM, TO);
}
auto t7 = std::chrono::high_resolution_clock::now();
// results 2 // results 2
auto mod_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t5 - t4); auto randmod_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t5 - t4);
auto between_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t6 - t5); auto mod_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t6 - t5);
auto between_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(t7 - t6);
uint32_t choice = rand_between(&rs, FROM, TO); uint32_t choice = rand_between(&rs, FROM, TO);
printf("rand + modulo [%u, %u): %.3f ms.\n", FROM, TO, randmod_elapsed.count() * 1e-6);
printf("lcg + modulo [%u, %u): %.3f ms.\n", FROM, TO, mod_elapsed.count() * 1e-6); printf("lcg + modulo [%u, %u): %.3f ms.\n", FROM, TO, mod_elapsed.count() * 1e-6);
printf("rand_between [%u, %u): %.3f ms.\n", FROM, TO, between_elapsed.count() * 1e-6); printf("rand_between [%u, %u): %.3f ms.\n", FROM, TO, between_elapsed.count() * 1e-6);
// checksum - avoid optimizing out loops // checksum - avoids optimizing out above loops
uint32_t sum = 0;
for(int i = 0; i < M; ++i) {
sum += res[i];
}
printf("Checksum: 0x%x\n", sum); printf("Checksum: 0x%x\n", sum);
return 0; return 0;