Compare commits
No commits in common. "master" and "atszanfixes" have entirely different histories.
master...atszanfixes

arena.h | 2
@@ -1 +1 @@
|
||||
Subproject commit dfff5028f3f7baee4e764744a656baf553bc4b70
|
||||
Subproject commit 3037bf6bec96b0ebc231510d308da1daece276fd
|
main.cpp | 317
@@ -1,333 +1,22 @@
|
||||
#include <cstdio>
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <chrono>
|
||||
#include "amap.h"
|
||||
#include "simap.h"
|
||||
#include "mapmap.hpp"
|
||||
#include "unomap.hpp"
|
||||
#include "simd_map.h"
|
||||
#include "vmap.h"
|
||||
|
||||
/**
|
||||
* Creates keys or returns the ith key. Used for performance tests.
|
||||
*
|
||||
* @param i When "create" is false, we return the ith key (does not check OOB)
|
||||
* @param create When true, we initialize the keystore with keys generated from 0..i indices.
|
||||
* @returns The ith key when create==false, otherwise undefined.
|
||||
*/
|
||||
inline const char *keystore(int i, bool create = false) noexcept {
|
||||
static thread_local std::vector<std::string> keys;
|
||||
|
||||
if(!create) {
|
||||
return keys[i].c_str();
|
||||
} else {
|
||||
keys.resize(0);
|
||||
keys.reserve(0);
|
||||
std::string key = "k";
|
||||
for(int j = 0; j < i; ++j) {
|
||||
keys.push_back(key + std::to_string(j));
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates keys or returns the ith integer key. Used for performance tests.
|
||||
*
|
||||
* Rem.: Generated keys are like this: i, i-1, i-2, ... 1
|
||||
*
|
||||
* @param i When "create" is false, we return the ith key (does not check OOB)
|
||||
* @param create When true, we initialize the keystore with keys generated from 0..i indices.
|
||||
* @returns The ith key when create==false, otherwise undefined.
|
||||
*/
|
||||
inline int *int_keystore(int i, bool create = false) noexcept {
|
||||
static thread_local std::vector<int> keys;
|
||||
|
||||
if(!create) {
|
||||
return &(keys[i]);
|
||||
} else {
|
||||
keys.resize(0);
|
||||
keys.reserve(0);
|
||||
for(int j = 0; j < i; ++j) {
|
||||
keys.push_back(i - j);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
inline const char *datastore_int(int i, bool create = false) noexcept {
|
||||
static thread_local std::vector<std::string> keys;
|
||||
|
||||
if(!create) {
|
||||
return keys[i].c_str();
|
||||
} else {
|
||||
keys.resize(0);
|
||||
keys.reserve(0);
|
||||
std::string key = "k";
|
||||
for(int j = 0; j < i; ++j) {
|
||||
keys.push_back(key + std::to_string(j));
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates data or returns the ith data. Used for performance tests.
|
||||
*
|
||||
* @param i When "create" is false, we return the ith data (does not check OOB)
|
||||
* @param create When true, we initialize the datastore with data generated from 0..i indices.
|
||||
* @returns The ith data when create==false, otherwise undefined.
|
||||
*/
|
||||
inline int *datastore(int i, bool create = false) noexcept {
|
||||
static thread_local std::vector<int> keys;
|
||||
|
||||
if(!create) {
|
||||
return &(keys[i]);
|
||||
} else {
|
||||
keys.resize(0);
|
||||
keys.reserve(0);
|
||||
for(int j = 0; j < i; ++j) {
|
||||
keys.push_back(j);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void test_perf(amap mapdo, void *map, int max_key, const char *what) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
for(int i = 0; i < max_key; ++i) {
|
||||
const char *key = keystore(i);
|
||||
int *data = datastore(i);
|
||||
mapdo(map, AMAP_SET, key, data);
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
|
||||
|
||||
printf("Insertion time for %d elements (%s): %.3f ms.\n", max_key, what, elapsed.count() * 1e-6);
|
||||
}
|
||||
|
||||
void test_basics(amap mapdo, void *map) {
|
||||
/* Most basics */
|
||||
assert(NULL == mapdo(map, AMAP_GET, "asdf", NULL));
|
||||
int i = 42;
|
||||
int *iptr;
|
||||
const char *chptr;
|
||||
assert(NULL != mapdo(map, AMAP_SET, "meaning", &i));
|
||||
assert(NULL != (iptr = (int *)mapdo(map, AMAP_GET, "meaning", NULL)));
|
||||
assert(*iptr == 42);
|
||||
assert(iptr == &i);
|
||||
|
||||
/* Delete / tombstone */
|
||||
assert(NULL != mapdo(map, AMAP_SET, "meaning", NULL));
|
||||
assert(NULL == (int *)mapdo(map, AMAP_GET, "meaning", NULL));
|
||||
|
||||
/* Check re-adding */
|
||||
assert(NULL != mapdo(map, AMAP_SET, "meaning", &i));
|
||||
assert(NULL != (iptr = (int *)mapdo(map, AMAP_GET, "meaning", NULL)));
|
||||
assert(*iptr == 42);
|
||||
assert(iptr == &i);
|
||||
|
||||
/* Test Erase */
|
||||
assert(NULL != mapdo(map, AMAP_ERASE, NULL, NULL));
|
||||
|
||||
/* Check re-adding 3 new things */
|
||||
assert(NULL != mapdo(map, AMAP_SET, "meaningless1", &i));
|
||||
assert(NULL != mapdo(map, AMAP_SET, "meaning2", &i));
|
||||
const char *helloworld = "Hello world!";
|
||||
assert(NULL != mapdo(map, AMAP_SET, "hello", (char *)helloworld)); /* ugly cast... */
|
||||
|
||||
assert(NULL != (chptr = (const char *)mapdo(map, AMAP_GET, "hello", NULL)));
|
||||
assert(strlen(chptr) == strlen(helloworld));
|
||||
assert(strcmp(chptr, helloworld) == 0);
|
||||
assert(NULL != (iptr = (int *)mapdo(map, AMAP_GET, "meaning2", NULL)));
|
||||
assert(*iptr == 42);
|
||||
assert(iptr == &i);
|
||||
assert(NULL != (iptr = (int *)mapdo(map, AMAP_GET, "meaningless1", NULL)));
|
||||
assert(*iptr == 42);
|
||||
assert(iptr == &i);
|
||||
|
||||
/* Check the case where we have same 8-long prefix for multiple and they should be different */
|
||||
int long_1 = 1;
|
||||
int long_2 = 2;
|
||||
int long_3 = 3;
|
||||
|
||||
assert(NULL != mapdo(map, AMAP_SET, "very_long_test_key_1", &long_1));
|
||||
assert(NULL != mapdo(map, AMAP_SET, "very_long_test_key_2", &long_2));
|
||||
assert(NULL != mapdo(map, AMAP_SET, "very_long_test_key_3", &long_3));
|
||||
|
||||
assert(NULL != (iptr = (int *)mapdo(map, AMAP_GET, "very_long_test_key_1", NULL)));
|
||||
assert(*iptr == 1);
|
||||
assert(NULL != (iptr = (int *)mapdo(map, AMAP_GET, "very_long_test_key_2", NULL)));
|
||||
assert(*iptr == 2);
|
||||
assert(NULL != (iptr = (int *)mapdo(map, AMAP_GET, "very_long_test_key_3", NULL)));
|
||||
assert(*iptr == 3);
|
||||
}
|
||||
|
||||
void test_stringmaps(int perf_test_i) {
|
||||
/* Basic tests */
|
||||
simap_instance si = simap_create();
|
||||
test_basics(simap, &si);
|
||||
|
||||
mapmap_instance mi = mapmap_create();
|
||||
test_basics(mapmap, &mi);
|
||||
|
||||
unomap_instance umi = unomap_create();
|
||||
test_basics(unomap, &umi);
|
||||
|
||||
/* Performance tests */
|
||||
int i = perf_test_i;
|
||||
test_perf(mapmap, &mi, i, "std::map");
|
||||
test_perf(simap, &si, i, "simap");
|
||||
test_perf(unomap, &umi, i, "std::unordered_map");
|
||||
}
|
||||
|
||||
void test_simd_map_basics() {
|
||||
/* Empty free tests */
|
||||
simd_map smap = simd_map_create();
|
||||
simd_map_free(&smap);
|
||||
|
||||
/* Re-create */
|
||||
smap = simd_map_create();
|
||||
|
||||
/* Empty search */
|
||||
assert(simd_map_find(&smap, 42) == NULL);
|
||||
assert(simd_map_size(&smap) == 0);
|
||||
|
||||
/* Insertions */
|
||||
assert(simd_map_set(&smap, 40, 0) != 0);
|
||||
assert(simd_map_set(&smap, 41, 1) != 0);
|
||||
assert(simd_map_set(&smap, 42, 2) != 0);
|
||||
assert(simd_map_size(&smap) == 3);
|
||||
|
||||
/* Searches */
|
||||
assert(*simd_map_find(&smap, 40) == 0);
|
||||
assert(*simd_map_find(&smap, 41) == 1);
|
||||
assert(*simd_map_find(&smap, 42) == 2);
|
||||
|
||||
/* Test erase */
|
||||
simd_map_erase(&smap);
|
||||
assert(simd_map_find(&smap, 42) == NULL);
|
||||
assert(simd_map_set(&smap, 42, 2) != 0);
|
||||
assert(*simd_map_find(&smap, 42) == 2);
|
||||
|
||||
/* Test a bit more */
|
||||
int cnt = 100;
|
||||
for(int i = 0; i < cnt; ++i) {
|
||||
assert(simd_map_set(&smap, i, (cnt - i)) != 0);
|
||||
}
|
||||
assert(simd_map_size(&smap) == 100); /* 42->2 should get overwritten */
|
||||
for(int i = 0; i < cnt; ++i) {
|
||||
assert(*simd_map_find(&smap, i) == (uint32_t)(cnt - i));
|
||||
}
|
||||
|
||||
/* Test removal */
|
||||
assert(simd_map_remove(&smap, 41) != 0);
|
||||
assert(simd_map_remove(&smap, 43) != 0);
|
||||
assert(simd_map_find(&smap, 41) == NULL);
|
||||
assert(simd_map_find(&smap, 43) == NULL);
|
||||
assert(simd_map_find(&smap, 42) != NULL);
|
||||
assert(simd_map_find(&smap, 99) != NULL);
|
||||
assert(simd_map_find(&smap, 98) != NULL);
|
||||
assert(simd_map_size(&smap) == 98);
|
||||
|
||||
/* Test multimap operations */
|
||||
assert(simd_map_multi_set(&smap, 42, 42) != 0);
|
||||
assert(simd_map_multi_set(&smap, 42, 43) != 0);
|
||||
assert(simd_map_find(&smap, 42) != NULL);
|
||||
assert(*simd_map_find(&smap, 42) != 42);
|
||||
simd_map_find_res res1 = simd_map_find_all(&smap, 42, simd_map_find_all_begin());
|
||||
assert(res1.value_location != NULL);
|
||||
assert(*(res1.value_location) == (uint32_t)(cnt - 42));
|
||||
simd_map_find_res res2 = simd_map_find_all(&smap, 42, res1);
|
||||
assert(*(res2.value_location) == 42);
|
||||
simd_map_find_res res3 = simd_map_find_all(&smap, 42, res2);
|
||||
assert(res3.value_location != NULL);
|
||||
assert(*(res3.value_location) == 43);
|
||||
|
||||
/* Test filled-free */
|
||||
simd_map_free(&smap);
|
||||
}
|
||||
|
||||
void test_simd_map_perf(int max_key) {
|
||||
#ifdef __AVX2__
|
||||
puts("...Perf testing simd_map with AVX2...");
|
||||
#elif __SSE2__
|
||||
puts("...Perf testing simd_map with SSE2...");
|
||||
#endif
|
||||
|
||||
// XXX: This way we would measure the wrong thing:
|
||||
// simd_map smap = simd_map_create();
|
||||
// Why? To measure the right thing, not the first allocation!
|
||||
simd_map smap = simd_map_create_and_reserve();
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
for(int i = 0; i < max_key; ++i) {
|
||||
int *key = int_keystore(i);
|
||||
int *data = datastore(i);
|
||||
simd_map_set(&smap, *key, *data);
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
|
||||
|
||||
printf("Insertion time for %d elements (simd_map): %.3f ms.\n", max_key, elapsed.count() * 1e-6);
|
||||
simd_map_free(&smap);
|
||||
}
|
||||
|
||||
void test_int_unordered_map(int max_key) {
|
||||
std::unordered_map<int, int> smap;
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
for(int i = 0; i < max_key; ++i) {
|
||||
int *key = int_keystore(i);
|
||||
int *data = datastore(i);
|
||||
smap[*key] = *data;
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
|
||||
|
||||
printf("Insertion time for %d elements (std::unordered_map<int,int>): %.3f ms.\n", max_key, elapsed.count() * 1e-6);
|
||||
}
|
||||
|
||||
void test_int_std_map(int max_key) {
|
||||
std::map<int, int> smap;
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
for(int i = 0; i < max_key; ++i) {
|
||||
int *key = int_keystore(i);
|
||||
int *data = datastore(i);
|
||||
smap[*key] = *data;
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
|
||||
|
||||
printf("Insertion time for %d elements (std::map<int,int>): %.3f ms.\n", max_key, elapsed.count() * 1e-6);
|
||||
}
|
||||
|
||||
void test_intmaps(int perf_test_i) {
|
||||
/* Basic tests */
|
||||
// test_simd_map_basics();
|
||||
test_int_std_map(perf_test_i);
|
||||
test_int_unordered_map(perf_test_i);
|
||||
test_simd_map_perf(perf_test_i);
|
||||
}
|
||||
|
||||
int main() {
|
||||
int perf_test_i = 1000;
|
||||
|
||||
/* Prepare data stores */
|
||||
keystore(perf_test_i, true);
|
||||
int_keystore(perf_test_i, true);
|
||||
datastore(perf_test_i, true);
|
||||
|
||||
/* Tests */
|
||||
puts("");
|
||||
puts("Integer maps...");
|
||||
puts("");
|
||||
test_intmaps(perf_test_i);
|
||||
puts("");
|
||||
puts("String maps...");
|
||||
puts("");
|
||||
test_stringmaps(perf_test_i);
|
||||
puts("");
|
||||
puts("...done!");
|
||||
/* test simap */
|
||||
simap_instance si = simap_create();
|
||||
test_basics(simap, &si);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
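All of the map backends above are driven through a single C-style callback signature that main.cpp takes from amap.h; that header is not part of this comparison. The following is a minimal sketch of what it is assumed to declare - names and semantics are inferred from the call sites (mapdo(map, AMAP_SET, key, data), NULL-on-miss GET, the non-NULL erase sentinel), so treat it as an assumption rather than the actual file:

/* Hypothetical reconstruction of amap.h - inferred from the call sites above, not part of the diff. */
#ifndef AMAP_H
#define AMAP_H

/* The operations every backend dispatches on. */
enum AMAP_OP { AMAP_GET, AMAP_SET, AMAP_ERASE };
typedef enum AMAP_OP AMAP_OP;

/* Every backend (simap, mapmap, unomap) is exposed as one function of this shape:
 * GET returns the stored pointer or NULL, SET returns non-NULL on success,
 * ERASE ignores key/ptr, clears the map and returns a non-NULL sentinel. */
typedef void *(*amap)(void *amap_instance, AMAP_OP op, const char *key, void *ptr);

#endif /* AMAP_H */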
makefile | 8
@@ -2,11 +2,3 @@ debug:
|
||||
g++ main.cpp -g -Wall -o main
|
||||
release:
|
||||
g++ main.cpp -O2 -Wall -o main
|
||||
debug-avx2:
|
||||
g++ main.cpp -g -mavx2 -Wall -o main
|
||||
release-avx2:
|
||||
g++ main.cpp -mavx2 -O2 -Wall -o main
|
||||
release-avx2-debug:
|
||||
g++ main.cpp -g -mavx2 -O2 -Wall -o main
|
||||
release-avx2-asm:
|
||||
g++ main.cpp -S -fopt-info-vec-missed -masm=intel -mavx2 -O2 -Wall -o main
|
||||
|
mapmap.hpp | 39
@@ -1,39 +0,0 @@
|
||||
#ifndef MAPMAP_HPP
|
||||
#define MAPMAP_HPP
|
||||
|
||||
#include <map>
|
||||
#include <cassert>
|
||||
#include <memory>
|
||||
|
||||
struct mapmap_instance {
|
||||
std::map<const char *, void *> m;
|
||||
};
|
||||
|
||||
static inline mapmap_instance mapmap_create() {
|
||||
mapmap_instance ret;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void* mapmap(void *amap_instance, AMAP_OP op, const char *key, void *ptr) {
|
||||
mapmap_instance *map = (mapmap_instance *) amap_instance;
|
||||
if(op == AMAP_GET) {
|
||||
try {
|
||||
return map->m[key];
|
||||
} catch(...) {
|
||||
return ptr;
|
||||
}
|
||||
} else if(op == AMAP_SET) {
|
||||
try {
|
||||
map->m[key] = ptr;
|
||||
return map; // non-null
|
||||
} catch(...) {
|
||||
return NULL;
|
||||
}
|
||||
} else { // if(op == AMAP_ERASE) {
|
||||
assert(op == AMAP_ERASE);
|
||||
map->m = std::move(std::map<const char *, void *>());
|
||||
return (void *)((uint8_t)(NULL) - 1L);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // MAPMAP_HPP
|
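mapmap.hpp (and its twin unomap.hpp further down) is a thin adapter that bolts std::map onto the shared amap callback. Note that the key type is const char *, so entries are keyed by pointer value rather than by string contents; the tests get away with this because they always reuse the same string literal or keystore() pointer. A short usage sketch, not part of the diff, mirroring what test_basics does through this adapter:

// Usage sketch (assumes amap.h has been included before mapmap.hpp, as main.cpp does):
#include "mapmap.hpp"

int demo_mapmap() {
    mapmap_instance mi = mapmap_create();
    static int v = 7;
    mapmap(&mi, AMAP_SET, "answer", &v);                       // insert or overwrite
    int *got = (int *) mapmap(&mi, AMAP_GET, "answer", NULL);  // -> &v, or NULL on miss
    mapmap(&mi, AMAP_ERASE, NULL, NULL);                       // drop every entry
    return got ? *got : -1;                                    // 7
}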
simap.h | 277
@@ -1,73 +1,35 @@
|
||||
#ifndef SIMAP_H
|
||||
#define SIMAP_H
|
||||
#include <stddef.h> /* NULL */
|
||||
#include <stdint.h> /* uint8_t, uint32_t, ... */
|
||||
#include <string.h> /* strcmp, strncpy etc. */
|
||||
#include <assert.h> /* assert */
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include "amap.h"
|
||||
#include "arena.h/arena.h"
|
||||
|
||||
/* Possible (non-AVX, but alike) optimization, but means there can be lookup / insert errors (very rarely)
|
||||
#define SIMAP_RAW
|
||||
*/
|
||||
|
||||
/* Perf trickery */
|
||||
|
||||
/* XXX: Enabling AVX also enables rare errors for speed gain! See above. */
|
||||
#ifdef __AVX2__
|
||||
#define SIMAP_RAW
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
/* I have no idea what MSVC has instead... */
|
||||
#ifdef _MSC_VER
|
||||
#define SM_THREAD_LOCAL __declspec(thread)
|
||||
#define SM_PREFETCH(x)
|
||||
#define SM_LIKELY(x)
|
||||
#define SM_UNLIKELY(x)
|
||||
#define SM_NOINLINE __declspec(noinline)
|
||||
#define SM_ALWAYS_INLINE __forceinline
|
||||
#else
|
||||
#define SM_THREAD_LOCAL __thread
|
||||
#define SM_PREFETCH(x) __builtin_prefetch(x)
|
||||
#define SM_LIKELY(x) __builtin_expect((x),1)
|
||||
#define SM_UNLIKELY(x) __builtin_expect((x),0)
|
||||
#define SM_NOINLINE __attribute__ ((noinline))
|
||||
#define SM_ALWAYS_INLINE __attribute__ ((always_inline))
|
||||
#endif
|
||||
typedef uint64_t auint64 __attribute__ ((__aligned__(8)));
|
||||
|
||||
/** The first 8 characters are stored as uint64_t for fast checks */
|
||||
union simap_c64 {
|
||||
char str8[8];
|
||||
auint64 u64;
|
||||
};
|
||||
typedef union simap_c64 simap_c64;
|
||||
|
||||
/** This is to ensure 8byte storage of pointers (with possible padding) */
|
||||
union simap_ptr64 {
|
||||
void *ptr;
|
||||
auint64 u64;
|
||||
uint64_t u64;
|
||||
};
|
||||
typedef union simap_ptr64 simap_ptr64;
|
||||
|
||||
struct elem_nonkey_prefix {
|
||||
/** The value (ptr) */
|
||||
simap_ptr64 value;
|
||||
|
||||
/** Previous element index from base (full offset) */
|
||||
uint32_t previndex;
|
||||
/** Next element index from base (full offset) */
|
||||
uint32_t nextindex;
|
||||
};
|
||||
typedef struct elem_nonkey_prefix elem_nonkey_prefix;
|
||||
|
||||
/**
|
||||
* The per-element storage layout
|
||||
* A "peasantly" map data structure backed by arena.h - basically a toy data structure...
|
||||
*
|
||||
* This is very simple, no trees, no hashes, just (hopefully) autovectorized linear lookup.
|
||||
* Inserting NULLs to keys happens through tombstoning unless erase happens and we never
|
||||
* shrink memory so please do not add a lot of things then remove a lot of things.
|
||||
*
|
||||
* We also only do heuristics against data being the key so its not "sure" and can fail...
|
||||
*
|
||||
* XXX: So beware that this can FAIL just "statistically most often works"!
|
||||
*
|
||||
* The memory layout after at *base is as follows:
|
||||
*
|
||||
* 8 byte:
|
||||
* - void* value;
|
||||
* - [?] optional padding (only for non-64 bit pointer machines)
|
||||
* - ? padding (only for non-64 bit pointer machines)
|
||||
*
|
||||
* 8 byte:
|
||||
* - uint32_t previndex;
|
||||
@@ -77,35 +39,20 @@ typedef struct elem_nonkey_prefix elem_nonkey_prefix;
|
||||
* - char name[]; // inline stored
|
||||
* - padding (divisible by 8)
|
||||
*
|
||||
* ELEMENTS added to it...
|
||||
*
|
||||
* Because of it a lookup is basically via strstr-like with 8byte steps!
|
||||
* with few character names zero-padded in the search term parameter
|
||||
* and if you want check extra validity by jumping back&forth in it.
|
||||
*/
|
||||
struct elem_prefix {
|
||||
/** Value and meta-data - divisible by 8bytes */
|
||||
elem_nonkey_prefix nonkey_prefix;
|
||||
|
||||
/** The prefix of the key - divisible by 8bytes padded string after this (inlined) */
|
||||
simap_c64 key_prefix;
|
||||
};
|
||||
typedef struct elem_prefix elem_prefix;
|
||||
|
||||
/**
|
||||
* A "peasantly" map data structure backed by arena.h - basically a toy data structure...
|
||||
*
|
||||
* This is very simple, no trees, no hashes, just (hopefully) autovectorized linear lookup.
|
||||
* Inserting NULLs to keys happens through tombstoning unless erase happens and we never
|
||||
* shrink memory so please do not add a lot of things then remove a lot of things.
|
||||
*
|
||||
* In AVX2 mode, we do heuristics against data being the key so its not "sure" and can fail...
|
||||
* XXX: So beware that this CAN FAIL for AVX2 build flags just "statistically most often works"!
|
||||
*/
|
||||
struct simap_instance {
|
||||
arena a;
|
||||
uint32_t end;
|
||||
uint32_t prev_usage_end; /* previous usage_end or -1 if no previous exists... in bytes!!! */
|
||||
uint32_t usage_end; /* in bytes!!! */
|
||||
elem_prefix *base;
|
||||
uint32_t next_previndex; /* in bytes!!! */
|
||||
/** see doc comment for layout and why uint64_t* is the type */
|
||||
uint64_t *base;
|
||||
};
|
||||
typedef struct simap_instance simap_instance;
|
||||
|
||||
@@ -115,101 +62,17 @@ static inline simap_instance simap_create() {
|
||||
ret.end = 0;
|
||||
ret.prev_usage_end = (uint32_t) -1;
|
||||
ret.usage_end = 0;
|
||||
ret.base = (elem_prefix*)(((auint64*) aralloc(&(ret.a), sizeof(auint64), sizeof(auint64), 1)) /* addr divisible by 8 */
|
||||
+ 1); /* First really addressible thing */
|
||||
ret.next_previndex = 0;
|
||||
ret.base = ((uint64_t*) aralloc(&(ret.a), sizeof(uint64_t), sizeof(uint64_t), 1)) /* addr divisible by 8 */
|
||||
+ 1; /* First really addressible thing */
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void *ptr);
|
||||
|
||||
/** Gets padding bytes for a size to be padded to divisible alignment */
|
||||
static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) {
|
||||
/* Would ensure returned value divisible by alignment */
|
||||
/* return (size + alignment - 1) / alignment * alignment; */
|
||||
/* Basically same as: */
|
||||
/* return (alignment - (size % alignment)) % alignment; */
|
||||
|
||||
/* Substracting size leads to padding */
|
||||
return ((size + alignment - 1) / alignment) * alignment - size;
|
||||
}
|
||||
|
||||
/** Gets padded address - or same address if divisible by alignment */
|
||||
static inline void *get_padded(void *ptr, int alignment) {
|
||||
/* return (alignment - (ptr % alignment)) % alignment; */
|
||||
return (void*)((ptrdiff_t)((uint8_t *)ptr + alignment - 1) / alignment * alignment);
|
||||
}
|
||||
|
||||
static inline SM_ALWAYS_INLINE auint64 *make_tipp(auint64 *base, auint64 *tip, auint64 prefix, auint64 *end) {
|
||||
#ifdef __AVX2__
|
||||
/* See:
|
||||
*
|
||||
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX
|
||||
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX&text=movemask
|
||||
* https://chryswoods.com/vector_c++/part2.html
|
||||
* https://blog.triplez.cn/posts/avx-avx2-learning-notes/
|
||||
* https://github.com/Triple-Z/AVX-AVX2-Example-Code
|
||||
* https://en.algorithmica.org/hpc/simd/masking/
|
||||
* https://stackoverflow.com/questions/31089502/aligned-and-unaligned-memory-access-with-avx-avx2-intrinsics
|
||||
*/
|
||||
|
||||
/* Step over previous tipp and search until the AVX2 alignment needs */
|
||||
/* AVXs 256 bit = 32 byte so multiple of 32 needed here */
|
||||
/* TODO: Probably I can change "32" here into something bigger to non-avx small arrays! */
|
||||
auint64 *neotip = (auint64 *) get_padded(++tip, 32);
|
||||
while((tip < neotip) && (*tip != prefix)) ++tip;
|
||||
if(tip < neotip) return tip;
|
||||
|
||||
/* Prepare an AVX 256bit search register: 4 uint64_t */
|
||||
__m256i sreg = _mm256_set1_epi64x(prefix);
|
||||
|
||||
while(tip < end) {
|
||||
/* This needs 32 byte alignment here that we do above */
|
||||
/* The tipp register: 4 uint64_t */
|
||||
__m256i treg = _mm256_load_si256((__m256i *) tip);
|
||||
/* Check equality and return proper tip address for first found */
|
||||
__m256i m = _mm256_cmpeq_epi64(sreg, treg); /* Needs AVX2 */
|
||||
uint32_t mask = (uint32_t) _mm256_movemask_pd((__m256d) m);
|
||||
|
||||
/* Try next tip, processes 256 bits per loop */
|
||||
tip += 4; /* 4x64 bit */
|
||||
|
||||
/* One of the links used __builtin_ctz(mask), but I
|
||||
* think it was bad implementation and only finds the
|
||||
* last search result!
|
||||
*
|
||||
* __builtin_clz returns leading zeroes of the mask
|
||||
* and the mask has 4 bits at most and each show if
|
||||
* 1..4 places of AVX reg compared properly to the
|
||||
* given prefix value (4x 64 bit comparizons happen).
|
||||
* Subtracting from 31 we subtract either 28,29,30,31
|
||||
* and thus resulting in 3, 2, 1, 0 (right offsets).
|
||||
*
|
||||
* If the mask got all zero, there is nothing found,
|
||||
* otherwise its the tipp + offset we calculated. */
|
||||
if(SM_UNLIKELY(mask != 0)) {
|
||||
int offset = (31 - __builtin_clz(mask));
|
||||
/* -4 because this is the unlikely scenario and we already incremented! */
|
||||
return tip - 4 + offset;
|
||||
}
|
||||
}
|
||||
/* Not found case */
|
||||
return tip;
|
||||
#endif
|
||||
#ifdef SIMAP_RAW
|
||||
#pragma GCC unroll 16
|
||||
while((++tip < end) && (*tip != prefix));
|
||||
return tip;
|
||||
#endif
|
||||
/* XXX: This only works because of (***) because reading -1 tips makes tip >= end for sure here and back */
|
||||
elem_nonkey_prefix *pre = (elem_nonkey_prefix *)((uint8_t *)tip - sizeof(elem_nonkey_prefix));
|
||||
tip = (auint64 *) ((uint8_t *)base + pre->nextindex + sizeof(elem_nonkey_prefix));
|
||||
#pragma GCC unroll 16
|
||||
while((tip < end) && (*tip != prefix)) {
|
||||
pre = (elem_nonkey_prefix *)((uint8_t *)tip - sizeof(elem_nonkey_prefix));
|
||||
tip = (auint64 *) ((uint8_t *)base + pre->nextindex + sizeof(elem_nonkey_prefix));
|
||||
}
|
||||
return tip;
|
||||
}
|
||||
union simap_c64 {
|
||||
char str8[8];
|
||||
uint64_t u64;
|
||||
};
|
||||
typedef union simap_c64 simap_c64;
|
||||
|
||||
static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char *key) {
|
||||
/* Construct prefix (fast-key) */
|
||||
@@ -218,36 +81,28 @@ static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char
|
||||
|
||||
simap_c64 prefix {0};
|
||||
size_t prefixlen = is_smallkey ? keylen : 8;
|
||||
/* Ignore warning because we know what we are doing here... */
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wstringop-truncation"
|
||||
strncpy(prefix.str8, key, prefixlen);
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
/* Construct keyremains - might point to the \0 terminator only if smallkey or 8 bytes exactly */
|
||||
const char *keyremains = key + prefixlen;
|
||||
|
||||
/* TODO: Maybe I should create separate function for fast-lookup returning "next" pointer from a pointer to autovectorize? */
|
||||
/* Lookup prefix (fast-key) - hopefully this gets vectorized (should be)!!! */
|
||||
auint64 *base = (auint64 *) (map->base);
|
||||
auint64 *end = (auint64 *)((uint8_t *)base + (map->usage_end));
|
||||
auint64 *tipp = make_tipp(base, base, prefix.u64, end);
|
||||
while(tipp < end) { /* XXX: (***) */
|
||||
|
||||
/* Need detailed lookup, because found the prefix */
|
||||
assert((*tipp == prefix.u64));
|
||||
|
||||
uint64_t *base = map->base;
|
||||
uint64_t *tipp = map->base;
|
||||
for(uint32_t i = 0; i < map->usage_end / 8; ++i, ++tipp) {
|
||||
/* Fast lookup */
|
||||
if(*tipp == prefix.u64) {
|
||||
/* First check the remains of the string (only if needed) */
|
||||
if(!is_smallkey) {
|
||||
char *tippremains = (char *)((uint8_t *)tipp + sizeof(uint64_t));
|
||||
if(strcmp(keyremains, tippremains) != 0) {
|
||||
tipp = make_tipp(base, tipp, prefix.u64, end);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
simap_ptr64 *ptr = (simap_ptr64 *)((uint8_t *) (tipp - 2));
|
||||
|
||||
#ifdef SIMAP_RAW
|
||||
/* Check back & forth (jump validation) */
|
||||
uint32_t previ = *((uint32_t *)(tipp - 1));
|
||||
if(previ == (uint32_t) -1) {
|
||||
@@ -258,30 +113,37 @@ static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char
|
||||
+ sizeof(simap_ptr64)
|
||||
+ sizeof(uint32_t));
|
||||
|
||||
auint64 *retipp = (auint64 *)(((uint8_t *)base + prevnexi)
|
||||
uint64_t *retipp = (uint64_t *)(((uint8_t *)base + prevnexi)
|
||||
+ sizeof(simap_ptr64) + sizeof(uint32_t) +
|
||||
+ sizeof(uint32_t));
|
||||
|
||||
if(retipp != tipp) {
|
||||
tipp = make_tipp(base, tipp, prefix.u64, end);
|
||||
continue;
|
||||
}
|
||||
#endif /* SIMAP_RAW */
|
||||
|
||||
/* Can have the (statistically checked) pointer */
|
||||
return ptr;
|
||||
}
|
||||
}
|
||||
|
||||
/* Haven't found anything */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/** Gets padding bytes for a size to be padded to divisible alignment */
|
||||
static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) {
|
||||
/* Would ensure returned value divisible by alignment */
|
||||
/* return (size + alignment - 1) / alignment * alignment; */
|
||||
/* same: return (alignment - (size % alignment)) % alignment; */
|
||||
|
||||
/* Substracting size leads to padding */
|
||||
return ((size + alignment - 1) / alignment) * alignment - size;
|
||||
}
|
||||
|
||||
/** Returns the size of the storage needed for the given key */
|
||||
static inline uint32_t simap_elem_storage_size(const char *key) {
|
||||
uint32_t keysize = strlen(key);
|
||||
uint32_t padding = get_size_padding(keysize, 8);
|
||||
/* XXX: The exactly 8byte keys need a zero terminator too (would be overridden without this) */
|
||||
padding += (keysize == 8) ? 8 : 0;
|
||||
|
||||
return keysize +
|
||||
sizeof(simap_ptr64) +
|
||||
@@ -290,11 +152,11 @@ static inline uint32_t simap_elem_storage_size(const char *key) {
|
||||
padding;
|
||||
}
|
||||
|
||||
/** Force-add the (key,value) to the end of the map. Use this if you prefill your map one-by-one and need speed */
|
||||
static inline void *simap_force_add(simap_instance *map, const char *key, void *ptr) {
|
||||
/** Force-add the (key,value) to the end of the map */
|
||||
static inline void *simap_force_add_internal(simap_instance *map, const char *key, void *ptr) {
|
||||
uint32_t storage_needed = simap_elem_storage_size(key);
|
||||
assert((storage_needed % 8) == 0);
|
||||
if(SM_UNLIKELY(map->end - map->usage_end < storage_needed)) {
|
||||
if(map->end - map->usage_end < storage_needed) {
|
||||
/* Need storage */
|
||||
aralloc(&(map->a),
|
||||
sizeof(uint8_t)/*esize*/,
|
||||
@@ -310,39 +172,36 @@ static inline void *simap_force_add(simap_instance *map, const char *key, void *
|
||||
/* Create first 8 char encoding (this ensures endianness and all such stuff) */
|
||||
simap_c64 first8 {0};
|
||||
uint32_t keylen = strlen(key);
|
||||
/* Ignore warning because we know what we are doing here... */
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wstringop-truncation"
|
||||
strncpy(first8.str8, key, (keylen < 8) ? keylen : 8);
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
uint32_t usi = map->usage_end;
|
||||
uint32_t previ = map->prev_usage_end;
|
||||
|
||||
/* 8byte: Save data ptr */
|
||||
/* Save data ptr */
|
||||
simap_ptr64 *data = (simap_ptr64 *)((uint8_t *)(map->base) + usi);
|
||||
data->ptr = ptr;
|
||||
|
||||
/* 8byte: Save link to previous and next */
|
||||
uint32_t *usprev = (uint32_t *)((uint8_t *)(map->base) + usi +
|
||||
sizeof(simap_ptr64));
|
||||
/* Save link to previous */
|
||||
uint32_t *usprev = (uint32_t *)((uint8_t *)(map->base) +
|
||||
sizeof(simap_ptr64) +
|
||||
sizeof(uint32_t));
|
||||
*usprev = previ;
|
||||
*(usprev + 1) = (uint32_t) -1; /* XXX: (***): ensures the "not < end" here! */
|
||||
/* and nex */
|
||||
*(usprev + 1) = (uint32_t) -1;
|
||||
|
||||
/* 8byte: First 8 char */
|
||||
/* First 8 bytes */
|
||||
simap_c64 *start_str = (simap_c64 *)(usprev + 2);
|
||||
*start_str = first8;
|
||||
|
||||
/* Remaining bytes */
|
||||
if(keylen >= 8) {
|
||||
/* Remainin bytes */
|
||||
if(keylen > 8) {
|
||||
/* uint32_t key_remains = keylen - 8; */
|
||||
char *rem_str = (char *)(start_str + 1);
|
||||
strcpy(rem_str, key + 8);
|
||||
}
|
||||
/* XXX: The "padding" gets automagically added by the movement of the arena here(by junk bytes)! */
|
||||
|
||||
/* Update previous with linkage */
|
||||
if(SM_LIKELY(previ != (uint32_t)-1)) {
|
||||
if(previ != (uint32_t)-1) {
|
||||
uint32_t *prevnex = (uint32_t *)((uint8_t *)(map->base) + previ +
|
||||
sizeof(simap_ptr64) +
|
||||
sizeof(uint32_t));
|
||||
@@ -376,10 +235,9 @@ static inline void *simap_force_add(simap_instance *map, const char *key, void *
|
||||
static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void *ptr) {
|
||||
simap_instance *map = (simap_instance *) amap_instance;
|
||||
|
||||
if((op == AMAP_ERASE)) {
|
||||
map->prev_usage_end = (uint32_t) -1;
|
||||
if(op == AMAP_ERASE) {
|
||||
map->usage_end = 0;
|
||||
return (void *)((uint8_t)(NULL) - 1L);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Search for the key - also needed for SET in order to "re-set" */
|
||||
@@ -390,13 +248,12 @@ static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void
|
||||
} else {
|
||||
assert(op == AMAP_SET);
|
||||
|
||||
if((!found)) {
|
||||
/* Add as new */
|
||||
return simap_force_add(map, key, ptr);
|
||||
} else {
|
||||
if(found) {
|
||||
/* Just overwrite */
|
||||
found->ptr = ptr;
|
||||
return (void *) found;
|
||||
} else {
|
||||
return simap_force_add_internal(map, key, ptr);
|
||||
}
|
||||
}
|
||||
|
||||
|
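For reference, a tiny standalone sketch (not part of the diff) of the padding arithmetic that get_size_padding() and simap_elem_storage_size() above rely on; the expected values follow directly from the formula shown there:

#include <assert.h>

/* Same formula as get_size_padding() above: bytes needed to round size up
 * to the next multiple of alignment. */
static unsigned int pad(unsigned int size, unsigned int alignment) {
    return ((size + alignment - 1) / alignment) * alignment - size;
}

int main(void) {
    assert(pad(20, 8) == 4); /* "very_long_test_key_1" is 20 chars, stored in 24 */
    assert(pad(8, 8) == 0);  /* exactly 8 chars pad to 0 - simap adds 8 extra for the terminator */
    assert(pad(1, 8) == 7);
    return 0;
}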
simd_map.h | 268
@@ -1,268 +0,0 @@
|
||||
#ifndef SIMD_MAP
|
||||
#define SIMD_MAP
|
||||
|
||||
#include <stddef.h> /* NULL */
|
||||
#include <stdint.h> /* uint32_t, ... */
|
||||
#include <assert.h>
|
||||
#include "arena.h/arena.h"
|
||||
#include "simd_map_lane.h"
|
||||
|
||||
/* I have no idea what MSVC has instead... */
|
||||
#ifdef _MSC_VER
|
||||
#define SM_THREAD_LOCAL __declspec(thread)
|
||||
#define SM_PREFETCH(x)
|
||||
#define SM_LIKELY(x)
|
||||
#define SM_UNLIKELY(x)
|
||||
#define SM_NOINLINE __declspec(noinline)
|
||||
#define SM_ALWAYS_INLINE __forceinline
|
||||
#else
|
||||
#define SM_THREAD_LOCAL __thread
|
||||
#define SM_PREFETCH(x) __builtin_prefetch(x)
|
||||
#define SM_LIKELY(x) __builtin_expect((x),1)
|
||||
#define SM_UNLIKELY(x) __builtin_expect((x),0)
|
||||
#define SM_NOINLINE __attribute__ ((noinline))
|
||||
#define SM_ALWAYS_INLINE __attribute__ ((always_inline))
|
||||
#endif
|
||||
|
||||
struct simd_map {
|
||||
arena a;
|
||||
simd_map_lane *lanes;
|
||||
uint32_t end; /* in lanes */
|
||||
uint32_t usage_end; /* in lanes!!! */
|
||||
int lane_modulo; /* [0..SML_LANE_SPAN) */
|
||||
};
|
||||
typedef struct simd_map simd_map;
|
||||
|
||||
/** Create a simd map instance */
|
||||
static inline SM_ALWAYS_INLINE simd_map simd_map_create() {
|
||||
simd_map ret;
|
||||
ret.a = newarena((ptrdiff_t)1 << 33);
|
||||
ret.end = 0;
|
||||
ret.usage_end = 0;
|
||||
ret.lanes = (simd_map_lane*)(aralloc(&(ret.a), sizeof(simd_map_lane), sizeof(simd_map_lane), 1)) /* aligned! */
|
||||
+ 1; /* First really addressible thing */
|
||||
ret.lane_modulo = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/** Creates a simd map instance and pre-reserve space for a few elements */
|
||||
static inline SM_ALWAYS_INLINE simd_map simd_map_create_and_reserve();
|
||||
|
||||
/** Free all resources held by the map. Returns 0 on errors. */
|
||||
static inline SM_ALWAYS_INLINE char simd_map_free(simd_map *map) {
|
||||
return freearena(&(map->a));
|
||||
}
|
||||
|
||||
/** The result of the find_all(..) operation */
|
||||
struct simd_map_find_res {
|
||||
/** The found location - or NULL when the key was not found */
|
||||
uint32_t *value_location;
|
||||
/** Meta-data for continuation of the search */
|
||||
uint32_t lane_next;
|
||||
/** Meta-data for continuation of the search */
|
||||
int lane_next_begin;
|
||||
};
|
||||
typedef struct simd_map_find_res simd_map_find_res;
|
||||
|
||||
/** Create the value for starting a find_all call */
|
||||
static inline simd_map_find_res simd_map_find_all_begin() {
|
||||
simd_map_find_res ret;
|
||||
ret.value_location = NULL;
|
||||
ret.lane_next = 0;
|
||||
ret.lane_next_begin = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Useful for multimap-like operations to find multiple mappings for the same key.
|
||||
*
|
||||
* @param map The map
|
||||
* @param key The key to search
|
||||
* @param prev The previous result - or simd_map_find_all_begin() to find the first / start lookup!
|
||||
* @returns The found pointer / location (if any) and if location was non-NULL meta-data so we can search further same-keys!
|
||||
*/
|
||||
static inline SM_ALWAYS_INLINE simd_map_find_res simd_map_find_all(simd_map *map, uint32_t key, simd_map_find_res prev) {
|
||||
simd_map_find_res ret;
|
||||
|
||||
/* Process most lanes */
|
||||
/* Do not process last element (-1) because of last incomplete lane */
|
||||
if(map->usage_end > 0) for(uint32_t i = prev.lane_next; i < map->usage_end - 1; ++i) {
|
||||
uint32_t *found = simd_map_lane_find(
|
||||
&(map->lanes[i]),
|
||||
key,
|
||||
0,
|
||||
prev.lane_next_begin,
|
||||
&(ret.lane_next_begin)); /* XXX: Fills part of retval! */
|
||||
|
||||
/* Needed so only the currently found are ignored in find_all(..) */
|
||||
prev.lane_next_begin = 0;
|
||||
|
||||
if(found) {
|
||||
ret.value_location = found;
|
||||
ret.lane_next = i + (ret.lane_next_begin == 0);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/* Process last lane - with a modulo lane */
|
||||
if((map->usage_end > 0) && (prev.lane_next < map->usage_end)) {
|
||||
uint32_t *found = simd_map_lane_find(
|
||||
&(map->lanes[map->usage_end - 1]),
|
||||
key,
|
||||
map->lane_modulo,
|
||||
prev.lane_next_begin,
|
||||
&(ret.lane_next_begin)); /* XXX: Fills part of retval! */
|
||||
|
||||
/* Needed so only the currently found are ignored in find_all(..) */
|
||||
prev.lane_next_begin = 0;
|
||||
|
||||
if(found) {
|
||||
ret.value_location = found;
|
||||
ret.lane_next = (map->usage_end - 1) + (ret.lane_next_begin == 0);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/* Not found */
|
||||
ret = simd_map_find_all_begin();
|
||||
return ret;
|
||||
}
|
||||
|
||||
/** Looks the key up in the map - returns the value location, or NULL if the key does not exist. */
|
||||
static inline uint32_t *simd_map_find(simd_map *map, uint32_t key) {
|
||||
simd_map_find_res begin = simd_map_find_all_begin();
|
||||
simd_map_find_res fires = simd_map_find_all(map, key, begin);
|
||||
return fires.value_location;
|
||||
}
|
||||
|
||||
/**
|
||||
* Insert without checking that the value have been already added or not.
|
||||
*
|
||||
* Useful for multimap operation or if you know the key have never been before added (faster)!
|
||||
*
|
||||
* @param map The map
|
||||
* @param key The key to insert
|
||||
* @param value The value for this key to insert
|
||||
* @returns 0 on errors, otherwise 1.
|
||||
*/
|
||||
static inline SM_ALWAYS_INLINE char simd_map_multi_set(simd_map *map, uint32_t key, uint32_t value) {
|
||||
/* Handle storage growth needs. */
|
||||
uint32_t storage_needed = (map->lane_modulo == 0) ? 1 : 0;
|
||||
if(SM_UNLIKELY(map->end - map->usage_end < storage_needed)) {
|
||||
void *allret = aralloc(&(map->a),
|
||||
sizeof(simd_map_lane)/* esize */,
|
||||
1 /* align - should be sizeof(simd_map_lane) but should be aligned here as-is already! */,
|
||||
storage_needed); /* ecount */
|
||||
|
||||
/** Return early with error but no state changes if we cannot allocate! */
|
||||
if(SM_UNLIKELY(!allret)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Administer end offset */
|
||||
map->end += storage_needed;
|
||||
}
|
||||
|
||||
/* Administer usage end offset, separate from end because erase / shrink ops possible */
|
||||
map->usage_end += storage_needed;
|
||||
|
||||
/* Always force-insert into the last lane at lane_modulo location */
|
||||
simd_map_lane *lane = &(map->lanes[map->usage_end - 1]);
|
||||
lane->keys[map->lane_modulo] = key;
|
||||
lane->values[map->lane_modulo] = value;
|
||||
|
||||
/* Update lane modulo */
|
||||
map->lane_modulo = (map->lane_modulo + 1) % SML_LANE_SPAN;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns 0 on errors, otherwise 1 when added as new, 2 when already found got overwritten
|
||||
*/
|
||||
static inline char simd_map_set(simd_map *map, uint32_t key, uint32_t value) {
|
||||
uint32_t *found = simd_map_find(map, key);
|
||||
if(!found) {
|
||||
return simd_map_multi_set(map, key, value);
|
||||
} else {
|
||||
/* Overwrite already existing mapping */
|
||||
*found = value;
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
/** Empties the map - this does not free resources, just makes it reusable! */
|
||||
static inline SM_ALWAYS_INLINE void simd_map_erase(simd_map *map) {
|
||||
map->usage_end = 0;
|
||||
map->lane_modulo = 0;
|
||||
}
|
||||
|
||||
/** Returns count of elements in the given simd_map */
|
||||
static inline SM_ALWAYS_INLINE size_t simd_map_size(simd_map *map) {
|
||||
return (map->usage_end > 0) ?
|
||||
(((size_t)(map->usage_end) - 1) * 8 + map->lane_modulo) :
|
||||
0;
|
||||
}
|
||||
|
||||
/** Returns TRUE when map is empty and false otherwise - faster than simd_map_size(..) */
|
||||
static inline SM_ALWAYS_INLINE char simd_map_is_empty(simd_map *map) {
|
||||
return (map->usage_end == 0);
|
||||
}
|
||||
|
||||
/** Returns the lastly inserted value's location in the map - you must ensure the map has elements! */
|
||||
static inline SM_ALWAYS_INLINE uint32_t *simd_map_last_location(simd_map *map) {
|
||||
return (map->lane_modulo > 0) ?
|
||||
&(map->lanes[map->usage_end - 1].values[map->lane_modulo - 1]) :
|
||||
&(map->lanes[map->usage_end - 1].values[SML_LANE_SPAN - 1]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes the found location from the map.
|
||||
*
|
||||
* Most users should prefer simd_map_remove instead. This is an unchecked operation!
|
||||
*
|
||||
* This must be called right after a find(..) or find_all(..) operation,
|
||||
* because the pointer can get invalidated (for example by erase or remove).
|
||||
*
|
||||
* @param map The map to remove from
|
||||
* @param value_location The location returned by find(..) or find_all(..) and is not yet invalidated
|
||||
*/
|
||||
static inline SM_ALWAYS_INLINE void simd_map_remove_ptr(simd_map *map, uint32_t *value_location) {
|
||||
/* Overwrite with the last key-value */
|
||||
uint32_t *key_location = simd_map_lane_key_location(value_location);
|
||||
uint32_t *last_value_location = simd_map_last_location(map);
|
||||
uint32_t *last_key_location = simd_map_lane_key_location(last_value_location);
|
||||
*value_location = *last_value_location;
|
||||
*key_location = *last_key_location;
|
||||
|
||||
/* Shrink the data structure */
|
||||
if(map->lane_modulo > 0) {
|
||||
--(map->lane_modulo);
|
||||
} else {
|
||||
map->lane_modulo = SML_LANE_SPAN - 1;
|
||||
}
|
||||
}
|
||||
|
||||
/** Remove the given key from the map so its not stored anymore. Returns 1 when found and removed, 0 otherwise. */
|
||||
static inline int simd_map_remove(simd_map *map, uint32_t key) {
|
||||
if(SM_UNLIKELY(map->usage_end == 0)) return 0;
|
||||
|
||||
uint32_t *found = simd_map_find(map, key);
|
||||
if(found) {
|
||||
simd_map_remove_ptr(map, found);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/** Creates a simd map instance and pre-reserve space for a few elements */
|
||||
static inline SM_ALWAYS_INLINE simd_map simd_map_create_and_reserve() {
|
||||
simd_map smap = simd_map_create();
|
||||
simd_map_set(&smap, 42, 42);
|
||||
simd_map_erase(&smap); // Resets the map, but keeps memory reserved!
|
||||
return smap;
|
||||
}
|
||||
|
||||
#endif
|
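Beyond plain set/find, simd_map.h is also used as a multimap in the tests; the find_all(..) result doubles as a continuation token. A usage sketch (not part of the diff) that enumerates every value stored under one key:

#include <assert.h>
#include "simd_map.h"

int main(void) {
    simd_map m = simd_map_create_and_reserve();
    simd_map_multi_set(&m, 42, 1);
    simd_map_multi_set(&m, 42, 2);

    /* Walk all mappings of key 42; value_location becomes NULL once exhausted */
    simd_map_find_res it = simd_map_find_all(&m, 42, simd_map_find_all_begin());
    while(it.value_location != NULL) {
        assert(*it.value_location == 1 || *it.value_location == 2);
        it = simd_map_find_all(&m, 42, it);
    }
    simd_map_free(&m);
    return 0;
}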
simd_map_lane.h | 147
@@ -1,147 +0,0 @@
|
||||
#ifndef SIMD_MAP_LANE_H
|
||||
#define SIMD_MAP_LANE_H
|
||||
/*
|
||||
* Building blocks for SIMD-optimized (hash-)map buckets.
|
||||
*/
|
||||
|
||||
/* SIMD support */
|
||||
#ifdef __AVX2__
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __SSE2__
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define SML_THREAD_LOCAL __declspec(thread)
|
||||
#define SML_PREFETCH(x)
|
||||
#define SML_LIKELY(x)
|
||||
#define SML_UNLIKELY(x)
|
||||
#define SML_NOINLINE __declspec(noinline)
|
||||
#define SML_ALWAYS_INLINE __forceinline
|
||||
#else
|
||||
#define SML_THREAD_LOCAL __thread
|
||||
#define SML_PREFETCH(x) __builtin_prefetch(x)
|
||||
#define SML_LIKELY(x) __builtin_expect((x),1)
|
||||
#define SML_UNLIKELY(x) __builtin_expect((x),0)
|
||||
#define SML_NOINLINE __attribute__ ((noinline))
|
||||
#define SML_ALWAYS_INLINE __attribute__ ((always_inline))
|
||||
#endif
|
||||
|
||||
/* 32 byte = 256 bits = (8 * 32bit) optimized for AVX2 */
|
||||
#define SML_LANE_SPAN 8
|
||||
|
||||
/** Grouped together keys-values to support SIMD more this way (also became a single cache line this way) */
|
||||
struct simd_map_lane {
|
||||
uint32_t keys[SML_LANE_SPAN];
|
||||
uint32_t values[SML_LANE_SPAN];
|
||||
};
|
||||
typedef struct simd_map_lane simd_map_lane;
|
||||
|
||||
/** Returns the last value of the lane - usable to see if this lane is fully filled with data or not! */
|
||||
static inline SML_ALWAYS_INLINE uint32_t simd_map_lane_last_value(simd_map_lane *map_lane) {
|
||||
return map_lane->values[SML_LANE_SPAN - 1];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns if this key is stored in the given map LANE or not - returns NULL if not found.
|
||||
*
|
||||
* Rem.: The lane_begin and lane_next_begin parameters are used for reentrant multisearch.
|
||||
*
|
||||
* @param map_lane The lane to find in.
|
||||
* @param key The key to search for.
|
||||
* @param lane_modulo When non-zero, the lane only searched until this index. Zero means all remaining elements. (mod lane length)
|
||||
* @param lane_begin The lane is searched from this location(find_all). If full lane / lane prefix is needed, this should be 0.
|
||||
* @param lane_next_begin This pointer will be filled on non-NULL retvals with the incremented in-lane index (with % modulus).
|
||||
* @returns NULL when not found, otherwise pointer to the stored value for the key.
|
||||
*/
|
||||
static inline SML_ALWAYS_INLINE uint32_t *simd_map_lane_find(
|
||||
simd_map_lane *map_lane,
|
||||
uint32_t key,
|
||||
int lane_modulo,
|
||||
int lane_begin,
|
||||
int *lane_next_begin) {
|
||||
|
||||
uint32_t *keys = map_lane->keys;
|
||||
uint32_t *values = map_lane->values;
|
||||
|
||||
/* Hopefully can get optimized out for the common case bc inlining */
|
||||
if(lane_modulo == 0) {
|
||||
#ifdef __AVX2__
|
||||
/* Prepare an AVX 256bit search register: 8 uint32_t */
|
||||
__m256i sreg = _mm256_set1_epi32(key);
|
||||
/* The tipp register: 8 uint32_t */
|
||||
__m256i treg = _mm256_load_si256((__m256i *) keys);
|
||||
/* Check equality and return proper tip address for first found */
|
||||
__m256i m = _mm256_cmpeq_epi32(sreg, treg); /* Needs AVX2 */
|
||||
/* The 's' means "single" (float precision), and mask will have [0..7] bits set! */
|
||||
uint32_t mask = (uint32_t) _mm256_movemask_ps((__m256) m);
|
||||
if(SML_UNLIKELY(mask != 0)) {
|
||||
/* 00000000 00000000 00000000 01000100 -> 6 */
|
||||
int i = (31 - __builtin_clz(mask));
|
||||
uint32_t *ptr = &values[i];
|
||||
if(SML_LIKELY(lane_begin == 0)) {
|
||||
/* Fast-path: Only one match per lane OR first matching in lane for this find/find_all */
|
||||
*lane_next_begin = (i + 1) % SML_LANE_SPAN;
|
||||
return ptr;
|
||||
} else {
|
||||
/* We did a find_all(..) AND there is more than one match in the lane
|
||||
* and its not first find_all(..) on the lane in question...
|
||||
*
|
||||
* This might be suboptimal, but not so bad:
|
||||
*
|
||||
* - This at this point will search the smaller array scalar-way
|
||||
* - Which sounds good - BUT!
|
||||
* - This means all lanes with more than 1 data are scalar search
|
||||
* - And not only that, but first simd-searched, later scalar...
|
||||
*
|
||||
* I guess its fine as it should happen statistically rarely anyways
|
||||
*/
|
||||
goto non_simd_modulo;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
#endif
|
||||
#ifdef __SSE2__
|
||||
#ifndef __AVX2__
|
||||
#ifndef __AVX512__
|
||||
/* Prepare an SSE2 128bit search register: 4 uint32_t */
|
||||
__m128i sreg = _mm_set1_epi32(key);
|
||||
/* TODO: Implement */
|
||||
#endif /* AVX512 */
|
||||
#endif /* AVX2 */
|
||||
#endif /* SSE2 */
|
||||
/* Regular integer code - should have good ILP and cache locality patterns anyways */
|
||||
/** Pretty hopeful this can get more easily unrolled / autovectorized */
|
||||
for(int i = lane_begin; i < SML_LANE_SPAN; ++i) {
|
||||
if(SML_UNLIKELY(keys[i] == key)) {
|
||||
uint32_t *ptr = &values[i];
|
||||
*lane_next_begin = (i + 1) % SML_LANE_SPAN;
|
||||
return ptr;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
} else {
|
||||
non_simd_modulo:
|
||||
for(int i = lane_begin; i < lane_modulo; ++i) {
|
||||
if(SML_UNLIKELY(keys[i] == key)) {
|
||||
uint32_t *ptr = &values[i];
|
||||
*lane_next_begin = (i + 1) % SML_LANE_SPAN;
|
||||
return ptr;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the key location for a given value location */
|
||||
static inline SML_ALWAYS_INLINE uint32_t *simd_map_lane_key_location(uint32_t *value_location) {
|
||||
return value_location -= SML_LANE_SPAN;
|
||||
}
|
||||
|
||||
#endif /* SIMD_MAP_LANE_H */
|
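The AVX2 path above converts the comparison result into a lane index with 31 - __builtin_clz(mask). A standalone sketch (not part of the diff) of that bit trick, using the same example mask as the in-code comment:

#include <assert.h>

/* _mm256_movemask_ps() sets bit i when lane i compared equal, so for
 * mask == 0x44 (binary 01000100, matches in lanes 2 and 6) the leading-zero
 * count is 25 and 31 - 25 == 6: the highest matching lane index. */
static int highest_set_lane(unsigned int mask) {
    return 31 - __builtin_clz(mask); /* mask must be non-zero */
}

int main(void) {
    assert(highest_set_lane(0x44u) == 6);
    assert(highest_set_lane(0x01u) == 0);
    return 0;
}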
unomap.hpp | 39
@@ -1,39 +0,0 @@
|
||||
#ifndef UNOMAP_HPP
|
||||
#define UNOMAP_HPP
|
||||
|
||||
#include <unordered_map>
|
||||
#include <cassert>
|
||||
#include <memory>
|
||||
|
||||
struct unomap_instance {
|
||||
std::unordered_map<const char *, void *> m;
|
||||
};
|
||||
|
||||
static inline unomap_instance unomap_create() {
|
||||
unomap_instance ret;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void* unomap(void *amap_instance, AMAP_OP op, const char *key, void *ptr) {
|
||||
unomap_instance *map = (unomap_instance *) amap_instance;
|
||||
if(op == AMAP_GET) {
|
||||
try {
|
||||
return map->m[key];
|
||||
} catch(...) {
|
||||
return ptr;
|
||||
}
|
||||
} else if(op == AMAP_SET) {
|
||||
try {
|
||||
map->m[key] = ptr;
|
||||
return map; // non-null
|
||||
} catch(...) {
|
||||
return NULL;
|
||||
}
|
||||
} else { // if(op == AMAP_ERASE) {
|
||||
assert(op == AMAP_ERASE);
|
||||
map->m = std::move(std::unordered_map<const char *, void *>());
|
||||
return (void *)((uint8_t)(NULL) - 1L);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // UNOMAP_HPP
|
vmap.h | 236
@@ -1,236 +0,0 @@
|
||||
#ifndef VMAP_H
|
||||
#define VMAP_H
|
||||
/*
|
||||
* A virtual memory misusing flat-ish hashmap optimized with AVX2 (if available at compilation).
|
||||
*
|
||||
* Structure
|
||||
*
|
||||
* VMEM
|
||||
* STRUCT
|
||||
* INTAPI
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include "simd_map_lane.h"
|
||||
|
||||
/* VMEM */
|
||||
|
||||
#ifdef _WIN32
|
||||
TODO: Utilize __Thread + SEH to implement lazy windows pageload zeroing
|
||||
#else
|
||||
/** Capacity should be multiple of 4096 for full pages */
|
||||
static void *vm_reserve(ptrdiff_t capacity) {
|
||||
void *r = mmap(0, capacity, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
|
||||
return r==MAP_FAILED ? 0 : r;
|
||||
}
|
||||
|
||||
/** Capacity should be multiple of 4096 for full pages and related to the ptr to free */
|
||||
static char vm_free(void *ptr, ptrdiff_t capacity) {
|
||||
return !munmap(ptr, capacity);
|
||||
}
|
||||
#endif /* _WIN32 */
|
||||
|
||||
/* STRUCT */
|
||||
|
||||
struct vmap {
|
||||
/* using uint8_t* here would simplify
|
||||
* code except for aliasing rules */
|
||||
uint32_t *data;
|
||||
uint32_t count;
|
||||
uint32_t max_levels;
|
||||
};
|
||||
typedef struct vmap vmap;
|
||||
|
||||
/** The result of the search_all(..) operation */
|
||||
struct vmap_find_res {
|
||||
/** The found location - or NULL when the key was not found */
|
||||
uint32_t *value_location;
|
||||
/** What 'level' depth this value was found. For multimap, but useful statistics */
|
||||
uint32_t level;
|
||||
/** Meta-data for continuation of the search. Tells which lane to search next A, B, C, or D */
|
||||
uint32_t lane_abcd_next;
|
||||
/** Meta-data for continuation of the search. In-lane where we search from next time? */
|
||||
int lane_next_begin;
|
||||
/** Meta-data for continuation of the search. Last value found in lastly looked lane. */
|
||||
uint32_t last_found_lane_val;
|
||||
};
|
||||
typedef struct vmap_find_res vmap_find_res;
|
||||
|
||||
/* INTAPI */
|
||||
|
||||
static inline vmap create_vmap(uint32_t max_levels) {
|
||||
vmap map{ NULL, 0, max_levels};
|
||||
map.data = (uint32_t *)vm_reserve(max_levels * 16 * 4096);
|
||||
return map;
|
||||
}
|
||||
|
||||
static inline char free_vmap(vmap *map) {
|
||||
map->count = 0;
|
||||
return vm_free(map->data, map->max_levels * 16 * 4096);
|
||||
}
|
||||
|
||||
/** Create the value for starting a search_all call */
|
||||
static inline vmap_find_res vmap_search_all_begin() {
|
||||
vmap_find_res ret;
|
||||
ret.value_location = NULL;
|
||||
ret.level = 0;
|
||||
ret.lane_abcd_next = 0;
|
||||
ret.lane_next_begin = 0;
|
||||
ret.last_found_lane_val = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Search the map in as a multimap - that is you can search multiple equal keyed values.
|
||||
* This is implemented by the result being understood also as a continuation alongside
|
||||
* a way to grab the pointer to the stored value and key (simd_map_lane_key_location(val)).
|
||||
*
|
||||
* @param map The map to search in
|
||||
* @param key The key to search for
|
||||
* @param prev The previous result if you continue your search. See: vmap_search_all_begin()
|
||||
* @returns Metadata + nullable ptr. See: vmap_find_res struct comments; ret.value_location
|
||||
*/
|
||||
static inline vmap_find_res search_all_vmap(vmap *map, uint32_t key, vmap_find_res prev) {
|
||||
/* Inits as not found, can change values */
|
||||
vmap_find_res ret = prev;
|
||||
|
||||
uint32_t level = prev.level;
|
||||
/* Probably the loop exists always without this predicate being false */
|
||||
while(level <= map->max_levels) {
|
||||
/* Rare edge-case when last lane element was returned and we continue from it */
|
||||
/* We should not try lane processing, just jump next level - but only if there */
|
||||
/* is a next level (so last checked lane was totally filled already to full cap. */
|
||||
if(prev.lane_abcd_next > 4) {
|
||||
assert(prev.last_found_lane_val != 0);
|
||||
prev = vmap_search_all_begin();
|
||||
++level;
|
||||
/* prev.level = level; // unnecessary, I hand-optimized out */
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Process 8 bits of the 32-bit circular order - so its not radix, but similar */
|
||||
uint32_t byt = level % 4;
|
||||
// Low 4 bits: page
|
||||
uint32_t page_no = (level * 16 + ((key >> (byt * 8)) & 15));
|
||||
/* 1024 and not 4096 here because of uint32_t *data offset: 4096 / 4 uint32s */
|
||||
uint32_t page_offset = 1024 * page_no;
|
||||
|
||||
/* Top 4 bits: lane. There is 32 lane start positions in the 4k page */
|
||||
uint32_t lane_no = ((key >> (byt * 8 + 4)) & 15)
|
||||
+ prev.lane_abcd_next; /* continuations start where we left off */
|
||||
/* But 4096 / 4 == 1024 elements, which then divided by 16 == 64 uint32_t elems */
|
||||
uint32_t lane_offset = lane_no * 64;
|
||||
|
||||
/* A lane has 8x32 bit keys, then 8x32 bit values. 16 uint32_t elems. */
|
||||
/* So grab the A, B, C and D candidate lanes for each lane_offset. */
|
||||
simd_map_lane *lane_a = (simd_map_lane *) map->data + page_offset + lane_offset;
|
||||
simd_map_lane *lane_b = lane_a + 1;
|
||||
simd_map_lane *lane_c = lane_b + 1;
|
||||
simd_map_lane *lane_d = lane_c + 1;
|
||||
|
||||
/* Get which lane we should begin at where */
|
||||
uint32_t lane_a_begin = prev.lane_next_begin;
|
||||
int lane_next_begin = 0;
|
||||
|
||||
/* Further lanes only needed if ours is fully filled */
|
||||
/* Overlays SIMD and integer units here for perf */
|
||||
uint32_t *afind = simd_map_lane_find(
|
||||
lane_a,
|
||||
key,
|
||||
0, /* lane modulo: 0 means until lane end */
|
||||
lane_a_begin,
|
||||
&lane_next_begin);
|
||||
uint32_t lasta = simd_map_lane_last_value(lane_a);
|
||||
char bneed = (lasta != 0) && (prev.lane_abcd_next < 3);
|
||||
if(afind) {
|
||||
ret.lane_next_begin = lane_next_begin;
|
||||
ret.lane_abcd_next = prev.lane_abcd_next + (lane_next_begin == 0);
|
||||
ret.value_location = afind;
|
||||
ret.level = level;
|
||||
ret.last_found_lane_val = lasta;
|
||||
return ret;
|
||||
}
|
||||
if(bneed) {
|
||||
uint32_t *bfind = simd_map_lane_find(
|
||||
lane_b,
|
||||
key,
|
||||
0, /* lane modulo: 0 means until lane end */
|
||||
0, /* non-a lanes all start from 0 */
|
||||
&lane_next_begin);
|
||||
uint32_t lastb = simd_map_lane_last_value(lane_b);
|
||||
char cneed = (lastb != 0) && (prev.lane_abcd_next < 2);
|
||||
if(bfind) {
|
||||
ret.lane_next_begin = lane_next_begin;
|
||||
ret.lane_abcd_next = prev.lane_abcd_next + (lane_next_begin == 0);
|
||||
ret.value_location = bfind;
|
||||
ret.level = level;
|
||||
ret.last_found_lane_val = lastb;
|
||||
return ret;
|
||||
}
|
||||
|
||||
if(cneed) {
|
||||
uint32_t *cfind = simd_map_lane_find(
|
||||
lane_c,
|
||||
key,
|
||||
0, /* lane modulo: 0 means until lane end */
|
||||
0, /* non-a lanes all start from 0 */
|
||||
&lane_next_begin);
|
||||
uint32_t lastc = simd_map_lane_last_value(lane_c);
|
||||
char dneed = (lastc != 0) && (prev.lane_abcd_next < 1);
|
||||
if(cfind) {
|
||||
ret.lane_next_begin = lane_next_begin;
|
||||
ret.lane_abcd_next = prev.lane_abcd_next + (lane_next_begin == 0);
|
||||
ret.value_location = cfind;
|
||||
ret.level = level;
|
||||
ret.last_found_lane_val = lastc;
|
||||
return ret;
|
||||
}
|
||||
|
||||
if(dneed) {
|
||||
uint32_t *dfind = simd_map_lane_find(
|
||||
lane_d,
|
||||
key,
|
||||
0, /* lane modulo: 0 means until lane end */
|
||||
0, /* non-a lanes all start from 0 */
|
||||
&lane_next_begin);
|
||||
uint32_t lastd = simd_map_lane_last_value(lane_d);
|
||||
char next_level = (lastd != 0);
|
||||
if(dfind) {
|
||||
ret.lane_next_begin = lane_next_begin;
|
||||
ret.lane_abcd_next = prev.lane_abcd_next + (lane_next_begin == 0);
|
||||
ret.value_location = dfind;
|
||||
ret.level = level;
|
||||
ret.last_found_lane_val = lastd;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Check to avoid next level (stop iteration) */
|
||||
if(!next_level) {
|
||||
return vmap_search_all_begin();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Next level needs checking */
|
||||
prev = vmap_search_all_begin();
|
||||
++level;
|
||||
/* prev.level = level; // unnecessary, I hand-optimized out */
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Try to search the map for the given key.
|
||||
*
|
||||
* @param map The map to search in
|
||||
* @param key The key to search for
|
||||
* @returns NULL if there is no value stored, otherwise ptr to first match with the given key.
|
||||
*/
|
||||
static inline uint32_t *search_vmap(vmap *map, uint32_t key) {
|
||||
vmap_find_res res = search_all_vmap(map, key, vmap_search_all_begin());
|
||||
return res.value_location;
|
||||
}
|
||||
|
||||
#endif /* VMAP_H */
|
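search_all_vmap() consumes 8 bits of the key per level: the low 4 bits of that byte select one of the 16 pages belonging to the level, and the next 4 bits select the lane group inside the page. A standalone sketch (not part of the diff) of that index math:

#include <assert.h>
#include <stdint.h>

int main(void) {
    uint32_t key = 0xAB; /* level 0 looks at the lowest byte of the key */
    uint32_t level = 0;
    uint32_t byt = level % 4;
    uint32_t page_no = level * 16 + ((key >> (byt * 8)) & 15); /* low nibble:  0xB == 11 */
    uint32_t lane_no = (key >> (byt * 8 + 4)) & 15;            /* high nibble: 0xA == 10 */
    assert(page_no == 11);
    assert(lane_no == 10);
    return 0;
}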