Compare commits


35 Commits

Author SHA1 Message Date
Richard Thier
219405069c better this way with assert 2025-01-27 04:41:26 +01:00
Richard Thier
544c920304 fixed a near-sure vmap bug 2025-01-27 04:26:03 +01:00
Richard Thier
c698fc55c3 vmap find implementation - untested 2025-01-27 04:21:18 +01:00
Richard Thier
673555fdfc vmap.h work-in-progress idea 2025-01-27 03:13:07 +01:00
Richard Thier
182fb69e18 refactor: pulled out simd_map_lane.h because its useful on its own 2025-01-27 03:12:22 +01:00
Richard Thier
c023882866 removed TODO 2024-10-23 21:37:53 +02:00
Richard Thier
e82468acf8 added simd_map_create_and_reserve 2024-10-23 21:30:30 +02:00
Richard Thier
968f95734d simd-map multimap search fix in the AVX2 code 2024-10-23 18:06:01 +02:00
Richard Thier
41988a0dee simd_map findall fix + buggy avx2 implementation 2024-10-23 14:20:56 +02:00
Richard Thier
ac62753820 multimap ops tests and some cleanup - seems workin" 2024-10-23 00:45:33 +02:00
Richard Thier
d219203939 simd multimap operations - first version, seem compatible with non-multi for now 2024-10-23 00:27:46 +02:00
Richard Thier
cd30b70457 simd_map remove 2024-10-22 18:52:12 +02:00
Richard Thier
ab3e80f020 simd_map_size 2024-10-22 17:39:23 +02:00
Richard Thier
c5e5993001 added simd_map first version with basic tests; no remove and no SIMD yet, but scalar 2024-10-22 15:22:22 +02:00
Richard Thier
feb0ea59e6 updated arena.h dependency to have freearena 2024-10-22 15:21:15 +02:00
Richard Thier
80bbeec568 include docs 2024-10-22 15:20:48 +02:00
Richard Thier
64a7d871c2 AVX2 implementation seems to work and is (as expected) faster than regular 2024-10-21 17:31:58 +02:00
Richard Thier
4e4c266632 RAW (can-fail) flags for optimization and non-failing implementation added 2024-10-21 14:21:34 +02:00
Richard Thier
418c8d289c restructured API for faster lookup without AVX and added a micro-optimization too 2024-10-21 13:49:16 +02:00
Richard Thier
a26b411fd4 fixed not needing double loops but fixing bug of not first finding results 2024-10-21 13:27:04 +02:00
Richard Thier
cdd9c77892 fixed missing outer loop for not finding element at first tipp + started refactor 2024-10-21 12:53:15 +02:00
Richard Thier
4ade45a655 added unordered map with api and benchmarks 2024-10-11 03:33:29 +02:00
Richard Thier
312c6f14ca pragma unroll 4 2024-10-11 02:52:07 +02:00
Richard Thier
22ed78cd0e not autoveced for some reason... 2024-10-11 02:39:48 +02:00
Richard Thier
1c41a4e106 tried auto-vectorization and simpler codes but does not happen as it says: "missed: not vectorized: no vectype for stmt, scalar_type: auint64" 2024-10-11 02:13:51 +02:00
Richard Thier
6c1adb1655 perf tests and smaller perf tunes + some experiments 2024-10-11 00:54:13 +02:00
Richard Thier
c1b4b9e97b perf test architecture 2024-10-10 22:37:21 +02:00
Richard Thier
70f9b24669 added mapmap.hpp so maybe future benchmarks can be done 2024-10-10 17:16:41 +02:00
Richard Thier
c1f0e5f1a9 reorder operations and better docs for use case simplification 2024-10-10 17:16:19 +02:00
Richard Thier
047babeb1b testing the tombstone-ing 2024-10-10 16:38:15 +02:00
Richard Thier
21351fd2b4 fixed edge-case when padding was not there and zero-terminator got overridden 2024-10-10 16:32:07 +02:00
Richard Thier
14052a8421 more meaningful tests for finding the errors: 8-length special case bug found 2024-10-10 16:06:08 +02:00
Richard Thier
79aa314352 commit retval 2024-10-10 00:04:22 +02:00
Richard Thier
5ad409f23d more test cases 2024-10-09 18:52:01 +02:00
Richard Thier
0725e0fd1c minor annoying bug fixes 2024-10-09 16:11:32 +02:00
9 changed files with 1282 additions and 91 deletions

arena.h (submodule)

@@ -1 +1 @@
-Subproject commit 3037bf6bec96b0ebc231510d308da1daece276fd
+Subproject commit dfff5028f3f7baee4e764744a656baf553bc4b70

315
main.cpp

@@ -1,22 +1,333 @@
#include <cstdio>
#include <cassert>
#include <cstring> /* strlen, strcmp used by test_basics */
#include <vector>
#include <string>
#include <chrono>
#include "amap.h"
#include "simap.h"
#include "mapmap.hpp"
#include "unomap.hpp"
#include "simd_map.h"
#include "vmap.h"
/**
* Creates keys or returns the ith key. Used for performance tests.
*
* @param i When "create" is false, we return the ith key (does not check OOB)
* @param create When true, we initialize the keystore with keys generated from 0..i indices.
* @returns The ith key when create==false, otherwise NULL (the store is just (re)initialized).
*/
inline const char *keystore(int i, bool create = false) noexcept {
static thread_local std::vector<std::string> keys;
if(!create) {
return keys[i].c_str();
} else {
keys.resize(0);
keys.reserve(i); /* was reserve(0): reserve room for the i keys pushed below */
std::string key = "k";
for(int j = 0; j < i; ++j) {
keys.push_back(key + std::to_string(j));
}
return NULL;
}
}
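/* Usage pattern (illustration, not in the original source): keystore(1000, true)
* builds keys "k0".."k999" once up-front; keystore(i) then returns the ith key
* without further allocation - main() below primes all three stores this way. */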
/**
* Creates keys or returns the ith integer key. Used for performance tests.
*
* Rem.: Generated keys are like this: i, i-1, i-2, ... 1
*
* @param i When "create" is false, we return the ith key (does not check OOB)
* @param create When true, we initialize the keystore with keys generated from 0..i indices.
* @returns The ith key when create==false, otherwise NULL.
*/
inline int *int_keystore(int i, bool create = false) noexcept {
static thread_local std::vector<int> keys;
if(!create) {
return &(keys[i]);
} else {
keys.resize(0);
keys.reserve(i); /* was reserve(0): reserve room for the i keys pushed below */
for(int j = 0; j < i; ++j) {
keys.push_back(i - j);
}
return NULL;
}
}
/** Creates string datas or returns the ith string data. Used for performance tests. */
inline const char *datastore_int(int i, bool create = false) noexcept {
static thread_local std::vector<std::string> keys;
if(!create) {
return keys[i].c_str();
} else {
keys.resize(0);
keys.reserve(i); /* was reserve(0): reserve room for the i datas pushed below */
std::string key = "k";
for(int j = 0; j < i; ++j) {
keys.push_back(key + std::to_string(j));
}
return NULL;
}
}
/**
* Creates datas or returns the ith data. Used for performance tests.
*
* @param i When "create" is false, we return the ith data (does not check OOB)
* @param create When true, we initialize the datastore with datas generated from 0..i indices.
* @returns The ith data when create==false, otherwise NULL.
*/
inline int *datastore(int i, bool create = false) noexcept {
static thread_local std::vector<int> keys;
if(!create) {
return &(keys[i]);
} else {
keys.resize(0);
keys.reserve(i); /* was reserve(0): reserve room for the i datas pushed below */
for(int j = 0; j < i; ++j) {
keys.push_back(j);
}
return NULL;
}
}
void test_perf(amap mapdo, void *map, int max_key, const char *what) {
auto begin = std::chrono::high_resolution_clock::now();
for(int i = 0; i < max_key; ++i) {
const char *key = keystore(i);
int *data = datastore(i);
mapdo(map, AMAP_SET, key, data);
}
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("Insertion time for %d elements (%s): %.3f ms.\n", max_key, what, elapsed.count() * 1e-6);
}
void test_basics(amap mapdo, void *map) {
/* Most basics */
assert(NULL == mapdo(map, AMAP_GET, "asdf", NULL));
int i = 42;
int *iptr;
const char *chptr;
assert(NULL != mapdo(map, AMAP_SET, "meaning", &i));
assert(NULL != (iptr = (int *)mapdo(map, AMAP_GET, "meaning", NULL)));
assert(*iptr == 42);
assert(iptr == &i);
/* Delete / tombstone */
assert(NULL != mapdo(map, AMAP_SET, "meaning", NULL));
assert(NULL == (int *)mapdo(map, AMAP_GET, "meaning", NULL));
/* Check re-adding */
assert(NULL != mapdo(map, AMAP_SET, "meaning", &i));
assert(NULL != (iptr = (int *)mapdo(map, AMAP_GET, "meaning", NULL)));
assert(*iptr == 42);
assert(iptr == &i);
/* Test Erase */
assert(NULL != mapdo(map, AMAP_ERASE, NULL, NULL));
/* Check re-adding 3 new things */
assert(NULL != mapdo(map, AMAP_SET, "meaningless1", &i));
assert(NULL != mapdo(map, AMAP_SET, "meaning2", &i));
const char *helloworld = "Hello world!";
assert(NULL != mapdo(map, AMAP_SET, "hello", (char *)helloworld)); /* ugly cast... */
assert(NULL != (chptr = (const char *)mapdo(map, AMAP_GET, "hello", NULL)));
assert(strlen(chptr) == strlen(helloworld));
assert(strcmp(chptr, helloworld) == 0);
assert(NULL != (iptr = (int *)mapdo(map, AMAP_GET, "meaning2", NULL)));
assert(*iptr == 42);
assert(iptr == &i);
assert(NULL != (iptr = (int *)mapdo(map, AMAP_GET, "meaningless1", NULL)));
assert(*iptr == 42);
assert(iptr == &i);
/* Check the case where we have same 8-long prefix for multiple and they should be different */
int long_1 = 1;
int long_2 = 2;
int long_3 = 3;
assert(NULL != mapdo(map, AMAP_SET, "very_long_test_key_1", &long_1));
assert(NULL != mapdo(map, AMAP_SET, "very_long_test_key_2", &long_2));
assert(NULL != mapdo(map, AMAP_SET, "very_long_test_key_3", &long_3));
assert(NULL != (iptr = (int *)mapdo(map, AMAP_GET, "very_long_test_key_1", NULL)));
assert(*iptr == 1);
assert(NULL != (iptr = (int *)mapdo(map, AMAP_GET, "very_long_test_key_2", NULL)));
assert(*iptr == 2);
assert(NULL != (iptr = (int *)mapdo(map, AMAP_GET, "very_long_test_key_3", NULL)));
assert(*iptr == 3);
}

void test_stringmaps(int perf_test_i) {
/* Basic tests */
simap_instance si = simap_create();
test_basics(simap, &si);
mapmap_instance mi = mapmap_create();
test_basics(mapmap, &mi);
unomap_instance umi = unomap_create();
test_basics(unomap, &umi);
/* Performance tests */
int i = perf_test_i;
test_perf(mapmap, &mi, i, "std::map");
test_perf(simap, &si, i, "simap");
test_perf(unomap, &umi, i, "std::unordered_map");
}
void test_simd_map_basics() {
/* Empty free tests */
simd_map smap = simd_map_create();
simd_map_free(&smap);
/* Re-create */
smap = simd_map_create();
/* Empty search */
assert(simd_map_find(&smap, 42) == NULL);
assert(simd_map_size(&smap) == 0);
/* Insertions */
assert(simd_map_set(&smap, 40, 0) != 0);
assert(simd_map_set(&smap, 41, 1) != 0);
assert(simd_map_set(&smap, 42, 2) != 0);
assert(simd_map_size(&smap) == 3);
/* Searches */
assert(*simd_map_find(&smap, 40) == 0);
assert(*simd_map_find(&smap, 41) == 1);
assert(*simd_map_find(&smap, 42) == 2);
/* Test erase */
simd_map_erase(&smap);
assert(simd_map_find(&smap, 42) == NULL);
assert(simd_map_set(&smap, 42, 2) != 0);
assert(*simd_map_find(&smap, 42) == 2);
/* Test a bit more */
int cnt = 100;
for(int i = 0; i < cnt; ++i) {
assert(simd_map_set(&smap, i, (cnt - i)) != 0);
}
assert(simd_map_size(&smap) == 100); /* 42->2 should get overwritten */
for(int i = 0; i < cnt; ++i) {
assert(*simd_map_find(&smap, i) == (uint32_t)(cnt - i));
}
/* Test removal */
assert(simd_map_remove(&smap, 41) != 0);
assert(simd_map_remove(&smap, 43) != 0);
assert(simd_map_find(&smap, 41) == NULL);
assert(simd_map_find(&smap, 43) == NULL);
assert(simd_map_find(&smap, 42) != NULL);
assert(simd_map_find(&smap, 99) != NULL);
assert(simd_map_find(&smap, 98) != NULL);
assert(simd_map_size(&smap) == 98);
/* Test multimap operations */
assert(simd_map_multi_set(&smap, 42, 42) != 0);
assert(simd_map_multi_set(&smap, 42, 43) != 0);
assert(simd_map_find(&smap, 42) != NULL);
assert(*simd_map_find(&smap, 42) != 42);
simd_map_find_res res1 = simd_map_find_all(&smap, 42, simd_map_find_all_begin());
assert(res1.value_location != NULL);
assert(*(res1.value_location) == (uint32_t)(cnt - 42));
simd_map_find_res res2 = simd_map_find_all(&smap, 42, res1);
assert(*(res2.value_location) == 42);
simd_map_find_res res3 = simd_map_find_all(&smap, 42, res2);
assert(res3.value_location != NULL);
assert(*(res3.value_location) == 43);
/* Test filled-free */
simd_map_free(&smap);
}
void test_simd_map_perf(int max_key) {
#ifdef __AVX2__
puts("...Perf testing simd_map with AVX2...");
#elif defined(__SSE2__)
puts("...Perf testing simd_map with SSE2...");
#endif
// XXX: This way we would measure the wrong thing:
// simd_map smap = simd_map_create();
// Why? Because we want to measure insertion, not the first allocation!
simd_map smap = simd_map_create_and_reserve();
auto begin = std::chrono::high_resolution_clock::now();
for(int i = 0; i < max_key; ++i) {
int *key = int_keystore(i);
int *data = datastore(i);
simd_map_set(&smap, *key, *data);
}
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("Insertion time for %d elements (simd_map): %.3f ms.\n", max_key, elapsed.count() * 1e-6);
simd_map_free(&smap);
}
void test_int_unordered_map(int max_key) {
std::unordered_map<int, int> smap;
auto begin = std::chrono::high_resolution_clock::now();
for(int i = 0; i < max_key; ++i) {
int *key = int_keystore(i);
int *data = datastore(i);
smap[*key] = *data;
}
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("Insertion time for %d elements (std::unordered_map<int,int>): %.3f ms.\n", max_key, elapsed.count() * 1e-6);
}
void test_int_std_map(int max_key) {
std::map<int, int> smap;
auto begin = std::chrono::high_resolution_clock::now();
for(int i = 0; i < max_key; ++i) {
int *key = int_keystore(i);
int *data = datastore(i);
smap[*key] = *data;
}
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("Insertion time for %d elements (std::map<int,int>): %.3f ms.\n", max_key, elapsed.count() * 1e-6);
}
void test_intmaps(int perf_test_i) {
/* Basic tests */
// test_simd_map_basics();
test_int_std_map(perf_test_i);
test_int_unordered_map(perf_test_i);
test_simd_map_perf(perf_test_i);
}
int main() {
int perf_test_i = 1000;
/* Prepare data stores */
keystore(perf_test_i, true);
int_keystore(perf_test_i, true);
datastore(perf_test_i, true);
/* Tests */
puts("");
puts("Integer maps...");
puts("");
test_intmaps(perf_test_i);
puts("");
puts("String maps...");
puts("");
test_stringmaps(perf_test_i);
puts("");
puts("...done!");
return 0;
}
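Note: amap.h itself is unchanged in this compare, so the diff does not show it. Judging from the call sites above (test_perf, test_basics) and the map headers below, its interface is presumably along these lines - a sketch inferred from usage, not the actual header:

/* Hypothetical reconstruction of amap.h - inferred from usage, not part of this diff */
#ifndef AMAP_H
#define AMAP_H

/* The three operations every benchmarked map must support */
enum AMAP_OP { AMAP_GET, AMAP_SET, AMAP_ERASE };
typedef enum AMAP_OP AMAP_OP;

/* Single entry point per map implementation (simap, mapmap, unomap, ...):
* GET returns the stored pointer or NULL; SET stores ptr under key and
* returns non-NULL on success; ERASE empties the map (key and ptr unused). */
typedef void *(*amap)(void *amap_instance, AMAP_OP op, const char *key, void *ptr);

#endif /* AMAP_H */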

Makefile

@@ -2,3 +2,11 @@ debug:
g++ main.cpp -g -Wall -o main
release:
g++ main.cpp -O2 -Wall -o main
debug-avx2:
g++ main.cpp -g -mavx2 -Wall -o main
release-avx2:
g++ main.cpp -mavx2 -O2 -Wall -o main
release-avx2-debug:
g++ main.cpp -g -mavx2 -O2 -Wall -o main
release-avx2-asm:
g++ main.cpp -S -fopt-info-vec-missed -masm=intel -mavx2 -O2 -Wall -o main

39
mapmap.hpp Normal file

@@ -0,0 +1,39 @@
#ifndef MAPMAP_HPP
#define MAPMAP_HPP
#include <map>
#include <cassert>
#include <memory>
#include <cstdint> /* uint8_t for the ERASE sentinel */
#include "amap.h" /* AMAP_OP - assumed to be include-guarded like the other headers */
struct mapmap_instance {
std::map<const char *, void *> m;
};
static inline mapmap_instance mapmap_create() {
mapmap_instance ret;
return ret;
}
static inline void* mapmap(void *amap_instance, AMAP_OP op, const char *key, void *ptr) {
mapmap_instance *map = (mapmap_instance *) amap_instance;
if(op == AMAP_GET) {
try {
return map->m[key];
} catch(...) {
return ptr;
}
} else if(op == AMAP_SET) {
try {
map->m[key] = ptr;
return map; // non-null
} catch(...) {
return NULL;
}
} else { // if(op == AMAP_ERASE) {
assert(op == AMAP_ERASE);
map->m = std::map<const char *, void *>();
return (void *)((uint8_t *)(NULL) - 1); // non-NULL "success" sentinel
}
}
#endif // MAPMAP_HPP

277
simap.h

@@ -1,35 +1,73 @@
#ifndef SIMAP_H
#define SIMAP_H
#include <stddef.h> /* NULL */
#include <stdint.h> /* uint8_t, uint32_t, ... */
#include <string.h> /* strcmp, strncpy etc. */
#include <assert.h> /* assert */
#include "amap.h"
#include "arena.h/arena.h"
/* Possible (non-AVX, but alike) optimization, but means there can be lookup / insert errors (very rarely)
#define SIMAP_RAW
*/
/* Perf trickery */
/* XXX: Enabling AVX also enables rare errors for speed gain! See above. */
#ifdef __AVX2__
#define SIMAP_RAW
#include <immintrin.h>
#endif
/* I have no idea what MSVC has instead... */
#ifdef _MSC_VER
#define SM_THREAD_LOCAL __declspec(thread)
#define SM_PREFETCH(x)
#define SM_LIKELY(x)
#define SM_UNLIKELY(x)
#define SM_NOINLINE __declspec(noinline)
#define SM_ALWAYS_INLINE __forceinline
#else
#define SM_THREAD_LOCAL __thread
#define SM_PREFETCH(x) __builtin_prefetch(x)
#define SM_LIKELY(x) __builtin_expect((x),1)
#define SM_UNLIKELY(x) __builtin_expect((x),0)
#define SM_NOINLINE __attribute__ ((noinline))
#define SM_ALWAYS_INLINE __attribute__ ((always_inline))
#endif
typedef uint64_t auint64 __attribute__ ((__aligned__(8)));
/** The first 8 characters are stored as uint64_t for fast checks */
union simap_c64 {
char str8[8];
auint64 u64;
};
typedef union simap_c64 simap_c64;
/** This is to ensure 8byte storage of pointers (with possible padding) */
union simap_ptr64 {
void *ptr;
auint64 u64;
};
typedef union simap_ptr64 simap_ptr64;
struct elem_nonkey_prefix {
/** The value (ptr) */
simap_ptr64 value;
/** Previous element index from base (full offset) */
uint32_t previndex;
/** Next element index from base (full offset) */
uint32_t nextindex;
};
typedef struct elem_nonkey_prefix elem_nonkey_prefix;
/**
* The per-element storage layout
*
* The memory layout at *base is as follows:
*
* 8 byte:
* - void* value;
* - [?] optional padding (only for non-64 bit pointer machines)
*
* 8 byte:
* - uint32_t previndex;
@@ -39,20 +77,35 @@ typedef union simap_ptr64 simap_ptr64;
* - char name[]; // inline stored
* - padding (divisible by 8)
*
* Because of this a lookup is basically strstr-like with 8byte steps,
* with few character names zero-padded in the search term parameter,
* and if you want, check extra validity by jumping back&forth in it.
*/
struct elem_prefix {
/** Value and meta-data - divisible by 8bytes */
elem_nonkey_prefix nonkey_prefix;
/** The prefix of the key - divisible by 8bytes padded string after this (inlined) */
simap_c64 key_prefix;
};
typedef struct elem_prefix elem_prefix;
/**
* A "peasantly" map data structure backed by arena.h - basically a toy data structure...
*
* This is very simple, no trees, no hashes, just (hopefully) autovectorized linear lookup.
* Inserting NULLs to keys happens through tombstoning unless erase happens and we never
* shrink memory so please do not add a lot of things then remove a lot of things.
*
* In AVX2 mode, we do heuristics against data being the key so it's not "sure" and can fail...
* XXX: So beware that this CAN FAIL for AVX2 build flags - it just "statistically most often works"!
*/
struct simap_instance {
arena a;
uint32_t end;
uint32_t prev_usage_end; /* previous usage_end or -1 if no previous exists... in bytes!!! */
uint32_t usage_end; /* in bytes!!! */
elem_prefix *base;
};
typedef struct simap_instance simap_instance;
@@ -62,17 +115,101 @@ static inline simap_instance simap_create() {
ret.end = 0;
ret.prev_usage_end = (uint32_t) -1;
ret.usage_end = 0;
ret.base = (elem_prefix*)(((auint64*) aralloc(&(ret.a), sizeof(auint64), sizeof(auint64), 1)) /* addr divisible by 8 */
+ 1); /* First really addressable thing */
return ret;
}
static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void *ptr);

/** Gets padding bytes for a size to be padded to divisible alignment */
static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) {
/* Would ensure returned value divisible by alignment */
/* return (size + alignment - 1) / alignment * alignment; */
/* Basically same as: */
/* return (alignment - (size % alignment)) % alignment; */
/* Subtracting size leads to padding */
return ((size + alignment - 1) / alignment) * alignment - size;
}

/** Gets padded address - or same address if divisible by alignment */
static inline void *get_padded(void *ptr, int alignment) {
/* return (alignment - (ptr % alignment)) % alignment; */
return (void*)((ptrdiff_t)((uint8_t *)ptr + alignment - 1) / alignment * alignment);
}
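/* Example values for the two helpers above (illustration, not in the original header):
* get_size_padding(20, 8) == 4 - since 20 + 4 = 24 is the next multiple of 8
* get_size_padding(16, 8) == 0 - already divisible by 8
* get_padded((void *)13, 8) == (void *)16 - the address rounded up to the boundary */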
static inline SM_ALWAYS_INLINE auint64 *make_tipp(auint64 *base, auint64 *tip, auint64 prefix, auint64 *end) {
#ifdef __AVX2__
/* See:
*
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX&text=movemask
* https://chryswoods.com/vector_c++/part2.html
* https://blog.triplez.cn/posts/avx-avx2-learning-notes/
* https://github.com/Triple-Z/AVX-AVX2-Example-Code
* https://en.algorithmica.org/hpc/simd/masking/
* https://stackoverflow.com/questions/31089502/aligned-and-unaligned-memory-access-with-avx-avx2-intrinsics
*/
/* Step over previous tipp and search until the AVX2 alignment needs */
/* AVXs 256 bit = 32 byte so multiple of 32 needed here */
/* TODO: Probably I can change "32" here into something bigger for non-AVX small arrays! */
auint64 *neotip = (auint64 *) get_padded(++tip, 32);
while((tip < neotip) && (*tip != prefix)) ++tip;
if(tip < neotip) return tip;
/* Prepare an AVX 256bit search register: 4 uint64_t */
__m256i sreg = _mm256_set1_epi64x(prefix);
while(tip < end) {
/* This needs 32 byte alignment here that we do above */
/* The tipp register: 4 uint64_t */
__m256i treg = _mm256_load_si256((__m256i *) tip);
/* Check equality and return proper tip address for first found */
__m256i m = _mm256_cmpeq_epi64(sreg, treg); /* Needs AVX2 */
uint32_t mask = (uint32_t) _mm256_movemask_pd((__m256d) m);
/* Try next tip, processes 256 bits per loop */
tip += 4; /* 4x64 bit */
/* One of the links used __builtin_ctz(mask), but I
* think that was a bad implementation which only finds the
* last search result!
*
* __builtin_clz returns leading zeroes of the mask
* and the mask has 4 bits at most and each show if
* 1..4 places of AVX reg compared properly to the
* given prefix value (4x 64 bit comparisons happen).
* Subtracting from 31 we subtract either 28,29,30,31
* and thus resulting in 3, 2, 1, 0 (right offsets).
*
* If the mask got all zero, there is nothing found,
* otherwise its the tipp + offset we calculated. */
if(SM_UNLIKELY(mask != 0)) {
int offset = (31 - __builtin_clz(mask));
/* -4 because this is the unlikely scenario and we already incremented! */
return tip - 4 + offset;
}
}
/* Not found case */
return tip;
#endif
#ifdef SIMAP_RAW
#pragma GCC unroll 16
while((++tip < end) && (*tip != prefix));
return tip;
#endif
/* XXX: This only works because of (***): reading a -1 nextindex makes tip >= end for sure, both here and in the loop below */
elem_nonkey_prefix *pre = (elem_nonkey_prefix *)((uint8_t *)tip - sizeof(elem_nonkey_prefix));
tip = (auint64 *) ((uint8_t *)base + pre->nextindex + sizeof(elem_nonkey_prefix));
#pragma GCC unroll 16
while((tip < end) && (*tip != prefix)) {
pre = (elem_nonkey_prefix *)((uint8_t *)tip - sizeof(elem_nonkey_prefix));
tip = (auint64 *) ((uint8_t *)base + pre->nextindex + sizeof(elem_nonkey_prefix));
}
return tip;
}
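/* Worked example for the movemask + clz logic above (illustration, not in the original header):
* if only the 64-bit slot at index 2 compared equal, then mask == 0b0100,
* __builtin_clz(mask) == 29 and 31 - 29 == 2; since tip was already advanced
* by 4, the match sits at tip - 4 + 2. A zero mask means no slot matched. */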
static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char *key) {
/* Construct prefix (fast-key) */
@@ -81,28 +218,36 @@ static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char
simap_c64 prefix {0};
size_t prefixlen = is_smallkey ? keylen : 8;

/* Ignore warning because we know what we are doing here... */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-truncation"
strncpy(prefix.str8, key, prefixlen);
#pragma GCC diagnostic pop

/* Construct keyremains - might point to the \0 terminator only if smallkey or 8 bytes exactly */
const char *keyremains = key + prefixlen;

/* Lookup prefix (fast-key) - hopefully this gets vectorized (should be)!!! */
auint64 *base = (auint64 *) (map->base);
auint64 *end = (auint64 *)((uint8_t *)base + (map->usage_end));
auint64 *tipp = make_tipp(base, base, prefix.u64, end);
while(tipp < end) { /* XXX: (***) */
assert((*tipp == prefix.u64));
/* First check the remains of the string (only if needed) */
if(!is_smallkey) {
char *tippremains = (char *)((uint8_t *)tipp + sizeof(uint64_t));
if(strcmp(keyremains, tippremains) != 0) {
tipp = make_tipp(base, tipp, prefix.u64, end);
continue;
}
}
simap_ptr64 *ptr = (simap_ptr64 *)((uint8_t *) (tipp - 2));
#ifdef SIMAP_RAW
/* Check back & forth (jump validation) */
uint32_t previ = *((uint32_t *)(tipp - 1));
if(previ == (uint32_t) -1) {
@@ -113,37 +258,30 @@ static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char
+ sizeof(simap_ptr64)
+ sizeof(uint32_t));
auint64 *retipp = (auint64 *)(((uint8_t *)base + prevnexi)
+ sizeof(simap_ptr64) + sizeof(uint32_t) +
+ sizeof(uint32_t));
if(retipp != tipp) {
tipp = make_tipp(base, tipp, prefix.u64, end);
continue;
}
#endif /* SIMAP_RAW */
/* Can have the (statistically checked) pointer */
return ptr;
}
/* Haven't found anything */
return NULL;
}
/** Returns the size of the storage needed for the given key */
static inline uint32_t simap_elem_storage_size(const char *key) {
uint32_t keysize = strlen(key);
uint32_t padding = get_size_padding(keysize, 8);
/* XXX: Keys of exactly 8, 16, ... bytes need a zero terminator too (would be overridden without this) */
padding += ((keysize % 8) == 0) ? 8 : 0;
return keysize +
sizeof(simap_ptr64) +
@@ -152,11 +290,11 @@ static inline uint32_t simap_elem_storage_size(const char *key) {
padding;
}
/** Force-add the (key,value) to the end of the map. Use this if you prefill your map one-by-one and need speed */
static inline void *simap_force_add(simap_instance *map, const char *key, void *ptr) {
uint32_t storage_needed = simap_elem_storage_size(key);
assert((storage_needed % 8) == 0);
if(SM_UNLIKELY(map->end - map->usage_end < storage_needed)) {
/* Need storage */
aralloc(&(map->a),
sizeof(uint8_t)/*esize*/,
@@ -172,36 +310,39 @@ static inline void *simap_force_add_internal(simap_instance *map, const char *ke
/* Create first 8 char encoding (this ensures endianness and all such stuff) */
simap_c64 first8 {0};
uint32_t keylen = strlen(key);

/* Ignore warning because we know what we are doing here... */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-truncation"
strncpy(first8.str8, key, (keylen < 8) ? keylen : 8);
#pragma GCC diagnostic pop

uint32_t usi = map->usage_end;
uint32_t previ = map->prev_usage_end;

/* 8byte: Save data ptr */
simap_ptr64 *data = (simap_ptr64 *)((uint8_t *)(map->base) + usi);
data->ptr = ptr;

/* 8byte: Save link to previous and next */
uint32_t *usprev = (uint32_t *)((uint8_t *)(map->base) + usi +
sizeof(simap_ptr64));
*usprev = previ;
*(usprev + 1) = (uint32_t) -1; /* XXX: (***): ensures the "not < end" here! */

/* 8byte: First 8 char */
simap_c64 *start_str = (simap_c64 *)(usprev + 2);
*start_str = first8;

/* Remaining bytes */
if(keylen >= 8) {
/* uint32_t key_remains = keylen - 8; */
char *rem_str = (char *)(start_str + 1);
strcpy(rem_str, key + 8);
}
/* XXX: The "padding" gets automagically added by the movement of the arena here (by junk bytes)! */

/* Update previous with linkage */
if(SM_LIKELY(previ != (uint32_t)-1)) {
uint32_t *prevnex = (uint32_t *)((uint8_t *)(map->base) + previ +
sizeof(simap_ptr64) +
sizeof(uint32_t));
@@ -235,9 +376,10 @@ static inline void *simap_force_add_internal(simap_instance *map, const char *ke
static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void *ptr) {
simap_instance *map = (simap_instance *) amap_instance;
if(op == AMAP_ERASE) {
map->prev_usage_end = (uint32_t) -1;
map->usage_end = 0;
return (void *)((uint8_t *)(NULL) - 1); /* non-NULL "success" sentinel */
}
/* Search for the key - also needed for SET in order to "re-set" */
@@ -248,12 +390,13 @@ static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void
} else {
assert(op == AMAP_SET);
if(!found) {
/* Add as new */
return simap_force_add(map, key, ptr);
} else {
/* Just overwrite */
found->ptr = ptr;
return (void *) found;
}
}
}

268
simd_map.h Normal file

@@ -0,0 +1,268 @@
#ifndef SIMD_MAP
#define SIMD_MAP
#include <stddef.h> /* NULL */
#include <stdint.h> /* uint32_t, ... */
#include <assert.h>
#include "arena.h/arena.h"
#include "simd_map_lane.h"
/* I have no idea what MSVC has instead... */
#ifdef _MSC_VER
#define SM_THREAD_LOCAL __declspec(thread)
#define SM_PREFETCH(x)
#define SM_LIKELY(x)
#define SM_UNLIKELY(x)
#define SM_NOINLINE __declspec(noinline)
#define SM_ALWAYS_INLINE __forceinline
#else
#define SM_THREAD_LOCAL __thread
#define SM_PREFETCH(x) __builtin_prefetch(x)
#define SM_LIKELY(x) __builtin_expect((x),1)
#define SM_UNLIKELY(x) __builtin_expect((x),0)
#define SM_NOINLINE __attribute__ ((noinline))
#define SM_ALWAYS_INLINE __attribute__ ((always_inline))
#endif
struct simd_map {
arena a;
simd_map_lane *lanes;
uint32_t end; /* in lanes */
uint32_t usage_end; /* in lanes!!! */
int lane_modulo; /* [0..SML_LANE_SPAN) */
};
typedef struct simd_map simd_map;
/** Create a simd map instance */
static inline SM_ALWAYS_INLINE simd_map simd_map_create() {
simd_map ret;
ret.a = newarena((ptrdiff_t)1 << 33);
ret.end = 0;
ret.usage_end = 0;
ret.lanes = (simd_map_lane*)(aralloc(&(ret.a), sizeof(simd_map_lane), sizeof(simd_map_lane), 1)) /* aligned! */
+ 1; /* First really addressable thing */
ret.lane_modulo = 0;
return ret;
}
/** Creates a simd map instance and pre-reserve space for a few elements */
static inline SM_ALWAYS_INLINE simd_map simd_map_create_and_reserve();
/** Free all resources held by the map. Returns 0 on errors. */
static inline SM_ALWAYS_INLINE char simd_map_free(simd_map *map) {
return freearena(&(map->a));
}
/** The result of the find_all(..) operation */
struct simd_map_find_res {
/** The found location - or NULL when the key was not found */
uint32_t *value_location;
/** Meta-data for continuation of the search */
uint32_t lane_next;
/** Meta-data for continuation of the search */
int lane_next_begin;
};
typedef struct simd_map_find_res simd_map_find_res;
/** Create the value for starting a find_all call */
static inline simd_map_find_res simd_map_find_all_begin() {
simd_map_find_res ret;
ret.value_location = NULL;
ret.lane_next = 0;
ret.lane_next_begin = 0;
return ret;
}
/**
* Useful for multimap-like operations to find multiple mappings for the same key.
*
* @param map The map
* @param key The key to search
* @param prev The previous result - or simd_map_find_all_begin() to find the first / start lookup!
* @returns The found pointer / location (if any) and if location was non-NULL meta-data so we can search further same-keys!
*/
static inline SM_ALWAYS_INLINE simd_map_find_res simd_map_find_all(simd_map *map, uint32_t key, simd_map_find_res prev) {
simd_map_find_res ret;
/* Process most lanes */
/* Do not process last element (-1) because of last incomplete lane */
if(map->usage_end > 0) for(uint32_t i = prev.lane_next; i < map->usage_end - 1; ++i) {
uint32_t *found = simd_map_lane_find(
&(map->lanes[i]),
key,
0,
prev.lane_next_begin,
&(ret.lane_next_begin)); /* XXX: Fills part of retval! */
/* Needed so only the currently found are ignored in find_all(..) */
prev.lane_next_begin = 0;
if(found) {
ret.value_location = found;
ret.lane_next = i + (ret.lane_next_begin == 0);
return ret;
}
}
/* Process last lane - with a modulo lane */
if((map->usage_end > 0) && (prev.lane_next < map->usage_end)) {
uint32_t *found = simd_map_lane_find(
&(map->lanes[map->usage_end - 1]),
key,
map->lane_modulo,
prev.lane_next_begin,
&(ret.lane_next_begin)); /* XXX: Fills part of retval! */
/* Needed so only the currently found are ignored in find_all(..) */
prev.lane_next_begin = 0;
if(found) {
ret.value_location = found;
ret.lane_next = (map->usage_end - 1) + (ret.lane_next_begin == 0);
return ret;
}
}
/* Not found */
ret = simd_map_find_all_begin();
return ret;
}
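/* Typical multimap iteration with find_all (usage sketch, not part of the original header):
*
* simd_map_find_res it = simd_map_find_all_begin();
* while((it = simd_map_find_all(&map, key, it)).value_location != NULL) {
* ... use *it.value_location ...
* }
*
* The loop terminates because a miss resets the result to simd_map_find_all_begin(). */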
/** Returns the value location for this key in the map - or NULL if the key does not exist. */
static inline uint32_t *simd_map_find(simd_map *map, uint32_t key) {
simd_map_find_res begin = simd_map_find_all_begin();
simd_map_find_res fires = simd_map_find_all(map, key, begin);
return fires.value_location;
}
/**
* Insert without checking that the value have been already added or not.
*
* Useful for multimap operation or if you know the key has never been added before (faster)!
*
* @param map The map
* @param key The key to insert
* @param value The value for this key to insert
* @returns 0 on errors, otherwise 1.
*/
static inline SM_ALWAYS_INLINE char simd_map_multi_set(simd_map *map, uint32_t key, uint32_t value) {
/* Handle storage growth needs. */
uint32_t storage_needed = (map->lane_modulo == 0) ? 1 : 0;
if(SM_UNLIKELY(map->end - map->usage_end < storage_needed)) {
void *allret = aralloc(&(map->a),
sizeof(simd_map_lane)/* esize */,
1 /* align - should be sizeof(simd_map_lane) but should be aligned here as-is already! */,
storage_needed); /* ecount */
/** Return early with error but no state changes if we cannot allocate! */
if(SM_UNLIKELY(!allret)) {
return 0;
}
/* Administer end offset */
map->end += storage_needed;
}
/* Administer usage end offset, separate from end because erase / shrink ops possible */
map->usage_end += storage_needed;
/* Always force-insert into the last lane at lane_modulo location */
simd_map_lane *lane = &(map->lanes[map->usage_end - 1]);
lane->keys[map->lane_modulo] = key;
lane->values[map->lane_modulo] = value;
/* Update lane modulo */
map->lane_modulo = (map->lane_modulo + 1) % SML_LANE_SPAN;
return 1;
}
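/* Growth illustration (not in the original): inserting 9 keys into an empty map
* allocates lane 0 on the 1st insert (usage_end 0 -> 1, lane_modulo cycling 1..7 -> 0)
* and lane 1 on the 9th insert, ending with usage_end == 2 and lane_modulo == 1. */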
/**
* Returns 0 on errors; otherwise 1 when added as new, or 2 when an already existing mapping got overwritten
*/
static inline char simd_map_set(simd_map *map, uint32_t key, uint32_t value) {
uint32_t *found = simd_map_find(map, key);
if(!found) {
return simd_map_multi_set(map, key, value);
} else {
/* Overwrite already existing mapping */
*found = value;
return 2;
}
}
/** Empties the map - this does not free resources, just makes it reusable! */
static inline SM_ALWAYS_INLINE void simd_map_erase(simd_map *map) {
map->usage_end = 0;
map->lane_modulo = 0;
}
/** Returns count of elements in the given simd_map */
static inline SM_ALWAYS_INLINE size_t simd_map_size(simd_map *map) {
return (map->usage_end > 0) ?
(((size_t)(map->usage_end) - 1) * SML_LANE_SPAN + map->lane_modulo) :
0;
}
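/* Example (illustration): after the 100-element loop in main.cpp there are 13 lanes
* in use with lane_modulo == 4, so size == (13 - 1) * 8 + 4 == 100. */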
/** Returns TRUE when map is empty and false otherwise - faster than simd_map_size(..) */
static inline SM_ALWAYS_INLINE char simd_map_is_empty(simd_map *map) {
return (map->usage_end == 0);
}
/** Returns the location of the last inserted value in the map - you must ensure the map has elements! */
static inline SM_ALWAYS_INLINE uint32_t *simd_map_last_location(simd_map *map) {
return (map->lane_modulo > 0) ?
&(map->lanes[map->usage_end - 1].values[map->lane_modulo - 1]) :
&(map->lanes[map->usage_end - 1].values[SML_LANE_SPAN - 1]);
}
/**
* Removes the found location from the map.
*
* Most users should prefer simd_map_remove instead. This is an unchecked operation!
*
* This must be called right after a find(..) or find_all(..) operation,
* because the pointer can get invalidated (for example by erase or remove).
*
* @param map The map to remove from
* @param value_location The location returned by find(..) or find_all(..) and is not yet invalidated
*/
static inline SM_ALWAYS_INLINE void simd_map_remove_ptr(simd_map *map, uint32_t *value_location) {
/* Overwrite with the last key-value */
uint32_t *key_location = simd_map_lane_key_location(value_location);
uint32_t *last_value_location = simd_map_last_location(map);
uint32_t *last_key_location = simd_map_lane_key_location(last_value_location);
*value_location = *last_value_location;
*key_location = *last_key_location;
/* Shrink the data structure */
if(map->lane_modulo > 0) {
--(map->lane_modulo);
} else {
map->lane_modulo = SML_LANE_SPAN - 1;
}
}
/** Remove the given key from the map so its not stored anymore. Returns 1 when found and removed, 0 otherwise. */
static inline int simd_map_remove(simd_map *map, uint32_t key) {
if(SM_UNLIKELY(map->usage_end == 0)) return 0;
uint32_t *found = simd_map_find(map, key);
if(found) {
simd_map_remove_ptr(map, found);
return 1;
}
return 0;
}
/** Creates a simd map instance and pre-reserve space for a few elements */
static inline SM_ALWAYS_INLINE simd_map simd_map_create_and_reserve() {
simd_map smap = simd_map_create();
simd_map_set(&smap, 42, 42);
simd_map_erase(&smap); // Resets the map, but keeps memory reserved!
return smap;
}
#endif

147
simd_map_lane.h Normal file

@@ -0,0 +1,147 @@
#ifndef SIMD_MAP_LANE_H
#define SIMD_MAP_LANE_H
#include <stdint.h> /* uint32_t */
/*
* Building blocks for SIMD-optimized (hash-)map buckets.
*/
/* SIMD support */
#ifdef __AVX2__
#include <immintrin.h>
#endif
#ifdef __SSE2__
#include <immintrin.h>
#endif
#ifdef _MSC_VER
#define SML_THREAD_LOCAL __declspec(thread)
#define SML_PREFETCH(x)
#define SML_LIKELY(x)
#define SML_UNLIKELY(x)
#define SML_NOINLINE __declspec(noinline)
#define SML_ALWAYS_INLINE __forceinline
#else
#define SML_THREAD_LOCAL __thread
#define SML_PREFETCH(x) __builtin_prefetch(x)
#define SML_LIKELY(x) __builtin_expect((x),1)
#define SML_UNLIKELY(x) __builtin_expect((x),0)
#define SML_NOINLINE __attribute__ ((noinline))
#define SML_ALWAYS_INLINE __attribute__ ((always_inline))
#endif
/* 32 byte = 256 bits = (8 * 32bit) optimized for AVX2 */
#define SML_LANE_SPAN 8
/** Grouped together keys-values to support SIMD more this way (also became a single cache line this way) */
struct simd_map_lane {
uint32_t keys[SML_LANE_SPAN];
uint32_t values[SML_LANE_SPAN];
};
typedef struct simd_map_lane simd_map_lane;
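/* Sanity check of the "single cache line" claim above (illustration; assumes 64-byte
* cache lines): 8 keys * 4 bytes + 8 values * 4 bytes == 64 bytes, so with C11 one
* could add: _Static_assert(sizeof(struct simd_map_lane) == 64, "one cache line"); */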
/** Returns the last value of the lane - usable to see if this lane is fully filled with data or not! */
static inline SML_ALWAYS_INLINE uint32_t simd_map_lane_last_value(simd_map_lane *map_lane) {
return map_lane->values[SML_LANE_SPAN - 1];
}
/**
* Returns if this key is stored in the given map LANE or not - returns NULL if not found.
*
* Rem.: The lane_begin and lane_next_begin parameters are used for reentrant multisearch.
*
* @param map_lane The lane to find in.
* @param key The key to search for.
* @param lane_modulo When non-zero, the lane is only searched up to this index. Zero means all remaining elements. (mod lane length)
* @param lane_begin The lane is searched from this location(find_all). If full lane / lane prefix is needed, this should be 0.
* @param lane_next_begin This pointer will be filled on non-NULL retvals with the incremented in-lane index (with % modulus).
* @returns NULL when not found, otherwise pointer to the stored value for the key.
*/
static inline SML_ALWAYS_INLINE uint32_t *simd_map_lane_find(
simd_map_lane *map_lane,
uint32_t key,
int lane_modulo,
int lane_begin,
int *lane_next_begin) {
uint32_t *keys = map_lane->keys;
uint32_t *values = map_lane->values;
/* Hopefully this can get optimized out for the common case because of inlining */
if(lane_modulo == 0) {
#ifdef __AVX2__
/* Prepare an AVX 256bit search register: 8 uint32_t */
__m256i sreg = _mm256_set1_epi32(key);
/* The tipp register: 8 uint32_t */
__m256i treg = _mm256_load_si256((__m256i *) keys);
/* Check equality and return proper tip address for first found */
__m256i m = _mm256_cmpeq_epi32(sreg, treg); /* Needs AVX2 */
/* The 's' means "single" (float precision), and mask will have [0..7] bits set! */
uint32_t mask = (uint32_t) _mm256_movemask_ps((__m256) m);
if(SML_UNLIKELY(mask != 0)) {
/* 00000000 00000000 00000000 01000100 -> 6 */
int i = (31 - __builtin_clz(mask));
uint32_t *ptr = &values[i];
if(SML_LIKELY(lane_begin == 0)) {
/* Fast-path: Only one match per lane OR first matching in lane for this find/find_all */
*lane_next_begin = (i + 1) % SML_LANE_SPAN;
return ptr;
} else {
/* We did a find_all(..) AND there is more than one match in the lane
* and it's not the first find_all(..) on the lane in question...
*
* This might be suboptimal, but not so bad:
*
* - This at this point will search the smaller array scalar-way
* - Which sounds good - BUT!
* - This means all lanes with more than 1 data are scalar search
* - And not only that, but first simd-searched, later scalar...
*
* I guess its fine as it should happen statistically rarely anyways
*/
goto non_simd_modulo;
}
}
return NULL;
#endif
#ifdef __SSE2__
#ifndef __AVX2__
#ifndef __AVX512__
/* Prepare an SSE2 128bit search register: 4 uint32_t */
__m128i sreg = _mm_set1_epi32(key);
/* TODO: Implement */
#endif /* AVX512 */
#endif /* AVX2 */
#endif /* SSE2 */
/* Regular integer code - should have good ILP and cache locality patterns anyways */
/** Pretty hopeful this can get more easily unrolled / autovectorized */
for(int i = lane_begin; i < SML_LANE_SPAN; ++i) {
if(SML_UNLIKELY(keys[i] == key)) {
uint32_t *ptr = &values[i];
*lane_next_begin = (i + 1) % SML_LANE_SPAN;
return ptr;
}
}
return NULL;
} else {
non_simd_modulo:
for(int i = lane_begin; i < lane_modulo; ++i) {
if(SML_UNLIKELY(keys[i] == key)) {
uint32_t *ptr = &values[i];
*lane_next_begin = (i + 1) % SML_LANE_SPAN;
return ptr;
}
}
return NULL;
}
}
/** Returns the key location for a given value location */
static inline SML_ALWAYS_INLINE uint32_t *simd_map_lane_key_location(uint32_t *value_location) {
return value_location - SML_LANE_SPAN;
}
#endif /* SIMD_MAP_LANE_H */

39
unomap.hpp Normal file

@ -0,0 +1,39 @@
#ifndef UNOMAP_HPP
#define UNOMAP_HPP
#include <unordered_map>
#include <cassert>
#include <memory>
#include <cstdint> /* uint8_t for the ERASE sentinel */
#include "amap.h" /* AMAP_OP - assumed to be include-guarded like the other headers */
struct unomap_instance {
std::unordered_map<const char *, void *> m;
};
static inline unomap_instance unomap_create() {
unomap_instance ret;
return ret;
}
static inline void* unomap(void *amap_instance, AMAP_OP op, const char *key, void *ptr) {
unomap_instance *map = (unomap_instance *) amap_instance;
if(op == AMAP_GET) {
try {
return map->m[key];
} catch(...) {
return ptr;
}
} else if(op == AMAP_SET) {
try {
map->m[key] = ptr;
return map; // non-null
} catch(...) {
return NULL;
}
} else { // if(op == AMAP_ERASE) {
assert(op == AMAP_ERASE);
map->m = std::unordered_map<const char *, void *>();
return (void *)((uint8_t *)(NULL) - 1); // non-NULL "success" sentinel
}
}
#endif // UNOMAP_HPP

236
vmap.h Normal file

@ -0,0 +1,236 @@
#ifndef VMAP_H
#define VMAP_H
/*
* A virtual-memory-misusing flat-ish hashmap optimized with AVX2 (if available at compile time).
*
* Structure
*
* VMEM
* STRUCT
* INTAPI
*/
#include <stdint.h>
#include <stddef.h> /* ptrdiff_t */
#include <assert.h>
#include "simd_map_lane.h"
/* VMEM */
#ifdef _WIN32
#error "TODO: Utilize __Thread + SEH to implement lazy windows pageload zeroing"
#else
#include <sys/mman.h> /* mmap, munmap */
/** Capacity should be multiple of 4096 for full pages */
static void *vm_reserve(ptrdiff_t capacity) {
void *r = mmap(0, capacity, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
return r==MAP_FAILED ? 0 : r;
}
/** Capacity should be multiple of 4096 for full pages and related to the ptr to free */
static char vm_free(void *ptr, ptrdiff_t capacity) {
return !munmap(ptr, capacity);
}
#endif /* _WIN32 */
/* STRUCT */
struct vmap {
/* using uint8_t* here would simplify
* code except for aliasing rules */
uint32_t *data;
uint32_t count;
uint32_t max_levels;
};
typedef struct vmap vmap;
/** The result of the search_all(..) operation */
struct vmap_find_res {
/** The found location - or NULL when the key was not found */
uint32_t *value_location;
/** What 'level' depth this value was found. For multimap, but useful statistics */
uint32_t level;
/** Meta-data for continuation of the search. Tells which lane to search next A, B, C, or D */
uint32_t lane_abcd_next;
/** Meta-data for continuation of the search. In-lane position to search from next time. */
int lane_next_begin;
/** Meta-data for continuation of the search. Last value found in lastly looked lane. */
uint32_t last_found_lane_val;
};
typedef struct vmap_find_res vmap_find_res;
/* INTAPI */
static inline vmap create_vmap(uint32_t max_levels) {
vmap map{ NULL, 0, max_levels};
map.data = (uint32_t *)vm_reserve(max_levels * 16 * 4096);
return map;
}
static inline char free_vmap(vmap *map) {
map->count = 0;
return vm_free(map->data, map->max_levels * 16 * 4096);
}
/** Create the value for starting a search_all call */
static inline vmap_find_res vmap_search_all_begin() {
vmap_find_res ret;
ret.value_location = NULL;
ret.level = 0;
ret.lane_abcd_next = 0;
ret.lane_next_begin = 0;
ret.last_found_lane_val = 0;
return ret;
}
/**
* Search the map in as a multimap - that is you can search multiple equal keyed values.
* This is implemented by the result being understood also as a continuation alongside
* a way to grab the pointer to the stored value and key (simd_map_lane_key_location(val)).
*
* @param map The map to search in
* @param key The key to search for
* @param prev The previous result if you continue your search. See: vmap_search_all_begin()
* @returns Metadata + nullable ptr. See: vmap_find_res struct comments; ret.value_location
*/
static inline vmap_find_res search_all_vmap(vmap *map, uint32_t key, vmap_find_res prev) {
/* Inits as not found, can change values */
vmap_find_res ret = prev;
uint32_t level = prev.level;
/* Probably the loop exits always without this predicate being false */
while(level < map->max_levels) {
/* Rare edge-case when the last lane element was returned and we continue from it */
/* We should not try lane processing, just jump to the next level - but only if there */
/* is a next level (so the last checked lane was already filled to full capacity). */
if(prev.lane_abcd_next >= 4) {
assert(prev.last_found_lane_val != 0);
prev = vmap_search_all_begin();
++level;
/* prev.level = level; // unnecessary, I hand-optimized out */
continue;
}
/* Process 8 bits of the 32-bit circular order - so it's not radix, but similar */
uint32_t byt = level % 4;
/* Low 4 bits: page */
uint32_t page_no = (level * 16) + ((key >> (byt * 8)) & 15);
/* 1024 and not 4096 here because of uint32_t *data offset: 4096 / 4 uint32s */
uint32_t page_offset = 1024 * page_no;
/* Top 4 bits: lane group. There are 16 A-D lane-group start positions in the 4k page */
uint32_t lane_no = ((key >> (byt * 8 + 4)) & 15)
+ prev.lane_abcd_next; /* continuations start where we left off */
/* But 4096 / 4 == 1024 elements, which then divided by 16 == 64 uint32_t elems */
uint32_t lane_offset = lane_no * 64;
/* A lane has 8x32 bit keys, then 8x32 bit values. 16 uint32_t elems. */
/* So grab the A, B, C and D candidate lanes for each lane_offset. */
simd_map_lane *lane_a = (simd_map_lane *) (map->data + page_offset + lane_offset);
simd_map_lane *lane_b = lane_a + 1;
simd_map_lane *lane_c = lane_b + 1;
simd_map_lane *lane_d = lane_c + 1;
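/* Worked example of the index math above (illustration): level 0, key 0x5678,
* fresh search (prev.lane_abcd_next == 0), so byt == 0 and byte 0x78 is used:
* page_no = 0 * 16 + (0x78 & 15) = 8 -> page_offset = 1024 * 8 uint32s
* lane_no = ((0x78 >> 4) & 15) + 0 = 7 -> lane_offset = 7 * 64 uint32s
* lane_a..lane_d are then four consecutive 64-byte lanes at that offset. */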
/* Get which lane we should begin at where */
uint32_t lane_a_begin = prev.lane_next_begin;
int lane_next_begin = 0;
/* Further lanes only needed if ours is fully filled */
/* Overlays SIMD and integer units here for perf */
uint32_t *afind = simd_map_lane_find(
lane_a,
key,
0, /* lane modulo: 0 means until lane end */
lane_a_begin,
&lane_next_begin);
uint32_t lasta = simd_map_lane_last_value(lane_a);
char bneed = (lasta != 0) && (prev.lane_abcd_next < 3);
if(afind) {
ret.lane_next_begin = lane_next_begin;
ret.lane_abcd_next = prev.lane_abcd_next + (lane_next_begin == 0);
ret.value_location = afind;
ret.level = level;
ret.last_found_lane_val = lasta;
return ret;
}
if(bneed) {
uint32_t *bfind = simd_map_lane_find(
lane_b,
key,
0, /* lane modulo: 0 means until lane end */
0, /* non-a lanes all start from 0 */
&lane_next_begin);
uint32_t lastb = simd_map_lane_last_value(lane_b);
char cneed = (lastb != 0) && (prev.lane_abcd_next < 2);
if(bfind) {
ret.lane_next_begin = lane_next_begin;
ret.lane_abcd_next = prev.lane_abcd_next + (lane_next_begin == 0);
ret.value_location = bfind;
ret.level = level;
ret.last_found_lane_val = lastb;
return ret;
}
if(cneed) {
uint32_t *cfind = simd_map_lane_find(
lane_c,
key,
0, /* lane modulo: 0 means until lane end */
0, /* non-a lanes all start from 0 */
&lane_next_begin);
uint32_t lastc = simd_map_lane_last_value(lane_c);
char dneed = (lastc != 0) && (prev.lane_abcd_next < 1);
if(cfind) {
ret.lane_next_begin = lane_next_begin;
ret.lane_abcd_next = prev.lane_abcd_next + (lane_next_begin == 0);
ret.value_location = cfind;
ret.level = level;
ret.last_found_lane_val = lastc;
return ret;
}
if(dneed) {
uint32_t *dfind = simd_map_lane_find(
lane_d,
key,
0, /* lane modulo: 0 means until lane end */
0, /* non-a lanes all start from 0 */
&lane_next_begin);
uint32_t lastd = simd_map_lane_last_value(lane_d);
char next_level = (lastd != 0);
if(dfind) {
ret.lane_next_begin = lane_next_begin;
ret.lane_abcd_next = prev.lane_abcd_next + (lane_next_begin == 0);
ret.value_location = dfind;
ret.level = level;
ret.last_found_lane_val = lastd;
return ret;
}
/* Check to avoid next level (stop iteration) */
if(!next_level) {
return vmap_search_all_begin();
}
}
}
}
/* Next level needs checking */
prev = vmap_search_all_begin();
++level;
/* prev.level = level; // unnecessary, I hand-optimized out */
}
return ret;
}
/**
* Try to search the map for the given key.
*
* @param map The map to search in
* @param key The key to search for
* @returns NULL if there is no value stored, otherwise ptr to first match with the given key.
*/
static inline uint32_t *search_vmap(vmap *map, uint32_t key) {
vmap_find_res res = search_all_vmap(map, key, vmap_search_all_begin());
return res.value_location;
}
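/* Usage sketch (illustration): this header is work-in-progress and has no insert
* path yet, so lookups can only miss for now:
*
* vmap m = create_vmap(8); // reserve 8 levels of zeroed pages
* uint32_t *v = search_vmap(&m, 42); // NULL - nothing was ever stored
* char ok = free_vmap(&m); // 1 on success
*/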
#endif /* VMAP_H */