simap/simap.h

408 lines
14 KiB
C
Raw Normal View History

2024-09-29 20:52:18 +02:00
#ifndef SIMAP_H
#define SIMAP_H
2024-10-22 15:20:48 +02:00
#include <stddef.h> /* NULL */
#include <stdint.h> /* uint8_t, uint32_t, ... */
#include <string.h> /* strcmp, strncpy etc. */
#include <assert.h> /* assert */
2024-09-29 20:52:18 +02:00
#include "amap.h"
#include "arena.h/arena.h"
/* Possible (non-AVX, but alike) optimization, but means there can be lookup / insert errors (very rarely)
#define SIMAP_RAW
*/
/* Perf trickery */
/* XXX: Enabling AVX also enables rare errors for speed gain! See above. */
#ifdef __AVX2__
#define SIMAP_RAW
#include <immintrin.h>
#endif
/* I have no idea what MSVC has instead... */
#ifdef _MSC_VER
#define SM_THREAD_LOCAL __declspec(thread)
#define SM_PREFETCH(x)
#define SM_LIKELY(x)
#define SM_UNLIKELY(x)
#define SM_NOINLINE __declspec(noinline)
#define SM_ALWAYS_INLINE __forceinline
#else
#define SM_THREAD_LOCAL __thread
#define SM_PREFETCH(x) __builtin_prefetch(x)
#define SM_LIKELY(x) __builtin_expect((x),1)
#define SM_UNLIKELY(x) __builtin_expect((x),0)
#define SM_NOINLINE __attribute__ ((noinline))
#define SM_ALWAYS_INLINE __attribute__ ((always_inline))
#endif
typedef uint64_t auint64 __attribute__ ((__aligned__(8)));
/** The first 8 characters are stored as uint64_t for fast checks */
union simap_c64 {
char str8[8];
auint64 u64;
};
typedef union simap_char64 simap_char64;
/** This is to ensure 8byte storage of pointers (with possible padding) */
union simap_ptr64 {
void *ptr;
auint64 u64;
};
typedef union simap_ptr64 simap_ptr64;
struct elem_nonkey_prefix {
/** The value (ptr) */
simap_ptr64 value;
/** Previous element index from base (full offset) */
uint32_t previndex;
/** Next element index from base (full offset) */
uint32_t nextindex;
};
typedef struct elem_nonkey_prefix elem_nonkey_prefix;
2024-09-29 20:52:18 +02:00
/**
* The per-element storage layout
2024-09-29 20:52:18 +02:00
*
* 8 byte:
* - void* value;
2024-10-09 16:11:32 +02:00
* - [?] optional padding (only for non-64 bit pointer machines)
2024-09-29 20:52:18 +02:00
*
* 8 byte:
* - uint32_t previndex;
* - uint32_t nextindex;
*
* K x 8 byte:
* - char name[]; // inline stored
* - padding (divisible by 8)
*
* Because of it a lookup is basically via strstr-like with 8byte steps!
* with few character names zero-padded in the search term parameter
* and if you want check extra validity by jumping back&forth in it.
*/
struct elem_prefix {
/** Value and meta-data - divisible by 8bytes */
elem_nonkey_prefix nonkey_prefix;
/** The prefix of the key - divisible by 8bytes padded string after this (inlined) */
simap_c64 key_prefix;
};
typedef struct elem_prefix elem_prefix;
/**
* A "peasantly" map data structure backed by arena.h - basically a toy data structure...
*
* This is very simple, no trees, no hashes, just (hopefully) autovectorized linear lookup.
* Inserting NULLs to keys happens through tombstoning unless erase happens and we never
* shrink memory so please do not add a lot of things then remove a lot of things.
*
* In AVX2 mode, we do heuristics against data being the key so its not "sure" and can fail...
* XXX: So beware that this CAN FAIL for AVX2 build flags just "statistically most often works"!
*/
2024-09-29 20:52:18 +02:00
struct simap_instance {
arena a;
uint32_t end;
uint32_t prev_usage_end; /* previous usage_end or -1 if no previous exists... in bytes!!! */
2024-09-29 20:52:18 +02:00
uint32_t usage_end; /* in bytes!!! */
elem_prefix *base;
2024-09-29 20:52:18 +02:00
};
typedef struct simap_instance simap_instance;
static inline simap_instance simap_create() {
simap_instance ret;
ret.a = newarena((ptrdiff_t)1 << 33);
ret.end = 0;
ret.prev_usage_end = (uint32_t) -1;
2024-09-29 20:52:18 +02:00
ret.usage_end = 0;
ret.base = (elem_prefix*)(((auint64*) aralloc(&(ret.a), sizeof(auint64), sizeof(auint64), 1)) /* addr divisible by 8 */
+ 1); /* First really addressible thing */
2024-09-29 20:52:18 +02:00
return ret;
}
static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void *ptr);
/** Gets padding bytes for a size to be padded to divisible alignment */
static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) {
/* Would ensure returned value divisible by alignment */
/* return (size + alignment - 1) / alignment * alignment; */
/* Basically same as: */
/* return (alignment - (size % alignment)) % alignment; */
/* Substracting size leads to padding */
return ((size + alignment - 1) / alignment) * alignment - size;
}
/** Gets padded address - or same address if divisible by alignment */
static inline void *get_padded(void *ptr, int alignment) {
/* return (alignment - (ptr % alignment)) % alignment; */
return (void*)((ptrdiff_t)((uint8_t *)ptr + alignment - 1) / alignment * alignment);
}
static inline SM_ALWAYS_INLINE auint64 *make_tipp(auint64 *base, auint64 *tip, auint64 prefix, auint64 *end) {
#ifdef __AVX2__
/* See:
*
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX&text=movemask
* https://chryswoods.com/vector_c++/part2.html
* https://blog.triplez.cn/posts/avx-avx2-learning-notes/
* https://github.com/Triple-Z/AVX-AVX2-Example-Code
* https://en.algorithmica.org/hpc/simd/masking/
* https://stackoverflow.com/questions/31089502/aligned-and-unaligned-memory-access-with-avx-avx2-intrinsics
*/
/* Step over previous tipp and search until the AVX2 alignment needs */
/* AVXs 256 bit = 32 byte so multiple of 32 needed here */
/* TODO: Probably I can change "32" here into something bigger to non-avx small arrays! */
auint64 *neotip = (auint64 *) get_padded(++tip, 32);
while((tip < neotip) && (*tip != prefix)) ++tip;
if(tip < neotip) return tip;
/* Prepare an AVX 256bit search register: 4 uint64_t */
__m256i sreg = _mm256_set1_epi64x(prefix);
while(tip < end) {
/* This needs 32 byte alignment here that we do above */
/* The tipp register: 4 uint64_t */
__m256i treg = _mm256_load_si256((__m256i *) tip);
/* Check equality and return proper tip address for first found */
__m256i m = _mm256_cmpeq_epi64(sreg, treg); /* Needs AVX2 */
uint32_t mask = (uint32_t) _mm256_movemask_pd((__m256d) m);
/* Try next tip, processes 256 bits per loop */
tip += 4; /* 4x64 bit */
/* One of the links used __builtin_ctz(mask), but I
* think it was bad implementation and only finds the
* last search result!
*
* __builtin_clz returns leading zeroes of the mask
* and the mask has 4 bits at most and each show if
* 1..4 places of AVX reg compared properly to the
* given prefix value (4x 64 bit comparizons happen).
* Subtracting from 31 we subtract either 28,29,30,31
* and thus resulting in 3, 2, 1, 0 (right offsets).
*
* If the mask got all zero, there is nothing found,
* otherwise its the tipp + offset we calculated. */
if(SM_UNLIKELY(mask != 0)) {
int offset = (31 - __builtin_clz(mask));
/* -4 because this is the unlikely scenario and we already incremented! */
return tip - 4 + offset;
}
}
/* Not found case */
return tip;
#endif
#ifdef SIMAP_RAW
#pragma GCC unroll 16
while((++tip < end) && (*tip != prefix));
return tip;
#endif
/* XXX: This only works because of (***) because reading -1 tips makes tip >= end for sure here and back */
elem_nonkey_prefix *pre = (elem_nonkey_prefix *)((uint8_t *)tip - sizeof(elem_nonkey_prefix));
tip = (auint64 *) ((uint8_t *)base + pre->nextindex + sizeof(elem_nonkey_prefix));
#pragma GCC unroll 16
while((tip < end) && (*tip != prefix)) {
pre = (elem_nonkey_prefix *)((uint8_t *)tip - sizeof(elem_nonkey_prefix));
tip = (auint64 *) ((uint8_t *)base + pre->nextindex + sizeof(elem_nonkey_prefix));
}
return tip;
}
2024-09-29 20:52:18 +02:00
static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char *key) {
/* Construct prefix (fast-key) */
size_t keylen = strlen(key);
char is_smallkey = (keylen < 8);
simap_c64 prefix {0};
size_t prefixlen = is_smallkey ? keylen : 8;
/* Ignore warning because we know what we are doing here... */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-truncation"
2024-09-29 20:52:18 +02:00
strncpy(prefix.str8, key, prefixlen);
#pragma GCC diagnostic pop
2024-09-29 20:52:18 +02:00
/* Construct keyremains - might point to the \0 terminator only if smallkey or 8 bytes exactly */
const char *keyremains = key + prefixlen;
/* Lookup prefix (fast-key) - hopefully this gets vectorized (should be)!!! */
auint64 *base = (auint64 *) (map->base);
auint64 *end = (auint64 *)((uint8_t *)base + (map->usage_end));
auint64 *tipp = make_tipp(base, base, prefix.u64, end);
while(tipp < end) { /* XXX: (***) */
/* Need detailed lookup, because found the prefix */
assert((*tipp == prefix.u64));
2024-09-29 20:52:18 +02:00
/* First check the remains of the string (only if needed) */
if(!is_smallkey) {
char *tippremains = (char *)((uint8_t *)tipp + sizeof(uint64_t));
if(strcmp(keyremains, tippremains) != 0) {
tipp = make_tipp(base, tipp, prefix.u64, end);
continue;
}
}
simap_ptr64 *ptr = (simap_ptr64 *)((uint8_t *) (tipp - 2));
#ifdef SIMAP_RAW
/* Check back & forth (jump validation) */
uint32_t previ = *((uint32_t *)(tipp - 1));
if(previ == (uint32_t) -1) {
/* Expect it be good if it was first insert ever? Statistically rare to be not like it */
return ptr;
}
uint32_t prevnexi = *(uint32_t *)(((uint8_t *)base) + previ
+ sizeof(simap_ptr64)
+ sizeof(uint32_t));
auint64 *retipp = (auint64 *)(((uint8_t *)base + prevnexi)
+ sizeof(simap_ptr64) + sizeof(uint32_t) +
+ sizeof(uint32_t));
if(retipp != tipp) {
tipp = make_tipp(base, tipp, prefix.u64, end);
continue;
}
#endif /* SIMAP_RAW */
/* Can have the (statistically checked) pointer */
return ptr;
2024-09-29 20:52:18 +02:00
}
/* Haven't found anything */
return NULL;
}
/** Returns the size of the storage needed for the given key */
static inline uint32_t simap_elem_storage_size(const char *key) {
uint32_t keysize = strlen(key);
uint32_t padding = get_size_padding(keysize, 8);
/* XXX: The exactly 8byte keys need a zero terminator too (would be overridden without this) */
padding += (keysize == 8) ? 8 : 0;
2024-09-29 20:52:18 +02:00
return keysize +
sizeof(simap_ptr64) +
sizeof(uint32_t) +
sizeof(uint32_t) +
padding;
}
/** Force-add the (key,value) to the end of the map. Use this if you prefill your map one-by-one and need speed */
static inline void *simap_force_add(simap_instance *map, const char *key, void *ptr) {
2024-09-29 20:52:18 +02:00
uint32_t storage_needed = simap_elem_storage_size(key);
2024-10-07 18:13:18 +02:00
assert((storage_needed % 8) == 0);
if(SM_UNLIKELY(map->end - map->usage_end < storage_needed)) {
2024-09-29 20:52:18 +02:00
/* Need storage */
2024-09-30 12:08:31 +02:00
aralloc(&(map->a),
sizeof(uint8_t)/*esize*/,
1 /*align - should be 8 but should be aligned here as-is! */,
storage_needed);
/* Administer end offset */
map->end += storage_needed;
2024-09-29 20:52:18 +02:00
}
/* Already have the storage */
/* Create first 8 char encoding (this ensures endianness and all such stuff) */
simap_c64 first8 {0};
uint32_t keylen = strlen(key);
/* Ignore warning because we know what we are doing here... */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-truncation"
strncpy(first8.str8, key, (keylen < 8) ? keylen : 8);
#pragma GCC diagnostic pop
uint32_t usi = map->usage_end;
uint32_t previ = map->prev_usage_end;
/* 8byte: Save data ptr */
2024-10-07 10:11:41 +02:00
simap_ptr64 *data = (simap_ptr64 *)((uint8_t *)(map->base) + usi);
data->ptr = ptr;
/* 8byte: Save link to previous and next */
2024-10-09 16:11:32 +02:00
uint32_t *usprev = (uint32_t *)((uint8_t *)(map->base) + usi +
sizeof(simap_ptr64));
*usprev = previ;
*(usprev + 1) = (uint32_t) -1; /* XXX: (***): ensures the "not < end" here! */
/* 8byte: First 8 char */
2024-10-07 10:11:41 +02:00
simap_c64 *start_str = (simap_c64 *)(usprev + 2);
*start_str = first8;
2024-10-09 16:11:32 +02:00
/* Remaining bytes */
if(keylen >= 8) {
2024-10-07 10:11:41 +02:00
/* uint32_t key_remains = keylen - 8; */
char *rem_str = (char *)(start_str + 1);
strcpy(rem_str, key + 8);
}
/* XXX: The "padding" gets automagically added by the movement of the arena here(by junk bytes)! */
/* Update previous with linkage */
if(SM_LIKELY(previ != (uint32_t)-1)) {
2024-10-07 10:11:41 +02:00
uint32_t *prevnex = (uint32_t *)((uint8_t *)(map->base) + previ +
sizeof(simap_ptr64) +
2024-10-07 10:11:41 +02:00
sizeof(uint32_t));
*prevnex = usi;
}
/* Update prev usage end */
map->prev_usage_end = usi;
/* Administer usage_end offset */
map->usage_end += storage_needed;
2024-10-07 12:46:15 +02:00
return data;
2024-09-29 20:52:18 +02:00
}
/**
* A simple map data structure that fulfills amap.h
*
* Operations:
*
* AMAP_SET Saves a mapping from key->ptr in map. ptr==NULL "tombstones" the mapping to return NULL.
* AMAP_GET Gets the symbol at key (the ptr parameter is unused). Returns "ptr" if there is no data for the key.
2024-09-29 20:52:18 +02:00
* AMAP_ERASE Erases the symbol table so it becomes empty again. Can never fail, returns NULL.
*
* @param amap_instance The instance we operate upon.
* @param op Defines which operation the caller wants.
* @param key The key (both for SET and GET). This pointer can get easily invalidated so you might need a copy or you do Trie, etc.
* @param ptr When adding a ptr (data) to the map / table, the key will point to this ptr and the "nt found" ptr to return in get...
2024-09-29 20:52:18 +02:00
* @returns The ptr / data stored for the key, or NULL on tombstone or when not stored yet or op is SET and there was an error.
*/
static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void *ptr) {
simap_instance *map = (simap_instance *) amap_instance;
if((op == AMAP_ERASE)) {
2024-10-10 00:04:22 +02:00
map->prev_usage_end = (uint32_t) -1;
2024-09-29 20:52:18 +02:00
map->usage_end = 0;
2024-10-10 00:04:22 +02:00
return (void *)((uint8_t)(NULL) - 1L);
2024-09-29 20:52:18 +02:00
}
/* Search for the key - also needed for SET in order to "re-set" */
simap_ptr64 *found = simap_search_internal(map, key);
if(op == AMAP_GET) {
return found ? found->ptr : ptr;
2024-09-29 20:52:18 +02:00
} else {
assert(op == AMAP_SET);
if((!found)) {
/* Add as new */
return simap_force_add(map, key, ptr);
} else {
2024-09-29 20:52:18 +02:00
/* Just overwrite */
found->ptr = ptr;
return (void *) found;
}
}
2024-09-30 12:08:31 +02:00
assert(false); /* should be unreachable */
return ptr;
2024-09-29 20:52:18 +02:00
}
#endif /* SIMAP_H */