#ifndef SIMAP_H #define SIMAP_H #include /* NULL */ #include /* uint8_t, uint32_t, ... */ #include /* strcmp, strncpy etc. */ #include /* assert */ #include "amap.h" #include "arena.h/arena.h" /* Possible (non-AVX, but alike) optimization, but means there can be lookup / insert errors (very rarely) #define SIMAP_RAW */ /* Perf trickery */ /* XXX: Enabling AVX also enables rare errors for speed gain! See above. */ #ifdef __AVX2__ #define SIMAP_RAW #include #endif /* I have no idea what MSVC has instead... */ #ifdef _MSC_VER #define SM_THREAD_LOCAL __declspec(thread) #define SM_PREFETCH(x) #define SM_LIKELY(x) #define SM_UNLIKELY(x) #define SM_NOINLINE __declspec(noinline) #define SM_ALWAYS_INLINE __forceinline #else #define SM_THREAD_LOCAL __thread #define SM_PREFETCH(x) __builtin_prefetch(x) #define SM_LIKELY(x) __builtin_expect((x),1) #define SM_UNLIKELY(x) __builtin_expect((x),0) #define SM_NOINLINE __attribute__ ((noinline)) #define SM_ALWAYS_INLINE __attribute__ ((always_inline)) #endif typedef uint64_t auint64 __attribute__ ((__aligned__(8))); /** The first 8 characters are stored as uint64_t for fast checks */ union simap_c64 { char str8[8]; auint64 u64; }; typedef union simap_char64 simap_char64; /** This is to ensure 8byte storage of pointers (with possible padding) */ union simap_ptr64 { void *ptr; auint64 u64; }; typedef union simap_ptr64 simap_ptr64; struct elem_nonkey_prefix { /** The value (ptr) */ simap_ptr64 value; /** Previous element index from base (full offset) */ uint32_t previndex; /** Next element index from base (full offset) */ uint32_t nextindex; }; typedef struct elem_nonkey_prefix elem_nonkey_prefix; /** * The per-element storage layout * * 8 byte: * - void* value; * - [?] optional padding (only for non-64 bit pointer machines) * * 8 byte: * - uint32_t previndex; * - uint32_t nextindex; * * K x 8 byte: * - char name[]; // inline stored * - padding (divisible by 8) * * Because of it a lookup is basically via strstr-like with 8byte steps! * with few character names zero-padded in the search term parameter * and if you want check extra validity by jumping back&forth in it. */ struct elem_prefix { /** Value and meta-data - divisible by 8bytes */ elem_nonkey_prefix nonkey_prefix; /** The prefix of the key - divisible by 8bytes padded string after this (inlined) */ simap_c64 key_prefix; }; typedef struct elem_prefix elem_prefix; /** * A "peasantly" map data structure backed by arena.h - basically a toy data structure... * * This is very simple, no trees, no hashes, just (hopefully) autovectorized linear lookup. * Inserting NULLs to keys happens through tombstoning unless erase happens and we never * shrink memory so please do not add a lot of things then remove a lot of things. * * In AVX2 mode, we do heuristics against data being the key so its not "sure" and can fail... * XXX: So beware that this CAN FAIL for AVX2 build flags just "statistically most often works"! */ struct simap_instance { arena a; uint32_t end; uint32_t prev_usage_end; /* previous usage_end or -1 if no previous exists... in bytes!!! */ uint32_t usage_end; /* in bytes!!! */ elem_prefix *base; }; typedef struct simap_instance simap_instance; static inline simap_instance simap_create() { simap_instance ret; ret.a = newarena((ptrdiff_t)1 << 33); ret.end = 0; ret.prev_usage_end = (uint32_t) -1; ret.usage_end = 0; ret.base = (elem_prefix*)(((auint64*) aralloc(&(ret.a), sizeof(auint64), sizeof(auint64), 1)) /* addr divisible by 8 */ + 1); /* First really addressible thing */ return ret; } static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void *ptr); /** Gets padding bytes for a size to be padded to divisible alignment */ static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) { /* Would ensure returned value divisible by alignment */ /* return (size + alignment - 1) / alignment * alignment; */ /* Basically same as: */ /* return (alignment - (size % alignment)) % alignment; */ /* Substracting size leads to padding */ return ((size + alignment - 1) / alignment) * alignment - size; } /** Gets padded address - or same address if divisible by alignment */ static inline void *get_padded(void *ptr, int alignment) { /* return (alignment - (ptr % alignment)) % alignment; */ return (void*)((ptrdiff_t)((uint8_t *)ptr + alignment - 1) / alignment * alignment); } static inline SM_ALWAYS_INLINE auint64 *make_tipp(auint64 *base, auint64 *tip, auint64 prefix, auint64 *end) { #ifdef __AVX2__ /* See: * * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX&text=movemask * https://chryswoods.com/vector_c++/part2.html * https://blog.triplez.cn/posts/avx-avx2-learning-notes/ * https://github.com/Triple-Z/AVX-AVX2-Example-Code * https://en.algorithmica.org/hpc/simd/masking/ * https://stackoverflow.com/questions/31089502/aligned-and-unaligned-memory-access-with-avx-avx2-intrinsics */ /* Step over previous tipp and search until the AVX2 alignment needs */ /* AVXs 256 bit = 32 byte so multiple of 32 needed here */ /* TODO: Probably I can change "32" here into something bigger to non-avx small arrays! */ auint64 *neotip = (auint64 *) get_padded(++tip, 32); while((tip < neotip) && (*tip != prefix)) ++tip; if(tip < neotip) return tip; /* Prepare an AVX 256bit search register: 4 uint64_t */ __m256i sreg = _mm256_set1_epi64x(prefix); while(tip < end) { /* This needs 32 byte alignment here that we do above */ /* The tipp register: 4 uint64_t */ __m256i treg = _mm256_load_si256((__m256i *) tip); /* Check equality and return proper tip address for first found */ __m256i m = _mm256_cmpeq_epi64(sreg, treg); /* Needs AVX2 */ uint32_t mask = (uint32_t) _mm256_movemask_pd((__m256d) m); /* Try next tip, processes 256 bits per loop */ tip += 4; /* 4x64 bit */ /* One of the links used __builtin_ctz(mask), but I * think it was bad implementation and only finds the * last search result! * * __builtin_clz returns leading zeroes of the mask * and the mask has 4 bits at most and each show if * 1..4 places of AVX reg compared properly to the * given prefix value (4x 64 bit comparizons happen). * Subtracting from 31 we subtract either 28,29,30,31 * and thus resulting in 3, 2, 1, 0 (right offsets). * * If the mask got all zero, there is nothing found, * otherwise its the tipp + offset we calculated. */ if(SM_UNLIKELY(mask != 0)) { int offset = (31 - __builtin_clz(mask)); /* -4 because this is the unlikely scenario and we already incremented! */ return tip - 4 + offset; } } /* Not found case */ return tip; #endif #ifdef SIMAP_RAW #pragma GCC unroll 16 while((++tip < end) && (*tip != prefix)); return tip; #endif /* XXX: This only works because of (***) because reading -1 tips makes tip >= end for sure here and back */ elem_nonkey_prefix *pre = (elem_nonkey_prefix *)((uint8_t *)tip - sizeof(elem_nonkey_prefix)); tip = (auint64 *) ((uint8_t *)base + pre->nextindex + sizeof(elem_nonkey_prefix)); #pragma GCC unroll 16 while((tip < end) && (*tip != prefix)) { pre = (elem_nonkey_prefix *)((uint8_t *)tip - sizeof(elem_nonkey_prefix)); tip = (auint64 *) ((uint8_t *)base + pre->nextindex + sizeof(elem_nonkey_prefix)); } return tip; } static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char *key) { /* Construct prefix (fast-key) */ size_t keylen = strlen(key); char is_smallkey = (keylen < 8); simap_c64 prefix {0}; size_t prefixlen = is_smallkey ? keylen : 8; /* Ignore warning because we know what we are doing here... */ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstringop-truncation" strncpy(prefix.str8, key, prefixlen); #pragma GCC diagnostic pop /* Construct keyremains - might point to the \0 terminator only if smallkey or 8 bytes exactly */ const char *keyremains = key + prefixlen; /* Lookup prefix (fast-key) - hopefully this gets vectorized (should be)!!! */ auint64 *base = (auint64 *) (map->base); auint64 *end = (auint64 *)((uint8_t *)base + (map->usage_end)); auint64 *tipp = make_tipp(base, base, prefix.u64, end); while(tipp < end) { /* XXX: (***) */ /* Need detailed lookup, because found the prefix */ assert((*tipp == prefix.u64)); /* First check the remains of the string (only if needed) */ if(!is_smallkey) { char *tippremains = (char *)((uint8_t *)tipp + sizeof(uint64_t)); if(strcmp(keyremains, tippremains) != 0) { tipp = make_tipp(base, tipp, prefix.u64, end); continue; } } simap_ptr64 *ptr = (simap_ptr64 *)((uint8_t *) (tipp - 2)); #ifdef SIMAP_RAW /* Check back & forth (jump validation) */ uint32_t previ = *((uint32_t *)(tipp - 1)); if(previ == (uint32_t) -1) { /* Expect it be good if it was first insert ever? Statistically rare to be not like it */ return ptr; } uint32_t prevnexi = *(uint32_t *)(((uint8_t *)base) + previ + sizeof(simap_ptr64) + sizeof(uint32_t)); auint64 *retipp = (auint64 *)(((uint8_t *)base + prevnexi) + sizeof(simap_ptr64) + sizeof(uint32_t) + + sizeof(uint32_t)); if(retipp != tipp) { tipp = make_tipp(base, tipp, prefix.u64, end); continue; } #endif /* SIMAP_RAW */ /* Can have the (statistically checked) pointer */ return ptr; } /* Haven't found anything */ return NULL; } /** Returns the size of the storage needed for the given key */ static inline uint32_t simap_elem_storage_size(const char *key) { uint32_t keysize = strlen(key); uint32_t padding = get_size_padding(keysize, 8); /* XXX: The exactly 8byte keys need a zero terminator too (would be overridden without this) */ padding += (keysize == 8) ? 8 : 0; return keysize + sizeof(simap_ptr64) + sizeof(uint32_t) + sizeof(uint32_t) + padding; } /** Force-add the (key,value) to the end of the map. Use this if you prefill your map one-by-one and need speed */ static inline void *simap_force_add(simap_instance *map, const char *key, void *ptr) { uint32_t storage_needed = simap_elem_storage_size(key); assert((storage_needed % 8) == 0); if(SM_UNLIKELY(map->end - map->usage_end < storage_needed)) { /* Need storage */ aralloc(&(map->a), sizeof(uint8_t)/*esize*/, 1 /*align - should be 8 but should be aligned here as-is! */, storage_needed); /* Administer end offset */ map->end += storage_needed; } /* Already have the storage */ /* Create first 8 char encoding (this ensures endianness and all such stuff) */ simap_c64 first8 {0}; uint32_t keylen = strlen(key); /* Ignore warning because we know what we are doing here... */ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstringop-truncation" strncpy(first8.str8, key, (keylen < 8) ? keylen : 8); #pragma GCC diagnostic pop uint32_t usi = map->usage_end; uint32_t previ = map->prev_usage_end; /* 8byte: Save data ptr */ simap_ptr64 *data = (simap_ptr64 *)((uint8_t *)(map->base) + usi); data->ptr = ptr; /* 8byte: Save link to previous and next */ uint32_t *usprev = (uint32_t *)((uint8_t *)(map->base) + usi + sizeof(simap_ptr64)); *usprev = previ; *(usprev + 1) = (uint32_t) -1; /* XXX: (***): ensures the "not < end" here! */ /* 8byte: First 8 char */ simap_c64 *start_str = (simap_c64 *)(usprev + 2); *start_str = first8; /* Remaining bytes */ if(keylen >= 8) { /* uint32_t key_remains = keylen - 8; */ char *rem_str = (char *)(start_str + 1); strcpy(rem_str, key + 8); } /* XXX: The "padding" gets automagically added by the movement of the arena here(by junk bytes)! */ /* Update previous with linkage */ if(SM_LIKELY(previ != (uint32_t)-1)) { uint32_t *prevnex = (uint32_t *)((uint8_t *)(map->base) + previ + sizeof(simap_ptr64) + sizeof(uint32_t)); *prevnex = usi; } /* Update prev usage end */ map->prev_usage_end = usi; /* Administer usage_end offset */ map->usage_end += storage_needed; return data; } /** * A simple map data structure that fulfills amap.h * * Operations: * * AMAP_SET Saves a mapping from key->ptr in map. ptr==NULL "tombstones" the mapping to return NULL. * AMAP_GET Gets the symbol at key (the ptr parameter is unused). Returns "ptr" if there is no data for the key. * AMAP_ERASE Erases the symbol table so it becomes empty again. Can never fail, returns NULL. * * @param amap_instance The instance we operate upon. * @param op Defines which operation the caller wants. * @param key The key (both for SET and GET). This pointer can get easily invalidated so you might need a copy or you do Trie, etc. * @param ptr When adding a ptr (data) to the map / table, the key will point to this ptr and the "nt found" ptr to return in get... * @returns The ptr / data stored for the key, or NULL on tombstone or when not stored yet or op is SET and there was an error. */ static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void *ptr) { simap_instance *map = (simap_instance *) amap_instance; if((op == AMAP_ERASE)) { map->prev_usage_end = (uint32_t) -1; map->usage_end = 0; return (void *)((uint8_t)(NULL) - 1L); } /* Search for the key - also needed for SET in order to "re-set" */ simap_ptr64 *found = simap_search_internal(map, key); if(op == AMAP_GET) { return found ? found->ptr : ptr; } else { assert(op == AMAP_SET); if((!found)) { /* Add as new */ return simap_force_add(map, key, ptr); } else { /* Just overwrite */ found->ptr = ptr; return (void *) found; } } assert(false); /* should be unreachable */ return ptr; } #endif /* SIMAP_H */