Compare commits

...

2 Commits

Author SHA1 Message Date
Richard Thier
673555fdfc vmap.h work-in-progress idea 2025-01-27 03:13:07 +01:00
Richard Thier
182fb69e18 refactor: pulled out simd_map_lane.h because its useful on its own 2025-01-27 03:12:22 +01:00
4 changed files with 318 additions and 125 deletions

View File

@ -8,6 +8,7 @@
#include "mapmap.hpp" #include "mapmap.hpp"
#include "unomap.hpp" #include "unomap.hpp"
#include "simd_map.h" #include "simd_map.h"
#include "vmap.h"
/** /**
* Creates keys or returns the ith key. Used for performance tests. * Creates keys or returns the ith key. Used for performance tests.
@ -309,7 +310,7 @@ void test_intmaps(int perf_test_i) {
} }
int main() { int main() {
int perf_test_i = 100; int perf_test_i = 1000;
/* Prepare data stores */ /* Prepare data stores */
keystore(perf_test_i, true); keystore(perf_test_i, true);

View File

@ -5,15 +5,7 @@
#include <stdint.h> /* uint32_t, ... */ #include <stdint.h> /* uint32_t, ... */
#include <assert.h> #include <assert.h>
#include "arena.h/arena.h" #include "arena.h/arena.h"
#include "simd_map_lane.h"
/* SIMD support */
#ifdef __AVX2__
#include <immintrin.h>
#endif
#ifdef __SSE2__
#include <immintrin.h>
#endif
/* I have no idea what MSVC has instead... */ /* I have no idea what MSVC has instead... */
#ifdef _MSC_VER #ifdef _MSC_VER
@ -32,22 +24,12 @@
#define SM_ALWAYS_INLINE __attribute__ ((always_inline)) #define SM_ALWAYS_INLINE __attribute__ ((always_inline))
#endif #endif
/* 32 byte = 256 bits = (8 * 32bit) optimized for AVX2 */
#define SM_LANE_SPAN 8
/** Grouped together keys-values to support SIMD more this way (also became a single cache line this way) */
struct simd_map_lane {
uint32_t keys[SM_LANE_SPAN];
uint32_t values[SM_LANE_SPAN];
};
typedef struct simd_map_elem simd_map_elem;
struct simd_map { struct simd_map {
arena a; arena a;
simd_map_lane *lanes; simd_map_lane *lanes;
uint32_t end; /* in lanes */ uint32_t end; /* in lanes */
uint32_t usage_end; /* in lanes!!! */ uint32_t usage_end; /* in lanes!!! */
int lane_modulo; /* [0..SM_LANE_SPAN) */ int lane_modulo; /* [0..SML_LANE_SPAN) */
}; };
typedef struct simd_map simd_map; typedef struct simd_map simd_map;
@ -71,100 +53,6 @@ static inline SM_ALWAYS_INLINE char simd_map_free(simd_map *map) {
return freearena(&(map->a)); return freearena(&(map->a));
} }
/**
* Returns if this key is stored in the given map LANE or not - returns NULL if not found.
*
* Rem.: The lane_begin and lane_next_begin parameters are used for reentrant multisearch.
*
* @param map_lane The lane to find in.
* @param key The key to search for.
* @param lane_modulo When non-zero, the lane only searched until this index. Zero means all remaining elements. (mod lane length)
* @param lane_begin The lane is searched from this location(find_all). If full lane / lane prefix is needed, this should be 0.
* @param lane_next_begin This pointer will be filled on non-NULL retvals with the incremented in-lane index (with % modulus).
* @returns NULL when not found, otherwise pointer to the stored value for the key.
*/
static inline SM_ALWAYS_INLINE uint32_t *simd_map_lane_find(
simd_map_lane *map_lane,
uint32_t key,
int lane_modulo,
int lane_begin,
int *lane_next_begin) {
uint32_t *keys = map_lane->keys;
uint32_t *values = map_lane->values;
/* Hopefully can get optimized out for the common case bc inlining */
if(lane_modulo == 0) {
#ifdef __AVX2__
/* Prepare an AVX 256bit search register: 8 uint32_t */
__m256i sreg = _mm256_set1_epi32(key);
/* The tipp register: 8 uint32_t */
__m256i treg = _mm256_load_si256((__m256i *) keys);
/* Check equality and return proper tip address for first found */
__m256i m = _mm256_cmpeq_epi32(sreg, treg); /* Needs AVX2 */
/* The 's' means "single" (float precision), and mask will have [0..7] bits set! */
uint32_t mask = (uint32_t) _mm256_movemask_ps((__m256) m);
if(SM_UNLIKELY(mask != 0)) {
/* 00000000 00000000 00000000 01000100 -> 6 */
int i = (31 - __builtin_clz(mask));
uint32_t *ptr = &values[i];
if(SM_LIKELY(lane_begin == 0)) {
/* Fast-path: Only one match per lane OR first matching in lane for this find/find_all */
*lane_next_begin = (i + 1) % SM_LANE_SPAN;
return ptr;
} else {
/* We did a find_all(..) AND there is more than one match in the lane
* and its not first find_all(..) on the lane in question...
*
* This might be suboptimal, but not so bad:
*
* - This at this point will search the smaller array scalar-way
* - Which sounds good - BUT!
* - This means all lanes with more than 1 data are scalar search
* - And not only that, but first simd-searched, later scalar...
*
* I guess its fine as it should happen statistically rarely anyways
*/
goto non_simd_modulo;
}
}
return NULL;
#endif
#ifdef __SSE2__
#ifndef __AVX2__
#ifndef __AVX512__
/* Prepare an SSE2 128bit search register: 4 uint32_t */
__m128i sreg = _mm_set1_epi32(key);
/* TODO: Implement */
#endif /* AVX512 */
#endif /* AVX2 */
#endif /* SSE2 */
/* Regular integer code - should have good ILP and cache locality patterns anyways */
/** Pretty hopeful this can get more easily unrolled / autovectorized */
for(int i = lane_begin; i < SM_LANE_SPAN; ++i) {
if(SM_UNLIKELY(keys[i] == key)) {
uint32_t *ptr = &values[i];
*lane_next_begin = (i + 1) % SM_LANE_SPAN;
return ptr;
}
}
return NULL;
} else {
non_simd_modulo:
for(int i = lane_begin; i < lane_modulo; ++i) {
if(SM_UNLIKELY(keys[i] == key)) {
uint32_t *ptr = &values[i];
*lane_next_begin = (i + 1) % SM_LANE_SPAN;
return ptr;
}
}
return NULL;
}
}
/** The result of the find_all(..) operation */ /** The result of the find_all(..) operation */
struct simd_map_find_res { struct simd_map_find_res {
/** The found location - or NULL when the key was not found */ /** The found location - or NULL when the key was not found */
@ -284,7 +172,7 @@ static inline SM_ALWAYS_INLINE char simd_map_multi_set(simd_map *map, uint32_t k
lane->values[map->lane_modulo] = value; lane->values[map->lane_modulo] = value;
/* Update lane modulo */ /* Update lane modulo */
map->lane_modulo = (map->lane_modulo + 1) % SM_LANE_SPAN; map->lane_modulo = (map->lane_modulo + 1) % SML_LANE_SPAN;
return 1; return 1;
} }
@ -321,16 +209,11 @@ static inline SM_ALWAYS_INLINE char simd_map_is_empty(simd_map *map) {
return (map->usage_end == 0); return (map->usage_end == 0);
} }
/** Returns the key location for a given value location */
static inline SM_ALWAYS_INLINE uint32_t *simd_map_key_location(uint32_t *value_location) {
return value_location -= SM_LANE_SPAN;
}
/** Returns the lastly inserted value's location in the map - you must ensure the map has elements! */ /** Returns the lastly inserted value's location in the map - you must ensure the map has elements! */
static inline SM_ALWAYS_INLINE uint32_t *simd_map_last_location(simd_map *map) { static inline SM_ALWAYS_INLINE uint32_t *simd_map_last_location(simd_map *map) {
return (map->lane_modulo > 0) ? return (map->lane_modulo > 0) ?
&(map->lanes[map->usage_end - 1].values[map->lane_modulo - 1]) : &(map->lanes[map->usage_end - 1].values[map->lane_modulo - 1]) :
&(map->lanes[map->usage_end - 1].values[SM_LANE_SPAN - 1]); &(map->lanes[map->usage_end - 1].values[SML_LANE_SPAN - 1]);
} }
/** /**
@ -346,9 +229,9 @@ static inline SM_ALWAYS_INLINE uint32_t *simd_map_last_location(simd_map *map) {
*/ */
static inline SM_ALWAYS_INLINE void simd_map_remove_ptr(simd_map *map, uint32_t *value_location) { static inline SM_ALWAYS_INLINE void simd_map_remove_ptr(simd_map *map, uint32_t *value_location) {
/* Overwrite with the last key-value */ /* Overwrite with the last key-value */
uint32_t *key_location = simd_map_key_location(value_location); uint32_t *key_location = simd_map_lane_key_location(value_location);
uint32_t *last_value_location = simd_map_last_location(map); uint32_t *last_value_location = simd_map_last_location(map);
uint32_t *last_key_location = simd_map_key_location(last_value_location); uint32_t *last_key_location = simd_map_lane_key_location(last_value_location);
*value_location = *last_value_location; *value_location = *last_value_location;
*key_location = *last_key_location; *key_location = *last_key_location;
@ -356,7 +239,7 @@ static inline SM_ALWAYS_INLINE void simd_map_remove_ptr(simd_map *map, uint32_t
if(map->lane_modulo > 0) { if(map->lane_modulo > 0) {
--(map->lane_modulo); --(map->lane_modulo);
} else { } else {
map->lane_modulo = SM_LANE_SPAN - 1; map->lane_modulo = SML_LANE_SPAN - 1;
} }
} }

147
simd_map_lane.h Normal file
View File

@ -0,0 +1,147 @@
#ifndef SIMD_MAP_LANE_H
#define SIMD_MAP_LANE_H
/*
* Building blocks for SIMD-optimized (hash-)map buckets.
*/
/* SIMD support */
#ifdef __AVX2__
#include <immintrin.h>
#endif
#ifdef __SSE2__
#include <immintrin.h>
#endif
#ifdef _MSC_VER
#define SML_THREAD_LOCAL __declspec(thread)
#define SML_PREFETCH(x)
/* Fixed: these must expand to their argument - the empty expansion turned
 * every `if(SML_LIKELY(cond))` into the syntax error `if()` under MSVC. */
#define SML_LIKELY(x) (x)
#define SML_UNLIKELY(x) (x)
#define SML_NOINLINE __declspec(noinline)
#define SML_ALWAYS_INLINE __forceinline
#else
#define SML_THREAD_LOCAL __thread
#define SML_PREFETCH(x) __builtin_prefetch(x)
#define SML_LIKELY(x) __builtin_expect((x),1)
#define SML_UNLIKELY(x) __builtin_expect((x),0)
#define SML_NOINLINE __attribute__ ((noinline))
#define SML_ALWAYS_INLINE __attribute__ ((always_inline))
#endif
/* 32 byte = 256 bits = (8 * 32bit) optimized for AVX2 */
#define SML_LANE_SPAN 8
/** Grouped together keys-values to support SIMD more this way (also became a single cache line this way) */
struct simd_map_lane {
    uint32_t keys[SML_LANE_SPAN];
    uint32_t values[SML_LANE_SPAN];
};
/* Fixed: C (unlike C++) needs this typedef so the functions below can spell the
 * type as plain `simd_map_lane` - it was missing (only a stray typedef existed). */
typedef struct simd_map_lane simd_map_lane;
/* Leftover from simd_map.h, kept for source compatibility - struct simd_map_elem
 * is not defined in this header (a harmless incomplete-type typedef in C). */
typedef struct simd_map_elem simd_map_elem;
/** Returns the last value of the lane - usable to see if this lane is fully filled with data or not! */
static inline SML_ALWAYS_INLINE uint32_t simd_map_lane_last_value(simd_map_lane *map_lane) {
    const uint32_t *vals = map_lane->values;
    return vals[SML_LANE_SPAN - 1];
}
/**
 * Returns if this key is stored in the given map LANE or not - returns NULL if not found.
 *
 * Rem.: The lane_begin and lane_next_begin parameters are used for reentrant multisearch.
 *
 * @param map_lane The lane to find in.
 * @param key The key to search for.
 * @param lane_modulo When non-zero, the lane only searched until this index. Zero means all remaining elements. (mod lane length)
 * @param lane_begin The lane is searched from this location(find_all). If full lane / lane prefix is needed, this should be 0.
 * @param lane_next_begin This pointer will be filled on non-NULL retvals with the incremented in-lane index (with % modulus).
 * @returns NULL when not found, otherwise pointer to the stored value for the key.
 */
static inline SML_ALWAYS_INLINE uint32_t *simd_map_lane_find(
        simd_map_lane *map_lane,
        uint32_t key,
        int lane_modulo,
        int lane_begin,
        int *lane_next_begin) {
    uint32_t *keys = map_lane->keys;
    uint32_t *values = map_lane->values;
    /* Hopefully can get optimized out for the common case bc inlining */
    if(lane_modulo == 0) {
#ifdef __AVX2__
        /* Prepare an AVX 256bit search register: 8 uint32_t */
        __m256i sreg = _mm256_set1_epi32(key);
        /* The tipp register: 8 uint32_t */
        /* Fixed: loadu instead of load - nothing guarantees 32-byte alignment of a lane
         * (no alignas on the struct, arena may hand out any address) and an aligned load
         * from an unaligned address faults; loadu costs the same when data IS aligned. */
        __m256i treg = _mm256_loadu_si256((__m256i *) keys);
        /* Check equality and return proper tip address for first found */
        __m256i m = _mm256_cmpeq_epi32(sreg, treg); /* Needs AVX2 */
        /* The 's' means "single" (float precision), and mask will have [0..7] bits set! */
        /* Fixed: _mm256_castsi256_ps is the portable zero-cost reinterpret; the bare
         * (__m256) C-cast compiled only as a GCC/Clang extension. */
        uint32_t mask = (uint32_t) _mm256_movemask_ps(_mm256_castsi256_ps(m));
        if(SML_UNLIKELY(mask != 0)) {
            /* Picks the highest set bit, e.g. 00000000 00000000 00000000 01000100 -> 6 */
            int i = (31 - __builtin_clz(mask));
            uint32_t *ptr = &values[i];
            if(SML_LIKELY(lane_begin == 0)) {
                /* Fast-path: Only one match per lane OR first matching in lane for this find/find_all */
                *lane_next_begin = (i + 1) % SML_LANE_SPAN;
                return ptr;
            } else {
                /* We did a find_all(..) AND there is more than one match in the lane
                 * and its not first find_all(..) on the lane in question...
                 *
                 * This might be suboptimal, but not so bad:
                 *
                 * - This at this point will search the smaller array scalar-way
                 * - Which sounds good - BUT!
                 * - This means all lanes with more than 1 data are scalar search
                 * - And not only that, but first simd-searched, later scalar...
                 *
                 * I guess its fine as it should happen statistically rarely anyways
                 */
                goto non_simd_modulo;
            }
        }
        return NULL;
#endif
#ifdef __SSE2__
#ifndef __AVX2__
#ifndef __AVX512__
        /* Prepare an SSE2 128bit search register: 4 uint32_t */
        __m128i sreg = _mm_set1_epi32(key);
        /* TODO: Implement */
#endif /* AVX512 */
#endif /* AVX2 */
#endif /* SSE2 */
        /* Regular integer code - should have good ILP and cache locality patterns anyways */
        /* Pretty hopeful this can get more easily unrolled / autovectorized */
        for(int i = lane_begin; i < SML_LANE_SPAN; ++i) {
            if(SML_UNLIKELY(keys[i] == key)) {
                uint32_t *ptr = &values[i];
                *lane_next_begin = (i + 1) % SML_LANE_SPAN;
                return ptr;
            }
        }
        return NULL;
    } else {
non_simd_modulo:
        /* Scalar prefix search: only the first lane_modulo slots hold live data */
        for(int i = lane_begin; i < lane_modulo; ++i) {
            if(SML_UNLIKELY(keys[i] == key)) {
                uint32_t *ptr = &values[i];
                *lane_next_begin = (i + 1) % SML_LANE_SPAN;
                return ptr;
            }
        }
        return NULL;
    }
}
/** Returns the key location for a given value location */
static inline SM_ALWAYS_INLINE uint32_t *simd_map_lane_key_location(uint32_t *value_location) {
return value_location -= SML_LANE_SPAN;
}
#endif /* SIMD_MAP_LANE_H */

162
vmap.h Normal file
View File

@ -0,0 +1,162 @@
#ifndef VMAP_H
#define VMAP_H
/*
* A virtual memory misusing flat-ish hashmap optimized with AVX2.
*
* Structure
*
* VMEM
* STRUCT
* PRIVATE
* UINTAPI
*/
#include <stddef.h> /* ptrdiff_t - used by vm_reserve/vm_free but was never included */
#include <stdint.h>

#include "simd_map_lane.h"
/* VMEM */
#ifdef _WIN32
TODO: Utilize __Thread + SEH to implement lazy windows pageload zeroing
#else
/** Capacity should be multiple of 4096 for full pages */
static void *vm_reserve(ptrdiff_t capacity) {
void *r = mmap(0, capacity, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
return r==MAP_FAILED ? 0 : r;
}
/** Capacity should be multiple of 4096 for full pages and related to the ptr to free */
static char vm_free(void *ptr, ptrdiff_t capacity) {
return !munmap(ptr, capacity);
}
#endif /* _WIN32 */
/* STRUCT */

/** A flat hashmap-like container backed by one big lazily-paged virtual memory reservation. */
struct vmap {
    /* using uint8_t* here would simplify
     * code except for aliasing rules */
    /* Base of the vm_reserve(max_levels * 16 * 4096 byte) reservation; lanes are
     * addressed in uint32_t units off this pointer (see search_all_vmap). */
    uint32_t *data;
    /* NOTE(review): presumably the number of stored pairs - nothing visible updates
     * it yet (work-in-progress); confirm once insertion lands. */
    uint32_t count;
    /* Capacity in 'levels'; each level owns 16 pages of the reservation. */
    uint32_t max_levels;
};
typedef struct vmap vmap;
/** The result of the search_all(..) operation */
struct vmap_find_res {
    /** The found location - or NULL when the key was not found */
    uint32_t *value_location;
    /** What 'level' depth this value was found. For multimap, but useful statistics */
    uint32_t level;
    /** Meta-data for continuation of the search. Tells which lane to search next A, B, C, or D */
    uint32_t lane_abcd_next;
    /** Meta-data for continuation of the search. In-lane where we search from next time? */
    int lane_next_begin;
};
/* Fixed: the typedef must name vmap_find_res - it was a copy-paste of
 * simd_map_find_res, so the `vmap_find_res` type used throughout this
 * header did not exist and nothing below could compile as C. */
typedef struct vmap_find_res vmap_find_res;
/* PRIVATE */
/* UINTAPI */
/** Creates a vmap able to hold max_levels levels (16 pages each).
 * @returns the map by value; map.data is NULL when the reservation failed - check before use. */
static inline vmap create_vmap(uint32_t max_levels) {
    /* Fixed: `vmap map{...}` is C++-only brace-init; this is the C initializer. */
    vmap map = { NULL, 0, max_levels };
    /* Widen before multiplying: max_levels * 65536 overflows uint32_t for max_levels > 65535 */
    map.data = (uint32_t *) vm_reserve((ptrdiff_t) max_levels * 16 * 4096);
    return map;
}
/** Releases the map's reservation and clears the handle.
 * @returns nonzero on success (result of vm_free). */
static inline char free_vmap(vmap *map) {
    /* Widen before multiplying - same overflow hazard as in create_vmap */
    char ok = vm_free(map->data, (ptrdiff_t) map->max_levels * 16 * 4096);
    /* Drop the dangling pointer so accidental reuse / double free fails loudly */
    map->data = NULL;
    map->count = 0;
    return ok;
}
/** Create the value for starting a search_all call */
static inline vmap_find_res vmap_search_all_begin() {
vmap_find_res ret;
ret.value_location = NULL;
ret.level = 0;
ret.lane_abcd_next = 0;
ret.lane_next_begin = 0;
return ret;
}
/**
 * Search the map in as a multimap - that is you can search multiple equal keyed values.
 * This is implemented by the result being understood also as a continuation alongside
 * a way to grab the pointer to the stored value and key (simd_map_lane_key_location(val)).
 *
 * @param map The map to search in
 * @param key The key to search for
 * @param prev The previous result if you continue your search. See: vmap_search_all_begin()
 * @returns Metadata + nullable ptr. See: vmap_find_res struct comments; ret.value_location
 */
static inline vmap_find_res search_all_vmap(vmap *map, uint32_t key, vmap_find_res prev) {
    /* Inits as not found, can change values */
    vmap_find_res ret = prev;
    uint32_t level = prev.level;
    /* Fixed: `<` not `<=` - only levels [0..max_levels) exist in the reservation,
     * level == max_levels would index one full 16-page group past the mapping. */
    while(level < map->max_levels) {
        /* Process 8 bits of the 32-bit key per level in circular order - so its not radix, but similar */
        uint32_t byt = level % 4;
        /* Low 4 bits of the byte select the page within this level's 16 pages.
         * Fixed: bitwise `& 15` - the original `&& 15` was logical-and, always 0 or 1. */
        uint32_t page_no = (level * 16 + ((key >> (byt * 8)) & 15));
        /* 1024 and not 4096 here because of uint32_t *data offset: 4096 / 4 uint32s */
        uint32_t page_offset = 1024 * page_no;
        /* Top 4 bits of the byte: lane. There are 16 lane start positions in the 4k page.
         * Fixed: `& 15` here too (same logical-and typo). */
        uint32_t lane_no = (key >> (byt * 8 + 4)) & 15;
        /* But 4096 / 4 == 1024 elements, which then divided by 16 == 64 uint32_t elems */
        uint32_t lane_offset = lane_no * 64;
        /* FIXME: Rethink what is needed for continuations!
         * I think we should store A, B, C and D lane retvals plus where we are
         * or maybe just the "where we are" and figure out with logic here,
         * but maybe I need to just save flags (4x1 bytes) for "does lane-ABCD search needed?" */
        /* A lane has 8x32 bit keys, then 8x32 bit values. 16 uint32_t elems. */
        /* Fixed: parenthesized so the offsets (computed in uint32_t units per the comments
         * above) are applied BEFORE the cast - `(simd_map_lane *) map->data + off` stepped
         * in 64-byte lane units and landed far outside the intended page. */
        simd_map_lane *lane_a = (simd_map_lane *) (map->data + page_offset + lane_offset);
        simd_map_lane *lane_b = lane_a + 1;
        simd_map_lane *lane_c = lane_b + 1;
        simd_map_lane *lane_d = lane_c + 1;
        /* Further lanes only needed if ours is fully filled */
        /* Overlay simd and integer units here for perf */
        uint32_t *afind = simd_map_lane_find(
                lane_a,
                key,
                0, /* lane modulo: 0 means until lane end */
                /* Resume in-lane only when continuing at the level we stopped at */
                (level == prev.level) ? prev.lane_next_begin : 0,
                /* Fixed: was NULL - simd_map_lane_find writes through this on every hit,
                 * so any successful find dereferenced a null pointer. */
                &ret.lane_next_begin);
        if(afind) {
            ret.value_location = afind;
            ret.level = level;
            return ret;
        }
        /* TODO: Implement B, C and D lane searches - the last value of the previous
         * lane tells whether it overflowed into the next one. */
        uint32_t bneed = simd_map_lane_last_value(lane_a);
        uint32_t cneed = simd_map_lane_last_value(lane_b);
        uint32_t dneed = simd_map_lane_last_value(lane_c);
        (void) bneed; (void) cneed; (void) dneed;
        /* Check if we need to jump to the next level and do.
         * Fixed: must inspect lane_d (the last lane) - lane_c was a typo. */
        uint32_t more = simd_map_lane_last_value(lane_d);
        if(!more) return ret;
        ++level;
    }
    return ret;
}
/**
 * Try to search the map for the given key.
 *
 * @param map The map to search in
 * @param key The key to search for
 * @returns NULL if there is no value stored, otherwise ptr to first match with the given key.
 */
static inline uint32_t *search_vmap(vmap *map, uint32_t key) {
    vmap_find_res first_hit = search_all_vmap(map, key, vmap_search_all_begin());
    return first_hit.value_location;
}
#endif /* VMAP_H */