diff --git a/simd_map.h b/simd_map.h index 58ac58b..0b89100 100644 --- a/simd_map.h +++ b/simd_map.h @@ -5,15 +5,7 @@ #include /* uint32_t, ... */ #include #include "arena.h/arena.h" - -/* SIMD support */ -#ifdef __AVX2__ -#include -#endif - -#ifdef __SSE2__ -#include -#endif +#include "simd_map_lane.h" /* I have no idea what MSVC has instead... */ #ifdef _MSC_VER @@ -32,22 +24,12 @@ #define SM_ALWAYS_INLINE __attribute__ ((always_inline)) #endif -/* 32 byte = 256 bits = (8 * 32bit) optimized for AVX2 */ -#define SM_LANE_SPAN 8 - -/** Grouped together keys-values to support SIMD more this way (also became a single cache line this way) */ -struct simd_map_lane { - uint32_t keys[SM_LANE_SPAN]; - uint32_t values[SM_LANE_SPAN]; -}; -typedef struct simd_map_elem simd_map_elem; - struct simd_map { arena a; simd_map_lane *lanes; uint32_t end; /* in lanes */ uint32_t usage_end; /* in lanes!!! */ - int lane_modulo; /* [0..SM_LANE_SPAN) */ + int lane_modulo; /* [0..SML_LANE_SPAN) */ }; typedef struct simd_map simd_map; @@ -71,100 +53,6 @@ static inline SM_ALWAYS_INLINE char simd_map_free(simd_map *map) { return freearena(&(map->a)); } -/** - * Returns if this key is stored in the given map LANE or not - returns NULL if not found. - * - * Rem.: The lane_begin and lane_next_begin parameters are used for reentrant multisearch. - * - * @param map_lane The lane to find in. - * @param key The key to search for. - * @param lane_modulo When non-zero, the lane only searched until this index. Zero means all remaining elements. (mod lane length) - * @param lane_begin The lane is searched from this location(find_all). If full lane / lane prefix is needed, this should be 0. - * @param lane_next_begin This pointer will be filled on non-NULL retvals with the incremented in-lane index (with % modulus). - * @returns NULL when not found, otherwise pointer to the stored value for the key. - */ -static inline SM_ALWAYS_INLINE uint32_t *simd_map_lane_find( - simd_map_lane *map_lane, - uint32_t key, - int lane_modulo, - int lane_begin, - int *lane_next_begin) { - - uint32_t *keys = map_lane->keys; - uint32_t *values = map_lane->values; - - /* Hopefully can get optimized out for the common case bc inlining */ - if(lane_modulo == 0) { -#ifdef __AVX2__ - /* Prepare an AVX 256bit search register: 8 uint32_t */ - __m256i sreg = _mm256_set1_epi32(key); - /* The tipp register: 8 uint32_t */ - __m256i treg = _mm256_load_si256((__m256i *) keys); - /* Check equality and return proper tip address for first found */ - __m256i m = _mm256_cmpeq_epi32(sreg, treg); /* Needs AVX2 */ - /* The 's' means "single" (float precision), and mask will have [0..7] bits set! */ - uint32_t mask = (uint32_t) _mm256_movemask_ps((__m256) m); - if(SM_UNLIKELY(mask != 0)) { - /* 00000000 00000000 00000000 01000100 -> 6 */ - int i = (31 - __builtin_clz(mask)); - uint32_t *ptr = &values[i]; - if(SM_LIKELY(lane_begin == 0)) { - /* Fast-path: Only one match per lane OR first matching in lane for this find/find_all */ - *lane_next_begin = (i + 1) % SM_LANE_SPAN; - return ptr; - } else { - /* We did a find_all(..) AND there is more than one match in the lane - * and its not first find_all(..) on the lane in question... - * - * This might be suboptimal, but not so bad: - * - * - This at this point will search the smaller array scalar-way - * - Which sounds good - BUT! - * - This means all lanes with more than 1 data are scalar search - * - And not only that, but first simd-searched, later scalar... - * - * I guess its fine as it should happen statistically rarely anyways - */ - goto non_simd_modulo; - } - } - - return NULL; -#endif -#ifdef __SSE2__ -#ifndef __AVX2__ -#ifndef __AVX512__ - /* Prepare an SSE2 128bit search register: 4 uint32_t */ - __m128i sreg = _mm_set1_epi32(key); - /* TODO: Implement */ -#endif /* AVX512 */ -#endif /* AVX2 */ -#endif /* SSE2 */ - /* Regular integer code - should have good ILP and cache locality patterns anyways */ - /** Pretty hopeful this can get more easily unrolled / autovectorized */ - for(int i = lane_begin; i < SM_LANE_SPAN; ++i) { - if(SM_UNLIKELY(keys[i] == key)) { - uint32_t *ptr = &values[i]; - *lane_next_begin = (i + 1) % SM_LANE_SPAN; - return ptr; - } - } - - return NULL; - } else { -non_simd_modulo: - for(int i = lane_begin; i < lane_modulo; ++i) { - if(SM_UNLIKELY(keys[i] == key)) { - uint32_t *ptr = &values[i]; - *lane_next_begin = (i + 1) % SM_LANE_SPAN; - return ptr; - } - } - - return NULL; - } -} - /** The result of the find_all(..) operation */ struct simd_map_find_res { /** The found location - or NULL when the key was not found */ @@ -284,7 +172,7 @@ static inline SM_ALWAYS_INLINE char simd_map_multi_set(simd_map *map, uint32_t k lane->values[map->lane_modulo] = value; /* Update lane modulo */ - map->lane_modulo = (map->lane_modulo + 1) % SM_LANE_SPAN; + map->lane_modulo = (map->lane_modulo + 1) % SML_LANE_SPAN; return 1; } @@ -321,16 +209,11 @@ static inline SM_ALWAYS_INLINE char simd_map_is_empty(simd_map *map) { return (map->usage_end == 0); } -/** Returns the key location for a given value location */ -static inline SM_ALWAYS_INLINE uint32_t *simd_map_key_location(uint32_t *value_location) { - return value_location -= SM_LANE_SPAN; -} - /** Returns the lastly inserted value's location in the map - you must ensure the map has elements! */ static inline SM_ALWAYS_INLINE uint32_t *simd_map_last_location(simd_map *map) { return (map->lane_modulo > 0) ? &(map->lanes[map->usage_end - 1].values[map->lane_modulo - 1]) : - &(map->lanes[map->usage_end - 1].values[SM_LANE_SPAN - 1]); + &(map->lanes[map->usage_end - 1].values[SML_LANE_SPAN - 1]); } /** @@ -346,9 +229,9 @@ static inline SM_ALWAYS_INLINE uint32_t *simd_map_last_location(simd_map *map) { */ static inline SM_ALWAYS_INLINE void simd_map_remove_ptr(simd_map *map, uint32_t *value_location) { /* Overwrite with the last key-value */ - uint32_t *key_location = simd_map_key_location(value_location); + uint32_t *key_location = simd_map_lane_key_location(value_location); uint32_t *last_value_location = simd_map_last_location(map); - uint32_t *last_key_location = simd_map_key_location(last_value_location); + uint32_t *last_key_location = simd_map_lane_key_location(last_value_location); *value_location = *last_value_location; *key_location = *last_key_location; @@ -356,7 +239,7 @@ static inline SM_ALWAYS_INLINE void simd_map_remove_ptr(simd_map *map, uint32_t if(map->lane_modulo > 0) { --(map->lane_modulo); } else { - map->lane_modulo = SM_LANE_SPAN - 1; + map->lane_modulo = SML_LANE_SPAN - 1; } } diff --git a/simd_map_lane.h b/simd_map_lane.h new file mode 100644 index 0000000..3dcdb01 --- /dev/null +++ b/simd_map_lane.h @@ -0,0 +1,147 @@ +#ifndef SIMD_MAP_LANE_H +#define SIMD_MAP_LANE_H +/* + * Building blocks for SIMD-optimized (hash-)map buckets. + */ + +/* SIMD support */ +#ifdef __AVX2__ +#include +#endif + +#ifdef __SSE2__ +#include +#endif + + +#ifdef _MSC_VER +#define SML_THREAD_LOCAL __declspec(thread) +#define SML_PREFETCH(x) +#define SML_LIKELY(x) +#define SML_UNLIKELY(x) +#define SML_NOINLINE __declspec(noinline) +#define SML_ALWAYS_INLINE __forceinline +#else +#define SML_THREAD_LOCAL __thread +#define SML_PREFETCH(x) __builtin_prefetch(x) +#define SML_LIKELY(x) __builtin_expect((x),1) +#define SML_UNLIKELY(x) __builtin_expect((x),0) +#define SML_NOINLINE __attribute__ ((noinline)) +#define SML_ALWAYS_INLINE __attribute__ ((always_inline)) +#endif + +/* 32 byte = 256 bits = (8 * 32bit) optimized for AVX2 */ +#define SML_LANE_SPAN 8 + +/** Grouped together keys-values to support SIMD more this way (also became a single cache line this way) */ +struct simd_map_lane { + uint32_t keys[SML_LANE_SPAN]; + uint32_t values[SML_LANE_SPAN]; +}; +typedef struct simd_map_elem simd_map_elem; + +/** Returns the last value of the lane - usable to see if this lane is fully filled with data or not! */ +static inline SML_ALWAYS_INLINE uint32_t simd_map_lane_last_value(simd_map_lane *map_lane) { + return map_lane->values[SML_LANE_SPAN - 1]; +} + +/** + * Returns if this key is stored in the given map LANE or not - returns NULL if not found. + * + * Rem.: The lane_begin and lane_next_begin parameters are used for reentrant multisearch. + * + * @param map_lane The lane to find in. + * @param key The key to search for. + * @param lane_modulo When non-zero, the lane only searched until this index. Zero means all remaining elements. (mod lane length) + * @param lane_begin The lane is searched from this location(find_all). If full lane / lane prefix is needed, this should be 0. + * @param lane_next_begin This pointer will be filled on non-NULL retvals with the incremented in-lane index (with % modulus). + * @returns NULL when not found, otherwise pointer to the stored value for the key. + */ +static inline SML_ALWAYS_INLINE uint32_t *simd_map_lane_find( + simd_map_lane *map_lane, + uint32_t key, + int lane_modulo, + int lane_begin, + int *lane_next_begin) { + + uint32_t *keys = map_lane->keys; + uint32_t *values = map_lane->values; + + /* Hopefully can get optimized out for the common case bc inlining */ + if(lane_modulo == 0) { +#ifdef __AVX2__ + /* Prepare an AVX 256bit search register: 8 uint32_t */ + __m256i sreg = _mm256_set1_epi32(key); + /* The tipp register: 8 uint32_t */ + __m256i treg = _mm256_load_si256((__m256i *) keys); + /* Check equality and return proper tip address for first found */ + __m256i m = _mm256_cmpeq_epi32(sreg, treg); /* Needs AVX2 */ + /* The 's' means "single" (float precision), and mask will have [0..7] bits set! */ + uint32_t mask = (uint32_t) _mm256_movemask_ps((__m256) m); + if(SML_UNLIKELY(mask != 0)) { + /* 00000000 00000000 00000000 01000100 -> 6 */ + int i = (31 - __builtin_clz(mask)); + uint32_t *ptr = &values[i]; + if(SML_LIKELY(lane_begin == 0)) { + /* Fast-path: Only one match per lane OR first matching in lane for this find/find_all */ + *lane_next_begin = (i + 1) % SML_LANE_SPAN; + return ptr; + } else { + /* We did a find_all(..) AND there is more than one match in the lane + * and its not first find_all(..) on the lane in question... + * + * This might be suboptimal, but not so bad: + * + * - This at this point will search the smaller array scalar-way + * - Which sounds good - BUT! + * - This means all lanes with more than 1 data are scalar search + * - And not only that, but first simd-searched, later scalar... + * + * I guess its fine as it should happen statistically rarely anyways + */ + goto non_simd_modulo; + } + } + + return NULL; +#endif +#ifdef __SSE2__ +#ifndef __AVX2__ +#ifndef __AVX512__ + /* Prepare an SSE2 128bit search register: 4 uint32_t */ + __m128i sreg = _mm_set1_epi32(key); + /* TODO: Implement */ +#endif /* AVX512 */ +#endif /* AVX2 */ +#endif /* SSE2 */ + /* Regular integer code - should have good ILP and cache locality patterns anyways */ + /** Pretty hopeful this can get more easily unrolled / autovectorized */ + for(int i = lane_begin; i < SML_LANE_SPAN; ++i) { + if(SML_UNLIKELY(keys[i] == key)) { + uint32_t *ptr = &values[i]; + *lane_next_begin = (i + 1) % SML_LANE_SPAN; + return ptr; + } + } + + return NULL; + } else { +non_simd_modulo: + for(int i = lane_begin; i < lane_modulo; ++i) { + if(SML_UNLIKELY(keys[i] == key)) { + uint32_t *ptr = &values[i]; + *lane_next_begin = (i + 1) % SML_LANE_SPAN; + return ptr; + } + } + + return NULL; + } +} + +/** Returns the key location for a given value location */ +static inline SM_ALWAYS_INLINE uint32_t *simd_map_lane_key_location(uint32_t *value_location) { + return value_location -= SML_LANE_SPAN; +} + +#endif /* SIMD_MAP_LANE_H */