simap/simd_map_lane.h

148 lines
4.8 KiB
C

#ifndef SIMD_MAP_LANE_H
#define SIMD_MAP_LANE_H
/*
* Building blocks for SIMD-optimized (hash-)map buckets.
*/
/* SIMD support */
#ifdef __AVX2__
#include <immintrin.h>
#endif
#ifdef __SSE2__
#include <immintrin.h>
#endif
#ifdef _MSC_VER
#define SML_THREAD_LOCAL __declspec(thread)
#define SML_PREFETCH(x)
#define SML_LIKELY(x)
#define SML_UNLIKELY(x)
#define SML_NOINLINE __declspec(noinline)
#define SML_ALWAYS_INLINE __forceinline
#else
#define SML_THREAD_LOCAL __thread
#define SML_PREFETCH(x) __builtin_prefetch(x)
#define SML_LIKELY(x) __builtin_expect((x),1)
#define SML_UNLIKELY(x) __builtin_expect((x),0)
#define SML_NOINLINE __attribute__ ((noinline))
#define SML_ALWAYS_INLINE __attribute__ ((always_inline))
#endif
/* 32 byte = 256 bits = (8 * 32bit) optimized for AVX2 */
#define SML_LANE_SPAN 8
/** Grouped together keys-values to support SIMD more this way (also became a single cache line this way) */
struct simd_map_lane {
uint32_t keys[SML_LANE_SPAN];
uint32_t values[SML_LANE_SPAN];
};
typedef struct simd_map_elem simd_map_elem;
/** Returns the last value of the lane - usable to see if this lane is fully filled with data or not! */
static inline SML_ALWAYS_INLINE uint32_t simd_map_lane_last_value(simd_map_lane *map_lane) {
return map_lane->values[SML_LANE_SPAN - 1];
}
/**
* Returns if this key is stored in the given map LANE or not - returns NULL if not found.
*
* Rem.: The lane_begin and lane_next_begin parameters are used for reentrant multisearch.
*
* @param map_lane The lane to find in.
* @param key The key to search for.
* @param lane_modulo When non-zero, the lane only searched until this index. Zero means all remaining elements. (mod lane length)
* @param lane_begin The lane is searched from this location(find_all). If full lane / lane prefix is needed, this should be 0.
* @param lane_next_begin This pointer will be filled on non-NULL retvals with the incremented in-lane index (with % modulus).
* @returns NULL when not found, otherwise pointer to the stored value for the key.
*/
static inline SML_ALWAYS_INLINE uint32_t *simd_map_lane_find(
simd_map_lane *map_lane,
uint32_t key,
int lane_modulo,
int lane_begin,
int *lane_next_begin) {
uint32_t *keys = map_lane->keys;
uint32_t *values = map_lane->values;
/* Hopefully can get optimized out for the common case bc inlining */
if(lane_modulo == 0) {
#ifdef __AVX2__
/* Prepare an AVX 256bit search register: 8 uint32_t */
__m256i sreg = _mm256_set1_epi32(key);
/* The tipp register: 8 uint32_t */
__m256i treg = _mm256_load_si256((__m256i *) keys);
/* Check equality and return proper tip address for first found */
__m256i m = _mm256_cmpeq_epi32(sreg, treg); /* Needs AVX2 */
/* The 's' means "single" (float precision), and mask will have [0..7] bits set! */
uint32_t mask = (uint32_t) _mm256_movemask_ps((__m256) m);
if(SML_UNLIKELY(mask != 0)) {
/* 00000000 00000000 00000000 01000100 -> 6 */
int i = (31 - __builtin_clz(mask));
uint32_t *ptr = &values[i];
if(SML_LIKELY(lane_begin == 0)) {
/* Fast-path: Only one match per lane OR first matching in lane for this find/find_all */
*lane_next_begin = (i + 1) % SML_LANE_SPAN;
return ptr;
} else {
/* We did a find_all(..) AND there is more than one match in the lane
* and its not first find_all(..) on the lane in question...
*
* This might be suboptimal, but not so bad:
*
* - This at this point will search the smaller array scalar-way
* - Which sounds good - BUT!
* - This means all lanes with more than 1 data are scalar search
* - And not only that, but first simd-searched, later scalar...
*
* I guess its fine as it should happen statistically rarely anyways
*/
goto non_simd_modulo;
}
}
return NULL;
#endif
#ifdef __SSE2__
#ifndef __AVX2__
#ifndef __AVX512__
/* Prepare an SSE2 128bit search register: 4 uint32_t */
__m128i sreg = _mm_set1_epi32(key);
/* TODO: Implement */
#endif /* AVX512 */
#endif /* AVX2 */
#endif /* SSE2 */
/* Regular integer code - should have good ILP and cache locality patterns anyways */
/** Pretty hopeful this can get more easily unrolled / autovectorized */
for(int i = lane_begin; i < SML_LANE_SPAN; ++i) {
if(SML_UNLIKELY(keys[i] == key)) {
uint32_t *ptr = &values[i];
*lane_next_begin = (i + 1) % SML_LANE_SPAN;
return ptr;
}
}
return NULL;
} else {
non_simd_modulo:
for(int i = lane_begin; i < lane_modulo; ++i) {
if(SML_UNLIKELY(keys[i] == key)) {
uint32_t *ptr = &values[i];
*lane_next_begin = (i + 1) % SML_LANE_SPAN;
return ptr;
}
}
return NULL;
}
}
/** Returns the key location for a given value location */
static inline SM_ALWAYS_INLINE uint32_t *simd_map_lane_key_location(uint32_t *value_location) {
return value_location -= SML_LANE_SPAN;
}
#endif /* SIMD_MAP_LANE_H */