#ifndef SIMD_MAP_LANE_H #define SIMD_MAP_LANE_H /* * Building blocks for SIMD-optimized (hash-)map buckets. */ /* SIMD support */ #ifdef __AVX2__ #include #endif #ifdef __SSE2__ #include #endif #ifdef _MSC_VER #define SML_THREAD_LOCAL __declspec(thread) #define SML_PREFETCH(x) #define SML_LIKELY(x) #define SML_UNLIKELY(x) #define SML_NOINLINE __declspec(noinline) #define SML_ALWAYS_INLINE __forceinline #else #define SML_THREAD_LOCAL __thread #define SML_PREFETCH(x) __builtin_prefetch(x) #define SML_LIKELY(x) __builtin_expect((x),1) #define SML_UNLIKELY(x) __builtin_expect((x),0) #define SML_NOINLINE __attribute__ ((noinline)) #define SML_ALWAYS_INLINE __attribute__ ((always_inline)) #endif /* 32 byte = 256 bits = (8 * 32bit) optimized for AVX2 */ #define SML_LANE_SPAN 8 /** Grouped together keys-values to support SIMD more this way (also became a single cache line this way) */ struct simd_map_lane { uint32_t keys[SML_LANE_SPAN]; uint32_t values[SML_LANE_SPAN]; }; typedef struct simd_map_elem simd_map_elem; /** Returns the last value of the lane - usable to see if this lane is fully filled with data or not! */ static inline SML_ALWAYS_INLINE uint32_t simd_map_lane_last_value(simd_map_lane *map_lane) { return map_lane->values[SML_LANE_SPAN - 1]; } /** * Returns if this key is stored in the given map LANE or not - returns NULL if not found. * * Rem.: The lane_begin and lane_next_begin parameters are used for reentrant multisearch. * * @param map_lane The lane to find in. * @param key The key to search for. * @param lane_modulo When non-zero, the lane only searched until this index. Zero means all remaining elements. (mod lane length) * @param lane_begin The lane is searched from this location(find_all). If full lane / lane prefix is needed, this should be 0. * @param lane_next_begin This pointer will be filled on non-NULL retvals with the incremented in-lane index (with % modulus). * @returns NULL when not found, otherwise pointer to the stored value for the key. */ static inline SML_ALWAYS_INLINE uint32_t *simd_map_lane_find( simd_map_lane *map_lane, uint32_t key, int lane_modulo, int lane_begin, int *lane_next_begin) { uint32_t *keys = map_lane->keys; uint32_t *values = map_lane->values; /* Hopefully can get optimized out for the common case bc inlining */ if(lane_modulo == 0) { #ifdef __AVX2__ /* Prepare an AVX 256bit search register: 8 uint32_t */ __m256i sreg = _mm256_set1_epi32(key); /* The tipp register: 8 uint32_t */ __m256i treg = _mm256_load_si256((__m256i *) keys); /* Check equality and return proper tip address for first found */ __m256i m = _mm256_cmpeq_epi32(sreg, treg); /* Needs AVX2 */ /* The 's' means "single" (float precision), and mask will have [0..7] bits set! */ uint32_t mask = (uint32_t) _mm256_movemask_ps((__m256) m); if(SML_UNLIKELY(mask != 0)) { /* 00000000 00000000 00000000 01000100 -> 6 */ int i = (31 - __builtin_clz(mask)); uint32_t *ptr = &values[i]; if(SML_LIKELY(lane_begin == 0)) { /* Fast-path: Only one match per lane OR first matching in lane for this find/find_all */ *lane_next_begin = (i + 1) % SML_LANE_SPAN; return ptr; } else { /* We did a find_all(..) AND there is more than one match in the lane * and its not first find_all(..) on the lane in question... * * This might be suboptimal, but not so bad: * * - This at this point will search the smaller array scalar-way * - Which sounds good - BUT! * - This means all lanes with more than 1 data are scalar search * - And not only that, but first simd-searched, later scalar... * * I guess its fine as it should happen statistically rarely anyways */ goto non_simd_modulo; } } return NULL; #endif #ifdef __SSE2__ #ifndef __AVX2__ #ifndef __AVX512__ /* Prepare an SSE2 128bit search register: 4 uint32_t */ __m128i sreg = _mm_set1_epi32(key); /* TODO: Implement */ #endif /* AVX512 */ #endif /* AVX2 */ #endif /* SSE2 */ /* Regular integer code - should have good ILP and cache locality patterns anyways */ /** Pretty hopeful this can get more easily unrolled / autovectorized */ for(int i = lane_begin; i < SML_LANE_SPAN; ++i) { if(SML_UNLIKELY(keys[i] == key)) { uint32_t *ptr = &values[i]; *lane_next_begin = (i + 1) % SML_LANE_SPAN; return ptr; } } return NULL; } else { non_simd_modulo: for(int i = lane_begin; i < lane_modulo; ++i) { if(SML_UNLIKELY(keys[i] == key)) { uint32_t *ptr = &values[i]; *lane_next_begin = (i + 1) % SML_LANE_SPAN; return ptr; } } return NULL; } } /** Returns the key location for a given value location */ static inline SM_ALWAYS_INLINE uint32_t *simd_map_lane_key_location(uint32_t *value_location) { return value_location -= SML_LANE_SPAN; } #endif /* SIMD_MAP_LANE_H */