From 41988a0deefa3c9f0af210bd0bd937421a2cba5a Mon Sep 17 00:00:00 2001
From: Richard Thier
Date: Wed, 23 Oct 2024 14:20:56 +0200
Subject: [PATCH] simd_map findall fix + buggy avx2 implementation

---
 main.cpp   |  6 ++++--
 simd_map.h | 61 +++++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 55 insertions(+), 12 deletions(-)

diff --git a/main.cpp b/main.cpp
index 43d2d3a..b4ad4de 100644
--- a/main.cpp
+++ b/main.cpp
@@ -198,15 +198,17 @@ void test_simd_map_basics() {
 
     /* Test multimap operations */
     assert(simd_map_multi_set(&smap, 42, 42) != 0);
+    assert(simd_map_multi_set(&smap, 42, 43) != 0);
     assert(simd_map_find(&smap, 42) != NULL);
     assert(*simd_map_find(&smap, 42) != 42);
     simd_map_find_res res1 = simd_map_find_all(&smap, 42, simd_map_find_all_begin());
     assert(res1.value_location != NULL);
-    assert(*(res1.value_location) != 42);
+    assert(*(res1.value_location) == (uint32_t)(cnt - 42));
     simd_map_find_res res2 = simd_map_find_all(&smap, 42, res1);
     assert(*(res2.value_location) == 42);
     simd_map_find_res res3 = simd_map_find_all(&smap, 42, res2);
-    assert(res3.value_location == NULL);
+    assert(res3.value_location != NULL);
+    assert(*(res3.value_location) == 43);
 
     /* Test filled-free */
     simd_map_free(&smap);
diff --git a/simd_map.h b/simd_map.h
index acb8ab1..a63c2af 100644
--- a/simd_map.h
+++ b/simd_map.h
@@ -57,8 +57,8 @@ static inline SM_ALWAYS_INLINE simd_map simd_map_create() {
     ret.a = newarena((ptrdiff_t)1 << 33);
     ret.end = 0;
     ret.usage_end = 0;
-    ret.lanes = (simd_map_lane*)(((auint64*) aralloc(&(ret.a), sizeof(uint64_t), sizeof(simd_map_lane), 1)) /* aligned! */
-            + 1); /* First really addressible thing */
+    ret.lanes = (simd_map_lane*)(aralloc(&(ret.a), sizeof(simd_map_lane), sizeof(simd_map_lane), 1)) /* aligned! */
+            + 1; /* First really addressible thing */
     ret.lane_modulo = 0;
     return ret;
 }
@@ -90,15 +90,56 @@ static inline SM_ALWAYS_INLINE uint32_t *simd_map_lane_find(
     uint32_t *keys = map_lane->keys;
     uint32_t *values = map_lane->values;
 
-    if(SM_UNLIKELY(lane_modulo)) goto non_simd_modulo;
+    /* Hopefully this can get optimized out for the common case because of inlining */
+    if(lane_modulo == 0) {
 #ifdef __AVX2__
-    /* TODO */
+        /* TODO */
+        /* Prepare an AVX 256bit search register: 8 uint32_t */
+        __m256i sreg = _mm256_set1_epi32(key);
+        /* The tip register: 8 uint32_t */
+        __m256i treg = _mm256_load_si256((__m256i *) keys);
+        /* Check equality and return the proper tip address for the first match */
+        __m256i m = _mm256_cmpeq_epi32(sreg, treg); /* Needs AVX2 */
+        /* The 's' means "single" (float precision), and the mask will only have bits [0..7] set! */
+        uint32_t mask = (uint32_t) _mm256_movemask_ps((__m256) m);
+        if(SM_UNLIKELY(mask != 0)) {
+            int ipc = __builtin_popcount(mask);
+            /* 00000000 00000000 00000000 01000100 -> 6 */
+            int i = (31 - __builtin_clz(mask));
+            uint32_t *ptr = &values[i];
+            if(SM_LIKELY(ipc == 1) || i >= lane_begin) {
+                /* Only one match in the lane OR the first match for find/find_all */
+                *lane_next_begin = (i + 1) % SM_LANE_SPAN;
+                return ptr;
+            } else {
+                /* We did a find_all(..) AND there is more than one match in the lane
+                 *
+                 * This might be suboptimal, but not so bad:
+                 *
+                 * - At this point we search the smaller array the scalar way
+                 * - Which sounds good - BUT!
+                 * - This means every lane with more than one match is searched scalar
+                 * - And not only that: it is first simd-searched, then searched scalar...
+                 *
+                 * I guess it's fine as this should be statistically rare anyways
+                 */
+                /* TODO: Can this be solved more optimally by a specialized function? */
+                goto non_simd_modulo;
+            }
+        }
+
+        return NULL;
 #endif
 #ifdef __SSE2__
-    /* TODO */
-#endif
-    /* Regular integer code - should have good ILP and cache locality patterns anyways */
-    if(lane_modulo == 0) {
+#ifndef __AVX2__
+#ifndef __AVX512__
+        /* Prepare an SSE2 128bit search register: 4 uint32_t */
+        __m128i sreg = _mm_set1_epi32(key);
+        /* TODO: Implement */
+#endif /* AVX512 */
+#endif /* AVX2 */
+#endif /* SSE2 */
+        /* Regular integer code - should have good ILP and cache locality patterns anyways */
         /** Pretty hopeful this can get more easily unrolled / autovectorized */
         for(int i = lane_begin; i < SM_LANE_SPAN; ++i) {
             if(SM_UNLIKELY(keys[i] == key)) {
@@ -169,7 +210,7 @@ static inline SM_ALWAYS_INLINE simd_map_find_res simd_map_find_all(simd_map *map
 
         if(found) {
             ret.value_location = found;
-            ret.lane_next = i + (ret.lane_next_begin != 0);
+            ret.lane_next = i + (ret.lane_next_begin == 0);
             return ret;
         }
     }
@@ -188,7 +229,7 @@ static inline SM_ALWAYS_INLINE simd_map_find_res simd_map_find_all(simd_map *map
 
         if(found) {
            ret.value_location = found;
-            ret.lane_next = (map->usage_end - 1) + (ret.lane_next_begin != 0);
+            ret.lane_next = (map->usage_end - 1) + (ret.lane_next_begin == 0);
             return ret;
         }
     }
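
Note (not part of the patch): the mask arithmetic in the new AVX2 branch can be checked in isolation with the standalone sketch below. The lane_keys array and the small main() harness are hypothetical scaffolding, not code from simd_map.h, and assume GCC/Clang with -mavx2. It runs the same _mm256_cmpeq_epi32 / movemask sequence and shows that __builtin_popcount(mask) counts the matches in a lane while 31 - __builtin_clz(mask) picks the highest matching slot, reproducing the "01000100 -> 6" example from the comment above.

/* Standalone illustration of the AVX2 mask handling used above.
 * lane_keys and this main() are made-up test scaffolding; build with
 * e.g. gcc -mavx2 -O2 on an AVX2-capable machine. */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* One 8-wide lane with the key 42 stored in slots 2 and 6 */
    uint32_t lane_keys[8] __attribute__((aligned(32))) =
        { 7, 9, 42, 13, 21, 5, 42, 99 };

    __m256i sreg = _mm256_set1_epi32(42);                    /* search register */
    __m256i treg = _mm256_load_si256((__m256i *) lane_keys); /* the lane being scanned */
    __m256i m = _mm256_cmpeq_epi32(sreg, treg);              /* per-slot equality */
    /* Same conversion as the (__m256) cast in the patch, spelled as an intrinsic */
    uint32_t mask = (uint32_t) _mm256_movemask_ps(_mm256_castsi256_ps(m));

    /* mask is now binary 01000100 (0x44): two matches, the highest one in slot 6 */
    printf("mask=0x%02x matches=%d highest_slot=%d\n",
           mask, __builtin_popcount(mask), 31 - __builtin_clz(mask));
    return 0;
}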