simd_map findall fix + buggy avx2 implementation

This commit is contained in:
Richard Thier 2024-10-23 14:20:56 +02:00
parent ac62753820
commit 41988a0dee
2 changed files with 55 additions and 12 deletions

View File

@ -198,15 +198,17 @@ void test_simd_map_basics() {
/* Test multimap operations */
assert(simd_map_multi_set(&smap, 42, 42) != 0);
assert(simd_map_multi_set(&smap, 42, 43) != 0);
assert(simd_map_find(&smap, 42) != NULL);
assert(*simd_map_find(&smap, 42) != 42);
simd_map_find_res res1 = simd_map_find_all(&smap, 42, simd_map_find_all_begin());
assert(res1.value_location != NULL);
assert(*(res1.value_location) != 42);
assert(*(res1.value_location) == (uint32_t)(cnt - 42));
simd_map_find_res res2 = simd_map_find_all(&smap, 42, res1);
assert(*(res2.value_location) == 42);
simd_map_find_res res3 = simd_map_find_all(&smap, 42, res2);
assert(res3.value_location == NULL);
assert(res3.value_location != NULL);
assert(*(res3.value_location) == 43);
/* Test filled-free */
simd_map_free(&smap);

View File

@ -57,8 +57,8 @@ static inline SM_ALWAYS_INLINE simd_map simd_map_create() {
ret.a = newarena((ptrdiff_t)1 << 33);
ret.end = 0;
ret.usage_end = 0;
ret.lanes = (simd_map_lane*)(((auint64*) aralloc(&(ret.a), sizeof(uint64_t), sizeof(simd_map_lane), 1)) /* aligned! */
+ 1); /* First really addressible thing */
ret.lanes = (simd_map_lane*)(aralloc(&(ret.a), sizeof(simd_map_lane), sizeof(simd_map_lane), 1)) /* aligned! */
+ 1; /* First really addressible thing */
ret.lane_modulo = 0;
return ret;
}
@ -90,15 +90,56 @@ static inline SM_ALWAYS_INLINE uint32_t *simd_map_lane_find(
uint32_t *keys = map_lane->keys;
uint32_t *values = map_lane->values;
if(SM_UNLIKELY(lane_modulo)) goto non_simd_modulo;
/* Hopefully can get optimized out for the common case bc inlining */
if(lane_modulo == 0) {
#ifdef __AVX2__
/* TODO */
/* TODO */
/* Prepare an AVX 256bit search register: 8 uint32_t */
__m256i sreg = _mm256_set1_epi32(key);
/* The tipp register: 8 uint32_t */
__m256i treg = _mm256_load_si256((__m256i *) keys);
/* Check equality and return proper tip address for first found */
__m256i m = _mm256_cmpeq_epi32(sreg, treg); /* Needs AVX2 */
/* The 's' means "single" (float precision), and mask will have [0..7] bits set! */
uint32_t mask = (uint32_t) _mm256_movemask_ps((__m256) m);
if(SM_UNLIKELY(mask != 0)) {
int ipc = __builtin_popcount(mask);
/* 00000000 00000000 00000000 01000100 -> 6 */
int i = (31 - __builtin_clz(mask));
uint32_t *ptr = &values[i];
if(SM_LIKELY(ipc == 1) || i >= lane_begin) {
/* Only one match in the lane OR first matching in find/find_all */
*lane_next_begin = (i + 1) % SM_LANE_SPAN;
return ptr;
} else {
/* We did a find_all(..) AND there is more than one match in the lane
*
* This might be suboptimal, but not so bad:
*
* - This at this point will search the smaller array scalar-way
* - Which sounds good - BUT!
* - This means all lanes with more than 1 data are scalar search
* - And not only that, but first simd-searched, later scalar...
*
* I guess its fine as it should happen statistically rarely anyways
*/
/* TODO: Can this be solved more optimal by specialized function? */
goto non_simd_modulo;
}
}
return NULL;
#endif
#ifdef __SSE2__
/* TODO */
#endif
/* Regular integer code - should have good ILP and cache locality patterns anyways */
if(lane_modulo == 0) {
#ifndef __AVX2__
#ifndef __AVX512__
/* Prepare an SSE2 128bit search register: 4 uint32_t */
__m128i sreg = _mm_set1_epi32(key);
/* TODO: Implement */
#endif /* AVX512 */
#endif /* AVX2 */
#endif /* SSE2 */
/* Regular integer code - should have good ILP and cache locality patterns anyways */
/** Pretty hopeful this can get more easily unrolled / autovectorized */
for(int i = lane_begin; i < SM_LANE_SPAN; ++i) {
if(SM_UNLIKELY(keys[i] == key)) {
@ -169,7 +210,7 @@ static inline SM_ALWAYS_INLINE simd_map_find_res simd_map_find_all(simd_map *map
if(found) {
ret.value_location = found;
ret.lane_next = i + (ret.lane_next_begin != 0);
ret.lane_next = i + (ret.lane_next_begin == 0);
return ret;
}
}
@ -188,7 +229,7 @@ static inline SM_ALWAYS_INLINE simd_map_find_res simd_map_find_all(simd_map *map
if(found) {
ret.value_location = found;
ret.lane_next = (map->usage_end - 1) + (ret.lane_next_begin != 0);
ret.lane_next = (map->usage_end - 1) + (ret.lane_next_begin == 0);
return ret;
}
}