simd_map findall fix + buggy avx2 implementation
This commit is contained in:
parent
ac62753820
commit
41988a0dee
6
main.cpp
6
main.cpp
@ -198,15 +198,17 @@ void test_simd_map_basics() {
|
||||
|
||||
/* Test multimap operations */
|
||||
assert(simd_map_multi_set(&smap, 42, 42) != 0);
|
||||
assert(simd_map_multi_set(&smap, 42, 43) != 0);
|
||||
assert(simd_map_find(&smap, 42) != NULL);
|
||||
assert(*simd_map_find(&smap, 42) != 42);
|
||||
simd_map_find_res res1 = simd_map_find_all(&smap, 42, simd_map_find_all_begin());
|
||||
assert(res1.value_location != NULL);
|
||||
assert(*(res1.value_location) != 42);
|
||||
assert(*(res1.value_location) == (uint32_t)(cnt - 42));
|
||||
simd_map_find_res res2 = simd_map_find_all(&smap, 42, res1);
|
||||
assert(*(res2.value_location) == 42);
|
||||
simd_map_find_res res3 = simd_map_find_all(&smap, 42, res2);
|
||||
assert(res3.value_location == NULL);
|
||||
assert(res3.value_location != NULL);
|
||||
assert(*(res3.value_location) == 43);
|
||||
|
||||
/* Test filled-free */
|
||||
simd_map_free(&smap);
|
||||
|
61
simd_map.h
61
simd_map.h
@ -57,8 +57,8 @@ static inline SM_ALWAYS_INLINE simd_map simd_map_create() {
|
||||
ret.a = newarena((ptrdiff_t)1 << 33);
|
||||
ret.end = 0;
|
||||
ret.usage_end = 0;
|
||||
ret.lanes = (simd_map_lane*)(((auint64*) aralloc(&(ret.a), sizeof(uint64_t), sizeof(simd_map_lane), 1)) /* aligned! */
|
||||
+ 1); /* First really addressible thing */
|
||||
ret.lanes = (simd_map_lane*)(aralloc(&(ret.a), sizeof(simd_map_lane), sizeof(simd_map_lane), 1)) /* aligned! */
|
||||
+ 1; /* First really addressible thing */
|
||||
ret.lane_modulo = 0;
|
||||
return ret;
|
||||
}
|
||||
@ -90,15 +90,56 @@ static inline SM_ALWAYS_INLINE uint32_t *simd_map_lane_find(
|
||||
uint32_t *keys = map_lane->keys;
|
||||
uint32_t *values = map_lane->values;
|
||||
|
||||
if(SM_UNLIKELY(lane_modulo)) goto non_simd_modulo;
|
||||
/* Hopefully can get optimized out for the common case bc inlining */
|
||||
if(lane_modulo == 0) {
|
||||
#ifdef __AVX2__
|
||||
/* TODO */
|
||||
/* TODO */
|
||||
/* Prepare an AVX 256bit search register: 8 uint32_t */
|
||||
__m256i sreg = _mm256_set1_epi32(key);
|
||||
/* The tipp register: 8 uint32_t */
|
||||
__m256i treg = _mm256_load_si256((__m256i *) keys);
|
||||
/* Check equality and return proper tip address for first found */
|
||||
__m256i m = _mm256_cmpeq_epi32(sreg, treg); /* Needs AVX2 */
|
||||
/* The 's' means "single" (float precision), and mask will have [0..7] bits set! */
|
||||
uint32_t mask = (uint32_t) _mm256_movemask_ps((__m256) m);
|
||||
if(SM_UNLIKELY(mask != 0)) {
|
||||
int ipc = __builtin_popcount(mask);
|
||||
/* 00000000 00000000 00000000 01000100 -> 6 */
|
||||
int i = (31 - __builtin_clz(mask));
|
||||
uint32_t *ptr = &values[i];
|
||||
if(SM_LIKELY(ipc == 1) || i >= lane_begin) {
|
||||
/* Only one match in the lane OR first matching in find/find_all */
|
||||
*lane_next_begin = (i + 1) % SM_LANE_SPAN;
|
||||
return ptr;
|
||||
} else {
|
||||
/* We did a find_all(..) AND there is more than one match in the lane
|
||||
*
|
||||
* This might be suboptimal, but not so bad:
|
||||
*
|
||||
* - This at this point will search the smaller array scalar-way
|
||||
* - Which sounds good - BUT!
|
||||
* - This means all lanes with more than 1 data are scalar search
|
||||
* - And not only that, but first simd-searched, later scalar...
|
||||
*
|
||||
* I guess its fine as it should happen statistically rarely anyways
|
||||
*/
|
||||
/* TODO: Can this be solved more optimal by specialized function? */
|
||||
goto non_simd_modulo;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
#endif
|
||||
#ifdef __SSE2__
|
||||
/* TODO */
|
||||
#endif
|
||||
/* Regular integer code - should have good ILP and cache locality patterns anyways */
|
||||
if(lane_modulo == 0) {
|
||||
#ifndef __AVX2__
|
||||
#ifndef __AVX512__
|
||||
/* Prepare an SSE2 128bit search register: 4 uint32_t */
|
||||
__m128i sreg = _mm_set1_epi32(key);
|
||||
/* TODO: Implement */
|
||||
#endif /* AVX512 */
|
||||
#endif /* AVX2 */
|
||||
#endif /* SSE2 */
|
||||
/* Regular integer code - should have good ILP and cache locality patterns anyways */
|
||||
/** Pretty hopeful this can get more easily unrolled / autovectorized */
|
||||
for(int i = lane_begin; i < SM_LANE_SPAN; ++i) {
|
||||
if(SM_UNLIKELY(keys[i] == key)) {
|
||||
@ -169,7 +210,7 @@ static inline SM_ALWAYS_INLINE simd_map_find_res simd_map_find_all(simd_map *map
|
||||
|
||||
if(found) {
|
||||
ret.value_location = found;
|
||||
ret.lane_next = i + (ret.lane_next_begin != 0);
|
||||
ret.lane_next = i + (ret.lane_next_begin == 0);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
@ -188,7 +229,7 @@ static inline SM_ALWAYS_INLINE simd_map_find_res simd_map_find_all(simd_map *map
|
||||
|
||||
if(found) {
|
||||
ret.value_location = found;
|
||||
ret.lane_next = (map->usage_end - 1) + (ret.lane_next_begin != 0);
|
||||
ret.lane_next = (map->usage_end - 1) + (ret.lane_next_begin == 0);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user