refactor: pulled out simd_map_lane.h because its useful on its own
This commit is contained in:
parent
c023882866
commit
182fb69e18
131
simd_map.h
131
simd_map.h
@ -5,15 +5,7 @@
|
|||||||
#include <stdint.h> /* uint32_t, ... */
|
#include <stdint.h> /* uint32_t, ... */
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include "arena.h/arena.h"
|
#include "arena.h/arena.h"
|
||||||
|
#include "simd_map_lane.h"
|
||||||
/* SIMD support */
|
|
||||||
#ifdef __AVX2__
|
|
||||||
#include <immintrin.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __SSE2__
|
|
||||||
#include <immintrin.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* I have no idea what MSVC has instead... */
|
/* I have no idea what MSVC has instead... */
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
@ -32,22 +24,12 @@
|
|||||||
#define SM_ALWAYS_INLINE __attribute__ ((always_inline))
|
#define SM_ALWAYS_INLINE __attribute__ ((always_inline))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* 32 byte = 256 bits = (8 * 32bit) optimized for AVX2 */
|
|
||||||
#define SM_LANE_SPAN 8
|
|
||||||
|
|
||||||
/** Grouped together keys-values to support SIMD more this way (also became a single cache line this way) */
|
|
||||||
struct simd_map_lane {
|
|
||||||
uint32_t keys[SM_LANE_SPAN];
|
|
||||||
uint32_t values[SM_LANE_SPAN];
|
|
||||||
};
|
|
||||||
typedef struct simd_map_elem simd_map_elem;
|
|
||||||
|
|
||||||
struct simd_map {
|
struct simd_map {
|
||||||
arena a;
|
arena a;
|
||||||
simd_map_lane *lanes;
|
simd_map_lane *lanes;
|
||||||
uint32_t end; /* in lanes */
|
uint32_t end; /* in lanes */
|
||||||
uint32_t usage_end; /* in lanes!!! */
|
uint32_t usage_end; /* in lanes!!! */
|
||||||
int lane_modulo; /* [0..SM_LANE_SPAN) */
|
int lane_modulo; /* [0..SML_LANE_SPAN) */
|
||||||
};
|
};
|
||||||
typedef struct simd_map simd_map;
|
typedef struct simd_map simd_map;
|
||||||
|
|
||||||
@ -71,100 +53,6 @@ static inline SM_ALWAYS_INLINE char simd_map_free(simd_map *map) {
|
|||||||
return freearena(&(map->a));
|
return freearena(&(map->a));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns if this key is stored in the given map LANE or not - returns NULL if not found.
|
|
||||||
*
|
|
||||||
* Rem.: The lane_begin and lane_next_begin parameters are used for reentrant multisearch.
|
|
||||||
*
|
|
||||||
* @param map_lane The lane to find in.
|
|
||||||
* @param key The key to search for.
|
|
||||||
* @param lane_modulo When non-zero, the lane only searched until this index. Zero means all remaining elements. (mod lane length)
|
|
||||||
* @param lane_begin The lane is searched from this location(find_all). If full lane / lane prefix is needed, this should be 0.
|
|
||||||
* @param lane_next_begin This pointer will be filled on non-NULL retvals with the incremented in-lane index (with % modulus).
|
|
||||||
* @returns NULL when not found, otherwise pointer to the stored value for the key.
|
|
||||||
*/
|
|
||||||
static inline SM_ALWAYS_INLINE uint32_t *simd_map_lane_find(
|
|
||||||
simd_map_lane *map_lane,
|
|
||||||
uint32_t key,
|
|
||||||
int lane_modulo,
|
|
||||||
int lane_begin,
|
|
||||||
int *lane_next_begin) {
|
|
||||||
|
|
||||||
uint32_t *keys = map_lane->keys;
|
|
||||||
uint32_t *values = map_lane->values;
|
|
||||||
|
|
||||||
/* Hopefully can get optimized out for the common case bc inlining */
|
|
||||||
if(lane_modulo == 0) {
|
|
||||||
#ifdef __AVX2__
|
|
||||||
/* Prepare an AVX 256bit search register: 8 uint32_t */
|
|
||||||
__m256i sreg = _mm256_set1_epi32(key);
|
|
||||||
/* The tipp register: 8 uint32_t */
|
|
||||||
__m256i treg = _mm256_load_si256((__m256i *) keys);
|
|
||||||
/* Check equality and return proper tip address for first found */
|
|
||||||
__m256i m = _mm256_cmpeq_epi32(sreg, treg); /* Needs AVX2 */
|
|
||||||
/* The 's' means "single" (float precision), and mask will have [0..7] bits set! */
|
|
||||||
uint32_t mask = (uint32_t) _mm256_movemask_ps((__m256) m);
|
|
||||||
if(SM_UNLIKELY(mask != 0)) {
|
|
||||||
/* 00000000 00000000 00000000 01000100 -> 6 */
|
|
||||||
int i = (31 - __builtin_clz(mask));
|
|
||||||
uint32_t *ptr = &values[i];
|
|
||||||
if(SM_LIKELY(lane_begin == 0)) {
|
|
||||||
/* Fast-path: Only one match per lane OR first matching in lane for this find/find_all */
|
|
||||||
*lane_next_begin = (i + 1) % SM_LANE_SPAN;
|
|
||||||
return ptr;
|
|
||||||
} else {
|
|
||||||
/* We did a find_all(..) AND there is more than one match in the lane
|
|
||||||
* and its not first find_all(..) on the lane in question...
|
|
||||||
*
|
|
||||||
* This might be suboptimal, but not so bad:
|
|
||||||
*
|
|
||||||
* - This at this point will search the smaller array scalar-way
|
|
||||||
* - Which sounds good - BUT!
|
|
||||||
* - This means all lanes with more than 1 data are scalar search
|
|
||||||
* - And not only that, but first simd-searched, later scalar...
|
|
||||||
*
|
|
||||||
* I guess its fine as it should happen statistically rarely anyways
|
|
||||||
*/
|
|
||||||
goto non_simd_modulo;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return NULL;
|
|
||||||
#endif
|
|
||||||
#ifdef __SSE2__
|
|
||||||
#ifndef __AVX2__
|
|
||||||
#ifndef __AVX512__
|
|
||||||
/* Prepare an SSE2 128bit search register: 4 uint32_t */
|
|
||||||
__m128i sreg = _mm_set1_epi32(key);
|
|
||||||
/* TODO: Implement */
|
|
||||||
#endif /* AVX512 */
|
|
||||||
#endif /* AVX2 */
|
|
||||||
#endif /* SSE2 */
|
|
||||||
/* Regular integer code - should have good ILP and cache locality patterns anyways */
|
|
||||||
/** Pretty hopeful this can get more easily unrolled / autovectorized */
|
|
||||||
for(int i = lane_begin; i < SM_LANE_SPAN; ++i) {
|
|
||||||
if(SM_UNLIKELY(keys[i] == key)) {
|
|
||||||
uint32_t *ptr = &values[i];
|
|
||||||
*lane_next_begin = (i + 1) % SM_LANE_SPAN;
|
|
||||||
return ptr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return NULL;
|
|
||||||
} else {
|
|
||||||
non_simd_modulo:
|
|
||||||
for(int i = lane_begin; i < lane_modulo; ++i) {
|
|
||||||
if(SM_UNLIKELY(keys[i] == key)) {
|
|
||||||
uint32_t *ptr = &values[i];
|
|
||||||
*lane_next_begin = (i + 1) % SM_LANE_SPAN;
|
|
||||||
return ptr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** The result of the find_all(..) operation */
|
/** The result of the find_all(..) operation */
|
||||||
struct simd_map_find_res {
|
struct simd_map_find_res {
|
||||||
/** The found location - or NULL when the key was not found */
|
/** The found location - or NULL when the key was not found */
|
||||||
@ -284,7 +172,7 @@ static inline SM_ALWAYS_INLINE char simd_map_multi_set(simd_map *map, uint32_t k
|
|||||||
lane->values[map->lane_modulo] = value;
|
lane->values[map->lane_modulo] = value;
|
||||||
|
|
||||||
/* Update lane modulo */
|
/* Update lane modulo */
|
||||||
map->lane_modulo = (map->lane_modulo + 1) % SM_LANE_SPAN;
|
map->lane_modulo = (map->lane_modulo + 1) % SML_LANE_SPAN;
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -321,16 +209,11 @@ static inline SM_ALWAYS_INLINE char simd_map_is_empty(simd_map *map) {
|
|||||||
return (map->usage_end == 0);
|
return (map->usage_end == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the key location for a given value location */
|
|
||||||
static inline SM_ALWAYS_INLINE uint32_t *simd_map_key_location(uint32_t *value_location) {
|
|
||||||
return value_location -= SM_LANE_SPAN;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Returns the lastly inserted value's location in the map - you must ensure the map has elements! */
|
/** Returns the lastly inserted value's location in the map - you must ensure the map has elements! */
|
||||||
static inline SM_ALWAYS_INLINE uint32_t *simd_map_last_location(simd_map *map) {
|
static inline SM_ALWAYS_INLINE uint32_t *simd_map_last_location(simd_map *map) {
|
||||||
return (map->lane_modulo > 0) ?
|
return (map->lane_modulo > 0) ?
|
||||||
&(map->lanes[map->usage_end - 1].values[map->lane_modulo - 1]) :
|
&(map->lanes[map->usage_end - 1].values[map->lane_modulo - 1]) :
|
||||||
&(map->lanes[map->usage_end - 1].values[SM_LANE_SPAN - 1]);
|
&(map->lanes[map->usage_end - 1].values[SML_LANE_SPAN - 1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -346,9 +229,9 @@ static inline SM_ALWAYS_INLINE uint32_t *simd_map_last_location(simd_map *map) {
|
|||||||
*/
|
*/
|
||||||
static inline SM_ALWAYS_INLINE void simd_map_remove_ptr(simd_map *map, uint32_t *value_location) {
|
static inline SM_ALWAYS_INLINE void simd_map_remove_ptr(simd_map *map, uint32_t *value_location) {
|
||||||
/* Overwrite with the last key-value */
|
/* Overwrite with the last key-value */
|
||||||
uint32_t *key_location = simd_map_key_location(value_location);
|
uint32_t *key_location = simd_map_lane_key_location(value_location);
|
||||||
uint32_t *last_value_location = simd_map_last_location(map);
|
uint32_t *last_value_location = simd_map_last_location(map);
|
||||||
uint32_t *last_key_location = simd_map_key_location(last_value_location);
|
uint32_t *last_key_location = simd_map_lane_key_location(last_value_location);
|
||||||
*value_location = *last_value_location;
|
*value_location = *last_value_location;
|
||||||
*key_location = *last_key_location;
|
*key_location = *last_key_location;
|
||||||
|
|
||||||
@ -356,7 +239,7 @@ static inline SM_ALWAYS_INLINE void simd_map_remove_ptr(simd_map *map, uint32_t
|
|||||||
if(map->lane_modulo > 0) {
|
if(map->lane_modulo > 0) {
|
||||||
--(map->lane_modulo);
|
--(map->lane_modulo);
|
||||||
} else {
|
} else {
|
||||||
map->lane_modulo = SM_LANE_SPAN - 1;
|
map->lane_modulo = SML_LANE_SPAN - 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
147
simd_map_lane.h
Normal file
147
simd_map_lane.h
Normal file
@ -0,0 +1,147 @@
|
|||||||
|
#ifndef SIMD_MAP_LANE_H
|
||||||
|
#define SIMD_MAP_LANE_H
|
||||||
|
/*
|
||||||
|
* Building blocks for SIMD-optimized (hash-)map buckets.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* SIMD support */
|
||||||
|
#ifdef __AVX2__
|
||||||
|
#include <immintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __SSE2__
|
||||||
|
#include <immintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
#define SML_THREAD_LOCAL __declspec(thread)
|
||||||
|
#define SML_PREFETCH(x)
|
||||||
|
#define SML_LIKELY(x)
|
||||||
|
#define SML_UNLIKELY(x)
|
||||||
|
#define SML_NOINLINE __declspec(noinline)
|
||||||
|
#define SML_ALWAYS_INLINE __forceinline
|
||||||
|
#else
|
||||||
|
#define SML_THREAD_LOCAL __thread
|
||||||
|
#define SML_PREFETCH(x) __builtin_prefetch(x)
|
||||||
|
#define SML_LIKELY(x) __builtin_expect((x),1)
|
||||||
|
#define SML_UNLIKELY(x) __builtin_expect((x),0)
|
||||||
|
#define SML_NOINLINE __attribute__ ((noinline))
|
||||||
|
#define SML_ALWAYS_INLINE __attribute__ ((always_inline))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* 32 byte = 256 bits = (8 * 32bit) optimized for AVX2 */
|
||||||
|
#define SML_LANE_SPAN 8
|
||||||
|
|
||||||
|
/** Grouped together keys-values to support SIMD more this way (also became a single cache line this way) */
|
||||||
|
struct simd_map_lane {
|
||||||
|
uint32_t keys[SML_LANE_SPAN];
|
||||||
|
uint32_t values[SML_LANE_SPAN];
|
||||||
|
};
|
||||||
|
typedef struct simd_map_elem simd_map_elem;
|
||||||
|
|
||||||
|
/** Returns the last value of the lane - usable to see if this lane is fully filled with data or not! */
|
||||||
|
static inline SML_ALWAYS_INLINE uint32_t simd_map_lane_last_value(simd_map_lane *map_lane) {
|
||||||
|
return map_lane->values[SML_LANE_SPAN - 1];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns if this key is stored in the given map LANE or not - returns NULL if not found.
|
||||||
|
*
|
||||||
|
* Rem.: The lane_begin and lane_next_begin parameters are used for reentrant multisearch.
|
||||||
|
*
|
||||||
|
* @param map_lane The lane to find in.
|
||||||
|
* @param key The key to search for.
|
||||||
|
* @param lane_modulo When non-zero, the lane only searched until this index. Zero means all remaining elements. (mod lane length)
|
||||||
|
* @param lane_begin The lane is searched from this location(find_all). If full lane / lane prefix is needed, this should be 0.
|
||||||
|
* @param lane_next_begin This pointer will be filled on non-NULL retvals with the incremented in-lane index (with % modulus).
|
||||||
|
* @returns NULL when not found, otherwise pointer to the stored value for the key.
|
||||||
|
*/
|
||||||
|
static inline SML_ALWAYS_INLINE uint32_t *simd_map_lane_find(
|
||||||
|
simd_map_lane *map_lane,
|
||||||
|
uint32_t key,
|
||||||
|
int lane_modulo,
|
||||||
|
int lane_begin,
|
||||||
|
int *lane_next_begin) {
|
||||||
|
|
||||||
|
uint32_t *keys = map_lane->keys;
|
||||||
|
uint32_t *values = map_lane->values;
|
||||||
|
|
||||||
|
/* Hopefully can get optimized out for the common case bc inlining */
|
||||||
|
if(lane_modulo == 0) {
|
||||||
|
#ifdef __AVX2__
|
||||||
|
/* Prepare an AVX 256bit search register: 8 uint32_t */
|
||||||
|
__m256i sreg = _mm256_set1_epi32(key);
|
||||||
|
/* The tipp register: 8 uint32_t */
|
||||||
|
__m256i treg = _mm256_load_si256((__m256i *) keys);
|
||||||
|
/* Check equality and return proper tip address for first found */
|
||||||
|
__m256i m = _mm256_cmpeq_epi32(sreg, treg); /* Needs AVX2 */
|
||||||
|
/* The 's' means "single" (float precision), and mask will have [0..7] bits set! */
|
||||||
|
uint32_t mask = (uint32_t) _mm256_movemask_ps((__m256) m);
|
||||||
|
if(SML_UNLIKELY(mask != 0)) {
|
||||||
|
/* 00000000 00000000 00000000 01000100 -> 6 */
|
||||||
|
int i = (31 - __builtin_clz(mask));
|
||||||
|
uint32_t *ptr = &values[i];
|
||||||
|
if(SML_LIKELY(lane_begin == 0)) {
|
||||||
|
/* Fast-path: Only one match per lane OR first matching in lane for this find/find_all */
|
||||||
|
*lane_next_begin = (i + 1) % SML_LANE_SPAN;
|
||||||
|
return ptr;
|
||||||
|
} else {
|
||||||
|
/* We did a find_all(..) AND there is more than one match in the lane
|
||||||
|
* and its not first find_all(..) on the lane in question...
|
||||||
|
*
|
||||||
|
* This might be suboptimal, but not so bad:
|
||||||
|
*
|
||||||
|
* - This at this point will search the smaller array scalar-way
|
||||||
|
* - Which sounds good - BUT!
|
||||||
|
* - This means all lanes with more than 1 data are scalar search
|
||||||
|
* - And not only that, but first simd-searched, later scalar...
|
||||||
|
*
|
||||||
|
* I guess its fine as it should happen statistically rarely anyways
|
||||||
|
*/
|
||||||
|
goto non_simd_modulo;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
#endif
|
||||||
|
#ifdef __SSE2__
|
||||||
|
#ifndef __AVX2__
|
||||||
|
#ifndef __AVX512__
|
||||||
|
/* Prepare an SSE2 128bit search register: 4 uint32_t */
|
||||||
|
__m128i sreg = _mm_set1_epi32(key);
|
||||||
|
/* TODO: Implement */
|
||||||
|
#endif /* AVX512 */
|
||||||
|
#endif /* AVX2 */
|
||||||
|
#endif /* SSE2 */
|
||||||
|
/* Regular integer code - should have good ILP and cache locality patterns anyways */
|
||||||
|
/** Pretty hopeful this can get more easily unrolled / autovectorized */
|
||||||
|
for(int i = lane_begin; i < SML_LANE_SPAN; ++i) {
|
||||||
|
if(SML_UNLIKELY(keys[i] == key)) {
|
||||||
|
uint32_t *ptr = &values[i];
|
||||||
|
*lane_next_begin = (i + 1) % SML_LANE_SPAN;
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
} else {
|
||||||
|
non_simd_modulo:
|
||||||
|
for(int i = lane_begin; i < lane_modulo; ++i) {
|
||||||
|
if(SML_UNLIKELY(keys[i] == key)) {
|
||||||
|
uint32_t *ptr = &values[i];
|
||||||
|
*lane_next_begin = (i + 1) % SML_LANE_SPAN;
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the key location for a given value location */
|
||||||
|
static inline SM_ALWAYS_INLINE uint32_t *simd_map_lane_key_location(uint32_t *value_location) {
|
||||||
|
return value_location -= SML_LANE_SPAN;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* SIMD_MAP_LANE_H */
|
Loading…
x
Reference in New Issue
Block a user