AVX2 implementation seems to work and is (as expected) faster than regular
This commit is contained in:
parent
4e4c266632
commit
64a7d871c2
2
main.cpp
2
main.cpp
@ -135,7 +135,7 @@ int main() {
|
|||||||
test_basics(unomap, &umi);
|
test_basics(unomap, &umi);
|
||||||
|
|
||||||
/* Performance tests */
|
/* Performance tests */
|
||||||
int i = 10000;
|
int i = 100;
|
||||||
keystore(i, true);
|
keystore(i, true);
|
||||||
datastore(i, true);
|
datastore(i, true);
|
||||||
|
|
||||||
|
4
makefile
4
makefile
@ -2,7 +2,9 @@ debug:
|
|||||||
g++ main.cpp -g -Wall -o main
|
g++ main.cpp -g -Wall -o main
|
||||||
release:
|
release:
|
||||||
g++ main.cpp -O2 -Wall -o main
|
g++ main.cpp -O2 -Wall -o main
|
||||||
|
debug-avx2:
|
||||||
|
g++ main.cpp -g -mavx2 -Wall -o main
|
||||||
release-avx2:
|
release-avx2:
|
||||||
g++ main.cpp -fopt-info-vec-missed -mavx2 -O3 -Wall -o main
|
g++ main.cpp -mavx2 -O3 -Wall -o main
|
||||||
release-avx2-asm:
|
release-avx2-asm:
|
||||||
g++ main.cpp -S -fopt-info-vec-missed -masm=intel -mavx2 -O3 -Wall -o main
|
g++ main.cpp -S -fopt-info-vec-missed -masm=intel -mavx2 -O3 -Wall -o main
|
||||||
|
102
simap.h
102
simap.h
@ -7,16 +7,16 @@
|
|||||||
#include "amap.h"
|
#include "amap.h"
|
||||||
#include "arena.h/arena.h"
|
#include "arena.h/arena.h"
|
||||||
|
|
||||||
/* Possible optimizations, but they mean there can be lookup / insert errors (very rarely)
|
/* Possible (non-AVX, but similar) optimization; enabling it means there can (very rarely) be lookup / insert errors
|
||||||
*/
|
|
||||||
#define SIMAP_AVX2_RAW
|
|
||||||
#define SIMAP_RAW
|
#define SIMAP_RAW
|
||||||
|
*/
|
||||||
|
|
||||||
/* Perf trickery */
|
/* Perf trickery */
|
||||||
|
|
||||||
/* This unifies the ifdefs but separates code paths when needed */
|
/* XXX: Enabling AVX also enables rare errors for speed gain! See above. */
|
||||||
#ifdef SIMAP_AVX2_RAW
|
#ifdef __AVX2__
|
||||||
#define SIMAP_RAW
|
#define SIMAP_RAW
|
||||||
|
#include <immintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* I have no idea what MSVC has instead... */
|
/* I have no idea what MSVC has instead... */
|
||||||
@ -122,10 +122,78 @@ static inline simap_instance simap_create() {
|
|||||||
|
|
||||||
static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void *ptr);
|
static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void *ptr);
|
||||||
|
|
||||||
// TODO: We can possibly hand-optimise this with intrinsics maybe - but I hope autovectorization (does not seem to happen???)
|
/** Gets padding bytes for a size to be padded to divisible alignment */
|
||||||
|
/**
 * Gets the number of padding bytes needed to round `size` up to a
 * multiple of `alignment`.
 *
 * @param size      The unpadded size in bytes.
 * @param alignment The required divisor; 0 is treated as "no alignment".
 * @return 0 when `size` is already divisible by `alignment`, otherwise
 *         the count of bytes (1 .. alignment - 1) to add to reach the
 *         next multiple.
 */
static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) {
	/* Guard the modulo below against division by zero */
	if (alignment == 0) {
		return 0;
	}

	/* Work on the remainder instead of "size + alignment - 1": the sum can
	 * wrap around for sizes close to UINT_MAX and then yields a wrong
	 * padding for alignments that are not a power of two. */
	unsigned int remainder = size % alignment;
	if (remainder == 0) {
		return 0;
	}
	return alignment - remainder;
}
|
||||||
|
|
||||||
|
/** Gets padded address - or same address if divisible by alignment */
|
||||||
|
/**
 * Gets the padded address: the first address at or after `ptr` that is
 * divisible by `alignment`.
 *
 * @param ptr       The address to round up.
 * @param alignment The required divisor in bytes; values below 1 leave
 *                  the address untouched.
 * @return `ptr` itself when it is already divisible by `alignment`,
 *         otherwise the next higher address that is.
 */
static inline void *get_padded(void *ptr, int alignment) {
	/* Nothing sensible to align to (and avoids a division by zero) */
	if (alignment <= 0) {
		return ptr;
	}

	/* uintptr_t is the integer type meant for pointer round-trips. A signed
	 * type would make the division truncate toward zero for addresses in
	 * the upper half of the address space and produce a wrong result. */
	uintptr_t address = (uintptr_t)ptr;
	uintptr_t remainder = address % (uintptr_t)alignment;
	if (remainder == 0) {
		return ptr;
	}
	return (void *)(address + ((uintptr_t)alignment - remainder));
}
|
||||||
|
|
||||||
static inline SM_ALWAYS_INLINE auint64 *make_tipp(auint64 *base, auint64 *tip, auint64 prefix, auint64 *end) {
|
static inline SM_ALWAYS_INLINE auint64 *make_tipp(auint64 *base, auint64 *tip, auint64 prefix, auint64 *end) {
|
||||||
#ifdef SIMAP_AVX2_RAW
|
#ifdef __AVX2__
|
||||||
/* TODO: Implement */
|
/* See:
|
||||||
|
*
|
||||||
|
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX
|
||||||
|
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX&text=movemask
|
||||||
|
* https://chryswoods.com/vector_c++/part2.html
|
||||||
|
* https://blog.triplez.cn/posts/avx-avx2-learning-notes/
|
||||||
|
* https://github.com/Triple-Z/AVX-AVX2-Example-Code
|
||||||
|
* https://en.algorithmica.org/hpc/simd/masking/
|
||||||
|
* https://stackoverflow.com/questions/31089502/aligned-and-unaligned-memory-access-with-avx-avx2-intrinsics
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Step over previous tipp and search until the AVX2 alignment needs */
|
||||||
|
/* AVXs 256 bit = 32 byte so multiple of 32 needed here */
|
||||||
|
/* TODO: Probably I can change "32" here into something bigger to non-avx small arrays! */
|
||||||
|
auint64 *neotip = (auint64 *) get_padded(++tip, 32);
|
||||||
|
while((tip < neotip) && (*tip != prefix)) ++tip;
|
||||||
|
if(tip < neotip) return tip;
|
||||||
|
|
||||||
|
/* Prepare an AVX 256bit search register: 4 uint64_t */
|
||||||
|
__m256i sreg = _mm256_set1_epi64x(prefix);
|
||||||
|
|
||||||
|
while(tip < end) {
|
||||||
|
/* This needs 32 byte alignment here that we do above */
|
||||||
|
/* The tipp register: 4 uint64_t */
|
||||||
|
__m256i treg = _mm256_load_si256((__m256i *) tip);
|
||||||
|
/* Check equality and return proper tip address for first found */
|
||||||
|
__m256i m = _mm256_cmpeq_epi64(sreg, treg); /* Needs AVX2 */
|
||||||
|
uint32_t mask = (uint32_t) _mm256_movemask_pd((__m256d) m);
|
||||||
|
|
||||||
|
/* Try next tip, processes 256 bits per loop */
|
||||||
|
tip += 4; /* 4x64 bit */
|
||||||
|
|
||||||
|
/* One of the links used __builtin_ctz(mask), but I
|
||||||
|
* think it was bad implementation and only finds the
|
||||||
|
* last search result!
|
||||||
|
*
|
||||||
|
* __builtin_clz returns leading zeroes of the mask
|
||||||
|
* and the mask has 4 bits at most and each show if
|
||||||
|
* 1..4 places of AVX reg compared properly to the
|
||||||
|
* given prefix value (4x 64 bit comparisons happen).
|
||||||
|
* Subtracting from 31 we subtract either 28,29,30,31
|
||||||
|
* and thus resulting in 3, 2, 1, 0 (right offsets).
|
||||||
|
*
|
||||||
|
* If the mask got all zero, there is nothing found,
|
||||||
|
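* otherwise it's the tipp + offset we calculated.
*
* NOTE(review): _mm256_movemask_pd puts lane 0 (the lowest address) into
* bit 0 of the mask, so 31 - __builtin_clz(mask) selects the HIGHEST
* matching lane (the last match within these 4 values), whereas
* __builtin_ctz(mask) would select the first one - confirm which of
* the two is intended here. */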
|
||||||
|
if(SM_UNLIKELY(mask != 0)) {
|
||||||
|
int offset = (31 - __builtin_clz(mask));
|
||||||
|
/* -4 because this is the unlikely scenario and we already incremented! */
|
||||||
|
return tip - 4 + offset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* Not found case */
|
||||||
|
return tip;
|
||||||
#endif
|
#endif
|
||||||
#ifdef SIMAP_RAW
|
#ifdef SIMAP_RAW
|
||||||
#pragma GCC unroll 16
|
#pragma GCC unroll 16
|
||||||
@ -150,7 +218,11 @@ static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char
|
|||||||
|
|
||||||
simap_c64 prefix {0};
|
simap_c64 prefix {0};
|
||||||
size_t prefixlen = is_smallkey ? keylen : 8;
|
size_t prefixlen = is_smallkey ? keylen : 8;
|
||||||
|
/* Ignore warning because we know what we are doing here... */
|
||||||
|
#pragma GCC diagnostic push
|
||||||
|
#pragma GCC diagnostic ignored "-Wstringop-truncation"
|
||||||
strncpy(prefix.str8, key, prefixlen);
|
strncpy(prefix.str8, key, prefixlen);
|
||||||
|
#pragma GCC diagnostic pop
|
||||||
|
|
||||||
/* Construct keyremains - might point to the \0 terminator only if smallkey or 8 bytes exactly */
|
/* Construct keyremains - might point to the \0 terminator only if smallkey or 8 bytes exactly */
|
||||||
const char *keyremains = key + prefixlen;
|
const char *keyremains = key + prefixlen;
|
||||||
@ -204,16 +276,6 @@ static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Gets padding bytes for a size to be padded to divisible alignment */
|
|
||||||
/**
 * Gets the number of padding bytes needed to round `size` up to a
 * multiple of `alignment`.
 *
 * @param size      The unpadded size in bytes.
 * @param alignment The required divisor; 0 is treated as "no alignment".
 * @return 0 when `size` is already divisible by `alignment`, otherwise
 *         the count of bytes (1 .. alignment - 1) to add to reach the
 *         next multiple.
 */
static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) {
	/* Guard the modulo below against division by zero */
	if (alignment == 0) {
		return 0;
	}

	/* Work on the remainder instead of "size + alignment - 1": the sum can
	 * wrap around for sizes close to UINT_MAX and then yields a wrong
	 * padding for alignments that are not a power of two. */
	unsigned int remainder = size % alignment;
	if (remainder == 0) {
		return 0;
	}
	return alignment - remainder;
}
|
|
||||||
|
|
||||||
/** Returns the size of the storage needed for the given key */
|
/** Returns the size of the storage needed for the given key */
|
||||||
static inline uint32_t simap_elem_storage_size(const char *key) {
|
static inline uint32_t simap_elem_storage_size(const char *key) {
|
||||||
uint32_t keysize = strlen(key);
|
uint32_t keysize = strlen(key);
|
||||||
@ -248,7 +310,11 @@ static inline void *simap_force_add(simap_instance *map, const char *key, void *
|
|||||||
/* Create first 8 char encoding (this ensures endianness and all such stuff) */
|
/* Create first 8 char encoding (this ensures endianness and all such stuff) */
|
||||||
simap_c64 first8 {0};
|
simap_c64 first8 {0};
|
||||||
uint32_t keylen = strlen(key);
|
uint32_t keylen = strlen(key);
|
||||||
|
/* Ignore warning because we know what we are doing here... */
|
||||||
|
#pragma GCC diagnostic push
|
||||||
|
#pragma GCC diagnostic ignored "-Wstringop-truncation"
|
||||||
strncpy(first8.str8, key, (keylen < 8) ? keylen : 8);
|
strncpy(first8.str8, key, (keylen < 8) ? keylen : 8);
|
||||||
|
#pragma GCC diagnostic pop
|
||||||
|
|
||||||
uint32_t usi = map->usage_end;
|
uint32_t usi = map->usage_end;
|
||||||
uint32_t previ = map->prev_usage_end;
|
uint32_t previ = map->prev_usage_end;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user