From 64a7d871c2f8dab1012994f289145fb49bc18928 Mon Sep 17 00:00:00 2001
From: Richard Thier
Date: Mon, 21 Oct 2024 17:31:58 +0200
Subject: [PATCH] AVX2 implementation seems to work and is (as expected)
 faster than the regular version

---
 main.cpp |   2 +-
 makefile |   4 ++-
 simap.h  | 102 +++++++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 88 insertions(+), 20 deletions(-)

diff --git a/main.cpp b/main.cpp
index 8cce812..ae68fe1 100644
--- a/main.cpp
+++ b/main.cpp
@@ -135,7 +135,7 @@ int main() {
 	test_basics(unomap, &umi);
 
 	/* Performance tests */
-	int i = 10000;
+	int i = 100;
 	keystore(i, true);
 	datastore(i, true);
 
diff --git a/makefile b/makefile
index 7c6691d..a60cf41 100644
--- a/makefile
+++ b/makefile
@@ -2,7 +2,9 @@ debug:
 	g++ main.cpp -g -Wall -o main
 release:
 	g++ main.cpp -O2 -Wall -o main
+debug-avx2:
+	g++ main.cpp -g -mavx2 -Wall -o main
 release-avx2:
-	g++ main.cpp -fopt-info-vec-missed -mavx2 -O3 -Wall -o main
+	g++ main.cpp -mavx2 -O3 -Wall -o main
 release-avx2-asm:
 	g++ main.cpp -S -fopt-info-vec-missed -masm=intel -mavx2 -O3 -Wall -o main
diff --git a/simap.h b/simap.h
index 6d82168..25ddc53 100644
--- a/simap.h
+++ b/simap.h
@@ -7,16 +7,16 @@
 #include "amap.h"
 #include "arena.h/arena.h"
 
-/* Possible optimizations, but they mean there can be lookup / insert errors (very rarely)
-*/
-#define SIMAP_AVX2_RAW
+/* Possible (non-AVX, but similar) optimization; enabling it means there can be (very rare) lookup / insert errors
 #define SIMAP_RAW
+*/
 
 /* Perf trickery */
 
-/* This unifies the ifdefs but separates code paths when needed */
-#ifdef SIMAP_AVX2_RAW
+/* XXX: Enabling AVX also enables rare errors for speed gain! See above. */
+#ifdef __AVX2__
 #define SIMAP_RAW
+#include <immintrin.h>
 #endif
 
 /* I have no idea what MSVC has instead... */
@@ -122,10 +122,78 @@ static inline simap_instance simap_create() {
 
 static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void *ptr);
 
-// TODO: We can possibly hand-optimise this with intrinsics maybe - but I hope autovectorization (does not seem to happen???)
+/** Gets the number of padding bytes needed to make size divisible by alignment */
+static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) {
+	/* This would round the size itself up to a multiple of the alignment: */
+	/* return (size + alignment - 1) / alignment * alignment; */
+	/* The return below is basically the same as: */
+	/* return (alignment - (size % alignment)) % alignment; */
+
+	/* Subtracting the size from the rounded-up value leaves just the padding */
+	return ((size + alignment - 1) / alignment) * alignment - size;
+}
+
+/** Gets the padded address - or the same address if it is already divisible by the alignment */
+static inline void *get_padded(void *ptr, int alignment) {
+	// return (alignment - (ptr % alignment)) % alignment;
+	return (void*)((ptrdiff_t)((uint8_t *)ptr + alignment - 1) / alignment * alignment);
+}
+
 static inline SM_ALWAYS_INLINE auint64 *make_tipp(auint64 *base, auint64 *tip, auint64 prefix, auint64 *end) {
-#ifdef SIMAP_AVX2_RAW
-	/* TODO: Implement */
+#ifdef __AVX2__
+	/* See:
+	 *
+	 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX
+	 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX&text=movemask
+	 * https://chryswoods.com/vector_c++/part2.html
+	 * https://blog.triplez.cn/posts/avx-avx2-learning-notes/
+	 * https://github.com/Triple-Z/AVX-AVX2-Example-Code
+	 * https://en.algorithmica.org/hpc/simd/masking/
+	 * https://stackoverflow.com/questions/31089502/aligned-and-unaligned-memory-access-with-avx-avx2-intrinsics
+	 */
+
+	/* Step over the previous tip and search linearly until the tip reaches AVX2 alignment */
+	/* AVX registers are 256 bits = 32 bytes, so the address must be a multiple of 32 here */
+	/* TODO: "32" could probably be increased so that small arrays stay on the non-AVX path! */
+	auint64 *neotip = (auint64 *) get_padded(++tip, 32);
+	while((tip < neotip) && (*tip != prefix)) ++tip;
+	if(tip < neotip) return tip;
+
+	/* Prepare an AVX 256-bit search register: the prefix broadcast to 4 uint64_t lanes */
+	__m256i sreg = _mm256_set1_epi64x(prefix);
+
+	while(tip < end) {
+		/* This aligned load needs the 32-byte alignment we ensured above */
+		/* The tip register: 4 uint64_t lanes */
+		__m256i treg = _mm256_load_si256((__m256i *) tip);
+		/* Compare all four lanes at once; movemask then packs the top
+		 * bit of each 64-bit lane into the low 4 bits of the result */
+		__m256i m = _mm256_cmpeq_epi64(sreg, treg); /* Needs AVX2 */
+		uint32_t mask = (uint32_t) _mm256_movemask_pd((__m256d) m);
+
+		/* Try the next tip - we process 256 bits per loop */
+		tip += 4; /* 4x64 bit */
+
+		/* The mask has at most 4 bits set: bit i is set when lane i
+		 * of the AVX register compared equal to the prefix, and
+		 * lane 0 is the lowest address. The first match in memory
+		 * order is therefore the lowest set bit, which
+		 * __builtin_ctz(mask) (count trailing zeroes) gives us.
+		 * (31 - __builtin_clz(mask) would pick the highest set bit
+		 * instead - that is the last match within this block, not
+		 * the first, and would diverge from the scalar path.)
+		 *
+		 * If the mask is all zero, nothing was found in this block;
+		 * otherwise we return the tip + offset we calculated. */
+		if(SM_UNLIKELY(mask != 0)) {
+			int offset = __builtin_ctz(mask);
+			/* -4 because this is the unlikely scenario and we already incremented! */
+			return tip - 4 + offset;
+		}
+	}
+	/* Not-found case */
+	return tip;
 #endif
 #ifdef SIMAP_RAW
 #pragma GCC unroll 16
@@ -150,7 +218,11 @@ static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char
 
 	simap_c64 prefix {0};
 	size_t prefixlen = is_smallkey ? keylen : 8;
+	/* Silence -Wstringop-truncation: prefix.str8 is intentionally not NUL-terminated when the key fills all 8 bytes */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-truncation"
 	strncpy(prefix.str8, key, prefixlen);
+#pragma GCC diagnostic pop
 
 	/* Construct keyremains - might point to the \0 terminator only if smallkey or 8 bytes exactly */
 	const char *keyremains = key + prefixlen;
@@ -204,16 +276,6 @@ static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char
 	return NULL;
 }
 
-/** Gets padding bytes for a size to be padded to divisible alignment */
-static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) {
-	/* Would ensure returned value divisible by alignment */
-	/* return (size + alignment - 1) / alignment * alignment; */
-	/* same: return (alignment - (size % alignment)) % alignment; */
-
-	/* Substracting size leads to padding */
-	return ((size + alignment - 1) / alignment) * alignment - size;
-}
-
 /** Returns the size of the storage needed for the given key */
 static inline uint32_t simap_elem_storage_size(const char *key) {
 	uint32_t keysize = strlen(key);
@@ -248,7 +310,11 @@ static inline void *simap_force_add(simap_instance *map, const char *key, void *
 	/* Create first 8 char encoding (this ensures endianness and all such stuff) */
 	simap_c64 first8 {0};
 	uint32_t keylen = strlen(key);
+	/* Silence -Wstringop-truncation here too: first8.str8 is intentionally not NUL-terminated for keys of 8+ bytes */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-truncation"
 	strncpy(first8.str8, key, (keylen < 8) ? keylen : 8);
+#pragma GCC diagnostic pop
 	uint32_t usi = map->usage_end;
 	uint32_t previ = map->prev_usage_end;
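
The padding helpers get_size_padding() and get_padded() both use the same
round-up arithmetic: add alignment - 1, integer-divide by the alignment,
multiply back. A minimal standalone sketch of those formulas (padding_of
and pad_ptr are illustrative names, not from the patch; pad_ptr goes
through uintptr_t where get_padded() casts through ptrdiff_t):

#include <cassert>
#include <cstdint>

/* Same formula as get_size_padding(): round size up to the next
 * multiple of alignment, then subtract size to keep only the pad. */
static unsigned int padding_of(unsigned int size, unsigned int alignment) {
	return ((size + alignment - 1) / alignment) * alignment - size;
}

/* Same idea as get_padded(): round an address up to a multiple of alignment. */
static void *pad_ptr(void *ptr, uintptr_t alignment) {
	return (void *)(((uintptr_t)ptr + alignment - 1) / alignment * alignment);
}

int main() {
	assert(padding_of(13, 8) == 3);   /* 13 rounds up to 16, leaving 3 pad bytes */
	assert(padding_of(16, 8) == 0);   /* already aligned */
	assert(padding_of(1, 32) == 31);  /* 1 rounds up to 32 */
	assert((uintptr_t)pad_ptr((void *)13, 8) == 16);
	return 0;
}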
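The fast path of make_tipp() is the classic broadcast/compare/movemask SIMD
search. Here is a minimal sketch of the same idiom, assuming - as the map's
storage layout provides via the padding helpers - a 32-byte-aligned buffer
whose length is a multiple of four uint64_t (find64_avx2 is an illustrative
name; build with g++ -mavx2):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

/* Returns a pointer to the first element equal to needle, or end.
 * tip must be 32-byte aligned and (end - tip) a multiple of 4. */
static const uint64_t *find64_avx2(const uint64_t *tip, const uint64_t *end, uint64_t needle) {
	__m256i sreg = _mm256_set1_epi64x((long long) needle); /* broadcast needle into 4 lanes */
	while(tip < end) {
		__m256i treg = _mm256_load_si256((const __m256i *) tip); /* aligned 32-byte load */
		__m256i eq = _mm256_cmpeq_epi64(sreg, treg);             /* lane-wise 64-bit compare */
		uint32_t mask = (uint32_t) _mm256_movemask_pd(_mm256_castsi256_pd(eq));
		if(mask != 0)
			return tip + __builtin_ctz(mask); /* lowest set bit = first match in memory order */
		tip += 4;
	}
	return end;
}

int main() {
	alignas(32) uint64_t data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
	std::printf("%td\n", find64_avx2(data, data + 8, 6) - data); /* prints 5 */
	return 0;
}

The movemask/ctz pair is what turns the vector comparison back into an
index: each of the four mask bits answers "did lane i match?", and counting
trailing zeroes picks the first matching lane.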
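The #pragma GCC diagnostic push/ignored/pop pattern used around both
strncpy() calls scopes the warning suppression to the one call that
truncates on purpose, instead of disabling -Wstringop-truncation for the
whole translation unit. A minimal sketch of the pattern (c64 and
fill_prefix are illustrative stand-ins for the patch's simap_c64 code):

#include <cstring>

struct c64 { char str8[8]; };

/* strncpy() copies at most 8 bytes and deliberately leaves str8 without
 * a NUL terminator for keys of 8 bytes or more; GCC's
 * -Wstringop-truncation warns about exactly that, so the warning is
 * silenced just around the call. */
static void fill_prefix(c64 &out, const char *key, std::size_t keylen) {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-truncation"
	std::strncpy(out.str8, key, keylen < 8 ? keylen : 8);
#pragma GCC diagnostic pop
}

With the makefile additions, the AVX2 paths can be exercised via
make debug-avx2 or make release-avx2, while plain make release builds
without -mavx2 and therefore keeps the scalar SIMAP_RAW code path.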