The AVX2 implementation seems to work and is (as expected) faster than the regular one.

This commit is contained in:
Richard Thier 2024-10-21 17:31:58 +02:00
parent 4e4c266632
commit 64a7d871c2
3 changed files with 88 additions and 20 deletions

View File

@ -135,7 +135,7 @@ int main() {
test_basics(unomap, &umi);
/* Performance tests */
int i = 10000;
int i = 100;
keystore(i, true);
datastore(i, true);

View File

@ -2,7 +2,9 @@ debug:
g++ main.cpp -g -Wall -o main
release:
g++ main.cpp -O2 -Wall -o main
debug-avx2:
g++ main.cpp -g -mavx2 -Wall -o main
release-avx2:
g++ main.cpp -fopt-info-vec-missed -mavx2 -O3 -Wall -o main
g++ main.cpp -mavx2 -O3 -Wall -o main
release-avx2-asm:
g++ main.cpp -S -fopt-info-vec-missed -masm=intel -mavx2 -O3 -Wall -o main

102
simap.h
View File

@ -7,16 +7,16 @@
#include "amap.h"
#include "arena.h/arena.h"
/* Possible optimizations, but they mean there can be lookup / insert errors (very rarely)
*/
#define SIMAP_AVX2_RAW
/* Possible (non-AVX, but alike) optimization, but means there can be lookup / insert errors (very rarely)
#define SIMAP_RAW
*/
/* Perf trickery */
/* This unifies the ifdefs but separates code paths when needed */
#ifdef SIMAP_AVX2_RAW
/* XXX: Enabling AVX also enables rare errors for speed gain! See above. */
#ifdef __AVX2__
#define SIMAP_RAW
#include <immintrin.h>
#endif
/* I have no idea what MSVC has instead... */
@ -122,10 +122,78 @@ static inline simap_instance simap_create() {
static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void *ptr);
// TODO: We could possibly hand-optimise this with intrinsics - autovectorization does not seem to happen here.
/**
 * Gets padding bytes for a size to be padded to divisible alignment.
 *
 * @param size      The unpadded size in bytes.
 * @param alignment The alignment to pad up to (must be non-zero).
 * @return The number of extra bytes so that (size + padding) is divisible by alignment.
 */
static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) {
    /* Distance from size up to the next multiple of alignment; 0 when already aligned.
     * Equivalent to ((size + alignment - 1) / alignment) * alignment - size. */
    unsigned int remainder = size % alignment;
    return (alignment - remainder) % alignment;
}
/**
 * Gets padded address - or same address if divisible by alignment.
 *
 * @param ptr       The address to pad upwards.
 * @param alignment The alignment to pad up to (must be positive).
 * @return The first address >= ptr that is divisible by alignment.
 */
static inline void *get_padded(void *ptr, int alignment) {
    /* uintptr_t, not ptrdiff_t: it is the integer type meant to hold a
     * pointer value round-trip; ptrdiff_t is signed and only defined for
     * differences of pointers, so the old cast was non-portable. */
    uintptr_t p = (uintptr_t)ptr;
    uintptr_t a = (uintptr_t)alignment;
    /* Round up to the next multiple of alignment (no-op when aligned). */
    return (void *)((p + a - 1) / a * a);
}
static inline SM_ALWAYS_INLINE auint64 *make_tipp(auint64 *base, auint64 *tip, auint64 prefix, auint64 *end) {
#ifdef SIMAP_AVX2_RAW
/* TODO: Implement */
#ifdef __AVX2__
/* See:
*
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX&text=movemask
* https://chryswoods.com/vector_c++/part2.html
* https://blog.triplez.cn/posts/avx-avx2-learning-notes/
* https://github.com/Triple-Z/AVX-AVX2-Example-Code
* https://en.algorithmica.org/hpc/simd/masking/
* https://stackoverflow.com/questions/31089502/aligned-and-unaligned-memory-access-with-avx-avx2-intrinsics
*/
/* Step over previous tipp and search until the AVX2 alignment needs */
/* AVXs 256 bit = 32 byte so multiple of 32 needed here */
/* TODO: Probably I can change "32" here into something bigger to non-avx small arrays! */
auint64 *neotip = (auint64 *) get_padded(++tip, 32);
while((tip < neotip) && (*tip != prefix)) ++tip;
if(tip < neotip) return tip;
/* Prepare an AVX 256bit search register: 4 uint64_t */
__m256i sreg = _mm256_set1_epi64x(prefix);
while(tip < end) {
/* This needs 32 byte alignment here that we do above */
/* The tipp register: 4 uint64_t */
__m256i treg = _mm256_load_si256((__m256i *) tip);
/* Check equality and return proper tip address for first found */
__m256i m = _mm256_cmpeq_epi64(sreg, treg); /* Needs AVX2 */
uint32_t mask = (uint32_t) _mm256_movemask_pd((__m256d) m);
/* Try next tip, processes 256 bits per loop */
tip += 4; /* 4x64 bit */
/* One of the links used __builtin_ctz(mask), but I
* think it was bad implementation and only finds the
* last search result!
*
* __builtin_clz returns leading zeroes of the mask
* and the mask has 4 bits at most and each show if
* 1..4 places of AVX reg compared properly to the
* given prefix value (4x 64 bit comparizons happen).
* Subtracting from 31 we subtract either 28,29,30,31
* and thus resulting in 3, 2, 1, 0 (right offsets).
*
* If the mask got all zero, there is nothing found,
* otherwise its the tipp + offset we calculated. */
if(SM_UNLIKELY(mask != 0)) {
int offset = (31 - __builtin_clz(mask));
/* -4 because this is the unlikely scenario and we already incremented! */
return tip - 4 + offset;
}
}
/* Not found case */
return tip;
#endif
#ifdef SIMAP_RAW
#pragma GCC unroll 16
@ -150,7 +218,11 @@ static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char
simap_c64 prefix {0};
size_t prefixlen = is_smallkey ? keylen : 8;
/* Ignore warning because we know what we are doing here... */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-truncation"
strncpy(prefix.str8, key, prefixlen);
#pragma GCC diagnostic pop
/* Construct keyremains - might point to the \0 terminator only if smallkey or 8 bytes exactly */
const char *keyremains = key + prefixlen;
@ -204,16 +276,6 @@ static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char
return NULL;
}
/** Gets padding bytes for a size to be padded to divisible alignment.
 *
 * NOTE(review): duplicate of the earlier get_size_padding definition;
 * this is the copy being removed by this diff hunk.
 *
 * size: the unpadded size in bytes; alignment: must be non-zero.
 * Returns the byte count so (size + padding) is divisible by alignment. */
static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) {
/* Would ensure returned value divisible by alignment */
/* return (size + alignment - 1) / alignment * alignment; */
/* same: return (alignment - (size % alignment)) % alignment; */
/* Subtracting size leads to the padding amount (0 when already aligned) */
return ((size + alignment - 1) / alignment) * alignment - size;
}
/** Returns the size of the storage needed for the given key */
static inline uint32_t simap_elem_storage_size(const char *key) {
uint32_t keysize = strlen(key);
@ -248,7 +310,11 @@ static inline void *simap_force_add(simap_instance *map, const char *key, void *
/* Create first 8 char encoding (this ensures endianness and all such stuff) */
simap_c64 first8 {0};
uint32_t keylen = strlen(key);
/* Ignore warning because we know what we are doing here... */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-truncation"
strncpy(first8.str8, key, (keylen < 8) ? keylen : 8);
#pragma GCC diagnostic pop
uint32_t usi = map->usage_end;
uint32_t previ = map->prev_usage_end;