AVX2 implementation seems to work and is (as expected) faster than regular

2024-10-21 17:31:58 +02:00 · 2024-10-21 17:31:58 +02:00 · 64a7d871c2
commit 64a7d871c2
parent 4e4c266632
3 changed files with 88 additions and 20 deletions
--- a/main.cpp
+++ b/main.cpp
@ -135,7 +135,7 @@ int main() {
 	test_basics(unomap, &umi);
 	/* Performance tests */
-	int i = 10000;
+	int i = 100;
 	keystore(i, true);
 	datastore(i, true);
--- a/4
+++ b/4
@ -2,7 +2,9 @@ debug:
 	g++ main.cpp -g -Wall -o main
 release:
 	g++ main.cpp -O2 -Wall -o main
 debug-avx2:
 	g++ main.cpp -g -mavx2 -Wall -o main
 release-avx2:
-	g++ main.cpp -fopt-info-vec-missed -mavx2 -O3 -Wall -o main
+	g++ main.cpp -mavx2 -O3 -Wall -o main
 release-avx2-asm:
 	g++ main.cpp -S -fopt-info-vec-missed -masm=intel -mavx2 -O3 -Wall -o main
--- a/simap.h
+++ b/simap.h
@ -7,16 +7,16 @@
 #include "amap.h"
 #include "arena.h/arena.h"
-/* Possible optimizations, but they mean there can be lookup / insert errors (very rarely)
+/* Possible (non-AVX, but alike) optimization, but means there can be lookup / insert errors (very rarely)
 */
 #define SIMAP_AVX2_RAW
 #define SIMAP_RAW
 */
 /* Perf trickery */
-/* This unifies the ifdefs but separates code paths when needed */
+/* XXX: Enabling AVX also enables rare errors for speed gain! See above. */
-#ifdef SIMAP_AVX2_RAW
+#ifdef __AVX2__
 #define SIMAP_RAW
 #include <immintrin.h>
 #endif
 /* I have no idea what MSVC has instead... */
@ -122,10 +122,78 @@ static inline simap_instance simap_create() {
 static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void *ptr);
-// TODO: We can possibly hand-optimise this with intrinsics maybe - but I hope autovectorization (does not seem to happen???)
+/** Gets padding bytes for a size to be padded to divisible alignment */
 static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) {
 	/* Would ensure returned value divisible by alignment */
 	/* return (size + alignment - 1) / alignment * alignment; */
 	/* Basically same as: */
 	/* return (alignment - (size % alignment)) % alignment; */
 	/* Substracting size leads to padding */
 	return ((size + alignment - 1) / alignment) * alignment - size;
 }
 /** Gets padded address - or same address if divisible by alignment */
 static inline void *get_padded(void *ptr, int alignment) {
 	// return (alignment - (ptr % alignment)) % alignment;
 	return (void*)((ptrdiff_t)((uint8_t *)ptr + alignment - 1) / alignment * alignment);
 }
 static inline SM_ALWAYS_INLINE auint64 *make_tipp(auint64 *base, auint64 *tip, auint64 prefix, auint64 *end) {
-#ifdef SIMAP_AVX2_RAW
+#ifdef __AVX2__
-	/* TODO: Implement */
+	/* See:
 	 *
 	 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX
 	 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX&text=movemask
 	 * https://chryswoods.com/vector_c++/part2.html
 	 * https://blog.triplez.cn/posts/avx-avx2-learning-notes/
 	 * https://github.com/Triple-Z/AVX-AVX2-Example-Code
 	 * https://en.algorithmica.org/hpc/simd/masking/
 	 * https://stackoverflow.com/questions/31089502/aligned-and-unaligned-memory-access-with-avx-avx2-intrinsics
 	 */
 	/* Step over previous tipp and search until the AVX2 alignment needs */
 	/* AVXs 256 bit = 32 byte so multiple of 32 needed here */
 	/* TODO: Probably I can change "32" here into something bigger to non-avx small arrays! */
 	auint64 *neotip = (auint64 *) get_padded(++tip, 32);
 	while((tip < neotip) && (*tip != prefix)) ++tip;
 	if(tip < neotip) return tip;
 	/* Prepare an AVX 256bit search register: 4 uint64_t */
 	__m256i sreg = _mm256_set1_epi64x(prefix);
 	while(tip < end) {
 		/* This needs 32 byte alignment here that we do above */
 		/* The tipp register: 4 uint64_t */
 		__m256i treg = _mm256_load_si256((__m256i *) tip);
 		/* Check equality and return proper tip address for first found */
 		__m256i m = _mm256_cmpeq_epi64(sreg, treg); /* Needs AVX2 */
 		uint32_t mask = (uint32_t) _mm256_movemask_pd((__m256d) m);
 		/* Try next tip, processes 256 bits per loop */
 		tip += 4; /* 4x64 bit */
 		/* One of the links used __builtin_ctz(mask), but I
 		 * think it was bad implementation and only finds the
 		 * last search result!
 		 *
 		 * __builtin_clz returns leading zeroes of the mask
 		 * and the mask has 4 bits at most and each show if
 		 * 1..4 places of AVX reg compared properly to the
 		 * given prefix value (4x 64 bit comparizons happen).
 		 * Subtracting from 31 we subtract either 28,29,30,31
 		 * and thus resulting in 3, 2, 1, 0 (right offsets).
 		 *
 		 * If the mask got all zero, there is nothing found,
 		 * otherwise its the tipp + offset we calculated. */
 		if(SM_UNLIKELY(mask != 0)) {
 			int offset = (31 - __builtin_clz(mask));
 			/* -4 because this is the unlikely scenario and we already incremented! */
 			return tip - 4 + offset;
 		}
 	}
 	/* Not found case */
 	return tip;
 #endif
 #ifdef SIMAP_RAW
 	#pragma GCC unroll 16
@ -150,7 +218,11 @@ static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char
 	simap_c64 prefix {0};
 	size_t prefixlen =  is_smallkey ? keylen : 8;
 	/* Ignore warning because we know what we are doing here... */
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wstringop-truncation"
 	strncpy(prefix.str8, key, prefixlen);
 #pragma GCC diagnostic pop
 	/* Construct keyremains - might point to the \0 terminator only if smallkey or 8 bytes exactly */
 	const char *keyremains = key + prefixlen;
@ -204,16 +276,6 @@ static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char
 	return NULL;
 }
 /** Gets padding bytes for a size to be padded to divisible alignment */
 static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) {
 	/* Would ensure returned value divisible by alignment */
 	/* return (size + alignment - 1) / alignment * alignment; */
 	/* same: return (alignment - (size % alignment)) % alignment; */
 	/* Substracting size leads to padding */
 	return ((size + alignment - 1) / alignment) * alignment - size;
 }
 /** Returns the size of the storage needed for the given key */
 static inline uint32_t simap_elem_storage_size(const char *key) {
 	uint32_t keysize = strlen(key);
@ -248,7 +310,11 @@ static inline void *simap_force_add(simap_instance *map, const char *key, void *
 	/* Create first 8 char encoding (this ensures endianness and all such stuff) */
 	simap_c64 first8 {0};
 	uint32_t keylen = strlen(key);
 	/* Ignore warning because we know what we are doing here... */
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wstringop-truncation"
 	strncpy(first8.str8, key, (keylen < 8) ? keylen : 8);
 #pragma GCC diagnostic pop
 	uint32_t usi = map->usage_end;
 	uint32_t previ = map->prev_usage_end;