From 64a7d871c2f8dab1012994f289145fb49bc18928 Mon Sep 17 00:00:00 2001
From: Richard Thier
Date: Mon, 21 Oct 2024 17:31:58 +0200
Subject: [PATCH] AVX2 implementation seems to work and is (as expected)
 faster than the regular version

---
 main.cpp |   2 +-
 makefile |   4 ++-
 simap.h  | 102 +++++++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 88 insertions(+), 20 deletions(-)

diff --git a/main.cpp b/main.cpp
index 8cce812..ae68fe1 100644
--- a/main.cpp
+++ b/main.cpp
@@ -135,7 +135,7 @@ int main() {
 	test_basics(unomap, &umi);
 
 	/* Performance tests */
-	int i = 10000;
+	int i = 100;
 	keystore(i, true);
 	datastore(i, true);
 
diff --git a/makefile b/makefile
index 7c6691d..a60cf41 100644
--- a/makefile
+++ b/makefile
@@ -2,7 +2,9 @@ debug:
 	g++ main.cpp -g -Wall -o main
 release:
 	g++ main.cpp -O2 -Wall -o main
+debug-avx2:
+	g++ main.cpp -g -mavx2 -Wall -o main
 release-avx2:
-	g++ main.cpp -fopt-info-vec-missed -mavx2 -O3 -Wall -o main
+	g++ main.cpp -mavx2 -O3 -Wall -o main
 release-avx2-asm:
 	g++ main.cpp -S -fopt-info-vec-missed -masm=intel -mavx2 -O3 -Wall -o main
diff --git a/simap.h b/simap.h
index 6d82168..25ddc53 100644
--- a/simap.h
+++ b/simap.h
@@ -7,16 +7,16 @@
 #include "amap.h"
 #include "arena.h/arena.h"
 
-/* Possible optimizations, but they mean there can be lookup / insert errors (very rarely)
-*/
-#define SIMAP_AVX2_RAW
+/* Possible (non-AVX, but similar) optimization; enabling it means there can be (very rare) lookup / insert errors
 #define SIMAP_RAW
+*/
 
 /* Perf trickery */
 
-/* This unifies the ifdefs but separates code paths when needed */
-#ifdef SIMAP_AVX2_RAW
+/* XXX: Enabling AVX also enables rare errors for speed gain! See above. */
+#ifdef __AVX2__
 #define SIMAP_RAW
+#include <immintrin.h>
 #endif
 
 /* I have no idea what MSVC has instead... */
@@ -122,10 +122,78 @@ static inline simap_instance simap_create() {
 
 static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void *ptr);
 
-// TODO: We can possibly hand-optimise this with intrinsics maybe - but I hope autovectorization (does not seem to happen???)
+/** Gets the number of padding bytes needed to make size divisible by alignment */
+static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) {
+	/* This would round the size itself up to a multiple of the alignment: */
+	/* return (size + alignment - 1) / alignment * alignment; */
+	/* The return below is basically the same as: */
+	/* return (alignment - (size % alignment)) % alignment; */
+
+	/* Subtracting the size from the rounded-up value leaves just the padding */
+	return ((size + alignment - 1) / alignment) * alignment - size;
+}
+
+/** Gets the padded address - or the same address if it is already divisible by the alignment */
+static inline void *get_padded(void *ptr, int alignment) {
+	// return (alignment - (ptr % alignment)) % alignment;
+	return (void*)((ptrdiff_t)((uint8_t *)ptr + alignment - 1) / alignment * alignment);
+}
+
 static inline SM_ALWAYS_INLINE auint64 *make_tipp(auint64 *base, auint64 *tip, auint64 prefix, auint64 *end) {
-#ifdef SIMAP_AVX2_RAW
-	/* TODO: Implement */
+#ifdef __AVX2__
+	/* See:
+	 *
+	 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX
+	 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX&text=movemask
+	 * https://chryswoods.com/vector_c++/part2.html
+	 * https://blog.triplez.cn/posts/avx-avx2-learning-notes/
+	 * https://github.com/Triple-Z/AVX-AVX2-Example-Code
+	 * https://en.algorithmica.org/hpc/simd/masking/
+	 * https://stackoverflow.com/questions/31089502/aligned-and-unaligned-memory-access-with-avx-avx2-intrinsics
+	 */
+
+	/* Step over the previous tip and search linearly until the tip reaches AVX2 alignment */
+	/* AVX registers are 256 bits = 32 bytes, so the address must be a multiple of 32 here */
+	/* TODO: "32" could probably be increased so that small arrays stay on the non-AVX path! */
+	auint64 *neotip = (auint64 *) get_padded(++tip, 32);
+	while((tip < neotip) && (*tip != prefix)) ++tip;
+	if(tip < neotip) return tip;
+
+	/* Prepare an AVX 256-bit search register: the prefix broadcast to 4 uint64_t lanes */
+	__m256i sreg = _mm256_set1_epi64x(prefix);
+
+	while(tip < end) {
+		/* This aligned load needs the 32-byte alignment we ensured above */
+		/* The tip register: 4 uint64_t lanes */
+		__m256i treg = _mm256_load_si256((__m256i *) tip);
+		/* Compare all four lanes at once; movemask then packs the top
+		 * bit of each 64-bit lane into the low 4 bits of the result */
+		__m256i m = _mm256_cmpeq_epi64(sreg, treg); /* Needs AVX2 */
+		uint32_t mask = (uint32_t) _mm256_movemask_pd((__m256d) m);
+
+		/* Try the next tip - we process 256 bits per loop */
+		tip += 4; /* 4x64 bit */
+
+		/* The mask has at most 4 bits set: bit i is set when lane i
+		 * of the AVX register compared equal to the prefix, and
+		 * lane 0 is the lowest address. The first match in memory
+		 * order is therefore the lowest set bit, which
+		 * __builtin_ctz(mask) (count trailing zeroes) gives us.
+		 * (31 - __builtin_clz(mask) would pick the highest set bit
+		 * instead - that is the last match within this block, not
+		 * the first, and would diverge from the scalar path.)
+		 *
+		 * If the mask is all zero, nothing was found in this block;
+		 * otherwise we return the tip + offset we calculated. */
+		if(SM_UNLIKELY(mask != 0)) {
+			int offset = __builtin_ctz(mask);
+			/* -4 because this is the unlikely scenario and we already incremented! */
+			return tip - 4 + offset;
+		}
+	}
+	/* Not-found case */
+	return tip;
 #endif
 #ifdef SIMAP_RAW
 #pragma GCC unroll 16
@@ -150,7 +218,11 @@ static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char
 
 	simap_c64 prefix {0};
 	size_t prefixlen = is_smallkey ? keylen : 8;
+	/* Silence -Wstringop-truncation: prefix.str8 is intentionally not NUL-terminated when the key fills all 8 bytes */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-truncation"
 	strncpy(prefix.str8, key, prefixlen);
+#pragma GCC diagnostic pop
 
 	/* Construct keyremains - might point to the \0 terminator only if smallkey or 8 bytes exactly */
 	const char *keyremains = key + prefixlen;
@@ -204,16 +276,6 @@ static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char
 	return NULL;
 }
 
-/** Gets padding bytes for a size to be padded to divisible alignment */
-static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) {
-	/* Would ensure returned value divisible by alignment */
-	/* return (size + alignment - 1) / alignment * alignment; */
-	/* same: return (alignment - (size % alignment)) % alignment; */
-
-	/* Substracting size leads to padding */
-	return ((size + alignment - 1) / alignment) * alignment - size;
-}
-
 /** Returns the size of the storage needed for the given key */
 static inline uint32_t simap_elem_storage_size(const char *key) {
 	uint32_t keysize = strlen(key);
@@ -248,7 +310,11 @@ static inline void *simap_force_add(simap_instance *map, const char *key, void *
 	/* Create first 8 char encoding (this ensures endianness and all such stuff) */
 	simap_c64 first8 {0};
 	uint32_t keylen = strlen(key);
+	/* Silence -Wstringop-truncation here too: first8.str8 is intentionally not NUL-terminated for keys of 8+ bytes */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-truncation"
 	strncpy(first8.str8, key, (keylen < 8) ? keylen : 8);
+#pragma GCC diagnostic pop
 	uint32_t usi = map->usage_end;
 	uint32_t previ = map->prev_usage_end;
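
The padding helpers get_size_padding() and get_padded() both use the same
round-up arithmetic: add alignment - 1, integer-divide by the alignment,
multiply back. A minimal standalone sketch of those formulas (padding_of
and pad_ptr are illustrative names, not from the patch; pad_ptr goes
through uintptr_t where get_padded() casts through ptrdiff_t):

#include <cassert>
#include <cstdint>

/* Same formula as get_size_padding(): round size up to the next
 * multiple of alignment, then subtract size to keep only the pad. */
static unsigned int padding_of(unsigned int size, unsigned int alignment) {
	return ((size + alignment - 1) / alignment) * alignment - size;
}

/* Same idea as get_padded(): round an address up to a multiple of alignment. */
static void *pad_ptr(void *ptr, uintptr_t alignment) {
	return (void *)(((uintptr_t)ptr + alignment - 1) / alignment * alignment);
}

int main() {
	assert(padding_of(13, 8) == 3);   /* 13 rounds up to 16, leaving 3 pad bytes */
	assert(padding_of(16, 8) == 0);   /* already aligned */
	assert(padding_of(1, 32) == 31);  /* 1 rounds up to 32 */
	assert((uintptr_t)pad_ptr((void *)13, 8) == 16);
	return 0;
}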
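The fast path of make_tipp() is the classic broadcast/compare/movemask SIMD
search. Here is a minimal sketch of the same idiom, assuming - as the map's
storage layout provides via the padding helpers - a 32-byte-aligned buffer
whose length is a multiple of four uint64_t (find64_avx2 is an illustrative
name; build with g++ -mavx2):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

/* Returns a pointer to the first element equal to needle, or end.
 * tip must be 32-byte aligned and (end - tip) a multiple of 4. */
static const uint64_t *find64_avx2(const uint64_t *tip, const uint64_t *end, uint64_t needle) {
	__m256i sreg = _mm256_set1_epi64x((long long) needle); /* broadcast needle into 4 lanes */
	while(tip < end) {
		__m256i treg = _mm256_load_si256((const __m256i *) tip); /* aligned 32-byte load */
		__m256i eq = _mm256_cmpeq_epi64(sreg, treg);             /* lane-wise 64-bit compare */
		uint32_t mask = (uint32_t) _mm256_movemask_pd(_mm256_castsi256_pd(eq));
		if(mask != 0)
			return tip + __builtin_ctz(mask); /* lowest set bit = first match in memory order */
		tip += 4;
	}
	return end;
}

int main() {
	alignas(32) uint64_t data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
	std::printf("%td\n", find64_avx2(data, data + 8, 6) - data); /* prints 5 */
	return 0;
}

The movemask/ctz pair is what turns the vector comparison back into an
index: each of the four mask bits answers "did lane i match?", and counting
trailing zeroes picks the first matching lane.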
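The #pragma GCC diagnostic push/ignored/pop pattern used around both
strncpy() calls scopes the warning suppression to the one call that
truncates on purpose, instead of disabling -Wstringop-truncation for the
whole translation unit. A minimal sketch of the pattern (c64 and
fill_prefix are illustrative stand-ins for the patch's simap_c64 code):

#include <cstring>

struct c64 { char str8[8]; };

/* strncpy() copies at most 8 bytes and deliberately leaves str8 without
 * a NUL terminator for keys of 8 bytes or more; GCC's
 * -Wstringop-truncation warns about exactly that, so the warning is
 * silenced just around the call. */
static void fill_prefix(c64 &out, const char *key, std::size_t keylen) {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-truncation"
	std::strncpy(out.str8, key, keylen < 8 ? keylen : 8);
#pragma GCC diagnostic pop
}

With the makefile additions, the AVX2 paths can be exercised via
make debug-avx2 or make release-avx2, while plain make release builds
without -mavx2 and therefore keeps the scalar SIMAP_RAW code path.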