AVX2 implementation seems to work and is (as expected) faster than the regular version

parent 4e4c266632
commit 64a7d871c2

main.cpp (2 changed lines)
@@ -135,7 +135,7 @@ int main() {
     test_basics(unomap, &umi);

     /* Performance tests */
-    int i = 10000;
+    int i = 100;
     keystore(i, true);
     datastore(i, true);

makefile (4 changed lines)
@@ -2,7 +2,9 @@ debug:
     g++ main.cpp -g -Wall -o main
 release:
     g++ main.cpp -O2 -Wall -o main
 debug-avx2:
     g++ main.cpp -g -mavx2 -Wall -o main
 release-avx2:
-    g++ main.cpp -fopt-info-vec-missed -mavx2 -O3 -Wall -o main
+    g++ main.cpp -mavx2 -O3 -Wall -o main
+release-avx2-asm:
+    g++ main.cpp -S -fopt-info-vec-missed -masm=intel -mavx2 -O3 -Wall -o main
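
Side note, not part of the diff: the -mavx2 flag is what makes g++ define __AVX2__, which simap.h keys its SIMD path on. A tiny standalone check (hypothetical file, not in the repo):

#include <cstdio>

int main() {
#ifdef __AVX2__
    /* g++ defines __AVX2__ under -mavx2 (or -march=native on AVX2 hardware) */
    std::puts("__AVX2__ defined: SIMD path would compile in");
#else
    std::puts("__AVX2__ not defined: scalar path only");
#endif
    return 0;
}

Built with the plain release flags this prints the second line; with the release-avx2 flags, the first.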
simap.h (102 changed lines)
@@ -7,16 +7,16 @@
#include "amap.h"
#include "arena.h/arena.h"

/* Possible optimizations, but they mean there can be lookup / insert errors (very rarely)
*/
#define SIMAP_AVX2_RAW
/* Possible (non-AVX, but alike) optimization, but means there can be lookup / insert errors (very rarely)
#define SIMAP_RAW
*/

/* Perf trickery */

/* This unifies the ifdefs but separates code paths when needed */
#ifdef SIMAP_AVX2_RAW
/* XXX: Enabling AVX also enables rare errors for speed gain! See above. */
#ifdef __AVX2__
#define SIMAP_RAW
#include <immintrin.h>
#endif

/* I have no idea what MSVC has instead... */
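
Side note on that last comment, not part of the diff: MSVC also defines __AVX2__ when compiling with /arch:AVX2, and it ships the same immintrin.h, so the unified guard above should carry over unchanged. A sketch:

#ifdef __AVX2__   /* set by g++ -mavx2 and by MSVC /arch:AVX2 alike */
#include <immintrin.h>
#endif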
@@ -122,10 +122,78 @@ static inline simap_instance simap_create() {

static inline void* simap(void *amap_instance, AMAP_OP op, const char *key, void *ptr);

// TODO: We could possibly hand-optimise this with intrinsics - but I hoped for autovectorization (which does not seem to happen???)
/** Gets the padding bytes needed to grow a size to the next multiple of the alignment */
static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) {
    /* (size + alignment - 1) / alignment * alignment would round the size up to be divisible by the alignment */
    /* The padding alone could also be computed as: (alignment - (size % alignment)) % alignment */

    /* Subtracting the original size from the rounded-up size leaves the padding */
    return ((size + alignment - 1) / alignment) * alignment - size;
}

/** Gets the padded address - or the same address if it is already divisible by the alignment */
static inline void *get_padded(void *ptr, int alignment) {
    /* the padding alone would be: (alignment - (ptr % alignment)) % alignment */
    /* uintptr_t rather than ptrdiff_t: it is the integer type meant to hold pointer values */
    return (void*)((uintptr_t)((uint8_t *)ptr + alignment - 1) / alignment * alignment);
}
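
Side note, not part of the diff: plugging concrete numbers into the two helpers above (a throwaway check, assuming simap.h is included so both helpers are in scope):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    assert(get_size_padding(13, 8) == 3);   /* 13 rounds up to 16 -> 3 bytes of padding */
    assert(get_size_padding(32, 8) == 0);   /* already aligned -> no padding */

    alignas(32) uint8_t buf[64];
    /* one byte past a 32-byte boundary pads up to the next boundary */
    assert(get_padded(buf + 1, 32) == (void*)(buf + 32));
    std::puts("padding helpers OK");
    return 0;
}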

static inline SM_ALWAYS_INLINE auint64 *make_tipp(auint64 *base, auint64 *tip, auint64 prefix, auint64 *end) {
#ifdef SIMAP_AVX2_RAW
    /* TODO: Implement */
#ifdef __AVX2__
    /* See:
     *
     * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX
     * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=861,4605&avxnewtechs=AVX&text=movemask
     * https://chryswoods.com/vector_c++/part2.html
     * https://blog.triplez.cn/posts/avx-avx2-learning-notes/
     * https://github.com/Triple-Z/AVX-AVX2-Example-Code
     * https://en.algorithmica.org/hpc/simd/masking/
     * https://stackoverflow.com/questions/31089502/aligned-and-unaligned-memory-access-with-avx-avx2-intrinsics
     */

    /* Step over the previous tipp and search linearly until the AVX2 alignment needs are met */
    /* AVX's 256 bits = 32 bytes, so a multiple of 32 is needed here */
    /* TODO: Probably I can change "32" here into something bigger, so that small arrays stay on the non-AVX path! */
    auint64 *neotip = (auint64 *) get_padded(++tip, 32);
    while((tip < neotip) && (*tip != prefix)) ++tip;
    if(tip < neotip) return tip;

    /* Prepare an AVX 256 bit search register: 4x uint64_t */
    __m256i sreg = _mm256_set1_epi64x(prefix);

    while(tip < end) {
        /* This load needs the 32 byte alignment that we ensured above */
        /* The tipp register: 4x uint64_t */
        __m256i treg = _mm256_load_si256((__m256i *) tip);
        /* Check equality so we can return the proper tip address for the first match */
        __m256i m = _mm256_cmpeq_epi64(sreg, treg); /* Needs AVX2 */
        uint32_t mask = (uint32_t) _mm256_movemask_pd((__m256d) m);

        /* Advance the tip: 256 bits (4x 64 bit) processed per loop */
        tip += 4;

        /* One of the links used __builtin_ctz(mask) - and that
         * is the right tool here: __builtin_clz(mask) would
         * find the highest set bit, i.e. the LAST match in
         * this block, and returning that could skip earlier
         * matches.
         *
         * The mask has 4 bits at most and each shows whether
         * one of the 4 lanes of the AVX reg compared equal to
         * the given prefix value (4x 64 bit comparisons
         * happen), bit 0 belonging to the lowest address.
         * For a nonzero mask, __builtin_ctz(mask) counts the
         * trailing zeroes and thus yields 0..3: the offset of
         * the first matching lane.
         *
         * If the mask got all zero, there is nothing found,
         * otherwise it's the tipp + offset we calculated. */
        if(SM_UNLIKELY(mask != 0)) {
            int offset = __builtin_ctz(mask);
            /* -4 because this is the unlikely scenario and we already incremented! */
            return tip - 4 + offset;
        }
    }
    /* Not found case */
    return tip;
#endif
#ifdef SIMAP_RAW
#pragma GCC unroll 16
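
Side note, not part of the diff: the broadcast / compare / movemask / ctz pattern above can be exercised in isolation. A minimal self-contained sketch (function and variable names made up; only the intrinsics match the code above), built with g++ -mavx2:

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

/* Returns the index of the first element equal to needle, or len if none.
 * arr must be 32-byte aligned and len a multiple of 4. */
static size_t find_u64_avx2(const uint64_t *arr, size_t len, uint64_t needle) {
    __m256i sreg = _mm256_set1_epi64x((long long) needle);            /* broadcast to 4 lanes */
    for (size_t i = 0; i < len; i += 4) {
        __m256i treg = _mm256_load_si256((const __m256i *)(arr + i)); /* aligned load */
        __m256i eq = _mm256_cmpeq_epi64(sreg, treg);                  /* lanewise == */
        uint32_t mask = (uint32_t) _mm256_movemask_pd(_mm256_castsi256_pd(eq));
        /* bit 0 = lowest address, so the lowest set bit is the first match */
        if (mask != 0) return i + __builtin_ctz(mask);
    }
    return len;
}

int main() {
    alignas(32) uint64_t data[8] = {1, 2, 3, 42, 5, 42, 7, 8};
    std::printf("found at %zu\n", find_u64_avx2(data, 8, 42)); /* prints "found at 3" */
    return 0;
}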
@@ -150,7 +218,11 @@ static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char

     simap_c64 prefix {0};
     size_t prefixlen = is_smallkey ? keylen : 8;
+    /* Ignore warning because we know what we are doing here... */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-truncation"
     strncpy(prefix.str8, key, prefixlen);
+#pragma GCC diagnostic pop

     /* Construct keyremains - might point to the \0 terminator only if smallkey or 8 bytes exactly */
     const char *keyremains = key + prefixlen;
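
Side note, not part of the diff: -Wstringop-truncation fires exactly because strncpy here may copy 8 bytes without a terminating '\0', which is intended. The push / ignored / pop sandwich is the usual GCC idiom for scoping the suppression to just this call; reduced to its bones (hypothetical names):

#include <cstring>

void fill_prefix8(char dst8[8], const char *key, std::size_t prefixlen) {
    /* Truncation without '\0' is deliberate: dst8 is a fixed 8-byte prefix, not a C string */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-truncation"
    std::strncpy(dst8, key, prefixlen);
#pragma GCC diagnostic pop
}

The push/pop pair restores whatever diagnostic state was active before, so the warning stays enabled everywhere else.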
@@ -204,16 +276,6 @@ static inline simap_ptr64 *simap_search_internal(simap_instance *map, const char
     return NULL;
 }

-/** Gets padding bytes for a size to be padded to divisible alignment */
-static inline unsigned int get_size_padding(unsigned int size, unsigned int alignment) {
-    /* Would ensure returned value divisible by alignment */
-    /* return (size + alignment - 1) / alignment * alignment; */
-    /* same: return (alignment - (size % alignment)) % alignment; */
-
-    /* Substracting size leads to padding */
-    return ((size + alignment - 1) / alignment) * alignment - size;
-}
-
 /** Returns the size of the storage needed for the given key */
 static inline uint32_t simap_elem_storage_size(const char *key) {
     uint32_t keysize = strlen(key);
@@ -248,7 +310,11 @@ static inline void *simap_force_add(simap_instance *map, const char *key, void *
     /* Create first 8 char encoding (this ensures endianness and all such stuff) */
     simap_c64 first8 {0};
     uint32_t keylen = strlen(key);
+    /* Ignore warning because we know what we are doing here... */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-truncation"
     strncpy(first8.str8, key, (keylen < 8) ? keylen : 8);
+#pragma GCC diagnostic pop

     uint32_t usi = map->usage_end;
     uint32_t previ = map->prev_usage_end;
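
Side note, not part of the diff: the whole point of first8 is that the leading (up to) 8 key bytes become one uint64 prefix, so candidate filtering is a single integer (or AVX lane) compare. A sketch with an assumed simap_c64 layout (the real definition lives elsewhere in simap.h; this union is only for illustration):

#include <cstdint>
#include <cstdio>
#include <cstring>

/* Assumed shape of simap_c64: 8 raw chars overlaid with a uint64 view */
union c64_sketch {
    char str8[8];
    uint64_t u64;
};

int main() {
    c64_sketch a{}, b{};
    std::memcpy(a.str8, "alphabetic", 8);    /* keep exactly the first 8 bytes */
    std::memcpy(b.str8, "alphabetical", 8);
    /* same 8-byte prefix -> same uint64, byte order applied identically to both */
    std::printf("%d\n", a.u64 == b.u64);     /* prints 1 */
    return 0;
}

Strictly speaking, reading the other union member is implementation-defined in C++, but it is the same trick the header itself relies on.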