From c698fc55c355c9df6e85158178ed98c22f8af8eb Mon Sep 17 00:00:00 2001 From: Richard Thier Date: Mon, 27 Jan 2025 04:21:18 +0100 Subject: [PATCH] vmap find implementation - untested --- vmap.h | 113 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 21 deletions(-) diff --git a/vmap.h b/vmap.h index bb99273..4feb71f 100644 --- a/vmap.h +++ b/vmap.h @@ -1,16 +1,16 @@ #ifndef VMAP_H #define VMAP_H /* - * A virtual memory misusing flat-ish hashmap optimized with AVX2. + * A virtual memory misusing flat-ish hashmap optimized with AVX2 (if available at compilation). * * Structure * * VMEM * STRUCT - * PRIVATE - * UINTAPI + * INTAPI */ #include +#include #include "simd_map_lane.h" /* VMEM */ @@ -51,12 +51,12 @@ struct vmap_find_res { uint32_t lane_abcd_next; /** Meta-data for continuation of the search. In-lane where we search from next time? */ int lane_next_begin; + /** Meta-data for continuation of the search. Last value found in lastly looked lane. */ + uint32_t last_found_lane_val; }; typedef struct simd_map_find_res simd_map_find_res; -/* PRIVATE */ - -/* UINTAPI */ +/* INTAPI */ static inline vmap create_vmap(uint32_t max_levels) { vmap map{ NULL, 0, max_levels}; @@ -76,6 +76,7 @@ static inline vmap_find_res vmap_search_all_begin() { ret.level = 0; ret.lane_abcd_next = 0; ret.lane_next_begin = 0; + ret.last_found_lane_val = 0; return ret; } @@ -96,6 +97,14 @@ static inline vmap_find_res search_all_vmap(vmap *map, uint32_t key, vmap_find_r uint32_t level = prev.level; /* Probably the loop exists always without this predicate being false */ while(level <= map->max_levels) { + /* Rare edge-case when last lane element was returned and we continue from it */ + if(prev.lane_abcd_next > 4) { + prev = vmap_search_all_begin(); + ++level; + /* prev.level = level; // unnecessary, I hand-optimized out */ + continue; + } + /* Process 8 bits of the 32-bit circular order - so its not radix, but similar */ uint32_t byt = level % 4; // Low 4 bits: page @@ -104,15 +113,11 @@ static inline vmap_find_res search_all_vmap(vmap *map, uint32_t key, vmap_find_r uint32_t page_offset = 1024 * page_no; /* Top 4 bits: lane. There is 32 lane start positions in the 4k page */ - uint32_t lane_no = (key >> (byt * 8 + 4)) && 15; + uint32_t lane_no = (key >> (byt * 8 + 4)) && 15 + + prev.lane_abcd_next; /* continuations start where we left off */ /* But 4096 / 4 == 1024 elements, which then divided by 16 == 64 uint32_t elems */ uint32_t lane_offset = lane_no * 64; -// FIXME: Rerhink what is needed for continuations! -// I think we should store A, B, C and D lane retvals plus where we are -// or maybe just the "where we are" and figure out with logic here, -// but maybe I need to just save flags (4x1 bytes) for "does lane-ABCD search needed?" as that is faster to simd branch pred? - /* A lane has 8x32 bit keys, then 8x32 bit values. 16 uint32_t elems. */ /* So grab the A, B, C and D candidate lanes for each lane_offset. */ simd_map_lane *lane_a = (simd_map_lane *) map->data + page_offset + lane_offset; @@ -120,28 +125,94 @@ static inline vmap_find_res search_all_vmap(vmap *map, uint32_t key, vmap_find_r simd_map_lane *lane_c = lane_b + 1; simd_map_lane *lane_d = lane_c + 1; + /* Get which lane we should begin at where */ + uint32_t lane_a_begin = prev.lane_next_begin; + int lane_next_begin = 0; + /* Further lanes only needed if ours is fully filled */ /* Overlay simd and integer units here for perf */ uint32_t *afind = simd_map_lane_find( lane_a, key, 0, /* lane modulo: 0 means until lane end */ - 0, /* FIXME - from continuation! */ - NULL); /* FIXME - we should fill a *lane_next_begin ptr here */ - uint32_t bneed = simd_map_lane_last_value(lane_a); + lane_a_begin, + &lane_next_begin); + uint32_t lasta = simd_map_lane_last_value(lane_a); + char bneed = (lasta != 0) && (prev.lane_abcd_next < 3); if(afind) { + ret.lane_next_begin = lane_next_begin; + ret.lane_abcd_next = prev.lane_abcd_next + (lane_next_begin == 0); ret.value_location = afind; ret.level = level; + ret.last_found_lane_val = lasta; + return ret; } - /* TODO: Implement B, C and D */ + if(bneed) { + uint32_t *bfind = simd_map_lane_find( + lane_b, + key, + 0, /* lane modulo: 0 means until lane end */ + 0, /* non-a lanes all start from 0 */ + &lane_next_begin); + uint32_t lastb = simd_map_lane_last_value(lane_b); + char cneed = (lastb != 0) && (prev.lane_abcd_next < 2); + if(bfind) { + ret.lane_next_begin = lane_next_begin; + ret.lane_abcd_next = prev.lane_abcd_next + (lane_next_begin == 0); + ret.value_location = bfind; + ret.level = level; + ret.last_found_lane_val = lastb; + return ret; + } - uint32_t cneed = simd_map_lane_last_value(lane_b); - uint32_t dneed = simd_map_lane_last_value(lane_c); + if(cneed) { + uint32_t *cfind = simd_map_lane_find( + lane_c, + key, + 0, /* lane modulo: 0 means until lane end */ + 0, /* non-a lanes all start from 0 */ + &lane_next_begin); + uint32_t lastc = simd_map_lane_last_value(lane_c); + char dneed = (lastc != 0) && (prev.lane_abcd_next < 1); + if(cfind) { + ret.lane_next_begin = lane_next_begin; + ret.lane_abcd_next = prev.lane_abcd_next + (lane_next_begin == 0); + ret.value_location = cfind; + ret.level = level; + ret.last_found_lane_val = lastc; + return ret; + } - /* Check if we need to jump to the next level and do */ - uint32_t more = simd_map_lane_last_value(lane_c); - if(!more) return ret; + if(dneed) { + uint32_t *dfind = simd_map_lane_find( + lane_d, + key, + 0, /* lane modulo: 0 means until lane end */ + 0, /* non-a lanes all start from 0 */ + &lane_next_begin); + uint32_t lastd = simd_map_lane_last_value(lane_d); + char next_level = (lastd != 0); + if(dfind) { + ret.lane_next_begin = lane_next_begin; + ret.lane_abcd_next = prev.lane_abcd_next + (lane_next_begin == 0); + ret.value_location = dfind; + ret.level = level; + ret.last_found_lane_val = lastd; + return ret; + } + + /* Check to avoid next level (stop iteration) */ + if(!next_level) { + return vmap_search_all_begin(); + } + } + } + } + + /* Next level needs checking */ + prev = vmap_search_all_begin(); ++level; + /* prev.level = level; // unnecessary, I hand-optimized out */ } return ret;