diff --git a/.github/style_type_check_cfg/.flake8 b/.github/style_type_check_cfg/.flake8 index dbe66680..5326990d 100644 --- a/.github/style_type_check_cfg/.flake8 +++ b/.github/style_type_check_cfg/.flake8 @@ -1,3 +1,3 @@ [flake8] -ignore = E203,E501,W605,F541 +extend-ignore = E203,E501,W605,F541 max_line_length = 100 diff --git a/pecos/core/base.py b/pecos/core/base.py index 6a80b4b5..8cbe6660 100644 --- a/pecos/core/base.py +++ b/pecos/core/base.py @@ -2070,7 +2070,7 @@ def link_mmap_hashmap_methods(self): Specify C-lib's Memory-mappable Hashmap methods arguments and return types. """ fn_prefix = "mmap_hashmap" - map_type_list = ["str2int", "fixed_len_str2int", "asin_str2int", "int2int"] + map_type_list = ["str2int", "fixed_len_str2int", "fixed_len_10_str2int", "int2int"] key_args_dict = { "str2int": [ c_char_p, # pointer of key string @@ -2080,7 +2080,7 @@ def link_mmap_hashmap_methods(self): c_char_p, # pointer of key string c_uint32, # length of key string ], - "asin_str2int": [ + "fixed_len_10_str2int": [ c_char_p, # pointer of key string c_uint32, # length of key string ], @@ -2097,7 +2097,7 @@ def link_mmap_hashmap_methods(self): c_void_p, # List of pointer of key string POINTER(c_uint32), # List of length of key string ], - "asin_str2int": [ + "fixed_len_10_str2int": [ c_void_p, # List of pointer of key string POINTER(c_uint32), # List of length of key string ], diff --git a/pecos/core/libpecos.cpp b/pecos/core/libpecos.cpp index 9337ab37..7e3b2636 100644 --- a/pecos/core/libpecos.cpp +++ b/pecos/core/libpecos.cpp @@ -663,7 +663,7 @@ extern "C" { typedef pecos::mmap_hashmap::Str2IntMap mmap_hashmap_str2int; typedef pecos::mmap_hashmap::Str2IntMap mmap_hashmap_fixed_len_str2int; - typedef pecos::mmap_hashmap::Str2IntMap mmap_hashmap_asin_str2int; + typedef pecos::mmap_hashmap::Str2IntMap mmap_hashmap_fixed_len_10_str2int; typedef pecos::mmap_hashmap::Int2IntMap mmap_hashmap_int2int; // New @@ -672,7 +672,7 @@ extern "C" { return static_cast(new mmap_hashmap_ ## SUFFIX()); } MMAP_MAP_NEW(str2int) MMAP_MAP_NEW(fixed_len_str2int) - MMAP_MAP_NEW(asin_str2int) + MMAP_MAP_NEW(fixed_len_10_str2int) MMAP_MAP_NEW(int2int) // Destruct @@ -681,7 +681,7 @@ extern "C" { delete static_cast(map_ptr); } MMAP_MAP_DESTRUCT(str2int) MMAP_MAP_DESTRUCT(fixed_len_str2int) - MMAP_MAP_DESTRUCT(asin_str2int) + MMAP_MAP_DESTRUCT(fixed_len_10_str2int) MMAP_MAP_DESTRUCT(int2int) // Save @@ -690,7 +690,7 @@ extern "C" { static_cast(map_ptr)->save(map_dir); } MMAP_MAP_SAVE(str2int) MMAP_MAP_SAVE(fixed_len_str2int) - MMAP_MAP_SAVE(asin_str2int) + MMAP_MAP_SAVE(fixed_len_10_str2int) MMAP_MAP_SAVE(int2int) // Load @@ -701,7 +701,7 @@ extern "C" { return static_cast(map_ptr); } MMAP_MAP_LOAD(str2int) MMAP_MAP_LOAD(fixed_len_str2int) - MMAP_MAP_LOAD(asin_str2int) + MMAP_MAP_LOAD(fixed_len_10_str2int) MMAP_MAP_LOAD(int2int) // Size @@ -710,7 +710,7 @@ extern "C" { return static_cast(map_ptr)->size(); } MMAP_MAP_SIZE(str2int) MMAP_MAP_SIZE(fixed_len_str2int) - MMAP_MAP_SIZE(asin_str2int) + MMAP_MAP_SIZE(fixed_len_10_str2int) MMAP_MAP_SIZE(int2int) // Insert @@ -720,7 +720,7 @@ extern "C" { static_cast(map_ptr)->insert(FUNC_CALL_KEY, val); } MMAP_MAP_INSERT(str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_INSERT(fixed_len_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) - MMAP_MAP_INSERT(asin_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) + MMAP_MAP_INSERT(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_INSERT(int2int, uint64_t key, key) // Get @@ -729,7 +729,7 @@ extern "C" { return static_cast(map_ptr)->get(FUNC_CALL_KEY); } MMAP_MAP_GET(str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_GET(fixed_len_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) - MMAP_MAP_GET(asin_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) + MMAP_MAP_GET(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_GET(int2int, uint64_t key, key) #define MMAP_MAP_GET_W_DEFAULT(SUFFIX, KEY, FUNC_CALL_KEY) \ @@ -737,7 +737,7 @@ extern "C" { return static_cast(map_ptr)->get_w_default(FUNC_CALL_KEY, def_val); } MMAP_MAP_GET_W_DEFAULT(str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_GET_W_DEFAULT(fixed_len_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) - MMAP_MAP_GET_W_DEFAULT(asin_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) + MMAP_MAP_GET_W_DEFAULT(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_GET_W_DEFAULT(int2int, uint64_t key, key) #define MMAP_MAP_BATCH_GET_W_DEFAULT(SUFFIX, KEY, FUNC_CALL_KEY) \ @@ -745,7 +745,7 @@ extern "C" { static_cast(map_ptr)->batch_get_w_default(n_key, FUNC_CALL_KEY, def_val, vals, threads); } MMAP_MAP_BATCH_GET_W_DEFAULT(str2int, KEY_SINGLE_ARG(const char* const* keys, const uint32_t* keys_lens), KEY_SINGLE_ARG(keys, keys_lens)) MMAP_MAP_BATCH_GET_W_DEFAULT(fixed_len_str2int, KEY_SINGLE_ARG(const char* const* keys, const uint32_t* keys_lens), KEY_SINGLE_ARG(keys, keys_lens)) - MMAP_MAP_BATCH_GET_W_DEFAULT(asin_str2int, KEY_SINGLE_ARG(const char* const* keys, const uint32_t* keys_lens), KEY_SINGLE_ARG(keys, keys_lens)) + MMAP_MAP_BATCH_GET_W_DEFAULT(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* const* keys, const uint32_t* keys_lens), KEY_SINGLE_ARG(keys, keys_lens)) MMAP_MAP_BATCH_GET_W_DEFAULT(int2int, const uint64_t* key, key) // Contains @@ -754,7 +754,7 @@ extern "C" { return static_cast(map_ptr)->contains(FUNC_CALL_KEY); } MMAP_MAP_CONTAINS(str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_CONTAINS(fixed_len_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) - MMAP_MAP_CONTAINS(asin_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) + MMAP_MAP_CONTAINS(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_CONTAINS(int2int, uint64_t key, key) diff --git a/pecos/core/utils/mmap_hashmap.hpp b/pecos/core/utils/mmap_hashmap.hpp index b6a90af2..08bcee09 100644 --- a/pecos/core/utils/mmap_hashmap.hpp +++ b/pecos/core/utils/mmap_hashmap.hpp @@ -285,15 +285,20 @@ class AnkerlFixedLenStr2IntMmapableVector { std::forward_as_tuple(size_), std::forward< std::tuple >(args)); + + size_type key_length = k.size(); + // Length of new key should be the same as previous keys - if (fixed_str_len_ != -1 && fixed_str_len_ != k.size()) { - throw std::runtime_error("String length differs from previous keys."); + if (key_length == 0) { + throw std::runtime_error("Key length should be greater than 0."); + } else if (fixed_str_len_ != 0 && fixed_str_len_ != key_length) { + throw std::runtime_error("Key length differs from previous keys."); } else { - fixed_str_len_ = k.size(); + fixed_str_len_ = key_length; } // Append key string - str_store_.insert(str_store_.end(), k.data(), k.data() + k.size()); + str_store_.insert(str_store_.end(), k.data(), k.data() + key_length); // Update pointers size_ = store_.size(); @@ -349,7 +354,7 @@ class AnkerlFixedLenStr2IntMmapableVector { value_type* data_ = nullptr; char* str_data_ = nullptr; - size_type fixed_str_len_ = -1; + size_type fixed_str_len_ = 0; // Actual data storage for in-memory case std::vector store_; @@ -435,16 +440,18 @@ class AnkerlFixedLenStr2IntMmapableVector { // Memory-mappable vector of std::pair for Ankerl // This vector takes/gets std::string_view as the key, but emplace back as the special mmap format StrView // The key must be of length 10 -class AnkerlAsinStr2IntMmapableVector { +class AnkerlFixedLen10Str2IntMmapableVector { template class iter_t; + static constexpr std::size_t fixed_str_len = 10; + struct StrView { - char str[10]; + char str[fixed_str_len]; StrView(const char* input_str = nullptr) { if (input_str) { - std::strncpy(str, input_str, 10); + std::memcpy(str, input_str, fixed_str_len); } } }; @@ -463,8 +470,8 @@ class AnkerlAsinStr2IntMmapableVector { using iterator = iter_t; using const_iterator = iter_t; - AnkerlAsinStr2IntMmapableVector() = default; - AnkerlAsinStr2IntMmapableVector(allocator_type alloc) + AnkerlFixedLen10Str2IntMmapableVector() = default; + AnkerlFixedLen10Str2IntMmapableVector(allocator_type alloc) : store_(alloc) {} value_type* data() { return data_; } @@ -493,12 +500,12 @@ class AnkerlAsinStr2IntMmapableVector { // Extract key key_type key_string = std::get<0>(key); - if (key_string.size() != 10) { + if (key_string.size() != fixed_str_len) { throw std::runtime_error("ASIN string length is not 10."); } - char key_arr[10]; - std::strncpy(key_arr, key_string.data(), key_string.size()); + char key_arr[fixed_str_len]; + std::memcpy(key_arr, key_string.data(), key_string.size()); // Emplace back std::pair auto eb_val = store_.emplace_back( @@ -524,7 +531,7 @@ class AnkerlAsinStr2IntMmapableVector { /* Get key for given member */ key_type get_key(value_type const& vt) const { - return key_type(vt.first.str, 10); + return key_type(vt.first.str, fixed_str_len); } /* Mmap save/load with MmapStore */ @@ -564,7 +571,7 @@ class AnkerlAsinStr2IntMmapableVector { template class iter_t { using ptr_t = typename std::conditional_t; + AnkerlFixedLen10Str2IntMmapableVector::const_pointer, AnkerlFixedLen10Str2IntMmapableVector::pointer>; ptr_t iter_data_{}; template @@ -572,12 +579,12 @@ class AnkerlAsinStr2IntMmapableVector { public: using iterator_category = std::forward_iterator_tag; - using difference_type = AnkerlAsinStr2IntMmapableVector::difference_type; - using value_type = AnkerlAsinStr2IntMmapableVector::value_type; + using difference_type = AnkerlFixedLen10Str2IntMmapableVector::difference_type; + using value_type = AnkerlFixedLen10Str2IntMmapableVector::value_type; using reference = typename std::conditional_t; using pointer = typename std::conditional_t; + AnkerlFixedLen10Str2IntMmapableVector::const_pointer, AnkerlFixedLen10Str2IntMmapableVector::pointer>; iter_t() noexcept = default; diff --git a/pecos/utils/mmap_hashmap_util.py b/pecos/utils/mmap_hashmap_util.py index 74c13053..b9d1cc0e 100644 --- a/pecos/utils/mmap_hashmap_util.py +++ b/pecos/utils/mmap_hashmap_util.py @@ -187,7 +187,11 @@ def init(cls, map_type, map_dir, lazy_load): fn_dict = clib.mmap_hashmap_init(map_type) map_ptr = fn_dict["load"](map_dir.encode("utf-8"), lazy_load) - if map_type == "str2int" or map_type == "fixed_len_str2int" or map_type == "asin_str2int": + if ( + map_type == "str2int" + or map_type == "fixed_len_str2int" + or map_type == "fixed_len_10_str2int" + ): return _MmapHashmapStr2IntReadOnly(map_ptr, fn_dict) elif map_type == "int2int": return _MmapHashmapInt2IntReadOnly(map_ptr, fn_dict) @@ -340,7 +344,11 @@ def init(cls, map_type, map_dir): fn_dict = clib.mmap_hashmap_init(map_type) map_ptr = fn_dict["new"]() - if map_type == "str2int" or map_type == "fixed_len_str2int" or map_type == "asin_str2int": + if ( + map_type == "str2int" + or map_type == "fixed_len_str2int" + or map_type == "fixed_len_10_str2int" + ): return _MmapHashmapStr2IntWrite(map_ptr, fn_dict, map_dir) elif map_type == "int2int": return _MmapHashmapInt2IntWrite(map_ptr, fn_dict, map_dir) diff --git a/test/pecos/utils/test_mmap_hashmap_util.py b/test/pecos/utils/test_mmap_hashmap_util.py index eb74be84..c58b07e7 100644 --- a/test/pecos/utils/test_mmap_hashmap_util.py +++ b/test/pecos/utils/test_mmap_hashmap_util.py @@ -121,14 +121,14 @@ def test_fixed_len_str2int_mmap_hashmap(tmpdir): assert r_map_batch_getter.get(ks, 10).tolist() == vs -def test_asin_str2int_mmap_hashmap(tmpdir): +def test_fixed_len_10_str2int_mmap_hashmap(tmpdir): from pecos.utils.mmap_hashmap_util import MmapHashmap, MmapHashmapBatchGetter - map_dir = tmpdir.join("asin_str2int").realpath().strpath + map_dir = tmpdir.join("fixed_len_10_str2int").realpath().strpath kv_dict = {"aaaaaaaaaa".encode("utf-8"): 2, "bbbbbbbbbb".encode("utf-8"): 3} # Write-only Mode - w_map = MmapHashmap("asin_str2int") + w_map = MmapHashmap("fixed_len_10_str2int") w_map.open("w", map_dir) # Insert w_map.map.insert("aaaaaaaaaa".encode("utf-8"), 1) # Test for overwrite later @@ -139,7 +139,7 @@ def test_asin_str2int_mmap_hashmap(tmpdir): w_map.close() # Read-only Mode - r_map = MmapHashmap("asin_str2int") + r_map = MmapHashmap("fixed_len_10_str2int") r_map.open("r", map_dir) # Get for k, v in kv_dict.items():