Skip to content

Commit

Permalink
rename asin_str2int to fixed_len_10_str2int
Browse files Browse the repository at this point in the history
  • Loading branch information
vpung committed Aug 27, 2024
1 parent a0a3fd8 commit 1aedb47
Show file tree
Hide file tree
Showing 26 changed files with 82 additions and 55 deletions.
2 changes: 1 addition & 1 deletion .github/style_type_check_cfg/.flake8
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[flake8]
ignore = E203,E501,W605,F541
extend-ignore = E203,E501,W605,F541
max_line_length = 100
6 changes: 3 additions & 3 deletions pecos/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2070,7 +2070,7 @@ def link_mmap_hashmap_methods(self):
Specify C-lib's Memory-mappable Hashmap methods arguments and return types.
"""
fn_prefix = "mmap_hashmap"
map_type_list = ["str2int", "fixed_len_str2int", "asin_str2int", "int2int"]
map_type_list = ["str2int", "fixed_len_str2int", "fixed_len_10_str2int", "int2int"]
key_args_dict = {
"str2int": [
c_char_p, # pointer of key string
Expand All @@ -2080,7 +2080,7 @@ def link_mmap_hashmap_methods(self):
c_char_p, # pointer of key string
c_uint32, # length of key string
],
"asin_str2int": [
"fixed_len_10_str2int": [
c_char_p, # pointer of key string
c_uint32, # length of key string
],
Expand All @@ -2097,7 +2097,7 @@ def link_mmap_hashmap_methods(self):
c_void_p, # List of pointer of key string
POINTER(c_uint32), # List of length of key string
],
"asin_str2int": [
"fixed_len_10_str2int": [
c_void_p, # List of pointer of key string
POINTER(c_uint32), # List of length of key string
],
Expand Down
22 changes: 11 additions & 11 deletions pecos/core/libpecos.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -663,7 +663,7 @@ extern "C" {

typedef pecos::mmap_hashmap::Str2IntMap<pecos::mmap_hashmap::details_::AnkerlStr2IntMmapableVector> mmap_hashmap_str2int;
typedef pecos::mmap_hashmap::Str2IntMap<pecos::mmap_hashmap::details_::AnkerlFixedLenStr2IntMmapableVector> mmap_hashmap_fixed_len_str2int;
typedef pecos::mmap_hashmap::Str2IntMap<pecos::mmap_hashmap::details_::AnkerlAsinStr2IntMmapableVector> mmap_hashmap_asin_str2int;
typedef pecos::mmap_hashmap::Str2IntMap<pecos::mmap_hashmap::details_::AnkerlFixedLen10Str2IntMmapableVector> mmap_hashmap_fixed_len_10_str2int;
typedef pecos::mmap_hashmap::Int2IntMap mmap_hashmap_int2int;

// New
Expand All @@ -672,7 +672,7 @@ extern "C" {
return static_cast<void*>(new mmap_hashmap_ ## SUFFIX()); }
MMAP_MAP_NEW(str2int)
MMAP_MAP_NEW(fixed_len_str2int)
MMAP_MAP_NEW(asin_str2int)
MMAP_MAP_NEW(fixed_len_10_str2int)
MMAP_MAP_NEW(int2int)

// Destruct
Expand All @@ -681,7 +681,7 @@ extern "C" {
delete static_cast<mmap_hashmap_ ## SUFFIX *>(map_ptr); }
MMAP_MAP_DESTRUCT(str2int)
MMAP_MAP_DESTRUCT(fixed_len_str2int)
MMAP_MAP_DESTRUCT(asin_str2int)
MMAP_MAP_DESTRUCT(fixed_len_10_str2int)
MMAP_MAP_DESTRUCT(int2int)

// Save
Expand All @@ -690,7 +690,7 @@ extern "C" {
static_cast<mmap_hashmap_ ## SUFFIX *>(map_ptr)->save(map_dir); }
MMAP_MAP_SAVE(str2int)
MMAP_MAP_SAVE(fixed_len_str2int)
MMAP_MAP_SAVE(asin_str2int)
MMAP_MAP_SAVE(fixed_len_10_str2int)
MMAP_MAP_SAVE(int2int)

// Load
Expand All @@ -701,7 +701,7 @@ extern "C" {
return static_cast<void *>(map_ptr); }
MMAP_MAP_LOAD(str2int)
MMAP_MAP_LOAD(fixed_len_str2int)
MMAP_MAP_LOAD(asin_str2int)
MMAP_MAP_LOAD(fixed_len_10_str2int)
MMAP_MAP_LOAD(int2int)

// Size
Expand All @@ -710,7 +710,7 @@ extern "C" {
return static_cast<mmap_hashmap_ ## SUFFIX *>(map_ptr)->size(); }
MMAP_MAP_SIZE(str2int)
MMAP_MAP_SIZE(fixed_len_str2int)
MMAP_MAP_SIZE(asin_str2int)
MMAP_MAP_SIZE(fixed_len_10_str2int)
MMAP_MAP_SIZE(int2int)

// Insert
Expand All @@ -720,7 +720,7 @@ extern "C" {
static_cast<mmap_hashmap_ ## SUFFIX *>(map_ptr)->insert(FUNC_CALL_KEY, val); }
MMAP_MAP_INSERT(str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len))
MMAP_MAP_INSERT(fixed_len_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len))
MMAP_MAP_INSERT(asin_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len))
MMAP_MAP_INSERT(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len))
MMAP_MAP_INSERT(int2int, uint64_t key, key)

// Get
Expand All @@ -729,23 +729,23 @@ extern "C" {
return static_cast<mmap_hashmap_ ## SUFFIX *>(map_ptr)->get(FUNC_CALL_KEY); }
MMAP_MAP_GET(str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len))
MMAP_MAP_GET(fixed_len_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len))
MMAP_MAP_GET(asin_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len))
MMAP_MAP_GET(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len))
MMAP_MAP_GET(int2int, uint64_t key, key)

// Get with default
// Defines mmap_hashmap_get_w_default_<SUFFIX>: casts the opaque map_ptr to
// mmap_hashmap_<SUFFIX>* and forwards the key argument(s) together with def_val
// to that map's get_w_default(), returning its uint64_t result.
//   SUFFIX        - map type suffix pasted into both the function and the type name
//   KEY           - key portion of the generated function's parameter list
//   FUNC_CALL_KEY - matching key argument(s) passed through to get_w_default()
// NOTE(review): presumably def_val is what comes back when the key is absent —
// confirm against the map's get_w_default() implementation.
#define MMAP_MAP_GET_W_DEFAULT(SUFFIX, KEY, FUNC_CALL_KEY) \
uint64_t mmap_hashmap_get_w_default_ ## SUFFIX (void* map_ptr, KEY, uint64_t def_val) { \
return static_cast<mmap_hashmap_ ## SUFFIX *>(map_ptr)->get_w_default(FUNC_CALL_KEY, def_val); }
MMAP_MAP_GET_W_DEFAULT(str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len))
MMAP_MAP_GET_W_DEFAULT(fixed_len_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len))
MMAP_MAP_GET_W_DEFAULT(asin_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len))
MMAP_MAP_GET_W_DEFAULT(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len))
MMAP_MAP_GET_W_DEFAULT(int2int, uint64_t key, key)

// Batch get with default
// Defines mmap_hashmap_batch_get_w_default_<SUFFIX>: casts the opaque map_ptr to
// mmap_hashmap_<SUFFIX>* and forwards n_key, the key array argument(s), def_val,
// vals and threads to that map's batch_get_w_default(). The wrapper returns void.
//   SUFFIX        - map type suffix pasted into both the function and the type name
//   KEY           - key-array portion of the generated function's parameter list
//   FUNC_CALL_KEY - matching key argument(s) passed through to batch_get_w_default()
// NOTE(review): presumably vals is an output buffer with room for n_key results
// and threads is the worker-thread count — confirm against batch_get_w_default().
#define MMAP_MAP_BATCH_GET_W_DEFAULT(SUFFIX, KEY, FUNC_CALL_KEY) \
void mmap_hashmap_batch_get_w_default_ ## SUFFIX (void* map_ptr, const uint32_t n_key, KEY, uint64_t def_val, uint64_t* vals, const int threads) { \
static_cast<mmap_hashmap_ ## SUFFIX *>(map_ptr)->batch_get_w_default(n_key, FUNC_CALL_KEY, def_val, vals, threads); }
MMAP_MAP_BATCH_GET_W_DEFAULT(str2int, KEY_SINGLE_ARG(const char* const* keys, const uint32_t* keys_lens), KEY_SINGLE_ARG(keys, keys_lens))
MMAP_MAP_BATCH_GET_W_DEFAULT(fixed_len_str2int, KEY_SINGLE_ARG(const char* const* keys, const uint32_t* keys_lens), KEY_SINGLE_ARG(keys, keys_lens))
MMAP_MAP_BATCH_GET_W_DEFAULT(asin_str2int, KEY_SINGLE_ARG(const char* const* keys, const uint32_t* keys_lens), KEY_SINGLE_ARG(keys, keys_lens))
MMAP_MAP_BATCH_GET_W_DEFAULT(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* const* keys, const uint32_t* keys_lens), KEY_SINGLE_ARG(keys, keys_lens))
MMAP_MAP_BATCH_GET_W_DEFAULT(int2int, const uint64_t* key, key)

// Contains
Expand All @@ -754,7 +754,7 @@ extern "C" {
return static_cast<mmap_hashmap_ ## SUFFIX *>(map_ptr)->contains(FUNC_CALL_KEY); }
MMAP_MAP_CONTAINS(str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len))
MMAP_MAP_CONTAINS(fixed_len_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len))
MMAP_MAP_CONTAINS(asin_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len))
MMAP_MAP_CONTAINS(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len))
MMAP_MAP_CONTAINS(int2int, uint64_t key, key)


Expand Down
61 changes: 34 additions & 27 deletions pecos/core/utils/mmap_hashmap.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,15 +285,20 @@ class AnkerlFixedLenStr2IntMmapableVector {
std::forward_as_tuple(size_),
std::forward< std::tuple<Args...> >(args));


size_type key_length = k.size();

// Length of new key should be the same as previous keys
if (fixed_str_len_ != -1 && fixed_str_len_ != k.size()) {
throw std::runtime_error("String length differs from previous keys.");
if (key_length == 0) {
throw std::runtime_error("Key length should be greater than 0.");
} else if (fixed_str_len_ != 0 && fixed_str_len_ != key_length) {
throw std::runtime_error("Key length differs from previous keys.");
} else {
fixed_str_len_ = k.size();
fixed_str_len_ = key_length;
}

// Append key string
str_store_.insert(str_store_.end(), k.data(), k.data() + k.size());
str_store_.insert(str_store_.end(), k.data(), k.data() + key_length);

// Update pointers
size_ = store_.size();
Expand Down Expand Up @@ -349,7 +354,7 @@ class AnkerlFixedLenStr2IntMmapableVector {
value_type* data_ = nullptr;
char* str_data_ = nullptr;

size_type fixed_str_len_ = -1;
size_type fixed_str_len_ = 0;

// Actual data storage for in-memory case
std::vector<value_type> store_;
Expand Down Expand Up @@ -432,26 +437,31 @@ class AnkerlFixedLenStr2IntMmapableVector {
};


// Memory-mappable vector of std::pair<StrView, uint64_t> for Ankerl
// This vector takes/gets std::string_view as the key, but emplace back as the special mmap format StrView
// Memory-mappable vector of std::pair<FixedLen10Str, uint64_t> for Ankerl.
// This vector accepts and returns keys as std::string_view, but stores each key
// in the special mmap format FixedLen10Str.
// Every key must be exactly 10 characters long.
class AnkerlAsinStr2IntMmapableVector {
class AnkerlFixedLen10Str2IntMmapableVector {
template <bool IsConst>
class iter_t;

struct StrView {
char str[10];
// Fixed Length of 10 for keys
static constexpr std::size_t fixed_str_len = 10;

// Fixed-size key storage: a raw char array of fixed_str_len bytes.
// The constructor copies exactly fixed_str_len bytes from input_str and writes
// no terminating NUL, so str must always be read together with an explicit length.
// Because input_str defaults to nullptr and the nullptr branch throws
// std::runtime_error, default-constructing this type throws as well.
struct FixedLen10Str {
char str[fixed_str_len];

StrView(const char* input_str = nullptr) {
FixedLen10Str(const char* input_str = nullptr) {
if (input_str) {
std::strncpy(str, input_str, 10);
// Caller must supply at least fixed_str_len readable bytes at input_str.
std::memcpy(str, input_str, fixed_str_len);
} else {
// A key can never be built from a null pointer.
throw std::runtime_error("Illegal initialization of FixLen10Str with nullptr.");
}
}
};

public:
using key_type = std::string_view;
using value_type = std::pair<StrView, uint64_t>;
using value_type = std::pair<FixedLen10Str, uint64_t>;
using size_type = std::size_t;
using difference_type = std::ptrdiff_t;
using allocator_type = std::allocator<value_type>;
Expand All @@ -463,8 +473,8 @@ class AnkerlAsinStr2IntMmapableVector {
using iterator = iter_t<false>;
using const_iterator = iter_t<true>;

AnkerlAsinStr2IntMmapableVector() = default;
AnkerlAsinStr2IntMmapableVector(allocator_type alloc)
AnkerlFixedLen10Str2IntMmapableVector() = default;
AnkerlFixedLen10Str2IntMmapableVector(allocator_type alloc)
: store_(alloc) {}

value_type* data() { return data_; }
Expand All @@ -487,23 +497,20 @@ class AnkerlAsinStr2IntMmapableVector {
void shrink_to_fit() { store_.shrink_to_fit(); }
void reserve(size_t new_capacity) { store_.reserve(new_capacity); }

/* Emplace string-like key and int value as std::pair<StrView, uint64_t>*/
/* Emplace string-like key and int value as std::pair<FixedLen10Str, uint64_t>*/
template <typename K, typename... Args>
auto emplace_back(std::piecewise_construct_t, std::tuple<K> key, std::tuple<Args...> args) {
// Extract key
key_type key_string = std::get<0>(key);

if (key_string.size() != 10) {
if (key_string.size() != fixed_str_len) {
throw std::runtime_error("ASIN string length is not 10.");
}

char key_arr[10];
std::strncpy(key_arr, key_string.data(), key_string.size());

// Emplace back std::pair<StrView, uint64_t>
// Emplace back std::pair<FixedLen10Str, uint64_t>
auto eb_val = store_.emplace_back(
std::piecewise_construct,
std::forward_as_tuple(key_arr),
std::forward_as_tuple(key_string.data()),
std::forward< std::tuple<Args...> >(args));

// Update pointers
Expand All @@ -524,7 +531,7 @@ class AnkerlAsinStr2IntMmapableVector {

/* Get key for given member.
 * Returns a key_type (std::string_view) over the fixed-length character array
 * held in vt.first.str. The view is built with an explicit length, so no NUL
 * terminator is needed; it refers to vt's storage rather than copying it. */
key_type get_key(value_type const& vt) const {
return key_type(vt.first.str, 10);
return key_type(vt.first.str, fixed_str_len);
}

/* Mmap save/load with MmapStore */
Expand Down Expand Up @@ -564,20 +571,20 @@ class AnkerlAsinStr2IntMmapableVector {
template <bool IsConst>
class iter_t {
using ptr_t = typename std::conditional_t<IsConst,
AnkerlAsinStr2IntMmapableVector::const_pointer, AnkerlAsinStr2IntMmapableVector::pointer>;
AnkerlFixedLen10Str2IntMmapableVector::const_pointer, AnkerlFixedLen10Str2IntMmapableVector::pointer>;
ptr_t iter_data_{};

template <bool B>
friend class iter_t;

public:
using iterator_category = std::forward_iterator_tag;
using difference_type = AnkerlAsinStr2IntMmapableVector::difference_type;
using value_type = AnkerlAsinStr2IntMmapableVector::value_type;
using difference_type = AnkerlFixedLen10Str2IntMmapableVector::difference_type;
using value_type = AnkerlFixedLen10Str2IntMmapableVector::value_type;
using reference = typename std::conditional_t<IsConst,
value_type const&, value_type&>;
using pointer = typename std::conditional_t<IsConst,
AnkerlAsinStr2IntMmapableVector::const_pointer, AnkerlAsinStr2IntMmapableVector::pointer>;
AnkerlFixedLen10Str2IntMmapableVector::const_pointer, AnkerlFixedLen10Str2IntMmapableVector::pointer>;

iter_t() noexcept = default;

Expand Down
12 changes: 10 additions & 2 deletions pecos/utils/mmap_hashmap_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,11 @@ def init(cls, map_type, map_dir, lazy_load):
fn_dict = clib.mmap_hashmap_init(map_type)
map_ptr = fn_dict["load"](map_dir.encode("utf-8"), lazy_load)

if map_type == "str2int" or map_type == "fixed_len_str2int" or map_type == "asin_str2int":
if (
map_type == "str2int"
or map_type == "fixed_len_str2int"
or map_type == "fixed_len_10_str2int"
):
return _MmapHashmapStr2IntReadOnly(map_ptr, fn_dict)
elif map_type == "int2int":
return _MmapHashmapInt2IntReadOnly(map_ptr, fn_dict)
Expand Down Expand Up @@ -340,7 +344,11 @@ def init(cls, map_type, map_dir):
fn_dict = clib.mmap_hashmap_init(map_type)
map_ptr = fn_dict["new"]()

if map_type == "str2int" or map_type == "fixed_len_str2int" or map_type == "asin_str2int":
if (
map_type == "str2int"
or map_type == "fixed_len_str2int"
or map_type == "fixed_len_10_str2int"
):
return _MmapHashmapStr2IntWrite(map_ptr, fn_dict, map_dir)
elif map_type == "int2int":
return _MmapHashmapInt2IntWrite(map_ptr, fn_dict, map_dir)
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
34 changes: 23 additions & 11 deletions test/pecos/utils/test_mmap_hashmap_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ def test_str2int_mmap_hashmap(tmpdir):
) # Non-exist key
vs = list(kv_dict.values()) + [10] * (3 * max_batch_size - len(kv_dict))
assert r_map_batch_getter.get(ks, 10).tolist() == vs
# check max batch size increased
assert r_map_batch_getter.max_batch_size == 15


def test_fixed_len_str2int_mmap_hashmap(tmpdir):
Expand Down Expand Up @@ -119,61 +121,69 @@ def test_fixed_len_str2int_mmap_hashmap(tmpdir):
) # Non-exist key
vs = list(kv_dict.values()) + [10] * (3 * max_batch_size - len(kv_dict))
assert r_map_batch_getter.get(ks, 10).tolist() == vs
# check max batch size increased
assert r_map_batch_getter.max_batch_size == 15


def test_asin_str2int_mmap_hashmap(tmpdir):
def test_fixed_len_10_str2int_mmap_hashmap(tmpdir):
from pecos.utils.mmap_hashmap_util import MmapHashmap, MmapHashmapBatchGetter

map_dir = tmpdir.join("asin_str2int").realpath().strpath
kv_dict = {"aaaaaaaaaa".encode("utf-8"): 2, "bbbbbbbbbb".encode("utf-8"): 3}
len_10_a_string = "a" * 10
len_10_b_string = "b" * 10
len_10_c_string = "c" * 10

map_dir = tmpdir.join("fixed_len_10_str2int").realpath().strpath
kv_dict = {len_10_a_string.encode("utf-8"): 2, len_10_b_string.encode("utf-8"): 3}

# Write-only Mode
w_map = MmapHashmap("asin_str2int")
w_map = MmapHashmap("fixed_len_10_str2int")
w_map.open("w", map_dir)
# Insert
w_map.map.insert("aaaaaaaaaa".encode("utf-8"), 1) # Test for overwrite later
w_map.map.insert(len_10_a_string.encode("utf-8"), 1) # Test for overwrite later
for k, v in kv_dict.items():
w_map.map.insert(k, v)
# Size
assert w_map.map.size() == len(kv_dict)
w_map.close()

# Read-only Mode
r_map = MmapHashmap("asin_str2int")
r_map = MmapHashmap("fixed_len_10_str2int")
r_map.open("r", map_dir)
# Get
for k, v in kv_dict.items():
assert r_map.map[k] == v
# Get with default
for k, v in kv_dict.items():
assert r_map.map.get(k, 10) == v
assert r_map.map.get("cccccccccc".encode("utf-8"), 10) == 10
assert r_map.map.get(len_10_c_string.encode("utf-8"), 10) == 10
# Contains
for k, _ in kv_dict.items():
assert k in r_map.map
assert not ("cccccccccc".encode("utf-8") in r_map.map)
assert not (len_10_c_string.encode("utf-8") in r_map.map)
# Size
assert r_map.map.size() == len(kv_dict)

# Batch get with default
max_batch_size = 5
# max_batch_size > num of key
r_map_batch_getter = MmapHashmapBatchGetter(r_map.map, max_batch_size)
ks = list(kv_dict.keys()) + ["cccccccccc".encode("utf-8")] # Non-exist key
ks = list(kv_dict.keys()) + [len_10_c_string.encode("utf-8")] # Non-exist key
vs = list(kv_dict.values()) + [10]
assert r_map_batch_getter.get(ks, 10).tolist() == vs
# max_batch_size = num of key
ks = list(kv_dict.keys()) + ["cccccccccc".encode("utf-8")] * (
ks = list(kv_dict.keys()) + [len_10_c_string.encode("utf-8")] * (
max_batch_size - len(kv_dict)
) # Non-exist key
vs = list(kv_dict.values()) + [10] * (max_batch_size - len(kv_dict))
assert r_map_batch_getter.get(ks, 10).tolist() == vs
# max_batch_size = num of key * 3
ks = list(kv_dict.keys()) + ["cccccccccc".encode("utf-8")] * (
ks = list(kv_dict.keys()) + [len_10_c_string.encode("utf-8")] * (
3 * max_batch_size - len(kv_dict)
) # Non-exist key
vs = list(kv_dict.values()) + [10] * (3 * max_batch_size - len(kv_dict))
assert r_map_batch_getter.get(ks, 10).tolist() == vs
# check max batch size increased
assert r_map_batch_getter.max_batch_size == 15


def test_int2int_mmap_hashmap(tmpdir):
Expand Down Expand Up @@ -226,3 +236,5 @@ def test_int2int_mmap_hashmap(tmpdir):
ks = list(kv_dict.keys()) + [1000] * (3 * max_batch_size - len(kv_dict)) # Non-exist key
vs = list(kv_dict.values()) + [10] * (3 * max_batch_size - len(kv_dict))
assert r_map_batch_getter.get(np.array(ks, dtype=np.int64), 10).tolist() == vs
# check max batch size increased
assert r_map_batch_getter.max_batch_size == 15
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit 1aedb47

Please sign in to comment.