Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance regression when requesting a timestamp before the earliest version #2076

Merged
merged 14 commits into from
Jan 15, 2025
Merged
214 changes: 125 additions & 89 deletions cpp/arcticdb/version/test/test_version_map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -547,56 +547,65 @@ TEST(VersionMap, StorageLogging) {
ASSERT_EQ(tomb_keys, 3u);
}

std::shared_ptr<VersionMapEntry> write_two_versions(
std::shared_ptr<InMemoryStore> store,
std::shared_ptr<VersionMap> version_map,
const StreamId& id) {
struct VersionChainOperation {
enum class Type {
WRITE,
TOMBSTONE,
TOMBSTONE_ALL
} type {Type::WRITE};

VersionId version_id { 0 };
};

std::shared_ptr<VersionMapEntry> write_versions(
const std::shared_ptr<InMemoryStore>& store,
const std::shared_ptr<VersionMap>& version_map,
const StreamId& id,
const std::vector<VersionChainOperation>& operations) {
auto entry = version_map->check_reload(
store,
id,
LoadStrategy{LoadType::NOT_LOADED, LoadObjective::INCLUDE_DELETED},
__FUNCTION__);

auto key1 = atom_key_with_version(id, 0, 0);
version_map->do_write(store, key1, entry);
write_symbol_ref(store, key1, std::nullopt, entry->head_.value());
auto key2 = atom_key_with_version(id, 1, 1);
version_map->do_write(store, key2, entry);
// We override the symbol ref without a prev_key on purpose. This way we'll only load the version=1 from the ref key
write_symbol_ref(store, key2, std::nullopt, entry->head_.value());
for (const auto& [type, version_id]: operations) {
switch (type) {
case VersionChainOperation::Type::WRITE: {
auto key = atom_key_with_version(id, version_id, version_id);
version_map->do_write(store, key, entry);
write_symbol_ref(store, key, std::nullopt, entry->head_.value());
break;
}
case VersionChainOperation::Type::TOMBSTONE: {
version_map->write_tombstone(store, version_id, id, entry);
break;
}
case VersionChainOperation::Type::TOMBSTONE_ALL: {
version_map->delete_all_versions(store, id);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Theoretically we can still have a tombstone_all to a version_id different than the last.
You can instead use the write_tombstone_all_internal + write_symbol_ref to do that.

If we decide against that it would be good to document that VersionChainOperation{TOMBSTONE_ALL, 3} might not do what we expect.

break;
}
}
}

return entry;
}

// Produces the following version chain: v0 <- tombstone_all <- v1 <- v2 <- tombstone
void write_alternating_deleted_undeleted(std::shared_ptr<InMemoryStore> store, std::shared_ptr<VersionMap> version_map, StreamId id) {
auto entry = version_map->check_reload(
store,
id,
LoadStrategy{LoadType::NOT_LOADED, LoadObjective::INCLUDE_DELETED},
__FUNCTION__);

auto key1 = atom_key_with_version(id, 0, 0);
auto key2 = atom_key_with_version(id, 1, 1);
auto key3 = atom_key_with_version(id, 2, 2);

// Write version 0
version_map->do_write(store, key1, entry);
write_symbol_ref(store, key1, std::nullopt, entry->head_.value());

// Tombstone_all on version 0
version_map->delete_all_versions(store, id);

// Write version 1
version_map->do_write(store, key2, entry);
write_symbol_ref(store, key2, std::nullopt, entry->head_.value());

// Write version 2
version_map->do_write(store, key3, entry);
write_symbol_ref(store, key3, std::nullopt, entry->head_.value());
using Type = VersionChainOperation::Type;
write_versions(store, version_map, id, {{Type::WRITE, 0},
{Type::TOMBSTONE_ALL},
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: These extra whitespaces seem off. Maybe align all the operations the same number of spaces to the right.

Hopefully @poodlewars's formatting will get merged and we can all use the same formatting throughout.

{Type::WRITE, 1},
{Type::WRITE, 2},
{Type::TOMBSTONE, 2}});
}

// Tombstone version 2
version_map->write_tombstone(store, VersionId{2}, id, entry, timestamp{3});
void write_versions(std::shared_ptr<InMemoryStore> store, std::shared_ptr<VersionMap> version_map, StreamId id, int number_of_versions) {
std::vector<VersionChainOperation> version_chain;
for (int i = 0; i < number_of_versions; i++) {
version_chain.emplace_back(VersionChainOperation::Type::WRITE, i);
}
write_versions(store, version_map, id, version_chain);
}

TEST(VersionMap, FollowingVersionChain){
Expand Down Expand Up @@ -669,8 +678,12 @@ TEST(VersionMap, FollowingVersionChainWithCaching){

// LATEST should still be cached, but the cached entry now needs to have no undeleted keys
check_loads_versions(LoadStrategy{LoadType::LATEST, LoadObjective::INCLUDE_DELETED}, 2, 0);
EXPECT_FALSE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::INCLUDE_DELETED, static_cast<timestamp>(-1)}));
// FROM_TIME UNDELETED_ONLY should no longer be cached even though we used the same request before because the undeleted key it went to got deleted. So it will load the entire version chain
check_loads_versions(LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(10)}, 3, 0);
// We have the full version chain loaded, so has_cached_entry should always return true (even when requesting timestamp before earliest version)
EXPECT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(-1)}));
EXPECT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::INCLUDE_DELETED, static_cast<timestamp>(-1)}));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe one more assertion to showcase that it even works with e.g. LoadType::ALL?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like we missed this one?
I mean something like:

EXPECT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::ALL, LoadObjective::INCLUDE_DELETED}));


// We add a new undeleted key
auto key4 = atom_key_with_version(id, 3, 5);
Expand All @@ -692,7 +705,7 @@ TEST(VersionMap, FollowingVersionChainEndEarlyOnTombstoneAll) {
auto version_map = std::make_shared<VersionMap>();
StreamId id{"test"};

write_two_versions(store, version_map, id);
write_versions(store, version_map, id, 2);
// Deleting should add a TOMBSTONE_ALL which should end searching for undeleted versions early.
version_map->delete_all_versions(store, id);

Expand Down Expand Up @@ -726,13 +739,20 @@ TEST(VersionMap, FollowingVersionChainEndEarlyOnTombstoneAll) {
}
}

TEST(VersionMap, CacheInvalidation) {
TEST(VersionMap, HasCachedEntry) {
ScopedConfig sc("VersionMap.ReloadInterval", std::numeric_limits<int64_t>::max());
// Set up the version chain v0(tombstone_all) <- v1 <- v2(tombstoned)
// Set up the version chain v0 <- v1(tombstone_all) <- v2 <- v3(tombstoned)
auto store = std::make_shared<InMemoryStore>();
auto version_map = std::make_shared<VersionMap>();
StreamId id{"test"};
write_alternating_deleted_undeleted(store, version_map, id);
using Type = VersionChainOperation::Type;
std::vector<VersionChainOperation> version_chain = {{Type::WRITE, 0},
{Type::WRITE, 1},
{Type::TOMBSTONE_ALL},
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: some more weird whitespaces

{Type::WRITE, 2},
{Type::WRITE, 3},
{Type::TOMBSTONE, 3}};
write_versions(store, version_map, id, version_chain);

auto check_caching = [&](LoadStrategy to_load, LoadStrategy to_check_if_cached, bool expected_outcome){
auto clean_version_map = std::make_shared<VersionMap>();
Expand All @@ -750,48 +770,47 @@ TEST(VersionMap, CacheInvalidation) {
}
};

auto load_all_param = LoadStrategy{LoadType::ALL, LoadObjective::INCLUDE_DELETED};
auto load_all_undeleted_param = LoadStrategy{LoadType::ALL, LoadObjective::UNDELETED_ONLY};
check_caching(load_all_param, load_all_undeleted_param, true);
check_caching(load_all_undeleted_param, load_all_param, false);

constexpr auto num_versions = 3u;
constexpr auto num_versions = 4u;
std::vector<LoadStrategy> should_load_to_v[num_versions] = {
// Different parameters which should all load to v0
std::vector<LoadStrategy>{
LoadStrategy{LoadType::DOWNTO, LoadObjective::INCLUDE_DELETED, static_cast<SignedVersionId>(0)},
LoadStrategy{LoadType::DOWNTO, LoadObjective::INCLUDE_DELETED, static_cast<SignedVersionId>(-3)},
LoadStrategy{LoadType::DOWNTO, LoadObjective::INCLUDE_DELETED, static_cast<SignedVersionId>(-4)},
LoadStrategy{LoadType::FROM_TIME, LoadObjective::INCLUDE_DELETED, static_cast<timestamp>(0)},
LoadStrategy{LoadType::ALL, LoadObjective::INCLUDE_DELETED}
},

// Different parameters which should all load to v1
std::vector<LoadStrategy>{
LoadStrategy{LoadType::DOWNTO, LoadObjective::INCLUDE_DELETED, static_cast<SignedVersionId>(1)},
LoadStrategy{LoadType::DOWNTO, LoadObjective::INCLUDE_DELETED, static_cast<SignedVersionId>(-2)},
LoadStrategy{LoadType::DOWNTO, LoadObjective::INCLUDE_DELETED, static_cast<SignedVersionId>(-3)},
LoadStrategy{LoadType::FROM_TIME, LoadObjective::INCLUDE_DELETED, static_cast<timestamp>(1)},
LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY,
static_cast<timestamp>(2)}, // when include_deleted=false FROM_TIME searches for an undeleted version
LoadStrategy{LoadType::LATEST, LoadObjective::UNDELETED_ONLY},
LoadStrategy{LoadType::ALL, LoadObjective::UNDELETED_ONLY}
},

// Different parameters which should all load to v2
std::vector<LoadStrategy>{
LoadStrategy{LoadType::DOWNTO, LoadObjective::INCLUDE_DELETED, static_cast<SignedVersionId>(2)},
LoadStrategy{LoadType::DOWNTO, LoadObjective::INCLUDE_DELETED, static_cast<SignedVersionId>(-1)},
LoadStrategy{LoadType::DOWNTO, LoadObjective::INCLUDE_DELETED, static_cast<SignedVersionId>(-2)},
LoadStrategy{LoadType::FROM_TIME, LoadObjective::INCLUDE_DELETED, static_cast<timestamp>(2)},
LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY,
static_cast<timestamp>(3)}, // when include_deleted=false FROM_TIME searches for an undeleted version
LoadStrategy{LoadType::LATEST, LoadObjective::UNDELETED_ONLY},
},
// Different parameters which should all load to v3
std::vector<LoadStrategy>{
LoadStrategy{LoadType::DOWNTO, LoadObjective::INCLUDE_DELETED, static_cast<SignedVersionId>(3)},
LoadStrategy{LoadType::DOWNTO, LoadObjective::INCLUDE_DELETED, static_cast<SignedVersionId>(-1)},
LoadStrategy{LoadType::FROM_TIME, LoadObjective::INCLUDE_DELETED, static_cast<timestamp>(3)},
LoadStrategy{LoadType::LATEST, LoadObjective::INCLUDE_DELETED},
}
};

for (auto i=0u; i<num_versions; ++i){
for (auto j=0u; j<num_versions; ++j){
// For every two versions we check that all load params for earlier versions cache load paramse for later versions:
// For every two versions we check that all load params for earlier versions cache load params for later versions:
check_all_caching(should_load_to_v[i], should_load_to_v[j], i<=j);
}

// ALL and LOAD_UNDELETED because they both load to v0 (UNDELETED_ONLY will load v0 because only there it will load
check_all_caching({load_all_param, load_all_undeleted_param}, should_load_to_v[i], true);
check_all_caching(should_load_to_v[i], {load_all_param, load_all_undeleted_param}, false);
}
}

Expand All @@ -805,7 +824,7 @@ TEST(VersionMap, CacheInvalidationWithTombstoneAfterLoad) {

auto version_map = std::make_shared<VersionMap>();
StreamId id{"test"};
write_two_versions(store, version_map, id);
write_versions(store, version_map, id, 2);

// Use a clean version_map
version_map = std::make_shared<VersionMap>();
Expand Down Expand Up @@ -841,45 +860,62 @@ TEST(VersionMap, CacheInvalidationWithTombstoneAfterLoad) {

TEST(VersionMap, CacheInvalidationWithTombstoneAllAfterLoad) {
using namespace arcticdb;
// Given - symbol with 2 versions - load downto version 0
// Given - symbol with 3 versions - load downto version 1 or 0
// never time-invalidate the cache so we can test our other cache invalidation logic
ScopedConfig sc("VersionMap.ReloadInterval", std::numeric_limits<int64_t>::max());
auto store = std::make_shared<InMemoryStore>();

auto version_map = std::make_shared<VersionMap>();
StreamId id{"test"};
write_two_versions(store, version_map, id);
write_versions(store, version_map, id, 3);

// Use a clean version_map
version_map = std::make_shared<VersionMap>();

auto entry = version_map->check_reload(
store,
id,
LoadStrategy{LoadType::DOWNTO, LoadObjective::INCLUDE_DELETED, static_cast<SignedVersionId>(0)},
__FUNCTION__);

ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::LATEST, LoadObjective::UNDELETED_ONLY}));
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(1)}));
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(0)}));
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::DOWNTO, LoadObjective::UNDELETED_ONLY, static_cast<SignedVersionId>(-1)}));
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::DOWNTO, LoadObjective::UNDELETED_ONLY, static_cast<SignedVersionId>(-2)}));

// When - we delete version 1
auto tombstone_key = version_map->write_tombstone(store, VersionId{1}, id, entry);

// We should not invalidate the cache because the version we loaded to is still undeleted
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::LATEST, LoadObjective::UNDELETED_ONLY}));
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(1)}));
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(0)}));

// When - we delete all versions without reloading
version_map->write_tombstone_all_key_internal(store, tombstone_key, entry);

// We should invalidate cached undeleted checks
ASSERT_FALSE(version_map->has_cached_entry(id, LoadStrategy{LoadType::LATEST, LoadObjective::UNDELETED_ONLY}));
ASSERT_FALSE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(1)}));
ASSERT_FALSE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(0)}));
for (const auto& load_strategy : {LoadStrategy{LoadType::DOWNTO, LoadObjective::INCLUDE_DELETED, static_cast<SignedVersionId>(1)},
LoadStrategy{LoadType::DOWNTO, LoadObjective::INCLUDE_DELETED, static_cast<SignedVersionId>(0)}})
{
const bool is_loaded_to_0 = load_strategy.load_until_version_ == 0;
auto entry = version_map->check_reload(
store,
id,
load_strategy,
__FUNCTION__);
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::LATEST, LoadObjective::UNDELETED_ONLY}));
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(2)}));
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(1)}));
ASSERT_EQ(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(0)}), is_loaded_to_0);
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::DOWNTO, LoadObjective::UNDELETED_ONLY, static_cast<SignedVersionId>(-1)}));
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::DOWNTO, LoadObjective::UNDELETED_ONLY, static_cast<SignedVersionId>(-2)}));
ASSERT_EQ(version_map->has_cached_entry(id, LoadStrategy{LoadType::DOWNTO, LoadObjective::UNDELETED_ONLY, static_cast<SignedVersionId>(-3)}), is_loaded_to_0);

// When - we delete version 2
auto tombstone_key = version_map->write_tombstone(store, VersionId{2}, id, entry);

// We should not invalidate the cache because the version we loaded to is still undeleted
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::LATEST, LoadObjective::UNDELETED_ONLY}));
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(2)}));
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(1)}));
ASSERT_EQ(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(0)}), is_loaded_to_0);

// When - we delete all versions without reloading
version_map->write_tombstone_all_key_internal(store, tombstone_key, entry);

if (is_loaded_to_0) {
// If we have loaded everything we should not invalidate cache
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::LATEST, LoadObjective::UNDELETED_ONLY}));
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(2)}));
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(1)}));
ASSERT_TRUE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(0)}));
}
else {
// If we haven't loaded everything we should invalidate cached undeleted checks
ASSERT_FALSE(version_map->has_cached_entry(id, LoadStrategy{LoadType::LATEST, LoadObjective::UNDELETED_ONLY}));
ASSERT_FALSE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(1)}));
ASSERT_FALSE(version_map->has_cached_entry(id, LoadStrategy{LoadType::FROM_TIME, LoadObjective::UNDELETED_ONLY, static_cast<timestamp>(0)}));
}
}
}

TEST(VersionMap, CompactionUpdateCache) {
Expand Down Expand Up @@ -979,8 +1015,8 @@ TEST(VersionMap, TombstoneAllFromEntry) {
ASSERT_EQ(version_id, 0);


// With cached entry from the write ops
// Tombstone all should succeed as we are not relying on the ref key
// With cached entry from the write ops
// Tombstone all should succeed as we are not relying on the ref key
version_map->tombstone_from_key_or_all(store, id, dummy_key, entry);

auto [maybe_prev_cached_entry, deleted_cached_entry] = get_latest_version(store, version_map, id);
Expand Down
Loading
Loading