From 50214e8212a31c7b619d4e88793252183ef533fe Mon Sep 17 00:00:00 2001 From: yuwmao Date: Mon, 23 Dec 2024 17:29:20 +0800 Subject: [PATCH] Reset PG after failures --- src/lib/homestore_backend/hs_homeobject.hpp | 1 + src/lib/homestore_backend/hs_pg_manager.cpp | 6 ++++++ src/lib/homestore_backend/replication_state_machine.cpp | 8 ++++---- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/lib/homestore_backend/hs_homeobject.hpp b/src/lib/homestore_backend/hs_homeobject.hpp index 3411c1d..3ca1802 100644 --- a/src/lib/homestore_backend/hs_homeobject.hpp +++ b/src/lib/homestore_backend/hs_homeobject.hpp @@ -587,6 +587,7 @@ class HSHomeObject : public HomeObjectImpl { std::shared_ptr< BlobIndexTable > recover_index_table(homestore::superblk< homestore::index_table_sb >&& sb); std::optional< pg_id_t > get_pg_id_with_group_id(homestore::group_id_t group_id) const; + bool is_pg_present(pg_id_t pg_id); private: std::shared_ptr< BlobIndexTable > create_index_table(); diff --git a/src/lib/homestore_backend/hs_pg_manager.cpp b/src/lib/homestore_backend/hs_pg_manager.cpp index b9feb31..6a2578c 100644 --- a/src/lib/homestore_backend/hs_pg_manager.cpp +++ b/src/lib/homestore_backend/hs_pg_manager.cpp @@ -278,6 +278,12 @@ std::optional< pg_id_t > HSHomeObject::get_pg_id_with_group_id(homestore::group_ } } +bool HSHomeObject::is_pg_present(pg_id_t pg_id) { + auto lg = std::scoped_lock(_pg_lock); + auto iter = _pg_map.find(pg_id); + return iter != _pg_map.end(); +} + void HSHomeObject::pg_destroy(pg_id_t pg_id) { mark_pg_destroyed(pg_id); destroy_shards(pg_id); diff --git a/src/lib/homestore_backend/replication_state_machine.cpp b/src/lib/homestore_backend/replication_state_machine.cpp index f235560..9e571d5 100644 --- a/src/lib/homestore_backend/replication_state_machine.cpp +++ b/src/lib/homestore_backend/replication_state_machine.cpp @@ -354,12 +354,12 @@ void ReplicationStateMachine::write_snapshot_obj(std::shared_ptr< homestore::sna auto pg_data = 
GetSizePrefixedResyncPGMetaData(data_buf); - // Check if the snapshot context is same as the current snapshot context. - // If not, drop the previous context and re-init a new one - if (m_snp_rcv_handler->get_context_lsn() != context->get_lsn()) { + // Check if pg exists; if yes, clean the stale pg resources, which may be left over from a previous snapshot failure. Let's resync on a pristine base + if (home_object_->is_pg_present(pg_data->pg_id())) { + LOGI("pg already exists, clean pg resources before snapshot, pg_id:{} {}", pg_data->pg_id(), log_suffix); + home_object_->pg_destroy(pg_data->pg_id()); LOGI("reset context from lsn:{} to lsn:{}", m_snp_rcv_handler->get_context_lsn(), context->get_lsn()); m_snp_rcv_handler->reset_context(context->get_lsn(), pg_data->pg_id()); - // TODO: Reset all data of current PG - let's resync on a pristine base } auto ret = m_snp_rcv_handler->process_pg_snapshot_data(*pg_data);