From 82fa664b24c4dd593d4623e92a46b2b6ac092b01 Mon Sep 17 00:00:00 2001
From: Connor1996
Date: Sun, 4 Dec 2022 22:25:06 -0800
Subject: [PATCH] close #5753 fix unsafe recovery auto detect mode

Signed-off-by: Connor1996
---
 server/cluster/unsafe_recovery_controller.go | 15 +++++++
 .../unsafe_recovery_controller_test.go       | 41 +++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/server/cluster/unsafe_recovery_controller.go b/server/cluster/unsafe_recovery_controller.go
index 9c7446766fa..0c89995fc85 100644
--- a/server/cluster/unsafe_recovery_controller.go
+++ b/server/cluster/unsafe_recovery_controller.go
@@ -1000,6 +1000,21 @@ func (u *unsafeRecoveryController) generateForceLeaderPlan(newestRegionTree *reg
 					storeRecoveryPlan.ForceLeader.FailedStores = append(storeRecoveryPlan.ForceLeader.FailedStores, store)
 				}
 			}
+			if u.autoDetect {
+				// For auto detect, failedStores is empty, so add the store of each peer returned by getFailedPeers to the plan's FailedStores, skipping stores already in the list.
+				for _, peer := range u.getFailedPeers(leader.Region()) {
+					found := false
+					for _, store := range storeRecoveryPlan.ForceLeader.FailedStores {
+						if store == peer.StoreId {
+							found = true
+							break
+						}
+					}
+					if !found {
+						storeRecoveryPlan.ForceLeader.FailedStores = append(storeRecoveryPlan.ForceLeader.FailedStores, peer.StoreId)
+					}
+				}
+			}
 			storeRecoveryPlan.ForceLeader.EnterForceLeaders = append(storeRecoveryPlan.ForceLeader.EnterForceLeaders, region.GetId())
 			u.recordAffectedRegion(leader.Region())
 			hasPlan = true
diff --git a/server/cluster/unsafe_recovery_controller_test.go b/server/cluster/unsafe_recovery_controller_test.go
index 4165c0b2721..1baf1466853 100644
--- a/server/cluster/unsafe_recovery_controller_test.go
+++ b/server/cluster/unsafe_recovery_controller_test.go
@@ -40,6 +40,44 @@ func newStoreHeartbeat(storeID uint64, report *pdpb.StoreReport) *pdpb.StoreHear
 	}
 }
 
+func hasQuorum(region *metapb.Region, failedStores []uint64) bool {
+	hasQuorum := func(voters []*metapb.Peer) bool {
+		numFailedVoters := 0
+		numLiveVoters := 0
+
+		for _, voter := range voters {
+			found := false
+			for _, store := range failedStores {
+				if store == voter.GetStoreId() {
+					found = true
+					break
+				}
+			}
+			if found {
+				numFailedVoters += 1
+			} else {
+				numLiveVoters += 1
+			}
+		}
+		return numFailedVoters < numLiveVoters
+	}
+
+	// Consider joint consensus: a Voter counts in both voter sets, an IncomingVoter only in the incoming set and a DemotingVoter only in the outgoing set; both sets must have more live voters than failed ones.
+	var incomingVoters []*metapb.Peer
+	var outgoingVoters []*metapb.Peer
+
+	for _, peer := range region.Peers {
+		if peer.Role == metapb.PeerRole_Voter || peer.Role == metapb.PeerRole_IncomingVoter {
+			incomingVoters = append(incomingVoters, peer)
+		}
+		if peer.Role == metapb.PeerRole_Voter || peer.Role == metapb.PeerRole_DemotingVoter {
+			outgoingVoters = append(outgoingVoters, peer)
+		}
+	}
+
+	return hasQuorum(incomingVoters) && hasQuorum(outgoingVoters)
+}
+
 func applyRecoveryPlan(re *require.Assertions, storeID uint64, storeReports map[uint64]*pdpb.StoreReport, resp *pdpb.StoreHeartbeatResponse) {
 	plan := resp.GetRecoveryPlan()
 	if plan == nil {
@@ -55,6 +93,9 @@ func applyRecoveryPlan(re *require.Assertions, storeID uint64, storeReports map[
 			for _, report := range reports.PeerReports {
 				region := report.GetRegionState().GetRegion()
 				if region.GetId() == forceLeader {
+					if hasQuorum(region, forceLeaders.GetFailedStores()) {
+						re.FailNow("should not enter force leader when quorum is still alive")
+					}
 					report.IsForceLeader = true
 					break
 				}