diff --git a/pkg/schedule/checker/rule_checker.go b/pkg/schedule/checker/rule_checker.go index b0537bf9ce4..7012359ca36 100644 --- a/pkg/schedule/checker/rule_checker.go +++ b/pkg/schedule/checker/rule_checker.go @@ -479,6 +479,13 @@ loopFits: hasUnhealthyFit = true break loopFits } + // avoid picking a peer on a down store when fixing orphan peers; + // IsDisconnected is a stricter check than IsUnhealthy. + if c.cluster.GetStore(p.GetStoreId()).IsDisconnected() { + hasUnhealthyFit = true + pinDownPeer = p + break loopFits + } } } @@ -491,6 +498,9 @@ loopFits: // try to use orphan peers to replace unhealthy down peers. for _, orphanPeer := range fit.OrphanPeers { if pinDownPeer != nil { + if pinDownPeer.GetId() == orphanPeer.GetId() { + continue + } // make sure the orphan peer is healthy. if isUnhealthyPeer(orphanPeer.GetId()) { continue @@ -514,6 +524,9 @@ loopFits: return operator.CreatePromoteLearnerOperatorAndRemovePeer("replace-down-peer-with-orphan-peer", c.cluster, region, orphanPeer, pinDownPeer) case orphanPeerRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Learner: return operator.CreateDemoteLearnerOperatorAndRemovePeer("replace-down-peer-with-orphan-peer", c.cluster, region, orphanPeer, pinDownPeer) + case orphanPeerRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Voter && + c.cluster.GetStore(pinDownPeer.GetStoreId()).IsDisconnected() && !dstStore.IsDisconnected(): + return operator.CreateRemovePeerOperator("remove-replaced-orphan-peer", c.cluster, 0, region, pinDownPeer.GetStoreId()) default: // destRole should not same with orphanPeerRole. if role is same, it fit with orphanPeer should be better than now. // destRole never be leader, so we not consider it. 
diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index b7b9dcfb736..89c9ea32f19 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -2796,7 +2796,7 @@ func TestReplica(t *testing.T) { re.NoError(tc.addLeaderRegion(2, 1, 2, 3, 4)) region = tc.GetRegion(2) re.NoError(dispatchHeartbeat(co, region, stream)) - region = waitRemovePeer(re, stream, region, 4) + region = waitRemovePeer(re, stream, region, 3) // store3 is down, so its peer should be removed first. re.NoError(dispatchHeartbeat(co, region, stream)) waitNoResponse(re, stream)