Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

checker: reduces the probability of deleting normal peers when the store becomes unavailable (#7249) #7334

Merged
merged 6 commits into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 55 additions & 15 deletions pkg/schedule/checker/rule_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
ruleCheckerSkipRemoveOrphanPeerCounter = checkerCounter.WithLabelValues(ruleChecker, "skip-remove-orphan-peer")
ruleCheckerRemoveOrphanPeerCounter = checkerCounter.WithLabelValues(ruleChecker, "remove-orphan-peer")
ruleCheckerReplaceOrphanPeerCounter = checkerCounter.WithLabelValues(ruleChecker, "replace-orphan-peer")
ruleCheckerReplaceOrphanPeerNoFitCounter = checkerCounter.WithLabelValues(ruleChecker, "replace-orphan-peer-no-fit")
)

// RuleChecker fix/improve region by placement rules.
Expand Down Expand Up @@ -452,7 +453,7 @@
if len(fit.OrphanPeers) == 0 {
return nil, nil
}
var pinDownPeer *metapb.Peer

isUnhealthyPeer := func(id uint64) bool {
for _, downPeer := range region.GetDownPeers() {
if downPeer.Peer.GetId() == id {
Expand All @@ -466,24 +467,45 @@
}
return false
}

// isDisconnectedPeer reports whether the store that hosts p should be
// treated as disconnected. A store that the cluster cannot find is
// treated the same as a disconnected one.
//
// It is used while fixing orphan peers so that down stores are avoided;
// the disconnected check is stricter than the unhealthy check.
isDisconnectedPeer := func(p *metapb.Peer) bool {
	if s := c.cluster.GetStore(p.GetStoreId()); s != nil {
		return s.IsDisconnected()
	}
	// Unknown store: treat the peer as disconnected.
	return true
}

// checkDownPeer scans peers in order and stops at the first one that is
// either unhealthy or located on a disconnected store.
//
// Returns:
//   - (peer, true)  when the first problematic peer is unhealthy and is
//     listed among the region's down peers, or is on a disconnected store;
//   - (nil, true)   when the first problematic peer is unhealthy but is not
//     listed among the region's down peers;
//   - (nil, false)  when no peer is unhealthy or disconnected.
checkDownPeer := func(peers []*metapb.Peer) (*metapb.Peer, bool) {
	for _, peer := range peers {
		peerID := peer.GetId()
		switch {
		case isUnhealthyPeer(peerID):
			// Only pin the peer when the region reports it as down.
			if region.GetDownPeer(peerID) == nil {
				return nil, true
			}
			return peer, true
		case isDisconnectedPeer(peer):
			return peer, true
		}
	}
	return nil, false
}

// remove orphan peers only when all rules are satisfied (count+role) and all peers selected
// by RuleFits are neither pending nor down.
var pinDownPeer *metapb.Peer
hasUnhealthyFit := false
loopFits:
for _, rf := range fit.RuleFits {
if !rf.IsSatisfied() {
hasUnhealthyFit = true
break
}
for _, p := range rf.Peers {
if isUnhealthyPeer(p.GetId()) {
// make sure is down peer.
if region.GetDownPeer(p.GetId()) != nil {
pinDownPeer = p
}
hasUnhealthyFit = true
break loopFits
}
pinDownPeer, hasUnhealthyFit = checkDownPeer(rf.Peers)
if hasUnhealthyFit {
break
}
}

Expand All @@ -496,16 +518,19 @@
// try to use orphan peers to replace unhealthy down peers.
for _, orphanPeer := range fit.OrphanPeers {
if pinDownPeer != nil {
if pinDownPeer.GetId() == orphanPeer.GetId() {
continue

Check warning on line 522 in pkg/schedule/checker/rule_checker.go

View check run for this annotation

Codecov / codecov/patch

pkg/schedule/checker/rule_checker.go#L522

Added line #L522 was not covered by tests
}
// make sure the orphan peer is healthy.
if isUnhealthyPeer(orphanPeer.GetId()) {
if isUnhealthyPeer(orphanPeer.GetId()) || isDisconnectedPeer(orphanPeer) {
continue
}
// witnesses are not considered in this path.
if pinDownPeer.GetIsWitness() || orphanPeer.GetIsWitness() {
continue
}
// down peer's store should be down.
if !c.isStoreDownTimeHitMaxDownTime(pinDownPeer.GetStoreId()) {
// pinDownPeer's store should be disconnected, because a stricter check was used earlier.
if !isDisconnectedPeer(pinDownPeer) {
continue
}
// check if down peer can replace with orphan peer.
Expand All @@ -519,10 +544,14 @@
return operator.CreatePromoteLearnerOperatorAndRemovePeer("replace-down-peer-with-orphan-peer", c.cluster, region, orphanPeer, pinDownPeer)
case orphanPeerRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Learner:
return operator.CreateDemoteLearnerOperatorAndRemovePeer("replace-down-peer-with-orphan-peer", c.cluster, region, orphanPeer, pinDownPeer)
case orphanPeerRole == destRole && isDisconnectedPeer(pinDownPeer) && !dstStore.IsDisconnected():
return operator.CreateRemovePeerOperator("remove-replaced-orphan-peer", c.cluster, 0, region, pinDownPeer.GetStoreId())
default:
// destRole should not same with orphanPeerRole. if role is same, it fit with orphanPeer should be better than now.
// destRole never be leader, so we not consider it.
}
} else {
ruleCheckerReplaceOrphanPeerNoFitCounter.Inc()

Check warning on line 554 in pkg/schedule/checker/rule_checker.go

View check run for this annotation

Codecov / codecov/patch

pkg/schedule/checker/rule_checker.go#L553-L554

Added lines #L553 - L554 were not covered by tests
}
}
}
Expand All @@ -531,14 +560,25 @@
// Ref https://github.com/tikv/pd/issues/4045
if len(fit.OrphanPeers) >= 2 {
hasHealthPeer := false
var disconnectedPeer *metapb.Peer
for _, orphanPeer := range fit.OrphanPeers {
if isDisconnectedPeer(orphanPeer) {
disconnectedPeer = orphanPeer
break
}
}
for _, orphanPeer := range fit.OrphanPeers {
if isUnhealthyPeer(orphanPeer.GetId()) {
ruleCheckerRemoveOrphanPeerCounter.Inc()
return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId)
return operator.CreateRemovePeerOperator("remove-unhealthy-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId)
}
if hasHealthPeer {
// there already exists a healthy orphan peer, so we can remove other orphan Peers.
ruleCheckerRemoveOrphanPeerCounter.Inc()
// if there exists a disconnected orphan peer, we will pick it to remove firstly.
if disconnectedPeer != nil {
return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, disconnectedPeer.StoreId)
}
return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId)
}
hasHealthPeer = true
Expand Down
Loading
Loading