From 20f2ce582187d6ba5f6ea7c6747d61e9ecab6d00 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Tue, 14 Sep 2021 01:26:41 +0800 Subject: [PATCH 1/2] This is an automated cherry-pick of #4097 Signed-off-by: ti-chi-bot --- server/cluster/coordinator_test.go | 30 ++ server/schedule/checker/rule_checker_test.go | 322 +++++++++++++++++++ server/schedule/checker_controller.go | 33 ++ server/schedule/operator/builder.go | 25 ++ server/schedule/operator/step.go | 9 + 5 files changed, 419 insertions(+) diff --git a/server/cluster/coordinator_test.go b/server/cluster/coordinator_test.go index 6bcb1fb4abb..a4cefb9cd92 100644 --- a/server/cluster/coordinator_test.go +++ b/server/cluster/coordinator_test.go @@ -987,6 +987,36 @@ func (s *testOperatorControllerSuite) TestStoreOverloadedWithReplace(c *C) { c.Assert(lb.Schedule(tc), NotNil) } +func (s *testOperatorControllerSuite) TestDownStoreLimit(c *C) { + tc, co, cleanup := prepare(nil, nil, nil, c) + defer cleanup() + oc := co.opController + rc := co.checkers.GetRuleChecker() + + tc.addRegionStore(1, 100) + tc.addRegionStore(2, 100) + tc.addRegionStore(3, 100) + tc.addLeaderRegion(1, 1, 2, 3) + + region := tc.GetRegion(1) + tc.setStoreDown(1) + tc.SetStoreLimit(1, storelimit.RemovePeer, 1) + region = region.Clone(core.WithDownPeers([]*pdpb.PeerStats{ + { + Peer: region.GetStorePeer(1), + DownSeconds: 24 * 60 * 60, + }, + })) + + for i := uint64(1); i <= 5; i++ { + tc.addRegionStore(i+3, 100) + op := rc.Check(region) + c.Assert(op, NotNil) + c.Assert(oc.AddOperator(op), IsTrue) + oc.RemoveOperator(op) + } +} + var _ = Suite(&testScheduleControllerSuite{}) type testScheduleControllerSuite struct { diff --git a/server/schedule/checker/rule_checker_test.go b/server/schedule/checker/rule_checker_test.go index 3ecfb1097ad..5de146ebe22 100644 --- a/server/schedule/checker/rule_checker_test.go +++ b/server/schedule/checker/rule_checker_test.go @@ -14,7 +14,11 @@ package checker import ( +<<<<<<< HEAD "encoding/hex" +======= + "context" +>>>>>>> 1a7caa95c (schedule: not limit remove peer of the down store (#4097)) . 
"github.com/pingcap/check" "github.com/pingcap/kvproto/pkg/metapb" @@ -333,3 +337,321 @@ func (s *testRuleCheckerSuite) TestIssue3521_PriorityFixOrphanPeer(c *C) { c.Assert(op.Step(0), FitsTypeOf, remove) c.Assert(op.Desc(), Equals, "remove-orphan-peer") } +<<<<<<< HEAD +======= + +func (s *testRuleCheckerSuite) TestIssue3293(c *C) { + s.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) + s.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host1"}) + s.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host2"}) + s.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) + s.cluster.AddLabelsStore(5, 1, map[string]string{"host": "host5"}) + s.cluster.AddLeaderRegionWithRange(1, "", "", 1, 2) + err := s.ruleManager.SetRule(&placement.Rule{ + GroupID: "TiDB_DDL_51", + ID: "0", + Role: placement.Follower, + Count: 1, + LabelConstraints: []placement.LabelConstraint{ + { + Key: "host", + Values: []string{ + "host5", + }, + Op: placement.In, + }, + }, + }) + c.Assert(err, IsNil) + s.cluster.DeleteStore(s.cluster.GetStore(5)) + err = s.ruleManager.SetRule(&placement.Rule{ + GroupID: "TiDB_DDL_51", + ID: "default", + Role: placement.Voter, + Count: 3, + }) + c.Assert(err, IsNil) + err = s.ruleManager.DeleteRule("pd", "default") + c.Assert(err, IsNil) + op := s.rc.Check(s.cluster.GetRegion(1)) + c.Assert(op, NotNil) + c.Assert(op.Desc(), Equals, "add-rule-peer") +} + +func (s *testRuleCheckerSuite) TestIssue3299(c *C) { + s.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) + s.cluster.AddLabelsStore(2, 1, map[string]string{"dc": "sh"}) + s.cluster.AddLeaderRegionWithRange(1, "", "", 1, 2) + + testCases := []struct { + constraints []placement.LabelConstraint + err string + }{ + { + constraints: []placement.LabelConstraint{ + { + Key: "host", + Values: []string{"host5"}, + Op: placement.In, + }, + }, + err: ".*can not match any store", + }, + { + constraints: []placement.LabelConstraint{ + { + Key: "ho", + Values: []string{"sh"}, + Op: placement.In, + }, + }, + err: ".*can not match any store", + }, + { + constraints: []placement.LabelConstraint{ + { + Key: "host", + Values: []string{"host1"}, + Op: placement.In, + }, + { + Key: "host", + Values: []string{"host1"}, + Op: placement.NotIn, + }, + }, + err: ".*can not match any store", + }, + { + constraints: []placement.LabelConstraint{ + { + Key: "host", + Values: []string{"host1"}, + Op: placement.In, + }, + { + Key: "host", + Values: []string{"host3"}, + Op: placement.In, + }, + }, + err: ".*can not match any store", + }, + { + constraints: []placement.LabelConstraint{ + { + Key: "host", + Values: []string{"host1"}, + Op: placement.In, + }, + { + Key: "host", + Values: []string{"host1"}, + Op: placement.In, + }, + }, + err: "", + }, + } + + for _, t := range testCases { + err := s.ruleManager.SetRule(&placement.Rule{ + GroupID: "p", + ID: "0", + Role: placement.Follower, + Count: 1, + LabelConstraints: t.constraints, + }) + if t.err != "" { + c.Assert(err, ErrorMatches, t.err) + } else { + c.Assert(err, IsNil) + } + } +} + +// See issue: https://github.com/tikv/pd/issues/3705 +func (s *testRuleCheckerSuite) TestFixDownPeer(c *C) { + s.cluster.AddLabelsStore(1, 1, map[string]string{"zone": "z1"}) + s.cluster.AddLabelsStore(2, 1, map[string]string{"zone": "z1"}) + s.cluster.AddLabelsStore(3, 1, map[string]string{"zone": "z2"}) + s.cluster.AddLabelsStore(4, 1, map[string]string{"zone": "z3"}) + s.cluster.AddLabelsStore(5, 1, map[string]string{"zone": "z3"}) + s.cluster.AddLeaderRegion(1, 1, 3, 4) + rule 
:= &placement.Rule{ + GroupID: "pd", + ID: "test", + Index: 100, + Override: true, + Role: placement.Voter, + Count: 3, + LocationLabels: []string{"zone"}, + } + s.ruleManager.SetRule(rule) + + region := s.cluster.GetRegion(1) + c.Assert(s.rc.Check(region), IsNil) + + s.cluster.SetStoreDown(4) + region = region.Clone(core.WithDownPeers([]*pdpb.PeerStats{ + {Peer: region.GetStorePeer(4), DownSeconds: 6000}, + })) + testutil.CheckTransferPeer(c, s.rc.Check(region), operator.OpRegion, 4, 5) + + s.cluster.SetStoreDown(5) + testutil.CheckTransferPeer(c, s.rc.Check(region), operator.OpRegion, 4, 2) + + rule.IsolationLevel = "zone" + s.ruleManager.SetRule(rule) + c.Assert(s.rc.Check(region), IsNil) +} + +// See issue: https://github.com/tikv/pd/issues/3705 +func (s *testRuleCheckerSuite) TestFixOfflinePeer(c *C) { + s.cluster.AddLabelsStore(1, 1, map[string]string{"zone": "z1"}) + s.cluster.AddLabelsStore(2, 1, map[string]string{"zone": "z1"}) + s.cluster.AddLabelsStore(3, 1, map[string]string{"zone": "z2"}) + s.cluster.AddLabelsStore(4, 1, map[string]string{"zone": "z3"}) + s.cluster.AddLabelsStore(5, 1, map[string]string{"zone": "z3"}) + s.cluster.AddLeaderRegion(1, 1, 3, 4) + rule := &placement.Rule{ + GroupID: "pd", + ID: "test", + Index: 100, + Override: true, + Role: placement.Voter, + Count: 3, + LocationLabels: []string{"zone"}, + } + s.ruleManager.SetRule(rule) + + region := s.cluster.GetRegion(1) + c.Assert(s.rc.Check(region), IsNil) + + s.cluster.SetStoreOffline(4) + testutil.CheckTransferPeer(c, s.rc.Check(region), operator.OpRegion, 4, 5) + + s.cluster.SetStoreOffline(5) + testutil.CheckTransferPeer(c, s.rc.Check(region), operator.OpRegion, 4, 2) + + rule.IsolationLevel = "zone" + s.ruleManager.SetRule(rule) + c.Assert(s.rc.Check(region), IsNil) +} + +func (s *testRuleCheckerSerialSuite) TestRuleCache(c *C) { + s.cluster.AddLabelsStore(1, 1, map[string]string{"zone": "z1"}) + s.cluster.AddLabelsStore(2, 1, map[string]string{"zone": "z1"}) + s.cluster.AddLabelsStore(3, 1, map[string]string{"zone": "z2"}) + s.cluster.AddLabelsStore(4, 1, map[string]string{"zone": "z3"}) + s.cluster.AddLabelsStore(5, 1, map[string]string{"zone": "z3"}) + s.cluster.AddRegionStore(999, 1) + s.cluster.AddLeaderRegion(1, 1, 3, 4) + rule := &placement.Rule{ + GroupID: "pd", + ID: "test", + Index: 100, + Override: true, + Role: placement.Voter, + Count: 3, + LocationLabels: []string{"zone"}, + } + s.ruleManager.SetRule(rule) + region := s.cluster.GetRegion(1) + region = region.Clone(core.WithIncConfVer(), core.WithIncVersion()) + c.Assert(s.rc.Check(region), IsNil) + + testcases := []struct { + name string + region *core.RegionInfo + stillCached bool + }{ + { + name: "default", + region: region, + stillCached: true, + }, + { + name: "region topo changed", + region: func() *core.RegionInfo { + return region.Clone(core.WithAddPeer(&metapb.Peer{ + Id: 999, + StoreId: 999, + Role: metapb.PeerRole_Voter, + }), core.WithIncConfVer()) + }(), + stillCached: false, + }, + { + name: "region leader changed", + region: region.Clone( + core.WithLeader(&metapb.Peer{Role: metapb.PeerRole_Voter, Id: 2, StoreId: 3})), + stillCached: false, + }, + { + name: "region have down peers", + region: region.Clone(core.WithDownPeers([]*pdpb.PeerStats{ + { + Peer: region.GetPeer(3), + DownSeconds: 42, + }, + })), + stillCached: false, + }, + } + for _, testcase := range testcases { + c.Log(testcase.name) + if testcase.stillCached { + c.Assert(failpoint.Enable("github.com/tikv/pd/server/schedule/checker/assertShouldCache", 
"return(true)"), IsNil) + s.rc.Check(testcase.region) + c.Assert(failpoint.Disable("github.com/tikv/pd/server/schedule/checker/assertShouldCache"), IsNil) + } else { + c.Assert(failpoint.Enable("github.com/tikv/pd/server/schedule/checker/assertShouldNotCache", "return(true)"), IsNil) + s.rc.Check(testcase.region) + c.Assert(failpoint.Disable("github.com/tikv/pd/server/schedule/checker/assertShouldNotCache"), IsNil) + } + } +} + +// Ref https://github.com/tikv/pd/issues/4045 +func (s *testRuleCheckerSuite) TestSkipFixOrphanPeerIfSelectedPeerisPendingOrDown(c *C) { + s.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) + s.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host1"}) + s.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host2"}) + s.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) + s.cluster.AddLeaderRegionWithRange(1, "", "", 1, 2, 3, 4) + + // set peer3 and peer4 to pending + r1 := s.cluster.GetRegion(1) + r1 = r1.Clone(core.WithPendingPeers([]*metapb.Peer{r1.GetStorePeer(3), r1.GetStorePeer(4)})) + s.cluster.PutRegion(r1) + + // should not remove extra peer + op := s.rc.Check(s.cluster.GetRegion(1)) + c.Assert(op, IsNil) + + // set peer3 to down-peer + r1 = r1.Clone(core.WithPendingPeers([]*metapb.Peer{r1.GetStorePeer(4)})) + r1 = r1.Clone(core.WithDownPeers([]*pdpb.PeerStats{ + { + Peer: r1.GetStorePeer(3), + DownSeconds: 42, + }, + })) + s.cluster.PutRegion(r1) + + // should not remove extra peer + op = s.rc.Check(s.cluster.GetRegion(1)) + c.Assert(op, IsNil) + + // set peer3 to normal + r1 = r1.Clone(core.WithDownPeers(nil)) + s.cluster.PutRegion(r1) + + // should remove extra peer now + var remove operator.RemovePeer + op = s.rc.Check(s.cluster.GetRegion(1)) + c.Assert(op.Step(0), FitsTypeOf, remove) + c.Assert(op.Desc(), Equals, "remove-orphan-peer") +} +>>>>>>> 1a7caa95c (schedule: not limit remove peer of the down store (#4097)) diff --git a/server/schedule/checker_controller.go b/server/schedule/checker_controller.go index d7619f34888..6880f939bb7 100644 --- a/server/schedule/checker_controller.go +++ b/server/schedule/checker_controller.go @@ -93,3 +93,36 @@ func (c *CheckerController) CheckRegion(region *core.RegionInfo) (bool, []*opera func (c *CheckerController) GetMergeChecker() *checker.MergeChecker { return c.mergeChecker } +<<<<<<< HEAD +======= + +// GetRuleChecker returns the rule checker. +func (c *CheckerController) GetRuleChecker() *checker.RuleChecker { + return c.ruleChecker +} + +// GetWaitingRegions returns the regions in the waiting list. +func (c *CheckerController) GetWaitingRegions() []*cache.Item { + return c.regionWaitingList.Elems() +} + +// AddWaitingRegion returns the regions in the waiting list. +func (c *CheckerController) AddWaitingRegion(region *core.RegionInfo) { + c.regionWaitingList.Put(region.GetID(), nil) +} + +// RemoveWaitingRegion removes the region from the waiting list. 
+func (c *CheckerController) RemoveWaitingRegion(id uint64) { + c.regionWaitingList.Remove(id) +} + +// GetPriorityRegions returns the region in priority queue +func (c *CheckerController) GetPriorityRegions() []uint64 { + return c.priorityChecker.GetPriorityRegions() +} + +// RemovePriorityRegions removes priority region from priority queue +func (c *CheckerController) RemovePriorityRegions(id uint64) { + c.priorityChecker.RemovePriorityRegion(id) +} +>>>>>>> 1a7caa95c (schedule: not limit remove peer of the down store (#4097)) diff --git a/server/schedule/operator/builder.go b/server/schedule/operator/builder.go index 0dc5222e918..d4e2916baf1 100644 --- a/server/schedule/operator/builder.go +++ b/server/schedule/operator/builder.go @@ -332,9 +332,34 @@ func (b *Builder) execAddPeer(p *metapb.Peer) { if !p.GetIsLearner() { b.steps = append(b.steps, PromoteLearner{ToStore: p.GetStoreId(), PeerID: p.GetId()}) } +<<<<<<< HEAD b.currentPeers.Set(p) if b.peerAddStep == nil { b.peerAddStep = make(map[uint64]int) +======= + b.currentPeers.Set(peer) + b.peerAddStep[peer.GetStoreId()] = len(b.steps) + delete(b.toAdd, peer.GetStoreId()) +} + +func (b *Builder) execRemovePeer(peer *metapb.Peer) { + removeStoreID := peer.GetStoreId() + var isDownStore bool + store := b.cluster.GetStore(removeStoreID) + if store != nil { + isDownStore = store.DownTime() > b.cluster.GetOpts().GetMaxStoreDownTime() + } + b.steps = append(b.steps, RemovePeer{FromStore: removeStoreID, PeerID: peer.GetId(), IsDownStore: isDownStore}) + delete(b.currentPeers, removeStoreID) + delete(b.toRemove, removeStoreID) +} + +func (b *Builder) execChangePeerV2(needEnter bool, needTransferLeader bool) { + // Enter + step := ChangePeerV2Enter{ + PromoteLearners: make([]PromoteLearner, 0, len(b.toPromote)), + DemoteVoters: make([]DemoteVoter, 0, len(b.toDemote)), +>>>>>>> 1a7caa95c (schedule: not limit remove peer of the down store (#4097)) } b.peerAddStep[p.GetStoreId()] = len(b.steps) b.toAdd.Delete(p.GetStoreId()) diff --git a/server/schedule/operator/step.go b/server/schedule/operator/step.go index f2c6566bfdd..b70cee00299 100644 --- a/server/schedule/operator/step.go +++ b/server/schedule/operator/step.go @@ -220,7 +220,12 @@ func (pl PromoteLearner) Influence(opInfluence OpInfluence, region *core.RegionI // RemovePeer is an OpStep that removes a region peer. 
type RemovePeer struct { +<<<<<<< HEAD FromStore uint64 +======= + FromStore, PeerID uint64 + IsDownStore bool +>>>>>>> 1a7caa95c (schedule: not limit remove peer of the down store (#4097)) } // ConfVerChanged returns true if the conf version has been changed by this step @@ -252,6 +257,10 @@ func (rp RemovePeer) Influence(opInfluence OpInfluence, region *core.RegionInfo) regionSize := region.GetApproximateSize() from.RegionSize -= regionSize from.RegionCount-- + if rp.IsDownStore { + from.AdjustStepCost(storelimit.RemovePeer, storelimit.SmallRegionThreshold) + return + } from.AdjustStepCost(storelimit.RemovePeer, regionSize) } From 25cc2313af34ba85eaeab0c96dd9bfb3344039f5 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Tue, 16 Nov 2021 19:07:49 +0800 Subject: [PATCH 2/2] resolve the conflicts close #4090 Signed-off-by: Ryan Leung --- server/cluster/coordinator_test.go | 17 +- server/schedule/checker/rule_checker.go | 1 - server/schedule/checker/rule_checker_test.go | 322 ------------------- server/schedule/checker_controller.go | 34 +- server/schedule/operator/builder.go | 33 +- server/schedule/operator/influence.go | 2 +- server/schedule/operator/step.go | 14 +- 7 files changed, 30 insertions(+), 393 deletions(-) diff --git a/server/cluster/coordinator_test.go b/server/cluster/coordinator_test.go index a4cefb9cd92..93e33e0131a 100644 --- a/server/cluster/coordinator_test.go +++ b/server/cluster/coordinator_test.go @@ -991,7 +991,7 @@ func (s *testOperatorControllerSuite) TestDownStoreLimit(c *C) { tc, co, cleanup := prepare(nil, nil, nil, c) defer cleanup() oc := co.opController - rc := co.checkers.GetRuleChecker() + rc := co.checkers.GetReplicaChecker() tc.addRegionStore(1, 100) tc.addRegionStore(2, 100) @@ -1001,14 +1001,25 @@ func (s *testOperatorControllerSuite) TestDownStoreLimit(c *C) { region := tc.GetRegion(1) tc.setStoreDown(1) tc.SetStoreLimit(1, storelimit.RemovePeer, 1) + region = region.Clone(core.WithDownPeers([]*pdpb.PeerStats{ { Peer: region.GetStorePeer(1), DownSeconds: 24 * 60 * 60, }, - })) + }), core.SetApproximateSize(1)) + tc.putRegion(region) + for i := uint64(1); i < 20; i++ { + tc.addRegionStore(i+3, 100) + op := rc.Check(region) + c.Assert(op, NotNil) + c.Assert(oc.AddOperator(op), IsTrue) + oc.RemoveOperator(op) + } - for i := uint64(1); i <= 5; i++ { + region = region.Clone(core.SetApproximateSize(100)) + tc.putRegion(region) + for i := uint64(20); i < 25; i++ { tc.addRegionStore(i+3, 100) op := rc.Check(region) c.Assert(op, NotNil) diff --git a/server/schedule/checker/rule_checker.go b/server/schedule/checker/rule_checker.go index 2ef2fb922a2..266a633fecf 100644 --- a/server/schedule/checker/rule_checker.go +++ b/server/schedule/checker/rule_checker.go @@ -53,7 +53,6 @@ func (c *RuleChecker) GetType() string { // fix it. func (c *RuleChecker) Check(region *core.RegionInfo) *operator.Operator { checkerCounter.WithLabelValues("rule_checker", "check").Inc() - fit := c.cluster.FitRegion(region) if len(fit.RuleFits) == 0 { checkerCounter.WithLabelValues("rule_checker", "fix-range").Inc() diff --git a/server/schedule/checker/rule_checker_test.go b/server/schedule/checker/rule_checker_test.go index 5de146ebe22..3ecfb1097ad 100644 --- a/server/schedule/checker/rule_checker_test.go +++ b/server/schedule/checker/rule_checker_test.go @@ -14,11 +14,7 @@ package checker import ( -<<<<<<< HEAD "encoding/hex" -======= - "context" ->>>>>>> 1a7caa95c (schedule: not limit remove peer of the down store (#4097)) . 
"github.com/pingcap/check" "github.com/pingcap/kvproto/pkg/metapb" @@ -337,321 +333,3 @@ func (s *testRuleCheckerSuite) TestIssue3521_PriorityFixOrphanPeer(c *C) { c.Assert(op.Step(0), FitsTypeOf, remove) c.Assert(op.Desc(), Equals, "remove-orphan-peer") } -<<<<<<< HEAD -======= - -func (s *testRuleCheckerSuite) TestIssue3293(c *C) { - s.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) - s.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host1"}) - s.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host2"}) - s.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) - s.cluster.AddLabelsStore(5, 1, map[string]string{"host": "host5"}) - s.cluster.AddLeaderRegionWithRange(1, "", "", 1, 2) - err := s.ruleManager.SetRule(&placement.Rule{ - GroupID: "TiDB_DDL_51", - ID: "0", - Role: placement.Follower, - Count: 1, - LabelConstraints: []placement.LabelConstraint{ - { - Key: "host", - Values: []string{ - "host5", - }, - Op: placement.In, - }, - }, - }) - c.Assert(err, IsNil) - s.cluster.DeleteStore(s.cluster.GetStore(5)) - err = s.ruleManager.SetRule(&placement.Rule{ - GroupID: "TiDB_DDL_51", - ID: "default", - Role: placement.Voter, - Count: 3, - }) - c.Assert(err, IsNil) - err = s.ruleManager.DeleteRule("pd", "default") - c.Assert(err, IsNil) - op := s.rc.Check(s.cluster.GetRegion(1)) - c.Assert(op, NotNil) - c.Assert(op.Desc(), Equals, "add-rule-peer") -} - -func (s *testRuleCheckerSuite) TestIssue3299(c *C) { - s.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) - s.cluster.AddLabelsStore(2, 1, map[string]string{"dc": "sh"}) - s.cluster.AddLeaderRegionWithRange(1, "", "", 1, 2) - - testCases := []struct { - constraints []placement.LabelConstraint - err string - }{ - { - constraints: []placement.LabelConstraint{ - { - Key: "host", - Values: []string{"host5"}, - Op: placement.In, - }, - }, - err: ".*can not match any store", - }, - { - constraints: []placement.LabelConstraint{ - { - Key: "ho", - Values: []string{"sh"}, - Op: placement.In, - }, - }, - err: ".*can not match any store", - }, - { - constraints: []placement.LabelConstraint{ - { - Key: "host", - Values: []string{"host1"}, - Op: placement.In, - }, - { - Key: "host", - Values: []string{"host1"}, - Op: placement.NotIn, - }, - }, - err: ".*can not match any store", - }, - { - constraints: []placement.LabelConstraint{ - { - Key: "host", - Values: []string{"host1"}, - Op: placement.In, - }, - { - Key: "host", - Values: []string{"host3"}, - Op: placement.In, - }, - }, - err: ".*can not match any store", - }, - { - constraints: []placement.LabelConstraint{ - { - Key: "host", - Values: []string{"host1"}, - Op: placement.In, - }, - { - Key: "host", - Values: []string{"host1"}, - Op: placement.In, - }, - }, - err: "", - }, - } - - for _, t := range testCases { - err := s.ruleManager.SetRule(&placement.Rule{ - GroupID: "p", - ID: "0", - Role: placement.Follower, - Count: 1, - LabelConstraints: t.constraints, - }) - if t.err != "" { - c.Assert(err, ErrorMatches, t.err) - } else { - c.Assert(err, IsNil) - } - } -} - -// See issue: https://github.com/tikv/pd/issues/3705 -func (s *testRuleCheckerSuite) TestFixDownPeer(c *C) { - s.cluster.AddLabelsStore(1, 1, map[string]string{"zone": "z1"}) - s.cluster.AddLabelsStore(2, 1, map[string]string{"zone": "z1"}) - s.cluster.AddLabelsStore(3, 1, map[string]string{"zone": "z2"}) - s.cluster.AddLabelsStore(4, 1, map[string]string{"zone": "z3"}) - s.cluster.AddLabelsStore(5, 1, map[string]string{"zone": "z3"}) - s.cluster.AddLeaderRegion(1, 1, 3, 4) - rule 
:= &placement.Rule{ - GroupID: "pd", - ID: "test", - Index: 100, - Override: true, - Role: placement.Voter, - Count: 3, - LocationLabels: []string{"zone"}, - } - s.ruleManager.SetRule(rule) - - region := s.cluster.GetRegion(1) - c.Assert(s.rc.Check(region), IsNil) - - s.cluster.SetStoreDown(4) - region = region.Clone(core.WithDownPeers([]*pdpb.PeerStats{ - {Peer: region.GetStorePeer(4), DownSeconds: 6000}, - })) - testutil.CheckTransferPeer(c, s.rc.Check(region), operator.OpRegion, 4, 5) - - s.cluster.SetStoreDown(5) - testutil.CheckTransferPeer(c, s.rc.Check(region), operator.OpRegion, 4, 2) - - rule.IsolationLevel = "zone" - s.ruleManager.SetRule(rule) - c.Assert(s.rc.Check(region), IsNil) -} - -// See issue: https://github.com/tikv/pd/issues/3705 -func (s *testRuleCheckerSuite) TestFixOfflinePeer(c *C) { - s.cluster.AddLabelsStore(1, 1, map[string]string{"zone": "z1"}) - s.cluster.AddLabelsStore(2, 1, map[string]string{"zone": "z1"}) - s.cluster.AddLabelsStore(3, 1, map[string]string{"zone": "z2"}) - s.cluster.AddLabelsStore(4, 1, map[string]string{"zone": "z3"}) - s.cluster.AddLabelsStore(5, 1, map[string]string{"zone": "z3"}) - s.cluster.AddLeaderRegion(1, 1, 3, 4) - rule := &placement.Rule{ - GroupID: "pd", - ID: "test", - Index: 100, - Override: true, - Role: placement.Voter, - Count: 3, - LocationLabels: []string{"zone"}, - } - s.ruleManager.SetRule(rule) - - region := s.cluster.GetRegion(1) - c.Assert(s.rc.Check(region), IsNil) - - s.cluster.SetStoreOffline(4) - testutil.CheckTransferPeer(c, s.rc.Check(region), operator.OpRegion, 4, 5) - - s.cluster.SetStoreOffline(5) - testutil.CheckTransferPeer(c, s.rc.Check(region), operator.OpRegion, 4, 2) - - rule.IsolationLevel = "zone" - s.ruleManager.SetRule(rule) - c.Assert(s.rc.Check(region), IsNil) -} - -func (s *testRuleCheckerSerialSuite) TestRuleCache(c *C) { - s.cluster.AddLabelsStore(1, 1, map[string]string{"zone": "z1"}) - s.cluster.AddLabelsStore(2, 1, map[string]string{"zone": "z1"}) - s.cluster.AddLabelsStore(3, 1, map[string]string{"zone": "z2"}) - s.cluster.AddLabelsStore(4, 1, map[string]string{"zone": "z3"}) - s.cluster.AddLabelsStore(5, 1, map[string]string{"zone": "z3"}) - s.cluster.AddRegionStore(999, 1) - s.cluster.AddLeaderRegion(1, 1, 3, 4) - rule := &placement.Rule{ - GroupID: "pd", - ID: "test", - Index: 100, - Override: true, - Role: placement.Voter, - Count: 3, - LocationLabels: []string{"zone"}, - } - s.ruleManager.SetRule(rule) - region := s.cluster.GetRegion(1) - region = region.Clone(core.WithIncConfVer(), core.WithIncVersion()) - c.Assert(s.rc.Check(region), IsNil) - - testcases := []struct { - name string - region *core.RegionInfo - stillCached bool - }{ - { - name: "default", - region: region, - stillCached: true, - }, - { - name: "region topo changed", - region: func() *core.RegionInfo { - return region.Clone(core.WithAddPeer(&metapb.Peer{ - Id: 999, - StoreId: 999, - Role: metapb.PeerRole_Voter, - }), core.WithIncConfVer()) - }(), - stillCached: false, - }, - { - name: "region leader changed", - region: region.Clone( - core.WithLeader(&metapb.Peer{Role: metapb.PeerRole_Voter, Id: 2, StoreId: 3})), - stillCached: false, - }, - { - name: "region have down peers", - region: region.Clone(core.WithDownPeers([]*pdpb.PeerStats{ - { - Peer: region.GetPeer(3), - DownSeconds: 42, - }, - })), - stillCached: false, - }, - } - for _, testcase := range testcases { - c.Log(testcase.name) - if testcase.stillCached { - c.Assert(failpoint.Enable("github.com/tikv/pd/server/schedule/checker/assertShouldCache", 
"return(true)"), IsNil) - s.rc.Check(testcase.region) - c.Assert(failpoint.Disable("github.com/tikv/pd/server/schedule/checker/assertShouldCache"), IsNil) - } else { - c.Assert(failpoint.Enable("github.com/tikv/pd/server/schedule/checker/assertShouldNotCache", "return(true)"), IsNil) - s.rc.Check(testcase.region) - c.Assert(failpoint.Disable("github.com/tikv/pd/server/schedule/checker/assertShouldNotCache"), IsNil) - } - } -} - -// Ref https://github.com/tikv/pd/issues/4045 -func (s *testRuleCheckerSuite) TestSkipFixOrphanPeerIfSelectedPeerisPendingOrDown(c *C) { - s.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) - s.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host1"}) - s.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host2"}) - s.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host4"}) - s.cluster.AddLeaderRegionWithRange(1, "", "", 1, 2, 3, 4) - - // set peer3 and peer4 to pending - r1 := s.cluster.GetRegion(1) - r1 = r1.Clone(core.WithPendingPeers([]*metapb.Peer{r1.GetStorePeer(3), r1.GetStorePeer(4)})) - s.cluster.PutRegion(r1) - - // should not remove extra peer - op := s.rc.Check(s.cluster.GetRegion(1)) - c.Assert(op, IsNil) - - // set peer3 to down-peer - r1 = r1.Clone(core.WithPendingPeers([]*metapb.Peer{r1.GetStorePeer(4)})) - r1 = r1.Clone(core.WithDownPeers([]*pdpb.PeerStats{ - { - Peer: r1.GetStorePeer(3), - DownSeconds: 42, - }, - })) - s.cluster.PutRegion(r1) - - // should not remove extra peer - op = s.rc.Check(s.cluster.GetRegion(1)) - c.Assert(op, IsNil) - - // set peer3 to normal - r1 = r1.Clone(core.WithDownPeers(nil)) - s.cluster.PutRegion(r1) - - // should remove extra peer now - var remove operator.RemovePeer - op = s.rc.Check(s.cluster.GetRegion(1)) - c.Assert(op.Step(0), FitsTypeOf, remove) - c.Assert(op.Desc(), Equals, "remove-orphan-peer") -} ->>>>>>> 1a7caa95c (schedule: not limit remove peer of the down store (#4097)) diff --git a/server/schedule/checker_controller.go b/server/schedule/checker_controller.go index 6880f939bb7..c2857dbafcc 100644 --- a/server/schedule/checker_controller.go +++ b/server/schedule/checker_controller.go @@ -93,36 +93,8 @@ func (c *CheckerController) CheckRegion(region *core.RegionInfo) (bool, []*opera func (c *CheckerController) GetMergeChecker() *checker.MergeChecker { return c.mergeChecker } -<<<<<<< HEAD -======= -// GetRuleChecker returns the rule checker. -func (c *CheckerController) GetRuleChecker() *checker.RuleChecker { - return c.ruleChecker +// GetReplicaChecker returns the replica checker. +func (c *CheckerController) GetReplicaChecker() *checker.ReplicaChecker { + return c.replicaChecker } - -// GetWaitingRegions returns the regions in the waiting list. -func (c *CheckerController) GetWaitingRegions() []*cache.Item { - return c.regionWaitingList.Elems() -} - -// AddWaitingRegion returns the regions in the waiting list. -func (c *CheckerController) AddWaitingRegion(region *core.RegionInfo) { - c.regionWaitingList.Put(region.GetID(), nil) -} - -// RemoveWaitingRegion removes the region from the waiting list. 
-func (c *CheckerController) RemoveWaitingRegion(id uint64) { - c.regionWaitingList.Remove(id) -} - -// GetPriorityRegions returns the region in priority queue -func (c *CheckerController) GetPriorityRegions() []uint64 { - return c.priorityChecker.GetPriorityRegions() -} - -// RemovePriorityRegions removes priority region from priority queue -func (c *CheckerController) RemovePriorityRegions(id uint64) { - c.priorityChecker.RemovePriorityRegion(id) -} ->>>>>>> 1a7caa95c (schedule: not limit remove peer of the down store (#4097)) diff --git a/server/schedule/operator/builder.go b/server/schedule/operator/builder.go index d4e2916baf1..2665a8beba6 100644 --- a/server/schedule/operator/builder.go +++ b/server/schedule/operator/builder.go @@ -332,41 +332,22 @@ func (b *Builder) execAddPeer(p *metapb.Peer) { if !p.GetIsLearner() { b.steps = append(b.steps, PromoteLearner{ToStore: p.GetStoreId(), PeerID: p.GetId()}) } -<<<<<<< HEAD b.currentPeers.Set(p) if b.peerAddStep == nil { b.peerAddStep = make(map[uint64]int) -======= - b.currentPeers.Set(peer) - b.peerAddStep[peer.GetStoreId()] = len(b.steps) - delete(b.toAdd, peer.GetStoreId()) -} - -func (b *Builder) execRemovePeer(peer *metapb.Peer) { - removeStoreID := peer.GetStoreId() - var isDownStore bool - store := b.cluster.GetStore(removeStoreID) - if store != nil { - isDownStore = store.DownTime() > b.cluster.GetOpts().GetMaxStoreDownTime() - } - b.steps = append(b.steps, RemovePeer{FromStore: removeStoreID, PeerID: peer.GetId(), IsDownStore: isDownStore}) - delete(b.currentPeers, removeStoreID) - delete(b.toRemove, removeStoreID) -} - -func (b *Builder) execChangePeerV2(needEnter bool, needTransferLeader bool) { - // Enter - step := ChangePeerV2Enter{ - PromoteLearners: make([]PromoteLearner, 0, len(b.toPromote)), - DemoteVoters: make([]DemoteVoter, 0, len(b.toDemote)), ->>>>>>> 1a7caa95c (schedule: not limit remove peer of the down store (#4097)) } b.peerAddStep[p.GetStoreId()] = len(b.steps) b.toAdd.Delete(p.GetStoreId()) } func (b *Builder) execRemovePeer(p *metapb.Peer) { - b.steps = append(b.steps, RemovePeer{FromStore: p.GetStoreId()}) + removeStoreID := p.GetStoreId() + var isDownStore bool + store := b.cluster.GetStore(removeStoreID) + if store != nil { + isDownStore = store.DownTime() > b.cluster.GetMaxStoreDownTime() + } + b.steps = append(b.steps, RemovePeer{FromStore: removeStoreID, IsDownStore: isDownStore}) b.currentPeers.Delete(p.GetStoreId()) b.toRemove.Delete(p.GetStoreId()) } diff --git a/server/schedule/operator/influence.go b/server/schedule/operator/influence.go index 1e02a388aa5..a3028920a77 100644 --- a/server/schedule/operator/influence.go +++ b/server/schedule/operator/influence.go @@ -80,7 +80,7 @@ func (s *StoreInfluence) addStepCost(limitType storelimit.Type, cost int64) { func (s *StoreInfluence) AdjustStepCost(limitType storelimit.Type, regionSize int64) { if regionSize > storelimit.SmallRegionThreshold { s.addStepCost(limitType, storelimit.RegionInfluence[limitType]) - } else if regionSize <= storelimit.SmallRegionThreshold && regionSize > core.EmptyRegionApproximateSize { + } else if regionSize > core.EmptyRegionApproximateSize { s.addStepCost(limitType, storelimit.SmallRegionInfluence[limitType]) } } diff --git a/server/schedule/operator/step.go b/server/schedule/operator/step.go index b70cee00299..18b113589dd 100644 --- a/server/schedule/operator/step.go +++ b/server/schedule/operator/step.go @@ -220,12 +220,8 @@ func (pl PromoteLearner) Influence(opInfluence OpInfluence, region *core.RegionI // RemovePeer is 
an OpStep that removes a region peer. type RemovePeer struct { -<<<<<<< HEAD - FromStore uint64 -======= - FromStore, PeerID uint64 - IsDownStore bool ->>>>>>> 1a7caa95c (schedule: not limit remove peer of the down store (#4097)) + FromStore uint64 + IsDownStore bool } // ConfVerChanged returns true if the conf version has been changed by this step @@ -257,9 +253,9 @@ func (rp RemovePeer) Influence(opInfluence OpInfluence, region *core.RegionInfo) regionSize := region.GetApproximateSize() from.RegionSize -= regionSize from.RegionCount-- - if rp.IsDownStore { - from.AdjustStepCost(storelimit.RemovePeer, storelimit.SmallRegionThreshold) - return + + if rp.IsDownStore && regionSize > storelimit.SmallRegionThreshold { + regionSize = storelimit.SmallRegionThreshold } from.AdjustStepCost(storelimit.RemovePeer, regionSize) }
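
Note on the net effect of this series (a standalone sketch, not part of either patch): after PATCH 2/2, the operator builder sets RemovePeer.IsDownStore when the source store has been down longer than the cluster's max-store-down-time, and RemovePeer.Influence then caps the region size used for the store-limit charge at storelimit.SmallRegionThreshold, so draining peers off a down store is no longer throttled as if every region were large. The small Go program below restates that branching with stand-in constants; the threshold and influence values are illustrative only, not the ones defined in server/core and server/schedule/storelimit.

// Standalone sketch of the post-patch cost logic. All constants are stand-ins;
// only the branching mirrors the patched RemovePeer.Influence and
// StoreInfluence.AdjustStepCost.
package main

import "fmt"

const (
	smallRegionThreshold       int64 = 20   // stand-in for storelimit.SmallRegionThreshold
	emptyRegionApproximateSize int64 = 1    // stand-in for core.EmptyRegionApproximateSize
	regionInfluence            int64 = 1000 // stand-in for storelimit.RegionInfluence[RemovePeer]
	smallRegionInfluence       int64 = 200  // stand-in for storelimit.SmallRegionInfluence[RemovePeer]
)

// removePeerStepCost mirrors the patched logic: when the peer being removed
// lives on a down store, the region size used for the store-limit charge is
// capped at the small-region threshold before the cost tier is chosen.
func removePeerStepCost(regionSize int64, isDownStore bool) int64 {
	if isDownStore && regionSize > smallRegionThreshold {
		regionSize = smallRegionThreshold
	}
	switch {
	case regionSize > smallRegionThreshold:
		return regionInfluence
	case regionSize > emptyRegionApproximateSize:
		return smallRegionInfluence
	default:
		return 0
	}
}

func main() {
	fmt.Println(removePeerStepCost(100, false)) // 1000: full remove-peer charge on a healthy store
	fmt.Println(removePeerStepCost(100, true))  // 200: capped small-region charge on a down store
}

This is also what TestDownStoreLimit in coordinator_test.go exercises: with the remove-peer limit set to 1 on the down store, the checker can still repeatedly schedule peer removals for both a 1 MB and a 100 MB region.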