From 8e8b3752065324fbc7c3e4c1854ba5dcc752b201 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Tue, 14 Dec 2021 10:17:29 +0800 Subject: [PATCH 01/25] operator add region size properties and close #4457 Signed-off-by: bufferflies <1045931706@qq.com> --- server/cluster/coordinator_test.go | 2 +- server/schedule/operator/builder.go | 24 ++++---- server/schedule/operator/create_operator.go | 8 +-- server/schedule/operator/operator.go | 16 +++++- server/schedule/operator/operator_test.go | 2 +- server/schedule/operator_controller_test.go | 62 ++++++++++----------- server/schedule/waiting_operator_test.go | 12 ++-- tests/server/cluster/cluster_test.go | 22 ++++---- 8 files changed, 81 insertions(+), 67 deletions(-) diff --git a/server/cluster/coordinator_test.go b/server/cluster/coordinator_test.go index b8d9b134430..b4cd7d6b6ad 100644 --- a/server/cluster/coordinator_test.go +++ b/server/cluster/coordinator_test.go @@ -42,7 +42,7 @@ import ( ) func newTestOperator(regionID uint64, regionEpoch *metapb.RegionEpoch, kind operator.OpKind, steps ...operator.OpStep) *operator.Operator { - return operator.NewOperator("test", "test", regionID, regionEpoch, kind, steps...) + return operator.NewTestOperator("test", "test", regionID, regionEpoch, kind, steps...) } func (c *testCluster) AllocPeer(storeID uint64) (*metapb.Peer, error) { diff --git a/server/schedule/operator/builder.go b/server/schedule/operator/builder.go index d2215299aa7..765ddde7534 100644 --- a/server/schedule/operator/builder.go +++ b/server/schedule/operator/builder.go @@ -38,12 +38,13 @@ import ( // according to various constraints. type Builder struct { // basic info - desc string - cluster opt.Cluster - regionID uint64 - regionEpoch *metapb.RegionEpoch - rules []*placement.Rule - expectedRoles map[uint64]placement.PeerRoleType + desc string + cluster opt.Cluster + regionID uint64 + regionEpoch *metapb.RegionEpoch + rules []*placement.Rule + expectedRoles map[uint64]placement.PeerRoleType + approximateSize int64 // operation record originPeers peersMap @@ -84,10 +85,11 @@ func SkipOriginJointStateCheck(b *Builder) { // NewBuilder creates a Builder. func NewBuilder(desc string, cluster opt.Cluster, region *core.RegionInfo, opts ...BuilderOption) *Builder { b := &Builder{ - desc: desc, - cluster: cluster, - regionID: region.GetID(), - regionEpoch: region.GetRegionEpoch(), + desc: desc, + cluster: cluster, + regionID: region.GetID(), + regionEpoch: region.GetRegionEpoch(), + approximateSize: region.GetApproximateSize(), } // options @@ -332,7 +334,7 @@ func (b *Builder) Build(kind OpKind) (*Operator, error) { return nil, b.err } - return NewOperator(b.desc, brief, b.regionID, b.regionEpoch, kind, b.steps...), nil + return NewOperator(b.desc, brief, b.regionID, b.regionEpoch, kind, b.approximateSize, b.steps...), nil } // Initialize intermediate states. diff --git a/server/schedule/operator/create_operator.go b/server/schedule/operator/create_operator.go index 65e4d2fedd3..bc156a600f4 100644 --- a/server/schedule/operator/create_operator.go +++ b/server/schedule/operator/create_operator.go @@ -133,7 +133,7 @@ func CreateSplitRegionOperator(desc string, region *core.RegionInfo, kind OpKind } brief += fmt.Sprintf(" and keys %v", hexKeys) } - return NewOperator(desc, brief, region.GetID(), region.GetRegionEpoch(), kind|OpSplit, step), nil + return NewOperator(desc, brief, region.GetID(), region.GetRegionEpoch(), kind|OpSplit, region.GetApproximateSize(), step), nil } // CreateMergeRegionOperator creates an operator that merge two region into one. @@ -169,8 +169,8 @@ func CreateMergeRegionOperator(desc string, cluster opt.Cluster, source *core.Re }) brief := fmt.Sprintf("merge: region %v to %v", source.GetID(), target.GetID()) - op1 := NewOperator(desc, brief, source.GetID(), source.GetRegionEpoch(), kind|OpMerge, steps...) - op2 := NewOperator(desc, brief, target.GetID(), target.GetRegionEpoch(), kind|OpMerge, MergeRegion{ + op1 := NewOperator(desc, brief, source.GetID(), source.GetRegionEpoch(), kind|OpMerge, source.GetApproximateSize(), steps...) + op2 := NewOperator(desc, brief, target.GetID(), target.GetRegionEpoch(), kind|OpMerge, target.GetApproximateSize(), MergeRegion{ FromRegion: source.GetMeta(), ToRegion: target.GetMeta(), IsPassive: true, @@ -274,5 +274,5 @@ func CreateLeaveJointStateOperator(desc string, cluster opt.Cluster, origin *cor } b.execChangePeerV2(false, true) - return NewOperator(b.desc, brief, b.regionID, b.regionEpoch, kind, b.steps...), nil + return NewOperator(b.desc, brief, b.regionID, b.regionEpoch, kind, origin.GetApproximateSize(), b.steps...), nil } diff --git a/server/schedule/operator/operator.go b/server/schedule/operator/operator.go index 39c6e8fd243..6e60b674526 100644 --- a/server/schedule/operator/operator.go +++ b/server/schedule/operator/operator.go @@ -54,10 +54,11 @@ type Operator struct { Counters []prometheus.Counter FinishedCounters []prometheus.Counter AdditionalInfos map[string]string + ApproximateSize int64 } // NewOperator creates a new operator. -func NewOperator(desc, brief string, regionID uint64, regionEpoch *metapb.RegionEpoch, kind OpKind, steps ...OpStep) *Operator { +func NewOperator(desc, brief string, regionID uint64, regionEpoch *metapb.RegionEpoch, kind OpKind, approximateSize int64, steps ...OpStep) *Operator { level := core.NormalPriority if kind&OpAdmin != 0 { level = core.HighPriority @@ -73,6 +74,7 @@ func NewOperator(desc, brief string, regionID uint64, regionEpoch *metapb.Region status: NewOpStatusTracker(), level: level, AdditionalInfos: make(map[string]string), + ApproximateSize: approximateSize, } } @@ -81,7 +83,9 @@ func (o *Operator) String() string { for i := range o.steps { stepStrs[i] = o.steps[i].String() } - s := fmt.Sprintf("%s {%s} (kind:%s, region:%v(%v,%v), createAt:%s, startAt:%s, currentStep:%v, steps:[%s])", o.desc, o.brief, o.kind, o.regionID, o.regionEpoch.GetVersion(), o.regionEpoch.GetConfVer(), o.GetCreateTime(), o.GetStartTime(), atomic.LoadInt32(&o.currentStep), strings.Join(stepStrs, ", ")) + s := fmt.Sprintf("%s {%s} (kind:%s, region:%v(%v,%v), createAt:%s, startAt:%s, currentStep:%v, size:%v,steps:[%s])", + o.desc, o.brief, o.kind, o.regionID, o.regionEpoch.GetVersion(), o.regionEpoch.GetConfVer(), o.GetCreateTime(), + o.GetStartTime(), atomic.LoadInt32(&o.currentStep), o.ApproximateSize, strings.Join(stepStrs, ", ")) if o.CheckSuccess() { s += " finished" } @@ -363,3 +367,11 @@ func (o *Operator) GetAdditionalInfo() string { } return "" } + +// mock region default region size is 96Mb. +const mockRegionSize = 96 * (1 << 20) + +// NewTestOperator creates a test operator. +func NewTestOperator(desc, brief string, regionID uint64, regionEpoch *metapb.RegionEpoch, kind OpKind, steps ...OpStep) *Operator { + return NewOperator(desc, brief, regionID, regionEpoch, kind, mockRegionSize, steps...) +} diff --git a/server/schedule/operator/operator_test.go b/server/schedule/operator/operator_test.go index ffe1f77f2f1..7ff25b0632a 100644 --- a/server/schedule/operator/operator_test.go +++ b/server/schedule/operator/operator_test.go @@ -97,7 +97,7 @@ func (s *testOperatorSuite) TestOperatorStep(c *C) { //nolint func (s *testOperatorSuite) newTestOperator(regionID uint64, kind OpKind, steps ...OpStep) *Operator { - return NewOperator("test", "test", regionID, &metapb.RegionEpoch{}, kind, steps...) + return NewOperator("test", "test", regionID, &metapb.RegionEpoch{}, kind, mockRegionSize, steps...) } func (s *testOperatorSuite) checkSteps(c *C, op *Operator, steps []OpStep) { diff --git a/server/schedule/operator_controller_test.go b/server/schedule/operator_controller_test.go index 1ac798494f6..44a4c205b43 100644 --- a/server/schedule/operator_controller_test.go +++ b/server/schedule/operator_controller_test.go @@ -66,8 +66,8 @@ func (t *testOperatorControllerSuite) TestGetOpInfluence(c *C) { steps := []operator.OpStep{ operator.RemovePeer{FromStore: 2}, } - op1 := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, steps...) - op2 := operator.NewOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, steps...) + op1 := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, steps...) + op2 := operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, steps...) c.Assert(op1.Start(), IsTrue) oc.SetOperator(op1) c.Assert(op2.Start(), IsTrue) @@ -110,10 +110,10 @@ func (t *testOperatorControllerSuite) TestOperatorStatus(c *C) { operator.RemovePeer{FromStore: 2}, operator.AddPeer{ToStore: 2, PeerID: 4}, } - op1 := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, steps...) - op2 := operator.NewOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, steps...) region1 := tc.GetRegion(1) region2 := tc.GetRegion(2) + op1 := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, steps...) + op2 := operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, steps...) c.Assert(op1.Start(), IsTrue) oc.SetOperator(op1) c.Assert(op2.Start(), IsTrue) @@ -145,8 +145,8 @@ func (t *testOperatorControllerSuite) TestFastFailOperator(c *C) { operator.RemovePeer{FromStore: 2}, operator.AddPeer{ToStore: 3, PeerID: 4}, } - op := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, steps...) region := tc.GetRegion(1) + op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, steps...) c.Assert(op.Start(), IsTrue) oc.SetOperator(op) oc.Dispatch(region, "test") @@ -158,7 +158,7 @@ func (t *testOperatorControllerSuite) TestFastFailOperator(c *C) { c.Assert(oc.GetOperator(region.GetID()), IsNil) // transfer leader to an illegal store. - op = operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 5}) + op = operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 5}) oc.SetOperator(op) oc.Dispatch(region, DispatchFromHeartBeat) c.Assert(op.Status(), Equals, operator.CANCELED) @@ -201,7 +201,7 @@ func (t *testOperatorControllerSuite) TestCheckAddUnexpectedStatus(c *C) { } { // finished op - op := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) + op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) c.Assert(oc.checkAddOperator(op), IsTrue) op.Start() c.Assert(oc.checkAddOperator(op), IsFalse) // started @@ -211,14 +211,14 @@ func (t *testOperatorControllerSuite) TestCheckAddUnexpectedStatus(c *C) { } { // finished op canceled - op := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) + op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) c.Assert(oc.checkAddOperator(op), IsTrue) c.Assert(op.Cancel(), IsTrue) c.Assert(oc.checkAddOperator(op), IsFalse) } { // finished op replaced - op := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) + op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) c.Assert(oc.checkAddOperator(op), IsTrue) c.Assert(op.Start(), IsTrue) c.Assert(op.Replace(), IsTrue) @@ -226,8 +226,8 @@ func (t *testOperatorControllerSuite) TestCheckAddUnexpectedStatus(c *C) { } { // finished op expired - op1 := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) - op2 := operator.NewOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 1}) + op1 := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) + op2 := operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 1}) c.Assert(oc.checkAddOperator(op1, op2), IsTrue) operator.SetOperatorStatusReachTime(op1, operator.CREATED, time.Now().Add(-operator.OperatorExpireTime)) operator.SetOperatorStatusReachTime(op2, operator.CREATED, time.Now().Add(-operator.OperatorExpireTime)) @@ -239,7 +239,7 @@ func (t *testOperatorControllerSuite) TestCheckAddUnexpectedStatus(c *C) { { // unfinished op timeout - op := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, steps...) + op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, steps...) c.Assert(oc.checkAddOperator(op), IsTrue) op.Start() operator.SetOperatorStatusReachTime(op, operator.STARTED, time.Now().Add(-operator.SlowOperatorWaitTime)) @@ -263,9 +263,9 @@ func (t *testOperatorControllerSuite) TestConcurrentRemoveOperator(c *C) { operator.AddPeer{ToStore: 1, PeerID: 4}, } // finished op with normal priority - op1 := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) + op1 := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) // unfinished op with high priority - op2 := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion|operator.OpAdmin, steps...) + op2 := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion|operator.OpAdmin, steps...) c.Assert(op1.Start(), IsTrue) oc.SetOperator(op1) @@ -304,10 +304,10 @@ func (t *testOperatorControllerSuite) TestPollDispatchRegion(c *C) { operator.RemovePeer{FromStore: 2}, operator.AddPeer{ToStore: 2, PeerID: 4}, } - op1 := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) - op2 := operator.NewOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, steps...) - op3 := operator.NewOperator("test", "test", 3, &metapb.RegionEpoch{}, operator.OpRegion, steps...) - op4 := operator.NewOperator("test", "test", 4, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) + op1 := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) + op2 := operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, steps...) + op3 := operator.NewTestOperator("test", "test", 3, &metapb.RegionEpoch{}, operator.OpRegion, steps...) + op4 := operator.NewTestOperator("test", "test", 4, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) region1 := tc.GetRegion(1) region2 := tc.GetRegion(2) region4 := tc.GetRegion(4) @@ -378,53 +378,53 @@ func (t *testOperatorControllerSuite) TestStoreLimit(c *C) { tc.SetStoreLimit(2, storelimit.AddPeer, 60) for i := uint64(1); i <= 5; i++ { - op := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: i}) + op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: i}) c.Assert(oc.AddOperator(op), IsTrue) checkRemoveOperatorSuccess(c, oc, op) } - op := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: 1}) + op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: 1}) c.Assert(oc.AddOperator(op), IsFalse) c.Assert(oc.RemoveOperator(op), IsFalse) tc.SetStoreLimit(2, storelimit.AddPeer, 120) for i := uint64(1); i <= 10; i++ { - op = operator.NewOperator("test", "test", i, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: i}) + op = operator.NewTestOperator("test", "test", i, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: i}) c.Assert(oc.AddOperator(op), IsTrue) checkRemoveOperatorSuccess(c, oc, op) } tc.SetAllStoresLimit(storelimit.AddPeer, 60) for i := uint64(1); i <= 5; i++ { - op = operator.NewOperator("test", "test", i, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: i}) + op = operator.NewTestOperator("test", "test", i, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: i}) c.Assert(oc.AddOperator(op), IsTrue) checkRemoveOperatorSuccess(c, oc, op) } - op = operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: 1}) + op = operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: 1}) c.Assert(oc.AddOperator(op), IsFalse) c.Assert(oc.RemoveOperator(op), IsFalse) tc.SetStoreLimit(2, storelimit.RemovePeer, 60) for i := uint64(1); i <= 5; i++ { - op := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsTrue) checkRemoveOperatorSuccess(c, oc, op) } - op = operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op = operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsFalse) c.Assert(oc.RemoveOperator(op), IsFalse) tc.SetStoreLimit(2, storelimit.RemovePeer, 120) for i := uint64(1); i <= 10; i++ { - op = operator.NewOperator("test", "test", i, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op = operator.NewTestOperator("test", "test", i, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsTrue) checkRemoveOperatorSuccess(c, oc, op) } tc.SetAllStoresLimit(storelimit.RemovePeer, 60) for i := uint64(1); i <= 5; i++ { - op = operator.NewOperator("test", "test", i, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op = operator.NewTestOperator("test", "test", i, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsTrue) checkRemoveOperatorSuccess(c, oc, op) } - op = operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op = operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsFalse) c.Assert(oc.RemoveOperator(op), IsFalse) } @@ -444,7 +444,7 @@ func (t *testOperatorControllerSuite) TestDispatchOutdatedRegion(c *C) { operator.RemovePeer{FromStore: 1}, } - op := operator.NewOperator("test", "test", 1, + op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 0, Version: 0}, operator.OpRegion, steps...) c.Assert(controller.AddOperator(op), IsTrue) @@ -467,7 +467,7 @@ func (t *testOperatorControllerSuite) TestDispatchOutdatedRegion(c *C) { c.Assert(stream.MsgLength(), Equals, 2) // add and dispatch op again, the op should be stale - op = operator.NewOperator("test", "test", 1, + op = operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 0, Version: 0}, operator.OpRegion, steps...) c.Assert(controller.AddOperator(op), IsTrue) @@ -518,7 +518,7 @@ func (t *testOperatorControllerSuite) TestDispatchUnfinishedStep(c *C) { for _, steps := range testSteps { // Create an operator - op := operator.NewOperator("test", "test", 1, epoch, + op := operator.NewTestOperator("test", "test", 1, epoch, operator.OpRegion, steps...) c.Assert(controller.AddOperator(op), IsTrue) c.Assert(stream.MsgLength(), Equals, 1) diff --git a/server/schedule/waiting_operator_test.go b/server/schedule/waiting_operator_test.go index 671be59d672..c1c078cc56d 100644 --- a/server/schedule/waiting_operator_test.go +++ b/server/schedule/waiting_operator_test.go @@ -36,16 +36,16 @@ func (s *testWaitingOperatorSuite) TestRandBuckets(c *C) { } func addOperators(wop WaitingOperator) { - op := operator.NewOperator("testOperatorNormal", "test", uint64(1), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ + op := operator.NewTestOperator("testOperatorNormal", "test", uint64(1), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ operator.RemovePeer{FromStore: uint64(1)}, }...) wop.PutOperator(op) - op = operator.NewOperator("testOperatorHigh", "test", uint64(2), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ + op = operator.NewTestOperator("testOperatorHigh", "test", uint64(2), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ operator.RemovePeer{FromStore: uint64(2)}, }...) op.SetPriorityLevel(core.HighPriority) wop.PutOperator(op) - op = operator.NewOperator("testOperatorLow", "test", uint64(3), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ + op = operator.NewTestOperator("testOperatorLow", "test", uint64(3), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ operator.RemovePeer{FromStore: uint64(3)}, }...) op.SetPriorityLevel(core.LowPriority) @@ -64,7 +64,7 @@ func (s *testWaitingOperatorSuite) TestRandomBucketsWithMergeRegion(c *C) { for j := 0; j < 100; j++ { // adds operators desc := descs[j%3] - op := operator.NewOperator(desc, "test", uint64(1), &metapb.RegionEpoch{}, operator.OpRegion|operator.OpMerge, []operator.OpStep{ + op := operator.NewTestOperator(desc, "test", uint64(1), &metapb.RegionEpoch{}, operator.OpRegion|operator.OpMerge, []operator.OpStep{ operator.MergeRegion{ FromRegion: &metapb.Region{ Id: 1, @@ -79,7 +79,7 @@ func (s *testWaitingOperatorSuite) TestRandomBucketsWithMergeRegion(c *C) { }, }...) rb.PutOperator(op) - op = operator.NewOperator(desc, "test", uint64(2), &metapb.RegionEpoch{}, operator.OpRegion|operator.OpMerge, []operator.OpStep{ + op = operator.NewTestOperator(desc, "test", uint64(2), &metapb.RegionEpoch{}, operator.OpRegion|operator.OpMerge, []operator.OpStep{ operator.MergeRegion{ FromRegion: &metapb.Region{ Id: 1, @@ -94,7 +94,7 @@ func (s *testWaitingOperatorSuite) TestRandomBucketsWithMergeRegion(c *C) { }, }...) rb.PutOperator(op) - op = operator.NewOperator("testOperatorHigh", "test", uint64(3), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ + op = operator.NewTestOperator("testOperatorHigh", "test", uint64(3), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ operator.RemovePeer{FromStore: uint64(3)}, }...) op.SetPriorityLevel(core.HighPriority) diff --git a/tests/server/cluster/cluster_test.go b/tests/server/cluster/cluster_test.go index a35245beae6..c90bd8090a3 100644 --- a/tests/server/cluster/cluster_test.go +++ b/tests/server/cluster/cluster_test.go @@ -244,9 +244,9 @@ func testStateAndLimit(c *C, clusterID uint64, rc *cluster.RaftCluster, grpcPDCl oc := rc.GetOperatorController() rc.SetStoreLimit(storeID, storelimit.AddPeer, 60) rc.SetStoreLimit(storeID, storelimit.RemovePeer, 60) - op := operator.NewOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: storeID, PeerID: 3}) + op := operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: storeID, PeerID: 3}) oc.AddOperator(op) - op = operator.NewOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: storeID}) + op = operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: storeID}) oc.AddOperator(op) resetStoreState(c, rc, store.GetId(), beforeState) @@ -922,21 +922,21 @@ func (s *clusterTestSuite) TestOfflineStoreLimit(c *C) { opt.SetAllStoresLimit(storelimit.RemovePeer, 1) // only can add 5 remove peer operators on store 1 for i := uint64(1); i <= 5; i++ { - op := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) + op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) c.Assert(oc.AddOperator(op), IsTrue) c.Assert(oc.RemoveOperator(op), IsTrue) } - op := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) + op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) c.Assert(oc.AddOperator(op), IsFalse) c.Assert(oc.RemoveOperator(op), IsFalse) // only can add 5 remove peer operators on store 2 for i := uint64(1); i <= 5; i++ { - op := operator.NewOperator("test", "test", 2, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op := operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsTrue) c.Assert(oc.RemoveOperator(op), IsTrue) } - op = operator.NewOperator("test", "test", 2, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op = operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsFalse) c.Assert(oc.RemoveOperator(op), IsFalse) @@ -945,11 +945,11 @@ func (s *clusterTestSuite) TestOfflineStoreLimit(c *C) { // only can add 5 remove peer operators on store 2 for i := uint64(1); i <= 5; i++ { - op := operator.NewOperator("test", "test", 2, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op := operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsTrue) c.Assert(oc.RemoveOperator(op), IsTrue) } - op = operator.NewOperator("test", "test", 2, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op = operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsFalse) c.Assert(oc.RemoveOperator(op), IsFalse) @@ -959,7 +959,7 @@ func (s *clusterTestSuite) TestOfflineStoreLimit(c *C) { // can add unlimited remove peer operators on store 1 for i := uint64(1); i <= 30; i++ { - op := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) + op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) c.Assert(oc.AddOperator(op), IsTrue) c.Assert(oc.RemoveOperator(op), IsTrue) } @@ -1011,11 +1011,11 @@ func (s *clusterTestSuite) TestUpgradeStoreLimit(c *C) { oc := rc.GetOperatorController() // only can add 5 remove peer operators on store 1 for i := uint64(1); i <= 5; i++ { - op := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) + op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) c.Assert(oc.AddOperator(op), IsTrue) c.Assert(oc.RemoveOperator(op), IsTrue) } - op := operator.NewOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) + op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) c.Assert(oc.AddOperator(op), IsFalse) c.Assert(oc.RemoveOperator(op), IsFalse) } From 98e6c9ad0a487a55e6f8375e30a8fc42f3b097d5 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Tue, 14 Dec 2021 12:00:45 +0800 Subject: [PATCH 02/25] lint check Signed-off-by: bufferflies <1045931706@qq.com> --- server/schedule/operator/operator_test.go | 2 +- server/schedule/operator_controller_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/server/schedule/operator/operator_test.go b/server/schedule/operator/operator_test.go index 7ff25b0632a..ffb46d02085 100644 --- a/server/schedule/operator/operator_test.go +++ b/server/schedule/operator/operator_test.go @@ -97,7 +97,7 @@ func (s *testOperatorSuite) TestOperatorStep(c *C) { //nolint func (s *testOperatorSuite) newTestOperator(regionID uint64, kind OpKind, steps ...OpStep) *Operator { - return NewOperator("test", "test", regionID, &metapb.RegionEpoch{}, kind, mockRegionSize, steps...) + return NewTestOperator("test", "test", regionID, &metapb.RegionEpoch{}, kind, steps...) } func (s *testOperatorSuite) checkSteps(c *C, op *Operator, steps []OpStep) { diff --git a/server/schedule/operator_controller_test.go b/server/schedule/operator_controller_test.go index 44a4c205b43..9c3bd5989a4 100644 --- a/server/schedule/operator_controller_test.go +++ b/server/schedule/operator_controller_test.go @@ -177,7 +177,7 @@ func (t *testOperatorControllerSuite) TestFastFailWithUnhealthyStore(c *C) { tc.AddLeaderRegion(1, 1, 2) region := tc.GetRegion(1) steps := []operator.OpStep{operator.TransferLeader{ToStore: 2}} - op := operator.NewOperator("test", "test", 1, region.GetRegionEpoch(), operator.OpLeader, steps...) + op := operator.NewTestOperator("test", "test", 1, region.GetRegionEpoch(), operator.OpLeader, steps...) oc.SetOperator(op) c.Assert(oc.checkStaleOperator(op, steps[0], region), IsFalse) tc.SetStoreDown(2) From ff4e389cefec6449613967dcbf9be1ede1fa4c19 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Wed, 5 Jan 2022 18:39:21 +0800 Subject: [PATCH 03/25] adds timeoutfactor property to operator struct, adds max-region-size and operator-time-factor to scheduler config && close 4547 Signed-off-by: bufferflies <1045931706@qq.com> --- server/config/config.go | 17 +++++++++++++++++ server/config/persist_options.go | 12 ++++++++++++ server/handler.go | 4 ++-- server/schedule/checker/merge_checker.go | 11 +++++++++-- server/schedule/checker/split_checker.go | 2 +- server/schedule/operator/builder.go | 2 +- server/schedule/operator/create_operator.go | 11 ++++++----- server/schedule/operator/operator.go | 11 ++++++++--- server/schedule/region_splitter.go | 2 +- 9 files changed, 57 insertions(+), 15 deletions(-) diff --git a/server/config/config.go b/server/config/config.go index da850fbf7f1..af57221134f 100644 --- a/server/config/config.go +++ b/server/config/config.go @@ -742,6 +742,15 @@ type ScheduleConfig struct { // The day of hot regions data to be reserved. 0 means close. HotRegionsReservedDays int64 `toml:"hot-regions-reserved-days" json:"hot-regions-reserved-days"` + + // OperatorTimeFactor is the time factor for operator. + // The max duration of one operator step is: multi(region_size, factor) + // Default: 6 s/MB + OperatorTimeFactor uint64 `toml:"operator-time-factor" json:"operator-time-factor"` + + // MaxRegionSize is the max size of region. + // default: 96MB + MaxRegionSize uint64 `toml:"max-region-size" json:"max-region-size"` } // Clone returns a cloned scheduling configuration. @@ -789,6 +798,8 @@ const ( defaultEnableCrossTableMerge = true defaultHotRegionsWriteInterval = 10 * time.Minute defaultHotRegionsResevervedDays = 0 + defaultOperatorTimeFactor = 6 + defaultMaxRegionSize = 96 ) func (c *ScheduleConfig) adjust(meta *configMetaData, reloading bool) error { @@ -843,6 +854,12 @@ func (c *ScheduleConfig) adjust(meta *configMetaData, reloading bool) error { if !meta.IsDefined("enable-cross-table-merge") { c.EnableCrossTableMerge = defaultEnableCrossTableMerge } + if !meta.IsDefined("operator-time-factor") { + adjustUint64(&c.OperatorTimeFactor, defaultOperatorTimeFactor) + } + if !meta.IsDefined("operator-time-factor") { + adjustUint64(&c.MaxRegionSize, defaultMaxRegionSize) + } adjustFloat64(&c.LowSpaceRatio, defaultLowSpaceRatio) adjustFloat64(&c.HighSpaceRatio, defaultHighSpaceRatio) diff --git a/server/config/persist_options.go b/server/config/persist_options.go index 6ca1e0c7f9c..f553f126932 100644 --- a/server/config/persist_options.go +++ b/server/config/persist_options.go @@ -192,6 +192,8 @@ const ( hotRegionScheduleLimitKey = "schedule.hot-region-schedule-limit" schedulerMaxWaitingOperatorKey = "schedule.scheduler-max-waiting-operator" enableLocationReplacement = "schedule.enable-location-replacement" + operatorTimeFactorKey = "schedule.operator-time-factor" + maxRegionSizeKey = "schedule.max-region-size" ) var supportedTTLConfigs = []string{ @@ -245,6 +247,16 @@ func (o *PersistOptions) GetSplitMergeInterval() time.Duration { return o.GetScheduleConfig().SplitMergeInterval.Duration } +// GetOperatorTimeFactor returns the operator time factor. +func (o *PersistOptions) GetOperatorTimeFactor() uint64 { + return o.getTTLUintOr(operatorTimeFactorKey, o.GetScheduleConfig().OperatorTimeFactor) +} + +// GetMaxRegionSize returns the max size of regions. +func (o *PersistOptions) GetMaxRegionSize() uint64 { + return o.getTTLUintOr(maxRegionSizeKey, o.GetScheduleConfig().MaxRegionSize) +} + // SetSplitMergeInterval to set the interval between finishing split and starting to merge. It's only used to test. func (o *PersistOptions) SetSplitMergeInterval(splitMergeInterval time.Duration) { v := o.GetScheduleConfig().Clone() diff --git a/server/handler.go b/server/handler.go index 8125f26b454..499e32912ee 100644 --- a/server/handler.go +++ b/server/handler.go @@ -769,8 +769,8 @@ func (h *Handler) AddSplitRegionOperator(regionID uint64, policyStr string, keys splitKeys = append(splitKeys, k) } } - - op, err := operator.CreateSplitRegionOperator("admin-split-region", region, operator.OpAdmin, pdpb.CheckPolicy(policy), splitKeys) + factor := h.opt.GetOperatorTimeFactor() + op, err := operator.CreateSplitRegionOperator("admin-split-region", region, operator.OpAdmin, pdpb.CheckPolicy(policy), splitKeys, factor) if err != nil { return err } diff --git a/server/schedule/checker/merge_checker.go b/server/schedule/checker/merge_checker.go index 15175b2c538..a832c48af42 100644 --- a/server/schedule/checker/merge_checker.go +++ b/server/schedule/checker/merge_checker.go @@ -32,7 +32,10 @@ import ( "github.com/tikv/pd/server/schedule/placement" ) -const maxTargetRegionSize = 500 +const ( + maxTargetRegionSize = 500 + maxRegionFactor = 5 +) // When a region has label `merge_option=deny`, skip merging the region. // If label value is `allow` or other value, it will be treated as `allow`. @@ -145,7 +148,11 @@ func (m *MergeChecker) Check(region *core.RegionInfo) []*operator.Operator { return nil } - if target.GetApproximateSize() > maxTargetRegionSize { + maxSize := int64(maxRegionFactor * m.cluster.GetOpts().GetMaxRegionSize()) + if maxSize < maxTargetRegionSize { + maxSize = maxTargetRegionSize + } + if target.GetApproximateSize() > maxSize { checkerCounter.WithLabelValues("merge_checker", "target-too-large").Inc() return nil } diff --git a/server/schedule/checker/split_checker.go b/server/schedule/checker/split_checker.go index 0cbcb64ad1c..55e17feee62 100644 --- a/server/schedule/checker/split_checker.go +++ b/server/schedule/checker/split_checker.go @@ -72,7 +72,7 @@ func (c *SplitChecker) Check(region *core.RegionInfo) *operator.Operator { return nil } - op, err := operator.CreateSplitRegionOperator(desc, region, 0, pdpb.CheckPolicy_USEKEY, keys) + op, err := operator.CreateSplitRegionOperator(desc, region, 0, pdpb.CheckPolicy_USEKEY, keys, c.cluster.GetOpts().GetOperatorTimeFactor()) if err != nil { log.Debug("create split region operator failed", errs.ZapError(err)) return nil diff --git a/server/schedule/operator/builder.go b/server/schedule/operator/builder.go index 765ddde7534..d2d04b4009f 100644 --- a/server/schedule/operator/builder.go +++ b/server/schedule/operator/builder.go @@ -334,7 +334,7 @@ func (b *Builder) Build(kind OpKind) (*Operator, error) { return nil, b.err } - return NewOperator(b.desc, brief, b.regionID, b.regionEpoch, kind, b.approximateSize, b.steps...), nil + return NewOperator(b.desc, brief, b.regionID, b.regionEpoch, kind, b.approximateSize, b.cluster.GetOpts().GetOperatorTimeFactor(), b.steps...), nil } // Initialize intermediate states. diff --git a/server/schedule/operator/create_operator.go b/server/schedule/operator/create_operator.go index bc156a600f4..43a1674fd53 100644 --- a/server/schedule/operator/create_operator.go +++ b/server/schedule/operator/create_operator.go @@ -114,7 +114,7 @@ func CreateMoveLeaderOperator(desc string, cluster opt.Cluster, region *core.Reg } // CreateSplitRegionOperator creates an operator that splits a region. -func CreateSplitRegionOperator(desc string, region *core.RegionInfo, kind OpKind, policy pdpb.CheckPolicy, keys [][]byte) (*Operator, error) { +func CreateSplitRegionOperator(desc string, region *core.RegionInfo, kind OpKind, policy pdpb.CheckPolicy, keys [][]byte, factor uint64) (*Operator, error) { if core.IsInJointState(region.GetPeers()...) { return nil, errors.Errorf("cannot split region which is in joint state") } @@ -133,7 +133,7 @@ func CreateSplitRegionOperator(desc string, region *core.RegionInfo, kind OpKind } brief += fmt.Sprintf(" and keys %v", hexKeys) } - return NewOperator(desc, brief, region.GetID(), region.GetRegionEpoch(), kind|OpSplit, region.GetApproximateSize(), step), nil + return NewOperator(desc, brief, region.GetID(), region.GetRegionEpoch(), kind|OpSplit, region.GetApproximateSize(), factor, step), nil } // CreateMergeRegionOperator creates an operator that merge two region into one. @@ -169,8 +169,9 @@ func CreateMergeRegionOperator(desc string, cluster opt.Cluster, source *core.Re }) brief := fmt.Sprintf("merge: region %v to %v", source.GetID(), target.GetID()) - op1 := NewOperator(desc, brief, source.GetID(), source.GetRegionEpoch(), kind|OpMerge, source.GetApproximateSize(), steps...) - op2 := NewOperator(desc, brief, target.GetID(), target.GetRegionEpoch(), kind|OpMerge, target.GetApproximateSize(), MergeRegion{ + factor := cluster.GetOpts().GetOperatorTimeFactor() + op1 := NewOperator(desc, brief, source.GetID(), source.GetRegionEpoch(), kind|OpMerge, source.GetApproximateSize(), factor, steps...) + op2 := NewOperator(desc, brief, target.GetID(), target.GetRegionEpoch(), kind|OpMerge, target.GetApproximateSize(), factor, MergeRegion{ FromRegion: source.GetMeta(), ToRegion: target.GetMeta(), IsPassive: true, @@ -274,5 +275,5 @@ func CreateLeaveJointStateOperator(desc string, cluster opt.Cluster, origin *cor } b.execChangePeerV2(false, true) - return NewOperator(b.desc, brief, b.regionID, b.regionEpoch, kind, origin.GetApproximateSize(), b.steps...), nil + return NewOperator(b.desc, brief, b.regionID, b.regionEpoch, kind, origin.GetApproximateSize(), cluster.GetOpts().GetOperatorTimeFactor(), b.steps...), nil } diff --git a/server/schedule/operator/operator.go b/server/schedule/operator/operator.go index 6e60b674526..199f124e5da 100644 --- a/server/schedule/operator/operator.go +++ b/server/schedule/operator/operator.go @@ -55,10 +55,11 @@ type Operator struct { FinishedCounters []prometheus.Counter AdditionalInfos map[string]string ApproximateSize int64 + TimeFactor uint64 } // NewOperator creates a new operator. -func NewOperator(desc, brief string, regionID uint64, regionEpoch *metapb.RegionEpoch, kind OpKind, approximateSize int64, steps ...OpStep) *Operator { +func NewOperator(desc, brief string, regionID uint64, regionEpoch *metapb.RegionEpoch, kind OpKind, approximateSize int64, factor uint64, steps ...OpStep) *Operator { level := core.NormalPriority if kind&OpAdmin != 0 { level = core.HighPriority @@ -75,6 +76,7 @@ func NewOperator(desc, brief string, regionID uint64, regionEpoch *metapb.Region level: level, AdditionalInfos: make(map[string]string), ApproximateSize: approximateSize, + TimeFactor: factor, } } @@ -369,9 +371,12 @@ func (o *Operator) GetAdditionalInfo() string { } // mock region default region size is 96Mb. -const mockRegionSize = 96 * (1 << 20) +const ( + mockRegionSize = 96 * (1 << 20) + mockFactor = 6 +) // NewTestOperator creates a test operator. func NewTestOperator(desc, brief string, regionID uint64, regionEpoch *metapb.RegionEpoch, kind OpKind, steps ...OpStep) *Operator { - return NewOperator(desc, brief, regionID, regionEpoch, kind, mockRegionSize, steps...) + return NewOperator(desc, brief, regionID, regionEpoch, kind, mockRegionSize, mockFactor, steps...) } diff --git a/server/schedule/region_splitter.go b/server/schedule/region_splitter.go index ce4d84e31dc..67a3fd3ee1a 100644 --- a/server/schedule/region_splitter.go +++ b/server/schedule/region_splitter.go @@ -182,7 +182,7 @@ type splitRegionsHandler struct { } func (h *splitRegionsHandler) SplitRegionByKeys(region *core.RegionInfo, splitKeys [][]byte) error { - op, err := operator.CreateSplitRegionOperator("region-splitter", region, 0, pdpb.CheckPolicy_USEKEY, splitKeys) + op, err := operator.CreateSplitRegionOperator("region-splitter", region, 0, pdpb.CheckPolicy_USEKEY, splitKeys, h.cluster.GetOpts().GetOperatorTimeFactor()) if err != nil { return err } From 59a0dffd64c8b226e116e9254e4f2ca23e2315bd Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Wed, 5 Jan 2022 20:17:07 +0800 Subject: [PATCH 04/25] pass ut Signed-off-by: bufferflies <1045931706@qq.com> --- server/schedule/operator/create_operator_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/schedule/operator/create_operator_test.go b/server/schedule/operator/create_operator_test.go index 6f0f63a6a62..98614157d7b 100644 --- a/server/schedule/operator/create_operator_test.go +++ b/server/schedule/operator/create_operator_test.go @@ -136,7 +136,7 @@ func (s *testCreateOperatorSuite) TestCreateSplitRegionOperator(c *C) { EndKey: tc.endKey, Peers: tc.originPeers, }, tc.originPeers[0]) - op, err := CreateSplitRegionOperator("test", region, 0, tc.policy, tc.keys) + op, err := CreateSplitRegionOperator("test", region, 0, tc.policy, tc.keys, 6) if tc.expectedError { c.Assert(err, NotNil) continue From b792225a5a1203b373e2e75228ca4e08eed5dad2 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Thu, 13 Jan 2022 17:36:23 +0800 Subject: [PATCH 05/25] replace max-region-size Signed-off-by: bufferflies <1045931706@qq.com> --- server/config/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/config/config.go b/server/config/config.go index b058683639e..f2b523f7990 100644 --- a/server/config/config.go +++ b/server/config/config.go @@ -855,7 +855,7 @@ func (c *ScheduleConfig) adjust(meta *configMetaData, reloading bool) error { if !meta.IsDefined("operator-time-factor") { adjustUint64(&c.OperatorTimeFactor, defaultOperatorTimeFactor) } - if !meta.IsDefined("operator-time-factor") { + if !meta.IsDefined("max-region-size") { adjustUint64(&c.MaxRegionSize, defaultMaxRegionSize) } adjustFloat64(&c.LowSpaceRatio, defaultLowSpaceRatio) From 8a5a203f61e2a8c6db3062457ddfe484468ab51c Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Mon, 24 Jan 2022 22:34:09 +0800 Subject: [PATCH 06/25] check all in step Signed-off-by: bufferflies <1045931706@qq.com> --- server/config/config.go | 13 ++--- server/config/persist_options.go | 5 -- server/core/store.go | 42 ++++++++++------ server/handler.go | 3 +- server/schedule/checker/merge_checker.go | 4 +- server/schedule/checker/split_checker.go | 2 +- server/schedule/operator/builder.go | 21 +++++--- server/schedule/operator/create_operator.go | 12 ++--- server/schedule/operator/operator.go | 29 +++++------ server/schedule/operator/status_tracker.go | 14 ++++++ server/schedule/operator/step.go | 54 +++++++++++++++++++++ server/schedule/region_splitter.go | 2 +- 12 files changed, 133 insertions(+), 68 deletions(-) diff --git a/server/config/config.go b/server/config/config.go index d01d44a1a7b..e31c118a765 100644 --- a/server/config/config.go +++ b/server/config/config.go @@ -741,14 +741,10 @@ type ScheduleConfig struct { // The day of hot regions data to be reserved. 0 means close. HotRegionsReservedDays uint64 `toml:"hot-regions-reserved-days" json:"hot-regions-reserved-days"` - // OperatorTimeFactor is the time factor for operator. - // The max duration of one operator step is: multi(region_size, factor) - // Default: 6 s/MB - OperatorTimeFactor uint64 `toml:"operator-time-factor" json:"operator-time-factor"` - // MaxRegionSize is the max size of region. - // default: 96MB - MaxRegionSize uint64 `toml:"max-region-size" json:"max-region-size"` + // It's dangerous to change it. + // Default: 96MB + MaxRegionSize uint64 `toml:"max-region-size"` } // Clone returns a cloned scheduling configuration. @@ -853,9 +849,6 @@ func (c *ScheduleConfig) adjust(meta *configMetaData, reloading bool) error { if !meta.IsDefined("enable-cross-table-merge") { c.EnableCrossTableMerge = defaultEnableCrossTableMerge } - if !meta.IsDefined("operator-time-factor") { - adjustUint64(&c.OperatorTimeFactor, defaultOperatorTimeFactor) - } if !meta.IsDefined("max-region-size") { adjustUint64(&c.MaxRegionSize, defaultMaxRegionSize) } diff --git a/server/config/persist_options.go b/server/config/persist_options.go index d6282e72bb6..0bb5f60edf8 100644 --- a/server/config/persist_options.go +++ b/server/config/persist_options.go @@ -247,11 +247,6 @@ func (o *PersistOptions) GetSplitMergeInterval() time.Duration { return o.GetScheduleConfig().SplitMergeInterval.Duration } -// GetOperatorTimeFactor returns the operator time factor. -func (o *PersistOptions) GetOperatorTimeFactor() uint64 { - return o.getTTLUintOr(operatorTimeFactorKey, o.GetScheduleConfig().OperatorTimeFactor) -} - // GetMaxRegionSize returns the max size of regions. func (o *PersistOptions) GetMaxRegionSize() uint64 { return o.getTTLUintOr(maxRegionSizeKey, o.GetScheduleConfig().MaxRegionSize) diff --git a/server/core/store.go b/server/core/store.go index c01ff2a9c6e..892497794b0 100644 --- a/server/core/store.go +++ b/server/core/store.go @@ -42,33 +42,37 @@ const ( EngineTiFlash = "tiflash" // EngineTiKV indicates the tikv engine in metrics EngineTiKV = "tikv" + + OperatorExecutorRate = 6.0 ) // StoreInfo contains information about a store. type StoreInfo struct { meta *metapb.Store *storeStats - pauseLeaderTransfer bool // not allow to be used as source or target of transfer leader - slowStoreEvicted bool // this store has been evicted as a slow store, should not transfer leader to it - leaderCount int - regionCount int - leaderSize int64 - regionSize int64 - pendingPeerCount int - lastPersistTime time.Time - leaderWeight float64 - regionWeight float64 - limiter map[storelimit.Type]*storelimit.StoreLimit + pauseLeaderTransfer bool // not allow to be used as source or target of transfer leader + slowStoreEvicted bool // this store has been evicted as a slow store, should not transfer leader to it + leaderCount int + regionCount int + leaderSize int64 + regionSize int64 + pendingPeerCount int + lastPersistTime time.Time + leaderWeight float64 + regionWeight float64 + limiter map[storelimit.Type]*storelimit.StoreLimit + operatorExecutorRate float64 } // NewStoreInfo creates StoreInfo with meta data. func NewStoreInfo(store *metapb.Store, opts ...StoreCreateOption) *StoreInfo { storeInfo := &StoreInfo{ - meta: store, - storeStats: newStoreStats(), - leaderWeight: 1.0, - regionWeight: 1.0, - limiter: make(map[storelimit.Type]*storelimit.StoreLimit), + meta: store, + storeStats: newStoreStats(), + leaderWeight: 1.0, + regionWeight: 1.0, + limiter: make(map[storelimit.Type]*storelimit.StoreLimit), + operatorExecutorRate: OperatorExecutorRate, } for _, opt := range opts { opt(storeInfo) @@ -267,6 +271,12 @@ func (s *StoreInfo) GetStoreLimit(limitType storelimit.Type) *storelimit.StoreLi return s.limiter[limitType] } +func (s *StoreInfo) GetOperatorExecutorRate() float64 { + s.mu.RLock() + defer s.mu.RUnlock() + return s.operatorExecutorRate +} + const minWeight = 1e-6 const maxScore = 1024 * 1024 * 1024 diff --git a/server/handler.go b/server/handler.go index 6c0db22ff0b..afe4a8c3ab8 100644 --- a/server/handler.go +++ b/server/handler.go @@ -778,8 +778,7 @@ func (h *Handler) AddSplitRegionOperator(regionID uint64, policyStr string, keys splitKeys = append(splitKeys, k) } } - factor := h.opt.GetOperatorTimeFactor() - op, err := operator.CreateSplitRegionOperator("admin-split-region", region, operator.OpAdmin, pdpb.CheckPolicy(policy), splitKeys, factor) + op, err := operator.CreateSplitRegionOperator("admin-split-region", region, operator.OpAdmin, pdpb.CheckPolicy(policy), splitKeys) if err != nil { return err } diff --git a/server/schedule/checker/merge_checker.go b/server/schedule/checker/merge_checker.go index 2f5f3b2a230..681f09ef6dc 100644 --- a/server/schedule/checker/merge_checker.go +++ b/server/schedule/checker/merge_checker.go @@ -34,7 +34,7 @@ import ( const ( maxTargetRegionSize = 500 - maxRegionFactor = 5 + maxRegionSizeFactor = 5 ) // When a region has label `merge_option=deny`, skip merging the region. @@ -148,7 +148,7 @@ func (m *MergeChecker) Check(region *core.RegionInfo) []*operator.Operator { return nil } - maxSize := int64(maxRegionFactor * m.cluster.GetOpts().GetMaxRegionSize()) + maxSize := int64(maxRegionSizeFactor * m.cluster.GetOpts().GetMaxRegionSize()) if maxSize < maxTargetRegionSize { maxSize = maxTargetRegionSize } diff --git a/server/schedule/checker/split_checker.go b/server/schedule/checker/split_checker.go index 781e2ba59a7..1b0db44df47 100644 --- a/server/schedule/checker/split_checker.go +++ b/server/schedule/checker/split_checker.go @@ -72,7 +72,7 @@ func (c *SplitChecker) Check(region *core.RegionInfo) *operator.Operator { return nil } - op, err := operator.CreateSplitRegionOperator(desc, region, 0, pdpb.CheckPolicy_USEKEY, keys, c.cluster.GetOpts().GetOperatorTimeFactor()) + op, err := operator.CreateSplitRegionOperator(desc, region, 0, pdpb.CheckPolicy_USEKEY, keys) if err != nil { log.Debug("create split region operator failed", errs.ZapError(err)) return nil diff --git a/server/schedule/operator/builder.go b/server/schedule/operator/builder.go index 300b9ada109..08a25342d90 100644 --- a/server/schedule/operator/builder.go +++ b/server/schedule/operator/builder.go @@ -48,12 +48,13 @@ type ClusterInformer interface { type Builder struct { // basic info ClusterInformer - desc string - regionID uint64 - regionEpoch *metapb.RegionEpoch - rules []*placement.Rule - expectedRoles map[uint64]placement.PeerRoleType - approximateSize int64 + desc string + regionID uint64 + regionEpoch *metapb.RegionEpoch + rules []*placement.Rule + expectedRoles map[uint64]placement.PeerRoleType + approximateSize int64 + operatorExecutorRate float64 // operation record originPeers peersMap @@ -160,6 +161,9 @@ func NewBuilder(desc string, ci ClusterInformer, region *core.RegionInfo, opts . b.targetPeers = originPeers.Copy() b.useJointConsensus = supportJointConsensus && b.GetOpts().IsUseJointConsensus() b.err = err + if originLeaderStore := b.GetBasicCluster().GetStore(b.originLeaderStoreID); originLeaderStore != nil { + b.operatorExecutorRate = originLeaderStore.GetOperatorExecutorRate() + } return b } @@ -361,7 +365,7 @@ func (b *Builder) Build(kind OpKind) (*Operator, error) { return nil, b.err } - return NewOperator(b.desc, brief, b.regionID, b.regionEpoch, kind, b.approximateSize, b.GetOpts().GetOperatorTimeFactor(), b.steps...), nil + return NewOperator(b.desc, brief, b.regionID, b.regionEpoch, kind, b.steps...), nil } // Initialize intermediate states. @@ -673,7 +677,8 @@ func (b *Builder) execPromoteLearner(peer *metapb.Peer) { func (b *Builder) execAddPeer(peer *metapb.Peer) { if b.lightWeight { - b.steps = append(b.steps, AddLearner{ToStore: peer.GetStoreId(), PeerID: peer.GetId(), IsLightWeight: b.lightWeight}) + b.steps = append(b.steps, AddLearner{ToStore: peer.GetStoreId(), PeerID: peer.GetId(), + IsLightWeight: b.lightWeight, regionSize: b.approximateSize, rate: b.operatorExecutorRate}) } else { b.steps = append(b.steps, AddLearner{ToStore: peer.GetStoreId(), PeerID: peer.GetId()}) } diff --git a/server/schedule/operator/create_operator.go b/server/schedule/operator/create_operator.go index 12ac2984e03..caeb6dd033c 100644 --- a/server/schedule/operator/create_operator.go +++ b/server/schedule/operator/create_operator.go @@ -114,7 +114,7 @@ func CreateMoveLeaderOperator(desc string, ci ClusterInformer, region *core.Regi } // CreateSplitRegionOperator creates an operator that splits a region. -func CreateSplitRegionOperator(desc string, region *core.RegionInfo, kind OpKind, policy pdpb.CheckPolicy, keys [][]byte, factor uint64) (*Operator, error) { +func CreateSplitRegionOperator(desc string, region *core.RegionInfo, kind OpKind, policy pdpb.CheckPolicy, keys [][]byte) (*Operator, error) { if core.IsInJointState(region.GetPeers()...) { return nil, errors.Errorf("cannot split region which is in joint state") } @@ -133,7 +133,7 @@ func CreateSplitRegionOperator(desc string, region *core.RegionInfo, kind OpKind } brief += fmt.Sprintf(" and keys %v", hexKeys) } - return NewOperator(desc, brief, region.GetID(), region.GetRegionEpoch(), kind|OpSplit, region.GetApproximateSize(), factor, step), nil + return NewOperator(desc, brief, region.GetID(), region.GetRegionEpoch(), kind|OpSplit, step), nil } // CreateMergeRegionOperator creates an operator that merge two region into one. @@ -169,9 +169,9 @@ func CreateMergeRegionOperator(desc string, ci ClusterInformer, source *core.Reg }) brief := fmt.Sprintf("merge: region %v to %v", source.GetID(), target.GetID()) - factor := cluster.GetOpts().GetOperatorTimeFactor() - op1 := NewOperator(desc, brief, source.GetID(), source.GetRegionEpoch(), kind|OpMerge, source.GetApproximateSize(), factor, steps...) - op2 := NewOperator(desc, brief, target.GetID(), target.GetRegionEpoch(), kind|OpMerge, target.GetApproximateSize(), factor, MergeRegion{ + + op1 := NewOperator(desc, brief, source.GetID(), source.GetRegionEpoch(), kind|OpMerge, steps...) + op2 := NewOperator(desc, brief, target.GetID(), target.GetRegionEpoch(), kind|OpMerge, MergeRegion{ FromRegion: source.GetMeta(), ToRegion: target.GetMeta(), IsPassive: true, @@ -275,5 +275,5 @@ func CreateLeaveJointStateOperator(desc string, ci ClusterInformer, origin *core } b.execChangePeerV2(false, true) - return NewOperator(b.desc, brief, b.regionID, b.regionEpoch, kind, origin.GetApproximateSize(), cluster.GetOpts().GetOperatorTimeFactor(), b.steps...), nil + return NewOperator(b.desc, brief, b.regionID, b.regionEpoch, kind, b.steps...), nil } diff --git a/server/schedule/operator/operator.go b/server/schedule/operator/operator.go index 199f124e5da..556fd877064 100644 --- a/server/schedule/operator/operator.go +++ b/server/schedule/operator/operator.go @@ -54,12 +54,10 @@ type Operator struct { Counters []prometheus.Counter FinishedCounters []prometheus.Counter AdditionalInfos map[string]string - ApproximateSize int64 - TimeFactor uint64 } // NewOperator creates a new operator. -func NewOperator(desc, brief string, regionID uint64, regionEpoch *metapb.RegionEpoch, kind OpKind, approximateSize int64, factor uint64, steps ...OpStep) *Operator { +func NewOperator(desc, brief string, regionID uint64, regionEpoch *metapb.RegionEpoch, kind OpKind, steps ...OpStep) *Operator { level := core.NormalPriority if kind&OpAdmin != 0 { level = core.HighPriority @@ -75,8 +73,6 @@ func NewOperator(desc, brief string, regionID uint64, regionEpoch *metapb.Region status: NewOpStatusTracker(), level: level, AdditionalInfos: make(map[string]string), - ApproximateSize: approximateSize, - TimeFactor: factor, } } @@ -85,9 +81,9 @@ func (o *Operator) String() string { for i := range o.steps { stepStrs[i] = o.steps[i].String() } - s := fmt.Sprintf("%s {%s} (kind:%s, region:%v(%v,%v), createAt:%s, startAt:%s, currentStep:%v, size:%v,steps:[%s])", + s := fmt.Sprintf("%s {%s} (kind:%s, region:%v(%v,%v), createAt:%s, startAt:%s, currentStep:%v,steps:[%s])", o.desc, o.brief, o.kind, o.regionID, o.regionEpoch.GetVersion(), o.regionEpoch.GetConfVer(), o.GetCreateTime(), - o.GetStartTime(), atomic.LoadInt32(&o.currentStep), o.ApproximateSize, strings.Join(stepStrs, ", ")) + o.GetStartTime(), atomic.LoadInt32(&o.currentStep), strings.Join(stepStrs, ", ")) if o.CheckSuccess() { s += " finished" } @@ -230,10 +226,15 @@ func (o *Operator) CheckTimeout() bool { if o.CheckSuccess() { return false } - if o.kind&OpRegion != 0 { - return o.status.CheckTimeout(SlowOperatorWaitTime) + currentStep := int(atomic.LoadInt32(&o.currentStep)) + var startTime time.Time + if currentStep == 0 { + startTime = o.GetStartTime() + } else { + startTime = time.Unix(0, atomic.LoadInt64(&(o.stepsTime[currentStep-1]))) } - return o.status.CheckTimeout(FastOperatorWaitTime) + step := o.steps[currentStep] + return o.status.CheckStepTimeout(startTime, step) } // Len returns the operator's steps count. @@ -370,13 +371,7 @@ func (o *Operator) GetAdditionalInfo() string { return "" } -// mock region default region size is 96Mb. -const ( - mockRegionSize = 96 * (1 << 20) - mockFactor = 6 -) - // NewTestOperator creates a test operator. func NewTestOperator(desc, brief string, regionID uint64, regionEpoch *metapb.RegionEpoch, kind OpKind, steps ...OpStep) *Operator { - return NewOperator(desc, brief, regionID, regionEpoch, kind, mockRegionSize, mockFactor, steps...) + return NewOperator(desc, brief, regionID, regionEpoch, kind, steps...) } diff --git a/server/schedule/operator/status_tracker.go b/server/schedule/operator/status_tracker.go index ca051990235..930b8b7698f 100644 --- a/server/schedule/operator/status_tracker.go +++ b/server/schedule/operator/status_tracker.go @@ -128,6 +128,20 @@ func (trk *OpStatusTracker) CheckTimeout(wait time.Duration) bool { return trk.current == TIMEOUT } +// CheckStepTimeout checks if timeout, and update the current status. +func (trk *OpStatusTracker) CheckStepTimeout(start time.Time, step OpStep) bool { + trk.rw.Lock() + defer trk.rw.Unlock() + if trk.current == STARTED { + if !step.TimeOut(start) { + return false + } + _ = trk.toLocked(TIMEOUT) + return true + } + return trk.current == TIMEOUT +} + // String implements fmt.Stringer. func (trk *OpStatusTracker) String() string { trk.rw.RLock() diff --git a/server/schedule/operator/step.go b/server/schedule/operator/step.go index 505e875a54f..48554308db1 100644 --- a/server/schedule/operator/step.go +++ b/server/schedule/operator/step.go @@ -18,6 +18,7 @@ import ( "bytes" "fmt" "strings" + "time" "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/eraftpb" @@ -37,6 +38,7 @@ type OpStep interface { IsFinish(region *core.RegionInfo) bool CheckInProgress(ci ClusterInformer, region *core.RegionInfo) error Influence(opInfluence OpInfluence, region *core.RegionInfo) + TimeOut(start time.Time) bool } // TransferLeader is an OpStep that transfers a region's leader. @@ -99,6 +101,11 @@ func (tl TransferLeader) Influence(opInfluence OpInfluence, region *core.RegionI to.LeaderCount++ } +// TimeOut returns true if the step is timeout. +func (tl TransferLeader) TimeOut(start time.Time) bool { + return time.Since(start) > FastOperatorWaitTime +} + // AddPeer is an OpStep that adds a region peer. type AddPeer struct { ToStore, PeerID uint64 @@ -152,10 +159,17 @@ func (ap AddPeer) CheckInProgress(ci ClusterInformer, region *core.RegionInfo) e return nil } +// TimeOut returns true if the step is timeout. +func (ap AddPeer) TimeOut(start time.Time) bool { + return time.Since(start) > FastOperatorWaitTime +} + // AddLearner is an OpStep that adds a region learner peer. type AddLearner struct { ToStore, PeerID uint64 IsLightWeight bool + regionSize int64 + rate float64 } // ConfVerChanged returns the delta value for version increased by this step. @@ -211,6 +225,11 @@ func (al AddLearner) Influence(opInfluence OpInfluence, region *core.RegionInfo) to.AdjustStepCost(storelimit.AddPeer, regionSize) } +// TimeOut returns true if the step is timeout. +func (al AddLearner) TimeOut(start time.Time) bool { + return time.Since(start) > FastOperatorWaitTime +} + // PromoteLearner is an OpStep that promotes a region learner peer to normal voter. type PromoteLearner struct { ToStore, PeerID uint64 @@ -249,6 +268,11 @@ func (pl PromoteLearner) CheckInProgress(_ ClusterInformer, region *core.RegionI // Influence calculates the store difference that current step makes. func (pl PromoteLearner) Influence(opInfluence OpInfluence, region *core.RegionInfo) {} +// TimeOut returns true if the step is timeout. +func (pl PromoteLearner) TimeOut(start time.Time) bool { + return time.Since(start) > FastOperatorWaitTime +} + // RemovePeer is an OpStep that removes a region peer. type RemovePeer struct { FromStore, PeerID uint64 @@ -300,6 +324,11 @@ func (rp RemovePeer) Influence(opInfluence OpInfluence, region *core.RegionInfo) from.AdjustStepCost(storelimit.RemovePeer, regionSize) } +// TimeOut returns true if the step is timeout. +func (rp RemovePeer) TimeOut(start time.Time) bool { + return time.Since(start) > FastOperatorWaitTime +} + // MergeRegion is an OpStep that merge two regions. type MergeRegion struct { FromRegion *metapb.Region @@ -348,6 +377,11 @@ func (mr MergeRegion) Influence(opInfluence OpInfluence, region *core.RegionInfo } } +// TimeOut returns true if the step is timeout. +func (mr MergeRegion) TimeOut(start time.Time) bool { + return time.Since(start) > FastOperatorWaitTime +} + // SplitRegion is an OpStep that splits a region. type SplitRegion struct { StartKey, EndKey []byte @@ -385,6 +419,11 @@ func (sr SplitRegion) CheckInProgress(_ ClusterInformer, region *core.RegionInfo return nil } +// TimeOut returns true if the step is timeout. +func (sr SplitRegion) TimeOut(start time.Time) bool { + return time.Since(start) > FastOperatorWaitTime +} + // DemoteVoter is very similar to DemoteFollower. But it allows Demote Leader. // Note: It is not an OpStep, only a sub step in ChangePeerV2Enter and ChangePeerV2Leave. type DemoteVoter struct { @@ -412,6 +451,11 @@ func (dv DemoteVoter) IsFinish(region *core.RegionInfo) bool { return false } +// TimeOut returns true if the step is timeout. +func (dv DemoteVoter) TimeOut(start time.Time) bool { + return time.Since(start) > FastOperatorWaitTime +} + // ChangePeerV2Enter is an OpStep that uses joint consensus to request all PromoteLearner and DemoteVoter. type ChangePeerV2Enter struct { PromoteLearners []PromoteLearner @@ -553,6 +597,11 @@ func (cpe ChangePeerV2Enter) GetRequest() *pdpb.ChangePeerV2 { } } +// TimeOut returns true if the step is timeout. +func (cpe ChangePeerV2Enter) TimeOut(start time.Time) bool { + return time.Since(start) > FastOperatorWaitTime +} + // ChangePeerV2Leave is an OpStep that leaves the joint state. type ChangePeerV2Leave struct { PromoteLearners []PromoteLearner @@ -672,6 +721,11 @@ func (cpl ChangePeerV2Leave) CheckInProgress(_ ClusterInformer, region *core.Reg // Influence calculates the store difference that current step makes. func (cpl ChangePeerV2Leave) Influence(opInfluence OpInfluence, region *core.RegionInfo) {} +// TimeOut returns true if the step is timeout. +func (cpl ChangePeerV2Leave) TimeOut(start time.Time) bool { + return time.Since(start) > FastOperatorWaitTime +} + func validateStore(ci ClusterInformer, id uint64) error { store := ci.GetBasicCluster().GetStore(id) if store == nil { diff --git a/server/schedule/region_splitter.go b/server/schedule/region_splitter.go index f89aa2848ab..8e712375b55 100644 --- a/server/schedule/region_splitter.go +++ b/server/schedule/region_splitter.go @@ -181,7 +181,7 @@ type splitRegionsHandler struct { } func (h *splitRegionsHandler) SplitRegionByKeys(region *core.RegionInfo, splitKeys [][]byte) error { - op, err := operator.CreateSplitRegionOperator("region-splitter", region, 0, pdpb.CheckPolicy_USEKEY, splitKeys, h.cluster.GetOpts().GetOperatorTimeFactor()) + op, err := operator.CreateSplitRegionOperator("region-splitter", region, 0, pdpb.CheckPolicy_USEKEY, splitKeys) if err != nil { return err } From 155a53dd7f4a76c7c758e869dbae690335279439 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Mon, 24 Jan 2022 22:41:28 +0800 Subject: [PATCH 07/25] remove unnecessary change Signed-off-by: bufferflies <1045931706@qq.com> --- server/config/config.go | 1 - server/config/persist_options.go | 3 +-- server/handler.go | 1 + server/schedule/operator/create_operator.go | 1 - server/schedule/operator/create_operator_test.go | 2 +- 5 files changed, 3 insertions(+), 5 deletions(-) diff --git a/server/config/config.go b/server/config/config.go index e31c118a765..f9a1e1998d9 100644 --- a/server/config/config.go +++ b/server/config/config.go @@ -792,7 +792,6 @@ const ( defaultEnableCrossTableMerge = true defaultHotRegionsWriteInterval = 10 * time.Minute defaultHotRegionsReservedDays = 7 - defaultOperatorTimeFactor = 6 defaultMaxRegionSize = 96 ) diff --git a/server/config/persist_options.go b/server/config/persist_options.go index 0bb5f60edf8..65847d5aadd 100644 --- a/server/config/persist_options.go +++ b/server/config/persist_options.go @@ -192,7 +192,6 @@ const ( hotRegionScheduleLimitKey = "schedule.hot-region-schedule-limit" schedulerMaxWaitingOperatorKey = "schedule.scheduler-max-waiting-operator" enableLocationReplacement = "schedule.enable-location-replacement" - operatorTimeFactorKey = "schedule.operator-time-factor" maxRegionSizeKey = "schedule.max-region-size" ) @@ -249,7 +248,7 @@ func (o *PersistOptions) GetSplitMergeInterval() time.Duration { // GetMaxRegionSize returns the max size of regions. func (o *PersistOptions) GetMaxRegionSize() uint64 { - return o.getTTLUintOr(maxRegionSizeKey, o.GetScheduleConfig().MaxRegionSize) + return o.GetScheduleConfig().MaxRegionSize } // SetSplitMergeInterval to set the interval between finishing split and starting to merge. It's only used to test. diff --git a/server/handler.go b/server/handler.go index afe4a8c3ab8..bebe47c483b 100644 --- a/server/handler.go +++ b/server/handler.go @@ -778,6 +778,7 @@ func (h *Handler) AddSplitRegionOperator(regionID uint64, policyStr string, keys splitKeys = append(splitKeys, k) } } + op, err := operator.CreateSplitRegionOperator("admin-split-region", region, operator.OpAdmin, pdpb.CheckPolicy(policy), splitKeys) if err != nil { return err diff --git a/server/schedule/operator/create_operator.go b/server/schedule/operator/create_operator.go index caeb6dd033c..3eb20fc2faf 100644 --- a/server/schedule/operator/create_operator.go +++ b/server/schedule/operator/create_operator.go @@ -169,7 +169,6 @@ func CreateMergeRegionOperator(desc string, ci ClusterInformer, source *core.Reg }) brief := fmt.Sprintf("merge: region %v to %v", source.GetID(), target.GetID()) - op1 := NewOperator(desc, brief, source.GetID(), source.GetRegionEpoch(), kind|OpMerge, steps...) op2 := NewOperator(desc, brief, target.GetID(), target.GetRegionEpoch(), kind|OpMerge, MergeRegion{ FromRegion: source.GetMeta(), diff --git a/server/schedule/operator/create_operator_test.go b/server/schedule/operator/create_operator_test.go index 250f89a74d7..0832b27911c 100644 --- a/server/schedule/operator/create_operator_test.go +++ b/server/schedule/operator/create_operator_test.go @@ -136,7 +136,7 @@ func (s *testCreateOperatorSuite) TestCreateSplitRegionOperator(c *C) { EndKey: tc.endKey, Peers: tc.originPeers, }, tc.originPeers[0]) - op, err := CreateSplitRegionOperator("test", region, 0, tc.policy, tc.keys, 6) + op, err := CreateSplitRegionOperator("test", region, 0, tc.policy, tc.keys) if tc.expectedError { c.Assert(err, NotNil) continue From 0a9d9cc60c48753c822faddb0acb09a53c7c9768 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Mon, 24 Jan 2022 22:47:16 +0800 Subject: [PATCH 08/25] remove unnecessary change Signed-off-by: bufferflies <1045931706@qq.com> --- server/cluster/coordinator_test.go | 2 +- server/schedule/operator/operator.go | 4 +- server/schedule/operator_controller_test.go | 65 ++++++++++----------- server/schedule/waiting_operator_test.go | 17 ++++-- tests/server/cluster/cluster_test.go | 22 +++---- 5 files changed, 57 insertions(+), 53 deletions(-) diff --git a/server/cluster/coordinator_test.go b/server/cluster/coordinator_test.go index 5d7a49fb85d..41ad8637ef9 100644 --- a/server/cluster/coordinator_test.go +++ b/server/cluster/coordinator_test.go @@ -41,7 +41,7 @@ import ( ) func newTestOperator(regionID uint64, regionEpoch *metapb.RegionEpoch, kind operator.OpKind, steps ...operator.OpStep) *operator.Operator { - return operator.NewTestOperator("test", "test", regionID, regionEpoch, kind, steps...) + return operator.NewTestOperator(regionID, regionEpoch, kind, steps...) } func (c *testCluster) AllocPeer(storeID uint64) (*metapb.Peer, error) { diff --git a/server/schedule/operator/operator.go b/server/schedule/operator/operator.go index 556fd877064..07bc4cfe7bf 100644 --- a/server/schedule/operator/operator.go +++ b/server/schedule/operator/operator.go @@ -372,6 +372,6 @@ func (o *Operator) GetAdditionalInfo() string { } // NewTestOperator creates a test operator. -func NewTestOperator(desc, brief string, regionID uint64, regionEpoch *metapb.RegionEpoch, kind OpKind, steps ...OpStep) *Operator { - return NewOperator(desc, brief, regionID, regionEpoch, kind, steps...) +func NewTestOperator(regionID uint64, regionEpoch *metapb.RegionEpoch, kind OpKind, steps ...OpStep) *Operator { + return NewOperator("test", "test", regionID, regionEpoch, kind, steps...) } diff --git a/server/schedule/operator_controller_test.go b/server/schedule/operator_controller_test.go index 57efc81d36e..ffe2f98b896 100644 --- a/server/schedule/operator_controller_test.go +++ b/server/schedule/operator_controller_test.go @@ -65,8 +65,8 @@ func (t *testOperatorControllerSuite) TestGetOpInfluence(c *C) { steps := []operator.OpStep{ operator.RemovePeer{FromStore: 2}, } - op1 := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, steps...) - op2 := operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, steps...) + op1 := operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, steps...) + op2 := operator.NewTestOperator(2, &metapb.RegionEpoch{}, operator.OpRegion, steps...) c.Assert(op1.Start(), IsTrue) oc.SetOperator(op1) c.Assert(op2.Start(), IsTrue) @@ -111,8 +111,8 @@ func (t *testOperatorControllerSuite) TestOperatorStatus(c *C) { } region1 := tc.GetRegion(1) region2 := tc.GetRegion(2) - op1 := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, steps...) - op2 := operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, steps...) + op1 := operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, steps...) + op2 := operator.NewTestOperator(2, &metapb.RegionEpoch{}, operator.OpRegion, steps...) c.Assert(op1.Start(), IsTrue) oc.SetOperator(op1) c.Assert(op2.Start(), IsTrue) @@ -145,7 +145,7 @@ func (t *testOperatorControllerSuite) TestFastFailOperator(c *C) { operator.AddPeer{ToStore: 3, PeerID: 4}, } region := tc.GetRegion(1) - op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, steps...) + op := operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, steps...) c.Assert(op.Start(), IsTrue) oc.SetOperator(op) oc.Dispatch(region, "test") @@ -157,7 +157,7 @@ func (t *testOperatorControllerSuite) TestFastFailOperator(c *C) { c.Assert(oc.GetOperator(region.GetID()), IsNil) // transfer leader to an illegal store. - op = operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 5}) + op = operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 5}) oc.SetOperator(op) oc.Dispatch(region, DispatchFromHeartBeat) c.Assert(op.Status(), Equals, operator.CANCELED) @@ -176,7 +176,7 @@ func (t *testOperatorControllerSuite) TestFastFailWithUnhealthyStore(c *C) { tc.AddLeaderRegion(1, 1, 2) region := tc.GetRegion(1) steps := []operator.OpStep{operator.TransferLeader{ToStore: 2}} - op := operator.NewTestOperator("test", "test", 1, region.GetRegionEpoch(), operator.OpLeader, steps...) + op := operator.NewTestOperator(1, region.GetRegionEpoch(), operator.OpLeader, steps...) oc.SetOperator(op) c.Assert(oc.checkStaleOperator(op, steps[0], region), IsFalse) tc.SetStoreDown(2) @@ -200,7 +200,7 @@ func (t *testOperatorControllerSuite) TestCheckAddUnexpectedStatus(c *C) { } { // finished op - op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) + op := operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) c.Assert(oc.checkAddOperator(op), IsTrue) op.Start() c.Assert(oc.checkAddOperator(op), IsFalse) // started @@ -210,14 +210,14 @@ func (t *testOperatorControllerSuite) TestCheckAddUnexpectedStatus(c *C) { } { // finished op canceled - op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) + op := operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) c.Assert(oc.checkAddOperator(op), IsTrue) c.Assert(op.Cancel(), IsTrue) c.Assert(oc.checkAddOperator(op), IsFalse) } { // finished op replaced - op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) + op := operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) c.Assert(oc.checkAddOperator(op), IsTrue) c.Assert(op.Start(), IsTrue) c.Assert(op.Replace(), IsTrue) @@ -225,8 +225,8 @@ func (t *testOperatorControllerSuite) TestCheckAddUnexpectedStatus(c *C) { } { // finished op expired - op1 := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) - op2 := operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 1}) + op1 := operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) + op2 := operator.NewTestOperator(2, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 1}) c.Assert(oc.checkAddOperator(op1, op2), IsTrue) operator.SetOperatorStatusReachTime(op1, operator.CREATED, time.Now().Add(-operator.OperatorExpireTime)) operator.SetOperatorStatusReachTime(op2, operator.CREATED, time.Now().Add(-operator.OperatorExpireTime)) @@ -238,7 +238,7 @@ func (t *testOperatorControllerSuite) TestCheckAddUnexpectedStatus(c *C) { { // unfinished op timeout - op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, steps...) + op := operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, steps...) c.Assert(oc.checkAddOperator(op), IsTrue) op.Start() operator.SetOperatorStatusReachTime(op, operator.STARTED, time.Now().Add(-operator.SlowOperatorWaitTime)) @@ -262,9 +262,9 @@ func (t *testOperatorControllerSuite) TestConcurrentRemoveOperator(c *C) { operator.AddPeer{ToStore: 1, PeerID: 4}, } // finished op with normal priority - op1 := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) + op1 := operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) // unfinished op with high priority - op2 := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion|operator.OpAdmin, steps...) + op2 := operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion|operator.OpAdmin, steps...) c.Assert(op1.Start(), IsTrue) oc.SetOperator(op1) @@ -303,10 +303,10 @@ func (t *testOperatorControllerSuite) TestPollDispatchRegion(c *C) { operator.RemovePeer{FromStore: 2}, operator.AddPeer{ToStore: 2, PeerID: 4}, } - op1 := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) - op2 := operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, steps...) - op3 := operator.NewTestOperator("test", "test", 3, &metapb.RegionEpoch{}, operator.OpRegion, steps...) - op4 := operator.NewTestOperator("test", "test", 4, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) + op1 := operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) + op2 := operator.NewTestOperator(2, &metapb.RegionEpoch{}, operator.OpRegion, steps...) + op3 := operator.NewTestOperator(3, &metapb.RegionEpoch{}, operator.OpRegion, steps...) + op4 := operator.NewTestOperator(4, &metapb.RegionEpoch{}, operator.OpRegion, operator.TransferLeader{ToStore: 2}) region1 := tc.GetRegion(1) region2 := tc.GetRegion(2) region4 := tc.GetRegion(4) @@ -377,53 +377,53 @@ func (t *testOperatorControllerSuite) TestStoreLimit(c *C) { tc.SetStoreLimit(2, storelimit.AddPeer, 60) for i := uint64(1); i <= 5; i++ { - op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: i}) + op := operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: i}) c.Assert(oc.AddOperator(op), IsTrue) checkRemoveOperatorSuccess(c, oc, op) } - op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: 1}) + op := operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: 1}) c.Assert(oc.AddOperator(op), IsFalse) c.Assert(oc.RemoveOperator(op), IsFalse) tc.SetStoreLimit(2, storelimit.AddPeer, 120) for i := uint64(1); i <= 10; i++ { - op = operator.NewTestOperator("test", "test", i, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: i}) + op = operator.NewTestOperator(i, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: i}) c.Assert(oc.AddOperator(op), IsTrue) checkRemoveOperatorSuccess(c, oc, op) } tc.SetAllStoresLimit(storelimit.AddPeer, 60) for i := uint64(1); i <= 5; i++ { - op = operator.NewTestOperator("test", "test", i, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: i}) + op = operator.NewTestOperator(i, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: i}) c.Assert(oc.AddOperator(op), IsTrue) checkRemoveOperatorSuccess(c, oc, op) } - op = operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: 1}) + op = operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: 2, PeerID: 1}) c.Assert(oc.AddOperator(op), IsFalse) c.Assert(oc.RemoveOperator(op), IsFalse) tc.SetStoreLimit(2, storelimit.RemovePeer, 60) for i := uint64(1); i <= 5; i++ { - op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op := operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsTrue) checkRemoveOperatorSuccess(c, oc, op) } - op = operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op = operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsFalse) c.Assert(oc.RemoveOperator(op), IsFalse) tc.SetStoreLimit(2, storelimit.RemovePeer, 120) for i := uint64(1); i <= 10; i++ { - op = operator.NewTestOperator("test", "test", i, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op = operator.NewTestOperator(i, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsTrue) checkRemoveOperatorSuccess(c, oc, op) } tc.SetAllStoresLimit(storelimit.RemovePeer, 60) for i := uint64(1); i <= 5; i++ { - op = operator.NewTestOperator("test", "test", i, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op = operator.NewTestOperator(i, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsTrue) checkRemoveOperatorSuccess(c, oc, op) } - op = operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op = operator.NewTestOperator(1, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsFalse) c.Assert(oc.RemoveOperator(op), IsFalse) } @@ -443,7 +443,7 @@ func (t *testOperatorControllerSuite) TestDispatchOutdatedRegion(c *C) { operator.RemovePeer{FromStore: 1}, } - op := operator.NewTestOperator("test", "test", 1, + op := operator.NewTestOperator(1, &metapb.RegionEpoch{ConfVer: 0, Version: 0}, operator.OpRegion, steps...) c.Assert(controller.AddOperator(op), IsTrue) @@ -466,7 +466,7 @@ func (t *testOperatorControllerSuite) TestDispatchOutdatedRegion(c *C) { c.Assert(stream.MsgLength(), Equals, 2) // add and dispatch op again, the op should be stale - op = operator.NewTestOperator("test", "test", 1, + op = operator.NewTestOperator(1, &metapb.RegionEpoch{ConfVer: 0, Version: 0}, operator.OpRegion, steps...) c.Assert(controller.AddOperator(op), IsTrue) @@ -517,8 +517,7 @@ func (t *testOperatorControllerSuite) TestDispatchUnfinishedStep(c *C) { for _, steps := range testSteps { // Create an operator - op := operator.NewTestOperator("test", "test", 1, epoch, - operator.OpRegion, steps...) + op := operator.NewTestOperator(1, epoch, operator.OpRegion, steps...) c.Assert(controller.AddOperator(op), IsTrue) c.Assert(stream.MsgLength(), Equals, 1) diff --git a/server/schedule/waiting_operator_test.go b/server/schedule/waiting_operator_test.go index c1c078cc56d..ed5fb244769 100644 --- a/server/schedule/waiting_operator_test.go +++ b/server/schedule/waiting_operator_test.go @@ -36,18 +36,21 @@ func (s *testWaitingOperatorSuite) TestRandBuckets(c *C) { } func addOperators(wop WaitingOperator) { - op := operator.NewTestOperator("testOperatorNormal", "test", uint64(1), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ + op := operator.NewTestOperator(uint64(1), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ operator.RemovePeer{FromStore: uint64(1)}, }...) + op.SetDesc("testOperatorNormal") wop.PutOperator(op) - op = operator.NewTestOperator("testOperatorHigh", "test", uint64(2), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ + op = operator.NewTestOperator(uint64(2), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ operator.RemovePeer{FromStore: uint64(2)}, }...) + op.SetDesc("testOperatorHigh") op.SetPriorityLevel(core.HighPriority) wop.PutOperator(op) - op = operator.NewTestOperator("testOperatorLow", "test", uint64(3), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ + op = operator.NewTestOperator(uint64(3), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ operator.RemovePeer{FromStore: uint64(3)}, }...) + op.SetDesc("testOperatorLow") op.SetPriorityLevel(core.LowPriority) wop.PutOperator(op) } @@ -64,7 +67,7 @@ func (s *testWaitingOperatorSuite) TestRandomBucketsWithMergeRegion(c *C) { for j := 0; j < 100; j++ { // adds operators desc := descs[j%3] - op := operator.NewTestOperator(desc, "test", uint64(1), &metapb.RegionEpoch{}, operator.OpRegion|operator.OpMerge, []operator.OpStep{ + op := operator.NewTestOperator(uint64(1), &metapb.RegionEpoch{}, operator.OpRegion|operator.OpMerge, []operator.OpStep{ operator.MergeRegion{ FromRegion: &metapb.Region{ Id: 1, @@ -78,8 +81,9 @@ func (s *testWaitingOperatorSuite) TestRandomBucketsWithMergeRegion(c *C) { IsPassive: false, }, }...) + op.SetDesc(desc) rb.PutOperator(op) - op = operator.NewTestOperator(desc, "test", uint64(2), &metapb.RegionEpoch{}, operator.OpRegion|operator.OpMerge, []operator.OpStep{ + op = operator.NewTestOperator(uint64(2), &metapb.RegionEpoch{}, operator.OpRegion|operator.OpMerge, []operator.OpStep{ operator.MergeRegion{ FromRegion: &metapb.Region{ Id: 1, @@ -94,9 +98,10 @@ func (s *testWaitingOperatorSuite) TestRandomBucketsWithMergeRegion(c *C) { }, }...) rb.PutOperator(op) - op = operator.NewTestOperator("testOperatorHigh", "test", uint64(3), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ + op = operator.NewTestOperator(uint64(3), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ operator.RemovePeer{FromStore: uint64(3)}, }...) + op.SetDesc("testOperatorHigh") op.SetPriorityLevel(core.HighPriority) rb.PutOperator(op) diff --git a/tests/server/cluster/cluster_test.go b/tests/server/cluster/cluster_test.go index 2f04c326541..f4ca28f10c8 100644 --- a/tests/server/cluster/cluster_test.go +++ b/tests/server/cluster/cluster_test.go @@ -244,9 +244,9 @@ func testStateAndLimit(c *C, clusterID uint64, rc *cluster.RaftCluster, grpcPDCl oc := rc.GetOperatorController() rc.SetStoreLimit(storeID, storelimit.AddPeer, 60) rc.SetStoreLimit(storeID, storelimit.RemovePeer, 60) - op := operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: storeID, PeerID: 3}) + op := operator.NewTestOperator(2, &metapb.RegionEpoch{}, operator.OpRegion, operator.AddPeer{ToStore: storeID, PeerID: 3}) oc.AddOperator(op) - op = operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: storeID}) + op = operator.NewTestOperator(2, &metapb.RegionEpoch{}, operator.OpRegion, operator.RemovePeer{FromStore: storeID}) oc.AddOperator(op) resetStoreState(c, rc, store.GetId(), beforeState) @@ -922,21 +922,21 @@ func (s *clusterTestSuite) TestOfflineStoreLimit(c *C) { opt.SetAllStoresLimit(storelimit.RemovePeer, 1) // only can add 5 remove peer operators on store 1 for i := uint64(1); i <= 5; i++ { - op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) + op := operator.NewTestOperator(1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) c.Assert(oc.AddOperator(op), IsTrue) c.Assert(oc.RemoveOperator(op), IsTrue) } - op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) + op := operator.NewTestOperator(1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) c.Assert(oc.AddOperator(op), IsFalse) c.Assert(oc.RemoveOperator(op), IsFalse) // only can add 5 remove peer operators on store 2 for i := uint64(1); i <= 5; i++ { - op := operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op := operator.NewTestOperator(2, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsTrue) c.Assert(oc.RemoveOperator(op), IsTrue) } - op = operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op = operator.NewTestOperator(2, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsFalse) c.Assert(oc.RemoveOperator(op), IsFalse) @@ -945,11 +945,11 @@ func (s *clusterTestSuite) TestOfflineStoreLimit(c *C) { // only can add 5 remove peer operators on store 2 for i := uint64(1); i <= 5; i++ { - op := operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op := operator.NewTestOperator(2, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsTrue) c.Assert(oc.RemoveOperator(op), IsTrue) } - op = operator.NewTestOperator("test", "test", 2, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) + op = operator.NewTestOperator(2, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 2}) c.Assert(oc.AddOperator(op), IsFalse) c.Assert(oc.RemoveOperator(op), IsFalse) @@ -959,7 +959,7 @@ func (s *clusterTestSuite) TestOfflineStoreLimit(c *C) { // can add unlimited remove peer operators on store 1 for i := uint64(1); i <= 30; i++ { - op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) + op := operator.NewTestOperator(1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) c.Assert(oc.AddOperator(op), IsTrue) c.Assert(oc.RemoveOperator(op), IsTrue) } @@ -1011,11 +1011,11 @@ func (s *clusterTestSuite) TestUpgradeStoreLimit(c *C) { oc := rc.GetOperatorController() // only can add 5 remove peer operators on store 1 for i := uint64(1); i <= 5; i++ { - op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) + op := operator.NewTestOperator(1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) c.Assert(oc.AddOperator(op), IsTrue) c.Assert(oc.RemoveOperator(op), IsTrue) } - op := operator.NewTestOperator("test", "test", 1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) + op := operator.NewTestOperator(1, &metapb.RegionEpoch{ConfVer: 1, Version: 1}, operator.OpRegion, operator.RemovePeer{FromStore: 1}) c.Assert(oc.AddOperator(op), IsFalse) c.Assert(oc.RemoveOperator(op), IsFalse) } From 62340cfc8caab821a968990bbcf0f173a32a7d50 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Mon, 24 Jan 2022 22:55:35 +0800 Subject: [PATCH 09/25] resolved conflict Signed-off-by: bufferflies <1045931706@qq.com> --- server/schedule/waiting_operator_test.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/server/schedule/waiting_operator_test.go b/server/schedule/waiting_operator_test.go index a732558285d..9237505acc3 100644 --- a/server/schedule/waiting_operator_test.go +++ b/server/schedule/waiting_operator_test.go @@ -39,18 +39,15 @@ func addOperators(wop WaitingOperator) { op := operator.NewTestOperator(uint64(1), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ operator.RemovePeer{FromStore: uint64(1)}, }...) - op.SetDesc("testOperatorNormal") wop.PutOperator(op) op = operator.NewTestOperator(uint64(2), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ operator.RemovePeer{FromStore: uint64(2)}, }...) - op.SetDesc("testOperatorHigh") op.SetPriorityLevel(core.HighPriority) wop.PutOperator(op) op = operator.NewTestOperator(uint64(3), &metapb.RegionEpoch{}, operator.OpRegion, []operator.OpStep{ operator.RemovePeer{FromStore: uint64(3)}, }...) - op.SetDesc("testOperatorLow") op.SetPriorityLevel(core.LowPriority) wop.PutOperator(op) } From 1c7bdc822620b0cec4831cc0be49258a156d0f03 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Tue, 25 Jan 2022 10:31:28 +0800 Subject: [PATCH 10/25] resolved conflict Signed-off-by: bufferflies <1045931706@qq.com> --- server/config/persist_options.go | 1 - 1 file changed, 1 deletion(-) diff --git a/server/config/persist_options.go b/server/config/persist_options.go index c0181d2a18e..6a1ab04d125 100644 --- a/server/config/persist_options.go +++ b/server/config/persist_options.go @@ -193,7 +193,6 @@ const ( hotRegionScheduleLimitKey = "schedule.hot-region-schedule-limit" schedulerMaxWaitingOperatorKey = "schedule.scheduler-max-waiting-operator" enableLocationReplacement = "schedule.enable-location-replacement" - maxRegionSizeKey = "schedule.max-region-size" ) var supportedTTLConfigs = []string{ From 1a796f962684185ad04f8f306bc1bccdee7e1d48 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Tue, 25 Jan 2022 10:40:50 +0800 Subject: [PATCH 11/25] rename TimeOut to timeout Signed-off-by: bufferflies <1045931706@qq.com> --- server/schedule/operator/status_tracker.go | 2 +- server/schedule/operator/step.go | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/server/schedule/operator/status_tracker.go b/server/schedule/operator/status_tracker.go index 930b8b7698f..914b87e9185 100644 --- a/server/schedule/operator/status_tracker.go +++ b/server/schedule/operator/status_tracker.go @@ -133,7 +133,7 @@ func (trk *OpStatusTracker) CheckStepTimeout(start time.Time, step OpStep) bool trk.rw.Lock() defer trk.rw.Unlock() if trk.current == STARTED { - if !step.TimeOut(start) { + if !step.Timeout(start) { return false } _ = trk.toLocked(TIMEOUT) diff --git a/server/schedule/operator/step.go b/server/schedule/operator/step.go index 48554308db1..09725bb2dae 100644 --- a/server/schedule/operator/step.go +++ b/server/schedule/operator/step.go @@ -38,7 +38,7 @@ type OpStep interface { IsFinish(region *core.RegionInfo) bool CheckInProgress(ci ClusterInformer, region *core.RegionInfo) error Influence(opInfluence OpInfluence, region *core.RegionInfo) - TimeOut(start time.Time) bool + Timeout(start time.Time) bool } // TransferLeader is an OpStep that transfers a region's leader. @@ -102,7 +102,7 @@ func (tl TransferLeader) Influence(opInfluence OpInfluence, region *core.RegionI } // TimeOut returns true if the step is timeout. -func (tl TransferLeader) TimeOut(start time.Time) bool { +func (tl TransferLeader) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -160,7 +160,7 @@ func (ap AddPeer) CheckInProgress(ci ClusterInformer, region *core.RegionInfo) e } // TimeOut returns true if the step is timeout. -func (ap AddPeer) TimeOut(start time.Time) bool { +func (ap AddPeer) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -226,7 +226,7 @@ func (al AddLearner) Influence(opInfluence OpInfluence, region *core.RegionInfo) } // TimeOut returns true if the step is timeout. -func (al AddLearner) TimeOut(start time.Time) bool { +func (al AddLearner) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -269,7 +269,7 @@ func (pl PromoteLearner) CheckInProgress(_ ClusterInformer, region *core.RegionI func (pl PromoteLearner) Influence(opInfluence OpInfluence, region *core.RegionInfo) {} // TimeOut returns true if the step is timeout. -func (pl PromoteLearner) TimeOut(start time.Time) bool { +func (pl PromoteLearner) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -325,7 +325,7 @@ func (rp RemovePeer) Influence(opInfluence OpInfluence, region *core.RegionInfo) } // TimeOut returns true if the step is timeout. -func (rp RemovePeer) TimeOut(start time.Time) bool { +func (rp RemovePeer) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -378,7 +378,7 @@ func (mr MergeRegion) Influence(opInfluence OpInfluence, region *core.RegionInfo } // TimeOut returns true if the step is timeout. -func (mr MergeRegion) TimeOut(start time.Time) bool { +func (mr MergeRegion) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -420,7 +420,7 @@ func (sr SplitRegion) CheckInProgress(_ ClusterInformer, region *core.RegionInfo } // TimeOut returns true if the step is timeout. -func (sr SplitRegion) TimeOut(start time.Time) bool { +func (sr SplitRegion) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -598,7 +598,7 @@ func (cpe ChangePeerV2Enter) GetRequest() *pdpb.ChangePeerV2 { } // TimeOut returns true if the step is timeout. -func (cpe ChangePeerV2Enter) TimeOut(start time.Time) bool { +func (cpe ChangePeerV2Enter) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -722,7 +722,7 @@ func (cpl ChangePeerV2Leave) CheckInProgress(_ ClusterInformer, region *core.Reg func (cpl ChangePeerV2Leave) Influence(opInfluence OpInfluence, region *core.RegionInfo) {} // TimeOut returns true if the step is timeout. -func (cpl ChangePeerV2Leave) TimeOut(start time.Time) bool { +func (cpl ChangePeerV2Leave) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } From bb427c82d10166b9fd5136b997c01f97d98fc13e Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Tue, 25 Jan 2022 15:13:01 +0800 Subject: [PATCH 12/25] add and pass unit test Signed-off-by: bufferflies <1045931706@qq.com> --- server/core/store.go | 3 +- server/schedule/operator/builder.go | 2 +- server/schedule/operator/operator_test.go | 57 ++++++++++++++++- .../schedule/operator/status_tracker_test.go | 23 +++++++ server/schedule/operator/step.go | 62 ++++++++++++------- 5 files changed, 119 insertions(+), 28 deletions(-) diff --git a/server/core/store.go b/server/core/store.go index 892497794b0..a24a510255c 100644 --- a/server/core/store.go +++ b/server/core/store.go @@ -42,7 +42,7 @@ const ( EngineTiFlash = "tiflash" // EngineTiKV indicates the tikv engine in metrics EngineTiKV = "tikv" - + // OperatorExecutorRate is the rate of the store operator executor. OperatorExecutorRate = 6.0 ) @@ -271,6 +271,7 @@ func (s *StoreInfo) GetStoreLimit(limitType storelimit.Type) *storelimit.StoreLi return s.limiter[limitType] } +// GetOperatorExecutorRate returns the operator executor rate of the store. func (s *StoreInfo) GetOperatorExecutorRate() float64 { s.mu.RLock() defer s.mu.RUnlock() diff --git a/server/schedule/operator/builder.go b/server/schedule/operator/builder.go index dfad885202c..538336ce3a6 100644 --- a/server/schedule/operator/builder.go +++ b/server/schedule/operator/builder.go @@ -678,7 +678,7 @@ func (b *Builder) execPromoteLearner(peer *metapb.Peer) { func (b *Builder) execAddPeer(peer *metapb.Peer) { if b.lightWeight { b.steps = append(b.steps, AddLearner{ToStore: peer.GetStoreId(), PeerID: peer.GetId(), - IsLightWeight: b.lightWeight, regionSize: b.approximateSize, rate: b.operatorExecutorRate}) + IsLightWeight: b.lightWeight, regionSize: b.approximateSize, executorRate: b.operatorExecutorRate}) } else { b.steps = append(b.steps, AddLearner{ToStore: peer.GetStoreId(), PeerID: peer.GetId()}) } diff --git a/server/schedule/operator/operator_test.go b/server/schedule/operator/operator_test.go index db01a047efb..2832ae8f70c 100644 --- a/server/schedule/operator/operator_test.go +++ b/server/schedule/operator/operator_test.go @@ -124,7 +124,7 @@ func (s *testOperatorSuite) TestOperator(c *C) { // addPeer1, transferLeader1, removePeer2 steps = []OpStep{ - AddPeer{ToStore: 1, PeerID: 1}, + AddPeer{ToStore: 1, PeerID: 1, regionSize: 10, executorRate: 6}, TransferLeader{FromStore: 2, ToStore: 1}, RemovePeer{FromStore: 2}, } @@ -136,6 +136,7 @@ func (s *testOperatorSuite) TestOperator(c *C) { c.Assert(op.CheckTimeout(), IsFalse) SetOperatorStatusReachTime(op, STARTED, op.GetStartTime().Add(-FastOperatorWaitTime-time.Second)) c.Assert(op.CheckTimeout(), IsFalse) + op.stepsTime[op.currentStep-1] = op.GetReachTimeOf(STARTED).Unix() SetOperatorStatusReachTime(op, STARTED, op.GetStartTime().Add(-SlowOperatorWaitTime-time.Second)) c.Assert(op.CheckTimeout(), IsTrue) res, err := json.Marshal(op) @@ -361,7 +362,7 @@ func (s *testOperatorSuite) TestCheck(c *C) { c.Assert(op.Start(), IsTrue) c.Assert(op.Check(region), NotNil) c.Assert(op.Status(), Equals, STARTED) - op.status.setTime(STARTED, time.Now().Add(-SlowOperatorWaitTime)) + op.stepsTime[op.currentStep-1] = time.Now().Add(-SlowOperatorWaitTime).Unix() c.Assert(op.Check(region), NotNil) c.Assert(op.Status(), Equals, TIMEOUT) } @@ -418,3 +419,55 @@ func (s *testOperatorSuite) TestSchedulerKind(c *C) { c.Assert(v.op.SchedulerKind(), Equals, v.expect) } } + +func (s *testOperatorSuite) TestOpStepTimeout(c *C) { + testdata := []struct { + step []OpStep + start time.Time + expect bool + }{{ + // case1: 10GB region will have 60,000s to executor. + step: []OpStep{AddLearner{executorRate: 6, regionSize: 10 * 1000}, AddPeer{executorRate: 6, regionSize: 10 * 1000}}, + start: time.Now().Add(-time.Second * (6*10*1000 + 20)), + expect: true, + }, { + step: []OpStep{AddLearner{executorRate: 6, regionSize: 10 * 1000}, AddPeer{executorRate: 6, regionSize: 10 * 1000}}, + start: time.Now().Add(-time.Second * (6*10*1000 - 20)), + expect: false, + }, { + // case2: 10MB region will have at least SlowOperatorWaitTime(10min) to executor. + step: []OpStep{AddLearner{executorRate: 6, regionSize: 10}, AddPeer{executorRate: 6, regionSize: 10}}, + start: time.Now().Add(-(SlowOperatorWaitTime + time.Second*20)), + expect: true, + }, { + step: []OpStep{AddLearner{executorRate: 6, regionSize: 10}, AddPeer{executorRate: 6, regionSize: 10}}, + start: time.Now().Add(-time.Second * (6*10 + 20)), + expect: false, + }, { + // case3: RemovePeer, TransferLeader, SplitRegion, MergeRegion, PromoteLearner will have FastOperatorWaitTime(10s) to executor. + step: []OpStep{RemovePeer{}, TransferLeader{}, SplitRegion{}, MergeRegion{}, PromoteLearner{}}, + start: time.Now().Add(-(FastOperatorWaitTime + time.Second)), + expect: true, + }, { + step: []OpStep{RemovePeer{}, TransferLeader{}, SplitRegion{}, MergeRegion{}, PromoteLearner{}}, + start: time.Now().Add(-(FastOperatorWaitTime - time.Second)), + expect: false, + }, { + // case4: ChangePeerV2Enter, ChangePeerV2Leave has FastOperatorWaitTime times than FastOperatorWaitTime + step: []OpStep{ChangePeerV2Enter{PromoteLearners: []PromoteLearner{{}, {}}}, + ChangePeerV2Leave{PromoteLearners: []PromoteLearner{{}, {}}}}, + start: time.Now().Add(-(FastOperatorWaitTime*(2+1) + time.Second)), + expect: true, + }, { + step: []OpStep{ChangePeerV2Enter{PromoteLearners: []PromoteLearner{{}, {}}}, + ChangePeerV2Leave{PromoteLearners: []PromoteLearner{{}, {}}}}, + start: time.Now().Add(-(FastOperatorWaitTime*(2+1) - time.Second)), + expect: false, + }, + } + for _, v := range testdata { + for _, step := range v.step { + c.Assert(v.expect, Equals, step.Timeout(v.start)) + } + } +} diff --git a/server/schedule/operator/status_tracker_test.go b/server/schedule/operator/status_tracker_test.go index 78ad30979fa..06c506d7a9e 100644 --- a/server/schedule/operator/status_tracker_test.go +++ b/server/schedule/operator/status_tracker_test.go @@ -114,6 +114,29 @@ func (s *testOpStatusTrackerSuite) TestCheckExpired(c *C) { } } +func (s *testOpStatusTrackerSuite) TestCheckStepTimeout(c *C) { + testdata := []struct { + step OpStep + start time.Time + status OpStatus + }{{ + step: AddLearner{}, + start: time.Now().Add(-(SlowOperatorWaitTime - 1*time.Second)), + status: STARTED, + }, { + step: AddLearner{}, + start: time.Now().Add(-(SlowOperatorWaitTime + 1*time.Second)), + status: TIMEOUT, + }} + + for _, v := range testdata { + trk := NewOpStatusTracker() + trk.To(STARTED) + c.Assert(trk.CheckStepTimeout(v.start, v.step), Equals, v.status == TIMEOUT) + c.Assert(trk.Status(), Equals, v.status) + } +} + func (s *testOpStatusTrackerSuite) TestCheckTimeout(c *C) { { // Not timeout diff --git a/server/schedule/operator/step.go b/server/schedule/operator/step.go index 09725bb2dae..7705ef94718 100644 --- a/server/schedule/operator/step.go +++ b/server/schedule/operator/step.go @@ -17,6 +17,7 @@ package operator import ( "bytes" "fmt" + "math" "strings" "time" @@ -50,7 +51,7 @@ type TransferLeader struct { } // ConfVerChanged returns the delta value for version increased by this step. -func (tl TransferLeader) ConfVerChanged(region *core.RegionInfo) uint64 { +func (tl TransferLeader) ConfVerChanged(_ *core.RegionInfo) uint64 { return 0 // transfer leader never change the conf version } @@ -101,7 +102,7 @@ func (tl TransferLeader) Influence(opInfluence OpInfluence, region *core.RegionI to.LeaderCount++ } -// TimeOut returns true if the step is timeout. +// Timeout returns true if the step is timeout. func (tl TransferLeader) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -110,6 +111,8 @@ func (tl TransferLeader) Timeout(start time.Time) bool { type AddPeer struct { ToStore, PeerID uint64 IsLightWeight bool + regionSize int64 + executorRate float64 } // ConfVerChanged returns the delta value for version increased by this step. @@ -159,9 +162,9 @@ func (ap AddPeer) CheckInProgress(ci ClusterInformer, region *core.RegionInfo) e return nil } -// TimeOut returns true if the step is timeout. +// Timeout returns true if the step is timeout. func (ap AddPeer) Timeout(start time.Time) bool { - return time.Since(start) > FastOperatorWaitTime + return time.Since(start) > maxWaitTime(ap.executorRate, ap.regionSize) } // AddLearner is an OpStep that adds a region learner peer. @@ -169,7 +172,7 @@ type AddLearner struct { ToStore, PeerID uint64 IsLightWeight bool regionSize int64 - rate float64 + executorRate float64 } // ConfVerChanged returns the delta value for version increased by this step. @@ -225,9 +228,9 @@ func (al AddLearner) Influence(opInfluence OpInfluence, region *core.RegionInfo) to.AdjustStepCost(storelimit.AddPeer, regionSize) } -// TimeOut returns true if the step is timeout. +// Timeout returns true if the step is timeout. func (al AddLearner) Timeout(start time.Time) bool { - return time.Since(start) > FastOperatorWaitTime + return time.Since(start) > maxWaitTime(al.executorRate, al.regionSize) } // PromoteLearner is an OpStep that promotes a region learner peer to normal voter. @@ -266,9 +269,9 @@ func (pl PromoteLearner) CheckInProgress(_ ClusterInformer, region *core.RegionI } // Influence calculates the store difference that current step makes. -func (pl PromoteLearner) Influence(opInfluence OpInfluence, region *core.RegionInfo) {} +func (pl PromoteLearner) Influence(_ OpInfluence, _ *core.RegionInfo) {} -// TimeOut returns true if the step is timeout. +// Timeout returns true if the step is timeout. func (pl PromoteLearner) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -324,7 +327,7 @@ func (rp RemovePeer) Influence(opInfluence OpInfluence, region *core.RegionInfo) from.AdjustStepCost(storelimit.RemovePeer, regionSize) } -// TimeOut returns true if the step is timeout. +// Timeout returns true if the step is timeout. func (rp RemovePeer) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -343,7 +346,7 @@ type MergeRegion struct { } // ConfVerChanged returns the delta value for version increased by this step. -func (mr MergeRegion) ConfVerChanged(region *core.RegionInfo) uint64 { +func (mr MergeRegion) ConfVerChanged(_ *core.RegionInfo) uint64 { return 0 } @@ -360,7 +363,7 @@ func (mr MergeRegion) IsFinish(region *core.RegionInfo) bool { } // CheckInProgress checks if the step is in the progress of advancing. -func (mr MergeRegion) CheckInProgress(_ ClusterInformer, region *core.RegionInfo) error { +func (mr MergeRegion) CheckInProgress(_ ClusterInformer, _ *core.RegionInfo) error { return nil } @@ -377,7 +380,7 @@ func (mr MergeRegion) Influence(opInfluence OpInfluence, region *core.RegionInfo } } -// TimeOut returns true if the step is timeout. +// Timeout returns true if the step is timeout. func (mr MergeRegion) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -390,7 +393,7 @@ type SplitRegion struct { } // ConfVerChanged returns the delta value for version increased by this step. -func (sr SplitRegion) ConfVerChanged(region *core.RegionInfo) uint64 { +func (sr SplitRegion) ConfVerChanged(_ *core.RegionInfo) uint64 { return 0 } @@ -415,11 +418,11 @@ func (sr SplitRegion) Influence(opInfluence OpInfluence, region *core.RegionInfo } // CheckInProgress checks if the step is in the progress of advancing. -func (sr SplitRegion) CheckInProgress(_ ClusterInformer, region *core.RegionInfo) error { +func (sr SplitRegion) CheckInProgress(_ ClusterInformer, _ *core.RegionInfo) error { return nil } -// TimeOut returns true if the step is timeout. +// Timeout returns true if the step is timeout. func (sr SplitRegion) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -451,8 +454,8 @@ func (dv DemoteVoter) IsFinish(region *core.RegionInfo) bool { return false } -// TimeOut returns true if the step is timeout. -func (dv DemoteVoter) TimeOut(start time.Time) bool { +// Timeout returns true if the step is timeout. +func (dv DemoteVoter) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -567,7 +570,7 @@ func (cpe ChangePeerV2Enter) CheckInProgress(_ ClusterInformer, region *core.Reg } // Influence calculates the store difference that current step makes. -func (cpe ChangePeerV2Enter) Influence(opInfluence OpInfluence, region *core.RegionInfo) {} +func (cpe ChangePeerV2Enter) Influence(_ OpInfluence, _ *core.RegionInfo) {} // GetRequest get the ChangePeerV2 request func (cpe ChangePeerV2Enter) GetRequest() *pdpb.ChangePeerV2 { @@ -597,9 +600,10 @@ func (cpe ChangePeerV2Enter) GetRequest() *pdpb.ChangePeerV2 { } } -// TimeOut returns true if the step is timeout. +// Timeout returns true if the step is timeout. func (cpe ChangePeerV2Enter) Timeout(start time.Time) bool { - return time.Since(start) > FastOperatorWaitTime + count := uint64(len(cpe.PromoteLearners)+len(cpe.DemoteVoters)) + 1 + return time.Since(start) > FastOperatorWaitTime*time.Duration(count) } // ChangePeerV2Leave is an OpStep that leaves the joint state. @@ -719,11 +723,12 @@ func (cpl ChangePeerV2Leave) CheckInProgress(_ ClusterInformer, region *core.Reg } // Influence calculates the store difference that current step makes. -func (cpl ChangePeerV2Leave) Influence(opInfluence OpInfluence, region *core.RegionInfo) {} +func (cpl ChangePeerV2Leave) Influence(_ OpInfluence, _ *core.RegionInfo) {} -// TimeOut returns true if the step is timeout. +// Timeout returns true if the step is timeout. func (cpl ChangePeerV2Leave) Timeout(start time.Time) bool { - return time.Since(start) > FastOperatorWaitTime + count := uint64(len(cpl.PromoteLearners)+len(cpl.DemoteVoters)) + 1 + return time.Since(start) > FastOperatorWaitTime*time.Duration(count) } func validateStore(ci ClusterInformer, id uint64) error { @@ -736,3 +741,12 @@ func validateStore(ci ClusterInformer, id uint64) error { } return nil } + +func maxWaitTime(executorRate float64, regionSize int64) time.Duration { + seconds := int64(math.Ceil(executorRate)) * regionSize + wait := time.Duration(seconds) * time.Second + if wait < SlowOperatorWaitTime { + wait = SlowOperatorWaitTime + } + return wait +} From 2a10af6549eab38efdf271b1a5c52ade30dd609b Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Tue, 25 Jan 2022 17:21:14 +0800 Subject: [PATCH 13/25] dead lock Signed-off-by: bufferflies <1045931706@qq.com> --- server/api/scheduler.go | 1 + server/cluster/coordinator.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/server/api/scheduler.go b/server/api/scheduler.go index 019f1d3982d..9359b81cf94 100644 --- a/server/api/scheduler.go +++ b/server/api/scheduler.go @@ -239,6 +239,7 @@ func (h *schedulerHandler) addEvictOrGrant(w http.ResponseWriter, input map[stri h.r.JSON(w, http.StatusBadRequest, "missing store id") return } + log.Info("update scheduler", zap.String("scheduler-name", name), zap.Uint64("store-id", uint64(storeID))) if exist, err := h.Handler.IsSchedulerExisted(name); !exist { if err != nil && !errors.ErrorEqual(err, errs.ErrSchedulerNotFound.FastGenByArgs()) { h.r.JSON(w, http.StatusInternalServerError, err.Error()) diff --git a/server/cluster/coordinator.go b/server/cluster/coordinator.go index bb26805a818..1f0d757d852 100644 --- a/server/cluster/coordinator.go +++ b/server/cluster/coordinator.go @@ -655,7 +655,7 @@ func (c *coordinator) removeScheduler(name string) error { s.Stop() schedulerStatusGauge.WithLabelValues(name, "allow").Set(0) delete(c.schedulers, name) - + time.Sleep(time.Second) return nil } From 99488c0c44699d7d91459afd74c7666aa5f9616b Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Tue, 8 Feb 2022 15:43:51 +0800 Subject: [PATCH 14/25] remove sleep Signed-off-by: bufferflies <1045931706@qq.com> --- server/cluster/coordinator.go | 1 - 1 file changed, 1 deletion(-) diff --git a/server/cluster/coordinator.go b/server/cluster/coordinator.go index 1f0d757d852..ddfc705882f 100644 --- a/server/cluster/coordinator.go +++ b/server/cluster/coordinator.go @@ -655,7 +655,6 @@ func (c *coordinator) removeScheduler(name string) error { s.Stop() schedulerStatusGauge.WithLabelValues(name, "allow").Set(0) delete(c.schedulers, name) - time.Sleep(time.Second) return nil } From 685b900dd320e27b75b44723cdec67977c53b0be Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Thu, 10 Feb 2022 14:37:27 +0800 Subject: [PATCH 15/25] merge timeout Signed-off-by: bufferflies <1045931706@qq.com> --- server/schedule/operator/step.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/schedule/operator/step.go b/server/schedule/operator/step.go index 7705ef94718..6d73a975e74 100644 --- a/server/schedule/operator/step.go +++ b/server/schedule/operator/step.go @@ -382,7 +382,7 @@ func (mr MergeRegion) Influence(opInfluence OpInfluence, region *core.RegionInfo // Timeout returns true if the step is timeout. func (mr MergeRegion) Timeout(start time.Time) bool { - return time.Since(start) > FastOperatorWaitTime + return time.Since(start) > FastOperatorWaitTime*20 } // SplitRegion is an OpStep that splits a region. From bfe9788df4dc7c107ef9817f40192020cfb0406c Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Fri, 11 Feb 2022 14:57:35 +0800 Subject: [PATCH 16/25] remove config changes Signed-off-by: bufferflies <1045931706@qq.com> --- server/api/scheduler.go | 1 - server/cluster/coordinator.go | 1 + server/config/config.go | 9 --------- server/config/persist_options.go | 5 ----- server/schedule/checker/merge_checker.go | 11 ++--------- 5 files changed, 3 insertions(+), 24 deletions(-) diff --git a/server/api/scheduler.go b/server/api/scheduler.go index 9359b81cf94..019f1d3982d 100644 --- a/server/api/scheduler.go +++ b/server/api/scheduler.go @@ -239,7 +239,6 @@ func (h *schedulerHandler) addEvictOrGrant(w http.ResponseWriter, input map[stri h.r.JSON(w, http.StatusBadRequest, "missing store id") return } - log.Info("update scheduler", zap.String("scheduler-name", name), zap.Uint64("store-id", uint64(storeID))) if exist, err := h.Handler.IsSchedulerExisted(name); !exist { if err != nil && !errors.ErrorEqual(err, errs.ErrSchedulerNotFound.FastGenByArgs()) { h.r.JSON(w, http.StatusInternalServerError, err.Error()) diff --git a/server/cluster/coordinator.go b/server/cluster/coordinator.go index ddfc705882f..bb26805a818 100644 --- a/server/cluster/coordinator.go +++ b/server/cluster/coordinator.go @@ -655,6 +655,7 @@ func (c *coordinator) removeScheduler(name string) error { s.Stop() schedulerStatusGauge.WithLabelValues(name, "allow").Set(0) delete(c.schedulers, name) + return nil } diff --git a/server/config/config.go b/server/config/config.go index 6fc78134074..6a088e210ea 100644 --- a/server/config/config.go +++ b/server/config/config.go @@ -742,11 +742,6 @@ type ScheduleConfig struct { // The day of hot regions data to be reserved. 0 means close. HotRegionsReservedDays uint64 `toml:"hot-regions-reserved-days" json:"hot-regions-reserved-days"` - - // MaxRegionSize is the max size of region. - // It's dangerous to change it. - // Default: 96MB - MaxRegionSize uint64 `toml:"max-region-size"` } // Clone returns a cloned scheduling configuration. @@ -794,7 +789,6 @@ const ( defaultEnableCrossTableMerge = true defaultHotRegionsWriteInterval = 10 * time.Minute defaultHotRegionsReservedDays = 7 - defaultMaxRegionSize = 96 ) func (c *ScheduleConfig) adjust(meta *configMetaData, reloading bool) error { @@ -850,9 +844,6 @@ func (c *ScheduleConfig) adjust(meta *configMetaData, reloading bool) error { if !meta.IsDefined("enable-cross-table-merge") { c.EnableCrossTableMerge = defaultEnableCrossTableMerge } - if !meta.IsDefined("max-region-size") { - adjustUint64(&c.MaxRegionSize, defaultMaxRegionSize) - } adjustFloat64(&c.LowSpaceRatio, defaultLowSpaceRatio) adjustFloat64(&c.HighSpaceRatio, defaultHighSpaceRatio) diff --git a/server/config/persist_options.go b/server/config/persist_options.go index 6a1ab04d125..a8672b6712e 100644 --- a/server/config/persist_options.go +++ b/server/config/persist_options.go @@ -246,11 +246,6 @@ func (o *PersistOptions) GetSplitMergeInterval() time.Duration { return o.GetScheduleConfig().SplitMergeInterval.Duration } -// GetMaxRegionSize returns the max size of regions. -func (o *PersistOptions) GetMaxRegionSize() uint64 { - return o.GetScheduleConfig().MaxRegionSize -} - // SetSplitMergeInterval to set the interval between finishing split and starting to merge. It's only used to test. func (o *PersistOptions) SetSplitMergeInterval(splitMergeInterval time.Duration) { v := o.GetScheduleConfig().Clone() diff --git a/server/schedule/checker/merge_checker.go b/server/schedule/checker/merge_checker.go index 681f09ef6dc..932f285ad24 100644 --- a/server/schedule/checker/merge_checker.go +++ b/server/schedule/checker/merge_checker.go @@ -32,10 +32,7 @@ import ( "github.com/tikv/pd/server/schedule/placement" ) -const ( - maxTargetRegionSize = 500 - maxRegionSizeFactor = 5 -) +const maxTargetRegionSize = 500 // When a region has label `merge_option=deny`, skip merging the region. // If label value is `allow` or other value, it will be treated as `allow`. @@ -148,11 +145,7 @@ func (m *MergeChecker) Check(region *core.RegionInfo) []*operator.Operator { return nil } - maxSize := int64(maxRegionSizeFactor * m.cluster.GetOpts().GetMaxRegionSize()) - if maxSize < maxTargetRegionSize { - maxSize = maxTargetRegionSize - } - if target.GetApproximateSize() > maxSize { + if target.GetApproximateSize() > maxTargetRegionSize { checkerCounter.WithLabelValues("merge_checker", "target-too-large").Inc() return nil } From be1c56bbbc348edef018f9876f3b0831654e3102 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Fri, 11 Feb 2022 18:28:05 +0800 Subject: [PATCH 17/25] update deadlock Signed-off-by: bufferflies <1045931706@qq.com> --- go.mod | 2 +- go.sum | 5 +++-- server/schedule/operator/operator.go | 8 +++----- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/go.mod b/go.mod index 16059688dee..67e15f7b492 100644 --- a/go.mod +++ b/go.mod @@ -36,7 +36,7 @@ require ( github.com/pingcap/tidb-dashboard v0.0.0-20220117082709-e8076b5c79ba github.com/prometheus/client_golang v1.1.0 github.com/prometheus/common v0.6.0 - github.com/sasha-s/go-deadlock v0.2.0 + github.com/sasha-s/go-deadlock v0.3.1 github.com/spf13/cobra v1.0.0 github.com/spf13/pflag v1.0.5 github.com/swaggo/http-swagger v0.0.0-20200308142732-58ac5e232fba diff --git a/go.sum b/go.sum index e116a9eb66c..d338b5231f1 100644 --- a/go.sum +++ b/go.sum @@ -383,6 +383,7 @@ github.com/opentracing/opentracing-go v1.1.0 h1:pWlfV3Bxv7k65HYwkikxat0+s3pV4bsq github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/pascaldekloe/name v0.0.0-20180628100202-0fd16699aae1/go.mod h1:eD5JxqMiuNYyFNmyY9rkJ/slN8y59oEu4Ei7F8OoKWQ= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= +github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5/go.mod h1:jvVRKCrJTQWu0XVbaOlby/2lO20uSCHEMzzplHXte1o= github.com/petermattis/goid v0.0.0-20211229010228-4d14c490ee36 h1:64bxqeTEN0/xoEqhKGowgihNuzISS9rEG6YUMU4bzJo= github.com/petermattis/goid v0.0.0-20211229010228-4d14c490ee36/go.mod h1:pxMtw7cyUw6B2bRH0ZBANSPg+AoSud1I1iyJHI69jH4= github.com/phf/go-queue v0.0.0-20170504031614-9abe38d0371d h1:U+PMnTlV2tu7RuMK5etusZG3Cf+rpow5hqQByeCzJ2g= @@ -455,8 +456,8 @@ github.com/rs/zerolog v1.15.0/go.mod h1:xYTKnLHcpfU2225ny5qZjxnj9NvkumZYjJHlAThC github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sanity-io/litter v1.2.0/go.mod h1:JF6pZUFgu2Q0sBZ+HSV35P8TVPI1TTzEwyu9FXAw2W4= -github.com/sasha-s/go-deadlock v0.2.0 h1:lMqc+fUb7RrFS3gQLtoQsJ7/6TV/pAIFvBsqX73DK8Y= -github.com/sasha-s/go-deadlock v0.2.0/go.mod h1:StQn567HiB1fF2yJ44N9au7wOhrPS3iZqiDbRupzT10= +github.com/sasha-s/go-deadlock v0.3.1 h1:sqv7fDNShgjcaxkO0JNcOAlr8B9+cV5Ey/OB71efZx0= +github.com/sasha-s/go-deadlock v0.3.1/go.mod h1:F73l+cr82YSh10GxyRI6qZiCgK64VaZjwesgfQ1/iLM= github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= github.com/sergi/go-diff v1.0.1-0.20180205163309-da645544ed44 h1:tB9NOR21++IjLyVx3/PCPhWMwqGNCMQEH96A6dMZ/gc= github.com/sergi/go-diff v1.0.1-0.20180205163309-da645544ed44/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= diff --git a/server/schedule/operator/operator.go b/server/schedule/operator/operator.go index cbd42e1a756..fc97001abde 100644 --- a/server/schedule/operator/operator.go +++ b/server/schedule/operator/operator.go @@ -225,14 +225,12 @@ func (o *Operator) CheckExpired() bool { // CheckTimeout checks if the operator is timeout, and update the status. func (o *Operator) CheckTimeout() bool { - if o.CheckSuccess() { + if o.CheckSuccess() || len(o.steps) == 0 { return false } currentStep := int(atomic.LoadInt32(&o.currentStep)) - var startTime time.Time - if currentStep == 0 { - startTime = o.GetStartTime() - } else { + startTime := o.GetStartTime() + if currentStep > 0 { startTime = time.Unix(0, atomic.LoadInt64(&(o.stepsTime[currentStep-1]))) } step := o.steps[currentStep] From 8395b99ccff3639342dd7cb0b6c3b572d8bf2ffe Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Fri, 11 Feb 2022 19:33:52 +0800 Subject: [PATCH 18/25] remove prepare lock Signed-off-by: bufferflies <1045931706@qq.com> --- go.mod | 2 +- go.sum | 5 ++- server/core/store.go | 54 ++++++++++++++++--------------- server/schedulers/grant_leader.go | 3 +- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/go.mod b/go.mod index 67e15f7b492..16059688dee 100644 --- a/go.mod +++ b/go.mod @@ -36,7 +36,7 @@ require ( github.com/pingcap/tidb-dashboard v0.0.0-20220117082709-e8076b5c79ba github.com/prometheus/client_golang v1.1.0 github.com/prometheus/common v0.6.0 - github.com/sasha-s/go-deadlock v0.3.1 + github.com/sasha-s/go-deadlock v0.2.0 github.com/spf13/cobra v1.0.0 github.com/spf13/pflag v1.0.5 github.com/swaggo/http-swagger v0.0.0-20200308142732-58ac5e232fba diff --git a/go.sum b/go.sum index d338b5231f1..e116a9eb66c 100644 --- a/go.sum +++ b/go.sum @@ -383,7 +383,6 @@ github.com/opentracing/opentracing-go v1.1.0 h1:pWlfV3Bxv7k65HYwkikxat0+s3pV4bsq github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/pascaldekloe/name v0.0.0-20180628100202-0fd16699aae1/go.mod h1:eD5JxqMiuNYyFNmyY9rkJ/slN8y59oEu4Ei7F8OoKWQ= github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= -github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5/go.mod h1:jvVRKCrJTQWu0XVbaOlby/2lO20uSCHEMzzplHXte1o= github.com/petermattis/goid v0.0.0-20211229010228-4d14c490ee36 h1:64bxqeTEN0/xoEqhKGowgihNuzISS9rEG6YUMU4bzJo= github.com/petermattis/goid v0.0.0-20211229010228-4d14c490ee36/go.mod h1:pxMtw7cyUw6B2bRH0ZBANSPg+AoSud1I1iyJHI69jH4= github.com/phf/go-queue v0.0.0-20170504031614-9abe38d0371d h1:U+PMnTlV2tu7RuMK5etusZG3Cf+rpow5hqQByeCzJ2g= @@ -456,8 +455,8 @@ github.com/rs/zerolog v1.15.0/go.mod h1:xYTKnLHcpfU2225ny5qZjxnj9NvkumZYjJHlAThC github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sanity-io/litter v1.2.0/go.mod h1:JF6pZUFgu2Q0sBZ+HSV35P8TVPI1TTzEwyu9FXAw2W4= -github.com/sasha-s/go-deadlock v0.3.1 h1:sqv7fDNShgjcaxkO0JNcOAlr8B9+cV5Ey/OB71efZx0= -github.com/sasha-s/go-deadlock v0.3.1/go.mod h1:F73l+cr82YSh10GxyRI6qZiCgK64VaZjwesgfQ1/iLM= +github.com/sasha-s/go-deadlock v0.2.0 h1:lMqc+fUb7RrFS3gQLtoQsJ7/6TV/pAIFvBsqX73DK8Y= +github.com/sasha-s/go-deadlock v0.2.0/go.mod h1:StQn567HiB1fF2yJ44N9au7wOhrPS3iZqiDbRupzT10= github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= github.com/sergi/go-diff v1.0.1-0.20180205163309-da645544ed44 h1:tB9NOR21++IjLyVx3/PCPhWMwqGNCMQEH96A6dMZ/gc= github.com/sergi/go-diff v1.0.1-0.20180205163309-da645544ed44/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= diff --git a/server/core/store.go b/server/core/store.go index a24a510255c..703f37d4ea9 100644 --- a/server/core/store.go +++ b/server/core/store.go @@ -84,19 +84,20 @@ func NewStoreInfo(store *metapb.Store, opts ...StoreCreateOption) *StoreInfo { func (s *StoreInfo) Clone(opts ...StoreCreateOption) *StoreInfo { meta := proto.Clone(s.meta).(*metapb.Store) store := &StoreInfo{ - meta: meta, - storeStats: s.storeStats, - pauseLeaderTransfer: s.pauseLeaderTransfer, - slowStoreEvicted: s.slowStoreEvicted, - leaderCount: s.leaderCount, - regionCount: s.regionCount, - leaderSize: s.leaderSize, - regionSize: s.regionSize, - pendingPeerCount: s.pendingPeerCount, - lastPersistTime: s.lastPersistTime, - leaderWeight: s.leaderWeight, - regionWeight: s.regionWeight, - limiter: s.limiter, + meta: meta, + storeStats: s.storeStats, + pauseLeaderTransfer: s.pauseLeaderTransfer, + slowStoreEvicted: s.slowStoreEvicted, + leaderCount: s.leaderCount, + regionCount: s.regionCount, + leaderSize: s.leaderSize, + regionSize: s.regionSize, + pendingPeerCount: s.pendingPeerCount, + lastPersistTime: s.lastPersistTime, + leaderWeight: s.leaderWeight, + regionWeight: s.regionWeight, + limiter: s.limiter, + operatorExecutorRate: s.operatorExecutorRate, } for _, opt := range opts { @@ -108,19 +109,20 @@ func (s *StoreInfo) Clone(opts ...StoreCreateOption) *StoreInfo { // ShallowClone creates a copy of current StoreInfo, but not clone 'meta'. func (s *StoreInfo) ShallowClone(opts ...StoreCreateOption) *StoreInfo { store := &StoreInfo{ - meta: s.meta, - storeStats: s.storeStats, - pauseLeaderTransfer: s.pauseLeaderTransfer, - slowStoreEvicted: s.slowStoreEvicted, - leaderCount: s.leaderCount, - regionCount: s.regionCount, - leaderSize: s.leaderSize, - regionSize: s.regionSize, - pendingPeerCount: s.pendingPeerCount, - lastPersistTime: s.lastPersistTime, - leaderWeight: s.leaderWeight, - regionWeight: s.regionWeight, - limiter: s.limiter, + meta: s.meta, + storeStats: s.storeStats, + pauseLeaderTransfer: s.pauseLeaderTransfer, + slowStoreEvicted: s.slowStoreEvicted, + leaderCount: s.leaderCount, + regionCount: s.regionCount, + leaderSize: s.leaderSize, + regionSize: s.regionSize, + pendingPeerCount: s.pendingPeerCount, + lastPersistTime: s.lastPersistTime, + leaderWeight: s.leaderWeight, + regionWeight: s.regionWeight, + limiter: s.limiter, + operatorExecutorRate: s.operatorExecutorRate, } for _, opt := range opts { diff --git a/server/schedulers/grant_leader.go b/server/schedulers/grant_leader.go index 245d0fa6c10..5bb90b80a71 100644 --- a/server/schedulers/grant_leader.go +++ b/server/schedulers/grant_leader.go @@ -198,9 +198,8 @@ func (s *grantLeaderScheduler) EncodeConfig() ([]byte, error) { return schedule.EncodeConfig(s.conf) } +// Prepare run it after adding new scheduler. func (s *grantLeaderScheduler) Prepare(cluster schedule.Cluster) error { - s.conf.mu.RLock() - defer s.conf.mu.RUnlock() var res error for id := range s.conf.StoreIDWithRanges { if err := cluster.PauseLeaderTransfer(id); err != nil { From 22dfc3f27437d1ecf5e331178b072ca5355c9cc4 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Tue, 15 Feb 2022 14:50:27 +0800 Subject: [PATCH 19/25] move executor rate to operator control Signed-off-by: bufferflies <1045931706@qq.com> --- server/core/store.go | 97 ++++++++++------------ server/schedule/operator/operator.go | 12 ++- server/schedule/operator/operator_test.go | 2 +- server/schedule/operator/status_tracker.go | 4 +- server/schedule/operator/step.go | 26 +++--- server/schedule/operator_controller.go | 7 +- 6 files changed, 75 insertions(+), 73 deletions(-) diff --git a/server/core/store.go b/server/core/store.go index 703f37d4ea9..c01ff2a9c6e 100644 --- a/server/core/store.go +++ b/server/core/store.go @@ -42,37 +42,33 @@ const ( EngineTiFlash = "tiflash" // EngineTiKV indicates the tikv engine in metrics EngineTiKV = "tikv" - // OperatorExecutorRate is the rate of the store operator executor. - OperatorExecutorRate = 6.0 ) // StoreInfo contains information about a store. type StoreInfo struct { meta *metapb.Store *storeStats - pauseLeaderTransfer bool // not allow to be used as source or target of transfer leader - slowStoreEvicted bool // this store has been evicted as a slow store, should not transfer leader to it - leaderCount int - regionCount int - leaderSize int64 - regionSize int64 - pendingPeerCount int - lastPersistTime time.Time - leaderWeight float64 - regionWeight float64 - limiter map[storelimit.Type]*storelimit.StoreLimit - operatorExecutorRate float64 + pauseLeaderTransfer bool // not allow to be used as source or target of transfer leader + slowStoreEvicted bool // this store has been evicted as a slow store, should not transfer leader to it + leaderCount int + regionCount int + leaderSize int64 + regionSize int64 + pendingPeerCount int + lastPersistTime time.Time + leaderWeight float64 + regionWeight float64 + limiter map[storelimit.Type]*storelimit.StoreLimit } // NewStoreInfo creates StoreInfo with meta data. func NewStoreInfo(store *metapb.Store, opts ...StoreCreateOption) *StoreInfo { storeInfo := &StoreInfo{ - meta: store, - storeStats: newStoreStats(), - leaderWeight: 1.0, - regionWeight: 1.0, - limiter: make(map[storelimit.Type]*storelimit.StoreLimit), - operatorExecutorRate: OperatorExecutorRate, + meta: store, + storeStats: newStoreStats(), + leaderWeight: 1.0, + regionWeight: 1.0, + limiter: make(map[storelimit.Type]*storelimit.StoreLimit), } for _, opt := range opts { opt(storeInfo) @@ -84,20 +80,19 @@ func NewStoreInfo(store *metapb.Store, opts ...StoreCreateOption) *StoreInfo { func (s *StoreInfo) Clone(opts ...StoreCreateOption) *StoreInfo { meta := proto.Clone(s.meta).(*metapb.Store) store := &StoreInfo{ - meta: meta, - storeStats: s.storeStats, - pauseLeaderTransfer: s.pauseLeaderTransfer, - slowStoreEvicted: s.slowStoreEvicted, - leaderCount: s.leaderCount, - regionCount: s.regionCount, - leaderSize: s.leaderSize, - regionSize: s.regionSize, - pendingPeerCount: s.pendingPeerCount, - lastPersistTime: s.lastPersistTime, - leaderWeight: s.leaderWeight, - regionWeight: s.regionWeight, - limiter: s.limiter, - operatorExecutorRate: s.operatorExecutorRate, + meta: meta, + storeStats: s.storeStats, + pauseLeaderTransfer: s.pauseLeaderTransfer, + slowStoreEvicted: s.slowStoreEvicted, + leaderCount: s.leaderCount, + regionCount: s.regionCount, + leaderSize: s.leaderSize, + regionSize: s.regionSize, + pendingPeerCount: s.pendingPeerCount, + lastPersistTime: s.lastPersistTime, + leaderWeight: s.leaderWeight, + regionWeight: s.regionWeight, + limiter: s.limiter, } for _, opt := range opts { @@ -109,20 +104,19 @@ func (s *StoreInfo) Clone(opts ...StoreCreateOption) *StoreInfo { // ShallowClone creates a copy of current StoreInfo, but not clone 'meta'. func (s *StoreInfo) ShallowClone(opts ...StoreCreateOption) *StoreInfo { store := &StoreInfo{ - meta: s.meta, - storeStats: s.storeStats, - pauseLeaderTransfer: s.pauseLeaderTransfer, - slowStoreEvicted: s.slowStoreEvicted, - leaderCount: s.leaderCount, - regionCount: s.regionCount, - leaderSize: s.leaderSize, - regionSize: s.regionSize, - pendingPeerCount: s.pendingPeerCount, - lastPersistTime: s.lastPersistTime, - leaderWeight: s.leaderWeight, - regionWeight: s.regionWeight, - limiter: s.limiter, - operatorExecutorRate: s.operatorExecutorRate, + meta: s.meta, + storeStats: s.storeStats, + pauseLeaderTransfer: s.pauseLeaderTransfer, + slowStoreEvicted: s.slowStoreEvicted, + leaderCount: s.leaderCount, + regionCount: s.regionCount, + leaderSize: s.leaderSize, + regionSize: s.regionSize, + pendingPeerCount: s.pendingPeerCount, + lastPersistTime: s.lastPersistTime, + leaderWeight: s.leaderWeight, + regionWeight: s.regionWeight, + limiter: s.limiter, } for _, opt := range opts { @@ -273,13 +267,6 @@ func (s *StoreInfo) GetStoreLimit(limitType storelimit.Type) *storelimit.StoreLi return s.limiter[limitType] } -// GetOperatorExecutorRate returns the operator executor rate of the store. -func (s *StoreInfo) GetOperatorExecutorRate() float64 { - s.mu.RLock() - defer s.mu.RUnlock() - return s.operatorExecutorRate -} - const minWeight = 1e-6 const maxScore = 1024 * 1024 * 1024 diff --git a/server/schedule/operator/operator.go b/server/schedule/operator/operator.go index fc97001abde..4ed70e279da 100644 --- a/server/schedule/operator/operator.go +++ b/server/schedule/operator/operator.go @@ -37,6 +37,9 @@ const ( // SlowOperatorWaitTime is the duration that when an operator marked `OpRegion` // runs longer than it, the operator will be considered timeout. SlowOperatorWaitTime = 10 * time.Minute + // OperatorExecutorRate is the rate of the operator executor. + // default: 6 s/Mb + OperatorExecutorRate = 6 ) // Operator contains execution steps generated by scheduler. @@ -55,6 +58,7 @@ type Operator struct { FinishedCounters []prometheus.Counter AdditionalInfos map[string]string ApproximateSize int64 + ExecutorRate float64 } // NewOperator creates a new operator. @@ -75,6 +79,7 @@ func NewOperator(desc, brief string, regionID uint64, regionEpoch *metapb.Region level: level, AdditionalInfos: make(map[string]string), ApproximateSize: approximateSize, + ExecutorRate: OperatorExecutorRate, } } @@ -110,6 +115,11 @@ func (o *Operator) SetDesc(desc string) { o.desc = desc } +// SetExecutorRate sets the executor rate of the operator. +func (o *Operator) SetExecutorRate(rate float64) { + o.ExecutorRate = rate +} + // AttachKind attaches an operator kind for the operator. func (o *Operator) AttachKind(kind OpKind) { o.kind |= kind @@ -234,7 +244,7 @@ func (o *Operator) CheckTimeout() bool { startTime = time.Unix(0, atomic.LoadInt64(&(o.stepsTime[currentStep-1]))) } step := o.steps[currentStep] - return o.status.CheckStepTimeout(startTime, step) + return o.status.CheckStepTimeout(startTime, step, o.ExecutorRate) } // Len returns the operator's steps count. diff --git a/server/schedule/operator/operator_test.go b/server/schedule/operator/operator_test.go index 2832ae8f70c..2dde4320fb6 100644 --- a/server/schedule/operator/operator_test.go +++ b/server/schedule/operator/operator_test.go @@ -467,7 +467,7 @@ func (s *testOperatorSuite) TestOpStepTimeout(c *C) { } for _, v := range testdata { for _, step := range v.step { - c.Assert(v.expect, Equals, step.Timeout(v.start)) + c.Assert(v.expect, Equals, step.Timeout(v.start, 6.0)) } } } diff --git a/server/schedule/operator/status_tracker.go b/server/schedule/operator/status_tracker.go index 914b87e9185..6678de60ca5 100644 --- a/server/schedule/operator/status_tracker.go +++ b/server/schedule/operator/status_tracker.go @@ -129,11 +129,11 @@ func (trk *OpStatusTracker) CheckTimeout(wait time.Duration) bool { } // CheckStepTimeout checks if timeout, and update the current status. -func (trk *OpStatusTracker) CheckStepTimeout(start time.Time, step OpStep) bool { +func (trk *OpStatusTracker) CheckStepTimeout(start time.Time, step OpStep, executerRate float64) bool { trk.rw.Lock() defer trk.rw.Unlock() if trk.current == STARTED { - if !step.Timeout(start) { + if !step.Timeout(start, executerRate) { return false } _ = trk.toLocked(TIMEOUT) diff --git a/server/schedule/operator/step.go b/server/schedule/operator/step.go index 6d73a975e74..af3307a96cf 100644 --- a/server/schedule/operator/step.go +++ b/server/schedule/operator/step.go @@ -39,7 +39,7 @@ type OpStep interface { IsFinish(region *core.RegionInfo) bool CheckInProgress(ci ClusterInformer, region *core.RegionInfo) error Influence(opInfluence OpInfluence, region *core.RegionInfo) - Timeout(start time.Time) bool + Timeout(start time.Time, executorRate float64) bool } // TransferLeader is an OpStep that transfers a region's leader. @@ -103,7 +103,7 @@ func (tl TransferLeader) Influence(opInfluence OpInfluence, region *core.RegionI } // Timeout returns true if the step is timeout. -func (tl TransferLeader) Timeout(start time.Time) bool { +func (tl TransferLeader) Timeout(start time.Time, _ float64) bool { return time.Since(start) > FastOperatorWaitTime } @@ -163,8 +163,8 @@ func (ap AddPeer) CheckInProgress(ci ClusterInformer, region *core.RegionInfo) e } // Timeout returns true if the step is timeout. -func (ap AddPeer) Timeout(start time.Time) bool { - return time.Since(start) > maxWaitTime(ap.executorRate, ap.regionSize) +func (ap AddPeer) Timeout(start time.Time, executorRate float64) bool { + return time.Since(start) > maxWaitTime(executorRate, ap.regionSize) } // AddLearner is an OpStep that adds a region learner peer. @@ -229,8 +229,8 @@ func (al AddLearner) Influence(opInfluence OpInfluence, region *core.RegionInfo) } // Timeout returns true if the step is timeout. -func (al AddLearner) Timeout(start time.Time) bool { - return time.Since(start) > maxWaitTime(al.executorRate, al.regionSize) +func (al AddLearner) Timeout(start time.Time, executorRate float64) bool { + return time.Since(start) > maxWaitTime(executorRate, al.regionSize) } // PromoteLearner is an OpStep that promotes a region learner peer to normal voter. @@ -272,7 +272,7 @@ func (pl PromoteLearner) CheckInProgress(_ ClusterInformer, region *core.RegionI func (pl PromoteLearner) Influence(_ OpInfluence, _ *core.RegionInfo) {} // Timeout returns true if the step is timeout. -func (pl PromoteLearner) Timeout(start time.Time) bool { +func (pl PromoteLearner) Timeout(start time.Time, _ float64) bool { return time.Since(start) > FastOperatorWaitTime } @@ -328,7 +328,7 @@ func (rp RemovePeer) Influence(opInfluence OpInfluence, region *core.RegionInfo) } // Timeout returns true if the step is timeout. -func (rp RemovePeer) Timeout(start time.Time) bool { +func (rp RemovePeer) Timeout(start time.Time, _ float64) bool { return time.Since(start) > FastOperatorWaitTime } @@ -381,7 +381,7 @@ func (mr MergeRegion) Influence(opInfluence OpInfluence, region *core.RegionInfo } // Timeout returns true if the step is timeout. -func (mr MergeRegion) Timeout(start time.Time) bool { +func (mr MergeRegion) Timeout(start time.Time, _ float64) bool { return time.Since(start) > FastOperatorWaitTime*20 } @@ -423,7 +423,7 @@ func (sr SplitRegion) CheckInProgress(_ ClusterInformer, _ *core.RegionInfo) err } // Timeout returns true if the step is timeout. -func (sr SplitRegion) Timeout(start time.Time) bool { +func (sr SplitRegion) Timeout(start time.Time, _ float64) bool { return time.Since(start) > FastOperatorWaitTime } @@ -455,7 +455,7 @@ func (dv DemoteVoter) IsFinish(region *core.RegionInfo) bool { } // Timeout returns true if the step is timeout. -func (dv DemoteVoter) Timeout(start time.Time) bool { +func (dv DemoteVoter) Timeout(start time.Time, _ float64) bool { return time.Since(start) > FastOperatorWaitTime } @@ -601,7 +601,7 @@ func (cpe ChangePeerV2Enter) GetRequest() *pdpb.ChangePeerV2 { } // Timeout returns true if the step is timeout. -func (cpe ChangePeerV2Enter) Timeout(start time.Time) bool { +func (cpe ChangePeerV2Enter) Timeout(start time.Time, _ float64) bool { count := uint64(len(cpe.PromoteLearners)+len(cpe.DemoteVoters)) + 1 return time.Since(start) > FastOperatorWaitTime*time.Duration(count) } @@ -726,7 +726,7 @@ func (cpl ChangePeerV2Leave) CheckInProgress(_ ClusterInformer, region *core.Reg func (cpl ChangePeerV2Leave) Influence(_ OpInfluence, _ *core.RegionInfo) {} // Timeout returns true if the step is timeout. -func (cpl ChangePeerV2Leave) Timeout(start time.Time) bool { +func (cpl ChangePeerV2Leave) Timeout(start time.Time, _ float64) bool { count := uint64(len(cpl.PromoteLearners)+len(cpl.DemoteVoters)) + 1 return time.Since(start) > FastOperatorWaitTime*time.Duration(count) } diff --git a/server/schedule/operator_controller.go b/server/schedule/operator_controller.go index 968ed80fa07..d66afbdae98 100644 --- a/server/schedule/operator_controller.go +++ b/server/schedule/operator_controller.go @@ -54,6 +54,9 @@ var ( StoreBalanceBaseTime float64 = 60 // FastOperatorFinishTime min finish time, if finish duration less than it,op will be pushed to fast operator queue FastOperatorFinishTime = 10 * time.Second + // DefaultOperatorExecutorRate executor rate of the operator. + // unit: s/MB + DefaultOperatorExecutorRate = 6.0 ) // OperatorController is used to limit the speed of scheduling. @@ -70,6 +73,7 @@ type OperatorController struct { wop WaitingOperator wopStatus *WaitingOperatorStatus opNotifierQueue operatorQueue + executerRate float64 } // NewOperatorController creates a OperatorController. @@ -86,6 +90,7 @@ func NewOperatorController(ctx context.Context, cluster Cluster, hbStreams *hbst wop: NewRandBuckets(), wopStatus: NewWaitingOperatorStatus(), opNotifierQueue: make(operatorQueue, 0), + executerRate: DefaultOperatorExecutorRate, } } @@ -432,7 +437,7 @@ func isHigherPriorityOperator(new, old *operator.Operator) bool { func (oc *OperatorController) addOperatorLocked(op *operator.Operator) bool { regionID := op.RegionID() - + op.SetExecutorRate(oc.executerRate) log.Info("add operator", zap.Uint64("region-id", regionID), zap.Reflect("operator", op), From e696959a8f1afba922a01fa99afa1f9ecf0b6527 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Tue, 15 Feb 2022 15:25:52 +0800 Subject: [PATCH 20/25] operator remove executor rate Signed-off-by: bufferflies <1045931706@qq.com> --- server/schedule/operator/builder.go | 19 +++++++------------ server/schedule/operator/operator.go | 8 ++++---- server/schedule/operator/operator_test.go | 10 +++++----- server/schedule/operator/status_tracker.go | 4 ++-- .../schedule/operator/status_tracker_test.go | 2 +- server/schedule/operator/step.go | 2 -- 6 files changed, 19 insertions(+), 26 deletions(-) diff --git a/server/schedule/operator/builder.go b/server/schedule/operator/builder.go index 538336ce3a6..4517d599c2a 100644 --- a/server/schedule/operator/builder.go +++ b/server/schedule/operator/builder.go @@ -48,13 +48,12 @@ type ClusterInformer interface { type Builder struct { // basic info ClusterInformer - desc string - regionID uint64 - regionEpoch *metapb.RegionEpoch - rules []*placement.Rule - expectedRoles map[uint64]placement.PeerRoleType - approximateSize int64 - operatorExecutorRate float64 + desc string + regionID uint64 + regionEpoch *metapb.RegionEpoch + rules []*placement.Rule + expectedRoles map[uint64]placement.PeerRoleType + approximateSize int64 // operation record originPeers peersMap @@ -161,9 +160,6 @@ func NewBuilder(desc string, ci ClusterInformer, region *core.RegionInfo, opts . b.targetPeers = originPeers.Copy() b.useJointConsensus = supportJointConsensus && b.GetOpts().IsUseJointConsensus() b.err = err - if originLeaderStore := b.GetBasicCluster().GetStore(b.originLeaderStoreID); originLeaderStore != nil { - b.operatorExecutorRate = originLeaderStore.GetOperatorExecutorRate() - } return b } @@ -677,8 +673,7 @@ func (b *Builder) execPromoteLearner(peer *metapb.Peer) { func (b *Builder) execAddPeer(peer *metapb.Peer) { if b.lightWeight { - b.steps = append(b.steps, AddLearner{ToStore: peer.GetStoreId(), PeerID: peer.GetId(), - IsLightWeight: b.lightWeight, regionSize: b.approximateSize, executorRate: b.operatorExecutorRate}) + b.steps = append(b.steps, AddLearner{ToStore: peer.GetStoreId(), PeerID: peer.GetId(), IsLightWeight: b.lightWeight, regionSize: b.approximateSize}) } else { b.steps = append(b.steps, AddLearner{ToStore: peer.GetStoreId(), PeerID: peer.GetId()}) } diff --git a/server/schedule/operator/operator.go b/server/schedule/operator/operator.go index 4ed70e279da..18d88ed883a 100644 --- a/server/schedule/operator/operator.go +++ b/server/schedule/operator/operator.go @@ -58,7 +58,7 @@ type Operator struct { FinishedCounters []prometheus.Counter AdditionalInfos map[string]string ApproximateSize int64 - ExecutorRate float64 + ExecuteRate float64 } // NewOperator creates a new operator. @@ -79,7 +79,7 @@ func NewOperator(desc, brief string, regionID uint64, regionEpoch *metapb.Region level: level, AdditionalInfos: make(map[string]string), ApproximateSize: approximateSize, - ExecutorRate: OperatorExecutorRate, + ExecuteRate: OperatorExecutorRate, } } @@ -117,7 +117,7 @@ func (o *Operator) SetDesc(desc string) { // SetExecutorRate sets the executor rate of the operator. func (o *Operator) SetExecutorRate(rate float64) { - o.ExecutorRate = rate + o.ExecuteRate = rate } // AttachKind attaches an operator kind for the operator. @@ -244,7 +244,7 @@ func (o *Operator) CheckTimeout() bool { startTime = time.Unix(0, atomic.LoadInt64(&(o.stepsTime[currentStep-1]))) } step := o.steps[currentStep] - return o.status.CheckStepTimeout(startTime, step, o.ExecutorRate) + return o.status.CheckStepTimeout(startTime, step, o.ExecuteRate) } // Len returns the operator's steps count. diff --git a/server/schedule/operator/operator_test.go b/server/schedule/operator/operator_test.go index 2dde4320fb6..c74826bc41c 100644 --- a/server/schedule/operator/operator_test.go +++ b/server/schedule/operator/operator_test.go @@ -124,7 +124,7 @@ func (s *testOperatorSuite) TestOperator(c *C) { // addPeer1, transferLeader1, removePeer2 steps = []OpStep{ - AddPeer{ToStore: 1, PeerID: 1, regionSize: 10, executorRate: 6}, + AddPeer{ToStore: 1, PeerID: 1, regionSize: 10}, TransferLeader{FromStore: 2, ToStore: 1}, RemovePeer{FromStore: 2}, } @@ -427,20 +427,20 @@ func (s *testOperatorSuite) TestOpStepTimeout(c *C) { expect bool }{{ // case1: 10GB region will have 60,000s to executor. - step: []OpStep{AddLearner{executorRate: 6, regionSize: 10 * 1000}, AddPeer{executorRate: 6, regionSize: 10 * 1000}}, + step: []OpStep{AddLearner{regionSize: 10 * 1000}, AddPeer{regionSize: 10 * 1000}}, start: time.Now().Add(-time.Second * (6*10*1000 + 20)), expect: true, }, { - step: []OpStep{AddLearner{executorRate: 6, regionSize: 10 * 1000}, AddPeer{executorRate: 6, regionSize: 10 * 1000}}, + step: []OpStep{AddLearner{regionSize: 10 * 1000}, AddPeer{regionSize: 10 * 1000}}, start: time.Now().Add(-time.Second * (6*10*1000 - 20)), expect: false, }, { // case2: 10MB region will have at least SlowOperatorWaitTime(10min) to executor. - step: []OpStep{AddLearner{executorRate: 6, regionSize: 10}, AddPeer{executorRate: 6, regionSize: 10}}, + step: []OpStep{AddLearner{regionSize: 10}, AddPeer{regionSize: 10}}, start: time.Now().Add(-(SlowOperatorWaitTime + time.Second*20)), expect: true, }, { - step: []OpStep{AddLearner{executorRate: 6, regionSize: 10}, AddPeer{executorRate: 6, regionSize: 10}}, + step: []OpStep{AddLearner{regionSize: 10}, AddPeer{regionSize: 10}}, start: time.Now().Add(-time.Second * (6*10 + 20)), expect: false, }, { diff --git a/server/schedule/operator/status_tracker.go b/server/schedule/operator/status_tracker.go index 6678de60ca5..ebba54e08b8 100644 --- a/server/schedule/operator/status_tracker.go +++ b/server/schedule/operator/status_tracker.go @@ -129,11 +129,11 @@ func (trk *OpStatusTracker) CheckTimeout(wait time.Duration) bool { } // CheckStepTimeout checks if timeout, and update the current status. -func (trk *OpStatusTracker) CheckStepTimeout(start time.Time, step OpStep, executerRate float64) bool { +func (trk *OpStatusTracker) CheckStepTimeout(start time.Time, step OpStep, executeRate float64) bool { trk.rw.Lock() defer trk.rw.Unlock() if trk.current == STARTED { - if !step.Timeout(start, executerRate) { + if !step.Timeout(start, executeRate) { return false } _ = trk.toLocked(TIMEOUT) diff --git a/server/schedule/operator/status_tracker_test.go b/server/schedule/operator/status_tracker_test.go index 06c506d7a9e..da2e7ceab18 100644 --- a/server/schedule/operator/status_tracker_test.go +++ b/server/schedule/operator/status_tracker_test.go @@ -132,7 +132,7 @@ func (s *testOpStatusTrackerSuite) TestCheckStepTimeout(c *C) { for _, v := range testdata { trk := NewOpStatusTracker() trk.To(STARTED) - c.Assert(trk.CheckStepTimeout(v.start, v.step), Equals, v.status == TIMEOUT) + c.Assert(trk.CheckStepTimeout(v.start, v.step, 6), Equals, v.status == TIMEOUT) c.Assert(trk.Status(), Equals, v.status) } } diff --git a/server/schedule/operator/step.go b/server/schedule/operator/step.go index af3307a96cf..ae6791f2465 100644 --- a/server/schedule/operator/step.go +++ b/server/schedule/operator/step.go @@ -112,7 +112,6 @@ type AddPeer struct { ToStore, PeerID uint64 IsLightWeight bool regionSize int64 - executorRate float64 } // ConfVerChanged returns the delta value for version increased by this step. @@ -172,7 +171,6 @@ type AddLearner struct { ToStore, PeerID uint64 IsLightWeight bool regionSize int64 - executorRate float64 } // ConfVerChanged returns the delta value for version increased by this step. From e07edf6f2cff0e98232bfb8309c6e234fab2df61 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Wed, 16 Feb 2022 14:29:26 +0800 Subject: [PATCH 21/25] address comment Signed-off-by: bufferflies <1045931706@qq.com> --- server/schedule/operator/operator_test.go | 75 +++++++++++++---------- server/schedulers/grant_leader.go | 2 + 2 files changed, 43 insertions(+), 34 deletions(-) diff --git a/server/schedule/operator/operator_test.go b/server/schedule/operator/operator_test.go index c74826bc41c..bc6a7540466 100644 --- a/server/schedule/operator/operator_test.go +++ b/server/schedule/operator/operator_test.go @@ -17,6 +17,7 @@ package operator import ( "context" "encoding/json" + "fmt" "sync/atomic" "testing" "time" @@ -430,43 +431,49 @@ func (s *testOperatorSuite) TestOpStepTimeout(c *C) { step: []OpStep{AddLearner{regionSize: 10 * 1000}, AddPeer{regionSize: 10 * 1000}}, start: time.Now().Add(-time.Second * (6*10*1000 + 20)), expect: true, - }, { - step: []OpStep{AddLearner{regionSize: 10 * 1000}, AddPeer{regionSize: 10 * 1000}}, - start: time.Now().Add(-time.Second * (6*10*1000 - 20)), - expect: false, - }, { - // case2: 10MB region will have at least SlowOperatorWaitTime(10min) to executor. - step: []OpStep{AddLearner{regionSize: 10}, AddPeer{regionSize: 10}}, - start: time.Now().Add(-(SlowOperatorWaitTime + time.Second*20)), - expect: true, - }, { - step: []OpStep{AddLearner{regionSize: 10}, AddPeer{regionSize: 10}}, - start: time.Now().Add(-time.Second * (6*10 + 20)), - expect: false, - }, { - // case3: RemovePeer, TransferLeader, SplitRegion, MergeRegion, PromoteLearner will have FastOperatorWaitTime(10s) to executor. - step: []OpStep{RemovePeer{}, TransferLeader{}, SplitRegion{}, MergeRegion{}, PromoteLearner{}}, - start: time.Now().Add(-(FastOperatorWaitTime + time.Second)), - expect: true, - }, { - step: []OpStep{RemovePeer{}, TransferLeader{}, SplitRegion{}, MergeRegion{}, PromoteLearner{}}, - start: time.Now().Add(-(FastOperatorWaitTime - time.Second)), - expect: false, - }, { - // case4: ChangePeerV2Enter, ChangePeerV2Leave has FastOperatorWaitTime times than FastOperatorWaitTime - step: []OpStep{ChangePeerV2Enter{PromoteLearners: []PromoteLearner{{}, {}}}, - ChangePeerV2Leave{PromoteLearners: []PromoteLearner{{}, {}}}}, - start: time.Now().Add(-(FastOperatorWaitTime*(2+1) + time.Second)), - expect: true, - }, { - step: []OpStep{ChangePeerV2Enter{PromoteLearners: []PromoteLearner{{}, {}}}, - ChangePeerV2Leave{PromoteLearners: []PromoteLearner{{}, {}}}}, - start: time.Now().Add(-(FastOperatorWaitTime*(2+1) - time.Second)), - expect: false, }, + { + step: []OpStep{AddLearner{regionSize: 10 * 1000}, AddPeer{regionSize: 10 * 1000}}, + start: time.Now().Add(-time.Second * (6*10*1000 - 20)), + expect: false, + }, { + // case2: 10MB region will have at least SlowOperatorWaitTime(10min) to executor. + step: []OpStep{AddLearner{regionSize: 10}, AddPeer{regionSize: 10}}, + start: time.Now().Add(-(SlowOperatorWaitTime + time.Second*20)), + expect: true, + }, { + step: []OpStep{AddLearner{regionSize: 10}, AddPeer{regionSize: 10}}, + start: time.Now().Add(-time.Second * (6*10 + 20)), + expect: false, + }, { + // case3: RemovePeer, TransferLeader, SplitRegion, MergeRegion, PromoteLearner will have FastOperatorWaitTime(10s) to executor. + step: []OpStep{RemovePeer{}, TransferLeader{}, SplitRegion{}, PromoteLearner{}}, + start: time.Now().Add(-(FastOperatorWaitTime + time.Second)), + expect: true, + }, { + step: []OpStep{RemovePeer{}, TransferLeader{}, SplitRegion{}, PromoteLearner{}}, + start: time.Now().Add(-(FastOperatorWaitTime - time.Second)), + expect: false, + }, { + // case4: ChangePeerV2Enter, ChangePeerV2Leave has FastOperatorWaitTime times than FastOperatorWaitTime + step: []OpStep{ChangePeerV2Enter{PromoteLearners: []PromoteLearner{{}, {}}}, + ChangePeerV2Leave{PromoteLearners: []PromoteLearner{{}, {}}}}, + start: time.Now().Add(-(FastOperatorWaitTime*(2+1) + time.Second)), + expect: true, + }, { + step: []OpStep{MergeRegion{}}, + start: time.Now().Add(-(FastOperatorWaitTime*20 + time.Second)), + expect: true, + }, { + step: []OpStep{MergeRegion{}}, + start: time.Now().Add(-(FastOperatorWaitTime*20 - time.Second)), + expect: false, + }, } - for _, v := range testdata { + for i, v := range testdata { + fmt.Printf("case:%d ,data:%v\n", i, v) for _, step := range v.step { + fmt.Printf("step:%v\n", step) c.Assert(v.expect, Equals, step.Timeout(v.start, 6.0)) } } diff --git a/server/schedulers/grant_leader.go b/server/schedulers/grant_leader.go index 5bb90b80a71..e040d67ec3e 100644 --- a/server/schedulers/grant_leader.go +++ b/server/schedulers/grant_leader.go @@ -200,6 +200,8 @@ func (s *grantLeaderScheduler) EncodeConfig() ([]byte, error) { // Prepare run it after adding new scheduler. func (s *grantLeaderScheduler) Prepare(cluster schedule.Cluster) error { + s.conf.mu.RLock() + defer s.conf.mu.RUnlock() var res error for id := range s.conf.StoreIDWithRanges { if err := cluster.PauseLeaderTransfer(id); err != nil { From 1866b51037383bb4db8ddd905f2786ad9969b1f9 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Wed, 16 Feb 2022 14:50:01 +0800 Subject: [PATCH 22/25] remove operator executor rate Signed-off-by: bufferflies <1045931706@qq.com> --- server/schedule/operator/operator.go | 14 ++------ server/schedule/operator/operator_test.go | 7 ++-- server/schedule/operator/status_tracker.go | 18 ++-------- .../schedule/operator/status_tracker_test.go | 34 +----------------- server/schedule/operator/step.go | 35 ++++++++++--------- server/schedule/operator_controller.go | 7 +--- server/schedulers/grant_leader.go | 1 - 7 files changed, 27 insertions(+), 89 deletions(-) diff --git a/server/schedule/operator/operator.go b/server/schedule/operator/operator.go index 18d88ed883a..43945574fac 100644 --- a/server/schedule/operator/operator.go +++ b/server/schedule/operator/operator.go @@ -37,9 +37,6 @@ const ( // SlowOperatorWaitTime is the duration that when an operator marked `OpRegion` // runs longer than it, the operator will be considered timeout. SlowOperatorWaitTime = 10 * time.Minute - // OperatorExecutorRate is the rate of the operator executor. - // default: 6 s/Mb - OperatorExecutorRate = 6 ) // Operator contains execution steps generated by scheduler. @@ -58,7 +55,6 @@ type Operator struct { FinishedCounters []prometheus.Counter AdditionalInfos map[string]string ApproximateSize int64 - ExecuteRate float64 } // NewOperator creates a new operator. @@ -79,7 +75,6 @@ func NewOperator(desc, brief string, regionID uint64, regionEpoch *metapb.Region level: level, AdditionalInfos: make(map[string]string), ApproximateSize: approximateSize, - ExecuteRate: OperatorExecutorRate, } } @@ -115,11 +110,6 @@ func (o *Operator) SetDesc(desc string) { o.desc = desc } -// SetExecutorRate sets the executor rate of the operator. -func (o *Operator) SetExecutorRate(rate float64) { - o.ExecuteRate = rate -} - // AttachKind attaches an operator kind for the operator. func (o *Operator) AttachKind(kind OpKind) { o.kind |= kind @@ -240,11 +230,11 @@ func (o *Operator) CheckTimeout() bool { } currentStep := int(atomic.LoadInt32(&o.currentStep)) startTime := o.GetStartTime() - if currentStep > 0 { + if 0 < currentStep && currentStep-1 < len(o.steps) { startTime = time.Unix(0, atomic.LoadInt64(&(o.stepsTime[currentStep-1]))) } step := o.steps[currentStep] - return o.status.CheckStepTimeout(startTime, step, o.ExecuteRate) + return o.status.CheckStepTimeout(startTime, step) } // Len returns the operator's steps count. diff --git a/server/schedule/operator/operator_test.go b/server/schedule/operator/operator_test.go index bc6a7540466..c5d5265a327 100644 --- a/server/schedule/operator/operator_test.go +++ b/server/schedule/operator/operator_test.go @@ -17,7 +17,6 @@ package operator import ( "context" "encoding/json" - "fmt" "sync/atomic" "testing" "time" @@ -470,11 +469,9 @@ func (s *testOperatorSuite) TestOpStepTimeout(c *C) { expect: false, }, } - for i, v := range testdata { - fmt.Printf("case:%d ,data:%v\n", i, v) + for _, v := range testdata { for _, step := range v.step { - fmt.Printf("step:%v\n", step) - c.Assert(v.expect, Equals, step.Timeout(v.start, 6.0)) + c.Assert(v.expect, Equals, step.Timeout(v.start)) } } } diff --git a/server/schedule/operator/status_tracker.go b/server/schedule/operator/status_tracker.go index ebba54e08b8..503e33f7b8a 100644 --- a/server/schedule/operator/status_tracker.go +++ b/server/schedule/operator/status_tracker.go @@ -114,26 +114,12 @@ func (trk *OpStatusTracker) CheckExpired(exp time.Duration) bool { return trk.current == EXPIRED } -// CheckTimeout checks if timeout, and update the current status. -func (trk *OpStatusTracker) CheckTimeout(wait time.Duration) bool { - trk.rw.Lock() - defer trk.rw.Unlock() - if trk.current == STARTED { - if time.Since(trk.reachTimes[STARTED]) < wait { - return false - } - _ = trk.toLocked(TIMEOUT) - return true - } - return trk.current == TIMEOUT -} - // CheckStepTimeout checks if timeout, and update the current status. -func (trk *OpStatusTracker) CheckStepTimeout(start time.Time, step OpStep, executeRate float64) bool { +func (trk *OpStatusTracker) CheckStepTimeout(start time.Time, step OpStep) bool { trk.rw.Lock() defer trk.rw.Unlock() if trk.current == STARTED { - if !step.Timeout(start, executeRate) { + if !step.Timeout(start) { return false } _ = trk.toLocked(TIMEOUT) diff --git a/server/schedule/operator/status_tracker_test.go b/server/schedule/operator/status_tracker_test.go index da2e7ceab18..d82a8c1a5a8 100644 --- a/server/schedule/operator/status_tracker_test.go +++ b/server/schedule/operator/status_tracker_test.go @@ -132,43 +132,11 @@ func (s *testOpStatusTrackerSuite) TestCheckStepTimeout(c *C) { for _, v := range testdata { trk := NewOpStatusTracker() trk.To(STARTED) - c.Assert(trk.CheckStepTimeout(v.start, v.step, 6), Equals, v.status == TIMEOUT) + c.Assert(trk.CheckStepTimeout(v.start, v.step), Equals, v.status == TIMEOUT) c.Assert(trk.Status(), Equals, v.status) } } -func (s *testOpStatusTrackerSuite) TestCheckTimeout(c *C) { - { - // Not timeout - trk := NewOpStatusTracker() - before := time.Now() - c.Assert(trk.To(STARTED), IsTrue) - after := time.Now() - c.Assert(trk.CheckTimeout(10*time.Second), IsFalse) - c.Assert(trk.Status(), Equals, STARTED) - checkTimeOrder(c, before, trk.ReachTime(), after) - } - { - // Timeout but status not changed - trk := NewOpStatusTracker() - c.Assert(trk.To(STARTED), IsTrue) - trk.setTime(STARTED, time.Now().Add(-10*time.Second)) - c.Assert(trk.CheckTimeout(5*time.Second), IsTrue) - c.Assert(trk.Status(), Equals, TIMEOUT) - } - { - // Timeout and status changed - trk := NewOpStatusTracker() - c.Assert(trk.To(STARTED), IsTrue) - before := time.Now() - c.Assert(trk.To(TIMEOUT), IsTrue) - after := time.Now() - c.Assert(trk.CheckTimeout(0), IsTrue) - c.Assert(trk.Status(), Equals, TIMEOUT) - checkTimeOrder(c, before, trk.ReachTime(), after) - } -} - func checkTimeOrder(c *C, t1, t2, t3 time.Time) { c.Assert(t1.Before(t2), IsTrue) c.Assert(t3.After(t2), IsTrue) diff --git a/server/schedule/operator/step.go b/server/schedule/operator/step.go index ae6791f2465..14d15a7ece9 100644 --- a/server/schedule/operator/step.go +++ b/server/schedule/operator/step.go @@ -17,7 +17,6 @@ package operator import ( "bytes" "fmt" - "math" "strings" "time" @@ -32,6 +31,10 @@ import ( "go.uber.org/zap" ) +// DefaultExecutorRate is the rate of the operator executor. +// default: 6 s/Mb +const DefaultExecutorRate = 6 + // OpStep describes the basic scheduling steps that can not be subdivided. type OpStep interface { fmt.Stringer @@ -39,7 +42,7 @@ type OpStep interface { IsFinish(region *core.RegionInfo) bool CheckInProgress(ci ClusterInformer, region *core.RegionInfo) error Influence(opInfluence OpInfluence, region *core.RegionInfo) - Timeout(start time.Time, executorRate float64) bool + Timeout(start time.Time) bool } // TransferLeader is an OpStep that transfers a region's leader. @@ -103,7 +106,7 @@ func (tl TransferLeader) Influence(opInfluence OpInfluence, region *core.RegionI } // Timeout returns true if the step is timeout. -func (tl TransferLeader) Timeout(start time.Time, _ float64) bool { +func (tl TransferLeader) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -162,8 +165,8 @@ func (ap AddPeer) CheckInProgress(ci ClusterInformer, region *core.RegionInfo) e } // Timeout returns true if the step is timeout. -func (ap AddPeer) Timeout(start time.Time, executorRate float64) bool { - return time.Since(start) > maxWaitTime(executorRate, ap.regionSize) +func (ap AddPeer) Timeout(start time.Time) bool { + return time.Since(start) > maxWaitTime(ap.regionSize) } // AddLearner is an OpStep that adds a region learner peer. @@ -227,8 +230,8 @@ func (al AddLearner) Influence(opInfluence OpInfluence, region *core.RegionInfo) } // Timeout returns true if the step is timeout. -func (al AddLearner) Timeout(start time.Time, executorRate float64) bool { - return time.Since(start) > maxWaitTime(executorRate, al.regionSize) +func (al AddLearner) Timeout(start time.Time) bool { + return time.Since(start) > maxWaitTime(al.regionSize) } // PromoteLearner is an OpStep that promotes a region learner peer to normal voter. @@ -270,7 +273,7 @@ func (pl PromoteLearner) CheckInProgress(_ ClusterInformer, region *core.RegionI func (pl PromoteLearner) Influence(_ OpInfluence, _ *core.RegionInfo) {} // Timeout returns true if the step is timeout. -func (pl PromoteLearner) Timeout(start time.Time, _ float64) bool { +func (pl PromoteLearner) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -326,7 +329,7 @@ func (rp RemovePeer) Influence(opInfluence OpInfluence, region *core.RegionInfo) } // Timeout returns true if the step is timeout. -func (rp RemovePeer) Timeout(start time.Time, _ float64) bool { +func (rp RemovePeer) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -379,7 +382,7 @@ func (mr MergeRegion) Influence(opInfluence OpInfluence, region *core.RegionInfo } // Timeout returns true if the step is timeout. -func (mr MergeRegion) Timeout(start time.Time, _ float64) bool { +func (mr MergeRegion) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime*20 } @@ -421,7 +424,7 @@ func (sr SplitRegion) CheckInProgress(_ ClusterInformer, _ *core.RegionInfo) err } // Timeout returns true if the step is timeout. -func (sr SplitRegion) Timeout(start time.Time, _ float64) bool { +func (sr SplitRegion) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -453,7 +456,7 @@ func (dv DemoteVoter) IsFinish(region *core.RegionInfo) bool { } // Timeout returns true if the step is timeout. -func (dv DemoteVoter) Timeout(start time.Time, _ float64) bool { +func (dv DemoteVoter) Timeout(start time.Time) bool { return time.Since(start) > FastOperatorWaitTime } @@ -599,7 +602,7 @@ func (cpe ChangePeerV2Enter) GetRequest() *pdpb.ChangePeerV2 { } // Timeout returns true if the step is timeout. -func (cpe ChangePeerV2Enter) Timeout(start time.Time, _ float64) bool { +func (cpe ChangePeerV2Enter) Timeout(start time.Time) bool { count := uint64(len(cpe.PromoteLearners)+len(cpe.DemoteVoters)) + 1 return time.Since(start) > FastOperatorWaitTime*time.Duration(count) } @@ -724,7 +727,7 @@ func (cpl ChangePeerV2Leave) CheckInProgress(_ ClusterInformer, region *core.Reg func (cpl ChangePeerV2Leave) Influence(_ OpInfluence, _ *core.RegionInfo) {} // Timeout returns true if the step is timeout. -func (cpl ChangePeerV2Leave) Timeout(start time.Time, _ float64) bool { +func (cpl ChangePeerV2Leave) Timeout(start time.Time) bool { count := uint64(len(cpl.PromoteLearners)+len(cpl.DemoteVoters)) + 1 return time.Since(start) > FastOperatorWaitTime*time.Duration(count) } @@ -740,8 +743,8 @@ func validateStore(ci ClusterInformer, id uint64) error { return nil } -func maxWaitTime(executorRate float64, regionSize int64) time.Duration { - seconds := int64(math.Ceil(executorRate)) * regionSize +func maxWaitTime(regionSize int64) time.Duration { + seconds := DefaultExecutorRate * regionSize wait := time.Duration(seconds) * time.Second if wait < SlowOperatorWaitTime { wait = SlowOperatorWaitTime diff --git a/server/schedule/operator_controller.go b/server/schedule/operator_controller.go index d66afbdae98..968ed80fa07 100644 --- a/server/schedule/operator_controller.go +++ b/server/schedule/operator_controller.go @@ -54,9 +54,6 @@ var ( StoreBalanceBaseTime float64 = 60 // FastOperatorFinishTime min finish time, if finish duration less than it,op will be pushed to fast operator queue FastOperatorFinishTime = 10 * time.Second - // DefaultOperatorExecutorRate executor rate of the operator. - // unit: s/MB - DefaultOperatorExecutorRate = 6.0 ) // OperatorController is used to limit the speed of scheduling. @@ -73,7 +70,6 @@ type OperatorController struct { wop WaitingOperator wopStatus *WaitingOperatorStatus opNotifierQueue operatorQueue - executerRate float64 } // NewOperatorController creates a OperatorController. @@ -90,7 +86,6 @@ func NewOperatorController(ctx context.Context, cluster Cluster, hbStreams *hbst wop: NewRandBuckets(), wopStatus: NewWaitingOperatorStatus(), opNotifierQueue: make(operatorQueue, 0), - executerRate: DefaultOperatorExecutorRate, } } @@ -437,7 +432,7 @@ func isHigherPriorityOperator(new, old *operator.Operator) bool { func (oc *OperatorController) addOperatorLocked(op *operator.Operator) bool { regionID := op.RegionID() - op.SetExecutorRate(oc.executerRate) + log.Info("add operator", zap.Uint64("region-id", regionID), zap.Reflect("operator", op), diff --git a/server/schedulers/grant_leader.go b/server/schedulers/grant_leader.go index e040d67ec3e..245d0fa6c10 100644 --- a/server/schedulers/grant_leader.go +++ b/server/schedulers/grant_leader.go @@ -198,7 +198,6 @@ func (s *grantLeaderScheduler) EncodeConfig() ([]byte, error) { return schedule.EncodeConfig(s.conf) } -// Prepare run it after adding new scheduler. func (s *grantLeaderScheduler) Prepare(cluster schedule.Cluster) error { s.conf.mu.RLock() defer s.conf.mu.RUnlock() From 38cdf2e4c2585fdc9933220ccf7f053332a6c392 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Fri, 18 Feb 2022 11:02:35 +0800 Subject: [PATCH 23/25] change timeout signature Signed-off-by: bufferflies <1045931706@qq.com> --- server/schedule/operator/builder.go | 2 +- server/schedule/operator/operator.go | 2 +- server/schedule/operator/operator_test.go | 50 ++++++++------ server/schedule/operator/status_tracker.go | 4 +- .../schedule/operator/status_tracker_test.go | 2 +- server/schedule/operator/step.go | 68 +++++++++++-------- 6 files changed, 73 insertions(+), 55 deletions(-) diff --git a/server/schedule/operator/builder.go b/server/schedule/operator/builder.go index 4517d599c2a..98944415d36 100644 --- a/server/schedule/operator/builder.go +++ b/server/schedule/operator/builder.go @@ -673,7 +673,7 @@ func (b *Builder) execPromoteLearner(peer *metapb.Peer) { func (b *Builder) execAddPeer(peer *metapb.Peer) { if b.lightWeight { - b.steps = append(b.steps, AddLearner{ToStore: peer.GetStoreId(), PeerID: peer.GetId(), IsLightWeight: b.lightWeight, regionSize: b.approximateSize}) + b.steps = append(b.steps, AddLearner{ToStore: peer.GetStoreId(), PeerID: peer.GetId(), IsLightWeight: b.lightWeight}) } else { b.steps = append(b.steps, AddLearner{ToStore: peer.GetStoreId(), PeerID: peer.GetId()}) } diff --git a/server/schedule/operator/operator.go b/server/schedule/operator/operator.go index 4df277652cd..792778a201a 100644 --- a/server/schedule/operator/operator.go +++ b/server/schedule/operator/operator.go @@ -234,7 +234,7 @@ func (o *Operator) CheckTimeout() bool { startTime = time.Unix(0, atomic.LoadInt64(&(o.stepsTime[currentStep-1]))) } step := o.steps[currentStep] - return o.status.CheckStepTimeout(startTime, step) + return o.status.CheckStepTimeout(startTime, step, o.ApproximateSize) } // Len returns the operator's steps count. diff --git a/server/schedule/operator/operator_test.go b/server/schedule/operator/operator_test.go index 6350f27f5f3..deb12aae510 100644 --- a/server/schedule/operator/operator_test.go +++ b/server/schedule/operator/operator_test.go @@ -124,7 +124,7 @@ func (s *testOperatorSuite) TestOperator(c *C) { // addPeer1, transferLeader1, removePeer2 steps = []OpStep{ - AddPeer{ToStore: 1, PeerID: 1, regionSize: 10}, + AddPeer{ToStore: 1, PeerID: 1}, TransferLeader{FromStore: 2, ToStore: 1}, RemovePeer{FromStore: 2}, } @@ -422,28 +422,34 @@ func (s *testOperatorSuite) TestSchedulerKind(c *C) { func (s *testOperatorSuite) TestOpStepTimeout(c *C) { testdata := []struct { - step []OpStep - start time.Time - expect bool - }{{ - // case1: 10GB region will have 60,000s to executor. - step: []OpStep{AddLearner{regionSize: 10 * 1000}, AddPeer{regionSize: 10 * 1000}}, - start: time.Now().Add(-time.Second * (6*10*1000 + 20)), - expect: true, - }, + step []OpStep + regionSize int64 + start time.Time + expect bool + }{ { - step: []OpStep{AddLearner{regionSize: 10 * 1000}, AddPeer{regionSize: 10 * 1000}}, - start: time.Now().Add(-time.Second * (6*10*1000 - 20)), - expect: false, + // case1: 10GB region will have 60,000s to executor. + step: []OpStep{AddLearner{}, AddPeer{}}, + regionSize: 10 * 1000, + start: time.Now().Add(-time.Second * (6*10*1000 + 20)), + expect: true, + }, + { + step: []OpStep{AddLearner{}, AddPeer{}}, + regionSize: 10 * 1000, + start: time.Now().Add(-time.Second * (6*10*1000 - 20)), + expect: false, }, { // case2: 10MB region will have at least SlowOperatorWaitTime(10min) to executor. - step: []OpStep{AddLearner{regionSize: 10}, AddPeer{regionSize: 10}}, - start: time.Now().Add(-(SlowOperatorWaitTime + time.Second*20)), - expect: true, + step: []OpStep{AddLearner{}, AddPeer{}}, + regionSize: 10, + start: time.Now().Add(-(SlowOperatorWaitTime + time.Second*20)), + expect: true, }, { - step: []OpStep{AddLearner{regionSize: 10}, AddPeer{regionSize: 10}}, - start: time.Now().Add(-time.Second * (6*10 + 20)), - expect: false, + step: []OpStep{AddLearner{}, AddPeer{}}, + regionSize: 10, + start: time.Now().Add(-time.Second * (6*10 + 20)), + expect: false, }, { // case3: RemovePeer, TransferLeader, SplitRegion, MergeRegion, PromoteLearner will have FastOperatorWaitTime(10s) to executor. step: []OpStep{RemovePeer{}, TransferLeader{}, SplitRegion{}, PromoteLearner{}}, @@ -461,17 +467,17 @@ func (s *testOperatorSuite) TestOpStepTimeout(c *C) { expect: true, }, { step: []OpStep{MergeRegion{}}, - start: time.Now().Add(-(FastOperatorWaitTime*20 + time.Second)), + start: time.Now().Add(-(FastOperatorWaitTime*10 + time.Second)), expect: true, }, { step: []OpStep{MergeRegion{}}, - start: time.Now().Add(-(FastOperatorWaitTime*20 - time.Second)), + start: time.Now().Add(-(FastOperatorWaitTime*10 - time.Second)), expect: false, }, } for _, v := range testdata { for _, step := range v.step { - c.Assert(v.expect, Equals, step.Timeout(v.start)) + c.Assert(v.expect, Equals, step.Timeout(v.start, v.regionSize)) } } } diff --git a/server/schedule/operator/status_tracker.go b/server/schedule/operator/status_tracker.go index 503e33f7b8a..163ff21b606 100644 --- a/server/schedule/operator/status_tracker.go +++ b/server/schedule/operator/status_tracker.go @@ -115,11 +115,11 @@ func (trk *OpStatusTracker) CheckExpired(exp time.Duration) bool { } // CheckStepTimeout checks if timeout, and update the current status. -func (trk *OpStatusTracker) CheckStepTimeout(start time.Time, step OpStep) bool { +func (trk *OpStatusTracker) CheckStepTimeout(start time.Time, step OpStep, approximateSize int64) bool { trk.rw.Lock() defer trk.rw.Unlock() if trk.current == STARTED { - if !step.Timeout(start) { + if !step.Timeout(start, approximateSize) { return false } _ = trk.toLocked(TIMEOUT) diff --git a/server/schedule/operator/status_tracker_test.go b/server/schedule/operator/status_tracker_test.go index d82a8c1a5a8..aed287d833d 100644 --- a/server/schedule/operator/status_tracker_test.go +++ b/server/schedule/operator/status_tracker_test.go @@ -132,7 +132,7 @@ func (s *testOpStatusTrackerSuite) TestCheckStepTimeout(c *C) { for _, v := range testdata { trk := NewOpStatusTracker() trk.To(STARTED) - c.Assert(trk.CheckStepTimeout(v.start, v.step), Equals, v.status == TIMEOUT) + c.Assert(trk.CheckStepTimeout(v.start, v.step, 0), Equals, v.status == TIMEOUT) c.Assert(trk.Status(), Equals, v.status) } } diff --git a/server/schedule/operator/step.go b/server/schedule/operator/step.go index 14d15a7ece9..3b9e194e172 100644 --- a/server/schedule/operator/step.go +++ b/server/schedule/operator/step.go @@ -31,9 +31,14 @@ import ( "go.uber.org/zap" ) -// DefaultExecutorRate is the rate of the operator executor. -// default: 6 s/Mb -const DefaultExecutorRate = 6 +const ( + // DefaultSlowExecutorRate is the fast rate of the operator executor. + // default: 6 s/Mb + DefaultSlowExecutorRate = 6 + // DefaultFastExecutorRate is the slow rate of the operator executor. + // default: 0.01 s/Mb + DefaultFastExecutorRate = 0.1 +) // OpStep describes the basic scheduling steps that can not be subdivided. type OpStep interface { @@ -42,7 +47,7 @@ type OpStep interface { IsFinish(region *core.RegionInfo) bool CheckInProgress(ci ClusterInformer, region *core.RegionInfo) error Influence(opInfluence OpInfluence, region *core.RegionInfo) - Timeout(start time.Time) bool + Timeout(start time.Time, regionSize int64) bool } // TransferLeader is an OpStep that transfers a region's leader. @@ -106,15 +111,14 @@ func (tl TransferLeader) Influence(opInfluence OpInfluence, region *core.RegionI } // Timeout returns true if the step is timeout. -func (tl TransferLeader) Timeout(start time.Time) bool { - return time.Since(start) > FastOperatorWaitTime +func (tl TransferLeader) Timeout(start time.Time, regionSize int64) bool { + return time.Since(start) > maxFastWaitTime(regionSize) } // AddPeer is an OpStep that adds a region peer. type AddPeer struct { ToStore, PeerID uint64 IsLightWeight bool - regionSize int64 } // ConfVerChanged returns the delta value for version increased by this step. @@ -165,15 +169,14 @@ func (ap AddPeer) CheckInProgress(ci ClusterInformer, region *core.RegionInfo) e } // Timeout returns true if the step is timeout. -func (ap AddPeer) Timeout(start time.Time) bool { - return time.Since(start) > maxWaitTime(ap.regionSize) +func (ap AddPeer) Timeout(start time.Time, regionSize int64) bool { + return time.Since(start) > maxSlowWaitTime(regionSize) } // AddLearner is an OpStep that adds a region learner peer. type AddLearner struct { ToStore, PeerID uint64 IsLightWeight bool - regionSize int64 } // ConfVerChanged returns the delta value for version increased by this step. @@ -230,8 +233,8 @@ func (al AddLearner) Influence(opInfluence OpInfluence, region *core.RegionInfo) } // Timeout returns true if the step is timeout. -func (al AddLearner) Timeout(start time.Time) bool { - return time.Since(start) > maxWaitTime(al.regionSize) +func (al AddLearner) Timeout(start time.Time, regionSize int64) bool { + return time.Since(start) > maxSlowWaitTime(regionSize) } // PromoteLearner is an OpStep that promotes a region learner peer to normal voter. @@ -273,8 +276,8 @@ func (pl PromoteLearner) CheckInProgress(_ ClusterInformer, region *core.RegionI func (pl PromoteLearner) Influence(_ OpInfluence, _ *core.RegionInfo) {} // Timeout returns true if the step is timeout. -func (pl PromoteLearner) Timeout(start time.Time) bool { - return time.Since(start) > FastOperatorWaitTime +func (pl PromoteLearner) Timeout(start time.Time, regionSize int64) bool { + return time.Since(start) > maxFastWaitTime(regionSize) } // RemovePeer is an OpStep that removes a region peer. @@ -329,8 +332,8 @@ func (rp RemovePeer) Influence(opInfluence OpInfluence, region *core.RegionInfo) } // Timeout returns true if the step is timeout. -func (rp RemovePeer) Timeout(start time.Time) bool { - return time.Since(start) > FastOperatorWaitTime +func (rp RemovePeer) Timeout(start time.Time, regionSize int64) bool { + return time.Since(start) > maxFastWaitTime(regionSize) } // MergeRegion is an OpStep that merge two regions. @@ -382,8 +385,8 @@ func (mr MergeRegion) Influence(opInfluence OpInfluence, region *core.RegionInfo } // Timeout returns true if the step is timeout. -func (mr MergeRegion) Timeout(start time.Time) bool { - return time.Since(start) > FastOperatorWaitTime*20 +func (mr MergeRegion) Timeout(start time.Time, regionSize int64) bool { + return time.Since(start) > maxFastWaitTime(regionSize)*10 } // SplitRegion is an OpStep that splits a region. @@ -424,8 +427,8 @@ func (sr SplitRegion) CheckInProgress(_ ClusterInformer, _ *core.RegionInfo) err } // Timeout returns true if the step is timeout. -func (sr SplitRegion) Timeout(start time.Time) bool { - return time.Since(start) > FastOperatorWaitTime +func (sr SplitRegion) Timeout(start time.Time, regionSize int64) bool { + return time.Since(start) > maxFastWaitTime(regionSize) } // DemoteVoter is very similar to DemoteFollower. But it allows Demote Leader. @@ -456,8 +459,8 @@ func (dv DemoteVoter) IsFinish(region *core.RegionInfo) bool { } // Timeout returns true if the step is timeout. -func (dv DemoteVoter) Timeout(start time.Time) bool { - return time.Since(start) > FastOperatorWaitTime +func (dv DemoteVoter) Timeout(start time.Time, regionSize int64) bool { + return time.Since(start) > maxFastWaitTime(regionSize) } // ChangePeerV2Enter is an OpStep that uses joint consensus to request all PromoteLearner and DemoteVoter. @@ -602,9 +605,9 @@ func (cpe ChangePeerV2Enter) GetRequest() *pdpb.ChangePeerV2 { } // Timeout returns true if the step is timeout. -func (cpe ChangePeerV2Enter) Timeout(start time.Time) bool { +func (cpe ChangePeerV2Enter) Timeout(start time.Time, regionSize int64) bool { count := uint64(len(cpe.PromoteLearners)+len(cpe.DemoteVoters)) + 1 - return time.Since(start) > FastOperatorWaitTime*time.Duration(count) + return time.Since(start) > maxFastWaitTime(regionSize)*time.Duration(count) } // ChangePeerV2Leave is an OpStep that leaves the joint state. @@ -727,9 +730,9 @@ func (cpl ChangePeerV2Leave) CheckInProgress(_ ClusterInformer, region *core.Reg func (cpl ChangePeerV2Leave) Influence(_ OpInfluence, _ *core.RegionInfo) {} // Timeout returns true if the step is timeout. -func (cpl ChangePeerV2Leave) Timeout(start time.Time) bool { +func (cpl ChangePeerV2Leave) Timeout(start time.Time, regionSize int64) bool { count := uint64(len(cpl.PromoteLearners)+len(cpl.DemoteVoters)) + 1 - return time.Since(start) > FastOperatorWaitTime*time.Duration(count) + return time.Since(start) > maxFastWaitTime(regionSize)*time.Duration(count) } func validateStore(ci ClusterInformer, id uint64) error { @@ -743,11 +746,20 @@ func validateStore(ci ClusterInformer, id uint64) error { return nil } -func maxWaitTime(regionSize int64) time.Duration { - seconds := DefaultExecutorRate * regionSize +func maxSlowWaitTime(regionSize int64) time.Duration { + seconds := DefaultSlowExecutorRate * regionSize wait := time.Duration(seconds) * time.Second if wait < SlowOperatorWaitTime { wait = SlowOperatorWaitTime } return wait } + +func maxFastWaitTime(regionSize int64) time.Duration { + seconds := int64(DefaultFastExecutorRate * float64(regionSize)) + wait := time.Duration(seconds) * time.Second + if wait < FastOperatorWaitTime { + wait = FastOperatorWaitTime + } + return wait +} From ceee2b7e4a3f982c70e98a157b9b086851c92a58 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Fri, 18 Feb 2022 19:00:05 +0800 Subject: [PATCH 24/25] address comment Signed-off-by: bufferflies <1045931706@qq.com> --- server/schedule/operator/operator.go | 35 ++++++++++--------- .../schedule/operator/status_tracker_test.go | 1 + server/schedule/operator/step.go | 26 +++++++------- 3 files changed, 33 insertions(+), 29 deletions(-) diff --git a/server/schedule/operator/operator.go b/server/schedule/operator/operator.go index 792778a201a..62638a2271a 100644 --- a/server/schedule/operator/operator.go +++ b/server/schedule/operator/operator.go @@ -225,16 +225,12 @@ func (o *Operator) CheckExpired() bool { // CheckTimeout checks if the operator is timeout, and update the status. func (o *Operator) CheckTimeout() bool { - if o.CheckSuccess() || len(o.steps) == 0 { + if o.CheckSuccess() { return false } - currentStep := int(atomic.LoadInt32(&o.currentStep)) - startTime := o.GetStartTime() - if 0 < currentStep && currentStep-1 < len(o.steps) { - startTime = time.Unix(0, atomic.LoadInt64(&(o.stepsTime[currentStep-1]))) - } - step := o.steps[currentStep] - return o.status.CheckStepTimeout(startTime, step, o.ApproximateSize) + currentStep := atomic.LoadInt32(&o.currentStep) + startTime := o.getStepStartTime(currentStep) + return o.status.CheckStepTimeout(startTime, o.steps[currentStep], o.ApproximateSize) } // Len returns the operator's steps count. @@ -250,6 +246,15 @@ func (o *Operator) Step(i int) OpStep { return nil } +// getStepStartTime returns the start time of the i-th step. +func (o *Operator) getStepStartTime(step int32) time.Time { + startTime := o.GetStartTime() + if 0 < step && int(step-1) < len(o.steps) { + startTime = time.Unix(0, atomic.LoadInt64(&(o.stepsTime[step-1]))) + } + return startTime +} + // Check checks if current step is finished, returns next step to take action. // If operator is at an end status, check returns nil. // It's safe to be called by multiple goroutine concurrently. @@ -262,14 +267,8 @@ func (o *Operator) Check(region *core.RegionInfo) OpStep { for step := atomic.LoadInt32(&o.currentStep); int(step) < len(o.steps); step++ { if o.steps[int(step)].IsFinish(region) { if atomic.CompareAndSwapInt64(&(o.stepsTime[step]), 0, time.Now().UnixNano()) { - var startTime time.Time - if step == 0 { - startTime = o.GetStartTime() - } else { - startTime = time.Unix(0, atomic.LoadInt64(&(o.stepsTime[step-1]))) - } operatorStepDuration.WithLabelValues(reflect.TypeOf(o.steps[int(step)]).Name()). - Observe(time.Unix(0, o.stepsTime[step]).Sub(startTime).Seconds()) + Observe(time.Unix(0, o.stepsTime[step]).Sub(o.getStepStartTime(step)).Seconds()) } atomic.StoreInt32(&o.currentStep, step+1) } else { @@ -405,12 +404,16 @@ func (o *Operator) GetAdditionalInfo() string { // these values are used for unit test. const ( // mock region default region size is 96MB. - mockRegionSize = 96 * (1 << 20) + mockRegionSize = 96 mockDesc = "test" mockBrief = "test" ) // NewTestOperator creates a test operator, only used for unit test. func NewTestOperator(regionID uint64, regionEpoch *metapb.RegionEpoch, kind OpKind, steps ...OpStep) *Operator { + // OpSteps can not be empty for test. + if len(steps) == 0 { + steps = []OpStep{ChangePeerV2Leave{}} + } return NewOperator(mockDesc, mockBrief, regionID, regionEpoch, kind, mockRegionSize, steps...) } diff --git a/server/schedule/operator/status_tracker_test.go b/server/schedule/operator/status_tracker_test.go index aed287d833d..8ada8b386f2 100644 --- a/server/schedule/operator/status_tracker_test.go +++ b/server/schedule/operator/status_tracker_test.go @@ -130,6 +130,7 @@ func (s *testOpStatusTrackerSuite) TestCheckStepTimeout(c *C) { }} for _, v := range testdata { + // Timeout and status changed trk := NewOpStatusTracker() trk.To(STARTED) c.Assert(trk.CheckStepTimeout(v.start, v.step, 0), Equals, v.status == TIMEOUT) diff --git a/server/schedule/operator/step.go b/server/schedule/operator/step.go index 3b9e194e172..f8d075577f5 100644 --- a/server/schedule/operator/step.go +++ b/server/schedule/operator/step.go @@ -36,7 +36,7 @@ const ( // default: 6 s/Mb DefaultSlowExecutorRate = 6 // DefaultFastExecutorRate is the slow rate of the operator executor. - // default: 0.01 s/Mb + // default: 0.1 s/Mb DefaultFastExecutorRate = 0.1 ) @@ -112,7 +112,7 @@ func (tl TransferLeader) Influence(opInfluence OpInfluence, region *core.RegionI // Timeout returns true if the step is timeout. func (tl TransferLeader) Timeout(start time.Time, regionSize int64) bool { - return time.Since(start) > maxFastWaitTime(regionSize) + return time.Since(start) > fastStepWaitDuration(regionSize) } // AddPeer is an OpStep that adds a region peer. @@ -170,7 +170,7 @@ func (ap AddPeer) CheckInProgress(ci ClusterInformer, region *core.RegionInfo) e // Timeout returns true if the step is timeout. func (ap AddPeer) Timeout(start time.Time, regionSize int64) bool { - return time.Since(start) > maxSlowWaitTime(regionSize) + return time.Since(start) > slowStepWaitDuration(regionSize) } // AddLearner is an OpStep that adds a region learner peer. @@ -234,7 +234,7 @@ func (al AddLearner) Influence(opInfluence OpInfluence, region *core.RegionInfo) // Timeout returns true if the step is timeout. func (al AddLearner) Timeout(start time.Time, regionSize int64) bool { - return time.Since(start) > maxSlowWaitTime(regionSize) + return time.Since(start) > slowStepWaitDuration(regionSize) } // PromoteLearner is an OpStep that promotes a region learner peer to normal voter. @@ -277,7 +277,7 @@ func (pl PromoteLearner) Influence(_ OpInfluence, _ *core.RegionInfo) {} // Timeout returns true if the step is timeout. func (pl PromoteLearner) Timeout(start time.Time, regionSize int64) bool { - return time.Since(start) > maxFastWaitTime(regionSize) + return time.Since(start) > fastStepWaitDuration(regionSize) } // RemovePeer is an OpStep that removes a region peer. @@ -333,7 +333,7 @@ func (rp RemovePeer) Influence(opInfluence OpInfluence, region *core.RegionInfo) // Timeout returns true if the step is timeout. func (rp RemovePeer) Timeout(start time.Time, regionSize int64) bool { - return time.Since(start) > maxFastWaitTime(regionSize) + return time.Since(start) > fastStepWaitDuration(regionSize) } // MergeRegion is an OpStep that merge two regions. @@ -386,7 +386,7 @@ func (mr MergeRegion) Influence(opInfluence OpInfluence, region *core.RegionInfo // Timeout returns true if the step is timeout. func (mr MergeRegion) Timeout(start time.Time, regionSize int64) bool { - return time.Since(start) > maxFastWaitTime(regionSize)*10 + return time.Since(start) > fastStepWaitDuration(regionSize)*10 } // SplitRegion is an OpStep that splits a region. @@ -428,7 +428,7 @@ func (sr SplitRegion) CheckInProgress(_ ClusterInformer, _ *core.RegionInfo) err // Timeout returns true if the step is timeout. func (sr SplitRegion) Timeout(start time.Time, regionSize int64) bool { - return time.Since(start) > maxFastWaitTime(regionSize) + return time.Since(start) > fastStepWaitDuration(regionSize) } // DemoteVoter is very similar to DemoteFollower. But it allows Demote Leader. @@ -460,7 +460,7 @@ func (dv DemoteVoter) IsFinish(region *core.RegionInfo) bool { // Timeout returns true if the step is timeout. func (dv DemoteVoter) Timeout(start time.Time, regionSize int64) bool { - return time.Since(start) > maxFastWaitTime(regionSize) + return time.Since(start) > fastStepWaitDuration(regionSize) } // ChangePeerV2Enter is an OpStep that uses joint consensus to request all PromoteLearner and DemoteVoter. @@ -607,7 +607,7 @@ func (cpe ChangePeerV2Enter) GetRequest() *pdpb.ChangePeerV2 { // Timeout returns true if the step is timeout. func (cpe ChangePeerV2Enter) Timeout(start time.Time, regionSize int64) bool { count := uint64(len(cpe.PromoteLearners)+len(cpe.DemoteVoters)) + 1 - return time.Since(start) > maxFastWaitTime(regionSize)*time.Duration(count) + return time.Since(start) > fastStepWaitDuration(regionSize)*time.Duration(count) } // ChangePeerV2Leave is an OpStep that leaves the joint state. @@ -732,7 +732,7 @@ func (cpl ChangePeerV2Leave) Influence(_ OpInfluence, _ *core.RegionInfo) {} // Timeout returns true if the step is timeout. func (cpl ChangePeerV2Leave) Timeout(start time.Time, regionSize int64) bool { count := uint64(len(cpl.PromoteLearners)+len(cpl.DemoteVoters)) + 1 - return time.Since(start) > maxFastWaitTime(regionSize)*time.Duration(count) + return time.Since(start) > fastStepWaitDuration(regionSize)*time.Duration(count) } func validateStore(ci ClusterInformer, id uint64) error { @@ -746,7 +746,7 @@ func validateStore(ci ClusterInformer, id uint64) error { return nil } -func maxSlowWaitTime(regionSize int64) time.Duration { +func slowStepWaitDuration(regionSize int64) time.Duration { seconds := DefaultSlowExecutorRate * regionSize wait := time.Duration(seconds) * time.Second if wait < SlowOperatorWaitTime { @@ -755,7 +755,7 @@ func maxSlowWaitTime(regionSize int64) time.Duration { return wait } -func maxFastWaitTime(regionSize int64) time.Duration { +func fastStepWaitDuration(regionSize int64) time.Duration { seconds := int64(DefaultFastExecutorRate * float64(regionSize)) wait := time.Duration(seconds) * time.Second if wait < FastOperatorWaitTime { From e74e2f7f1243cde98732d2fca3b2620c6ef9fa3b Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Wed, 23 Feb 2022 11:55:00 +0800 Subject: [PATCH 25/25] change comment Signed-off-by: bufferflies <1045931706@qq.com> --- server/schedule/operator/operator.go | 4 +- server/schedule/operator/operator_test.go | 63 ++++++++++++++++------- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/server/schedule/operator/operator.go b/server/schedule/operator/operator.go index 62638a2271a..e9a9ddc0043 100644 --- a/server/schedule/operator/operator.go +++ b/server/schedule/operator/operator.go @@ -83,9 +83,9 @@ func (o *Operator) String() string { for i := range o.steps { stepStrs[i] = o.steps[i].String() } - s := fmt.Sprintf("%s {%s} (kind:%s, region:%v(%v, %v), createAt:%s, startAt:%s, currentStep:%v, steps:[%s])", + s := fmt.Sprintf("%s {%s} (kind:%s, region:%v(%v, %v), createAt:%s, startAt:%s, currentStep:%v, size:%d, steps:[%s])", o.desc, o.brief, o.kind, o.regionID, o.regionEpoch.GetVersion(), o.regionEpoch.GetConfVer(), o.GetCreateTime(), - o.GetStartTime(), atomic.LoadInt32(&o.currentStep), strings.Join(stepStrs, ", ")) + o.GetStartTime(), atomic.LoadInt32(&o.currentStep), o.ApproximateSize, strings.Join(stepStrs, ", ")) if o.CheckSuccess() { s += " finished" } diff --git a/server/schedule/operator/operator_test.go b/server/schedule/operator/operator_test.go index deb12aae510..dc3eae08850 100644 --- a/server/schedule/operator/operator_test.go +++ b/server/schedule/operator/operator_test.go @@ -431,48 +431,71 @@ func (s *testOperatorSuite) TestOpStepTimeout(c *C) { // case1: 10GB region will have 60,000s to executor. step: []OpStep{AddLearner{}, AddPeer{}}, regionSize: 10 * 1000, - start: time.Now().Add(-time.Second * (6*10*1000 + 20)), + start: time.Now().Add(-(time.Second*(6*10*1000) + time.Second)), expect: true, }, { step: []OpStep{AddLearner{}, AddPeer{}}, regionSize: 10 * 1000, - start: time.Now().Add(-time.Second * (6*10*1000 - 20)), + start: time.Now().Add(-(time.Second*(6*10*1000) - time.Second)), expect: false, }, { // case2: 10MB region will have at least SlowOperatorWaitTime(10min) to executor. step: []OpStep{AddLearner{}, AddPeer{}}, regionSize: 10, - start: time.Now().Add(-(SlowOperatorWaitTime + time.Second*20)), + start: time.Now().Add(-(SlowOperatorWaitTime + time.Second)), expect: true, }, { step: []OpStep{AddLearner{}, AddPeer{}}, regionSize: 10, - start: time.Now().Add(-time.Second * (6*10 + 20)), + start: time.Now().Add(-(time.Second*(6*10) - time.Second)), expect: false, }, { - // case3: RemovePeer, TransferLeader, SplitRegion, MergeRegion, PromoteLearner will have FastOperatorWaitTime(10s) to executor. - step: []OpStep{RemovePeer{}, TransferLeader{}, SplitRegion{}, PromoteLearner{}}, - start: time.Now().Add(-(FastOperatorWaitTime + time.Second)), - expect: true, + // case3: 10GB region will have 1000s to executor for RemovePeer, TransferLeader, SplitRegion, PromoteLearner. + step: []OpStep{RemovePeer{}, TransferLeader{}, SplitRegion{}, PromoteLearner{}}, + start: time.Now().Add(-(time.Second*(1000) + time.Second)), + regionSize: 10 * 1000, + expect: true, }, { - step: []OpStep{RemovePeer{}, TransferLeader{}, SplitRegion{}, PromoteLearner{}}, - start: time.Now().Add(-(FastOperatorWaitTime - time.Second)), - expect: false, + step: []OpStep{RemovePeer{}, TransferLeader{}, SplitRegion{}, PromoteLearner{}}, + start: time.Now().Add(-(time.Second*(1000) - time.Second)), + regionSize: 10 * 1000, + expect: false, }, { - // case4: ChangePeerV2Enter, ChangePeerV2Leave has FastOperatorWaitTime times than FastOperatorWaitTime + // case4: 10MB will have at lease FastOperatorWaitTime(10s) to executor for RemovePeer, TransferLeader, SplitRegion, PromoteLearner. + step: []OpStep{RemovePeer{}, TransferLeader{}, SplitRegion{}, PromoteLearner{}}, + start: time.Now().Add(-(FastOperatorWaitTime + time.Second)), + regionSize: 10, + expect: true, + }, { + step: []OpStep{RemovePeer{}, TransferLeader{}, SplitRegion{}, PromoteLearner{}}, + start: time.Now().Add(-(FastOperatorWaitTime - time.Second)), + regionSize: 10, + expect: false, + }, { + // case5: 10GB region will have 1000*3 for ChangePeerV2Enter, ChangePeerV2Leave. step: []OpStep{ChangePeerV2Enter{PromoteLearners: []PromoteLearner{{}, {}}}, ChangePeerV2Leave{PromoteLearners: []PromoteLearner{{}, {}}}}, - start: time.Now().Add(-(FastOperatorWaitTime*(2+1) + time.Second)), - expect: true, + start: time.Now().Add(-(time.Second*(3000) + time.Second)), + regionSize: 10 * 1000, + expect: true, + }, { + step: []OpStep{ChangePeerV2Enter{PromoteLearners: []PromoteLearner{{}, {}}}, + ChangePeerV2Leave{PromoteLearners: []PromoteLearner{{}, {}}}}, + start: time.Now().Add(-(time.Second*(3000) - time.Second)), + regionSize: 10 * 1000, + expect: false, }, { - step: []OpStep{MergeRegion{}}, - start: time.Now().Add(-(FastOperatorWaitTime*10 + time.Second)), - expect: true, + //case6: 10GB region will have 1000*10s for ChangePeerV2Enter, ChangePeerV2Leave. + step: []OpStep{MergeRegion{}}, + start: time.Now().Add(-(time.Second*(10000) + time.Second)), + regionSize: 10 * 1000, + expect: true, }, { - step: []OpStep{MergeRegion{}}, - start: time.Now().Add(-(FastOperatorWaitTime*10 - time.Second)), - expect: false, + step: []OpStep{MergeRegion{}}, + start: time.Now().Add(-(time.Second*(10000) - time.Second)), + regionSize: 10 * 1000, + expect: false, }, } for _, v := range testdata {