Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scheduler: allow balance-leader-scheduler to generate multiple operators #4652

Merged
merged 19 commits into from
Mar 14, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions server/cluster/coordinator.go
Original file line number Diff line number Diff line change
Expand Up @@ -855,9 +855,9 @@ func (s *scheduleController) Schedule() []*operator.Operator {
}
cacheCluster := newCacheCluster(s.cluster)
// If the scheduler produced any operators, reset the interval to the minimal interval.
if op := s.Scheduler.Schedule(cacheCluster); op != nil {
if ops := s.Scheduler.Schedule(cacheCluster); len(ops) > 0 {
s.nextInterval = s.Scheduler.GetMinInterval()
return op
return ops
}
}
s.nextInterval = s.Scheduler.GetNextInterval(s.nextInterval)
Expand Down
12 changes: 6 additions & 6 deletions server/cluster/coordinator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1006,7 +1006,7 @@ func (s *testOperatorControllerSuite) TestStoreOverloaded(c *C) {
if time.Since(start) > time.Second {
break
}
c.Assert(ops, IsNil)
c.Assert(ops, HasLen, 0)
}

// reset all stores' limit
Expand All @@ -1024,7 +1024,7 @@ func (s *testOperatorControllerSuite) TestStoreOverloaded(c *C) {
// sleep 1 second to make sure that the token is filled up
time.Sleep(time.Second)
for i := 0; i < 100; i++ {
c.Assert(lb.Schedule(tc), NotNil)
c.Assert(len(lb.Schedule(tc)), Greater, 0)
}
}

Expand Down Expand Up @@ -1052,10 +1052,10 @@ func (s *testOperatorControllerSuite) TestStoreOverloadedWithReplace(c *C) {
c.Assert(oc.AddOperator(op2), IsTrue)
op3 := newTestOperator(1, tc.GetRegion(2).GetRegionEpoch(), operator.OpRegion, operator.AddPeer{ToStore: 1, PeerID: 3})
c.Assert(oc.AddOperator(op3), IsFalse)
c.Assert(lb.Schedule(tc), IsNil)
c.Assert(lb.Schedule(tc), HasLen, 0)
// sleep 2 seconds to make sure that the token is filled up
time.Sleep(2 * time.Second)
c.Assert(lb.Schedule(tc), NotNil)
c.Assert(len(lb.Schedule(tc)), Greater, 0)
}

func (s *testOperatorControllerSuite) TestDownStoreLimit(c *C) {
Expand Down Expand Up @@ -1146,7 +1146,7 @@ func (s *testScheduleControllerSuite) TestController(c *C) {

for i := schedulers.MinScheduleInterval; sc.GetInterval() != schedulers.MaxScheduleInterval; i = sc.GetNextInterval(i) {
c.Assert(sc.GetInterval(), Equals, i)
c.Assert(sc.Schedule(), IsNil)
c.Assert(sc.Schedule(), HasLen, 0)
}
// limit = 2
lb.limit = 2
Expand Down Expand Up @@ -1227,7 +1227,7 @@ func (s *testScheduleControllerSuite) TestInterval(c *C) {
for _, n := range idleSeconds {
sc.nextInterval = schedulers.MinScheduleInterval
for totalSleep := time.Duration(0); totalSleep <= time.Second*time.Duration(n); totalSleep += sc.GetInterval() {
c.Assert(sc.Schedule(), IsNil)
c.Assert(sc.Schedule(), HasLen, 0)
}
c.Assert(sc.GetInterval(), Less, time.Second*time.Duration(n/2))
}
Expand Down
8 changes: 8 additions & 0 deletions server/schedule/operator_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -823,6 +823,14 @@ func (oc *OperatorController) GetFastOpInfluence(cluster Cluster, influence oper
}
}

// AddOpInfluence adds the operator's influence for the cluster.
func AddOpInfluence(op *operator.Operator, influence operator.OpInfluence, cluster Cluster) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is it used for?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It updates the store leader score so that the stores can be sorted again.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we use this function on L841-845 in the function NewTotalOpInfluence?

region := cluster.GetRegion(op.RegionID())
if region != nil {
op.TotalInfluence(influence, region)
}
}

// NewTotalOpInfluence creates an OpInfluence.
func NewTotalOpInfluence(operators []*operator.Operator, cluster Cluster) operator.OpInfluence {
influence := operator.OpInfluence{
Expand Down
77 changes: 65 additions & 12 deletions server/schedulers/balance_leader.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ func init() {
}
conf.Ranges = ranges
conf.Name = BalanceLeaderName
conf.Batch = 5
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how about using a constant?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fix

return nil
}
})
Expand All @@ -67,6 +68,7 @@ func init() {
type balanceLeaderSchedulerConfig struct {
Name string `json:"name"`
Ranges []core.KeyRange `json:"ranges"`
Batch int `json:"batch"`
}

type balanceLeaderScheduler struct {
Expand Down Expand Up @@ -148,6 +150,7 @@ func (l *balanceLeaderScheduler) Schedule(cluster schedule.Cluster) []*operator.
stores := cluster.GetStores()
sources := filter.SelectSourceStores(stores, l.filters, cluster.GetOpts())
targets := filter.SelectTargetStores(stores, l.filters, cluster.GetOpts())
result := make([]*operator.Operator, 0, l.conf.Batch)
sort.Slice(sources, func(i, j int) bool {
iOp := plan.GetOpInfluence(sources[i].GetID())
jOp := plan.GetOpInfluence(sources[j].GetID())
Expand All @@ -161,42 +164,92 @@ func (l *balanceLeaderScheduler) Schedule(cluster schedule.Cluster) []*operator.
targets[j].LeaderScore(leaderSchedulePolicy, jOp)
})

for i := 0; i < len(sources) || i < len(targets); i++ {
if i < len(sources) {
plan.source, plan.target = sources[i], nil
usedRegions := make(map[uint64]struct{})
sourcePoint := 0
targetPoint := 0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think at least some comments are needed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

refactor it

for sourcePoint < len(sources) || targetPoint < len(targets) {
if sourcePoint < len(sources) {
used := false
plan.source, plan.target = sources[sourcePoint], nil
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe index?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep, updated

retryLimit := l.retryQuota.GetLimit(plan.source)
log.Debug("store leader score", zap.String("scheduler", l.GetName()), zap.Uint64("source-store", plan.SourceStoreID()))
l.counter.WithLabelValues("high-score", plan.SourceMetricLabel()).Inc()
for j := 0; j < retryLimit; j++ {
schedulerCounter.WithLabelValues(l.GetName(), "total").Inc()
if ops := l.transferLeaderOut(plan); len(ops) > 0 {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about changing it to op?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fix

if _, ok := usedRegions[ops[0].RegionID()]; ok {
continue
}
l.retryQuota.ResetLimit(plan.source)
ops[0].Counters = append(ops[0].Counters, l.counter.WithLabelValues("transfer-out", plan.SourceMetricLabel()))
return ops
result = append(result, ops...)
if len(result) >= l.conf.Batch {
return result
}
used = true
usedRegions[ops[0].RegionID()] = struct{}{}
schedule.AddOpInfluence(ops[0], plan.opInfluence, cluster)
sortStores(sources, sourcePoint, func(i, j int) bool {
iOp := plan.GetOpInfluence(sources[i].GetID())
nolouch marked this conversation as resolved.
Show resolved Hide resolved
jOp := plan.GetOpInfluence(sources[j].GetID())
return sources[i].LeaderScore(leaderSchedulePolicy, iOp) <=
sources[j].LeaderScore(leaderSchedulePolicy, jOp)
})
break
}
}
l.Attenuate(plan.source)
log.Debug("no operator created for selected stores", zap.String("scheduler", l.GetName()), zap.Uint64("source", plan.SourceStoreID()))
if !used {
sourcePoint++
l.Attenuate(plan.source)
log.Debug("no operator created for selected stores", zap.String("scheduler", l.GetName()), zap.Uint64("source", plan.SourceStoreID()))
}
}
if i < len(targets) {
plan.source, plan.target = nil, targets[i]
if targetPoint < len(targets) {
used := false
plan.source, plan.target = nil, targets[targetPoint]
retryLimit := l.retryQuota.GetLimit(plan.target)
log.Debug("store leader score", zap.String("scheduler", l.GetName()), zap.Uint64("target-store", plan.TargetStoreID()))
l.counter.WithLabelValues("low-score", plan.TargetMetricLabel()).Inc()
for j := 0; j < retryLimit; j++ {
schedulerCounter.WithLabelValues(l.GetName(), "total").Inc()
if ops := l.transferLeaderIn(plan); len(ops) > 0 {
if _, ok := usedRegions[ops[0].RegionID()]; ok {
continue
}
l.retryQuota.ResetLimit(plan.target)
ops[0].Counters = append(ops[0].Counters, l.counter.WithLabelValues("transfer-in", plan.TargetMetricLabel()))
return ops
result = append(result, ops...)
if len(result) >= l.conf.Batch {
return result
}
used = true
usedRegions[ops[0].RegionID()] = struct{}{}
schedule.AddOpInfluence(ops[0], plan.opInfluence, cluster)
sortStores(targets, targetPoint, func(i, j int) bool {
iOp := plan.GetOpInfluence(targets[i].GetID())
jOp := plan.GetOpInfluence(targets[j].GetID())
return targets[i].LeaderScore(leaderSchedulePolicy, iOp) >=
targets[j].LeaderScore(leaderSchedulePolicy, jOp)
})
break
}
}
l.Attenuate(plan.target)
log.Debug("no operator created for selected stores", zap.String("scheduler", l.GetName()), zap.Uint64("target", plan.TargetStoreID()))
if !used {
targetPoint++
l.Attenuate(plan.target)
log.Debug("no operator created for selected stores", zap.String("scheduler", l.GetName()), zap.Uint64("target", plan.TargetStoreID()))
}
}
}
l.retryQuota.GC(append(sources, targets...))
return nil
return result
}

func sortStores(stores []*core.StoreInfo, pos int, less func(i, j int) bool) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about resortStores?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fix

swapper := func(i, j int) { stores[i], stores[j] = stores[j], stores[i] }
for ; pos+1 < len(stores) && less(pos, pos+1); pos++ {
swapper(pos, pos+1)
}
}

// transferLeaderOut transfers leader from the source store.
Expand Down
Loading