From 3dd3c7ba1487aa30305473c8349adc3a7c8ddd6a Mon Sep 17 00:00:00 2001 From: ti-srebot <66930949+ti-srebot@users.noreply.github.com> Date: Fri, 16 Apr 2021 21:02:33 +0800 Subject: [PATCH] schedule: add metrcis for region scatter (#3582) (#3596) --- metrics/grafana/pd.json | 214 ++++++++++++++++++++++++++++ server/schedule/filter/filters.go | 4 +- server/schedule/metrics.go | 18 +++ server/schedule/region_scatterer.go | 36 ++++- 4 files changed, 267 insertions(+), 5 deletions(-) diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index 6f423e1de39..26f869a8ef6 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -6853,6 +6853,220 @@ "title": "Scheduler", "type": "row" }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 1437, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "tidb-cluster", + "description": "", + "fill": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 1433, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(delta(pd_schedule_scatter_operators_count{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"skip\"}[1m])) by (event)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "skip-{{event}}", + "refId": "A" + }, + { + "expr": "delta(pd_schedule_scatter_operators_count{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"fail\"}[1m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "fail", + "refId": "B" + }, + { + "expr": "delta(pd_schedule_scatter_operators_count{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"success\"}[1m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "success", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "scatter operator event", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "opm", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "tidb-cluster", + "fill": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 20 + }, + "id": 1435, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(delta(pd_schedule_scatter_distribution{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",engine=\"tikv\",is_leader=\"false\"}[1m])) by (store)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "peer-{{store}}", + "refId": "A" + }, + { + "expr": "sum(delta(pd_schedule_scatter_distribution{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",engine=\"tikv\",is_leader=\"true\"}[1m])) by (store)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "leader-{{store}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "scatter store selection", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "opm", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "Scatter and Splitter", + "type": "row" + }, { "collapsed": true, "gridPos": { diff --git a/server/schedule/filter/filters.go b/server/schedule/filter/filters.go index e8cefee7206..62cf42315f4 100644 --- a/server/schedule/filter/filters.go +++ b/server/schedule/filter/filters.go @@ -389,7 +389,7 @@ func (f *StoreStateFilter) anyConditionMatch(typ int, opt *config.PersistOptions funcs = []conditionFunc{f.isTombstone, f.isOffline, f.isDown, f.isDisconnected, f.isBusy, f.exceedAddLimit, f.tooManySnapshots, f.tooManyPendingPeers} case scatterRegionTarget: - funcs = []conditionFunc{f.isTombstone, f.isOffline, f.isDown, f.isDisconnected} + funcs = []conditionFunc{f.isTombstone, f.isOffline, f.isDown, f.isDisconnected, f.isBusy} } for _, cf := range funcs { if cf(opt, store) { @@ -686,6 +686,8 @@ const ( EngineKey = "engine" // EngineTiFlash is the tiflash value of the engine label. EngineTiFlash = "tiflash" + // EngineTiKV indicates the tikv engine in metrics + EngineTiKV = "tikv" ) var allSpecialUses = []string{SpecialUseHotRegion, SpecialUseReserved} diff --git a/server/schedule/metrics.go b/server/schedule/metrics.go index 80205d8068d..0feddf6a159 100644 --- a/server/schedule/metrics.go +++ b/server/schedule/metrics.go @@ -73,6 +73,22 @@ var ( Name: "store_limit_cost", Help: "limit rate cost of store.", }, []string{"store", "limit_type"}) + + scatterCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "pd", + Subsystem: "schedule", + Name: "scatter_operators_count", + Help: "Counter of region scatter operators.", + }, []string{"type", "event"}) + + scatterDistributionCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "pd", + Subsystem: "schedule", + Name: "scatter_distribution", + Help: "Counter of the distribution in scatter.", + }, []string{"store", "is_leader", "engine"}) ) func init() { @@ -83,4 +99,6 @@ func init() { prometheus.MustRegister(storeLimitRateGauge) prometheus.MustRegister(storeLimitCostCounter) prometheus.MustRegister(operatorWaitCounter) + prometheus.MustRegister(scatterCounter) + prometheus.MustRegister(scatterDistributionCounter) } diff --git a/server/schedule/region_scatterer.go b/server/schedule/region_scatterer.go index 89e8261d213..03b480877f1 100644 --- a/server/schedule/region_scatterer.go +++ b/server/schedule/region_scatterer.go @@ -17,6 +17,7 @@ import ( "context" "fmt" "math" + "strconv" "sync" "time" @@ -137,6 +138,7 @@ const maxRetryLimit = 30 func (r *RegionScatterer) ScatterRegionsByRange(startKey, endKey []byte, group string, retryLimit int) ([]*operator.Operator, map[uint64]error, error) { regions := r.cluster.ScanRegions(startKey, endKey, -1) if len(regions) < 1 { + scatterCounter.WithLabelValues("skip", "empty-region").Inc() return nil, nil, errors.New("empty region") } failures := make(map[uint64]error, len(regions)) @@ -155,6 +157,7 @@ func (r *RegionScatterer) ScatterRegionsByRange(startKey, endKey []byte, group s // ScatterRegionsByID directly scatter regions by ScatterRegions func (r *RegionScatterer) ScatterRegionsByID(regionsID []uint64, group string, retryLimit int) ([]*operator.Operator, map[uint64]error, error) { if len(regionsID) < 1 { + scatterCounter.WithLabelValues("skip", "empty-region").Inc() return nil, nil, errors.New("empty region") } failures := make(map[uint64]error, len(regionsID)) @@ -162,6 +165,8 @@ func (r *RegionScatterer) ScatterRegionsByID(regionsID []uint64, group string, r for _, id := range regionsID { region := r.cluster.GetRegion(id) if region == nil { + scatterCounter.WithLabelValues("skip", "no-region").Inc() + log.Warn("failed to find region during scatter", zap.Uint64("region-id", id)) failures[id] = errors.New(fmt.Sprintf("failed to find region %v", id)) continue } @@ -187,6 +192,7 @@ func (r *RegionScatterer) ScatterRegionsByID(regionsID []uint64, group string, r // and the value of the failures indicates the failure error. func (r *RegionScatterer) ScatterRegions(regions map[uint64]*core.RegionInfo, failures map[uint64]error, group string, retryLimit int) ([]*operator.Operator, error) { if len(regions) < 1 { + scatterCounter.WithLabelValues("skip", "empty-region").Inc() return nil, errors.New("empty region") } if retryLimit > maxRetryLimit { @@ -226,14 +232,20 @@ func (r *RegionScatterer) ScatterRegions(regions map[uint64]*core.RegionInfo, fa func (r *RegionScatterer) Scatter(region *core.RegionInfo, group string) (*operator.Operator, error) { if !opt.IsRegionReplicated(r.cluster, region) { r.cluster.AddSuspectRegions(region.GetID()) + scatterCounter.WithLabelValues("skip", "not-replicated").Inc() + log.Warn("region not replicated during scatter", zap.Uint64("region-id", region.GetID())) return nil, errors.Errorf("region %d is not fully replicated", region.GetID()) } if region.GetLeader() == nil { + scatterCounter.WithLabelValues("skip", "no-leader").Inc() + log.Warn("region no leader during scatter", zap.Uint64("region-id", region.GetID())) return nil, errors.Errorf("region %d has no leader", region.GetID()) } if r.cluster.IsRegionHot(region) { + scatterCounter.WithLabelValues("skip", "hot").Inc() + log.Warn("region too hot during scatter", zap.Uint64("region-id", region.GetID())) return nil, errors.Errorf("region %d is hot", region.GetID()) } @@ -286,6 +298,7 @@ func (r *RegionScatterer) scatterRegion(region *core.RegionInfo, group string) * op, err := operator.CreateScatterRegionOperator("scatter-region", r.cluster, region, targetPeers, targetLeader) if err != nil { + scatterCounter.WithLabelValues("fail", "").Inc() for _, peer := range region.GetPeers() { targetPeers[peer.GetStoreId()] = peer } @@ -293,8 +306,11 @@ func (r *RegionScatterer) scatterRegion(region *core.RegionInfo, group string) * log.Debug("fail to create scatter region operator", errs.ZapError(err)) return nil } - r.Put(targetPeers, targetLeader, group) - op.SetPriorityLevel(core.HighPriority) + if op != nil { + scatterCounter.WithLabelValues("success", "").Inc() + r.Put(targetPeers, targetLeader, group) + op.SetPriorityLevel(core.HighPriority) + } return op } @@ -305,7 +321,7 @@ func (r *RegionScatterer) selectCandidates(region *core.RegionInfo, sourceStoreI return nil } filters := []filter.Filter{ - filter.NewExcludedFilter("scatter-region", nil, selectedStores), + filter.NewExcludedFilter(r.name, nil, selectedStores), } scoreGuard := filter.NewPlacementSafeguard(r.name, r.cluster, region, sourceStore) filters = append(filters, context.filters...) @@ -313,7 +329,7 @@ func (r *RegionScatterer) selectCandidates(region *core.RegionInfo, sourceStoreI stores := r.cluster.GetStores() candidates := make([]uint64, 0) for _, store := range stores { - if filter.Target(r.cluster.GetOpts(), store, filters) && !store.IsBusy() { + if filter.Target(r.cluster.GetOpts(), store, filters) { candidates = append(candidates, store.GetID()) } } @@ -375,10 +391,22 @@ func (r *RegionScatterer) Put(peers map[uint64]*metapb.Peer, leaderStoreID uint6 store := r.cluster.GetStore(storeID) if ordinaryFilter.Target(r.cluster.GetOpts(), store) { r.ordinaryEngine.selectedPeer.Put(storeID, group) + scatterDistributionCounter.WithLabelValues( + fmt.Sprintf("%v", storeID), + strconv.FormatBool(false), + filter.EngineTiKV).Inc() } else { engine := store.GetLabelValue(filter.EngineKey) r.specialEngines[engine].selectedPeer.Put(storeID, group) + scatterDistributionCounter.WithLabelValues( + fmt.Sprintf("%v", storeID), + strconv.FormatBool(false), + engine).Inc() } } r.ordinaryEngine.selectedLeader.Put(leaderStoreID, group) + scatterDistributionCounter.WithLabelValues( + fmt.Sprintf("%v", leaderStoreID), + strconv.FormatBool(true), + filter.EngineTiKV).Inc() }