From 948b5dc8e13f12871c6e5728457a3e719df25751 Mon Sep 17 00:00:00 2001 From: nolouch Date: Wed, 18 Oct 2023 18:20:47 +0800 Subject: [PATCH] placement: fix the geopartition issues Signed-off-by: nolouch --- metrics/grafana/pd.json | 105 ++++++++++++++++++ pkg/schedule/placement/rule_manager.go | 14 +++ pkg/schedule/placement/rule_manager_test.go | 3 + pkg/schedule/schedulers/metrics.go | 9 ++ .../schedulers/scheduler_controller.go | 20 +++- 5 files changed, 149 insertions(+), 2 deletions(-) diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index c811303d1c42..809244771b7c 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -1139,6 +1139,111 @@ "timeFrom": null, "timeShift": null }, + { + "bars": false, + "cacheTimeout": null, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The current number of placement rules and rule groups", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 4, + "x": 16, + "y": 13 + }, + "hiddenSeries": false, + "id": 22, + "interval": null, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxDataPoints": 100, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.10", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(pd_rule_manager_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}) by (type)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{type}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], 
"timeShift": null, + "title": "Placement Rules Status", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:192", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:193", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "collapsed": true, "gridPos": { diff --git a/pkg/schedule/placement/rule_manager.go b/pkg/schedule/placement/rule_manager.go index bdca4cc1b19d..a7e169b74aab 100644 --- a/pkg/schedule/placement/rule_manager.go +++ b/pkg/schedule/placement/rule_manager.go @@ -321,6 +321,20 @@ func (m *RuleManager) GetAllRules() []*Rule { return rules } +// GetRulesCount returns the number of rules in the current rule config; it takes the manager's read lock. +func (m *RuleManager) GetRulesCount() int { + m.RLock() + defer m.RUnlock() + return len(m.ruleConfig.rules) +} + +// GetGroupsCount returns the number of rule groups in the current rule config; it takes the manager's read lock. +func (m *RuleManager) GetGroupsCount() int { + m.RLock() + defer m.RUnlock() + return len(m.ruleConfig.groups) +} + // GetRulesByGroup returns sorted rules of a group. 
func (m *RuleManager) GetRulesByGroup(group string) []*Rule { m.RLock() diff --git a/pkg/schedule/placement/rule_manager_test.go b/pkg/schedule/placement/rule_manager_test.go index a6454337aa84..8c6352618888 100644 --- a/pkg/schedule/placement/rule_manager_test.go +++ b/pkg/schedule/placement/rule_manager_test.go @@ -125,6 +125,7 @@ func TestAdjustRule(t *testing.T) { IsWitness: true, LabelConstraints: []LabelConstraint{{Key: "engine", Op: "in", Values: []string{"tiflash"}}}, }, "tiflash")) + } func TestLeaderCheck(t *testing.T) { @@ -163,6 +164,8 @@ func TestSaveLoad(t *testing.T) { re.Equal(rules[0].String(), m2.GetRule("pd", "default").String()) re.Equal(rules[1].String(), m2.GetRule("foo", "baz").String()) re.Equal(rules[2].String(), m2.GetRule("foo", "bar").String()) + re.Equal(3, manager.GetRulesCount()) + re.Equal(2, manager.GetGroupsCount()) } func TestSetAfterGet(t *testing.T) { diff --git a/pkg/schedule/schedulers/metrics.go b/pkg/schedule/schedulers/metrics.go index 2052bc923afa..34e4606a7ce3 100644 --- a/pkg/schedule/schedulers/metrics.go +++ b/pkg/schedule/schedulers/metrics.go @@ -134,10 +134,19 @@ var ( Name: "hot_pending_sum", Help: "Pending influence sum of store in hot region scheduler.", }, []string{"store", "rw", "dim"}) + + ruleStatusGauge = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "pd", + Subsystem: "rule_manager", + Name: "status", + Help: "Status of the rule.", + }, []string{"type"}) ) func init() { prometheus.MustRegister(schedulerStatusGauge) + prometheus.MustRegister(ruleStatusGauge) prometheus.MustRegister(schedulerCounter) prometheus.MustRegister(balanceWitnessCounter) prometheus.MustRegister(hotSchedulerResultCounter) diff --git a/pkg/schedule/schedulers/scheduler_controller.go b/pkg/schedule/schedulers/scheduler_controller.go index d58a78ca82f3..0f2264392aa5 100644 --- a/pkg/schedule/schedulers/scheduler_controller.go +++ b/pkg/schedule/schedulers/scheduler_controller.go @@ -36,7 +36,11 @@ import ( const 
maxScheduleRetries = 10 -var denySchedulersByLabelerCounter = labeler.LabelerEventCounter.WithLabelValues("schedulers", "deny") +var ( + denySchedulersByLabelerCounter = labeler.LabelerEventCounter.WithLabelValues("schedulers", "deny") + rulesCntStatusGauge = ruleStatusGauge.WithLabelValues("rule_count") + groupsCntStatusGauge = ruleStatusGauge.WithLabelValues("group_count") +) // Controller is used to manage all schedulers. type Controller struct { @@ -108,7 +112,6 @@ func (c *Controller) GetSchedulerHandlers() map[string]http.Handler { // CollectSchedulerMetrics collects metrics of all schedulers. func (c *Controller) CollectSchedulerMetrics() { c.RLock() - defer c.RUnlock() for _, s := range c.schedulers { var allowScheduler float64 // If the scheduler is not allowed to schedule, it will disappear in Grafana panel. @@ -118,6 +121,15 @@ func (c *Controller) CollectSchedulerMetrics() { } schedulerStatusGauge.WithLabelValues(s.Scheduler.GetName(), "allow").Set(allowScheduler) } + c.RUnlock() + ruleMgr := c.cluster.GetRuleManager() + if ruleMgr == nil { + return + } + ruleCnt := ruleMgr.GetRulesCount() + groupCnt := ruleMgr.GetGroupsCount() + rulesCntStatusGauge.Set(float64(ruleCnt)) + groupsCntStatusGauge.Set(float64(groupCnt)) } func (c *Controller) isSchedulingHalted() bool { @@ -127,6 +139,10 @@ func (c *Controller) isSchedulingHalted() bool { // ResetSchedulerMetrics resets metrics of all schedulers. func (c *Controller) ResetSchedulerMetrics() { schedulerStatusGauge.Reset() + ruleStatusGauge.Reset() + // Reset deleted the cached child gauges from the vector, so look them up again; NOTE(review): these package-level vars are reassigned without a lock while CollectSchedulerMetrics reads them, confirm the two never run concurrently. + rulesCntStatusGauge = ruleStatusGauge.WithLabelValues("rule_count") + groupsCntStatusGauge = ruleStatusGauge.WithLabelValues("group_count") } // AddSchedulerHandler adds the HTTP handler for a scheduler.