Skip to content

Commit

Permalink
placement: add rule/group count metrics
Browse files Browse the repository at this point in the history
Signed-off-by: nolouch <nolouch@gmail.com>
  • Loading branch information
nolouch committed Oct 19, 2023
1 parent cb9c70c commit 4bc0ab7
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 2 deletions.
105 changes: 105 additions & 0 deletions metrics/grafana/pd.json
Original file line number Diff line number Diff line change
Expand Up @@ -1139,6 +1139,111 @@
"timeFrom": null,
"timeShift": null
},
{
"bars": false,
"cacheTimeout": null,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"description": "The current number of placement rules and placement rule groups",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 6,
"w": 4,
"x": 16,
"y": 13
},
"hiddenSeries": false,
"id": 22,
"interval": null,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"maxDataPoints": 100,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.5.10",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "sum(pd_rule_manager_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}) by (type)",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{type}}",
"refId": "A",
"step": 4
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Placement Rules Status",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:192",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:193",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"collapsed": true,
"gridPos": {
Expand Down
14 changes: 14 additions & 0 deletions pkg/schedule/placement/rule_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,20 @@ func (m *RuleManager) GetAllRules() []*Rule {
return rules
}

// GetRulesCount reports how many entries m.ruleConfig.rules currently holds.
// The length is read while holding the manager's read lock.
func (m *RuleManager) GetRulesCount() int {
	m.RLock()
	defer m.RUnlock()
	rules := m.ruleConfig.rules
	return len(rules)
}

// GetGroupsCount reports how many entries m.ruleConfig.groups currently holds.
// The length is read while holding the manager's read lock.
func (m *RuleManager) GetGroupsCount() int {
	m.RLock()
	defer m.RUnlock()
	groups := m.ruleConfig.groups
	return len(groups)
}

// GetRulesByGroup returns sorted rules of a group.
func (m *RuleManager) GetRulesByGroup(group string) []*Rule {
m.RLock()
Expand Down
3 changes: 3 additions & 0 deletions pkg/schedule/placement/rule_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ func TestAdjustRule(t *testing.T) {
IsWitness: true,
LabelConstraints: []LabelConstraint{{Key: "engine", Op: "in", Values: []string{"tiflash"}}},
}, "tiflash"))

}

func TestLeaderCheck(t *testing.T) {
Expand Down Expand Up @@ -163,6 +164,8 @@ func TestSaveLoad(t *testing.T) {
re.Equal(rules[0].String(), m2.GetRule("pd", "default").String())
re.Equal(rules[1].String(), m2.GetRule("foo", "baz").String())
re.Equal(rules[2].String(), m2.GetRule("foo", "bar").String())
re.Equal(manager.GetRulesCount(), 3)
re.Equal(manager.GetGroupsCount(), 2)
}

func TestSetAfterGet(t *testing.T) {
Expand Down
9 changes: 9 additions & 0 deletions pkg/schedule/schedulers/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,19 @@ var (
Name: "hot_pending_sum",
Help: "Pending influence sum of store in hot region scheduler.",
}, []string{"store", "rw", "dim"})

ruleStatusGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "pd",
Subsystem: "rule_manager",
Name: "status",
Help: "Status of the rule.",
}, []string{"type"})
)

func init() {
prometheus.MustRegister(schedulerStatusGauge)
prometheus.MustRegister(ruleStatusGauge)
prometheus.MustRegister(schedulerCounter)
prometheus.MustRegister(balanceWitnessCounter)
prometheus.MustRegister(hotSchedulerResultCounter)
Expand Down
20 changes: 18 additions & 2 deletions pkg/schedule/schedulers/scheduler_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,11 @@ import (

const maxScheduleRetries = 10

var denySchedulersByLabelerCounter = labeler.LabelerEventCounter.WithLabelValues("schedulers", "deny")
// Children of metric vectors, looked up once by the package-level initializers.
var (
// denySchedulersByLabelerCounter is the labeler.LabelerEventCounter child
// for the label values ("schedulers", "deny").
denySchedulersByLabelerCounter = labeler.LabelerEventCounter.WithLabelValues("schedulers", "deny")
// rulesCntStatusGauge is the ruleStatusGauge child labelled "rule_count".
// CollectSchedulerMetrics sets it from the rule manager's GetRulesCount, and
// ResetSchedulerMetrics re-assigns it after resetting the vector.
rulesCntStatusGauge = ruleStatusGauge.WithLabelValues("rule_count")
// groupsCntStatusGauge is the ruleStatusGauge child labelled "group_count".
// CollectSchedulerMetrics sets it from the rule manager's GetGroupsCount, and
// ResetSchedulerMetrics re-assigns it after resetting the vector.
groupsCntStatusGauge = ruleStatusGauge.WithLabelValues("group_count")
)

// Controller is used to manage all schedulers.
type Controller struct {
Expand Down Expand Up @@ -108,7 +112,6 @@ func (c *Controller) GetSchedulerHandlers() map[string]http.Handler {
// CollectSchedulerMetrics collects metrics of all schedulers.
func (c *Controller) CollectSchedulerMetrics() {
c.RLock()
defer c.RUnlock()
for _, s := range c.schedulers {
var allowScheduler float64
// If the scheduler is not allowed to schedule, it will disappear in Grafana panel.
Expand All @@ -118,6 +121,15 @@ func (c *Controller) CollectSchedulerMetrics() {
}
schedulerStatusGauge.WithLabelValues(s.Scheduler.GetName(), "allow").Set(allowScheduler)
}
c.RUnlock()
ruleMgr := c.cluster.GetRuleManager()
if ruleMgr == nil {
return
}
ruleCnt := ruleMgr.GetRulesCount()
groupCnt := ruleMgr.GetGroupsCount()
rulesCntStatusGauge.Set(float64(ruleCnt))
groupsCntStatusGauge.Set(float64(groupCnt))
}

func (c *Controller) isSchedulingHalted() bool {
Expand All @@ -127,6 +139,10 @@ func (c *Controller) isSchedulingHalted() bool {
// ResetSchedulerMetrics clears the rule status and scheduler status gauge
// vectors, then re-creates the rule/group count children.
func (c *Controller) ResetSchedulerMetrics() {
	ruleStatusGauge.Reset()
	// Reset removed the previously created children from the vector, so
	// look them up again to put them back in the vector's map and refresh
	// the package-level handles.
	rulesCntStatusGauge, groupsCntStatusGauge =
		ruleStatusGauge.WithLabelValues("rule_count"),
		ruleStatusGauge.WithLabelValues("group_count")
	schedulerStatusGauge.Reset()
}

// AddSchedulerHandler adds the HTTP handler for a scheduler.
Expand Down

0 comments on commit 4bc0ab7

Please sign in to comment.