Skip to content

Commit

Permalink
collect scheduling statistics
Browse files Browse the repository at this point in the history
Signed-off-by: Ryan Leung <rleungx@gmail.com>
  • Loading branch information
rleungx committed Sep 20, 2023
1 parent b61a318 commit 7b172f2
Show file tree
Hide file tree
Showing 8 changed files with 129 additions and 127 deletions.
56 changes: 28 additions & 28 deletions metrics/grafana/pd.json
Original file line number Diff line number Diff line change
Expand Up @@ -2591,7 +2591,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", event=\"create\"}[1m])) by (type)",
"expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"create\"}[1m])) by (type)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{type}}",
Expand Down Expand Up @@ -2684,7 +2684,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", event=\"check\"}[1m])) by (type)",
"expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"check\"}[1m])) by (type)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{type}}",
Expand Down Expand Up @@ -2777,7 +2777,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", event=\"finish\"}[1m])) by (type)",
"expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"finish\"}[1m])) by (type)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{type}}",
Expand Down Expand Up @@ -2869,7 +2869,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", event=\"timeout\"}[1m])) by (type)",
"expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"timeout\"}[1m])) by (type)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{type}}",
Expand Down Expand Up @@ -2962,15 +2962,15 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", event=\"cancel\"}[1m])) by (type)",
"expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"cancel\"}[1m])) by (type)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{type}}",
"refId": "A",
"step": 4
},
{
"expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", event=\"replace\"}[1m])) by (type)",
"expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", event=\"replace\"}[1m])) by (type)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{type}}",
Expand Down Expand Up @@ -3063,7 +3063,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\"}[1m])) by (event)",
"expr": "sum(delta(pd_schedule_operators_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (event)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{event}}",
Expand Down Expand Up @@ -4765,7 +4765,7 @@
"steppedLine": false,
"targets": [
{
"expr": "pd_hotspot_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", instance=\"$instance\", type=\"hot_write_region_as_leader\"}",
"expr": "pd_hotspot_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", type=\"hot_write_region_as_leader\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{address}}-store-{{store}}",
Expand Down Expand Up @@ -4861,7 +4861,7 @@
"steppedLine": false,
"targets": [
{
"expr": "pd_hotspot_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", instance=\"$instance\", type=\"hot_write_region_as_peer\"}",
"expr": "pd_hotspot_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", type=\"hot_write_region_as_peer\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{address}}-store-{{store}}",
Expand Down Expand Up @@ -5070,7 +5070,7 @@
"steppedLine": false,
"targets": [
{
"expr": "pd_hotspot_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", instance=\"$instance\", type=\"total_write_keys_as_leader\"}",
"expr": "pd_hotspot_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", type=\"total_write_keys_as_leader\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{address}}-store-{{store}}",
Expand Down Expand Up @@ -5175,7 +5175,7 @@
"steppedLine": false,
"targets": [
{
"expr": "pd_hotspot_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", instance=\"$instance\", type=\"total_write_bytes_as_leader\"}",
"expr": "pd_hotspot_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", type=\"total_write_bytes_as_leader\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{address}}-store-{{store}}",
Expand Down Expand Up @@ -6019,7 +6019,7 @@
"steppedLine": false,
"targets": [
{
"expr": "pd_hotspot_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", instance=\"$instance\", type=\"hot_read_region_as_peer\"}",
"expr": "pd_hotspot_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", type=\"hot_read_region_as_peer\"}",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
Expand All @@ -6028,7 +6028,7 @@
"step": 4
},
{
"expr": "pd_hotspot_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", instance=\"$instance\", type=\"hot_read_region_as_leader\"}",
"expr": "pd_hotspot_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", store=~\"$store\", type=\"hot_read_region_as_leader\"}",
"hide": true,
"interval": "",
"legendFormat": "{{address}}-store-{{store}}-leader",
Expand Down Expand Up @@ -6993,7 +6993,7 @@
"steppedLine": false,
"targets": [
{
"expr": "pd_scheduler_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"allow\",instance=\"$instance\"}",
"expr": "pd_scheduler_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"allow\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{kind}}",
Expand Down Expand Up @@ -7091,14 +7091,14 @@
"steppedLine": false,
"targets": [
{
"expr": "-sum(delta(pd_scheduler_balance_direction{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=\"$instance\",type=\"balance-leader-scheduler\"}[1m])) by (source)",
"expr": "-sum(delta(pd_scheduler_balance_direction{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"balance-leader-scheduler\"}[1m])) by (source)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "store-{{source}}",
"refId": "A"
},
{
"expr": "sum(delta(pd_scheduler_balance_direction{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=\"$instance\",type=\"balance-leader-scheduler\"}[1m])) by (target)",
"expr": "sum(delta(pd_scheduler_balance_direction{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"balance-leader-scheduler\"}[1m])) by (target)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "store-{{target}}",
Expand Down Expand Up @@ -7195,14 +7195,14 @@
"steppedLine": false,
"targets": [
{
"expr": "-sum(delta(pd_scheduler_balance_direction{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=\"$instance\",type=\"balance-region-scheduler\"}[1m])) by (source)",
"expr": "-sum(delta(pd_scheduler_balance_direction{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"balance-region-scheduler\"}[1m])) by (source)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "store-{{source}}",
"refId": "A"
},
{
"expr": "sum(delta(pd_scheduler_balance_direction{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=\"$instance\",type=\"balance-region-scheduler\"}[1m])) by (target)",
"expr": "sum(delta(pd_scheduler_balance_direction{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"balance-region-scheduler\"}[1m])) by (target)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "store-{{target}}",
Expand Down Expand Up @@ -7297,7 +7297,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(pd_scheduler_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"balance-leader-scheduler\"}[5m])) by (name)",
"expr": "sum(rate(pd_scheduler_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"balance-leader-scheduler\"}[5m])) by (name)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{name}}",
Expand All @@ -7306,7 +7306,7 @@
"step": 4
},
{
"expr": "sum(rate(pd_scheduler_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"balance-leader-scheduler\"}[5m]))",
"expr": "sum(rate(pd_scheduler_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"balance-leader-scheduler\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "total",
Expand Down Expand Up @@ -7403,7 +7403,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(pd_scheduler_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"balance-region-scheduler\"}[5m])) by (name)",
"expr": "sum(rate(pd_scheduler_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"balance-region-scheduler\"}[5m])) by (name)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{name}}",
Expand All @@ -7412,7 +7412,7 @@
"step": 4
},
{
"expr": "sum(rate(pd_scheduler_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"balance-region-scheduler\"}[5m]))",
"expr": "sum(rate(pd_scheduler_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"balance-region-scheduler\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "total",
Expand Down Expand Up @@ -7509,7 +7509,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(pd_scheduler_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"balance-hot-region-scheduler\"}[5m])) by (name)",
"expr": "sum(rate(pd_scheduler_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"balance-hot-region-scheduler\"}[5m])) by (name)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{name}}",
Expand Down Expand Up @@ -7606,7 +7606,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(pd_scheduler_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"split-bucket-scheduler\"}[5m])) by (name)",
"expr": "sum(rate(pd_scheduler_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"split-bucket-scheduler\"}[5m])) by (name)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{name}}",
Expand Down Expand Up @@ -7702,7 +7702,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(delta(pd_scheduler_balance_direction{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\"}[1m])) by (type, source, target)",
"expr": "sum(delta(pd_scheduler_balance_direction{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (type, source, target)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{source}}-{{target}}-{{type}}",
Expand Down Expand Up @@ -7880,7 +7880,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(pd_checker_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"rule_checker\"}[1m])) by (name)",
"expr": "sum(rate(pd_checker_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"rule_checker\"}[1m])) by (name)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{name}}",
Expand Down Expand Up @@ -7972,7 +7972,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(pd_checker_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"replica_checker\"}[1m])) by (name)",
"expr": "sum(rate(pd_checker_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"replica_checker\"}[1m])) by (name)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{name}}",
Expand Down Expand Up @@ -8065,7 +8065,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(pd_checker_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"merge_checker\"}[1m])) by (name)",
"expr": "sum(rate(pd_checker_event_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"merge_checker\"}[1m])) by (name)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{name}}",
Expand Down
71 changes: 70 additions & 1 deletion pkg/mcs/scheduling/server/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"time"

"github.com/pingcap/errors"
"github.com/pingcap/failpoint"
"github.com/pingcap/kvproto/pkg/pdpb"
"github.com/pingcap/kvproto/pkg/schedulingpb"
"github.com/pingcap/log"
Expand Down Expand Up @@ -394,11 +395,79 @@ func (c *Cluster) runUpdateStoreStats() {
}
}

// runMetricsCollectionJob periodically calls collectMetrics until c.ctx is
// cancelled, then resets the metrics and returns. It marks itself done on
// c.wg when it exits, so it is meant to run in its own goroutine after a
// matching c.wg.Add.
func (c *Cluster) runMetricsCollectionJob() {
defer logutil.LogPanic()
defer c.wg.Done()

// Collect every 10 seconds by default.
ticker := time.NewTicker(10 * time.Second)
// When the "highFrequencyClusterJobs" failpoint is active, the default
// ticker is replaced by one that fires every microsecond.
failpoint.Inject("highFrequencyClusterJobs", func() {
ticker.Stop()
ticker = time.NewTicker(time.Microsecond)
})

// Registered only after the failpoint block: the deferred call binds to the
// ticker value current at this statement, so it stops whichever ticker is
// actually in use.
defer ticker.Stop()

for {
select {
case <-c.ctx.Done():
// Note: this message is logged before resetMetrics is actually invoked.
log.Info("metrics are reset")
c.resetMetrics()
log.Info("metrics collection job has been stopped")
return
case <-ticker.C:
c.collectMetrics()
}
}
}

// collectMetrics performs one round of metrics collection: it builds a fresh
// store statistics map from the persisted config, feeds it the hot stats of
// every store, and collects it, then collects scheduler, hot-spot and
// cluster-level metrics in that order.
func (c *Cluster) collectMetrics() {
	storeStats := statistics.NewStoreStatisticsMap(c.persistConfig)
	for _, store := range c.GetStores() {
		storeStats.ObserveHotStat(store, c.hotStat.StoresStats)
	}
	storeStats.Collect()

	schedulers := c.coordinator.GetSchedulersController()
	schedulers.CollectSchedulerMetrics()
	c.coordinator.CollectHotSpotMetrics()
	c.collectClusterMetrics()
}

// collectClusterMetrics collects region, label and hot-cache metrics.
// It does nothing when c.regionStats is nil; only regionStats is checked,
// labelStats and hotStat are used without a nil guard.
func (c *Cluster) collectClusterMetrics() {
	if c.regionStats != nil {
		c.regionStats.Collect()
		c.labelStats.Collect()
		// Hot cache metrics are collected last.
		c.hotStat.CollectMetrics()
	}
}

// resetMetrics resets everything collectMetrics reports: a store statistics
// map built from the persisted config, then the scheduler, hot-spot and
// cluster-level metrics, in that order.
func (c *Cluster) resetMetrics() {
	storeStats := statistics.NewStoreStatisticsMap(c.persistConfig)
	storeStats.Reset()

	schedulers := c.coordinator.GetSchedulersController()
	schedulers.ResetSchedulerMetrics()
	c.coordinator.ResetHotSpotMetrics()
	c.resetClusterMetrics()
}

// resetClusterMetrics resets region, label and hot-cache metrics.
// It does nothing when c.regionStats is nil; only regionStats is checked,
// labelStats and hotStat are used without a nil guard.
func (c *Cluster) resetClusterMetrics() {
	if c.regionStats != nil {
		c.regionStats.Reset()
		c.labelStats.Reset()
		// Hot cache metrics are reset last.
		c.hotStat.ResetMetrics()
	}
}

// StartBackgroundJobs starts background jobs.
//
// It launches three goroutines: the scheduler updater, the store stats
// updater and the metrics collection job. The wait group is raised by exactly
// one per goroutine; runMetricsCollectionJob releases its slot via
// c.wg.Done, and the other two jobs are presumed to do the same (verify
// against their definitions). Adding more than the number of goroutines
// started would leave the counter permanently above zero and make any
// c.wg.Wait block forever.
func (c *Cluster) StartBackgroundJobs() {
	// Keep this count in sync with the number of `go` statements below.
	c.wg.Add(3)
	go c.updateScheduler()
	go c.runUpdateStoreStats()
	go c.runMetricsCollectionJob()
}

// StopBackgroundJobs stops background jobs.
Expand Down
17 changes: 16 additions & 1 deletion pkg/mcs/scheduling/server/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,11 @@ func (o *PersistConfig) IsSchedulingHalted() bool {
return o.GetScheduleConfig().HaltScheduling
}

// GetStoresLimit returns the StoreLimit map held by the current schedule
// config.
func (o *PersistConfig) GetStoresLimit() map[uint64]sc.StoreLimitConfig {
	scheduleCfg := o.GetScheduleConfig()
	return scheduleCfg.StoreLimit
}

// GetStoreLimitByType returns the limit of a store with a given type.
func (o *PersistConfig) GetStoreLimitByType(storeID uint64, typ storelimit.Type) (returned float64) {
limit := o.GetStoreLimit(storeID)
Expand Down Expand Up @@ -620,11 +625,21 @@ func (o *PersistConfig) GetRegionMaxSize() uint64 {
return o.GetStoreConfig().GetRegionMaxSize()
}

// GetRegionMaxKeys returns the max region keys as reported by the current
// store config.
func (o *PersistConfig) GetRegionMaxKeys() uint64 {
	storeCfg := o.GetStoreConfig()
	return storeCfg.GetRegionMaxKeys()
}

// GetRegionSplitSize returns the region split size in MB, as reported by the
// current store config.
func (o *PersistConfig) GetRegionSplitSize() uint64 {
	storeCfg := o.GetStoreConfig()
	return storeCfg.GetRegionSplitSize()
}

// GetRegionSplitKeys returns the region split keys as reported by the
// current store config.
func (o *PersistConfig) GetRegionSplitKeys() uint64 {
	storeCfg := o.GetStoreConfig()
	return storeCfg.GetRegionSplitKeys()
}

// IsEnableRegionBucket return true if the region bucket is enabled.
func (o *PersistConfig) IsEnableRegionBucket() bool {
return o.GetStoreConfig().IsEnableRegionBucket()
Expand Down
3 changes: 3 additions & 0 deletions pkg/schedule/config/config_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ type SchedulerConfigProvider interface {
SharedConfigProvider

IsSchedulingHalted() bool
GetStoresLimit() map[uint64]StoreLimitConfig

IsSchedulerDisabled(string) bool
AddSchedulerCfg(string, []string)
Expand Down Expand Up @@ -137,6 +138,8 @@ type ConfProvider interface {
type StoreConfigProvider interface {
GetRegionMaxSize() uint64
GetRegionMaxKeys() uint64
GetRegionSplitSize() uint64
GetRegionSplitKeys() uint64
CheckRegionSize(uint64, uint64) error
CheckRegionKeys(uint64, uint64) error
IsEnableRegionBucket() bool
Expand Down
Loading

0 comments on commit 7b172f2

Please sign in to comment.