From 511b094952a3ac46c1407d861f3de0205d51258a Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 3 Jan 2024 18:33:32 +0800 Subject: [PATCH] resource_control: unify label name to group_name (#7547) (#7656) close tikv/pd#7546 Signed-off-by: nolouch Co-authored-by: nolouch --- .../resource_group/controller/controller.go | 16 ++++---- client/resource_group/controller/metrics.go | 16 ++++---- pkg/mcs/resourcemanager/server/manager.go | 40 +++++++++--------- pkg/mcs/resourcemanager/server/metrics.go | 41 ++++++++++--------- 4 files changed, 58 insertions(+), 55 deletions(-) diff --git a/client/resource_group/controller/controller.go b/client/resource_group/controller/controller.go index 14b5cc7e511..4a5b5779b33 100755 --- a/client/resource_group/controller/controller.go +++ b/client/resource_group/controller/controller.go @@ -346,7 +346,7 @@ func (c *ResourceGroupsController) Start(ctx context.Context) { continue } if _, ok := c.groupsController.LoadAndDelete(group.Name); ok { - resourceGroupStatusGauge.DeleteLabelValues(group.Name) + resourceGroupStatusGauge.DeleteLabelValues(group.Name, group.Name) } } else { // Prev-kv is compacted means there must have been a delete event before this event, @@ -431,7 +431,7 @@ func (c *ResourceGroupsController) tryGetResourceGroup(ctx context.Context, name // Check again to prevent initializing the same resource group concurrently. tmp, loaded := c.groupsController.LoadOrStore(group.GetName(), gc) if !loaded { - resourceGroupStatusGauge.WithLabelValues(name).Set(1) + resourceGroupStatusGauge.WithLabelValues(name, group.Name).Set(1) log.Info("[resource group controller] create resource group cost controller", zap.String("name", group.GetName())) } return tmp.(*groupCostController), nil @@ -448,7 +448,7 @@ func (c *ResourceGroupsController) cleanUpResourceGroup() { if equalRU(latestConsumption, *gc.run.consumption) { if gc.tombstone { c.groupsController.Delete(resourceGroupName) - resourceGroupStatusGauge.DeleteLabelValues(resourceGroupName) + resourceGroupStatusGauge.DeleteLabelValues(resourceGroupName, resourceGroupName) return true } gc.tombstone = true @@ -713,11 +713,11 @@ func newGroupCostController( name: group.Name, mainCfg: mainCfg, mode: group.GetMode(), - successfulRequestDuration: successfulRequestDuration.WithLabelValues(group.Name), - failedLimitReserveDuration: failedLimitReserveDuration.WithLabelValues(group.Name), - failedRequestCounter: failedRequestCounter.WithLabelValues(group.Name), - requestRetryCounter: requestRetryCounter.WithLabelValues(group.Name), - tokenRequestCounter: resourceGroupTokenRequestCounter.WithLabelValues(group.Name), + successfulRequestDuration: successfulRequestDuration.WithLabelValues(group.Name, group.Name), + failedLimitReserveDuration: failedLimitReserveDuration.WithLabelValues(group.Name, group.Name), + failedRequestCounter: failedRequestCounter.WithLabelValues(group.Name, group.Name), + requestRetryCounter: requestRetryCounter.WithLabelValues(group.Name, group.Name), + tokenRequestCounter: resourceGroupTokenRequestCounter.WithLabelValues(group.Name, group.Name), calculators: []ResourceCalculator{ newKVCalculator(mainCfg), newSQLCalculator(mainCfg), diff --git a/client/resource_group/controller/metrics.go b/client/resource_group/controller/metrics.go index 7e6a559265b..4261705a6f6 100644 --- a/client/resource_group/controller/metrics.go +++ b/client/resource_group/controller/metrics.go @@ -21,7 +21,9 @@ const ( requestSubsystem = "request" tokenRequestSubsystem = "token_request" - resourceGroupNameLabel = "name" + // TODO: remove old label in 8.x + resourceGroupNameLabel = "name" + newResourceGroupNameLabel = "resource_group" ) var ( @@ -31,7 +33,7 @@ var ( Subsystem: "resource_group", Name: "status", Help: "Status of the resource group.", - }, []string{resourceGroupNameLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) successfulRequestDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ @@ -40,7 +42,7 @@ var ( Name: "success", Buckets: []float64{.005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30}, // 0.005 ~ 30 Help: "Bucketed histogram of wait duration of successful request.", - }, []string{resourceGroupNameLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) failedLimitReserveDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ @@ -49,7 +51,7 @@ var ( Name: "limit_reserve_time_failed", Buckets: []float64{.005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30}, // 0.005 ~ 30 Help: "Bucketed histogram of wait duration of failed request.", - }, []string{resourceGroupNameLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) failedRequestCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ @@ -57,7 +59,7 @@ var ( Subsystem: requestSubsystem, Name: "fail", Help: "Counter of failed request.", - }, []string{resourceGroupNameLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) requestRetryCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ @@ -65,7 +67,7 @@ var ( Subsystem: requestSubsystem, Name: "retry", Help: "Counter of retry time for request.", - }, []string{resourceGroupNameLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) tokenRequestDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ @@ -81,7 +83,7 @@ var ( Subsystem: tokenRequestSubsystem, Name: "resource_group", Help: "Counter of token request by every resource group.", - }, []string{resourceGroupNameLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) ) var ( diff --git a/pkg/mcs/resourcemanager/server/manager.go b/pkg/mcs/resourcemanager/server/manager.go index 71cfd6aa447..c60dfe011e1 100644 --- a/pkg/mcs/resourcemanager/server/manager.go +++ b/pkg/mcs/resourcemanager/server/manager.go @@ -373,15 +373,15 @@ func (m *Manager) backgroundMetricsFlush(ctx context.Context) { var ( name = consumptionInfo.resourceGroupName - rruMetrics = readRequestUnitCost.WithLabelValues(name, ruLabelType) - wruMetrics = writeRequestUnitCost.WithLabelValues(name, ruLabelType) - sqlLayerRuMetrics = sqlLayerRequestUnitCost.WithLabelValues(name) - readByteMetrics = readByteCost.WithLabelValues(name, ruLabelType) - writeByteMetrics = writeByteCost.WithLabelValues(name, ruLabelType) - kvCPUMetrics = kvCPUCost.WithLabelValues(name, ruLabelType) - sqlCPUMetrics = sqlCPUCost.WithLabelValues(name, ruLabelType) - readRequestCountMetrics = requestCount.WithLabelValues(name, readTypeLabel) - writeRequestCountMetrics = requestCount.WithLabelValues(name, writeTypeLabel) + rruMetrics = readRequestUnitCost.WithLabelValues(name, name, ruLabelType) + wruMetrics = writeRequestUnitCost.WithLabelValues(name, name, ruLabelType) + sqlLayerRuMetrics = sqlLayerRequestUnitCost.WithLabelValues(name, name) + readByteMetrics = readByteCost.WithLabelValues(name, name, ruLabelType) + writeByteMetrics = writeByteCost.WithLabelValues(name, name, ruLabelType) + kvCPUMetrics = kvCPUCost.WithLabelValues(name, name, ruLabelType) + sqlCPUMetrics = sqlCPUCost.WithLabelValues(name, name, ruLabelType) + readRequestCountMetrics = requestCount.WithLabelValues(name, name, readTypeLabel) + writeRequestCountMetrics = requestCount.WithLabelValues(name, name, writeTypeLabel) ) // RU info. if consumption.RRU > 0 { @@ -419,16 +419,16 @@ func (m *Manager) backgroundMetricsFlush(ctx context.Context) { // Clean up the metrics that have not been updated for a long time. for name, lastTime := range m.consumptionRecord { if time.Since(lastTime) > metricsCleanupTimeout { - readRequestUnitCost.DeleteLabelValues(name) - writeRequestUnitCost.DeleteLabelValues(name) - sqlLayerRequestUnitCost.DeleteLabelValues(name) - readByteCost.DeleteLabelValues(name) - writeByteCost.DeleteLabelValues(name) - kvCPUCost.DeleteLabelValues(name) - sqlCPUCost.DeleteLabelValues(name) - requestCount.DeleteLabelValues(name, readTypeLabel) - requestCount.DeleteLabelValues(name, writeTypeLabel) - availableRUCounter.DeleteLabelValues(name) + readRequestUnitCost.DeleteLabelValues(name, name) + writeRequestUnitCost.DeleteLabelValues(name, name) + sqlLayerRequestUnitCost.DeleteLabelValues(name, name) + readByteCost.DeleteLabelValues(name, name) + writeByteCost.DeleteLabelValues(name, name) + kvCPUCost.DeleteLabelValues(name, name) + sqlCPUCost.DeleteLabelValues(name, name) + requestCount.DeleteLabelValues(name, name, readTypeLabel) + requestCount.DeleteLabelValues(name, name, writeTypeLabel) + availableRUCounter.DeleteLabelValues(name, name) delete(m.consumptionRecord, name) } } @@ -442,7 +442,7 @@ func (m *Manager) backgroundMetricsFlush(ctx context.Context) { if ru < 0 { ru = 0 } - availableRUCounter.WithLabelValues(name).Set(ru) + availableRUCounter.WithLabelValues(name, name).Set(ru) } m.RUnlock() } diff --git a/pkg/mcs/resourcemanager/server/metrics.go b/pkg/mcs/resourcemanager/server/metrics.go index 25d0516d269..4322ed1a640 100644 --- a/pkg/mcs/resourcemanager/server/metrics.go +++ b/pkg/mcs/resourcemanager/server/metrics.go @@ -17,17 +17,18 @@ package server import "github.com/prometheus/client_golang/prometheus" const ( - namespace = "resource_manager" - serverSubsystem = "server" - ruSubsystem = "resource_unit" - resourceSubsystem = "resource" - resourceGroupNameLabel = "name" - typeLabel = "type" - readTypeLabel = "read" - writeTypeLabel = "write" - backgroundTypeLabel = "background" - tiflashTypeLabel = "ap" - defaultTypeLabel = "tp" + namespace = "resource_manager" + serverSubsystem = "server" + ruSubsystem = "resource_unit" + resourceSubsystem = "resource" + resourceGroupNameLabel = "name" + typeLabel = "type" + readTypeLabel = "read" + writeTypeLabel = "write" + backgroundTypeLabel = "background" + tiflashTypeLabel = "ap" + defaultTypeLabel = "tp" + newResourceGroupNameLabel = "resource_group" ) var ( @@ -47,21 +48,21 @@ var ( Subsystem: ruSubsystem, Name: "read_request_unit_sum", Help: "Counter of the read request unit cost for all resource groups.", - }, []string{resourceGroupNameLabel, typeLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel, typeLabel}) writeRequestUnitCost = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: namespace, Subsystem: ruSubsystem, Name: "write_request_unit_sum", Help: "Counter of the write request unit cost for all resource groups.", - }, []string{resourceGroupNameLabel, typeLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel, typeLabel}) sqlLayerRequestUnitCost = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: namespace, Subsystem: ruSubsystem, Name: "sql_layer_request_unit_sum", Help: "The number of the sql layer request unit cost for all resource groups.", - }, []string{resourceGroupNameLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) // Resource cost metrics. readByteCost = prometheus.NewCounterVec( @@ -70,35 +71,35 @@ var ( Subsystem: resourceSubsystem, Name: "read_byte_sum", Help: "Counter of the read byte cost for all resource groups.", - }, []string{resourceGroupNameLabel, typeLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel, typeLabel}) writeByteCost = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: namespace, Subsystem: resourceSubsystem, Name: "write_byte_sum", Help: "Counter of the write byte cost for all resource groups.", - }, []string{resourceGroupNameLabel, typeLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel, typeLabel}) kvCPUCost = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: namespace, Subsystem: resourceSubsystem, Name: "kv_cpu_time_ms_sum", Help: "Counter of the KV CPU time cost in milliseconds for all resource groups.", - }, []string{resourceGroupNameLabel, typeLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel, typeLabel}) sqlCPUCost = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: namespace, Subsystem: resourceSubsystem, Name: "sql_cpu_time_ms_sum", Help: "Counter of the SQL CPU time cost in milliseconds for all resource groups.", - }, []string{resourceGroupNameLabel, typeLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel, typeLabel}) requestCount = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: namespace, Subsystem: resourceSubsystem, Name: "request_count", Help: "The number of read/write requests for all resource groups.", - }, []string{resourceGroupNameLabel, typeLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel, typeLabel}) availableRUCounter = prometheus.NewGaugeVec( prometheus.GaugeOpts{ @@ -106,7 +107,7 @@ var ( Subsystem: ruSubsystem, Name: "available_ru", Help: "Counter of the available RU for all resource groups.", - }, []string{resourceGroupNameLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) ) func init() {