diff --git a/go.sum b/go.sum index 732f29695..042da6beb 100644 --- a/go.sum +++ b/go.sum @@ -306,8 +306,6 @@ github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= github.com/gobwas/ws v1.1.0-rc.5 h1:QOAag7FoBaBYYHRqzqkhhd8fq5RTubvI4v3Ft/gDVVQ= github.com/gobwas/ws v1.1.0-rc.5/go.mod h1:nzvNcVha5eUziGrbxFCo6qFIojQHjJV5cLYIbezhfL0= -github.com/gocrane/api v0.3.0 h1:ziH+zYQy/shiqQ6yskMs67e+bQ9WmPp8eCVhLW85NFQ= -github.com/gocrane/api v0.3.0/go.mod h1:GxI+t9AW8+NsHkz2JkPBIJN//9eLUjTZl1ScYAbXMbk= github.com/gocrane/api v0.4.0 h1:1IWP3gbkp3T4kX68w4+PfqUr4Cb/gaJrihLYg6aKOLY= github.com/gocrane/api v0.4.0/go.mod h1:GxI+t9AW8+NsHkz2JkPBIJN//9eLUjTZl1ScYAbXMbk= github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= diff --git a/pkg/metrics/ensuarance.go b/pkg/metrics/ensuarance.go index 0fb4a925f..26324fec9 100644 --- a/pkg/metrics/ensuarance.go +++ b/pkg/metrics/ensuarance.go @@ -26,6 +26,10 @@ const ( ExecutorErrorTotal = "executor_error_total" ExecutorEvictTotal = "executor_evict_total" PodResourceErrorTotal = "pod_resource_error_total" + + NodeCpuCannotBeReclaimedSeconds = "node_cpu_cannot_be_reclaimed_seconds" + NodeResourceRecommended = "node_resource_recommended" + NodeResourceRecommendedFrom = "node_resource_recommended_from" ) type StepLabel string @@ -42,15 +46,18 @@ const ( // Step for pod resource manager StepGetPeriod StepLabel = "getPeriod" StepUpdateQuota StepLabel = "updateQuota" + + StepGetExtResourceRecommended StepLabel = "getExtResourceRecommended" ) type SubComponent string const ( - SubComponentSchedule SubComponent = "schedule" - SubComponentThrottle SubComponent = "throttle" - SubComponentEvict SubComponent = "evict" - SubComponentPodResource SubComponent = "pod-resource-manager" + SubComponentSchedule SubComponent = "schedule" + SubComponentThrottle SubComponent = "throttle" + SubComponentEvict SubComponent = 
"evict" + SubComponentPodResource SubComponent = "pod-resource-manager" + SubComponentNodeResource SubComponent = "node-resource-manager" ) type AnalyzeType string @@ -180,6 +187,39 @@ var ( StabilityLevel: k8smetrics.ALPHA, }, []string{"subcomponent", "step"}, ) + + // nodeCpuCannotBeReclaimedSeconds records, per node, the cpu seconds that cannot be reclaimed + nodeCpuCannotBeReclaimedSeconds = k8smetrics.NewGaugeVec( + &k8smetrics.GaugeOpts{ + Namespace: CraneNamespace, + Subsystem: CraneAgentSubsystem, + Name: NodeCpuCannotBeReclaimedSeconds, + Help: "The cpu seconds that cannot be reclaimed.", + StabilityLevel: k8smetrics.ALPHA, + }, []string{"node"}, + ) + + // nodeResourceRecommended records the recommended value of each resource on a node + nodeResourceRecommended = k8smetrics.NewGaugeVec( + &k8smetrics.GaugeOpts{ + Namespace: CraneNamespace, + Subsystem: CraneAgentSubsystem, + Name: NodeResourceRecommended, + Help: "The value of recommendation.", + StabilityLevel: k8smetrics.ALPHA, + }, []string{"subcomponent", "step", "resourceName", "node"}, + ) + + // nodeResourceRecommendedFrom records where the recommendation came from: 1 for "tsp", 0 for "local", -1 otherwise + nodeResourceRecommendedFrom = k8smetrics.NewGaugeVec( + &k8smetrics.GaugeOpts{ + Namespace: CraneNamespace, + Subsystem: CraneAgentSubsystem, + Name: NodeResourceRecommendedFrom, + Help: "The value of vap recommendation.", + StabilityLevel: k8smetrics.ALPHA, + }, []string{"subcomponent", "step", "resourceName", "node"}, + ) ) var registerCraneAgentMetricsOnce sync.Once @@ -195,6 +235,9 @@ func RegisterCraneAgent() { legacyregistry.MustRegister(executorStatusCounts) legacyregistry.MustRegister(executorErrorCounts) legacyregistry.MustRegister(executorEvictCounts) + legacyregistry.MustRegister(nodeCpuCannotBeReclaimedSeconds) + legacyregistry.MustRegister(nodeResourceRecommended) + legacyregistry.MustRegister(nodeResourceRecommendedFrom) }) } @@ -258,3 +301,23 @@ func PodResourceUpdateErrorCounterInc(subComponent SubComponent, stepName StepLa func ExecutorEvictCountsInc() { executorEvictCounts.Inc() } + +func UpdateNodeCpuCannotBeReclaimedSeconds(node string, value float64) {
+ nodeCpuCannotBeReclaimedSeconds.With(prometheus.Labels{"node": node}).Set(value) +} + +func UpdateNodeResourceRecommendedValue(subComponent SubComponent, stepName StepLabel, resourceName string, from string, node string, value float64) { + nodeResourceRecommended.With(prometheus.Labels{"subcomponent": string(subComponent), "step": string(stepName), "resourceName": resourceName, "node": node}).Set(value) + switch from { + case "tsp": + UpdateNodeResourceRecommendedFromValue(subComponent, stepName, resourceName, node, 1) + case "local": + UpdateNodeResourceRecommendedFromValue(subComponent, stepName, resourceName, node, 0) + default: + UpdateNodeResourceRecommendedFromValue(subComponent, stepName, resourceName, node, -1) + } +} + +func UpdateNodeResourceRecommendedFromValue(subComponent SubComponent, stepName StepLabel, resourceName string, node string, value float64) { + nodeResourceRecommendedFrom.With(prometheus.Labels{"subcomponent": string(subComponent), "step": string(stepName), "resourceName": resourceName, "node": node}).Set(value) +} diff --git a/pkg/resource/node_resource_manager.go b/pkg/resource/node_resource_manager.go index e6e7d31d7..1ed8c5a4f 100644 --- a/pkg/resource/node_resource_manager.go +++ b/pkg/resource/node_resource_manager.go @@ -228,6 +228,7 @@ func (o *NodeResourceManager) BuildNodeStatus(node *v1.Node) map[v1.ResourceName if nextRecommendation < 0 { nextRecommendation = 0 } + metrics.UpdateNodeResourceRecommendedValue(metrics.SubComponentNodeResource, metrics.StepGetExtResourceRecommended, string(resourceName), resourceFrom, o.nodeName, nextRecommendation) extResourceName := fmt.Sprintf(utils.ExtResourcePrefixFormat, string(resourceName)) resValue, exists := node.Status.Capacity[v1.ResourceName(extResourceName)] if exists && resValue.Value() != 0 && @@ -335,7 +336,9 @@ func (o *NodeResourceManager) GetCpuCoreCanNotBeReclaimedFromLocal() float64 { // 1. 
Exclusive tethered CPU cannot be reclaimed even if the free part is free, so add the exclusive CPUIdle to the CanNotBeReclaimed CPU // 2. The CPU used by extRes-container needs to be reclaimed, otherwise it will be double-counted due to the allotted mechanism of k8s, so the extResContainerCpuUsageTotal is subtracted from the CanNotBeReclaimedCpu - return nodeCpuUsageTotal + exclusiveCPUIdle - extResContainerCpuUsageTotal + nodeCpuCannotBeReclaimedSeconds := nodeCpuUsageTotal + exclusiveCPUIdle - extResContainerCpuUsageTotal + metrics.UpdateNodeCpuCannotBeReclaimedSeconds(o.nodeName, nodeCpuCannotBeReclaimedSeconds) + return nodeCpuCannotBeReclaimedSeconds } func getReserveResourcePercentFromNodeAnnotations(annotations map[string]string, resourceName string) (float64, bool) {