Skip to content

Commit

Permalink
add some metrics; 1.node_cpu_cannot_be_reclaimed_seconds, 2.node_reso…
Browse files Browse the repository at this point in the history
…urce_recommended, 3.node_resource_recommended_from
  • Loading branch information
shijieqin committed May 16, 2022
1 parent b7218fc commit d5e9706
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 7 deletions.
2 changes: 0 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,6 @@ github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.1.0-rc.5 h1:QOAag7FoBaBYYHRqzqkhhd8fq5RTubvI4v3Ft/gDVVQ=
github.com/gobwas/ws v1.1.0-rc.5/go.mod h1:nzvNcVha5eUziGrbxFCo6qFIojQHjJV5cLYIbezhfL0=
github.com/gocrane/api v0.3.0 h1:ziH+zYQy/shiqQ6yskMs67e+bQ9WmPp8eCVhLW85NFQ=
github.com/gocrane/api v0.3.0/go.mod h1:GxI+t9AW8+NsHkz2JkPBIJN//9eLUjTZl1ScYAbXMbk=
github.com/gocrane/api v0.4.0 h1:1IWP3gbkp3T4kX68w4+PfqUr4Cb/gaJrihLYg6aKOLY=
github.com/gocrane/api v0.4.0/go.mod h1:GxI+t9AW8+NsHkz2JkPBIJN//9eLUjTZl1ScYAbXMbk=
github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
Expand Down
71 changes: 67 additions & 4 deletions pkg/metrics/ensuarance.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ const (
ExecutorErrorTotal = "executor_error_total"
ExecutorEvictTotal = "executor_evict_total"
PodResourceErrorTotal = "pod_resource_error_total"

NodeCpuCannotBeReclaimedSeconds = "node_cpu_cannot_be_reclaimed_seconds"
NodeResourceRecommended = "node_resource_recommended"
NodeResourceRecommendedFrom = "node_resource_recommended_from"
)

type StepLabel string
Expand All @@ -42,15 +46,18 @@ const (
// Step for pod resource manager
StepGetPeriod StepLabel = "getPeriod"
StepUpdateQuota StepLabel = "updateQuota"

StepGetExtResourceRecommended StepLabel = "getExtResourceRecommended"
)

type SubComponent string

const (
SubComponentSchedule SubComponent = "schedule"
SubComponentThrottle SubComponent = "throttle"
SubComponentEvict SubComponent = "evict"
SubComponentPodResource SubComponent = "pod-resource-manager"
SubComponentSchedule SubComponent = "schedule"
SubComponentThrottle SubComponent = "throttle"
SubComponentEvict SubComponent = "evict"
SubComponentPodResource SubComponent = "pod-resource-manager"
SubComponentNodeResource SubComponent = "node-resource-manager"
)

type AnalyzeType string
Expand Down Expand Up @@ -180,6 +187,39 @@ var (
StabilityLevel: k8smetrics.ALPHA,
}, []string{"subcomponent", "step"},
)

// nodeCpuCannotBeReclaimedSeconds records, per node, the cpu seconds that cannot be reclaimed
nodeCpuCannotBeReclaimedSeconds = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: CraneNamespace,
Subsystem: CraneAgentSubsystem,
Name: NodeCpuCannotBeReclaimedSeconds,
Help: "The cpu seconds that cannot be reclaimed.",
StabilityLevel: k8smetrics.ALPHA,
}, []string{"node"},
)

// nodeResourceRecommended records the recommended value, labeled by subcomponent, step, resource name and node
nodeResourceRecommended = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: CraneNamespace,
Subsystem: CraneAgentSubsystem,
Name: NodeResourceRecommended,
Help: "The value of recommendation.",
StabilityLevel: k8smetrics.ALPHA,
}, []string{"subcomponent", "step", "resourceName", "node"},
)

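// nodeResourceRecommendedFrom records where the recommendation came from, encoded as a number (1 for "tsp", 0 for "local", -1 otherwise)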
nodeResourceRecommendedFrom = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: CraneNamespace,
Subsystem: CraneAgentSubsystem,
Name: NodeResourceRecommendedFrom,
Help: "The value of vap recommendation.",
StabilityLevel: k8smetrics.ALPHA,
}, []string{"subcomponent", "step", "resourceName", "node"},
)
)

var registerCraneAgentMetricsOnce sync.Once
Expand All @@ -195,6 +235,9 @@ func RegisterCraneAgent() {
legacyregistry.MustRegister(executorStatusCounts)
legacyregistry.MustRegister(executorErrorCounts)
legacyregistry.MustRegister(executorEvictCounts)
legacyregistry.MustRegister(nodeCpuCannotBeReclaimedSeconds)
legacyregistry.MustRegister(nodeResourceRecommended)
legacyregistry.MustRegister(nodeResourceRecommendedFrom)
})
}

Expand Down Expand Up @@ -258,3 +301,23 @@ func PodResourceUpdateErrorCounterInc(subComponent SubComponent, stepName StepLa
// ExecutorEvictCountsInc increments the executorEvictCounts metric.
func ExecutorEvictCountsInc() {
executorEvictCounts.Inc()
}

func UpdateNodeCpuCannotBeReclaimedSeconds(node string, value float64) {
nodeCpuCannotBeReclaimedSeconds.With(prometheus.Labels{"node": node}).Set(value)
}

// UpdateNodeResourceRecommendedValue sets the nodeResourceRecommended gauge for
// the given subcomponent, step, resource name and node to value, and then
// records the origin of the recommendation through
// UpdateNodeResourceRecommendedFromValue.
//
// The from argument is translated to a numeric code: "tsp" becomes 1, "local"
// becomes 0, and any other string becomes -1.
func UpdateNodeResourceRecommendedValue(subComponent SubComponent, stepName StepLabel, resourceName string, from string, node string, value float64) {
	labels := prometheus.Labels{
		"subcomponent": string(subComponent),
		"step":         string(stepName),
		"resourceName": resourceName,
		"node":         node,
	}
	nodeResourceRecommended.With(labels).Set(value)

	// Unknown origins fall through to -1.
	origin := float64(-1)
	switch from {
	case "tsp":
		origin = 1
	case "local":
		origin = 0
	}
	UpdateNodeResourceRecommendedFromValue(subComponent, stepName, resourceName, node, origin)
}

// UpdateNodeResourceRecommendedFromValue sets the nodeResourceRecommendedFrom
// gauge for the given subcomponent, step, resource name and node to value.
func UpdateNodeResourceRecommendedFromValue(subComponent SubComponent, stepName StepLabel, resourceName string, node string, value float64) {
	labels := prometheus.Labels{
		"subcomponent": string(subComponent),
		"step":         string(stepName),
		"resourceName": resourceName,
		"node":         node,
	}
	nodeResourceRecommendedFrom.With(labels).Set(value)
}
5 changes: 4 additions & 1 deletion pkg/resource/node_resource_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ func (o *NodeResourceManager) BuildNodeStatus(node *v1.Node) map[v1.ResourceName
if nextRecommendation < 0 {
nextRecommendation = 0
}
metrics.UpdateNodeResourceRecommendedValue(metrics.SubComponentNodeResource, metrics.StepGetExtResourceRecommended, string(resourceName), resourceFrom, o.nodeName, nextRecommendation)
extResourceName := fmt.Sprintf(utils.ExtResourcePrefixFormat, string(resourceName))
resValue, exists := node.Status.Capacity[v1.ResourceName(extResourceName)]
if exists && resValue.Value() != 0 &&
Expand Down Expand Up @@ -335,7 +336,9 @@ func (o *NodeResourceManager) GetCpuCoreCanNotBeReclaimedFromLocal() float64 {

// 1. Exclusively tethered CPU cannot be reclaimed even when part of it is idle, so add the exclusive CPUIdle to the CanNotBeReclaimed CPU
// 2. The CPU used by extRes-container needs to be reclaimed, otherwise it will be double-counted due to the allotted mechanism of k8s, so the extResContainerCpuUsageTotal is subtracted from the CanNotBeReclaimedCpu
return nodeCpuUsageTotal + exclusiveCPUIdle - extResContainerCpuUsageTotal
nodeCpuCannotBeReclaimedSeconds := nodeCpuUsageTotal + exclusiveCPUIdle - extResContainerCpuUsageTotal
metrics.UpdateNodeCpuCannotBeReclaimedSeconds(o.nodeName, nodeCpuCannotBeReclaimedSeconds)
return nodeCpuCannotBeReclaimedSeconds
}

func getReserveResourcePercentFromNodeAnnotations(annotations map[string]string, resourceName string) (float64, bool) {
Expand Down

0 comments on commit d5e9706

Please sign in to comment.