add wildcard support for evpa container policy, and update the status even if it is in off mode, because we need to record the metrics
kitianFresh committed May 9, 2022
1 parent a212a98 commit 4c01942
Showing 5 changed files with 72 additions and 34 deletions.
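For context on the commit message: the wildcard lets a single ContainerResourcePolicy apply to every container of the target workload. Below is a minimal sketch of such a policy, using hypothetical stand-in types that only mirror the field and constant names visible in the diff (ContainerName, ScaleUpPolicy/ScaleDownPolicy, ScaleMode, ContainerScalingModeOff); the real types live in the crane autoscaling API and may differ.

package main

import "fmt"

// Hypothetical stand-ins mirroring only the fields this commit touches;
// the real crane API types are richer.
type ScalingMode string

const ContainerScalingModeOff ScalingMode = "Off"

type ContainerScalePolicy struct {
	ScaleMode *ScalingMode
}

type ContainerResourcePolicy struct {
	ContainerName   string
	ScaleUpPolicy   ContainerScalePolicy
	ScaleDownPolicy ContainerScalePolicy
}

func main() {
	off := ContainerScalingModeOff
	// "*" matches every container in the pod template; per the commit message,
	// such a policy is still reconciled in Off mode so metrics get recorded.
	policy := ContainerResourcePolicy{
		ContainerName:   "*",
		ScaleUpPolicy:   ContainerScalePolicy{ScaleMode: &off},
		ScaleDownPolicy: ContainerScalePolicy{ScaleMode: &off},
	}
	fmt.Printf("%+v\n", policy)
}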
4 changes: 2 additions & 2 deletions pkg/autoscaling/estimator/percentile.go
@@ -92,7 +92,7 @@ func (e *PercentileResourceEstimator) GetResourceEstimation(evpa *autoscalingapi
predictErrs = append(predictErrs, err)
}

if len(tsList) > 1 && len(tsList[0].Samples) > 1 {
if len(tsList) > 0 && len(tsList[0].Samples) > 0 {
cpuValue := int64(tsList[0].Samples[0].Value * 1000)
recommendResource[corev1.ResourceCPU] = *resource.NewMilliQuantity(cpuValue, resource.DecimalSI)
} else {
@@ -104,7 +104,7 @@ func (e *PercentileResourceEstimator) GetResourceEstimation(evpa *autoscalingapi
predictErrs = append(predictErrs, err)
}

if len(tsList) > 1 && len(tsList[0].Samples) > 1 {
if len(tsList) > 0 && len(tsList[0].Samples) > 0 {
memValue := int64(tsList[0].Samples[0].Value)
recommendResource[corev1.ResourceMemory] = *resource.NewQuantity(memValue, resource.BinarySI)
} else {
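The change above replaces > 1 with > 0 in both guards: the old condition silently dropped a prediction that came back with exactly one time series containing exactly one sample, even though only tsList[0].Samples[0] is ever read. A standalone sketch of the corrected bounds check, with hypothetical types standing in for the prediction result:

package main

import "fmt"

// Hypothetical stand-ins for the prediction result types used above.
type Sample struct{ Value float64 }
type TimeSeries struct{ Samples []Sample }

// firstSampleValue returns the first sample of the first series, guarding
// against empty results the way the fixed condition does (> 0, not > 1).
func firstSampleValue(tsList []TimeSeries) (float64, bool) {
	if len(tsList) > 0 && len(tsList[0].Samples) > 0 {
		return tsList[0].Samples[0].Value, true
	}
	return 0, false
}

func main() {
	one := []TimeSeries{{Samples: []Sample{{Value: 0.25}}}}
	if v, ok := firstSampleValue(one); ok {
		fmt.Println("cpu cores:", v) // with "> 1" this result would have been dropped
	}
}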
21 changes: 13 additions & 8 deletions pkg/controller/evpa/container_policy.go
@@ -28,23 +28,28 @@ func (c *EffectiveVPAController) ReconcileContainerPolicies(evpa *autoscalingapi
recommendation = evpa.Status.Recommendation

rankedEstimators := RankEstimators(resourceEstimators)
for _, containerPolicy := range evpa.Spec.ResourcePolicy.ContainerPolicies {
// container scaling is disabled
if (containerPolicy.ScaleUpPolicy.ScaleMode != nil && *containerPolicy.ScaleUpPolicy.ScaleMode == vpatypes.ContainerScalingModeOff) ||
(containerPolicy.ScaleDownPolicy.ScaleMode != nil && *containerPolicy.ScaleDownPolicy.ScaleMode == vpatypes.ContainerScalingModeOff) {
continue
needReconciledContainers := make(map[string]autoscalingapi.ContainerResourcePolicy)
for _, container := range podTemplate.Spec.Containers {
for _, containerPolicy := range evpa.Spec.ResourcePolicy.ContainerPolicies {
if containerPolicy.ContainerName == "*" || containerPolicy.ContainerName == container.Name {
out := containerPolicy.DeepCopy()
out.ContainerName = container.Name
needReconciledContainers[container.Name] = *out
}
}

}
for container, containerPolicy := range needReconciledContainers {
// get current resource by pod template
// todo: support "*"
resourceRequirement, found := utils.GetResourceByPodTemplate(podTemplate, containerPolicy.ContainerName)
resourceRequirement, found := utils.GetResourceByPodTemplate(podTemplate, container)
if !found {
klog.Warningf("ContainerName %s not found", containerPolicy.ContainerName)
continue
}

// loop estimator and get final estimated resource for container
recommendResourceContainer, currentStatus := GetEstimatedResourceForContainer(evpa, containerPolicy, resourceRequirement, rankedEstimators, currentEstimatorStatus)
// record the recommended resource on every estimation pass, so we get more observability
recordResourceRecommendation(evpa, containerPolicy, recommendResourceContainer)
currentEstimatorStatus = currentStatus
if IsResourceListEmpty(recommendResourceContainer) {
klog.V(4).Infof("Container %s recommend resource is empty, skip scaling. ", containerPolicy.ContainerName)
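A self-contained sketch of the wildcard expansion introduced above, with plain strings standing in for the container and policy types. It mirrors the matching rule in the diff (a policy applies when its ContainerName is "*" or equals the container's name); the real code deep-copies the whole ContainerResourcePolicy and rewrites its ContainerName:

package main

import "fmt"

// expandPolicies maps each container name to the policy that applies to it,
// treating "*" as a match for every container (as in ReconcileContainerPolicies).
func expandPolicies(containers []string, policyNames []string) map[string]string {
	out := make(map[string]string)
	for _, c := range containers {
		for _, p := range policyNames {
			if p == "*" || p == c {
				// The controller deep-copies the policy and rewrites its
				// ContainerName to the concrete container; here we just record the match.
				out[c] = p
			}
		}
	}
	return out
}

func main() {
	m := expandPolicies([]string{"app", "sidecar"}, []string{"*"})
	fmt.Println(m) // map[app:* sidecar:*]
}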
41 changes: 33 additions & 8 deletions pkg/controller/evpa/effective_vpa_controller.go
@@ -92,7 +92,7 @@ func (c *EffectiveVPAController) Reconcile(ctx context.Context, req ctrl.Request
c.UpdateStatus(ctx, evpa, newStatus)
return ctrl.Result{}, err
}
c.Recorder.Event(evpa, v1.EventTypeNormal, "RemoveFinalizers", err.Error())
c.Recorder.Event(evpa, v1.EventTypeNormal, "RemoveFinalizers", "")
} else if !utils.ContainsString(evpa.Finalizers, known.AutoscalingFinalizer) {
evpa.Finalizers = append(evpa.Finalizers, known.AutoscalingFinalizer)
err = c.Client.Update(ctx, evpa)
@@ -155,15 +155,38 @@ func (c *EffectiveVPAController) SetupWithManager(mgr ctrl.Manager) error {
Complete(c)
}

func recordMetric(evpa *autoscalingapi.EffectiveVerticalPodAutoscaler, status *autoscalingapi.EffectiveVerticalPodAutoscalerStatus, podTemplate *v1.PodTemplateSpec) {
labels := map[string]string{
"resourceName": fmt.Sprintf("%s/%s", evpa.Namespace, evpa.Spec.TargetRef.Name),
func recordResourceRecommendation(evpa *autoscalingapi.EffectiveVerticalPodAutoscaler, containerPolicy autoscalingapi.ContainerResourcePolicy, resourceList v1.ResourceList) {
for resourceName, resource := range resourceList {
labels := map[string]string{
"apiversion": evpa.Spec.TargetRef.APIVersion,
"owner_kind": evpa.Spec.TargetRef.Kind,
"namespace": evpa.Namespace,
"owner_name": evpa.Spec.TargetRef.Name,
"container": containerPolicy.ContainerName,
"resource": resourceName.String(),
}
switch resourceName {
case v1.ResourceCPU:
metrics.EVPAResourceRecommendation.With(labels).Set(float64(resource.MilliValue()) / 1000.)
case v1.ResourceMemory:
metrics.EVPAResourceRecommendation.With(labels).Set(float64(resource.Value()))
}
}
}

func recordMetric(evpa *autoscalingapi.EffectiveVerticalPodAutoscaler, status *autoscalingapi.EffectiveVerticalPodAutoscalerStatus, podTemplate *v1.PodTemplateSpec) {

if status.Recommendation == nil {
return
}
for _, container := range status.Recommendation.ContainerRecommendations {
labels := map[string]string{
"apiversion": evpa.Spec.TargetRef.APIVersion,
"owner_kind": evpa.Spec.TargetRef.Kind,
"namespace": evpa.Namespace,
"owner_name": evpa.Spec.TargetRef.Name,
"container": container.ContainerName,
}
resourceRequirement, found := utils.GetResourceByPodTemplate(podTemplate, container.ContainerName)
if !found {
klog.Warningf("ContainerName %s not found", container.ContainerName)
@@ -172,30 +172,32 @@ func recordMetric(evpa *autoscalingapi.EffectiveVerticalPodAutoscaler, status *a

recommendCpu := container.Target[v1.ResourceCPU]
currentCpu := resourceRequirement.Requests[v1.ResourceCPU]
labels["resource"] = v1.ResourceCPU.String()
if currentCpu.Cmp(recommendCpu) > 0 {
// scale down
currCopy := currentCpu.DeepCopy()
currCopy.Sub(recommendCpu)
metrics.EVPACpuScaleDownMilliCores.With(labels).Set(float64(currCopy.MilliValue()))
metrics.EVPACpuScaleDownMilliCores.With(labels).Set(float64(currCopy.MilliValue()) / 1000.)
} else if currentCpu.Cmp(recommendCpu) < 0 {
// scale up
recommendCopy := recommendCpu.DeepCopy()
recommendCopy.Sub(currentCpu)
metrics.EVPACpuScaleUpMilliCores.With(labels).Set(float64(recommendCopy.MilliValue()))
metrics.EVPACpuScaleUpMilliCores.With(labels).Set(float64(recommendCopy.MilliValue()) / 1000.)
}

recommendMem := container.Target[v1.ResourceMemory]
currentMem := resourceRequirement.Requests[v1.ResourceMemory]
labels["resource"] = v1.ResourceMemory.String()
if currentMem.Cmp(recommendMem) > 0 {
// scale down
currCopy := currentMem.DeepCopy()
currCopy.Sub(recommendMem)
metrics.EVPAMemoryScaleDownMB.With(labels).Set(float64(currCopy.Value() / 1024 / 1024))
metrics.EVPAMemoryScaleDownMB.With(labels).Set(float64(currCopy.Value()))
} else if currentMem.Cmp(recommendMem) < 0 {
// scale up
recommendCopy := recommendMem.DeepCopy()
recommendCopy.Sub(currentMem)
metrics.EVPAMemoryScaleUpMB.With(labels).Set(float64(recommendCopy.Value() / 1024 / 1024))
metrics.EVPAMemoryScaleUpMB.With(labels).Set(float64(recommendCopy.Value()))
}
}
}
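Two unit changes ride along with the relabeling above: the CPU scale gauges now report cores (MilliValue() divided by 1000) instead of millicores, and the memory gauges report bytes (Value()) instead of MiB, matching the metric renames in the next file. A small sketch of those resource.Quantity conversions using only k8s.io/apimachinery:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	cpu := resource.MustParse("250m")  // 0.25 cores
	mem := resource.MustParse("512Mi") // 536870912 bytes

	// CPU is exported as cores, matching the dropped "_millicores" suffix.
	fmt.Println(float64(cpu.MilliValue()) / 1000.) // 0.25

	// Memory is exported as raw bytes, matching the dropped "_mb" suffix.
	fmt.Println(float64(mem.Value())) // 5.36870912e+08
}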
33 changes: 17 additions & 16 deletions pkg/metrics/autoscaling.go
@@ -49,51 +49,52 @@ var (
prometheus.GaugeOpts{
Namespace: "crane",
Subsystem: "autoscaling",
Name: "effective_vpa_cpu_scale_up_millicores",
Name: "effective_vpa_cpu_scale_up",
Help: "The cpu scale up for Effective VPA",
},
[]string{
"resourceName",
},
[]string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"},
)
EVPACpuScaleDownMilliCores = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "crane",
Subsystem: "autoscaling",
Name: "effective_vpa_cpu_scale_down_millicores",
Name: "effective_vpa_cpu_scale_down",
Help: "The cpu scale down for Effective VPA",
},
[]string{
"resourceName",
},
[]string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"},
)
EVPAMemoryScaleUpMB = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "crane",
Subsystem: "autoscaling",
Name: "effective_vpa_memory_scale_up_mb",
Name: "effective_vpa_memory_scale_up",
Help: "The memory scale up for Effective VPA",
},
[]string{
"resourceName",
},
[]string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"},
)
EVPAMemoryScaleDownMB = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "crane",
Subsystem: "autoscaling",
Name: "effective_vpa_memory_scale_down_mb",
Name: "effective_vpa_memory_scale_down",
Help: "The memory scale down for Effective VPA",
},
[]string{
"resourceName",
[]string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"},
)
EVPAResourceRecommendation = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "crane",
Subsystem: "autoscaling",
Name: "effective_vpa_resource_recommendation",
Help: "The resource recommendation for Effective VPA",
},
[]string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"},
)
)

func init() {
// Register custom metrics with the global prometheus registry
metrics.Registry.MustRegister(HPAReplicas, EHPAReplicas, OOMCount, HPAScaleCount, EVPACpuScaleUpMilliCores, EVPACpuScaleDownMilliCores, EVPAMemoryScaleDownMB, EVPAMemoryScaleUpMB)
metrics.Registry.MustRegister(HPAReplicas, EHPAReplicas, OOMCount, HPAScaleCount, EVPACpuScaleUpMilliCores, EVPACpuScaleDownMilliCores, EVPAMemoryScaleDownMB, EVPAMemoryScaleUpMB, EVPAResourceRecommendation)

}

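For reference, a minimal sketch of defining and setting one of these gauges with the new label set. This is plain prometheus/client_golang usage registered against the default registry, not the controller-runtime registry crane uses, and the label values are made up:

package main

import (
	"github.com/prometheus/client_golang/prometheus"
)

var evpaRecommendation = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: "crane",
		Subsystem: "autoscaling",
		Name:      "effective_vpa_resource_recommendation",
		Help:      "The resource recommendation for Effective VPA",
	},
	[]string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"},
)

func main() {
	prometheus.MustRegister(evpaRecommendation)
	// One time series per (workload, container, resource) combination.
	evpaRecommendation.With(prometheus.Labels{
		"apiversion": "apps/v1",
		"owner_kind": "Deployment",
		"namespace":  "default",
		"owner_name": "nginx",
		"container":  "nginx",
		"resource":   "cpu",
	}).Set(0.25) // cores
}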
7 changes: 7 additions & 0 deletions pkg/prediction/percentile/prediction.go
@@ -282,10 +282,12 @@ func (p *percentilePrediction) Run(stopCh <-chan struct{}) {
QueryExpr := qc.MetricNamer.BuildUniqueKey()

if _, ok := p.queryRoutines.Load(QueryExpr); ok {
klog.V(6).InfoS("Prediction percentile routine %v already registered.", "queryExpr", QueryExpr, "caller", qc.Caller)
continue
}

if _, ok := p.stopChMap.Load(QueryExpr); ok {
klog.V(6).InfoS("Prediction percentile routine %v already stopped.", "queryExpr", QueryExpr, "caller", qc.Caller)
continue
}

@@ -466,6 +468,11 @@ func (p *percentilePrediction) addSamples(namer metricnaming.MetricNamer) {
if signal == nil {
return
}
// Maybe we could use a different aggregation to deal with instances of the same container across different pods of the same workload,
// for example reducing all the samples to a single p99 or average value.
// We have seen that when the workload is a DaemonSet, the same container on different nodes can show very different resource usage; this is unexpected in production, and probably means the DaemonSet receives different load on each node.
// NOTE: with N instances of the workload there are N samples in the latest time series list, so the aggregationWindowLength accumulates data N times faster.
// This is not a time dimension; we use the N samples from different container instances of the workload to stand in for N interval samples.
for _, ts := range latestTimeSeriesList {
if len(ts.Samples) < 1 {
klog.V(4).InfoS("Sample not found.", "key", key)
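The rewritten comment above mentions reducing the N per-instance samples of a workload to a single aggregate (an average or an upper percentile) instead of feeding all of them in. A rough, purely illustrative sketch of that alternative; it is not what the code currently does:

package main

import (
	"fmt"
	"math"
	"sort"
)

// aggregate reduces the latest samples from N container instances of the same
// workload to one value: the mean and a nearest-rank percentile such as p99.
func aggregate(values []float64, percentile float64) (mean, pctl float64) {
	if len(values) == 0 {
		return 0, 0
	}
	sorted := append([]float64(nil), values...)
	sort.Float64s(sorted)
	var sum float64
	for _, v := range sorted {
		sum += v
	}
	idx := int(math.Ceil(percentile*float64(len(sorted)))) - 1
	if idx < 0 {
		idx = 0
	}
	return sum / float64(len(sorted)), sorted[idx]
}

func main() {
	// e.g. CPU usage of the same DaemonSet container on different nodes
	mean, p99 := aggregate([]float64{0.1, 0.12, 0.11, 0.95}, 0.99)
	fmt.Printf("mean=%.3f p99=%.3f\n", mean, p99)
}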
