From 61accfd0591c184bf2585f5b6711b3871024f506 Mon Sep 17 00:00:00 2001
From: Kevin Earls <kearls@redhat.com>
Date: Thu, 1 Dec 2022 10:48:42 +0100
Subject: [PATCH] Add TargetMemoryUtilization metric for AutoScaling (#1223)

* Add TargetMemoryUtilization metric for AutoScaling

Signed-off-by: Kevin Earls <kearls@redhat.com>

* Add changes to v2beta2 as there is no way to un e2e tests just for one version

Signed-off-by: Kevin Earls <kearls@redhat.com>

* See if we just have a race condition

Signed-off-by: Kevin Earls <kearls@redhat.com>

* Reset kuttl timeout

Signed-off-by: Kevin Earls <kearls@redhat.com>

* Add some debugging code to help analyze failures on github

Signed-off-by: Kevin Earls <kearls@redhat.com>

* Try to appease the linter

Signed-off-by: Kevin Earls <kearls@redhat.com>

* Restore autoscale tests

Signed-off-by: Kevin Earls <kearls@redhat.com>

* Cleanup

Signed-off-by: Kevin Earls <kearls@redhat.com>

* More cleanup

Signed-off-by: Kevin Earls <kearls@redhat.com>

* Respond to comments

Signed-off-by: Kevin Earls <kearls@redhat.com>

* Cleanup whitespace so linter will rerun

Signed-off-by: Kevin Earls <kearls@redhat.com>

* Don't set TargetCPUUtilization to default if another metric is set

Signed-off-by: Kevin Earls <kearls@redhat.com>

Signed-off-by: Kevin Earls <kearls@redhat.com>
---
 apis/v1alpha1/opentelemetrycollector_types.go |  3 +
 .../opentelemetrycollector_webhook.go         | 14 +++--
 apis/v1alpha1/zz_generated.deepcopy.go        |  5 ++
 ...ntelemetry.io_opentelemetrycollectors.yaml |  5 ++
 ...ntelemetry.io_opentelemetrycollectors.yaml |  5 ++
 docs/api.md                                   |  9 +++
 pkg/collector/horizontalpodautoscaler.go      | 58 ++++++++++++++-----
 pkg/collector/horizontalpodautoscaler_test.go | 28 ++++++---
 .../reconcile/horizontalpodautoscaler.go      | 11 +++-
 tests/e2e/autoscale/00-install.yaml           |  3 +
 tests/e2e/autoscale/01-assert.yaml            |  1 +
 tests/e2e/autoscale/02-assert.yaml            |  1 +
 12 files changed, 116 insertions(+), 27 deletions(-)

diff --git a/apis/v1alpha1/opentelemetrycollector_types.go b/apis/v1alpha1/opentelemetrycollector_types.go
index 0aac34ca75..a296381b23 100644
--- a/apis/v1alpha1/opentelemetrycollector_types.go
+++ b/apis/v1alpha1/opentelemetrycollector_types.go
@@ -277,6 +277,9 @@ type AutoscalerSpec struct {
 	// If average CPU exceeds this value, the HPA will scale up. Defaults to 90 percent.
 	// +optional
 	TargetCPUUtilization *int32 `json:"targetCPUUtilization,omitempty"`
+	// +optional
+	// TargetMemoryUtilization sets the target average memory utilization across all replicas
+	TargetMemoryUtilization *int32 `json:"targetMemoryUtilization,omitempty"`
 }
 
 func init() {
diff --git a/apis/v1alpha1/opentelemetrycollector_webhook.go b/apis/v1alpha1/opentelemetrycollector_webhook.go
index fd4c5bd6a9..ea41f2b103 100644
--- a/apis/v1alpha1/opentelemetrycollector_webhook.go
+++ b/apis/v1alpha1/opentelemetrycollector_webhook.go
@@ -67,13 +67,15 @@ func (r *OpenTelemetryCollector) Default() {
 		r.Spec.TargetAllocator.Replicas = &one
 	}
 
-	// Set default targetCPUUtilization for autoscaler
-	if r.Spec.MaxReplicas != nil && (r.Spec.Autoscaler == nil || r.Spec.Autoscaler.TargetCPUUtilization == nil) {
-		defaultCPUTarget := int32(90)
+	if r.Spec.MaxReplicas != nil {
 		if r.Spec.Autoscaler == nil {
 			r.Spec.Autoscaler = &AutoscalerSpec{}
 		}
-		r.Spec.Autoscaler.TargetCPUUtilization = &defaultCPUTarget
+
+		if r.Spec.Autoscaler.TargetMemoryUtilization == nil && r.Spec.Autoscaler.TargetCPUUtilization == nil {
+			defaultCPUTarget := int32(90)
+			r.Spec.Autoscaler.TargetCPUUtilization = &defaultCPUTarget
+		}
 	}
 }
 
@@ -176,7 +178,9 @@ func (r *OpenTelemetryCollector) validateCRDSpec() error {
 		if r.Spec.Autoscaler != nil && r.Spec.Autoscaler.TargetCPUUtilization != nil && (*r.Spec.Autoscaler.TargetCPUUtilization < int32(1) || *r.Spec.Autoscaler.TargetCPUUtilization > int32(99)) {
 			return fmt.Errorf("the OpenTelemetry Spec autoscale configuration is incorrect, targetCPUUtilization should be greater than 0 and less than 100")
 		}
-
+		if r.Spec.Autoscaler != nil && r.Spec.Autoscaler.TargetMemoryUtilization != nil && (*r.Spec.Autoscaler.TargetMemoryUtilization < int32(1) || *r.Spec.Autoscaler.TargetMemoryUtilization > int32(99)) {
+			return fmt.Errorf("the OpenTelemetry Spec autoscale configuration is incorrect, targetMemoryUtilization should be greater than 0 and less than 100")
+		}
 	}
 
 	if r.Spec.Ingress.Type == IngressTypeNginx && r.Spec.Mode == ModeSidecar {
diff --git a/apis/v1alpha1/zz_generated.deepcopy.go b/apis/v1alpha1/zz_generated.deepcopy.go
index 65bd3901d8..01c5f1dbdc 100644
--- a/apis/v1alpha1/zz_generated.deepcopy.go
+++ b/apis/v1alpha1/zz_generated.deepcopy.go
@@ -39,6 +39,11 @@ func (in *AutoscalerSpec) DeepCopyInto(out *AutoscalerSpec) {
 		*out = new(int32)
 		**out = **in
 	}
+	if in.TargetMemoryUtilization != nil {
+		in, out := &in.TargetMemoryUtilization, &out.TargetMemoryUtilization
+		*out = new(int32)
+		**out = **in
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoscalerSpec.
diff --git a/bundle/manifests/opentelemetry.io_opentelemetrycollectors.yaml b/bundle/manifests/opentelemetry.io_opentelemetrycollectors.yaml
index 41d2e014ae..38dbce2c79 100644
--- a/bundle/manifests/opentelemetry.io_opentelemetrycollectors.yaml
+++ b/bundle/manifests/opentelemetry.io_opentelemetrycollectors.yaml
@@ -1016,6 +1016,11 @@ spec:
                       the HPA will scale up. Defaults to 90 percent.
                     format: int32
                     type: integer
+                  targetMemoryUtilization:
+                    description: TargetMemoryUtilization sets the target average memory
+                      utilization across all replicas
+                    format: int32
+                    type: integer
                 type: object
               config:
                 description: Config is the raw JSON to be used as the collector's
diff --git a/config/crd/bases/opentelemetry.io_opentelemetrycollectors.yaml b/config/crd/bases/opentelemetry.io_opentelemetrycollectors.yaml
index 068df5d1ad..c6990780b0 100644
--- a/config/crd/bases/opentelemetry.io_opentelemetrycollectors.yaml
+++ b/config/crd/bases/opentelemetry.io_opentelemetrycollectors.yaml
@@ -1014,6 +1014,11 @@ spec:
                       the HPA will scale up. Defaults to 90 percent.
                     format: int32
                     type: integer
+                  targetMemoryUtilization:
+                    description: TargetMemoryUtilization sets the target average memory
+                      utilization across all replicas
+                    format: int32
+                    type: integer
                 type: object
               config:
                 description: Config is the raw JSON to be used as the collector's
diff --git a/docs/api.md b/docs/api.md
index 87358baaf8..9359a696c4 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -3228,6 +3228,15 @@ Autoscaler specifies the pod autoscaling configuration to use for the OpenTeleme
             <i>Format</i>: int32<br/>
         </td>
         <td>false</td>
+      </tr><tr>
+        <td><b>targetMemoryUtilization</b></td>
+        <td>integer</td>
+        <td>
+          TargetMemoryUtilization sets the target average memory utilization across all replicas<br/>
+          <br/>
+            <i>Format</i>: int32<br/>
+        </td>
+        <td>false</td>
       </tr></tbody>
 </table>
 
diff --git a/pkg/collector/horizontalpodautoscaler.go b/pkg/collector/horizontalpodautoscaler.go
index 8fc133ab1b..c7205562d9 100644
--- a/pkg/collector/horizontalpodautoscaler.go
+++ b/pkg/collector/horizontalpodautoscaler.go
@@ -33,9 +33,7 @@ func HorizontalPodAutoscaler(cfg config.Config, logger logr.Logger, otelcol v1al
 
 	labels := Labels(otelcol, cfg.LabelsFilter())
 	labels["app.kubernetes.io/name"] = naming.Collector(otelcol)
-
 	annotations := Annotations(otelcol)
-
 	var result client.Object
 
 	objectMeta := metav1.ObjectMeta{
@@ -46,6 +44,22 @@ func HorizontalPodAutoscaler(cfg config.Config, logger logr.Logger, otelcol v1al
 	}
 
 	if autoscalingVersion == autodetect.AutoscalingVersionV2Beta2 {
+		metrics := []autoscalingv2beta2.MetricSpec{}
+
+		if otelcol.Spec.Autoscaler.TargetMemoryUtilization != nil {
+			utilizationTarget := autoscalingv2beta2.MetricSpec{
+				Type: autoscalingv2beta2.ResourceMetricSourceType,
+				Resource: &autoscalingv2beta2.ResourceMetricSource{
+					Name: corev1.ResourceMemory,
+					Target: autoscalingv2beta2.MetricTarget{
+						Type:               autoscalingv2beta2.UtilizationMetricType,
+						AverageUtilization: otelcol.Spec.Autoscaler.TargetMemoryUtilization,
+					},
+				},
+			}
+			metrics = append(metrics, utilizationTarget)
+		}
+
 		targetCPUUtilization := autoscalingv2beta2.MetricSpec{
 			Type: autoscalingv2beta2.ResourceMetricSourceType,
 			Resource: &autoscalingv2beta2.ResourceMetricSource{
@@ -56,7 +70,7 @@ func HorizontalPodAutoscaler(cfg config.Config, logger logr.Logger, otelcol v1al
 				},
 			},
 		}
-		metrics := []autoscalingv2beta2.MetricSpec{targetCPUUtilization}
+		metrics = append(metrics, targetCPUUtilization)
 
 		autoscaler := autoscalingv2beta2.HorizontalPodAutoscaler{
 			ObjectMeta: objectMeta,
@@ -79,17 +93,35 @@ func HorizontalPodAutoscaler(cfg config.Config, logger logr.Logger, otelcol v1al
 
 		result = &autoscaler
 	} else {
-		targetCPUUtilization := autoscalingv2.MetricSpec{
-			Type: autoscalingv2.ResourceMetricSourceType,
-			Resource: &autoscalingv2.ResourceMetricSource{
-				Name: corev1.ResourceCPU,
-				Target: autoscalingv2.MetricTarget{
-					Type:               autoscalingv2.UtilizationMetricType,
-					AverageUtilization: otelcol.Spec.Autoscaler.TargetCPUUtilization,
+		metrics := []autoscalingv2.MetricSpec{}
+
+		if otelcol.Spec.Autoscaler.TargetMemoryUtilization != nil {
+			utilizationTarget := autoscalingv2.MetricSpec{
+				Type: autoscalingv2.ResourceMetricSourceType,
+				Resource: &autoscalingv2.ResourceMetricSource{
+					Name: corev1.ResourceMemory,
+					Target: autoscalingv2.MetricTarget{
+						Type:               autoscalingv2.UtilizationMetricType,
+						AverageUtilization: otelcol.Spec.Autoscaler.TargetMemoryUtilization,
+					},
 				},
-			},
+			}
+			metrics = append(metrics, utilizationTarget)
+		}
+
+		if otelcol.Spec.Autoscaler.TargetCPUUtilization != nil {
+			targetCPUUtilization := autoscalingv2.MetricSpec{
+				Type: autoscalingv2.ResourceMetricSourceType,
+				Resource: &autoscalingv2.ResourceMetricSource{
+					Name: corev1.ResourceCPU,
+					Target: autoscalingv2.MetricTarget{
+						Type:               autoscalingv2.UtilizationMetricType,
+						AverageUtilization: otelcol.Spec.Autoscaler.TargetCPUUtilization,
+					},
+				},
+			}
+			metrics = append(metrics, targetCPUUtilization)
 		}
-		metrics := []autoscalingv2.MetricSpec{targetCPUUtilization}
 
 		autoscaler := autoscalingv2.HorizontalPodAutoscaler{
 			ObjectMeta: objectMeta,
@@ -104,7 +136,7 @@ func HorizontalPodAutoscaler(cfg config.Config, logger logr.Logger, otelcol v1al
 				Metrics:     metrics,
 			},
 		}
-		if otelcol.Spec.Autoscaler != nil && otelcol.Spec.Autoscaler.Behavior != nil {
+		if otelcol.Spec.Autoscaler.Behavior != nil {
 			autoscaler.Spec.Behavior = otelcol.Spec.Autoscaler.Behavior
 		}
 		result = &autoscaler
diff --git a/pkg/collector/horizontalpodautoscaler_test.go b/pkg/collector/horizontalpodautoscaler_test.go
index 04b90b3480..aa5e1c173b 100644
--- a/pkg/collector/horizontalpodautoscaler_test.go
+++ b/pkg/collector/horizontalpodautoscaler_test.go
@@ -41,7 +41,8 @@ func TestHPA(t *testing.T) {
 
 	var minReplicas int32 = 3
 	var maxReplicas int32 = 5
-	var cpuUtilization int32 = 90
+	var cpuUtilization int32 = 66
+	var memoryUtilization int32 = 77
 
 	otelcol := v1alpha1.OpenTelemetryCollector{
 		ObjectMeta: metav1.ObjectMeta{
@@ -51,7 +52,8 @@ func TestHPA(t *testing.T) {
 			Replicas:    &minReplicas,
 			MaxReplicas: &maxReplicas,
 			Autoscaler: &v1alpha1.AutoscalerSpec{
-				TargetCPUUtilization: &cpuUtilization,
+				TargetCPUUtilization:    &cpuUtilization,
+				TargetMemoryUtilization: &memoryUtilization,
 			},
 		},
 	}
@@ -76,9 +78,13 @@ func TestHPA(t *testing.T) {
 				assert.Equal(t, "my-instance-collector", hpa.Labels["app.kubernetes.io/name"])
 				assert.Equal(t, int32(3), *hpa.Spec.MinReplicas)
 				assert.Equal(t, int32(5), hpa.Spec.MaxReplicas)
-				assert.Equal(t, 1, len(hpa.Spec.Metrics))
-				assert.Equal(t, corev1.ResourceCPU, hpa.Spec.Metrics[0].Resource.Name)
-				assert.Equal(t, int32(90), *hpa.Spec.Metrics[0].Resource.Target.AverageUtilization)
+				for _, metric := range hpa.Spec.Metrics {
+					if metric.Resource.Name == corev1.ResourceCPU {
+						assert.Equal(t, cpuUtilization, *metric.Resource.Target.AverageUtilization)
+					} else if metric.Resource.Name == corev1.ResourceMemory {
+						assert.Equal(t, memoryUtilization, *metric.Resource.Target.AverageUtilization)
+					}
+				}
 			} else {
 				hpa := raw.(*autoscalingv2.HorizontalPodAutoscaler)
 
@@ -87,9 +93,15 @@ func TestHPA(t *testing.T) {
 				assert.Equal(t, "my-instance-collector", hpa.Labels["app.kubernetes.io/name"])
 				assert.Equal(t, int32(3), *hpa.Spec.MinReplicas)
 				assert.Equal(t, int32(5), hpa.Spec.MaxReplicas)
-				assert.Equal(t, 1, len(hpa.Spec.Metrics))
-				assert.Equal(t, corev1.ResourceCPU, hpa.Spec.Metrics[0].Resource.Name)
-				assert.Equal(t, int32(90), *hpa.Spec.Metrics[0].Resource.Target.AverageUtilization)
+				assert.Equal(t, 2, len(hpa.Spec.Metrics))
+
+				for _, metric := range hpa.Spec.Metrics {
+					if metric.Resource.Name == corev1.ResourceCPU {
+						assert.Equal(t, cpuUtilization, *metric.Resource.Target.AverageUtilization)
+					} else if metric.Resource.Name == corev1.ResourceMemory {
+						assert.Equal(t, memoryUtilization, *metric.Resource.Target.AverageUtilization)
+					}
+				}
 			}
 		})
 	}
diff --git a/pkg/collector/reconcile/horizontalpodautoscaler.go b/pkg/collector/reconcile/horizontalpodautoscaler.go
index 1f8435ff11..d5df719b0c 100644
--- a/pkg/collector/reconcile/horizontalpodautoscaler.go
+++ b/pkg/collector/reconcile/horizontalpodautoscaler.go
@@ -20,6 +20,7 @@ import (
 
 	autoscalingv2 "k8s.io/api/autoscaling/v2"
 	autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2"
+	corev1 "k8s.io/api/core/v1"
 	k8serrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/meta"
 	"k8s.io/apimachinery/pkg/types"
@@ -127,7 +128,15 @@ func setAutoscalerSpec(params Params, autoscalingVersion autodetect.AutoscalingV
 			} else {
 				updated.(*autoscalingv2.HorizontalPodAutoscaler).Spec.MinReplicas = &one
 			}
-			updated.(*autoscalingv2.HorizontalPodAutoscaler).Spec.Metrics[0].Resource.Target.AverageUtilization = params.Instance.Spec.Autoscaler.TargetCPUUtilization
+
+			// This will update memory and CPU usage for now, and can be used to update other metrics in the future
+			for _, metric := range updated.(*autoscalingv2.HorizontalPodAutoscaler).Spec.Metrics {
+				if metric.Resource.Name == corev1.ResourceCPU {
+					metric.Resource.Target.AverageUtilization = params.Instance.Spec.Autoscaler.TargetCPUUtilization
+				} else if metric.Resource.Name == corev1.ResourceMemory {
+					metric.Resource.Target.AverageUtilization = params.Instance.Spec.Autoscaler.TargetMemoryUtilization
+				}
+			}
 		}
 	}
 }
diff --git a/tests/e2e/autoscale/00-install.yaml b/tests/e2e/autoscale/00-install.yaml
index 04e52e7be9..15f4aed640 100644
--- a/tests/e2e/autoscale/00-install.yaml
+++ b/tests/e2e/autoscale/00-install.yaml
@@ -1,3 +1,6 @@
+# This creates two different deployments. The first one will be used to see if we scale properly.  (Note that we are
+# only scaling up to 2 because of limitations of KUTTL). The second is to check the targetCPUUtilization option.
+#
 apiVersion: opentelemetry.io/v1alpha1
 kind: OpenTelemetryCollector
 metadata:
diff --git a/tests/e2e/autoscale/01-assert.yaml b/tests/e2e/autoscale/01-assert.yaml
index 2c2eec0238..76c712f7af 100644
--- a/tests/e2e/autoscale/01-assert.yaml
+++ b/tests/e2e/autoscale/01-assert.yaml
@@ -1,3 +1,4 @@
+# Wait until tracegen has completed and the simplest deployment has scaled up to 2
 apiVersion: batch/v1
 kind: Job
 metadata:
diff --git a/tests/e2e/autoscale/02-assert.yaml b/tests/e2e/autoscale/02-assert.yaml
index fb4e052f2b..c610ad3f0e 100644
--- a/tests/e2e/autoscale/02-assert.yaml
+++ b/tests/e2e/autoscale/02-assert.yaml
@@ -1,3 +1,4 @@
+# Wait for the collector to scale back down to 1
 apiVersion: opentelemetry.io/v1alpha1
 kind: OpenTelemetryCollector