feat: Add metrics support for poller and scaler events (#143)

* Add metrics support for scaler and poller events Including required role for metrics writing to scaler and poller service accounts. In GKE, the OpenTelemetry collector is used to collect and forward metrics to the monitoring backend. In Cloud Functions, the code is configured to send metrics to Cloud Monitoring.
cloudspannerecosystem · Mar 12, 2024 · 4f68414 · 4f68414
1 parent b5ade6d
commit 4f68414
Show file tree

Hide file tree

Showing 38 changed files with 16,316 additions and 7,886 deletions.
diff --git a/.github/dependabot.yaml b/.github/dependabot.yaml
@@ -71,6 +71,13 @@ updates:
  - dependency-name: "*"
  update-types: ["version-update:semver-major"]
 
+ - directory: "/src/autoscaler-common"
+ package-ecosystem: "npm"
+ schedule:
+ interval: "weekly"
+ - dependency-name: "*"
+ update-types: ["version-update:semver-major"]
+
  # Docker dependencies
  - directory: "/src"
  package-ecosystem: "docker"

diff --git a/README.md b/README.md
@@ -100,6 +100,62 @@ In both of the above instances, the Google Cloud Platform resources are
 deployed using Terraform. Please see the [Terraform instructions](terraform/README.md)
 for more information on the deployment options available.
 
+## Monitoring
+
+The autoscaler publishes the following metrics to Cloud Monitoring which can be
+used to monitor the behavior of the autoscaler, and to configure alerts.
+
+### Poller
+
+* Message processing counters:
+ * `cloudspannerecosystem/autoscaler/poller/requests-success` - the number
+ of polling request messages received and processed successfully.
+ * `cloudspannerecosystem/autoscaler/poller/requests-failed` - the number
+ of polling request messages which failed processing.
+
+* Spanner Instance polling counters:
+ * `cloudspannerecosystem/autoscaler/poller/polling-success` - the number
+ of successful polls of the Spanner instance metrics.
+ * `cloudspannerecosystem/autoscaler/poller/polling-failed` - the number of
+ failed polls of the Spanner instance metrics.
+ * Both of these metrics have `projectid` and `instanceid` attributes to
+ identify the Spanner instance.
+
+### Scaler
+
+* Message processing counters:
+ * `cloudspannerecosystem/autoscaler/scaler/requests-success` - the number
+ of scaling request messages received and processed successfully.
+ * `cloudspannerecosystem/autoscaler/scaler/requests-failed` - the number
+ of scaling request messages which failed processing.
+* Spanner Instance scaling counters:
+ * `cloudspannerecosystem/autoscaler/scaler/scaling-success` - the number
+ of successful rescales of the Spanner instance.
+ * `cloudspannerecosystem/autoscaler/scaler/scaling-denied` - the number of
+ Spanner instance rescale attempts that were denied by autoscaler
+ configuration or policy.
+ * `cloudspannerecosystem/autoscaler/scaler/scaling-failed` - the number of
+ Spanner instance rescale attempts that failed.
+
+ * These three metrics have the following attributes:
+ * `spanner_project_id` - the Project ID of the affected Spanner
+ instance
+ * `spanner_instance_id` - the Instance ID of the affected Spanner
+ instance
+ * `scaling_method` - the scaling method used
+ * `scaling_direction` - which can be `SCALE_UP`, `SCALE_DOWN` or
+ `SCALE_SAME` (when the calculated rescale size is equal to the
+ current size)
+ * In addition, the `scaling-denied` counter has a `scaling_denied_reason`
+ attribute containing the reason why the scaling was not performed, which
+ can be:
+ * `SAME_SIZE` - when the calculated rescale size is equal to the
+ current instance size.
+ * `MAX_SIZE` - when the instance has already been scaled up to the
+ maximum configured size.
+ * `WITHIN_COOLDOWN` - when the instance has been recently rescaled,
+ and the autoscaler is waiting for the cooldown period to end.
+
 ## Configuration
 
 The parameters for configuring the Autoscaler are identical regardless of the chosen

diff --git a/kubernetes/decoupled/autoscaler-config/otel-collector.yaml.template b/kubernetes/decoupled/autoscaler-config/otel-collector.yaml.template
@@ -0,0 +1,49 @@
+# OpenTelemetry Collector configuration for the Spanner autoscaler.
+# The ConfigMap is named otel-config and carries the collector's config.yaml.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: otel-config
+ namespace: spanner-autoscaler
+data:
+ config.yaml: |
+ ---
+ receivers:
+ # accept metrics over OTLP/gRPC on port 4317
+ otlp:
+ protocols:
+ grpc:
+ endpoint: 0.0.0.0:4317
+
+ processors:
+ resourcedetection:
+ detectors: [gcp]
+ timeout: 10s
+ override: false
+
+ batch:
+ # batch metrics before sending to reduce API usage
+ send_batch_max_size: 200
+ send_batch_size: 200
+ timeout: 10s
+
+ memory_limiter:
+ # drop metrics if memory usage gets too high
+ check_interval: 10s
+ limit_percentage: 65
+ spike_limit_percentage: 20
+
+ exporters:
+ googlecloud:
+ timeout: 45s
+ # Enable the debug exporter, and add to exporters pipeline to see the metrics being delivered
+ # debug:
+ # verbosity: detailed
+
+ service:
+ pipelines:
+ metrics:
+ receivers: [otlp]
+ # memory_limiter must run first so that data is refused before any
+ # other processor buffers it; batch runs last, after enrichment.
+ processors: [memory_limiter, resourcedetection, batch]
+ exporters: [googlecloud]
+ telemetry:
+ logs:
+ # Change log level from "info" to "debug" to view detailed logs
+ level: "info"
diff --git a/kubernetes/decoupled/autoscaler-pkg/networkpolicy.yaml b/kubernetes/decoupled/autoscaler-pkg/networkpolicy.yaml
@@ -40,3 +40,23 @@ spec:
  ports:
  - protocol: TCP
  port: 3000
+---
+# Ingress policy for the OpenTelemetry collector: pods labelled
+# `app: otel-collector` accept TCP traffic on port 4317 from pods
+# carrying the label `otel-submitter: "true"`.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+ name: allow-otel-submitter-to-collector
+ namespace: spanner-autoscaler # kpt-set: ${namespace}
+spec:
+ # pods this policy applies to (the collector)
+ podSelector:
+ matchLabels:
+ app: otel-collector
+ policyTypes:
+ - Ingress
+ ingress:
+ - from:
+ # permitted sources: pods that submit metrics to the collector
+ - podSelector:
+ matchLabels:
+ otel-submitter: "true"
+ ports:
+ # 4317 is the port the collector Service in this package exposes
+ - protocol: TCP
+ port: 4317
diff --git a/kubernetes/decoupled/autoscaler-pkg/otel-collector/Kptfile b/kubernetes/decoupled/autoscaler-pkg/otel-collector/Kptfile
@@ -0,0 +1,21 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# kpt package descriptor for the otel-collector sub-package.
+apiVersion: kpt.dev/v1
+kind: Kptfile
+metadata:
+ name: otel-collector
+ annotations:
+ # NOTE(review): presumably marks this Kptfile as local-only so it is not
+ # applied to the cluster - confirm against the kpt documentation.
+ config.kubernetes.io/local-config: "true"
+info:
+ description: Config for OpenTelemetry Collector component of Spanner autoscaler
diff --git a/kubernetes/decoupled/autoscaler-pkg/otel-collector/README.md b/kubernetes/decoupled/autoscaler-pkg/otel-collector/README.md
@@ -0,0 +1,17 @@
+# OpenTelemetry Collector
+
+## Description
+
+Config for [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/)
+component of Spanner Autoscaler
+
+### View package content
+
+`kpt pkg tree otel-collector`
+[Details](https://kpt.dev/reference/cli/pkg/tree/)
+
+## Installation
+
+See [documentation][docs] for installation and configuration instructions.
+
+[docs]: ../../../../terraform/gke/README.md
diff --git a/kubernetes/decoupled/autoscaler-pkg/otel-collector/otel-collector.yaml b/kubernetes/decoupled/autoscaler-pkg/otel-collector/otel-collector.yaml
@@ -0,0 +1,48 @@
+# Single-replica Deployment running the OpenTelemetry Collector (contrib
+# distribution), configured from the otel-config ConfigMap.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: otel-collector
+ namespace: spanner-autoscaler # kpt-set: ${namespace}
+ labels:
+ app: otel-collector
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: otel-collector
+ template:
+ metadata:
+ labels:
+ app: otel-collector
+ spec:
+ containers:
+ # NOTE(review): no resources.requests/limits are set on this container,
+ # unlike the poller and scaler containers - confirm whether a memory
+ # limit is wanted here.
+ - name: otel-collector
+ image: otel/opentelemetry-collector-contrib:0.93.0
+ args:
+ # path of the config.yaml key inside the mounted ConfigMap volume
+ - --config
+ - /etc/otel/config.yaml
+ volumeMounts:
+ - mountPath: /etc/otel/
+ name: otel-config
+ volumes:
+ # exposes the otel-config ConfigMap to the container
+ - name: otel-config
+ configMap:
+ name: otel-config
+ nodeSelector:
+ # schedule only on nodes labelled as running the GKE metadata server
+ # (presumably required for Workload Identity - TODO confirm)
+ iam.gke.io/gke-metadata-server-enabled: "true"
+ serviceAccountName: otel-collector-sa
+ automountServiceAccountToken: true
+---
+# ClusterIP Service in front of the collector pods; its name and port match
+# the OTLP_COLLECTOR_URL ("http://otel-collector:4317/") set on the poller
+# and scaler containers.
+apiVersion: v1
+kind: Service
+metadata:
+ name: otel-collector
+ namespace: spanner-autoscaler # kpt-set: ${namespace}
+spec:
+ type: ClusterIP
+ selector:
+ app: otel-collector
+ ports:
+ # forwards port 4317 to the same port on the collector container
+ - protocol: TCP
+ port: 4317
+ targetPort: 4317
diff --git a/kubernetes/decoupled/autoscaler-pkg/poller/poller-hourly.yaml b/kubernetes/decoupled/autoscaler-pkg/poller/poller-hourly.yaml
@@ -25,6 +25,7 @@ spec:
  metadata:
  labels:
  app: poller
+ otel-submitter: "true"
  spec:
  containers:
  - name: poller
@@ -38,6 +39,12 @@ spec:
  env:
  - name: AUTOSCALER_CONFIG
  value: "/etc/autoscaler-config/autoscaler-config-hourly.yaml"
+ - name: K8S_POD_NAME
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.name
+ - name: OTLP_COLLECTOR_URL
+ value: "http://otel-collector:4317/"
  volumeMounts:
  - name: config-volume
  mountPath: /etc/autoscaler-config

diff --git a/kubernetes/decoupled/autoscaler-pkg/poller/poller.yaml b/kubernetes/decoupled/autoscaler-pkg/poller/poller.yaml
@@ -25,6 +25,7 @@ spec:
  metadata:
  labels:
  app: poller
+ otel-submitter: "true"
  spec:
  containers:
  - name: poller
@@ -35,6 +36,13 @@ spec:
  cpu: "250m"
  limits:
  memory: "256Mi"
+ env:
+ - name: K8S_POD_NAME
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.name
+ - name: OTLP_COLLECTOR_URL
+ value: "http://otel-collector:4317/"
  volumeMounts:
  - name: config-volume
  mountPath: /etc/autoscaler-config

diff --git a/kubernetes/decoupled/autoscaler-pkg/scaler/scaler.yaml b/kubernetes/decoupled/autoscaler-pkg/scaler/scaler.yaml
@@ -19,6 +19,7 @@ metadata:
  namespace: spanner-autoscaler # kpt-set: ${namespace}
  labels:
  app: scaler
+ otel-submitter: "true"
 spec:
  replicas: 3
  selector:
@@ -28,6 +29,7 @@ spec:
  metadata:
  labels:
  app: scaler
+ otel-submitter: "true"
  spec:
  containers:
  - name: scaler
@@ -40,6 +42,13 @@ spec:
  cpu: "100m"
  limits:
  memory: "256Mi"
+ env:
+ - name: K8S_POD_NAME
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.name
+ - name: OTLP_COLLECTOR_URL
+ value: "http://otel-collector:4317/"
  nodeSelector:
  iam.gke.io/gke-metadata-server-enabled: "true"
  serviceAccountName: scaler-sa

diff --git a/kubernetes/unified/autoscaler-config/otel-collector.yaml.template b/kubernetes/unified/autoscaler-config/otel-collector.yaml.template
@@ -0,0 +1,49 @@
+# OpenTelemetry Collector configuration for the Spanner autoscaler.
+# The ConfigMap is named otel-config and carries the collector's config.yaml.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: otel-config
+ namespace: spanner-autoscaler
+data:
+ config.yaml: |
+ ---
+ receivers:
+ # accept metrics over OTLP/gRPC on port 4317
+ otlp:
+ protocols:
+ grpc:
+ endpoint: 0.0.0.0:4317
+
+ processors:
+ resourcedetection:
+ detectors: [gcp]
+ timeout: 10s
+ override: false
+
+ batch:
+ # batch metrics before sending to reduce API usage
+ send_batch_max_size: 200
+ send_batch_size: 200
+ timeout: 5s
+
+ memory_limiter:
+ # drop metrics if memory usage gets too high
+ check_interval: 1s
+ limit_percentage: 65
+ spike_limit_percentage: 20
+
+ exporters:
+ googlecloud:
+ timeout: 45s
+ # Enable the debug exporter, and add to exporters pipeline to see the metrics being delivered
+ # debug:
+ # verbosity: detailed
+
+ service:
+ pipelines:
+ metrics:
+ receivers: [otlp]
+ # memory_limiter must run first so that data is refused before any
+ # other processor buffers it; batch runs last, after enrichment.
+ processors: [memory_limiter, resourcedetection, batch]
+ exporters: [googlecloud]
+ telemetry:
+ logs:
+ # Change log level from "info" to "debug" to view detailed logs
+ level: "info"
diff --git a/kubernetes/unified/autoscaler-pkg/networkpolicy.yaml b/kubernetes/unified/autoscaler-pkg/networkpolicy.yaml
@@ -21,3 +21,23 @@ spec:
  podSelector: {}
  policyTypes:
  - Ingress
+---
+# Ingress policy for the OpenTelemetry collector: pods labelled
+# `app: otel-collector` accept TCP traffic on port 4317 from pods
+# carrying the label `otel-submitter: "true"`.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+ name: allow-otel-submitter-to-collector
+ namespace: spanner-autoscaler # kpt-set: ${namespace}
+spec:
+ # pods this policy applies to (the collector)
+ podSelector:
+ matchLabels:
+ app: otel-collector
+ policyTypes:
+ - Ingress
+ ingress:
+ - from:
+ # permitted sources: pods that submit metrics to the collector
+ - podSelector:
+ matchLabels:
+ otel-submitter: "true"
+ ports:
+ # OTLP/gRPC port the collector's receiver listens on
+ - protocol: TCP
+ port: 4317