From b2b38de5684cd086627906dfeb837a8708936a73 Mon Sep 17 00:00:00 2001
From: YuviPanda <yuvipanda@gmail.com>
Date: Wed, 18 Dec 2024 18:51:11 -0800
Subject: [PATCH 1/2] Collect GPU usage metrics with prometheus

We use the [Prometheus node exporter](https://github.com/prometheus/node_exporter),
deployed as part of our Prometheus chart, to collect metrics about
CPU and memory usage.

This change deploys NVIDIA's [dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter),
which collects information about GPU usage.

As we work towards more cost and usage monitoring, collecting this
information should allow us to help users get more bang for their buck
from their GPU use. Metrics are only collected once the exporter is
deployed, so deploying it now starts the information collection
process even though it is not directly visible to end users.

Works towards https://2i2c.productboard.com/entity-detail/features/30046512,
initially requested as part of https://2i2c.freshdesk.com/a/tickets/2545.
---
 helm-charts/support/Chart.yaml         |  6 ++++++
 helm-charts/support/values.schema.yaml |  3 +++
 helm-charts/support/values.yaml        | 14 ++++++++++++++
 3 files changed, 23 insertions(+)

diff --git a/helm-charts/support/Chart.yaml b/helm-charts/support/Chart.yaml
index 783fb0e2fa..72b078424e 100644
--- a/helm-charts/support/Chart.yaml
+++ b/helm-charts/support/Chart.yaml
@@ -59,3 +59,9 @@ dependencies:
     version: "0.0.1-0.dev.git.72.hadbe1d4"
     repository: https://2i2c.org/gcp-filestore-backups
     condition: gcpFilestoreBackups.enabled
+
+  # Provide metrics about GPU usage
+  # https://github.com/NVIDIA/dcgm-exporter
+  - name: dcgm-exporter
+    version: 3.6.1
+    repository: https://nvidia.github.io/dcgm-exporter/helm-charts
\ No newline at end of file
diff --git a/helm-charts/support/values.schema.yaml b/helm-charts/support/values.schema.yaml
index 26f0eda484..0bd55f59d3 100644
--- a/helm-charts/support/values.schema.yaml
+++ b/helm-charts/support/values.schema.yaml
@@ -42,6 +42,9 @@ properties:
   global:
     type: object
     additionalProperties: true
+  dcgm-exporter:
+    type: object
+    additionalProperties: true
 
   # These provide values for objects we create, so we validate their schema
   # to the best of our ability.
diff --git a/helm-charts/support/values.yaml b/helm-charts/support/values.yaml
index 71de926196..c73794de5b 100644
--- a/helm-charts/support/values.yaml
+++ b/helm-charts/support/values.yaml
@@ -494,6 +494,20 @@ cryptnono:
 aws-ce-grafana-backend:
   enabled: false
 
+dcgm-exporter:
+  serviceMonitor:
+    enabled: false
+  podAnnotations:
+    prometheus.io/path: "/metrics"
+    prometheus.io/port: "12121"
+    prometheus.io/scrape: "true"
+  tolerations:
+  - key: nvidia.com/gpu
+    operator: Equal
+    value: present
+    effect: NoSchedule
+
+
 # Configuration of templates provided directly by this chart
 # -------------------------------------------------------------------------------
 #

From 75b28a4ce7f24ad54bcd71b0c29eada2c36ccda1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 19 Dec 2024 03:11:00 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 helm-charts/support/Chart.yaml  | 2 +-
 helm-charts/support/values.yaml | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/helm-charts/support/Chart.yaml b/helm-charts/support/Chart.yaml
index 72b078424e..a05f2e83c5 100644
--- a/helm-charts/support/Chart.yaml
+++ b/helm-charts/support/Chart.yaml
@@ -64,4 +64,4 @@ dependencies:
   # https://github.com/NVIDIA/dcgm-exporter
   - name: dcgm-exporter
     version: 3.6.1
-    repository: https://nvidia.github.io/dcgm-exporter/helm-charts
\ No newline at end of file
+    repository: https://nvidia.github.io/dcgm-exporter/helm-charts
diff --git a/helm-charts/support/values.yaml b/helm-charts/support/values.yaml
index c73794de5b..1bdcb89c85 100644
--- a/helm-charts/support/values.yaml
+++ b/helm-charts/support/values.yaml
@@ -502,11 +502,10 @@ dcgm-exporter:
     prometheus.io/port: "12121"
     prometheus.io/scrape: "true"
   tolerations:
-  - key: nvidia.com/gpu
-    operator: Equal
-    value: present
-    effect: NoSchedule
-
+    - key: nvidia.com/gpu
+      operator: Equal
+      value: present
+      effect: NoSchedule
 
 # Configuration of templates provided directly by this chart
 # -------------------------------------------------------------------------------