From b2b38de5684cd086627906dfeb837a8708936a73 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Wed, 18 Dec 2024 18:51:11 -0800 Subject: [PATCH 1/2] Collect GPU usage metrics with prometheus We use [prometheus node exporter](https://github.com/prometheus/node_exporter), deployed as part of our prometheus chart, to collect metrics about CPU and memory usage. This deploys NVIDIA's [dcgm-exporter](https://github.com/NVIDIA/dcgm-exporter) which collects information about GPU usage. As we work towards more cost monitoring and usage monitoring, collecting this information should allow us to help users get more bang for the buck from their GPU use. Since we only collect information after the exporters are deployed, this starts the information collection process even if it's not directly visible to end users. Works towards https://2i2c.productboard.com/entity-detail/features/30046512, initially requested as part of https://2i2c.freshdesk.com/a/tickets/2545. --- helm-charts/support/Chart.yaml | 6 ++++++ helm-charts/support/values.schema.yaml | 3 +++ helm-charts/support/values.yaml | 14 ++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/helm-charts/support/Chart.yaml b/helm-charts/support/Chart.yaml index 783fb0e2fa..72b078424e 100644 --- a/helm-charts/support/Chart.yaml +++ b/helm-charts/support/Chart.yaml @@ -59,3 +59,9 @@ dependencies: version: "0.0.1-0.dev.git.72.hadbe1d4" repository: https://2i2c.org/gcp-filestore-backups condition: gcpFilestoreBackups.enabled + + # Provide metrics about GPU usage + # https://github.com/NVIDIA/dcgm-exporter + - name: dcgm-exporter + version: 3.6.1 + repository: https://nvidia.github.io/dcgm-exporter/helm-charts \ No newline at end of file diff --git a/helm-charts/support/values.schema.yaml b/helm-charts/support/values.schema.yaml index 26f0eda484..0bd55f59d3 100644 --- a/helm-charts/support/values.schema.yaml +++ b/helm-charts/support/values.schema.yaml @@ -42,6 +42,9 @@ properties: global: type: object additionalProperties: true + dcgm-exporter: + type: object + additionalProperties: true # These provide values for objects we create, so we validate their schema # to the best of our ability. diff --git a/helm-charts/support/values.yaml b/helm-charts/support/values.yaml index 71de926196..c73794de5b 100644 --- a/helm-charts/support/values.yaml +++ b/helm-charts/support/values.yaml @@ -494,6 +494,20 @@ cryptnono: aws-ce-grafana-backend: enabled: false +dcgm-exporter: + serviceMonitor: + enabled: false + podAnnotations: + prometheus.io/path: "/metrics" + prometheus.io/port: "12121" + prometheus.io/scrape: "true" + tolerations: + - key: nvidia.com/gpu + operator: Equal + value: present + effect: NoSchedule + + # Configuration of templates provided directly by this chart # ------------------------------------------------------------------------------- # From 75b28a4ce7f24ad54bcd71b0c29eada2c36ccda1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Dec 2024 03:11:00 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- helm-charts/support/Chart.yaml | 2 +- helm-charts/support/values.yaml | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/helm-charts/support/Chart.yaml b/helm-charts/support/Chart.yaml index 72b078424e..a05f2e83c5 100644 --- a/helm-charts/support/Chart.yaml +++ b/helm-charts/support/Chart.yaml @@ -64,4 +64,4 @@ dependencies: # https://github.com/NVIDIA/dcgm-exporter - name: dcgm-exporter version: 3.6.1 - repository: https://nvidia.github.io/dcgm-exporter/helm-charts \ No newline at end of file + repository: https://nvidia.github.io/dcgm-exporter/helm-charts diff --git a/helm-charts/support/values.yaml b/helm-charts/support/values.yaml index c73794de5b..1bdcb89c85 100644 --- a/helm-charts/support/values.yaml +++ b/helm-charts/support/values.yaml @@ -502,11 +502,10 @@ dcgm-exporter: prometheus.io/port: "12121" prometheus.io/scrape: "true" tolerations: - - key: nvidia.com/gpu - operator: Equal - value: present - effect: NoSchedule - + - key: nvidia.com/gpu + operator: Equal + value: present + effect: NoSchedule # Configuration of templates provided directly by this chart # -------------------------------------------------------------------------------