diff --git a/helm-charts/support/Chart.yaml b/helm-charts/support/Chart.yaml
index 783fb0e2f..a05f2e83c 100644
--- a/helm-charts/support/Chart.yaml
+++ b/helm-charts/support/Chart.yaml
@@ -59,3 +59,10 @@ dependencies:
     version: "0.0.1-0.dev.git.72.hadbe1d4"
     repository: https://2i2c.org/gcp-filestore-backups
     condition: gcpFilestoreBackups.enabled
+
+  # Provide metrics about GPU usage
+  # https://github.com/NVIDIA/dcgm-exporter
+  - name: dcgm-exporter
+    version: "3.6.1"
+    repository: https://nvidia.github.io/dcgm-exporter/helm-charts
+    condition: dcgm-exporter.enabled
diff --git a/helm-charts/support/values.schema.yaml b/helm-charts/support/values.schema.yaml
index 26f0eda48..0bd55f59d 100644
--- a/helm-charts/support/values.schema.yaml
+++ b/helm-charts/support/values.schema.yaml
@@ -42,6 +42,9 @@ properties:
   global:
     type: object
     additionalProperties: true
+  dcgm-exporter:
+    type: object
+    additionalProperties: true
 
   # These provide values for objects we create, so we validate their schema
   # to the best of our ability.
diff --git a/helm-charts/support/values.yaml b/helm-charts/support/values.yaml
index 71de92619..1bdcb89c8 100644
--- a/helm-charts/support/values.yaml
+++ b/helm-charts/support/values.yaml
@@ -494,6 +494,27 @@ cryptnono:
 aws-ce-grafana-backend:
   enabled: false
 
+# GPU metrics exporter (https://github.com/NVIDIA/dcgm-exporter)
+dcgm-exporter:
+  # Read by the `condition: dcgm-exporter.enabled` entry in Chart.yaml
+  enabled: true
+  # No ServiceMonitor is created; the pods are instead annotated with the
+  # prometheus.io/* keys below for annotation-based scraping
+  serviceMonitor:
+    enabled: false
+  podAnnotations:
+    prometheus.io/path: "/metrics"
+    # NOTE(review): confirm the exporter really listens on 12121; the upstream
+    # chart's default service port is believed to be 9400
+    prometheus.io/port: "12121"
+    prometheus.io/scrape: "true"
+  # Tolerate the nvidia.com/gpu=present:NoSchedule taint
+  tolerations:
+    - key: nvidia.com/gpu
+      operator: Equal
+      value: present
+      effect: NoSchedule
+
 # Configuration of templates provided directly by this chart
 # -------------------------------------------------------------------------------
 #