Skip to content

Commit

Permalink
Ops Agent DCGM integration: Manually generate DCGM metadata with V1 and V2 metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
LujieDuan committed Dec 18, 2024
1 parent bd8167e commit 2edbbc6
Showing 1 changed file with 118 additions and 0 deletions.
118 changes: 118 additions & 0 deletions integrations/dcgm/ops_agent_metadata.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
platforms:
- type: GCE
launch_stage: GA
version: '1'
install_documentation_url: https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/third-party-nvidia
agent_requirement:
metrics_minimum_supported_version:
Expand Down Expand Up @@ -56,3 +57,120 @@ platforms:
- gpu_number
- model
- uuid
# DCGM integration metadata for the version '2' metric set on GCE.
- type: GCE
  launch_stage: GA
  # Quoted so the version stays a string rather than being typed as an integer.
  version: '2'
  install_documentation_url: https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/third-party-nvidia
  agent_requirement:
    # Minimum agent version for these metrics: 2.38.0.
    metrics_minimum_supported_version:
      major: 2
      minor: 38
      patch: 0
  detections:
  # NOTE(review): presumably the presence of this metric is what identifies the
  # version '2' integration as active; confirm against the metadata consumer.
  - characteristic_metric:
      metric_type: workload.googleapis.com/gpu.dcgm.memory.bytes_used
  default_metrics:
  # Utilization gauges.
  - name: workload.googleapis.com/gpu.dcgm.utilization
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  - name: workload.googleapis.com/gpu.dcgm.sm.utilization
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  - name: workload.googleapis.com/gpu.dcgm.pipe.utilization
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - pipe
    - uuid
  - name: workload.googleapis.com/gpu.dcgm.codec.encoder.utilization
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  - name: workload.googleapis.com/gpu.dcgm.codec.decoder.utilization
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  # Memory metrics.
  - name: workload.googleapis.com/gpu.dcgm.memory.bytes_used
    value_type: INT64
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - state
    - uuid
  - name: workload.googleapis.com/gpu.dcgm.memory.bandwidth_utilization
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  # Interconnect I/O counters, split by the `direction` label.
  - name: workload.googleapis.com/gpu.dcgm.pcie.io
    value_type: INT64
    kind: CUMULATIVE
    labels:
    - direction
    - gpu_number
    - model
    - uuid
  - name: workload.googleapis.com/gpu.dcgm.nvlink.io
    value_type: INT64
    kind: CUMULATIVE
    labels:
    - direction
    - gpu_number
    - model
    - uuid
  # Energy, temperature and clock metrics.
  - name: workload.googleapis.com/gpu.dcgm.energy_consumption
    value_type: DOUBLE
    kind: CUMULATIVE
    labels:
    - gpu_number
    - model
    - uuid
  - name: workload.googleapis.com/gpu.dcgm.temperature
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  - name: workload.googleapis.com/gpu.dcgm.clock.frequency
    value_type: DOUBLE
    kind: GAUGE
    labels:
    - gpu_number
    - model
    - uuid
  - name: workload.googleapis.com/gpu.dcgm.clock.throttle_duration.time
    value_type: DOUBLE
    kind: CUMULATIVE
    labels:
    - gpu_number
    - model
    - uuid
    - violation
  # ECC error counter, split by the `error_type` label.
  - name: workload.googleapis.com/gpu.dcgm.ecc_errors
    value_type: INT64
    kind: CUMULATIVE
    labels:
    - error_type
    - gpu_number
    - model
    - uuid

0 comments on commit 2edbbc6

Please sign in to comment.