From 2edbbc6621bb6f59f1ca5f7b2de758fc0a827ed5 Mon Sep 17 00:00:00 2001
From: Lujie Duan
Date: Wed, 18 Dec 2024 19:30:58 +0000
Subject: [PATCH] Ops Agent DCGM integration: Manually generate DCGM metadata with V1 and V2 metrics

---
Notes (not part of the commit message; ignored by git am):
  * Hunk 1 tags the existing GCE platform entry with version '1'.
  * Hunk 2 appends a second GCE platform entry, version '2', whose
    metrics_minimum_supported_version is 2.38.0 and which lists 14
    default metrics.
  * The appended entry is the last thing in the file and ends without
    a trailing newline (see the marker after the final hunk line).

 integrations/dcgm/ops_agent_metadata.yaml | 118 ++++++++++++++++++++++
 1 file changed, 118 insertions(+)

diff --git a/integrations/dcgm/ops_agent_metadata.yaml b/integrations/dcgm/ops_agent_metadata.yaml
index c7ab8325ab..9c2b3048c4 100644
--- a/integrations/dcgm/ops_agent_metadata.yaml
+++ b/integrations/dcgm/ops_agent_metadata.yaml
@@ -1,6 +1,7 @@
 platforms:
 - type: GCE
   launch_stage: GA
+  version: '1'
   install_documentation_url: https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/third-party-nvidia
   agent_requirement:
     metrics_minimum_supported_version:
@@ -56,3 +57,120 @@ platforms:
     - gpu_number
     - model
     - uuid
+- type: GCE
+  launch_stage: GA
+  version: '2'
+  install_documentation_url: https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/third-party-nvidia
+  agent_requirement:
+    metrics_minimum_supported_version:
+      major: 2
+      minor: 38
+      patch: 0
+  detections:
+  - characteristic_metric:
+      metric_type: workload.googleapis.com/gpu.dcgm.memory.bytes_used
+  default_metrics:
+  - name: workload.googleapis.com/gpu.dcgm.utilization
+    value_type: DOUBLE
+    kind: GAUGE
+    labels:
+    - gpu_number
+    - model
+    - uuid
+  - name: workload.googleapis.com/gpu.dcgm.sm.utilization
+    value_type: DOUBLE
+    kind: GAUGE
+    labels:
+    - gpu_number
+    - model
+    - uuid
+  - name: workload.googleapis.com/gpu.dcgm.pipe.utilization
+    value_type: DOUBLE
+    kind: GAUGE
+    labels:
+    - gpu_number
+    - model
+    - pipe
+    - uuid
+  - name: workload.googleapis.com/gpu.dcgm.codec.encoder.utilization
+    value_type: DOUBLE
+    kind: GAUGE
+    labels:
+    - gpu_number
+    - model
+    - uuid
+  - name: workload.googleapis.com/gpu.dcgm.codec.decoder.utilization
+    value_type: DOUBLE
+    kind: GAUGE
+    labels:
+    - gpu_number
+    - model
+    - uuid
+  - name: workload.googleapis.com/gpu.dcgm.memory.bytes_used
+    value_type: INT64
+    kind: GAUGE
+    labels:
+    - gpu_number
+    - model
+    - state
+    - uuid
+  - name: workload.googleapis.com/gpu.dcgm.memory.bandwidth_utilization
+    value_type: DOUBLE
+    kind: GAUGE
+    labels:
+    - gpu_number
+    - model
+    - uuid
+  - name: workload.googleapis.com/gpu.dcgm.pcie.io
+    value_type: INT64
+    kind: CUMULATIVE
+    labels:
+    - direction
+    - gpu_number
+    - model
+    - uuid
+  - name: workload.googleapis.com/gpu.dcgm.nvlink.io
+    value_type: INT64
+    kind: CUMULATIVE
+    labels:
+    - direction
+    - gpu_number
+    - model
+    - uuid
+  - name: workload.googleapis.com/gpu.dcgm.energy_consumption
+    value_type: DOUBLE
+    kind: CUMULATIVE
+    labels:
+    - gpu_number
+    - model
+    - uuid
+  - name: workload.googleapis.com/gpu.dcgm.temperature
+    value_type: DOUBLE
+    kind: GAUGE
+    labels:
+    - gpu_number
+    - model
+    - uuid
+  - name: workload.googleapis.com/gpu.dcgm.clock.frequency
+    value_type: DOUBLE
+    kind: GAUGE
+    labels:
+    - gpu_number
+    - model
+    - uuid
+  - name: workload.googleapis.com/gpu.dcgm.clock.throttle_duration.time
+    value_type: DOUBLE
+    kind: CUMULATIVE
+    labels:
+    - gpu_number
+    - model
+    - uuid
+    - violation
+  - name: workload.googleapis.com/gpu.dcgm.ecc_errors
+    value_type: INT64
+    kind: CUMULATIVE
+    labels:
+    - error_type
+    - gpu_number
+    - model
+    - uuid
\ No newline at end of file