Skip to content

Commit

Permalink
Add initial support for monitoring GPUs on Linux (prometheus#1998)
Browse files Browse the repository at this point in the history
Expose GPU metrics using `sysfs/drm`.
`amdgpu` is the only driver which exposes this information through DRM.

Signed-off-by: Siavash Safi <siavash.safi@gmail.com>
  • Loading branch information
siavashs authored and oblitorum committed Apr 9, 2024
1 parent f72a117 commit fd1c90d
Showing 1 changed file with 143 additions and 0 deletions.
143 changes: 143 additions & 0 deletions collector/drm_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build !nogpu

package collector

import (
"fmt"

"github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs/sysfs"
)

// drmCollectorSubsystem is the subsystem component used when building the
// fully-qualified names of the metrics exported by this collector.
const drmCollectorSubsystem = "drm"

// drmCollector gathers GPU statistics through sysfs. It keeps the sysfs
// handle, a logger, and one metric descriptor for every series it exports.
type drmCollector struct {
	fs     sysfs.FS
	logger log.Logger

	// Card identity and utilisation.
	CardInfo       *prometheus.Desc
	GPUBusyPercent *prometheus.Desc

	// Graphics translation table (GTT) memory.
	MemoryGTTSize, MemoryGTTUsed *prometheus.Desc

	// Visible VRAM.
	MemoryVisibleVRAMSize, MemoryVisibleVRAMUsed *prometheus.Desc

	// Total VRAM.
	MemoryVRAMSize, MemoryVRAMUsed *prometheus.Desc
}

// init registers the collector under the name "drm", passing defaultDisabled
// as its default state and NewDrmCollector as its factory.
func init() {
	registerCollector("drm", defaultDisabled, NewDrmCollector)
}

// NewDrmCollector returns a new Collector exposing /sys/class/drm/card?/device
// stats. It fails when the sysfs tree located at *sysPath cannot be opened.
func NewDrmCollector(logger log.Logger) (Collector, error) {
	sysFS, err := sysfs.NewFS(*sysPath)
	if err != nil {
		return nil, fmt.Errorf("failed to open sysfs: %w", err)
	}

	// newDesc builds a descriptor inside the drm subsystem with the given
	// variable labels and no constant labels.
	newDesc := func(name, help string, labels ...string) *prometheus.Desc {
		fqName := prometheus.BuildFQName(namespace, drmCollectorSubsystem, name)
		return prometheus.NewDesc(fqName, help, labels, nil)
	}
	// perCard covers the common case of a metric labelled only by card.
	perCard := func(name, help string) *prometheus.Desc {
		return newDesc(name, help, "card")
	}

	collector := &drmCollector{
		fs:     sysFS,
		logger: logger,

		CardInfo: newDesc(
			"card_info",
			"Card information",
			"card", "memory_vendor", "power_performance_level", "unique_id", "vendor",
		),
		GPUBusyPercent: perCard(
			"gpu_busy_percent",
			"How busy the GPU is as a percentage.",
		),
		MemoryGTTSize: perCard(
			"memory_gtt_size_bytes",
			"The size of the graphics translation table (GTT) block in bytes.",
		),
		MemoryGTTUsed: perCard(
			"memory_gtt_used_bytes",
			"The used amount of the graphics translation table (GTT) block in bytes.",
		),
		MemoryVisibleVRAMSize: perCard(
			"memory_vis_vram_size_bytes",
			"The size of visible VRAM in bytes.",
		),
		MemoryVisibleVRAMUsed: perCard(
			"memory_vis_vram_used_bytes",
			"The used amount of visible VRAM in bytes.",
		),
		MemoryVRAMSize: perCard(
			"memory_vram_size_bytes",
			"The size of VRAM in bytes.",
		),
		MemoryVRAMUsed: perCard(
			"memory_vram_used_bytes",
			"The used amount of VRAM in bytes.",
		),
	}

	return collector, nil
}

// Update sends the current DRM metrics to ch. AMD cards are the only source
// queried, so the result of that query is returned as-is.
func (c *drmCollector) Update(ch chan<- prometheus.Metric) error {
	return c.updateAMDCards(ch)
}

// updateAMDCards reads the AMDGPU statistics exposed through the DRM class and
// sends, for each card, one card_info series followed by the utilisation and
// memory gauges. Any error from reading the statistics is returned unchanged.
func (c *drmCollector) updateAMDCards(ch chan<- prometheus.Metric) error {
	const vendor = "amd"

	cards, err := c.fs.ClassDRMCardAMDGPUStats()
	if err != nil {
		return err
	}

	for _, card := range cards {
		// The info metric has a constant value of 1 and carries the card's
		// descriptive attributes as labels.
		ch <- prometheus.MustNewConstMetric(
			c.CardInfo, prometheus.GaugeValue, 1,
			card.Name, card.MemoryVRAMVendor, card.PowerDPMForcePerformanceLevel, card.UniqueID, vendor,
		)

		// All remaining metrics are gauges labelled only with the card name;
		// the table order is the order in which they are sent.
		gauges := []struct {
			desc  *prometheus.Desc
			value float64
		}{
			{c.GPUBusyPercent, float64(card.GPUBusyPercent)},
			{c.MemoryGTTSize, float64(card.MemoryGTTSize)},
			{c.MemoryGTTUsed, float64(card.MemoryGTTUsed)},
			{c.MemoryVRAMSize, float64(card.MemoryVRAMSize)},
			{c.MemoryVRAMUsed, float64(card.MemoryVRAMUsed)},
			{c.MemoryVisibleVRAMSize, float64(card.MemoryVisibleVRAMSize)},
			{c.MemoryVisibleVRAMUsed, float64(card.MemoryVisibleVRAMUsed)},
		}
		for _, g := range gauges {
			ch <- prometheus.MustNewConstMetric(g.desc, prometheus.GaugeValue, g.value, card.Name)
		}
	}

	return nil
}

0 comments on commit fd1c90d

Please sign in to comment.