Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Metrics support enhancements: 1) Splitting up amd_gpu_memory_use_perc… #5

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/collect/cpustat.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,9 @@ type AMDParams struct {
GPUSCLK [24]float64
GPUMCLK [24]float64
GPUUsage [24]float64
GPUMemoryBusyPercent [24]float64
GPUMemoryUsage [24]float64
GPUMemoryTotal [24]float64
}

func Scan() (AMDParams) {
Expand Down Expand Up @@ -141,8 +143,16 @@ func Scan() (AMDParams) {
value64 = 0

value64 = uint64(goamdsmi.GO_rsmi_dev_gpu_memory_busy_percent_get(i))
stat.GPUMemoryBusyPercent[i] = float64(value64)
value64 = 0

value64 = uint64(goamdsmi.GO_rsmi_dev_gpu_memory_usage_get(i))
stat.GPUMemoryUsage[i] = float64(value64)
value64 = 0

value64 = uint64(goamdsmi.GO_rsmi_dev_gpu_memory_total_get(i))
stat.GPUMemoryTotal[i] = float64(value64)
value64 = 0
}
}

Expand Down
65 changes: 47 additions & 18 deletions src/cpu_data.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,9 @@ type amd_data struct {
GPUSCLK *prometheus.Desc
GPUMCLK *prometheus.Desc
GPUUsage *prometheus.Desc
GPUMemoryBusyPercent *prometheus.Desc
GPUMemoryUsage *prometheus.Desc
GPUMemoryTotal *prometheus.Desc
Data func() (collect.AMDParams)
}

Expand Down Expand Up @@ -141,52 +143,63 @@ func NewCollector(handle func() (collect.AMDParams)) prometheus.Collector {
GPUDevId: prometheus.NewDesc(
prometheus.BuildFQName("amd", "", "gpu_dev_id"),
"AMD Params",// The metric's help text.
[]string{"gpu_dev_id", "productname"},// The metric's variable label dimensions.
[]string{"gpu_dev_id", "productname", "node_id", "pod"},// The metric's variable label dimensions.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The "pod" argument is not working for any of the APIs when we try to query. Please check.
image

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Query with argument "node_id" also gives "Empty Query result"
image

nil,// The metric's constant label dimensions.
),
GPUPowerCap: prometheus.NewDesc(
prometheus.BuildFQName("amd", "", "gpu_power_cap"),
"AMD Params",// The metric's help text.
[]string{"gpu_power_cap", "productname"},// The metric's variable label dimensions.
[]string{"gpu_power_cap", "productname", "node_id", "pod"},// The metric's variable label dimensions.
nil,// The metric's constant label dimensions.
),
GPUPowerAvg: prometheus.NewDesc(
prometheus.BuildFQName("amd", "", "gpu_power_avg"),
"AMD Params",// The metric's help text.
[]string{"gpu_power_avg", "productname"},// The metric's variable label dimensions.
[]string{"gpu_power_avg", "productname", "node_id", "pod"},// The metric's variable label dimensions.
nil,// The metric's constant label dimensions.
),
GPUTemperature: prometheus.NewDesc(
prometheus.BuildFQName("amd", "", "gpu_current_temperature"),
"AMD Params",// The metric's help text.
[]string{"gpu_current_temperature", "productname"},// The metric's variable label dimensions.
[]string{"gpu_current_temperature", "productname", "node_id", "pod"},// The metric's variable label dimensions.
nil,// The metric's constant label dimensions.
),
GPUSCLK: prometheus.NewDesc(
prometheus.BuildFQName("amd", "", "gpu_SCLK"),
"AMD Params",// The metric's help text.
[]string{"gpu_SCLK", "productname"},// The metric's variable label dimensions.
[]string{"gpu_SCLK", "productname", "node_id", "pod"},// The metric's variable label dimensions.
nil,// The metric's constant label dimensions.
),
GPUMCLK: prometheus.NewDesc(
prometheus.BuildFQName("amd", "", "gpu_MCLK"),
"AMD Params",// The metric's help text.
[]string{"gpu_MCLK", "productname"},// The metric's variable label dimensions.
[]string{"gpu_MCLK", "productname", "node_id", "pod"},// The metric's variable label dimensions.
nil,// The metric's constant label dimensions.
),
GPUUsage: prometheus.NewDesc(
prometheus.BuildFQName("amd", "", "gpu_use_percent"),
"AMD Params",// The metric's help text.
[]string{"gpu_use_percent", "productname"},// The metric's variable label dimensions.
[]string{"gpu_id", "productname", "node_id", "pod"},// The metric's variable label dimensions.
nil,// The metric's constant label dimensions.
),
GPUMemoryUsage: prometheus.NewDesc(
GPUMemoryBusyPercent: prometheus.NewDesc(

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By "busy" memory, does it mean "used"? I'm not sure what "busy" means for memory. Could we change it to "used"?

prometheus.BuildFQName("amd", "", "gpu_memory_use_percent"),
"AMD Params",// The metric's help text.
[]string{"gpu_memory_use_percent", "productname"},// The metric's variable label dimensions.
[]string{"gpu_id", "productname", "node_id", "pod"},// The metric's variable label dimensions.
nil,// The metric's constant label dimensions.
),
GPUMemoryUsage: prometheus.NewDesc(
prometheus.BuildFQName("amd", "", "gpu_memory_used"),
"AMD Params",// The metric's help text.
[]string{"gpu_id", "productname", "node_id", "pod"},// The metric's variable label dimensions.
nil,// The metric's constant label dimensions.
),
GPUMemoryTotal: prometheus.NewDesc(
prometheus.BuildFQName("amd", "", "gpu_memory_total"),
"AMD Params",// The metric's help text.
[]string{"gpu_id", "productname", "node_id", "pod"},// The metric's variable label dimensions.
nil,// The metric's constant label dimensions.
),


Data: handle, //This is the Scan() function handle
}
Expand Down Expand Up @@ -260,63 +273,79 @@ func (c *amd_data) Collect(ch chan<- prometheus.Metric) {
continue
}
ch <- prometheus.MustNewConstMetric(c.GPUDevId,
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i])
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod)
}

for i,s := range data.GPUPowerCap{
if uint(i) > (data.NumGPUs - 1) {
continue
}
ch <- prometheus.MustNewConstMetric(c.GPUPowerCap,
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i])
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod)
}

for i,s := range data.GPUPowerAvg{
if uint(i) > (data.NumGPUs - 1) {
continue
}
ch <- prometheus.MustNewConstMetric(c.GPUPowerAvg,
prometheus.CounterValue, float64(s), strconv.Itoa(i), gGPUProductNames[i])
prometheus.CounterValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod)
}

for i,s := range data.GPUTemperature{
if uint(i) > (data.NumGPUs - 1) {
continue
}
ch <- prometheus.MustNewConstMetric(c.GPUTemperature,
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i])
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod)
}

for i,s := range data.GPUSCLK{
if uint(i) > (data.NumGPUs - 1) {
continue
}
ch <- prometheus.MustNewConstMetric(c.GPUSCLK,
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i])
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod)
}

for i,s := range data.GPUMCLK{
if uint(i) > (data.NumGPUs - 1) {
continue
}
ch <- prometheus.MustNewConstMetric(c.GPUMCLK,
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i])
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod)
}

for i,s := range data.GPUUsage{
if uint(i) > (data.NumGPUs - 1) {
continue
}
ch <- prometheus.MustNewConstMetric(c.GPUUsage,
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i])
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod)
}

for i,s := range data.GPUMemoryBusyPercent{
if uint(i) > (data.NumGPUs - 1) {
continue
}
ch <- prometheus.MustNewConstMetric(c.GPUMemoryBusyPercent,
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod)
}

for i,s := range data.GPUMemoryUsage{
if uint(i) > (data.NumGPUs - 1) {
continue
}
ch <- prometheus.MustNewConstMetric(c.GPUMemoryUsage,
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i])
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod)
}

for i,s := range data.GPUMemoryTotal{
if uint(i) > (data.NumGPUs - 1) {
continue
}
ch <- prometheus.MustNewConstMetric(c.GPUMemoryTotal,
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod)
}

ch <- prometheus.MustNewConstMetric(c.Sockets,
Expand Down
16 changes: 14 additions & 2 deletions src/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ import (
)

var gGPUProductNames[24] string

var gNodeName string
var gPod string
/* rocm-smi output sample
{"card0":
{
Expand Down Expand Up @@ -98,7 +99,19 @@ func GetGpuProductNames() {
}
}

func GetNodeName() {
nodename, err := exec.Command("uname", "-n").Output()
if err == nil {
gNodeName = string(nodename)
gPod = string(nodename)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need gPod and gNodeName - what is gPod' supposed to be, is it always same as gNodeName?

} else {
log.Fatal(err)
}
}

func main() {
// Get Node name
GetNodeName()
// Get all GPU product names
GetGpuProductNames()

Expand All @@ -122,4 +135,3 @@ func main() {
log.Fatalf("cannot start collector exporter: %s", err)
}
}