Skip to content

Commit

Permalink
[Issue#29][Can nvidiagpubeat be made to also export the process runni…
Browse files Browse the repository at this point in the history
…ng on each card? #29] (#31)
  • Loading branch information
deepujain authored Jan 3, 2021
1 parent 89afd98 commit 6a42990
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 30 deletions.
87 changes: 69 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,43 +133,94 @@ export PATH=$PATH:.
./nvidiagpubeat -c nvidiagpubeat.yml -e -d "*" -E seccomp.enabled=false
```

localnvidiasmi executable built for macOS and is a mock GPU event generator.
The localnvidiasmi executable is built for macOS and is a mock GPU event generator that supports events for --query-compute-apps and --query-gpu. The executable is generated from the nvidiasmilocal/localnvidiasmi.go file.


### Sample event
Below is a sample event emitted by nvidiagpubeat.
The file nvidiagpubeat.yml defines the beat `nvidiagpubeat` with multiple options for `query`. For example, `query: "--query-gpu=..."` provides information about the GPUs, and `query: "--query-compute-apps=..."` lists the currently active compute processes.

With `--query-gpu`, nvidiagpubeat emits an event like the one below.

```
Publish event: {
"@timestamp": "2019-03-25T15:34:17.739Z",
Publish event: {
"@timestamp": "2021-01-03T07:27:16.080Z",
"@metadata": {
"beat": "nvidiagpubeat",
"type": "doc",
"version": "6.5.5"
},
"type": "nvidiagpubeat",
"gpu_uuid": "GPU-b884db58-6340-7969-a79f-b937f3583884",
"driver_version": "418.87.01",
"index": 3,
"gpu_serial": 3.20218176911e+11,
"memory": {
"used": 3256,
"total": 16280
},
"name": "Tesla100-PCIE-16GB",
"host": {
"name": "AB-SJC-11111111"
},
"utilization": {
"gpu": 4,
"memory": 40
"memory": 50,
"gpu": 50
},
"memory": {
"used": 0,
"total": 6082,
"free": 6082
"beat": {
"name": "AB-SJC-11111111",
"hostname": "AB-SJC-11111111",
"version": "6.5.5"
},
"pstate": 0,
"gpu_bus_id": "00000000:19:00.0",
"count": 4,
"fan": {
"speed": "[NotSupported]"
},
"gpuIndex": 3,
"power": {
"draw": 25.28,
"limit": 250
},
"temperature": {
"gpu": 27
"gpu": 24
},
"pstate": 8,
"beat": {
"name": "hostname.company.com",
"hostname": "hostname.company.com",
"clocks": {
"gr": 405,
"sm": 405,
"mem": 715
}
}
```

With `--query-compute-apps`, nvidiagpubeat emits an event like the one below.

```
Publish event: {
"@timestamp": "2021-01-03T07:29:53.633Z",
"@metadata": {
"beat": "nvidiagpubeat",
"type": "doc",
"version": "6.5.5"
},
"host": {
"name": "hostname.company.com"
"pid": 222414,
"process_name": "[NotFound]",
"used_gpu_memory": 10,
"gpu_bus_id": "00000000:19:00.0",
"gpu_serial": 3.20218176911e+11,
"beat": {
"name": "AB-SJC-11111111",
"hostname": "AB-SJC-11111111",
"version": "6.5.5"
},
"gpu_name": "Tesla100-PCIE-16GB",
"used_memory": 15,
"gpuIndex": 3,
"type": "nvidiagpubeat"
"type": "nvidiagpubeat",
"gpu_uuid": "GPU-b884db58-6340-7969-a79f-b937f3583884",
"host": {
"name": "AB-SJC-11111111"
}
}
```

Expand Down
2 changes: 1 addition & 1 deletion config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,6 @@ type Config struct {

var DefaultConfig = Config{
Period: 1 * time.Second,
Query: "utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,temperature.gpu,pstate",
Query: "--query-gpu=utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,temperature.gpu,pstate",
Env: "local",
}
Binary file modified localnvidiasmi
Binary file not shown.
30 changes: 25 additions & 5 deletions nvidia/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,26 +46,38 @@ func newUtilization() Utilization {

func (g Utilization) command(env string, query string) *exec.Cmd {
if env == "test" {
return exec.Command("localnvidiasmi")
if strings.Contains(query, "query-compute-apps") {
args := "query-compute-apps"
return exec.Command("localnvidiasmi", args)
} else {
return exec.Command("localnvidiasmi")
}
}

//For backward compatibility, a query without an explicit option (no "=") is run as a --query-gpu query
if strings.Contains(query, "=") {
return exec.Command("nvidia-smi", query, "--format=csv")
} else {
return exec.Command("nvidia-smi", "--query-gpu="+query, "--format=csv")
}
return exec.Command("nvidia-smi", "--query-gpu="+query, "--format=csv")
}

//Run the nvidiasmi command to collect GPU metrics
//Parse output and return events.
func (g Utilization) run(cmd *exec.Cmd, gpuCount int, query string, action Action) ([]common.MapStr, error) {
logp.Info("Running query: %s with gpuCount %d", query, gpuCount)
logp.Info("Running command %s for query: %s with gpuCount %d", cmd, query, gpuCount)
reader := action.start(cmd)
gpuIndex := 0
events := make([]common.MapStr, gpuCount, 2*gpuCount)

for {
line, err := reader.ReadString('\n')

if err == io.EOF {
break
}
// Ignore header
if strings.Contains(line, "utilization") {
if strings.Contains(line, "utilization") || strings.Contains(line, "gpu_name") || strings.Contains(line, "gpu_uuid") {
continue
}
if len(line) == 0 {
Expand All @@ -85,7 +97,15 @@ func (g Utilization) run(cmd *exec.Cmd, gpuCount int, query string, action Actio
if err == io.EOF {
break
}
headers := strings.Split(query, ",")

var headers []string
if strings.Contains(query, "=") {
rawHeaders := strings.Split(query, "=")
headers = strings.Split(rawHeaders[1], ",")
} else {
headers = strings.Split(query, ",")
}

event := common.MapStr{
"gpuIndex": gpuIndex,
"type": "nvidiagpubeat",
Expand Down
8 changes: 7 additions & 1 deletion nvidiagpubeat.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,13 @@
nvidiagpubeat:
# Defines how often an event is sent to the output
period: 1s
query: "name,driver_version,count,index,fan.speed,memory.total,memory.used,utilization.gpu,utilization.memory,temperature.gpu,power.draw,power.limit,clocks.gr,clocks.sm,clocks.mem,pstate"
# By default the query of type query-gpu is executed to support backward compatibility
query: "name,gpu_bus_id,gpu_serial,gpu_uuid,driver_version,count,index,fan.speed,memory.total,memory.used,utilization.gpu,utilization.memory,temperature.gpu,power.draw,power.limit,clocks.gr,clocks.sm,clocks.mem,pstate"
# A generic version of query is supported by nvidiagpubeat for query options like --query-gpu,--query-compute-apps and others.
# --query-gpu will provide information about GPU.
# query: "--query-gpu=name,gpu_bus_id,gpu_serial,gpu_uuid,driver_version,count,index,fan.speed,memory.total,memory.used,utilization.gpu,utilization.memory,temperature.gpu,power.draw,power.limit,clocks.gr,clocks.sm,clocks.mem,pstate"
# --query-compute-apps will list currently active compute processes.
# query: "--query-compute-apps=gpu_name,gpu_bus_id,gpu_serial,gpu_uuid,pid,process_name,used_gpu_memory,used_memory"
env: "test"
# env can be test or production. test is for test purposes to evaluate functionality of this beat. Switch to production
# when you want to run this beat on a Nvidia GPU machine with SMI driver installed.
Expand Down
35 changes: 30 additions & 5 deletions nvidiasmilocal/localnvidiasmi.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,36 @@
package main

import "fmt"
import (
"fmt"
"os"
)

func main() {
if len(os.Args) <= 1 {
utilization()
} else {
arg := os.Args[1]
if arg == "utilization" {
utilization()
} else {
process()
}
}

}

func utilization() {
fmt.Println("name, driver_version, count, index, fan.speed [%], memory.total [MiB], memory.used [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu, power.draw [W], power.limit [W], clocks.current.graphics [MHz], clocks.current.sm [MHz], clocks.current.memory [MHz], pstate")
fmt.Println("Tesla P100-PCIE-16GB, 418.87.01, 4, 0, [Not Supported], 16280 MiB, 1628 MiB, 10 %, 10 %, 25, 24.80 W, 250.00 W, 405 MHz, 405 MHz, 715 MHz, P0")
fmt.Println("Tesla P100-PCIE-16GB, 418.87.01, 4, 1, [Not Supported], 16280 MiB, 3256 MiB, 30 %, 20 %, 25, 25.05 W, 250.00 W, 405 MHz, 405 MHz, 715 MHz, P0")
fmt.Println("Tesla P100-PCIE-16GB, 418.87.01, 4, 2, [Not Supported], 16280 MiB, 1628 MiB, 20 %, 10 %, 24, 26.26 W, 250.00 W, 405 MHz, 405 MHz, 715 MHz, P0")
fmt.Println("Tesla P100-PCIE-16GB, 418.87.01, 4, 3, [Not Supported], 16280 MiB, 3256 MiB, 70 %, 20 %, 24, 25.28 W, 250.00 W, 405 MHz, 405 MHz, 715 MHz, P0")
fmt.Println("Tesla P100-PCIE-16GB, 00000000:08:00.0, 0320218165889, GPU-78f90e78-39a0-4f40-fcbc-0adf3598c166, 418.87.01, 4, 0, [Not Supported], 16280 MiB, 1628 MiB, 10 %, 10 %, 25, 24.80 W, 250.00 W, 405 MHz, 405 MHz, 715 MHz, P0")
fmt.Println("Tesla P100-PCIE-16GB, 00000000:0B:00.0, 0320218176885, GPU-d1229c61-babc-aebe-ff8f-6dc94386640c, 418.87.01, 4, 1, [Not Supported], 16280 MiB, 3256 MiB, 30 %, 20 %, 25, 25.05 W, 250.00 W, 405 MHz, 405 MHz, 715 MHz, P0")
fmt.Println("Tesla P100-PCIE-16GB, 00000000:16:00.0, 0320218166179, GPU-eb5e8723-4a49-98f5-8e77-21b06537da8a, 418.87.01, 4, 2, [Not Supported], 16280 MiB, 1628 MiB, 20 %, 10 %, 24, 26.26 W, 250.00 W, 405 MHz, 405 MHz, 715 MHz, P0")
fmt.Println("Tesla P100-PCIE-16GB, 00000000:19:00.0, 0320218176911, GPU-b884db58-6340-7969-a79f-b937f3583884, 418.87.01, 4, 3, [Not Supported], 16280 MiB, 3256 MiB, 50 %, 50 %, 24, 25.28 W, 250.00 W, 405 MHz, 405 MHz, 715 MHz, P0")
}

// process writes a fixed mock compute-apps table to standard output: one CSV
// header row followed by four rows, each describing a process on a GPU.
func process() {
	rows := []string{
		"gpu_name, gpu_bus_id, gpu_serial, gpu_uuid, pid, process_name, used_gpu_memory [MiB], used_gpu_memory [MiB]",
		"Tesla P100-PCIE-16GB, 00000000:16:00.0, 0320218176947, GPU-bb7f65ee-acdb-7efd-0f32-73699400b86e, 240930, python, 10 MiB, 15 MiB",
		"Tesla P100-PCIE-16GB, 00000000:0B:00.0, 0320218176885, GPU-d1229c61-babc-aebe-ff8f-6dc94386640c, 65808, python, 10 MiB, 15 MiB",
		"Tesla P100-PCIE-16GB, 00000000:16:00.0, 0320218166179, GPU-eb5e8723-4a49-98f5-8e77-21b06537da8a, 267414, python, 10 MiB, 15 MiB",
		"Tesla P100-PCIE-16GB, 00000000:19:00.0, 0320218176911, GPU-b884db58-6340-7969-a79f-b937f3583884, 222414, [Not Found], 10 MiB, 15 MiB",
	}
	for _, row := range rows {
		fmt.Println(row)
	}
}

0 comments on commit 6a42990

Please sign in to comment.