Skip to content

Commit

Permalink
Add AMD support for on-prem fleets
Browse files Browse the repository at this point in the history
Closes: #1413
  • Loading branch information
un-def committed Oct 1, 2024
1 parent 55d09a7 commit 0eacdab
Show file tree
Hide file tree
Showing 6 changed files with 292 additions and 116 deletions.
77 changes: 16 additions & 61 deletions runner/cmd/shim/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,17 @@ package main

import (
"context"
"encoding/csv"
"encoding/json"
"errors"
"fmt"
"io"
"log"
"net"
"net/http"
"os"
"path/filepath"
"runtime"
"strings"
"time"

execute "github.com/alexellis/go-execute/v2"
"github.com/dstackai/dstack/runner/consts"
"github.com/dstackai/dstack/runner/internal/shim"
"github.com/dstackai/dstack/runner/internal/shim/api"
Expand Down Expand Up @@ -191,29 +187,33 @@ func writeHostInfo() {
}

type Message struct {
GpuName string `json:"gpu_name"`
GpuMemory string `json:"gpu_memory"`
GpuCount int `json:"gpu_count"`
Adresses []string `json:"addresses"`
DiskSize uint64 `json:"disk_size"`
NumCPUs int `json:"cpus"`
Memory uint64 `json:"memory"`
GpuVendor shim.GpuVendor `json:"gpu_vendor"`
GpuName string `json:"gpu_name"`
GpuMemory int `json:"gpu_memory"` // MiB
GpuCount int `json:"gpu_count"`
Addresses []string `json:"addresses"`
DiskSize uint64 `json:"disk_size"` // bytes
NumCPUs int `json:"cpus"`
Memory uint64 `json:"memory"` // bytes
}

gpuVendor := shim.NoVendor
gpuCount := 0
gpuMemory := ""
gpuMemory := 0
gpuName := ""
gpus := getGpuInfo()
gpus := shim.GetGpuInfo()
if len(gpus) != 0 {
gpuCount = len(gpus)
gpuMemory = gpus[0][1]
gpuName = gpus[0][0]
gpuVendor = gpus[0].Vendor
gpuMemory = gpus[0].Vram
gpuName = gpus[0].Name
}
m := Message{
GpuVendor: gpuVendor,
GpuName: gpuName,
GpuMemory: gpuMemory,
GpuCount: gpuCount,
Adresses: getInterfaces(),
Addresses: getInterfaces(),
DiskSize: getDiskSize(),
NumCPUs: runtime.NumCPU(),
Memory: getMemory(),
Expand Down Expand Up @@ -241,51 +241,6 @@ func writeHostInfo() {
}
}

// getGpuInfo returns one [gpu_name, memory.total] record per NVIDIA GPU,
// obtained by running nvidia-smi inside a CUDA-enabled container.
// It returns an empty result when no GPU is available (docker command fails
// or exits non-zero) and, on malformed CSV, logs and returns the records
// parsed so far instead of terminating the whole shim process.
func getGpuInfo() [][]string {
	cmd := execute.ExecTask{
		Command: "docker",
		Args: []string{
			"run",
			"--rm",
			"--gpus", "all",
			"dstackai/base:py3.12-0.5-cuda-12.1",
			"nvidia-smi", "--query-gpu=gpu_name,memory.total", "--format=csv",
		},
		StreamStdio: false,
	}

	res, err := cmd.Execute(context.Background())
	if err != nil {
		return [][]string{} // GPU not found
	}
	if res.ExitCode != 0 {
		return [][]string{} // GPU not found
	}

	r := csv.NewReader(strings.NewReader(res.Stdout))

	var gpus [][]string

	// Skip the CSV header row ("name, memory.total [MiB]").
	if _, err := r.Read(); err != nil {
		// Was panic("canot read csv"); a parse problem should not crash the shim.
		log.Printf("cannot read csv header: %s", err)
		return gpus
	}

	for {
		record, err := r.Read()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			// Was log.Fatal(err); log and return the rows collected so far.
			log.Printf("cannot read csv: %s", err)
			return gpus
		}
		gpus = append(gpus, record)
	}
	return gpus
}

func getInterfaces() []string {
var addresses []string
ifaces, err := net.Interfaces()
Expand Down
71 changes: 41 additions & 30 deletions runner/internal/shim/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -445,10 +445,6 @@ func createContainer(ctx context.Context, client docker.APIClient, runnerDir str
log.Printf("Cleanup routine: Cannot remove container: %s", err)
}

gpuRequest, err := requestGpuIfAvailable(ctx, client)
if err != nil {
return "", tracerr.Wrap(err)
}
mounts, err := dockerParams.DockerMounts(runnerDir)
if err != nil {
return "", tracerr.Wrap(err)
Expand Down Expand Up @@ -497,13 +493,11 @@ func createContainer(ctx context.Context, client docker.APIClient, runnerDir str
PortBindings: bindPorts(dockerParams.DockerPorts()...),
PublishAllPorts: true,
Sysctls: map[string]string{},
Resources: container.Resources{
DeviceRequests: gpuRequest,
},
Mounts: mounts,
ShmSize: taskConfig.ShmSize,
Tmpfs: tmpfs,
Mounts: mounts,
ShmSize: taskConfig.ShmSize,
Tmpfs: tmpfs,
}
configureGpuIfAvailable(hostConfig)

log.Printf("Creating container %s:\nconfig: %v\nhostConfig:%v", taskConfig.ContainerName, containerConfig, hostConfig)
resp, err := client.ContainerCreate(ctx, containerConfig, hostConfig, nil, nil, taskConfig.ContainerName)
Expand Down Expand Up @@ -587,27 +581,44 @@ func getNetworkMode() container.NetworkMode {
return "default"
}

func requestGpuIfAvailable(ctx context.Context, client docker.APIClient) ([]container.DeviceRequest, error) {
info, err := client.Info(ctx)
if err != nil {
return nil, tracerr.Wrap(err)
}

for runtime := range info.Runtimes {
if runtime == consts.NVIDIA_RUNTIME {
return []container.DeviceRequest{
{
// Request all capabilities to maximize compatibility with all sorts of GPU workloads.
// Default capabilities: utility, compute.
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/1.16.0/docker-specialized.html
Capabilities: [][]string{{"gpu", "utility", "compute", "graphics", "video", "display", "compat32"}},
Count: -1, // --gpus=all
},
}, nil
}
func configureGpuIfAvailable(hostConfig *container.HostConfig) {
switch gpuVendor := GetGpuVendor(); gpuVendor {
case Nvidia:
hostConfig.Resources.DeviceRequests = append(
hostConfig.Resources.DeviceRequests,
container.DeviceRequest{
// Request all capabilities to maximize compatibility with all sorts of GPU workloads.
// Default capabilities: utility, compute.
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/1.16.0/docker-specialized.html
Capabilities: [][]string{{"gpu", "utility", "compute", "graphics", "video", "display", "compat32"}},
Count: -1, // --gpus=all
},
)
case Amd:
// All options are listed here: https://hub.docker.com/r/rocm/pytorch
// Only --device are mandatory, other seem to be performance-related.
// --device=/dev/kfd --device=/dev/dri
hostConfig.Resources.Devices = append(
hostConfig.Resources.Devices,
container.DeviceMapping{
PathOnHost: "/dev/kfd",
PathInContainer: "/dev/kfd",
CgroupPermissions: "rwm",
},
container.DeviceMapping{
PathOnHost: "/dev/dri",
PathInContainer: "/dev/dri",
CgroupPermissions: "rwm",
},
)
// --ipc=host
hostConfig.IpcMode = container.IPCModeHost
// --cap-add=SYS_PTRACE
hostConfig.CapAdd = append(hostConfig.CapAdd, "SYS_PTRACE")
// --security-opt=seccomp=unconfined
hostConfig.SecurityOpt = append(hostConfig.SecurityOpt, "seccomp=unconfined")
// TODO: in addition, for non-root user, --group-add=video, and possibly --group-add=render, are required.
}

return nil, nil
}

func getVolumeMounts(mountPoints []MountPoint) ([]mount.Mount, error) {
Expand Down
179 changes: 179 additions & 0 deletions runner/internal/shim/gpu.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
package shim

import (
"context"
"encoding/csv"
"encoding/json"
"errors"
"io"
"log"
"os"
"os/exec"
"strconv"
"strings"

execute "github.com/alexellis/go-execute/v2"
)

// Docker images used to probe GPU hardware from the shim host.
const nvidiaSmiImage = "dstackai/base:py3.12-0.5-cuda-12.1"
const amdSmiImage = "un1def/amd-smi:6.2.2-0"

// GpuVendor identifies the GPU hardware vendor detected on the host.
type GpuVendor string

const (
	// NoVendor means no supported GPU was detected.
	NoVendor GpuVendor = "none"
	// Nvidia is reported when nvidia-smi is found on PATH.
	Nvidia GpuVendor = "nvidia"
	// Amd is reported when the ROCm /dev/kfd device node exists.
	Amd GpuVendor = "amd"
)

// GpuInfo describes a single GPU device discovered on the host.
type GpuInfo struct {
	Vendor GpuVendor
	Name   string // model name as reported by the vendor tool
	Vram   int    // MiB
}

// gpuVendor caches the GetGpuVendor detection result; the zero value ""
// means detection has not run yet.
var gpuVendor GpuVendor

// GetGpuVendor reports which GPU vendor, if any, is present on the host.
// Detection runs once; the result is memoized in the package-level
// gpuVendor variable and returned on subsequent calls.
func GetGpuVendor() GpuVendor {
	if gpuVendor != "" {
		return gpuVendor
	}
	// AMD first: any stat outcome for /dev/kfd other than "does not exist"
	// (including permission errors) is treated as the device being present.
	if _, statErr := os.Stat("/dev/kfd"); !errors.Is(statErr, os.ErrNotExist) {
		gpuVendor = Amd
		return gpuVendor
	}
	// NVIDIA next: presence of nvidia-smi on PATH is the signal.
	if _, lookErr := exec.LookPath("nvidia-smi"); lookErr == nil {
		gpuVendor = Nvidia
		return gpuVendor
	}
	gpuVendor = NoVendor
	return gpuVendor
}

// GetGpuInfo returns information about every GPU on the host, dispatching
// to the vendor-specific probe. An empty slice is returned when no
// supported GPU vendor is detected.
func GetGpuInfo() []GpuInfo {
	vendor := GetGpuVendor()
	if vendor == Nvidia {
		return getNvidiaGpuInfo()
	}
	if vendor == Amd {
		return getAmdGpuInfo()
	}
	return []GpuInfo{}
}

// getNvidiaGpuInfo queries NVIDIA GPUs by running nvidia-smi inside a
// CUDA-enabled container and parsing its CSV output.
// On any failure it logs the reason and returns an empty (non-nil) slice.
func getNvidiaGpuInfo() []GpuInfo {
	cmd := execute.ExecTask{
		Command: "docker",
		Args: []string{
			"run",
			"--rm",
			"--gpus", "all",
			nvidiaSmiImage,
			"nvidia-smi", "--query-gpu=gpu_name,memory.total", "--format=csv,nounits",
		},
		StreamStdio: false,
	}
	res, err := cmd.Execute(context.Background())
	if err != nil {
		log.Printf("failed to execute nvidia-smi: %s", err)
		return []GpuInfo{}
	}
	if res.ExitCode != 0 {
		log.Printf(
			"failed to execute nvidia-smi: exit code: %d: stdout: %s; stderr: %s",
			res.ExitCode, res.Stdout, res.Stderr,
		)
		return []GpuInfo{}
	}
	return parseNvidiaSmiCsv(res.Stdout)
}

// parseNvidiaSmiCsv parses `nvidia-smi --query-gpu=gpu_name,memory.total
// --format=csv,nounits` output into GpuInfo records. On a malformed row it
// logs and returns the rows parsed so far; an unparsable VRAM field is
// recorded as 0 rather than dropping the GPU.
func parseNvidiaSmiCsv(output string) []GpuInfo {
	gpus := []GpuInfo{}

	r := csv.NewReader(strings.NewReader(output))
	// Skip the header row.
	if _, err := r.Read(); err != nil {
		log.Printf("cannot read csv: %s", err)
		return gpus
	}
	for {
		record, err := r.Read()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			log.Printf("cannot read csv: %s", err)
			return gpus
		}
		if len(record) != 2 {
			log.Printf("two csv fields expected, got: %d", len(record))
			return gpus
		}
		vram, err := strconv.Atoi(strings.TrimSpace(record[1]))
		if err != nil {
			log.Printf("invalid VRAM value: %s", record[1])
			vram = 0
		}
		gpus = append(gpus, GpuInfo{
			Vendor: Nvidia,
			Name:   strings.TrimSpace(record[0]),
			Vram:   vram,
		})
	}
	return gpus
}

// amdGpu models one element of `amd-smi static --json` output;
// only the fields this package consumes are declared.
type amdGpu struct {
	Asic amdAsic `json:"asic"`
	Vram amdVram `json:"vram"`
}

// amdAsic carries the GPU model name from the "asic" object.
type amdAsic struct {
	Name string `json:"market_name"`
}

// amdVram carries the VRAM size from the "vram" object.
type amdVram struct {
	Size amdVramSize `json:"size"`
}

// amdVramSize is the numeric VRAM size value reported by amd-smi.
type amdVramSize struct {
	Value int `json:"value"`
}

// getAmdGpuInfo queries AMD GPUs by running amd-smi inside a container with
// the ROCm device nodes (/dev/kfd, /dev/dri) passed through, and parsing
// its JSON output. On any failure it logs the reason and returns an empty
// (non-nil) slice.
func getAmdGpuInfo() []GpuInfo {
	cmd := execute.ExecTask{
		Command: "docker",
		Args: []string{
			"run",
			"--rm",
			"--device", "/dev/kfd",
			"--device", "/dev/dri",
			amdSmiImage,
			"static", "--json", "--asic", "--vram",
		},
		StreamStdio: false,
	}
	res, err := cmd.Execute(context.Background())
	if err != nil {
		log.Printf("failed to execute amd-smi: %s", err)
		return []GpuInfo{}
	}
	if res.ExitCode != 0 {
		log.Printf(
			"failed to execute amd-smi: exit code: %d: stdout: %s; stderr: %s",
			res.ExitCode, res.Stdout, res.Stderr,
		)
		return []GpuInfo{}
	}
	return parseAmdSmiJson(res.Stdout)
}

// parseAmdSmiJson decodes `amd-smi static --json --asic --vram` output
// (a JSON array of per-GPU objects) into GpuInfo records. On a decode
// error it logs and returns an empty (non-nil) slice.
func parseAmdSmiJson(output string) []GpuInfo {
	gpus := []GpuInfo{}

	var amdGpus []amdGpu
	if err := json.Unmarshal([]byte(output), &amdGpus); err != nil {
		log.Printf("cannot read json: %s", err)
		return gpus
	}
	// Note: the loop variable is named g to avoid shadowing the amdGpu type.
	for _, g := range amdGpus {
		gpus = append(gpus, GpuInfo{
			Vendor: Amd,
			Name:   g.Asic.Name,
			Vram:   g.Vram.Size.Value,
		})
	}
	return gpus
}
Loading

0 comments on commit 0eacdab

Please sign in to comment.