Skip to content

Commit

Permalink
Add AMD support for on-prem fleets
Browse files Browse the repository at this point in the history
Closes: #1413
  • Loading branch information
un-def committed Oct 1, 2024
1 parent 55d09a7 commit 0eacdab
Show file tree
Hide file tree
Showing 6 changed files with 292 additions and 116 deletions.
77 changes: 16 additions & 61 deletions runner/cmd/shim/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,17 @@ package main

import (
"context"
"encoding/csv"
"encoding/json"
"errors"
"fmt"
"io"
"log"
"net"
"net/http"
"os"
"path/filepath"
"runtime"
"strings"
"time"

execute "github.com/alexellis/go-execute/v2"
"github.com/dstackai/dstack/runner/consts"
"github.com/dstackai/dstack/runner/internal/shim"
"github.com/dstackai/dstack/runner/internal/shim/api"
Expand Down Expand Up @@ -191,29 +187,33 @@ func writeHostInfo() {
}

type Message struct {
GpuName string `json:"gpu_name"`
GpuMemory string `json:"gpu_memory"`
GpuCount int `json:"gpu_count"`
Adresses []string `json:"addresses"`
DiskSize uint64 `json:"disk_size"`
NumCPUs int `json:"cpus"`
Memory uint64 `json:"memory"`
GpuVendor shim.GpuVendor `json:"gpu_vendor"`
GpuName string `json:"gpu_name"`
GpuMemory int `json:"gpu_memory"` // MiB
GpuCount int `json:"gpu_count"`
Addresses []string `json:"addresses"`
DiskSize uint64 `json:"disk_size"` // bytes
NumCPUs int `json:"cpus"`
Memory uint64 `json:"memory"` // bytes
}

gpuVendor := shim.NoVendor
gpuCount := 0
gpuMemory := ""
gpuMemory := 0
gpuName := ""
gpus := getGpuInfo()
gpus := shim.GetGpuInfo()
if len(gpus) != 0 {
gpuCount = len(gpus)
gpuMemory = gpus[0][1]
gpuName = gpus[0][0]
gpuVendor = gpus[0].Vendor
gpuMemory = gpus[0].Vram
gpuName = gpus[0].Name
}
m := Message{
GpuVendor: gpuVendor,
GpuName: gpuName,
GpuMemory: gpuMemory,
GpuCount: gpuCount,
Adresses: getInterfaces(),
Addresses: getInterfaces(),
DiskSize: getDiskSize(),
NumCPUs: runtime.NumCPU(),
Memory: getMemory(),
Expand Down Expand Up @@ -241,51 +241,6 @@ func writeHostInfo() {
}
}

// getGpuInfo returns one [gpu_name, memory.total] record per NVIDIA GPU,
// obtained by running nvidia-smi inside a CUDA-enabled container.
// It returns an empty result when no GPU is available (docker command fails
// or exits non-zero) and, on malformed CSV, logs and returns the records
// parsed so far instead of terminating the whole shim process.
func getGpuInfo() [][]string {
	cmd := execute.ExecTask{
		Command: "docker",
		Args: []string{
			"run",
			"--rm",
			"--gpus", "all",
			"dstackai/base:py3.12-0.5-cuda-12.1",
			"nvidia-smi", "--query-gpu=gpu_name,memory.total", "--format=csv",
		},
		StreamStdio: false,
	}

	res, err := cmd.Execute(context.Background())
	if err != nil {
		return [][]string{} // GPU not found
	}
	if res.ExitCode != 0 {
		return [][]string{} // GPU not found
	}

	r := csv.NewReader(strings.NewReader(res.Stdout))

	var gpus [][]string

	// Skip the CSV header row ("name, memory.total [MiB]").
	if _, err := r.Read(); err != nil {
		// Was panic("canot read csv"); a parse problem should not crash the shim.
		log.Printf("cannot read csv header: %s", err)
		return gpus
	}

	for {
		record, err := r.Read()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			// Was log.Fatal(err); log and return the rows collected so far.
			log.Printf("cannot read csv: %s", err)
			return gpus
		}
		gpus = append(gpus, record)
	}
	return gpus
}

func getInterfaces() []string {
var addresses []string
ifaces, err := net.Interfaces()
Expand Down
71 changes: 41 additions & 30 deletions runner/internal/shim/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -445,10 +445,6 @@ func createContainer(ctx context.Context, client docker.APIClient, runnerDir str
log.Printf("Cleanup routine: Cannot remove container: %s", err)
}

gpuRequest, err := requestGpuIfAvailable(ctx, client)
if err != nil {
return "", tracerr.Wrap(err)
}
mounts, err := dockerParams.DockerMounts(runnerDir)
if err != nil {
return "", tracerr.Wrap(err)
Expand Down Expand Up @@ -497,13 +493,11 @@ func createContainer(ctx context.Context, client docker.APIClient, runnerDir str
PortBindings: bindPorts(dockerParams.DockerPorts()...),
PublishAllPorts: true,
Sysctls: map[string]string{},
Resources: container.Resources{
DeviceRequests: gpuRequest,
},
Mounts: mounts,
ShmSize: taskConfig.ShmSize,
Tmpfs: tmpfs,
Mounts: mounts,
ShmSize: taskConfig.ShmSize,
Tmpfs: tmpfs,
}
configureGpuIfAvailable(hostConfig)

log.Printf("Creating container %s:\nconfig: %v\nhostConfig:%v", taskConfig.ContainerName, containerConfig, hostConfig)
resp, err := client.ContainerCreate(ctx, containerConfig, hostConfig, nil, nil, taskConfig.ContainerName)
Expand Down Expand Up @@ -587,27 +581,44 @@ func getNetworkMode() container.NetworkMode {
return "default"
}

func requestGpuIfAvailable(ctx context.Context, client docker.APIClient) ([]container.DeviceRequest, error) {
info, err := client.Info(ctx)
if err != nil {
return nil, tracerr.Wrap(err)
}

for runtime := range info.Runtimes {
if runtime == consts.NVIDIA_RUNTIME {
return []container.DeviceRequest{
{
// Request all capabilities to maximize compatibility with all sorts of GPU workloads.
// Default capabilities: utility, compute.
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/1.16.0/docker-specialized.html
Capabilities: [][]string{{"gpu", "utility", "compute", "graphics", "video", "display", "compat32"}},
Count: -1, // --gpus=all
},
}, nil
}
func configureGpuIfAvailable(hostConfig *container.HostConfig) {
switch gpuVendor := GetGpuVendor(); gpuVendor {
case Nvidia:
hostConfig.Resources.DeviceRequests = append(
hostConfig.Resources.DeviceRequests,
container.DeviceRequest{
// Request all capabilities to maximize compatibility with all sorts of GPU workloads.
// Default capabilities: utility, compute.
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/1.16.0/docker-specialized.html
Capabilities: [][]string{{"gpu", "utility", "compute", "graphics", "video", "display", "compat32"}},
Count: -1, // --gpus=all
},
)
case Amd:
// All options are listed here: https://hub.docker.com/r/rocm/pytorch
// Only --device are mandatory, other seem to be performance-related.
// --device=/dev/kfd --device=/dev/dri
hostConfig.Resources.Devices = append(
hostConfig.Resources.Devices,
container.DeviceMapping{
PathOnHost: "/dev/kfd",
PathInContainer: "/dev/kfd",
CgroupPermissions: "rwm",
},
container.DeviceMapping{
PathOnHost: "/dev/dri",
PathInContainer: "/dev/dri",
CgroupPermissions: "rwm",
},
)
// --ipc=host
hostConfig.IpcMode = container.IPCModeHost
// --cap-add=SYS_PTRACE
hostConfig.CapAdd = append(hostConfig.CapAdd, "SYS_PTRACE")
// --security-opt=seccomp=unconfined
hostConfig.SecurityOpt = append(hostConfig.SecurityOpt, "seccomp=unconfined")
// TODO: in addition, for non-root user, --group-add=video, and possibly --group-add=render, are required.
}

return nil, nil
}

func getVolumeMounts(mountPoints []MountPoint) ([]mount.Mount, error) {
Expand Down
179 changes: 179 additions & 0 deletions runner/internal/shim/gpu.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
package shim

import (
"context"
"encoding/csv"
"encoding/json"
"errors"
"io"
"log"
"os"
"os/exec"
"strconv"
"strings"

execute "github.com/alexellis/go-execute/v2"
)

// Docker images used to probe GPU hardware from the shim host.
const nvidiaSmiImage = "dstackai/base:py3.12-0.5-cuda-12.1"
const amdSmiImage = "un1def/amd-smi:6.2.2-0"

// GpuVendor identifies the GPU hardware vendor detected on the host.
type GpuVendor string

const (
	// NoVendor means no supported GPU was detected.
	NoVendor GpuVendor = "none"
	// Nvidia is reported when nvidia-smi is found on PATH.
	Nvidia GpuVendor = "nvidia"
	// Amd is reported when the ROCm /dev/kfd device node exists.
	Amd GpuVendor = "amd"
)

// GpuInfo describes a single GPU device discovered on the host.
type GpuInfo struct {
	Vendor GpuVendor
	Name   string // model name as reported by the vendor tool
	Vram   int    // MiB
}

// gpuVendor caches the GetGpuVendor detection result; the zero value ""
// means detection has not run yet.
var gpuVendor GpuVendor

// GetGpuVendor reports which GPU vendor, if any, is present on the host.
// Detection runs once; the result is memoized in the package-level
// gpuVendor variable and returned on subsequent calls.
func GetGpuVendor() GpuVendor {
	if gpuVendor != "" {
		return gpuVendor
	}
	// AMD first: any stat outcome for /dev/kfd other than "does not exist"
	// (including permission errors) is treated as the device being present.
	if _, statErr := os.Stat("/dev/kfd"); !errors.Is(statErr, os.ErrNotExist) {
		gpuVendor = Amd
		return gpuVendor
	}
	// NVIDIA next: presence of nvidia-smi on PATH is the signal.
	if _, lookErr := exec.LookPath("nvidia-smi"); lookErr == nil {
		gpuVendor = Nvidia
		return gpuVendor
	}
	gpuVendor = NoVendor
	return gpuVendor
}

// GetGpuInfo returns information about every GPU on the host, dispatching
// to the vendor-specific probe. An empty slice is returned when no
// supported GPU vendor is detected.
func GetGpuInfo() []GpuInfo {
	vendor := GetGpuVendor()
	if vendor == Nvidia {
		return getNvidiaGpuInfo()
	}
	if vendor == Amd {
		return getAmdGpuInfo()
	}
	return []GpuInfo{}
}

// getNvidiaGpuInfo queries NVIDIA GPUs by running nvidia-smi inside a
// CUDA-enabled container and parsing its CSV output.
// On any failure it logs the reason and returns an empty (non-nil) slice.
func getNvidiaGpuInfo() []GpuInfo {
	cmd := execute.ExecTask{
		Command: "docker",
		Args: []string{
			"run",
			"--rm",
			"--gpus", "all",
			nvidiaSmiImage,
			"nvidia-smi", "--query-gpu=gpu_name,memory.total", "--format=csv,nounits",
		},
		StreamStdio: false,
	}
	res, err := cmd.Execute(context.Background())
	if err != nil {
		log.Printf("failed to execute nvidia-smi: %s", err)
		return []GpuInfo{}
	}
	if res.ExitCode != 0 {
		log.Printf(
			"failed to execute nvidia-smi: exit code: %d: stdout: %s; stderr: %s",
			res.ExitCode, res.Stdout, res.Stderr,
		)
		return []GpuInfo{}
	}
	return parseNvidiaSmiCsv(res.Stdout)
}

// parseNvidiaSmiCsv parses `nvidia-smi --query-gpu=gpu_name,memory.total
// --format=csv,nounits` output into GpuInfo records. On a malformed row it
// logs and returns the rows parsed so far; an unparsable VRAM field is
// recorded as 0 rather than dropping the GPU.
func parseNvidiaSmiCsv(output string) []GpuInfo {
	gpus := []GpuInfo{}

	r := csv.NewReader(strings.NewReader(output))
	// Skip the header row.
	if _, err := r.Read(); err != nil {
		log.Printf("cannot read csv: %s", err)
		return gpus
	}
	for {
		record, err := r.Read()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			log.Printf("cannot read csv: %s", err)
			return gpus
		}
		if len(record) != 2 {
			log.Printf("two csv fields expected, got: %d", len(record))
			return gpus
		}
		vram, err := strconv.Atoi(strings.TrimSpace(record[1]))
		if err != nil {
			log.Printf("invalid VRAM value: %s", record[1])
			vram = 0
		}
		gpus = append(gpus, GpuInfo{
			Vendor: Nvidia,
			Name:   strings.TrimSpace(record[0]),
			Vram:   vram,
		})
	}
	return gpus
}

// amdGpu models one element of `amd-smi static --json` output;
// only the fields this package consumes are declared.
type amdGpu struct {
	Asic amdAsic `json:"asic"`
	Vram amdVram `json:"vram"`
}

// amdAsic carries the GPU model name from the "asic" object.
type amdAsic struct {
	Name string `json:"market_name"`
}

// amdVram carries the VRAM size from the "vram" object.
type amdVram struct {
	Size amdVramSize `json:"size"`
}

// amdVramSize is the numeric VRAM size value reported by amd-smi.
type amdVramSize struct {
	Value int `json:"value"`
}

// getAmdGpuInfo queries AMD GPUs by running amd-smi inside a container with
// the ROCm device nodes (/dev/kfd, /dev/dri) passed through, and parsing
// its JSON output. On any failure it logs the reason and returns an empty
// (non-nil) slice.
func getAmdGpuInfo() []GpuInfo {
	cmd := execute.ExecTask{
		Command: "docker",
		Args: []string{
			"run",
			"--rm",
			"--device", "/dev/kfd",
			"--device", "/dev/dri",
			amdSmiImage,
			"static", "--json", "--asic", "--vram",
		},
		StreamStdio: false,
	}
	res, err := cmd.Execute(context.Background())
	if err != nil {
		log.Printf("failed to execute amd-smi: %s", err)
		return []GpuInfo{}
	}
	if res.ExitCode != 0 {
		log.Printf(
			"failed to execute amd-smi: exit code: %d: stdout: %s; stderr: %s",
			res.ExitCode, res.Stdout, res.Stderr,
		)
		return []GpuInfo{}
	}
	return parseAmdSmiJson(res.Stdout)
}

// parseAmdSmiJson decodes `amd-smi static --json --asic --vram` output
// (a JSON array of per-GPU objects) into GpuInfo records. On a decode
// error it logs and returns an empty (non-nil) slice.
func parseAmdSmiJson(output string) []GpuInfo {
	gpus := []GpuInfo{}

	var amdGpus []amdGpu
	if err := json.Unmarshal([]byte(output), &amdGpus); err != nil {
		log.Printf("cannot read json: %s", err)
		return gpus
	}
	// Note: the loop variable is named g to avoid shadowing the amdGpu type.
	for _, g := range amdGpus {
		gpus = append(gpus, GpuInfo{
			Vendor: Amd,
			Name:   g.Asic.Name,
			Vram:   g.Vram.Size.Value,
		})
	}
	return gpus
}
Loading

0 comments on commit 0eacdab

Please sign in to comment.