From 4c46860dd08811bf286e3abcaca6495f7f029d8e Mon Sep 17 00:00:00 2001 From: arpanetus Date: Sun, 24 Dec 2023 00:15:04 +0600 Subject: [PATCH 1/4] add: arbitrary gpus --- internal/docker.go | 4 +- internal/misc.go | 8 +++ internal/run.go | 124 +++++++++++++++++++++++++++++++-------------- 3 files changed, 97 insertions(+), 39 deletions(-) diff --git a/internal/docker.go b/internal/docker.go index d77e261..4af67ef 100644 --- a/internal/docker.go +++ b/internal/docker.go @@ -102,6 +102,7 @@ func (d *DockerRun) Run( runCommand string, runCommandArgs []string, exposePort int, + gpuIDs []string, ) error { fmt.Printf("killing container %s\n", containerName) @@ -148,8 +149,9 @@ func (d *DockerRun) Run( if _, err := os.Stat("/dev/nvidia0"); err == nil { fmt.Printf("host has gpu, adding gpu to device requests\n") dr = append(dr, container.DeviceRequest{ - Count: -1, + Count: len(gpuIDs), Capabilities: [][]string{{"gpu"}}, + DeviceIDs: gpuIDs, }) } else { fmt.Printf("host does not have gpu, not adding gpu to device requests\n") diff --git a/internal/misc.go b/internal/misc.go index 346e7d4..d0d9d50 100644 --- a/internal/misc.go +++ b/internal/misc.go @@ -193,3 +193,11 @@ func parseOrExitInternal[T ~string | ~int | ~[]string](cmd *cobra.Command, flag return nil, false } + +func toStringSlice[T any](slice []T) []string { + var stringSlice []string + for _, v := range slice { + stringSlice = append(stringSlice, fmt.Sprintf("%v", v)) + } + return stringSlice +} diff --git a/internal/run.go b/internal/run.go index 0e716ff..80dab76 100644 --- a/internal/run.go +++ b/internal/run.go @@ -2,6 +2,7 @@ package internal import ( "context" + "encoding/json" "fmt" "os" "strings" @@ -10,62 +11,90 @@ import ( type RunArgs struct { ProjectName string `validate:"required,varname"` Hosts []string `validate:"required"` - NProcPerNode int `validate:"required,min=1"` + NProcPerNode string `validate:"required"` ExperimentName string `validate:"required,varname"` Port int `validate:"required,min=1"` RunName string `validate:"required,varname"` MaxRepeats int `validate:"required,min=-1"` Rest []string - ContainerName *string + ContainerName *string } const runScript = `#!/usr/bin/env python from higgsfield.internal.main import cli; cli() ` + func nameFromRunArgs(args RunArgs) string { - if args.ContainerName != nil && *args.ContainerName != "" { + if args.ContainerName != nil && *args.ContainerName != "" { return *args.ContainerName - } + } return DefaultProjExpContainerName(args.ProjectName, args.ExperimentName) } func trimPathForLength(path string, length int) string { - // check if path is less than length - if len(path) < length { - return path - } - - // get rid of home directory and replace is with ~ - // e.g. /home/user/... -> ~/... - if path[0] == '/' { - path = path[1:] - } - - branches := strings.Split(path, "/") - slashes := len(branches) - 1 - if slashes == 0 { - return path[:length] - } - - if branches[0] == "home" { - path = "~/" + strings.Join(branches[2:], "/") - } - - if len(path) < length { - return path - } - - return path[:length] + "..." + // check if path is less than length + if len(path) < length { + return path + } + + // get rid of home directory and replace is with ~ + // e.g. /home/user/... -> ~/... + if path[0] == '/' { + path = path[1:] + } + + branches := strings.Split(path, "/") + slashes := len(branches) - 1 + if slashes == 0 { + return path[:length] + } + + if branches[0] == "home" { + path = "~/" + strings.Join(branches[2:], "/") + } + + if len(path) < length { + return path + } + + return path[:length] + "..." +} + +type nProcPerNode map[string][]int + +// parseNProcPerNode converts +func parseNProcPerNode(host, nppn string) []int { + var procMap nProcPerNode + if err := json.Unmarshal([]byte(nppn), &procMap); err != nil { + fmt.Printf("failed to parse nProcPerNode: %v\n", err) + os.Exit(1) + } + + hostNProc, ok := procMap[host] + if !ok { + fmt.Printf("failed to find host %s in nProcPerNode map\n", host) + os.Exit(1) + } + + return hostNProc } func Run(args RunArgs) { if err := Validator().Struct(args); err != nil { panic(err) } - - master := args.Hosts[0] + + myIP, err := myPublicIP() + if err != nil { + fmt.Printf("failed to get my public IP: %v\n", err) + os.Exit(1) + } + + nppn := parseNProcPerNode(myIP, args.NProcPerNode) + + master := args.Hosts[0] rank := 0 if len(args.Hosts) > 1 { @@ -82,13 +111,15 @@ func Run(args RunArgs) { os.Exit(1) } + gpuIDs := make([]int, 0) + hostCachePath, checkpointDir, err := makeDefaultDirectories(args.ProjectName, args.ExperimentName, args.RunName) if err != nil { fmt.Printf("failed to create directories: %v\n", err) os.Exit(1) } - containerName := nameFromRunArgs(args) + containerName := nameFromRunArgs(args) fmt.Printf(` ╔══════════════════════════════════════════════════════════════════════════════════════════════════════ @@ -100,9 +131,14 @@ func Run(args RunArgs) { ║ > RUN NAME = %s ║ > CONTAINER NAME = %s ║ > MODEL CHKPT PATH = %s -║ +║ > GPU IDs = %v ╚══════════════════════════════════════════════════════════════════════════════════════════════════════ -`, args.ExperimentName, args.RunName, containerName, trimPathForLength(checkpointDir, 70)) +`, + args.ExperimentName, + args.RunName, + containerName, + trimPathForLength(checkpointDir, 70), + ) cmd, cmdArgs := buildArgs( nodeNum, @@ -110,7 +146,7 @@ func Run(args RunArgs) { master, args.Port, []string{"hf.py", "run"}, - args.NProcPerNode, + len(nppn), args.ExperimentName, args.RunName, args.MaxRepeats, @@ -132,8 +168,20 @@ func Run(args RunArgs) { f.Write([]byte(runScript)) - dr := NewDockerRun(context.Background(), args.ProjectName, cwd, hostCachePath) - if err := dr.Run(containerName, cmd, cmdArgs, args.Port); err != nil { + + + dr := NewDockerRun( + context.Background(), + args.ProjectName, + cwd, + hostCachePath) + if err := dr.Run( + containerName, + cmd, + cmdArgs, + args.Port, + toStringSlice(nppn), + ); err != nil { fmt.Printf("error occured while running experiment: %+v\n", err) os.Exit(1) } From 46eddaf11402bacda7bda7f8f0462e86ceab95c7 Mon Sep 17 00:00:00 2001 From: arpanetus Date: Sun, 24 Dec 2023 00:17:13 +0600 Subject: [PATCH 2/4] fix: lint --- internal/run.go | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/internal/run.go b/internal/run.go index 80dab76..1874155 100644 --- a/internal/run.go +++ b/internal/run.go @@ -168,20 +168,18 @@ func Run(args RunArgs) { f.Write([]byte(runScript)) - - dr := NewDockerRun( context.Background(), args.ProjectName, cwd, hostCachePath) if err := dr.Run( - containerName, - cmd, - cmdArgs, - args.Port, - toStringSlice(nppn), - ); err != nil { + containerName, + cmd, + cmdArgs, + args.Port, + toStringSlice(nppn), + ); err != nil { fmt.Printf("error occured while running experiment: %+v\n", err) os.Exit(1) } From 5da39303a3ac914c1049a9ff7cae6e47a70b0643 Mon Sep 17 00:00:00 2001 From: arpanetus Date: Sun, 24 Dec 2023 00:31:54 +0600 Subject: [PATCH 3/4] fix: print GPU IDs --- internal/run.go | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/run.go b/internal/run.go index 1874155..540e8bb 100644 --- a/internal/run.go +++ b/internal/run.go @@ -138,6 +138,7 @@ func Run(args RunArgs) { args.RunName, containerName, trimPathForLength(checkpointDir, 70), + nppn, ) cmd, cmdArgs := buildArgs( From 6c361100d362391c51d53b13d50e34cf11dfa96b Mon Sep 17 00:00:00 2001 From: arpanetus Date: Sun, 24 Dec 2023 00:33:13 +0600 Subject: [PATCH 4/4] fix: gpu ids --- internal/run.go | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/internal/run.go b/internal/run.go index 540e8bb..bd0d42b 100644 --- a/internal/run.go +++ b/internal/run.go @@ -92,7 +92,7 @@ func Run(args RunArgs) { os.Exit(1) } - nppn := parseNProcPerNode(myIP, args.NProcPerNode) + gpuIDs := parseNProcPerNode(myIP, args.NProcPerNode) master := args.Hosts[0] rank := 0 @@ -111,8 +111,6 @@ func Run(args RunArgs) { os.Exit(1) } - gpuIDs := make([]int, 0) - hostCachePath, checkpointDir, err := makeDefaultDirectories(args.ProjectName, args.ExperimentName, args.RunName) if err != nil { fmt.Printf("failed to create directories: %v\n", err) @@ -138,7 +136,7 @@ func Run(args RunArgs) { args.RunName, containerName, trimPathForLength(checkpointDir, 70), - nppn, + gpuIDs, ) cmd, cmdArgs := buildArgs( @@ -147,7 +145,7 @@ func Run(args RunArgs) { master, args.Port, []string{"hf.py", "run"}, - len(nppn), + len(gpuIDs), args.ExperimentName, args.RunName, args.MaxRepeats, @@ -179,7 +177,7 @@ func Run(args RunArgs) { cmd, cmdArgs, args.Port, - toStringSlice(nppn), + toStringSlice(gpuIDs), ); err != nil { fmt.Printf("error occured while running experiment: %+v\n", err) os.Exit(1)