fix(xunix): improve handling of gpu library mounts #129
Changes from all commits: 4ea87fc, ad7c946, 4878301, 669f1e5, c3ad5b4, f3da271, 7dee998
First file (integration tests):
@@ -9,6 +9,7 @@ import (
 	"testing"
 	"time"

+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"

 	"github.com/coder/envbox/integration/integrationtest"

@@ -41,8 +42,7 @@ func TestDocker_Nvidia(t *testing.T) {
 		)

 		// Assert that we can run nvidia-smi in the inner container.
-		_, err := execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "nvidia-smi")
-		require.NoError(t, err, "failed to run nvidia-smi in the inner container")
+		assertInnerNvidiaSMI(ctx, t, ctID)
 	})

 	t.Run("Redhat", func(t *testing.T) {

@@ -52,16 +52,29 @@ func TestDocker_Nvidia(t *testing.T) {

 		// Start the envbox container.
 		ctID := startEnvboxCmd(ctx, t, integrationtest.RedhatImage, "root",
-			"-v", "/usr/lib/x86_64-linux-gnu:/var/coder/usr/lib64",
+			"-v", "/usr/lib/x86_64-linux-gnu:/var/coder/usr/lib",
 			"--env", "CODER_ADD_GPU=true",
-			"--env", "CODER_USR_LIB_DIR=/var/coder/usr/lib64",
+			"--env", "CODER_USR_LIB_DIR=/var/coder/usr/lib",
 			"--runtime=nvidia",
 			"--gpus=all",
 		)

 		// Assert that we can run nvidia-smi in the inner container.
-		_, err := execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "nvidia-smi")
-		require.NoError(t, err, "failed to run nvidia-smi in the inner container")
+		assertInnerNvidiaSMI(ctx, t, ctID)
+
+		// Make sure dnf still works. This checks for a regression due to
+		// gpuExtraRegex matching `libglib.so` in the outer container.
+		// This had a dependency on `libpcre.so.3` which would cause dnf to fail.
+		out, err := execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "dnf")
+		if !assert.NoError(t, err, "failed to run dnf in the inner container") {
+			t.Logf("dnf output:\n%s", strings.TrimSpace(out))
+		}
+
+		// Make sure libglib.so is not present in the inner container.
+		out, err = execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "ls", "-1", "/usr/lib/x86_64-linux-gnu/libglib*")
+		// An error is expected here.
+		assert.Error(t, err, "libglib should not be present in the inner container")
+		assert.Contains(t, out, "No such file or directory", "libglib should not be present in the inner container")
 	})

 	t.Run("InnerUsrLibDirOverride", func(t *testing.T) {

@@ -79,11 +92,58 @@ func TestDocker_Nvidia(t *testing.T) {
 			"--gpus=all",
 		)

-		// Assert that the libraries end up in the expected location in the inner container.
-		out, err := execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "ls", "-l", "/usr/lib/coder")
+		// Assert that the libraries end up in the expected location in the inner
+		// container.
+		out, err := execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "ls", "-1", "/usr/lib/coder")
 		require.NoError(t, err, "inner usr lib dir override failed")
 		require.Regexp(t, `(?i)(libgl|nvidia|vulkan|cuda)`, out)
 	})

+	t.Run("EmptyHostUsrLibDir", func(t *testing.T) {
+		t.Parallel()
+		ctx, cancel := context.WithCancel(context.Background())
+		t.Cleanup(cancel)
+		emptyUsrLibDir := t.TempDir()
+
+		// Start the envbox container.
+		ctID := startEnvboxCmd(ctx, t, integrationtest.UbuntuImage, "root",
+			"-v", emptyUsrLibDir+":/var/coder/usr/lib",
+			"--env", "CODER_ADD_GPU=true",
+			"--env", "CODER_USR_LIB_DIR=/var/coder/usr/lib",
+			"--runtime=nvidia",
+			"--gpus=all",
+		)
+
+		ofs := outerFiles(ctx, t, ctID, "/usr/lib/x86_64-linux-gnu/libnv*")
+		// Assert invariant: the outer container has the files we expect.
+		require.NotEmpty(t, ofs, "failed to list outer container files")
+		// Assert that expected files are available in the inner container.
+		assertInnerFiles(ctx, t, ctID, "/usr/lib/x86_64-linux-gnu/libnv*", ofs...)
+		assertInnerNvidiaSMI(ctx, t, ctID)
+	})

Comment on lines +102 to +123:
review: this tests that we can get by with no extra files in …

+
+	t.Run("CUDASample", func(t *testing.T) {
+		t.Parallel()
+
+		ctx, cancel := context.WithCancel(context.Background())
+		t.Cleanup(cancel)
+
+		// Start the envbox container.
+		ctID := startEnvboxCmd(ctx, t, integrationtest.CUDASampleImage, "root",
+			"-v", "/usr/lib/x86_64-linux-gnu:/var/coder/usr/lib",
+			"--env", "CODER_ADD_GPU=true",
+			"--env", "CODER_USR_LIB_DIR=/var/coder/usr/lib",
+			"--runtime=nvidia",
+			"--gpus=all",
+		)
+
+		// Assert that we can run nvidia-smi in the inner container.
+		assertInnerNvidiaSMI(ctx, t, ctID)
+
+		// Assert that /tmp/vectorAdd runs successfully in the inner container.
+		_, err := execContainerCmd(ctx, t, ctID, "docker", "exec", "workspace_cvm", "/tmp/vectorAdd")
+		require.NoError(t, err, "failed to run /tmp/vectorAdd in the inner container")
+	})
 }

 // dockerRuntimes returns the list of container runtimes available on the host.

@@ -101,6 +161,49 @@ func dockerRuntimes(t *testing.T) []string {
 	return strings.Split(raw, "\n")
 }

+// outerFiles returns the list of files in the outer container matching the
+// given pattern. It does this by running `ls -1` in the outer container.
+func outerFiles(ctx context.Context, t *testing.T, containerID, pattern string) []string {
+	t.Helper()
+	// We need to use /bin/sh -c to avoid the shell interpreting the glob.
+	out, err := execContainerCmd(ctx, t, containerID, "/bin/sh", "-c", "ls -1 "+pattern)
+	require.NoError(t, err, "failed to list outer container files")
+	files := strings.Split(strings.TrimSpace(out), "\n")
+	slices.Sort(files)
+	return files
+}
+
+// assertInnerFiles checks that all the files matching the given pattern exist in the
+// inner container.
+func assertInnerFiles(ctx context.Context, t *testing.T, containerID, pattern string, expected ...string) {
+	t.Helper()
+
+	// Get the list of files in the inner container.
+	// We need to use /bin/sh -c to avoid the shell interpreting the glob.
+	out, err := execContainerCmd(ctx, t, containerID, "docker", "exec", "workspace_cvm", "/bin/sh", "-c", "ls -1 "+pattern)
+	require.NoError(t, err, "failed to list inner container files")
+	innerFiles := strings.Split(strings.TrimSpace(out), "\n")
+
+	// Check that the expected files exist in the inner container.
+	missingFiles := make([]string, 0)
+	for _, expectedFile := range expected {
+		if !slices.Contains(innerFiles, expectedFile) {
+			missingFiles = append(missingFiles, expectedFile)
+		}
+	}
+	require.Empty(t, missingFiles, "missing files in inner container: %s", strings.Join(missingFiles, ", "))
+}
+
+// assertInnerNvidiaSMI checks that nvidia-smi runs successfully in the inner
+// container.
+func assertInnerNvidiaSMI(ctx context.Context, t *testing.T, containerID string) {
+	t.Helper()
+	// Assert that we can run nvidia-smi in the inner container.
+	out, err := execContainerCmd(ctx, t, containerID, "docker", "exec", "workspace_cvm", "nvidia-smi")
+	require.NoError(t, err, "failed to run nvidia-smi in the inner container")
+	require.Contains(t, out, "NVIDIA-SMI", "nvidia-smi output does not contain NVIDIA-SMI")
+}

 // startEnvboxCmd starts the envbox container with the given arguments.
 // Ideally we would use ory/dockertest for this, but it doesn't support
 // specifying the runtime. We have alternatively used the docker client library,

Second file (xunix GPU mount handling):
@@ -6,6 +6,7 @@ import (
 	"os"
 	"path/filepath"
 	"regexp"
+	"slices"
 	"sort"
 	"strings"

@@ -17,9 +18,9 @@ import (
 )

 var (
-	gpuMountRegex = regexp.MustCompile("(?i)(nvidia|vulkan|cuda)")
-	gpuExtraRegex = regexp.MustCompile("(?i)(libgl|nvidia|vulkan|cuda)")
-	gpuEnvRegex   = regexp.MustCompile("(?i)nvidia")
+	gpuMountRegex     = regexp.MustCompile(`(?i)(nvidia|vulkan|cuda)`)
+	gpuExtraRegex     = regexp.MustCompile(`(?i)(libgl(e|sx|\.)|nvidia|vulkan|cuda)`)
+	gpuEnvRegex       = regexp.MustCompile(`(?i)nvidia`)
+	sharedObjectRegex = regexp.MustCompile(`\.so(\.[0-9\.]+)?$`)
 )

Comment on gpuExtraRegex:
review: modified this regex to hopefully match the right things and not the wrong things
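
To see what the tightened `gpuExtraRegex` changes in practice, here is a small probe using the exact patterns from this diff; the file names are illustrative examples, not taken from the repository. Note how `libglib-2.0.so.0` no longer matches, and how `sharedObjectRegex` separately filters out entries that are not shared objects:

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	oldExtra := regexp.MustCompile(`(?i)(libgl|nvidia|vulkan|cuda)`)
	newExtra := regexp.MustCompile(`(?i)(libgl(e|sx|\.)|nvidia|vulkan|cuda)`)
	sharedObject := regexp.MustCompile(`\.so(\.[0-9\.]+)?$`)

	for _, name := range []string{
		"libglib-2.0.so.0",  // glib, not a GPU library: old=true, new=false
		"libGL.so.1",        // OpenGL: matches via `libgl\.`
		"libGLESv2.so.2",    // OpenGL ES: matches via `libgle`
		"libnvidia-ml.so.1", // NVIDIA management library: matches via `nvidia`
		"nvidia-smi",        // a binary, not a shared object: sharedObject=false
	} {
		fmt.Printf("%-20s oldExtra=%-5v newExtra=%-5v sharedObject=%v\n",
			name, oldExtra.MatchString(name), newExtra.MatchString(name), sharedObject.MatchString(name))
	}
}
```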

@@ -39,6 +40,7 @@ func GPUEnvs(ctx context.Context) []string {

 func GPUs(ctx context.Context, log slog.Logger, usrLibDir string) ([]Device, []mount.MountPoint, error) {
 	var (
+		afs     = GetFS(ctx)
 		mounter = Mounter(ctx)
 		devices = []Device{}
 		binds   = []mount.MountPoint{}

@@ -64,6 +66,22 @@ func GPUs(ctx context.Context, log slog.Logger, usrLibDir string) ([]Device, []m

 		// If it's not in /dev treat it as a bind mount.
 		binds = append(binds, m)
+		// We also want to find any symlinks that point to the target.
+		// This is important for the nvidia driver as it mounts the driver
+		// files with the driver version appended to the end, and creates
+		// symlinks that point to the actual files.
+		links, err := SameDirSymlinks(afs, m.Path)
+		if err != nil {
+			log.Error(ctx, "find symlinks", slog.F("path", m.Path), slog.Error(err))
+		} else {
+			for _, link := range links {
+				log.Debug(ctx, "found symlink", slog.F("link", link), slog.F("target", m.Path))
+				binds = append(binds, mount.MountPoint{
+					Path: link,
+					Opts: []string{"ro"},
+				})
+			}
+		}
 	}
 }

@@ -104,7 +122,11 @@ func usrLibGPUs(ctx context.Context, log slog.Logger, usrLibDir string) ([]mount
 			return nil
 		}

-		if !sharedObjectRegex.MatchString(path) || !gpuExtraRegex.MatchString(path) {
+		if !gpuExtraRegex.MatchString(path) {
+			return nil
+		}
+
+		if !sharedObjectRegex.MatchString(path) {
 			return nil
 		}

Comment on lines +125 to +129:
review: this makes the control flow a little easier to read; I accidentally removed this but it still performs an important task

@@ -176,6 +198,75 @@ func recursiveSymlinks(afs FS, mountpoint string, path string) ([]string, error)
 	return paths, nil
 }

+// SameDirSymlinks returns all links in the same directory as `target` that
+// point to target, either indirectly or directly. Only symlinks in the same
+// directory as `target` are considered.
+func SameDirSymlinks(afs FS, target string) ([]string, error) {
+	// Get the list of files in the directory of the target.
+	fis, err := afero.ReadDir(afs, filepath.Dir(target))
+	if err != nil {
+		return nil, xerrors.Errorf("read dir %q: %w", filepath.Dir(target), err)
+	}
+
+	// Do an initial pass to map all symlinks to their destinations.
+	allLinks := make(map[string]string)
+	for _, fi := range fis {
+		// Ignore non-symlinks.
+		if fi.Mode()&os.ModeSymlink == 0 {
+			continue
+		}
+
+		absPath := filepath.Join(filepath.Dir(target), fi.Name())
+		link, err := afs.Readlink(filepath.Join(filepath.Dir(target), fi.Name()))
+		if err != nil {
+			return nil, xerrors.Errorf("readlink %q: %w", fi.Name(), err)
+		}
+
+		if !filepath.IsAbs(link) {
+			link = filepath.Join(filepath.Dir(target), link)
+		}
+		allLinks[absPath] = link
+	}
+
+	// Now we can start checking for symlinks that point to the target.
+	var (
+		found = make([]string, 0)
+		// Set an arbitrary upper limit to prevent infinite loops.
+		maxIterations = 10
+	)
+	for range maxIterations {
+		var foundThisTime bool
+		for linkName, linkDest := range allLinks {
+			// Ignore symlinks that point outside of target's directory.
+			if filepath.Dir(linkName) != filepath.Dir(target) {
+				continue
+			}
+
+			// If the symlink points to the target, add it to the list.
+			if linkDest == target {
+				if !slices.Contains(found, linkName) {
+					found = append(found, linkName)
+					foundThisTime = true
+				}
+			}
+
+			// If the symlink points to another symlink that we already determined
+			// points to the target, add it to the list.
+			if slices.Contains(found, linkDest) {
+				if !slices.Contains(found, linkName) {
+					found = append(found, linkName)
+					foundThisTime = true
+				}
+			}
+		}
+		// If we didn't find any new symlinks, we're done.
+		if !foundThisTime {
+			break
+		}
+	}
+	return found, nil
+}

 // TryUnmountProcGPUDrivers unmounts any GPU-related mounts under /proc as it causes
 // issues when creating any container in some cases. Errors encountered while
 // unmounting are treated as non-fatal.
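
The search in `SameDirSymlinks` is a bounded fixed-point iteration: each pass promotes links whose destination is either the target or an already-found link, so chains are picked up without resolving anything on disk. A standalone sketch of just that loop over a plain map, with a made-up two-link chain:

```go
package main

import (
	"fmt"
	"slices"
)

func main() {
	target := "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.550.54.14" // hypothetical
	allLinks := map[string]string{
		// Direct link: found on the first pass.
		"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1": target,
		// Chained link: only found once libnvidia-ml.so.1 is in the result set,
		// which may take a second pass depending on map iteration order.
		"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so": "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1",
	}

	found := make([]string, 0)
	for range 10 { // arbitrary upper bound, as in SameDirSymlinks
		foundThisTime := false
		for name, dest := range allLinks {
			if dest == target || slices.Contains(found, dest) {
				if !slices.Contains(found, name) {
					found = append(found, name)
					foundThisTime = true
				}
			}
		}
		// Fixed point reached: no pass added anything new.
		if !foundThisTime {
			break
		}
	}
	fmt.Println(found) // both links, in discovery order
}
```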