diff --git a/launcher/container_runner.go b/launcher/container_runner.go index d13901a7..660d1a91 100644 --- a/launcher/container_runner.go +++ b/launcher/container_runner.go @@ -161,8 +161,20 @@ func NewRunner(ctx context.Context, cdClient *containerd.Client, token oauth2.To } if launchSpec.Experiments.EnableGpuDriverInstallation && launchSpec.InstallGpuDriver { - mounts = appendGpuDriverMounts(mounts) - specOpts = append(specOpts, oci.WithMounts(mounts)) + gpuMounts := []specs.Mount{ + { + Type: "volume", + Source: fmt.Sprintf("%s/lib64", gpu.InstallationHostDir), + Destination: fmt.Sprintf("%s/lib64", gpu.InstallationContainerDir), + Options: []string{"rbind", "rw"}, + }, { + Type: "volume", + Source: fmt.Sprintf("%s/bin", gpu.InstallationHostDir), + Destination: fmt.Sprintf("%s/bin", gpu.InstallationContainerDir), + Options: []string{"rbind", "rw"}, + }, + } + specOpts = append(specOpts, oci.WithMounts(gpuMounts)) gpuDeviceFiles, err := listFilesWithPrefix("/dev", "nvidia") if err != nil { @@ -282,24 +294,6 @@ func appendTokenMounts(mounts []specs.Mount) []specs.Mount { return append(mounts, m) } -// appendGpuMounts appends the default mount specs for GPU drivers -func appendGpuDriverMounts(mounts []specs.Mount) []specs.Mount { - gpuMounts := []specs.Mount{ - { - Type: "volume", - Source: fmt.Sprintf("%s/lib64", gpu.InstallationHostDir), - Destination: fmt.Sprintf("%s/lib64", gpu.InstallationContainerDir), - Options: []string{"rbind", "rw"}, - }, { - Type: "volume", - Source: fmt.Sprintf("%s/bin", gpu.InstallationHostDir), - Destination: fmt.Sprintf("%s/bin", gpu.InstallationContainerDir), - Options: []string{"rbind", "rw"}, - }, - } - return append(mounts, gpuMounts...) -} - func (r *ContainerRunner) measureCELEvents(ctx context.Context) error { if err := r.measureContainerClaims(ctx); err != nil { return fmt.Errorf("failed to measure container claims: %v", err) diff --git a/launcher/image/test/scripts/test_gpu_nogpu.sh b/launcher/image/test/scripts/gpu/test_gpu_nogpu.sh similarity index 100% rename from launcher/image/test/scripts/test_gpu_nogpu.sh rename to launcher/image/test/scripts/gpu/test_gpu_nogpu.sh diff --git a/launcher/image/test/scripts/test_gpu_unsupported_gputype.sh b/launcher/image/test/scripts/gpu/test_gpu_unsupported_gputype.sh similarity index 100% rename from launcher/image/test/scripts/test_gpu_unsupported_gputype.sh rename to launcher/image/test/scripts/gpu/test_gpu_unsupported_gputype.sh diff --git a/launcher/image/test/scripts/test_gpu_workload.sh b/launcher/image/test/scripts/gpu/test_gpu_workload.sh similarity index 100% rename from launcher/image/test/scripts/test_gpu_workload.sh rename to launcher/image/test/scripts/gpu/test_gpu_workload.sh diff --git a/launcher/image/test/test_gpu_driver_Installation_cloudbuild.yaml b/launcher/image/test/test_gpu_driver_installation_cloudbuild.yaml similarity index 90% rename from launcher/image/test/test_gpu_driver_Installation_cloudbuild.yaml rename to launcher/image/test/test_gpu_driver_installation_cloudbuild.yaml index 7df1a83c..aba40705 100644 --- a/launcher/image/test/test_gpu_driver_Installation_cloudbuild.yaml +++ b/launcher/image/test/test_gpu_driver_installation_cloudbuild.yaml @@ -62,19 +62,19 @@ steps: - name: 'gcr.io/cloud-builders/gcloud' id: SingleGpuWorkloadTest entrypoint: 'bash' - args: ['scripts/test_gpu_workload.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}', '${_ZONE}'] + args: ['scripts/gpu/test_gpu_workload.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}', '${_ZONE}'] - name: 'gcr.io/cloud-builders/gcloud' id: MultipleGpuWorkloadTest entrypoint: 'bash' - args: ['scripts/test_gpu_workload.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-mul', '${_ZONE}'] + args: ['scripts/gpu/test_gpu_workload.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-mul', '${_ZONE}'] - name: 'gcr.io/cloud-builders/gcloud' id: UnsupportedGpuWorkloadTest entrypoint: 'bash' - args: ['scripts/test_gpu_unsupported_gputype.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-unsup', '${_ZONE}'] + args: ['scripts/gpu/test_gpu_unsupported_gputype.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-unsup', '${_ZONE}'] - name: 'gcr.io/cloud-builders/gcloud' id: NoGpuWorkloadTest entrypoint: 'bash' - args: ['scripts/test_gpu_nogpu.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-nogpu', '${_ZONE}'] + args: ['scripts/gpu/test_gpu_nogpu.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-nogpu', '${_ZONE}'] - name: 'gcr.io/cloud-builders/gcloud' id: SingleGpuCleanUp entrypoint: 'bash' diff --git a/launcher/internal/gpu/driverinstaller.go b/launcher/internal/gpu/driverinstaller.go index 24d2c6ee..109a55ae 100644 --- a/launcher/internal/gpu/driverinstaller.go +++ b/launcher/internal/gpu/driverinstaller.go @@ -23,8 +23,7 @@ const ( installerSnapshotID = "tee-gpu-driver-installer-snapshot" ) -// SupportedGpuTypes is the list of supported gpu types with open sourced nvidia kernel modules. -var SupportedGpuTypes = []deviceinfo.GPUType{ +var supportedGpuTypes = []deviceinfo.GPUType{ deviceinfo.L4, deviceinfo.T4, deviceinfo.A100_40GB, @@ -53,13 +52,18 @@ func NewDriverInstaller(cdClient *containerd.Client, launchSpec spec.LaunchSpec, // https://pkg.go.dev/cos.googlesource.com/cos/tools.git@v0.0.0-20241008015903-8431fe581b1f/src/cmd/cos_gpu_installer#section-readme // README specifies docker command where this function uses containerd for launching and managing the gpu driver installer container. func (di *DriverInstaller) InstallGPUDrivers(ctx context.Context) error { + err := remountAsExecutable(InstallationHostDir) + if err != nil { + return fmt.Errorf("failed to remount the installation directory: %v", err) + } + gpuType, err := deviceinfo.GetGPUTypeInfo() if err != nil { return fmt.Errorf("failed to get the gpu type info: %v", err) } if !gpuType.OpenSupported() { - return fmt.Errorf("unsupported gpu type %s, please retry with one of the supported gpu types: %v", gpuType.String(), gpu.SupportedGpuTypes) + return fmt.Errorf("unsupported gpu type %s, please retry with one of the supported gpu types: %v", gpuType.String(), supportedGpuTypes) } ctx = namespaces.WithNamespace(ctx, namespaces.Default) @@ -113,11 +117,13 @@ func (di *DriverInstaller) InstallGPUDrivers(ctx context.Context) error { if err != nil { return fmt.Errorf("failed to create gpu driver installer container: %v", err) } + defer container.Delete(ctx, containerd.WithSnapshotCleanup) task, err := container.NewTask(ctx, cio.NewCreator(cio.WithStdio)) if err != nil { return fmt.Errorf("failed to create gpu driver installation task: %v", err) } + defer task.Delete(ctx) statusC, err := task.Wait(ctx) if err != nil { @@ -130,12 +136,13 @@ func (di *DriverInstaller) InstallGPUDrivers(ctx context.Context) error { status := <-statusC code, _, _ := status.Result() - di.logger.Printf("Gpu driver installation task exited with status: %d\n", code) - err = remountAsExecutable(gpu.InstallationHostDir) - if err != nil { - return fmt.Errorf("failed to remount the installed drivers: %v", err) + if code != 0 { + di.logger.Printf("Gpu driver installation task ended and returned non-zero status code %d", code) + return fmt.Errorf("gpu driver installation task ended with non-zero status code %d", code) } + + di.logger.Println("Gpu driver installation task exited with status: 0") return nil } diff --git a/launcher/util.go b/launcher/util.go index dc9c5c14..9ec9e625 100644 --- a/launcher/util.go +++ b/launcher/util.go @@ -33,23 +33,18 @@ func FetchImpersonatedToken(ctx context.Context, serviceAccount string, audience } func listFilesWithPrefix(targetDir string, prefix string) ([]string, error) { - targetFiles := make([]string, 0) - - err := filepath.WalkDir(targetDir, func(path string, _ os.DirEntry, err error) error { + var targetFiles []string + err := filepath.WalkDir(targetDir, func(path string, d os.DirEntry, err error) error { if err != nil { - return err + return filepath.SkipDir } - - if strings.HasPrefix(filepath.Base(path), prefix) { + if !d.IsDir() && strings.HasPrefix(filepath.Base(path), prefix) { targetFiles = append(targetFiles, path) } - return nil }) - if err != nil { return nil, fmt.Errorf("error walking directory: %v", err) } - return targetFiles, nil } diff --git a/launcher/util_test.go b/launcher/util_test.go index 55163c06..64c19cba 100644 --- a/launcher/util_test.go +++ b/launcher/util_test.go @@ -104,20 +104,19 @@ func TestListFilesWithPrefix(t *testing.T) { { dir: tmpDir, pattern: "newfile", - want: []string{}, wantErr: false, }, { dir: "otherdir", pattern: "file", want: nil, - wantErr: true, + wantErr: false, }, { dir: "otherdir", pattern: "tmpfile", want: nil, - wantErr: true, + wantErr: false, }, }