Skip to content

Commit

Permalink
Merge branch 'main' of github.com:meetrajvala/go-tpm-tools into gpu-s…
Browse files Browse the repository at this point in the history
…upport
  • Loading branch information
meetrajvala committed Oct 10, 2024
2 parents 0ae2e12 + 6405dbb commit bae1a53
Show file tree
Hide file tree
Showing 8 changed files with 38 additions and 43 deletions.
34 changes: 14 additions & 20 deletions launcher/container_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,20 @@ func NewRunner(ctx context.Context, cdClient *containerd.Client, token oauth2.To
}

if launchSpec.Experiments.EnableGpuDriverInstallation && launchSpec.InstallGpuDriver {
mounts = appendGpuDriverMounts(mounts)
specOpts = append(specOpts, oci.WithMounts(mounts))
gpuMounts := []specs.Mount{
{
Type: "volume",
Source: fmt.Sprintf("%s/lib64", gpu.InstallationHostDir),
Destination: fmt.Sprintf("%s/lib64", gpu.InstallationContainerDir),
Options: []string{"rbind", "rw"},
}, {
Type: "volume",
Source: fmt.Sprintf("%s/bin", gpu.InstallationHostDir),
Destination: fmt.Sprintf("%s/bin", gpu.InstallationContainerDir),
Options: []string{"rbind", "rw"},
},
}
specOpts = append(specOpts, oci.WithMounts(gpuMounts))

gpuDeviceFiles, err := listFilesWithPrefix("/dev", "nvidia")
if err != nil {
Expand Down Expand Up @@ -282,24 +294,6 @@ func appendTokenMounts(mounts []specs.Mount) []specs.Mount {
return append(mounts, m)
}

// appendGpuDriverMounts returns mounts extended with the mount specs for the
// GPU driver directories. Two entries are added, in this order: the lib64
// subdirectory and the bin subdirectory of gpu.InstallationHostDir, each
// mapped to the same subdirectory under gpu.InstallationContainerDir with the
// "rbind" and "rw" options.
func appendGpuDriverMounts(mounts []specs.Mount) []specs.Mount {
	// Driver libraries.
	libMount := specs.Mount{
		Type:        "volume",
		Source:      fmt.Sprintf("%s/lib64", gpu.InstallationHostDir),
		Destination: fmt.Sprintf("%s/lib64", gpu.InstallationContainerDir),
		Options:     []string{"rbind", "rw"},
	}

	// Driver binaries.
	binMount := specs.Mount{
		Type:        "volume",
		Source:      fmt.Sprintf("%s/bin", gpu.InstallationHostDir),
		Destination: fmt.Sprintf("%s/bin", gpu.InstallationContainerDir),
		Options:     []string{"rbind", "rw"},
	}

	return append(mounts, libMount, binMount)
}

func (r *ContainerRunner) measureCELEvents(ctx context.Context) error {
if err := r.measureContainerClaims(ctx); err != nil {
return fmt.Errorf("failed to measure container claims: %v", err)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,19 @@ steps:
- name: 'gcr.io/cloud-builders/gcloud'
id: SingleGpuWorkloadTest
entrypoint: 'bash'
args: ['scripts/test_gpu_workload.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}', '${_ZONE}']
args: ['scripts/gpu/test_gpu_workload.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}', '${_ZONE}']
- name: 'gcr.io/cloud-builders/gcloud'
id: MultipleGpuWorkloadTest
entrypoint: 'bash'
args: ['scripts/test_gpu_workload.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-mul', '${_ZONE}']
args: ['scripts/gpu/test_gpu_workload.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-mul', '${_ZONE}']
- name: 'gcr.io/cloud-builders/gcloud'
id: UnsupportedGpuWorkloadTest
entrypoint: 'bash'
args: ['scripts/test_gpu_unsupported_gputype.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-unsup', '${_ZONE}']
args: ['scripts/gpu/test_gpu_unsupported_gputype.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-unsup', '${_ZONE}']
- name: 'gcr.io/cloud-builders/gcloud'
id: NoGpuWorkloadTest
entrypoint: 'bash'
args: ['scripts/test_gpu_nogpu.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-nogpu', '${_ZONE}']
args: ['scripts/gpu/test_gpu_nogpu.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-nogpu', '${_ZONE}']
- name: 'gcr.io/cloud-builders/gcloud'
id: SingleGpuCleanUp
entrypoint: 'bash'
Expand Down
21 changes: 14 additions & 7 deletions launcher/internal/gpu/driverinstaller.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ const (
installerSnapshotID = "tee-gpu-driver-installer-snapshot"
)

// SupportedGpuTypes is the list of supported gpu types with open sourced nvidia kernel modules.
var SupportedGpuTypes = []deviceinfo.GPUType{
var supportedGpuTypes = []deviceinfo.GPUType{
deviceinfo.L4,
deviceinfo.T4,
deviceinfo.A100_40GB,
Expand Down Expand Up @@ -53,13 +52,18 @@ func NewDriverInstaller(cdClient *containerd.Client, launchSpec spec.LaunchSpec,
// https://pkg.go.dev/cos.googlesource.com/cos/tools.git@v0.0.0-20241008015903-8431fe581b1f/src/cmd/cos_gpu_installer#section-readme
// The README specifies a docker command, whereas this function uses containerd to launch and manage the gpu driver installer container.
func (di *DriverInstaller) InstallGPUDrivers(ctx context.Context) error {
err := remountAsExecutable(InstallationHostDir)
if err != nil {
return fmt.Errorf("failed to remount the installation directory: %v", err)
}

gpuType, err := deviceinfo.GetGPUTypeInfo()
if err != nil {
return fmt.Errorf("failed to get the gpu type info: %v", err)
}

if !gpuType.OpenSupported() {
return fmt.Errorf("unsupported gpu type %s, please retry with one of the supported gpu types: %v", gpuType.String(), gpu.SupportedGpuTypes)
return fmt.Errorf("unsupported gpu type %s, please retry with one of the supported gpu types: %v", gpuType.String(), supportedGpuTypes)
}

ctx = namespaces.WithNamespace(ctx, namespaces.Default)
Expand Down Expand Up @@ -113,11 +117,13 @@ func (di *DriverInstaller) InstallGPUDrivers(ctx context.Context) error {
if err != nil {
return fmt.Errorf("failed to create gpu driver installer container: %v", err)
}
defer container.Delete(ctx, containerd.WithSnapshotCleanup)

task, err := container.NewTask(ctx, cio.NewCreator(cio.WithStdio))
if err != nil {
return fmt.Errorf("failed to create gpu driver installation task: %v", err)
}
defer task.Delete(ctx)

statusC, err := task.Wait(ctx)
if err != nil {
Expand All @@ -130,12 +136,13 @@ func (di *DriverInstaller) InstallGPUDrivers(ctx context.Context) error {

status := <-statusC
code, _, _ := status.Result()
di.logger.Printf("Gpu driver installation task exited with status: %d\n", code)

err = remountAsExecutable(gpu.InstallationHostDir)
if err != nil {
return fmt.Errorf("failed to remount the installed drivers: %v", err)
if code != 0 {
di.logger.Printf("Gpu driver installation task ended and returned non-zero status code %d", code)
return fmt.Errorf("gpu driver installation task ended with non-zero status code %d", code)
}

di.logger.Println("Gpu driver installation task exited with status: 0")
return nil
}

Expand Down
13 changes: 4 additions & 9 deletions launcher/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,23 +33,18 @@ func FetchImpersonatedToken(ctx context.Context, serviceAccount string, audience
}

func listFilesWithPrefix(targetDir string, prefix string) ([]string, error) {
targetFiles := make([]string, 0)

err := filepath.WalkDir(targetDir, func(path string, _ os.DirEntry, err error) error {
var targetFiles []string
err := filepath.WalkDir(targetDir, func(path string, d os.DirEntry, err error) error {
if err != nil {
return err
return filepath.SkipDir
}

if strings.HasPrefix(filepath.Base(path), prefix) {
if !d.IsDir() && strings.HasPrefix(filepath.Base(path), prefix) {
targetFiles = append(targetFiles, path)
}

return nil
})

if err != nil {
return nil, fmt.Errorf("error walking directory: %v", err)
}

return targetFiles, nil
}
5 changes: 2 additions & 3 deletions launcher/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,20 +104,19 @@ func TestListFilesWithPrefix(t *testing.T) {
{
dir: tmpDir,
pattern: "newfile",
want: []string{},
wantErr: false,
},
{
dir: "otherdir",
pattern: "file",
want: nil,
wantErr: true,
wantErr: false,
},
{
dir: "otherdir",
pattern: "tmpfile",
want: nil,
wantErr: true,
wantErr: false,
},
}

Expand Down

0 comments on commit bae1a53

Please sign in to comment.