Skip to content

Commit

Permalink
Add ffmpeg GPU test with h264_nvenc video codec (which uses NVENC).
Browse files Browse the repository at this point in the history
This test does NOT work yet in gVisor.

Updates #9452

PiperOrigin-RevId: 670751228
  • Loading branch information
EtiennePerot authored and gvisor-bot committed Sep 9, 2024
1 parent 905d769 commit 0ede98b
Show file tree
Hide file tree
Showing 17 changed files with 327 additions and 113 deletions.
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ dev: $(RUNTIME_BIN) ## Installs a set of local runtimes. Requires sudo.
@$(call configure_noreload,$(RUNTIME)-p,--net-raw --profile)
@$(call configure_noreload,$(RUNTIME)-cgroup-d,--net-raw --debug --strace --log-packets --cgroupfs)
@$(call configure_noreload,$(RUNTIME)-systemd-d,--net-raw --debug --strace --log-packets --systemd-cgroup)
@$(call configure_noreload,$(RUNTIME)-gpu,--nvproxy)
@$(call reload_docker)
.PHONY: dev

Expand Down Expand Up @@ -298,7 +299,7 @@ cos-gpu-smoke-tests: gpu-smoke-images $(RUNTIME_BIN)
# This is a superset of those needed for smoke tests.
# It includes non-GPU images that are used as part of GPU tests,
# e.g. busybox and python.
gpu-images: gpu-smoke-images load-gpu_pytorch load-gpu_ollama load-gpu_ollama_client load-basic_busybox load-basic_python load-gpu_stable-diffusion-xl load-gpu_vllm load-gpu_nccl-tests
gpu-images: gpu-smoke-images load-gpu_pytorch load-gpu_ollama load-gpu_ollama_client load-basic_busybox load-basic_python load-gpu_stable-diffusion-xl load-gpu_vllm load-gpu_nccl-tests load-benchmarks_ffmpeg
.PHONY: gpu-images

gpu-all-tests: gpu-images gpu-smoke-tests $(RUNTIME_BIN)
Expand Down
14 changes: 14 additions & 0 deletions pkg/test/dockerutil/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,17 @@ package(
licenses = ["notice"],
)

# We copy the `run_sniffer` binary here because `go:embed` can only embed
# from the current directory or subdirectories, not parents of it.
genrule(
name = "run_sniffer_bin",
srcs = [
"//tools/ioctl_sniffer:run_sniffer",
],
outs = ["run_sniffer_copy"],
cmd = "cat < $(SRCS) > $@",
)

go_library(
name = "dockerutil",
testonly = 1,
Expand All @@ -16,6 +27,9 @@ go_library(
"network.go",
"profile.go",
],
embedsrcs = [
":run_sniffer_bin", # keep
],
visibility = ["//:sandbox"],
deps = [
"//pkg/sync",
Expand Down
66 changes: 56 additions & 10 deletions pkg/test/dockerutil/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ type RunOpts struct {
DeviceRequests []container.DeviceRequest

Devices []container.DeviceMapping

// sniffGPUOpts, if set, sets the rules for GPU sniffing during this test.
// Must be set via `RunOpts.SniffGPU`.
sniffGPUOpts *SniffGPUOpts
}

func makeContainer(ctx context.Context, logger testutil.Logger, runtime string) *Container {
Expand Down Expand Up @@ -164,7 +168,11 @@ func MakeNativeContainer(ctx context.Context, logger testutil.Logger) *Container

// Spawn is analogous to 'docker run -d'.
func (c *Container) Spawn(ctx context.Context, r RunOpts, args ...string) error {
if err := c.create(ctx, r.Image, c.config(r, args), c.hostConfig(r), nil); err != nil {
cfg, err := c.config(ctx, r, args)
if err != nil {
return fmt.Errorf("container config: %w", err)
}
if err := c.create(ctx, r.Image, cfg, c.hostConfig(r), nil); err != nil {
return err
}
return c.Start(ctx)
Expand All @@ -173,7 +181,10 @@ func (c *Container) Spawn(ctx context.Context, r RunOpts, args ...string) error
// SpawnProcess is analogous to 'docker run -it'. It returns a process
// which represents the root process.
func (c *Container) SpawnProcess(ctx context.Context, r RunOpts, args ...string) (Process, error) {
config, hostconf, netconf := c.ConfigsFrom(r, args...)
config, hostconf, netconf, err := c.ConfigsFrom(ctx, r, args...)
if err != nil {
return Process{}, fmt.Errorf("container config: %w", err)
}
config.Tty = true
config.OpenStdin = true

Expand Down Expand Up @@ -204,7 +215,11 @@ func (c *Container) SpawnProcess(ctx context.Context, r RunOpts, args ...string)

// Run is analogous to 'docker run'.
func (c *Container) Run(ctx context.Context, r RunOpts, args ...string) (string, error) {
if err := c.create(ctx, r.Image, c.config(r, args), c.hostConfig(r), nil); err != nil {
cfg, err := c.config(ctx, r, args)
if err != nil {
return "", fmt.Errorf("container config: %w", err)
}
if err := c.create(ctx, r.Image, cfg, c.hostConfig(r), nil); err != nil {
return "", err
}

Expand All @@ -223,8 +238,12 @@ func (c *Container) Run(ctx context.Context, r RunOpts, args ...string) (string,

// ConfigsFrom returns container configs from RunOpts and args. The caller should call 'CreateFrom'
// and Start.
func (c *Container) ConfigsFrom(r RunOpts, args ...string) (*container.Config, *container.HostConfig, *network.NetworkingConfig) {
return c.config(r, args), c.hostConfig(r), &network.NetworkingConfig{}
func (c *Container) ConfigsFrom(ctx context.Context, r RunOpts, args ...string) (*container.Config, *container.HostConfig, *network.NetworkingConfig, error) {
cfg, err := c.config(ctx, r, args)
if err != nil {
return nil, nil, nil, fmt.Errorf("container config: %w", err)
}
return cfg, c.hostConfig(r), &network.NetworkingConfig{}, nil
}

// MakeLink formats a link to add to a RunOpts.
Expand All @@ -239,7 +258,11 @@ func (c *Container) CreateFrom(ctx context.Context, profileImage string, conf *c

// Create is analogous to 'docker create'.
func (c *Container) Create(ctx context.Context, r RunOpts, args ...string) error {
return c.create(ctx, r.Image, c.config(r, args), c.hostConfig(r), nil)
cfg, err := c.config(ctx, r, args)
if err != nil {
return fmt.Errorf("container config: %w", err)
}
return c.create(ctx, r.Image, cfg, c.hostConfig(r), nil)
}

func (c *Container) create(ctx context.Context, profileImage string, conf *container.Config, hostconf *container.HostConfig, netconf *network.NetworkingConfig) error {
Expand Down Expand Up @@ -271,23 +294,46 @@ func (c *Container) create(ctx context.Context, profileImage string, conf *conta
return nil
}

func (c *Container) config(r RunOpts, args []string) *container.Config {
func (c *Container) config(ctx context.Context, r RunOpts, args []string) (*container.Config, error) {
ports := nat.PortSet{}
for _, p := range r.Ports {
port := nat.Port(fmt.Sprintf("%d", p))
ports[port] = struct{}{}
}
env := append(r.Env, fmt.Sprintf("RUNSC_TEST_NAME=%s", c.Name))

image := testutil.ImageByName(r.Image)
entrypoint := r.Entrypoint
if r.sniffGPUOpts != nil {
c.cleanups = append(c.cleanups, func() {
r.sniffGPUOpts.cleanup()
})
if len(entrypoint) == 0 && len(args) == 0 {
// Need to look up the image's default entrypoint/args so we can prepend to them.
// If we don't, then we will end up overwriting them.
imageInfo, _, err := c.client.ImageInspectWithRaw(ctx, image)
if err != nil {
return nil, fmt.Errorf("cannot inspect image %q: %w", image, err)
}
entrypoint = []string(imageInfo.Config.Entrypoint)
args = []string(imageInfo.Config.Cmd)
}
if len(entrypoint) != 0 {
entrypoint = r.sniffGPUOpts.prepend(entrypoint)
} else {
args = r.sniffGPUOpts.prepend(args)
}
}

return &container.Config{
Image: testutil.ImageByName(r.Image),
Image: image,
Cmd: args,
Entrypoint: r.Entrypoint,
Entrypoint: entrypoint,
ExposedPorts: ports,
Env: env,
WorkingDir: r.WorkDir,
User: r.User,
}
}, nil
}

func (c *Container) hostConfig(r RunOpts) *container.HostConfig {
Expand Down
118 changes: 108 additions & 10 deletions pkg/test/dockerutil/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,30 +22,78 @@ import (

"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/mount"

// Needed for go:embed
_ "embed"
)

// Flags.
var (
setCOSGPU = flag.Bool("cos-gpu", false, "set to configure GPU settings for COS, as opposed to Docker")
)

// AllGPUCapabilities is the environment variable that enables all NVIDIA GPU
// capabilities within a container.
const AllGPUCapabilities = "NVIDIA_DRIVER_CAPABILITIES=all"
//go:embed run_sniffer_copy
var runSnifferBinary []byte

const (
// ioctlSnifferMountPath is the in-container path at which the ioctl sniffer
// binary is bind-mounted. It is also the first element of the sniffer command
// line that SniffGPUOpts.prepend places in front of the container's command.
ioctlSnifferMountPath = "/ioctl_sniffer"
)

const (
// AllGPUCapabilities is the `NAME=value` environment entry that enables all
// NVIDIA GPU capabilities within a container.
AllGPUCapabilities = "NVIDIA_DRIVER_CAPABILITIES=all"

// DefaultGPUCapabilities is the `NAME=value` environment entry that enables
// the default NVIDIA GPU capabilities (compute and utility) within a
// container. SniffGPUOpts.GPUCapabilities falls back to it when no
// capabilities were specified.
DefaultGPUCapabilities = "NVIDIA_DRIVER_CAPABILITIES=compute,utility"
)

// GPURunOpts returns Docker run options with GPU support enabled.
func GPURunOpts() RunOpts {
func GPURunOpts(sniffGPUOpts SniffGPUOpts) (RunOpts, error) {
var mounts []mount.Mount
if sniffGPUOpts.DisableSnifferReason == "" {
// Extract the sniffer binary to a temporary location.
runSniffer, err := os.CreateTemp("", "run_sniffer.*")
if err != nil {
return RunOpts{}, fmt.Errorf("failed to create temporary file: %w", err)
}
if _, err := runSniffer.Write(runSnifferBinary); err != nil {
return RunOpts{}, fmt.Errorf("failed to write to temporary file: %w", err)
}
if err := runSniffer.Sync(); err != nil {
return RunOpts{}, fmt.Errorf("failed to sync temporary file: %w", err)
}
if err := runSniffer.Chmod(0o555); err != nil {
return RunOpts{}, fmt.Errorf("failed to chmod temporary file: %w", err)
}
if err := runSniffer.Close(); err != nil {
return RunOpts{}, fmt.Errorf("failed to close temporary file: %w", err)
}
sniffGPUOpts.runSniffer = runSniffer
mounts = append(mounts, mount.Mount{
Source: runSniffer.Name(),
Target: ioctlSnifferMountPath,
Type: mount.TypeBind,
ReadOnly: true,
})
}
gpuEnv := []string{sniffGPUOpts.GPUCapabilities()}

if !*setCOSGPU {
return RunOpts{
Env: []string{AllGPUCapabilities},
Env: gpuEnv,
DeviceRequests: []container.DeviceRequest{
{
Count: -1,
Capabilities: [][]string{{"gpu"}},
Options: map[string]string{},
},
},
}
Mounts: mounts,
sniffGPUOpts: &sniffGPUOpts,
}, nil
}

// COS has specific settings since it has a custom installer for GPU drivers.
Expand All @@ -68,7 +116,6 @@ func GPURunOpts() RunOpts {
})
}

var mounts []mount.Mount
for _, nvidiaBin := range []string{
"/home/kubernetes/bin/nvidia/bin",
"/var/lib/nvidia/bin",
Expand Down Expand Up @@ -97,10 +144,61 @@ func GPURunOpts() RunOpts {
}

return RunOpts{
Env: []string{AllGPUCapabilities},
Mounts: mounts,
Devices: devices,
Env: gpuEnv,
Mounts: mounts,
Devices: devices,
sniffGPUOpts: &sniffGPUOpts,
}, nil
}

// SniffGPUOpts configures ioctl sniffing of GPU workloads, along with the GPU
// capabilities exposed to the container running them.
type SniffGPUOpts struct {
// If set, explains why the sniffer should be disabled for this test.
// If unset or empty, the sniffer is enabled.
DisableSnifferReason string

// If true, the test will not fail even when the workload calls incompatible
// ioctls. Useful for debugging.
// TODO(b/340955577): Should be converted to a flag and removed from this
// struct once all GPU tests have no incompatible ioctls.
AllowIncompatibleIoctl bool

// The set of GPU capabilities exposed to the container.
// If unset, defaults to `DefaultGPUCapabilities`.
// The value is used verbatim as a container environment entry, so it must
// be a full `NAME=value` string like `AllGPUCapabilities`, not a bare
// capability list.
Capabilities string

// The fields below are set internally.

// runSniffer is the temporary host file holding the extracted sniffer
// binary. GPURunOpts sets it only when the sniffer is enabled, so it is nil
// when DisableSnifferReason is non-empty.
runSniffer *os.File
}

// GPUCapabilities reports the GPU capabilities setting to expose to the
// container: the explicitly configured Capabilities value if there is one,
// and DefaultGPUCapabilities otherwise.
func (sgo *SniffGPUOpts) GPUCapabilities() string {
	if configured := sgo.Capabilities; configured != "" {
		return configured
	}
	return DefaultGPUCapabilities
}

// prepend wraps the given command line with the ioctl sniffer: the result is
// the sniffer's own path and flags followed by argv. When the sniffer is
// disabled, argv is returned untouched.
func (sgo *SniffGPUOpts) prepend(argv []string) []string {
	if sgo.DisableSnifferReason != "" {
		return argv
	}
	enforce := !sgo.AllowIncompatibleIoctl
	wrapped := make([]string, 0, 3+len(argv))
	wrapped = append(
		wrapped,
		ioctlSnifferMountPath,
		"--verbose=true",
		fmt.Sprintf("--enforce_compatibility=%t", enforce),
		// TODO(eperot): Add flag to enforce capability set here once implemented.
	)
	return append(wrapped, argv...)
}

// cleanup removes the temporary host copy of the sniffer binary, if one was
// extracted.
//
// It is a no-op when runSniffer is nil. That is the case whenever the sniffer
// is disabled via DisableSnifferReason, because the binary is only extracted
// for enabled sniffers while cleanup is registered regardless. Without this
// guard, calling Name on the nil file would panic.
//
// Returns an error if the temporary file cannot be unlinked.
func (sgo *SniffGPUOpts) cleanup() error {
	if sgo.runSniffer == nil {
		return nil
	}
	path := sgo.runSniffer.Name()
	if err := os.Remove(path); err != nil {
		return fmt.Errorf("failed to unlink temporary file %q: %w", path, err)
	}
	return nil
}

// NumGPU crudely estimates the number of NVIDIA GPUs on the host.
Expand Down
33 changes: 15 additions & 18 deletions test/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ go_test(
)

go_test(
name = "imagegen_test",
srcs = ["imagegen_test.go"],
name = "ffmpeg_test",
srcs = ["ffmpeg_test.go"],
# runsc is needed to invalidate the bazel cache in case of any code changes.
data = ["//runsc"],
tags = [
Expand All @@ -100,36 +100,33 @@ go_test(
"notap",
],
visibility = ["//:sandbox"],
deps = ["//test/gpu/stablediffusion"],
deps = ["//pkg/test/dockerutil"],
)

# We copy the `run_sniffer` binary here because `go:embed` can only embed
# from the current directory or subdirectories, not parents of it.
genrule(
name = "run_sniffer_copy",
srcs = [
"//tools/ioctl_sniffer:run_sniffer",
go_test(
name = "imagegen_test",
srcs = ["imagegen_test.go"],
# runsc is needed to invalidate the bazel cache in case of any code changes.
data = ["//runsc"],
tags = [
"manual",
"noguitar",
"notap",
],
outs = ["run_sniffer_copy"],
cmd = "cat < $(SRCS) > $@",
visibility = ["//:sandbox"],
deps = ["//test/gpu/stablediffusion"],
)

go_test(
name = "sniffer_test",
srcs = ["sniffer_test.go"],
embedsrcs = [
":run_sniffer_copy", # keep
],
tags = [
"manual",
"noguitar",
"notap",
],
visibility = ["//:sandbox"],
deps = [
"//pkg/test/dockerutil",
"@com_github_docker_docker//api/types/mount:go_default_library",
],
deps = ["//pkg/test/dockerutil"],
)

go_test(
Expand Down
Loading

0 comments on commit 0ede98b

Please sign in to comment.