Skip to content

Commit

Permalink
feature: add nvidia-container 2.0
Browse files Browse the repository at this point in the history
Signed-off-by: codejuan <xh@decbug.com>
  • Loading branch information
CodeJuan committed Aug 6, 2018
1 parent f76d3bc commit 49a2f4e
Show file tree
Hide file tree
Showing 12 changed files with 341 additions and 5 deletions.
23 changes: 23 additions & 0 deletions apis/swagger.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2479,6 +2479,29 @@ definitions:
x-nullable: false
minimum: 0
maximum: 1
NvidiaConfig:
$ref: "#/definitions/NvidiaConfig"

# NvidiaConfig is an object schema with two non-nullable string properties:
# NvidiaVisibleDevices (which GPUs the container can access) and
# NvidiaDriverCapabilities (which driver libraries/binaries get mounted).
# The "example" fields below hold free-form lists of the accepted values.
NvidiaConfig:
type: "object"
properties:
NvidiaVisibleDevices:
description: "NvidiaVisibleDevices controls which GPUs will be made accessible inside the container"
type: "string"
example: |
Possible values.
0,1,2, GPU-fef8089b …: a comma-separated list of GPU UUID(s) or index(es).
all: all GPUs will be accessible, this is the default value in our container images.
none: no GPU will be accessible, but driver capabilities will be enabled.
x-nullable: false
NvidiaDriverCapabilities:
description: "NvidiaDriverCapabilities controls which driver libraries/binaries will be mounted inside the container"
type: "string"
example: |
Possible values
compute,video, graphics,utility …: a comma-separated list of driver features the container needs.
all: enable all available driver capabilities.
x-nullable: false

ThrottleDevice:
type: "object"
Expand Down
57 changes: 57 additions & 0 deletions apis/types/nvidia_config.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 29 additions & 0 deletions apis/types/resources.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions cli/common_flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,5 +109,9 @@ func addCommonFlags(flagSet *pflag.FlagSet) *container {
// additional runtime spec annotations
flagSet.StringSliceVar(&c.specAnnotation, "annotation", nil, "Additional annotation for runtime")

// nvidia container
flagSet.StringVar(&c.nvidiaDriverCapabilities, "nvidia-capabilities", "", "NvidiaDriverCapabilities controls which driver libraries/binaries will be mounted inside the container")
flagSet.StringVar(&c.nvidiaVisibleDevices, "nvidia-visible-devs", "", "NvidiaVisibleDevices controls which GPUs will be made accessible inside the container")

return c
}
11 changes: 11 additions & 0 deletions cli/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ type container struct {
rich bool
richMode string
initScript string

// nvidia container
nvidiaVisibleDevices string
nvidiaDriverCapabilities string
}

func (c *container) config() (*types.ContainerCreateConfig, error) {
Expand Down Expand Up @@ -269,5 +273,12 @@ func (c *container) config() (*types.ContainerCreateConfig, error) {
NetworkingConfig: networkingConfig,
}

if c.nvidiaDriverCapabilities != "" || c.nvidiaVisibleDevices != "" {
config.HostConfig.Resources.NvidiaConfig = &types.NvidiaConfig{
NvidiaDriverCapabilities: c.nvidiaDriverCapabilities,
NvidiaVisibleDevices: c.nvidiaVisibleDevices,
}
}

return config, nil
}
6 changes: 3 additions & 3 deletions daemon/mgr/container_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@ import (
"strconv"
"strings"

"github.com/opencontainers/selinux/go-selinux/label"
"github.com/pkg/errors"

"github.com/alibaba/pouch/apis/types"
networktypes "github.com/alibaba/pouch/network/types"
"github.com/alibaba/pouch/pkg/errtypes"
"github.com/alibaba/pouch/pkg/meta"
"github.com/alibaba/pouch/pkg/randomid"

"github.com/opencontainers/selinux/go-selinux/label"
"github.com/pkg/errors"
)

// containerID returns the container's id, the parameter 'nameOrPrefix' may be container's
Expand Down
85 changes: 83 additions & 2 deletions daemon/mgr/container_validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@ package mgr

import (
"fmt"
"os"
"strconv"
"strings"

"github.com/sirupsen/logrus"

"github.com/alibaba/pouch/apis/types"
"github.com/alibaba/pouch/daemon/logger/syslog"
"github.com/alibaba/pouch/pkg/system"

"github.com/sirupsen/logrus"
"github.com/alibaba/pouch/pkg/utils"
)

// validateConfig validates container config
Expand All @@ -19,6 +23,10 @@ func (mgr *ContainerManager) validateConfig(c *Container, update bool) ([]string
if err != nil {
return nil, err
}
// validates nvidia config
if err := validateNvidiaConfig(hostConfig); err != nil {
return warnings, err
}
warnings = append(warnings, warns...)

if hostConfig.OomScoreAdj < -1000 || hostConfig.OomScoreAdj > 1000 {
Expand Down Expand Up @@ -188,3 +196,76 @@ func (mgr *ContainerManager) validateLogConfig(c *Container) error {
return fmt.Errorf("not support (%v) log driver yet", logCfg.LogDriver)
}
}

// validateNvidiaConfig checks the nvidia settings carried by hostConfig.
//
// A nil NvidiaConfig means there is nothing to validate and nil is returned.
// Otherwise the driver capabilities are validated first, then the visible
// devices; the first error encountered is returned unchanged.
func validateNvidiaConfig(hostConfig *types.HostConfig) error {
	resources := &hostConfig.Resources
	if resources.NvidiaConfig == nil {
		return nil
	}

	err := validateNvidiaDriver(resources)
	if err != nil {
		return err
	}

	return validateNvidiaDevice(hostConfig)
}

// validateNvidiaDriver validates the NvidiaDriverCapabilities setting of r.
//
// The stored value is trimmed of surrounding whitespace in place. An empty
// value and the keyword "all" are accepted as-is. Anything else is treated as
// a comma-separated list whose entries (each trimmed) must all be known
// capability names; the first unknown entry yields an error.
//
// r.NvidiaConfig is dereferenced without a nil check, so the caller must
// guarantee it is non-nil.
func validateNvidiaDriver(r *types.Resources) error {
	cfg := r.NvidiaConfig
	cfg.NvidiaDriverCapabilities = strings.TrimSpace(cfg.NvidiaDriverCapabilities)

	switch cfg.NvidiaDriverCapabilities {
	case "":
		// use default driver capability: utility
		return nil
	case "all":
		// enable all capabilities
		return nil
	}

	known := []string{"compute", "compat32", "graphics", "utility", "video", "display"}
	for _, name := range strings.Split(cfg.NvidiaDriverCapabilities, ",") {
		name = strings.TrimSpace(name)
		if utils.StringInSlice(known, name) {
			continue
		}
		return fmt.Errorf("invalid nvidia driver capability (%s)", name)
	}

	return nil
}

// validateNvidiaDevice validates the NvidiaVisibleDevices setting of hostConfig.
//
// The stored value is trimmed of surrounding whitespace in place. Accepted
// forms are:
//   - "" (empty) or "void": no GPU will be accessible, and driver
//     capabilities will be disabled;
//   - "none": no GPU will be accessible, but driver capabilities will be
//     enabled;
//   - "all": all GPUs will be accessible;
//   - a comma-separated list of GPU indexes and/or UUIDs. Every numeric entry
//     N must have a matching /dev/nvidiaN node on the host, otherwise an
//     error is returned. Non-numeric entries are presumed to be GPU UUIDs and
//     are not verified yet.
//
// hostConfig.Resources.NvidiaConfig is dereferenced without a nil check, so
// the caller must guarantee it is non-nil.
func validateNvidiaDevice(hostConfig *types.HostConfig) error {
	n := hostConfig.Resources.NvidiaConfig
	n.NvidiaVisibleDevices = strings.TrimSpace(n.NvidiaVisibleDevices)

	// The guard has to look at the device list itself. Testing the driver
	// capabilities here would skip device validation whenever capabilities are
	// left at their (empty) default, letting nonexistent GPU indexes through.
	switch n.NvidiaVisibleDevices {
	case "", "all", "none", "void":
		return nil
	}

	for _, dev := range strings.Split(n.NvidiaVisibleDevices, ",") {
		dev = strings.TrimSpace(dev)
		if _, err := strconv.Atoi(dev); err != nil {
			// Not numeric: presumably a GPU UUID.
			// TODO: how to validate GPU UUID
			continue
		}

		// dev is numeric, the real device should be /dev/nvidiaN
		realDev := fmt.Sprintf("/dev/nvidia%s", dev)
		if _, err := os.Stat(realDev); err != nil {
			return fmt.Errorf("invalid nvidia device %s", realDev)
		}
	}

	return nil
}
24 changes: 24 additions & 0 deletions daemon/mgr/spec_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"sort"
"strconv"
Expand Down Expand Up @@ -65,6 +66,11 @@ func setupHook(ctx context.Context, c *Container, specWrapper *SpecWrapper) erro
return errors.Wrap(err, "failed to set volume mount tab prestart hook")
}

// set nvidia config
if err := setNvidiaHook(ctx, c, specWrapper); err != nil {
return errors.Wrap(err, "failed to set nvidia prestart hook")
}

return nil
}

Expand Down Expand Up @@ -153,6 +159,24 @@ func setMountTab(ctx context.Context, c *Container, spec *SpecWrapper) error {
return nil
}

// setNvidiaHook appends a prestart hook for the nvidia container runtime to
// the spec when the container carries an NvidiaConfig.
//
// Without an NvidiaConfig the spec is left untouched. Otherwise the
// "nvidia-container-runtime-hook" binary is looked up in PATH (a lookup
// failure is returned as-is) and registered as a prestart hook invoked with
// the single argument "prestart".
func setNvidiaHook(ctx context.Context, c *Container, spec *SpecWrapper) error {
	if c.HostConfig.NvidiaConfig == nil {
		return nil
	}

	hookBin, err := exec.LookPath("nvidia-container-runtime-hook")
	if err != nil {
		return err
	}

	// Args[0] is the program path itself, followed by the hook stage.
	hook := specs.Hook{
		Path: hookBin,
		Args: []string{hookBin, "prestart"},
	}
	spec.s.Hooks.Prestart = append(spec.s.Hooks.Prestart, hook)

	return nil
}

type hookArray []*wrapperEmbedPrestart

// Len is defined in order to support sort
Expand Down
14 changes: 14 additions & 0 deletions daemon/mgr/spec_process.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package mgr

import (
"context"
"fmt"
"io/ioutil"
"os"
"strings"
Expand Down Expand Up @@ -61,6 +62,9 @@ func setupProcess(ctx context.Context, c *Container, s *specs.Spec) error {
return err
}

if err := setupNvidiaEnv(ctx, c, s); err != nil {
return err
}
return nil
}

Expand Down Expand Up @@ -163,3 +167,13 @@ func setupRlimits(ctx context.Context, hostConfig *types.HostConfig, s *specs.Sp
s.Process.Rlimits = rlimits
return nil
}

// setupNvidiaEnv exports the container's nvidia settings to the process
// environment of the spec.
//
// When the container has an NvidiaConfig, NVIDIA_DRIVER_CAPABILITIES and
// NVIDIA_VISIBLE_DEVICES are appended (in that order) to s.Process.Env with
// the configured values, even if those values are empty. Without an
// NvidiaConfig the spec is left untouched. It always returns nil.
func setupNvidiaEnv(ctx context.Context, c *Container, s *specs.Spec) error {
	nvidia := c.HostConfig.NvidiaConfig
	if nvidia == nil {
		return nil
	}

	extra := []string{
		fmt.Sprintf("NVIDIA_DRIVER_CAPABILITIES=%s", nvidia.NvidiaDriverCapabilities),
		fmt.Sprintf("NVIDIA_VISIBLE_DEVICES=%s", nvidia.NvidiaVisibleDevices),
	}
	s.Process.Env = append(s.Process.Env, extra...)

	return nil
}
Loading

0 comments on commit 49a2f4e

Please sign in to comment.