Skip to content

Commit

Permalink
Merge pull request #2029 from CodeJuan/nvidia2
Browse files Browse the repository at this point in the history
feature: support nvidia-container 2.0 to enable GPU access
  • Loading branch information
allencloud authored Aug 10, 2018
2 parents 7a555f1 + cf12087 commit a77e0da
Show file tree
Hide file tree
Showing 12 changed files with 462 additions and 0 deletions.
23 changes: 23 additions & 0 deletions apis/swagger.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2483,6 +2483,29 @@ definitions:
x-nullable: false
minimum: 0
maximum: 1
NvidiaConfig:
$ref: "#/definitions/NvidiaConfig"

NvidiaConfig:
type: "object"
properties:
NvidiaVisibleDevices:
description: "NvidiaVisibleDevices controls which GPUs will be made accessible inside the container"
type: "string"
example: |
Possible values.
0,1,2, GPU-fef8089b …: a comma-separated list of GPU UUID(s) or index(es).
all: all GPUs will be accessible, this is the default value in our container images.
none: no GPU will be accessible, but driver capabilities will be enabled.
x-nullable: false
NvidiaDriverCapabilities:
description: "NvidiaDriverCapabilities controls which driver libraries/binaries will be mounted inside the container"
type: "string"
example: |
Possible values
compute,video, graphics,utility …: a comma-separated list of driver features the container needs.
all: enable all available driver capabilities.
x-nullable: false

ThrottleDevice:
type: "object"
Expand Down
57 changes: 57 additions & 0 deletions apis/types/nvidia_config.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 29 additions & 0 deletions apis/types/resources.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions cli/common_flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,5 +109,9 @@ func addCommonFlags(flagSet *pflag.FlagSet) *container {
// additional runtime spec annotations
flagSet.StringSliceVar(&c.specAnnotation, "annotation", nil, "Additional annotation for runtime")

// nvidia container
flagSet.StringVar(&c.nvidiaDriverCapabilities, "nvidia-capabilities", "", "NvidiaDriverCapabilities controls which driver libraries/binaries will be mounted inside the container")
flagSet.StringVar(&c.nvidiaVisibleDevices, "nvidia-visible-devs", "", "NvidiaVisibleDevices controls which GPUs will be made accessible inside the container")

return c
}
11 changes: 11 additions & 0 deletions cli/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ type container struct {
rich bool
richMode string
initScript string

// nvidia container
nvidiaVisibleDevices string
nvidiaDriverCapabilities string
}

func (c *container) config() (*types.ContainerCreateConfig, error) {
Expand Down Expand Up @@ -269,5 +273,12 @@ func (c *container) config() (*types.ContainerCreateConfig, error) {
NetworkingConfig: networkingConfig,
}

if c.nvidiaDriverCapabilities != "" || c.nvidiaVisibleDevices != "" {
config.HostConfig.Resources.NvidiaConfig = &types.NvidiaConfig{
NvidiaDriverCapabilities: c.nvidiaDriverCapabilities,
NvidiaVisibleDevices: c.nvidiaVisibleDevices,
}
}

return config, nil
}
93 changes: 93 additions & 0 deletions daemon/mgr/container_validation.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
package mgr

import (
"errors"
"fmt"
"os"
"strconv"
"strings"

"github.com/alibaba/pouch/apis/types"
"github.com/alibaba/pouch/daemon/logger/syslog"
Expand All @@ -10,6 +14,20 @@ import (
"github.com/sirupsen/logrus"
)

var (
	// supportedDevices is the set of keyword values accepted for
	// NvidiaVisibleDevices in addition to an explicit comma-separated list
	// of GPU indexes or UUIDs:
	//   all:  all GPUs will be accessible, this is the default value in our container images.
	//   none: no GPU will be accessible, but driver capabilities will be enabled.
	//   void: no GPU will be accessible, and driver capabilities will be disabled
	//         (same effect as leaving the value empty).
	supportedDevices = map[string]struct{}{"all": {}, "none": {}, "void": {}}

	// supportedDrivers is the set of individual driver capabilities accepted
	// as entries of the comma-separated NvidiaDriverCapabilities value.
	// The special value "all" is not listed here; it is handled separately
	// by validateNvidiaDriver.
	supportedDrivers = map[string]struct{}{"compute": {}, "compat32": {}, "graphics": {}, "utility": {}, "video": {}, "display": {}}

	// errInvalidDevice is returned when NvidiaVisibleDevices fails validation.
	errInvalidDevice = errors.New("invalid nvidia device")
	// errInvalidDriver is returned when NvidiaDriverCapabilities fails validation.
	errInvalidDriver = errors.New("invalid nvidia driver capability")
)

// validateConfig validates container config
func (mgr *ContainerManager) validateConfig(c *Container, update bool) ([]string, error) {
// validates container hostconfig
Expand All @@ -19,6 +37,10 @@ func (mgr *ContainerManager) validateConfig(c *Container, update bool) ([]string
if err != nil {
return nil, err
}
// validates nvidia config
if err := validateNvidiaConfig(&hostConfig.Resources); err != nil {
return warnings, err
}
warnings = append(warnings, warns...)

if hostConfig.OomScoreAdj < -1000 || hostConfig.OomScoreAdj > 1000 {
Expand Down Expand Up @@ -188,3 +210,74 @@ func (mgr *ContainerManager) validateLogConfig(c *Container) error {
return fmt.Errorf("not support (%v) log driver yet", logCfg.LogDriver)
}
}

// validateNvidiaConfig validates the nvidia settings carried by r.
// A nil NvidiaConfig means nothing was requested and is always valid.
// Otherwise the driver capabilities are checked first, then the visible
// devices; the first error encountered is returned.
func validateNvidiaConfig(r *types.Resources) error {
	if r.NvidiaConfig == nil {
		return nil
	}

	// Driver capabilities are checked before devices, so an invalid
	// capability is reported in preference to an invalid device.
	if err := validateNvidiaDriver(r); err != nil {
		return err
	}
	return validateNvidiaDevice(r)
}

// validateNvidiaDriver validates r.NvidiaConfig.NvidiaDriverCapabilities.
//
// The stored value is whitespace-trimmed in place before being checked.
// An empty value and the literal "all" are accepted as they are. Any other
// value is treated as a comma-separated list in which every entry (after
// trimming surrounding whitespace) must be a key of supportedDrivers;
// otherwise errInvalidDriver is returned.
func validateNvidiaDriver(r *types.Resources) error {
	cfg := r.NvidiaConfig
	cfg.NvidiaDriverCapabilities = strings.TrimSpace(cfg.NvidiaDriverCapabilities)

	switch cfg.NvidiaDriverCapabilities {
	case "", "all":
		// "":    use default driver capability: utility
		// "all": enable all capabilities
		return nil
	}

	for _, capability := range strings.Split(cfg.NvidiaDriverCapabilities, ",") {
		if _, ok := supportedDrivers[strings.TrimSpace(capability)]; !ok {
			return errInvalidDriver
		}
	}
	return nil
}

// validateNvidiaDevice validates r.NvidiaConfig.NvidiaVisibleDevices.
//
// The stored value is whitespace-trimmed in place before being checked.
// Accepted values are:
//   - the empty string (no GPU will be accessible, and driver capabilities
//     will be disabled);
//   - one of the keywords in supportedDevices;
//   - a comma-separated list of GPU indexes and/or UUIDs, e.g.
//     "0,1,2" or "GPU-fef8089b".
//
// In a list, every entry must be non-empty. A numeric entry N must
// correspond to an existing /dev/nvidiaN device node. Non-numeric entries
// are assumed to be GPU UUIDs and are currently accepted without further
// checks. errInvalidDevice is returned for any entry that fails validation.
func validateNvidiaDevice(r *types.Resources) error {
	n := r.NvidiaConfig
	n.NvidiaVisibleDevices = strings.TrimSpace(n.NvidiaVisibleDevices)

	if n.NvidiaVisibleDevices == "" {
		// no GPU will be accessible, and driver capabilities will be disabled.
		return nil
	}

	if _, found := supportedDevices[n.NvidiaVisibleDevices]; found {
		return nil
	}

	// 0,1,2, GPU-fef8089b …: a comma-separated list of GPU UUID(s) or index(es).
	for _, dev := range strings.Split(n.NvidiaVisibleDevices, ",") {
		dev = strings.TrimSpace(dev)
		if dev == "" {
			// An empty entry ("0,,1", "0,", ",") is neither an index nor a
			// UUID; reject it instead of letting it slip through as a UUID.
			return errInvalidDevice
		}

		if _, err := strconv.Atoi(dev); err != nil {
			// Not numeric: treated as a GPU UUID.
			// TODO: how to validate GPU UUID
			continue
		}

		// dev is numeric, the real device should be /dev/nvidiaN
		if _, err := os.Stat("/dev/nvidia" + dev); err != nil {
			return errInvalidDevice
		}
	}
	return nil
}
Loading

0 comments on commit a77e0da

Please sign in to comment.