diff --git a/apis/swagger.yml b/apis/swagger.yml
index 8f17289270..d3ce61822c 100644
--- a/apis/swagger.yml
+++ b/apis/swagger.yml
@@ -2479,6 +2479,29 @@ definitions:
         x-nullable: false
         minimum: 0
         maximum: 1
+      NvidiaConfig:
+        $ref: "#/definitions/NvidiaConfig"
+
+  NvidiaConfig:
+    type: "object"
+    properties:
+      NvidiaVisibleDevices:
+        description: "NvidiaVisibleDevices controls which GPUs will be made accessible inside the container"
+        type: "string"
+        example: |
+          Possible values.
+          0,1,2, GPU-fef8089b …: a comma-separated list of GPU UUID(s) or index(es).
+          all: all GPUs will be accessible, this is the default value in our container images.
+          none: no GPU will be accessible, but driver capabilities will be enabled.
+        x-nullable: false
+      NvidiaDriverCapabilities:
+        description: "NvidiaDriverCapabilities controls which driver libraries/binaries will be mounted inside the container"
+        type: "string"
+        example: |
+          Possible values
+          compute,video, graphics,utility …: a comma-separated list of driver features the container needs.
+          all: enable all available driver capabilities.
+        x-nullable: false
 
   ThrottleDevice:
     type: "object"
diff --git a/apis/types/nvidia_config.go b/apis/types/nvidia_config.go
new file mode 100644
index 0000000000..ab018bf9ef
--- /dev/null
+++ b/apis/types/nvidia_config.go
@@ -0,0 +1,57 @@
+// Code generated by go-swagger; DO NOT EDIT.
+
+package types
+
+// This file was generated by the swagger tool.
+// Editing this file might prove futile when you re-run the swagger generate command
+
+import (
+	strfmt "github.com/go-openapi/strfmt"
+
+	"github.com/go-openapi/errors"
+	"github.com/go-openapi/swag"
+)
+
+// NvidiaConfig nvidia config
+// swagger:model NvidiaConfig
+
+type NvidiaConfig struct {
+
+	// NvidiaDriverCapabilities controls which driver libraries/binaries will be mounted inside the container
+	NvidiaDriverCapabilities string `json:"NvidiaDriverCapabilities,omitempty"`
+
+	// NvidiaVisibleDevices controls which GPUs will be made accessible inside the container
+	NvidiaVisibleDevices string `json:"NvidiaVisibleDevices,omitempty"`
+}
+
+/* polymorph NvidiaConfig NvidiaDriverCapabilities false */
+
+/* polymorph NvidiaConfig NvidiaVisibleDevices false */
+
+// Validate validates this nvidia config
+func (m *NvidiaConfig) Validate(formats strfmt.Registry) error {
+	var res []error
+
+	if len(res) > 0 {
+		return errors.CompositeValidationError(res...)
+	}
+	return nil
+}
+
+// MarshalBinary interface implementation
+func (m *NvidiaConfig) MarshalBinary() ([]byte, error) {
+	if m == nil {
+		return nil, nil
+	}
+	return swag.WriteJSON(m)
+}
+
+// UnmarshalBinary interface implementation
+func (m *NvidiaConfig) UnmarshalBinary(b []byte) error {
+	var res NvidiaConfig
+	if err := swag.ReadJSON(b, &res); err != nil {
+		return err
+	}
+	*m = res
+	return nil
+}
diff --git a/apis/types/resources.go b/apis/types/resources.go
index de01a9aa96..18dbb9d900 100644
--- a/apis/types/resources.go
+++ b/apis/types/resources.go
@@ -164,6 +164,9 @@ type Resources struct {
 	// Required: true
 	NanoCpus int64 `json:"NanoCpus"`
 
+	// nvidia config
+	NvidiaConfig *NvidiaConfig `json:"NvidiaConfig,omitempty"`
+
 	// Disable OOM Killer for the container.
 	// Required: true
 	OomKillDisable *bool `json:"OomKillDisable"`
@@ -244,6 +247,8 @@ type Resources struct {
 
 /* polymorph Resources NanoCpus false */
 
+/* polymorph Resources NvidiaConfig false */
+
 /* polymorph Resources OomKillDisable false */
 
 /* polymorph Resources PidsLimit false */
@@ -406,6 +411,11 @@ func (m *Resources) Validate(formats strfmt.Registry) error {
 		res = append(res, err)
 	}
 
+	if err := m.validateNvidiaConfig(formats); err != nil {
+		// prop
+		res = append(res, err)
+	}
+
 	if err := m.validateOomKillDisable(formats); err != nil {
 		// prop
 		res = append(res, err)
@@ -862,6 +872,25 @@ func (m *Resources) validateNanoCpus(formats strfmt.Registry) error {
 	return nil
 }
 
+func (m *Resources) validateNvidiaConfig(formats strfmt.Registry) error {
+
+	if swag.IsZero(m.NvidiaConfig) { // not required
+		return nil
+	}
+
+	if m.NvidiaConfig != nil {
+
+		if err := m.NvidiaConfig.Validate(formats); err != nil {
+			if ve, ok := err.(*errors.Validation); ok {
+				return ve.ValidateName("NvidiaConfig")
+			}
+			return err
+		}
+	}
+
+	return nil
+}
+
 func (m *Resources) validateOomKillDisable(formats strfmt.Registry) error {
 
 	if err := validate.Required("OomKillDisable", "body", m.OomKillDisable); err != nil {
diff --git a/cli/common_flags.go b/cli/common_flags.go
index 17da663e69..196c8a9d04 100644
--- a/cli/common_flags.go
+++ b/cli/common_flags.go
@@ -109,5 +109,9 @@ func addCommonFlags(flagSet *pflag.FlagSet) *container {
 	// additional runtime spec annotations
 	flagSet.StringSliceVar(&c.specAnnotation, "annotation", nil, "Additional annotation for runtime")
 
+	// nvidia container: driver capabilities and visible GPUs requested for the container
+	flagSet.StringVar(&c.nvidiaDriverCapabilities, "nvidia-capabilities", "", "NvidiaDriverCapabilities controls which driver libraries/binaries will be mounted inside the container")
+	flagSet.StringVar(&c.nvidiaVisibleDevices, "nvidia-visible-devs", "", "NvidiaVisibleDevices controls which GPUs will be made accessible inside the container")
+
 	return c
 }
diff --git a/cli/container.go b/cli/container.go
index 92a28c2be6..7ccd1834ae 100644
--- a/cli/container.go
+++ b/cli/container.go
@@ -83,6 +83,10 @@ type container struct {
 	rich       bool
 	richMode   string
 	initScript string
+
+	// nvidia container
+	nvidiaVisibleDevices     string
+	nvidiaDriverCapabilities string
 }
 
 func (c *container) config() (*types.ContainerCreateConfig, error) {
@@ -269,5 +273,12 @@ func (c *container) config() (*types.ContainerCreateConfig, error) {
 		NetworkingConfig: networkingConfig,
 	}
 
+	if c.nvidiaDriverCapabilities != "" || c.nvidiaVisibleDevices != "" {
+		config.HostConfig.Resources.NvidiaConfig = &types.NvidiaConfig{
+			NvidiaDriverCapabilities: c.nvidiaDriverCapabilities,
+			NvidiaVisibleDevices:     c.nvidiaVisibleDevices,
+		}
+	}
+
 	return config, nil
 }
diff --git a/daemon/mgr/container_utils.go b/daemon/mgr/container_utils.go
index f5faac875a..091323ed03 100644
--- a/daemon/mgr/container_utils.go
+++ b/daemon/mgr/container_utils.go
@@ -5,14 +5,14 @@ import (
 	"strconv"
 	"strings"
 
+	"github.com/opencontainers/selinux/go-selinux/label"
+	"github.com/pkg/errors"
+
 	"github.com/alibaba/pouch/apis/types"
 	networktypes "github.com/alibaba/pouch/network/types"
 	"github.com/alibaba/pouch/pkg/errtypes"
 	"github.com/alibaba/pouch/pkg/meta"
 	"github.com/alibaba/pouch/pkg/randomid"
-
-	"github.com/opencontainers/selinux/go-selinux/label"
-	"github.com/pkg/errors"
 )
 
 // containerID returns the container's id, the parameter 'nameOrPrefix' may be container's
diff --git a/daemon/mgr/container_validation.go b/daemon/mgr/container_validation.go
index 876b34159c..2d7b98f6c8 100644
--- a/daemon/mgr/container_validation.go
+++ b/daemon/mgr/container_validation.go
@@ -2,12 +2,16 @@ package mgr
 
 import (
 	"fmt"
+	"os"
+	"strconv"
+	"strings"
+
+	"github.com/sirupsen/logrus"
 
 	"github.com/alibaba/pouch/apis/types"
 	"github.com/alibaba/pouch/daemon/logger/syslog"
 	"github.com/alibaba/pouch/pkg/system"
-
-	"github.com/sirupsen/logrus"
+	"github.com/alibaba/pouch/pkg/utils"
 )
 
 // validateConfig validates container config
@@ -19,6 +23,10 @@ func (mgr *ContainerManager) validateConfig(c *Container, update bool) ([]string
 	if err != nil {
 		return nil, err
 	}
+	// validates nvidia config
+	if err := validateNvidiaConfig(hostConfig); err != nil {
+		return warnings, err
+	}
 	warnings = append(warnings, warns...)
 
 	if hostConfig.OomScoreAdj < -1000 || hostConfig.OomScoreAdj > 1000 {
@@ -188,3 +196,87 @@ func (mgr *ContainerManager) validateLogConfig(c *Container) error {
 		return fmt.Errorf("not support (%v) log driver yet", logCfg.LogDriver)
 	}
 }
+
+// validateNvidiaConfig validates the nvidia driver capabilities and visible
+// devices carried in hostConfig. A nil NvidiaConfig is accepted as-is.
+func validateNvidiaConfig(hostConfig *types.HostConfig) error {
+	r := &hostConfig.Resources
+	if r.NvidiaConfig == nil {
+		return nil
+	}
+
+	if err := validateNvidiaDriver(r); err != nil {
+		return err
+	}
+
+	if err := validateNvidiaDevice(hostConfig); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// validateNvidiaDriver checks that every requested driver capability is
+// supported and stores the list back with surrounding whitespace removed.
+func validateNvidiaDriver(r *types.Resources) error {
+	n := r.NvidiaConfig
+	n.NvidiaDriverCapabilities = strings.TrimSpace(n.NvidiaDriverCapabilities)
+	if n.NvidiaDriverCapabilities == "" {
+		// use default driver capability: utility
+		return nil
+	}
+
+	if n.NvidiaDriverCapabilities == "all" {
+		// enable all capabilities
+		return nil
+	}
+
+	supportedDrivers := []string{"compute", "compat32", "graphics", "utility", "video", "display"}
+	drivers := strings.Split(n.NvidiaDriverCapabilities, ",")
+	for i, d := range drivers {
+		d = strings.TrimSpace(d)
+		found := utils.StringInSlice(supportedDrivers, d)
+		if !found {
+			return fmt.Errorf("invalid nvidia driver capability (%s)", d)
+		}
+		drivers[i] = d
+	}
+	// keep the trimmed form: this value is later exported as NVIDIA_DRIVER_CAPABILITIES
+	n.NvidiaDriverCapabilities = strings.Join(drivers, ",")
+	return nil
+}
+
+// validateNvidiaDevice checks the requested visible devices: empty, one of the
+// keywords all/none/void, or a comma-separated list in which every numeric
+// index must have a matching /dev/nvidiaN node. GPU UUIDs are not verified yet.
+func validateNvidiaDevice(hostConfig *types.HostConfig) error {
+	n := hostConfig.Resources.NvidiaConfig
+	n.NvidiaVisibleDevices = strings.TrimSpace(n.NvidiaVisibleDevices)
+
+	// none: no GPU will be accessible, but driver capabilities will be enabled.
+	// void or empty: no GPU will be accessible, and driver capabilities will be disabled.
+	// all: all GPUs will be accessible
+	if n.NvidiaVisibleDevices == "" {
+		return nil
+	}
+	supportedDevices := []string{"all", "none", "void"}
+	found := utils.StringInSlice(supportedDevices, n.NvidiaVisibleDevices)
+	if found {
+		return nil
+	}
+	devs := strings.Split(n.NvidiaVisibleDevices, ",")
+	for i, dev := range devs {
+		dev = strings.TrimSpace(dev)
+		if _, err := strconv.Atoi(dev); err == nil {
+			// dev is numeric, the realDev should be /dev/nvidiaN
+			realDev := fmt.Sprintf("/dev/nvidia%s", dev)
+			if _, err := os.Stat(realDev); err != nil {
+				return fmt.Errorf("invalid nvidia device %s", realDev)
+			}
+		}
+		// TODO: how to validate GPU UUID
+		devs[i] = dev
+	}
+	n.NvidiaVisibleDevices = strings.Join(devs, ",")
+	return nil
+}
diff --git a/daemon/mgr/spec_hook.go b/daemon/mgr/spec_hook.go
index cffca91cf1..46cda58ff9 100644
--- a/daemon/mgr/spec_hook.go
+++ b/daemon/mgr/spec_hook.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"io/ioutil"
 	"os"
+	"os/exec"
 	"path/filepath"
 	"sort"
 	"strconv"
@@ -65,6 +66,11 @@ func setupHook(ctx context.Context, c *Container, specWrapper *SpecWrapper) erro
 		return errors.Wrap(err, "failed to set volume mount tab prestart hook")
 	}
 
+	// set nvidia config
+	if err := setNvidiaHook(ctx, c, specWrapper); err != nil {
+		return errors.Wrap(err, "failed to set nvidia prestart hook")
+	}
+
 	return nil
 }
 
@@ -153,6 +159,26 @@ func setMountTab(ctx context.Context, c *Container, spec *SpecWrapper) error {
 	return nil
 }
 
+// setNvidiaHook appends nvidia-container-runtime-hook as a prestart hook when
+// the container carries an NvidiaConfig; the hook binary must be found in PATH.
+func setNvidiaHook(ctx context.Context, c *Container, spec *SpecWrapper) error {
+	n := c.HostConfig.NvidiaConfig
+	if n == nil {
+		return nil
+	}
+	path, err := exec.LookPath("nvidia-container-runtime-hook")
+	if err != nil {
+		return err
+	}
+	args := []string{path}
+	nvidiaPrestart := specs.Hook{
+		Path: path,
+		Args: append(args, "prestart"),
+	}
+	spec.s.Hooks.Prestart = append(spec.s.Hooks.Prestart, nvidiaPrestart)
+	return nil
+}
+
 type hookArray []*wrapperEmbedPrestart
 
 // Len is defined in order to support sort
diff --git a/daemon/mgr/spec_process.go b/daemon/mgr/spec_process.go
index 18b41031d8..7d02d03338 100644
--- a/daemon/mgr/spec_process.go
+++ b/daemon/mgr/spec_process.go
@@ -2,6 +2,7 @@ package mgr
 
 import (
 	"context"
+	"fmt"
 	"io/ioutil"
 	"os"
 	"strings"
@@ -61,6 +62,9 @@ func setupProcess(ctx context.Context, c *Container, s *specs.Spec) error {
 		return err
 	}
 
+	if err := setupNvidiaEnv(ctx, c, s); err != nil {
+		return err
+	}
 	return nil
 }
 
@@ -163,3 +167,15 @@ func setupRlimits(ctx context.Context, hostConfig *types.HostConfig, s *specs.Sp
 	s.Process.Rlimits = rlimits
 	return nil
 }
+
+// setupNvidiaEnv exports the container's NvidiaConfig to the process
+// environment as NVIDIA_DRIVER_CAPABILITIES and NVIDIA_VISIBLE_DEVICES.
+func setupNvidiaEnv(ctx context.Context, c *Container, s *specs.Spec) error {
+	n := c.HostConfig.NvidiaConfig
+	if n == nil {
+		return nil
+	}
+	s.Process.Env = append(s.Process.Env, fmt.Sprintf("NVIDIA_DRIVER_CAPABILITIES=%s", n.NvidiaDriverCapabilities))
+	s.Process.Env = append(s.Process.Env, fmt.Sprintf("NVIDIA_VISIBLE_DEVICES=%s", n.NvidiaVisibleDevices))
+	return nil
+}
diff --git a/hack/package/rpm/build.sh b/hack/package/rpm/build.sh
index 64227018af..e600eddb4d 100755
--- a/hack/package/rpm/build.sh
+++ b/hack/package/rpm/build.sh
@@ -16,6 +16,8 @@ POUCHDIR=$TMP/source
 [ -d "$POUCHDIR" ] || mkdir -p "$POUCHDIR"
 BINDIR=$POUCHDIR/bin
 [ -d "$BINDIR" ] || mkdir -p "$BINDIR"
+LIBDIR=$POUCHDIR/lib
+[ -d "$LIBDIR" ] || mkdir -p "$LIBDIR"
 LXC_DIR=$TMP/lxc
 [ -d "$LXC_DIR" ] || mkdir -p "$LXC_DIR"
 
@@ -34,6 +36,9 @@ CATEGORY='Tools/Pouch'
 MAINTAINER='Pouch pouch-dev@list.alibaba-inc.com'
 VENDOR='Pouch'
 
+LIB_NVIDIA_VERSION="1.0.0-rc.2"
+NVIDIA_RUNTIME_VERSION="1.4.0-1"
+
 # build lxcfs
 function build_lxcfs ()
 {
@@ -69,6 +74,24 @@ function build_pouch()
     popd
 }
 
+# install nvidia-container-runtime
+function build_nvidia_runtime(){
+    echo "Downloading libnvidia-container."
+    wget --quiet "https://github.com/NVIDIA/libnvidia-container/releases/download/v${LIB_NVIDIA_VERSION}/libnvidia-container_${LIB_NVIDIA_VERSION}_x86_64.tar.xz" -P "${TMP}"
+    tar -xf "${TMP}/libnvidia-container_${LIB_NVIDIA_VERSION}_x86_64.tar.xz" -C "${TMP}"
+    cp "${TMP}/libnvidia-container_${LIB_NVIDIA_VERSION}/usr/local/bin/nvidia-container-cli" "${BINDIR}/"
+    cp "${TMP}/libnvidia-container_${LIB_NVIDIA_VERSION}/usr/local/lib/libnvidia-container.so" "${LIBDIR}/"
+    cp "${TMP}/libnvidia-container_${LIB_NVIDIA_VERSION}/usr/local/lib/libnvidia-container.so.1" "${LIBDIR}/"
+    cp "${TMP}/libnvidia-container_${LIB_NVIDIA_VERSION}/usr/local/lib/libnvidia-container.so.1.0.0" "${LIBDIR}/"
+
+    echo "Downloading nvidia-container-runtime."
+    wget --quiet "https://github.com/NVIDIA/nvidia-container-runtime/archive/v${NVIDIA_RUNTIME_VERSION}.tar.gz" -P "${TMP}"
+    mkdir -p "${GOPATH}/src/github.com/NVIDIA"
+    tar -xzf "${TMP}/v${NVIDIA_RUNTIME_VERSION}.tar.gz" -C "${GOPATH}/src/github.com/NVIDIA"
+    mv "${GOPATH}/src/github.com/NVIDIA/nvidia-container-runtime-${NVIDIA_RUNTIME_VERSION}" "${GOPATH}/src/github.com/NVIDIA/nvidia-container-runtime"
+    go build -o "${BINDIR}/nvidia-container-runtime-hook" "github.com/NVIDIA/nvidia-container-runtime/hook/nvidia-container-runtime-hook"
+}
+
 function build_rpm ()
 {
     pushd $MOUNTDIR
@@ -110,6 +133,7 @@ function build_rpm ()
         -d fuse-libs \
         -d fuse \
         "$BINDIR/"=/usr/local/bin/ \
+        "$LIBDIR/"=/usr/lib64/ \
         "$SERVICEDIR/"=/usr/lib/systemd/system/ \
         "$LXC_DIR/usr/local/bin/lxcfs"=/usr/bin/pouch-lxcfs \
         "$LXC_DIR/usr/local/lib/lxcfs/libpouchlxcfs.so"=/usr/lib64/libpouchlxcfs.so \
@@ -121,6 +145,7 @@ function main()
     echo "Building rpm package."
     build_pouch
     build_lxcfs
+    build_nvidia_runtime
     build_rpm
 }
 
diff --git a/test/api_container_create_test.go b/test/api_container_create_test.go
index f7f49756e3..1ae903796e 100644
--- a/test/api_container_create_test.go
+++ b/test/api_container_create_test.go
@@ -196,3 +196,38 @@ func (suite *APIContainerCreateSuite) TestBadParam(c *check.C) {
 	// 2. Invalid Parameters
 	helpwantedForMissingCase(c, "container api create with bad request")
 }
+
+func (suite *APIContainerCreateSuite) TestCreateNvidiaConfig(c *check.C) {
+	cname := "TestCreateNvidiaConfig"
+	q := url.Values{}
+	q.Add("name", cname)
+	query := request.WithQuery(q)
+
+	obj := map[string]interface{}{
+		"Image": busyboxImage,
+		"HostConfig": map[string]interface{}{
+			"NvidiaConfig": map[string]interface{}{
+				"NvidiaDriverCapabilities": "all",
+				"NvidiaVisibleDevices":     "none",
+			},
+		},
+	}
+	body := request.WithJSONBody(obj)
+
+	resp, err := request.Post("/containers/create", query, body)
+	c.Assert(err, check.IsNil)
+	CheckRespStatus(c, resp, 201)
+	defer DelContainerForceMultyTime(c, cname)
+
+	resp, err = request.Get("/containers/" + cname + "/json")
+	c.Assert(err, check.IsNil)
+	CheckRespStatus(c, resp, 200)
+
+	got := types.ContainerJSON{}
+	err = request.DecodeBody(&got, resp.Body)
+	c.Assert(err, check.IsNil)
+
+	c.Assert(got.HostConfig.Resources.NvidiaConfig, check.NotNil)
+	c.Assert(got.HostConfig.Resources.NvidiaConfig.NvidiaVisibleDevices, check.Equals, "none")
+	c.Assert(got.HostConfig.Resources.NvidiaConfig.NvidiaDriverCapabilities, check.Equals, "all")
+}
diff --git a/test/cli_create_test.go b/test/cli_create_test.go
index 5bcca65275..ac0c253c0b 100644
--- a/test/cli_create_test.go
+++ b/test/cli_create_test.go
@@ -498,3 +498,36 @@ func (suite *PouchRunSuite) TestCreateWithNonExistImage(c *check.C) {
 	res := command.PouchRun("create", "--name", cname, image)
 	res.Assert(c, icmd.Success)
 }
+
+// TestCreateWithNvidiaConfig tests creating a container with nvidia capabilities and visible devices set.
+func (suite *PouchRunSuite) TestCreateWithNvidiaConfig(c *check.C) {
+	cname := "TestCreateWithNvidiaConfig"
+	image := "docker.io/library/alpine"
+	res := command.PouchRun("create", "--name", cname, "--nvidia-capabilities", "all", "--nvidia-visible-devs", "none", image)
+	res.Assert(c, icmd.Success)
+	defer DelContainerForceMultyTime(c, cname)
+	output := command.PouchRun("inspect", cname).Stdout()
+	result := []types.ContainerJSON{}
+	if err := json.Unmarshal([]byte(output), &result); err != nil {
+		c.Fatalf("failed to decode inspect output: %v", err)
+	}
+	caps := result[0].HostConfig.Resources.NvidiaConfig.NvidiaDriverCapabilities
+	devs := result[0].HostConfig.Resources.NvidiaConfig.NvidiaVisibleDevices
+	c.Assert(caps, check.Equals, "all")
+	c.Assert(devs, check.Equals, "none")
+}
+
+// TestCreateWithoutNvidiaConfig tests that a container created without nvidia flags has a nil NvidiaConfig.
+func (suite *PouchRunSuite) TestCreateWithoutNvidiaConfig(c *check.C) {
+	cname := "TestCreateWithoutNvidiaConfig"
+	image := "docker.io/library/alpine"
+	res := command.PouchRun("create", "--name", cname, image)
+	res.Assert(c, icmd.Success)
+	defer DelContainerForceMultyTime(c, cname)
+	output := command.PouchRun("inspect", cname).Stdout()
+	result := []types.ContainerJSON{}
+	if err := json.Unmarshal([]byte(output), &result); err != nil {
+		c.Fatalf("failed to decode inspect output: %v", err)
+	}
+	c.Assert(result[0].HostConfig.Resources.NvidiaConfig, check.IsNil)
+}