From 7f5e5110fbfe706f21fdd6024e784614e2bae916 Mon Sep 17 00:00:00 2001 From: jerryzhuang Date: Fri, 12 Apr 2019 16:21:27 +0800 Subject: [PATCH] refactor: setup podNetwork before creating sandbox VM-like container required the network initialization before creating sandbox container. This order won't affect the runc container. Signed-off-by: zhuangqh --- apis/swagger.yml | 2 +- apis/types/host_config.go | 2 +- cri/ocicni/cni_manager.go | 3 + cri/ocicni/interface.go | 15 + cri/ocicni/netns.go | 166 ++++++++++ cri/v1alpha2/cri.go | 152 +++++---- cri/v1alpha2/cri_utils.go | 45 +-- daemon/mgr/container.go | 8 +- daemon/mgr/network_utils.go | 8 +- daemon/mgr/spec_linux.go | 3 + .../containernetworking/plugins/LICENSE | 201 ++++++++++++ .../plugins/pkg/ns/README.md | 40 +++ .../plugins/pkg/ns/ns_linux.go | 305 ++++++++++++++++++ vendor/vendor.json | 8 + 14 files changed, 870 insertions(+), 88 deletions(-) create mode 100644 cri/ocicni/netns.go create mode 100644 vendor/github.com/containernetworking/plugins/LICENSE create mode 100644 vendor/github.com/containernetworking/plugins/pkg/ns/README.md create mode 100644 vendor/github.com/containernetworking/plugins/pkg/ns/ns_linux.go diff --git a/apis/swagger.yml b/apis/swagger.yml index 71b2c1df5..a8d5f274f 100644 --- a/apis/swagger.yml +++ b/apis/swagger.yml @@ -2320,7 +2320,7 @@ definitions: $ref: "#/definitions/RestartPolicy" NetworkMode: type: "string" - description: "Network mode to use for this container. Supported standard values are: `bridge`, `host`, `none`, and `container:`. Any other value is taken as a custom network's name to which this container should connect to." + description: "Network mode to use for this container. Supported standard values are: `netns:`, `bridge`, `host`, `none`, and `container:`. Any other value is taken as a custom network's name to which this container should connect to." PortBindings: type: "object" description: "A map of exposed container ports and the host port they should map to." 
diff --git a/apis/types/host_config.go b/apis/types/host_config.go index d1f2aeead..90edd28dc 100644 --- a/apis/types/host_config.go +++ b/apis/types/host_config.go @@ -94,7 +94,7 @@ type HostConfig struct { // Masks over the provided paths inside the container. MaskedPaths []string `json:"MaskedPaths"` - // Network mode to use for this container. Supported standard values are: `bridge`, `host`, `none`, and `container:`. Any other value is taken as a custom network's name to which this container should connect to. + // Network mode to use for this container. Supported standard values are: `netns:`, `bridge`, `host`, `none`, and `container:`. Any other value is taken as a custom network's name to which this container should connect to. NetworkMode string `json:"NetworkMode,omitempty"` // An integer value containing the score given to the container in order to tune OOM killer preferences. diff --git a/cri/ocicni/cni_manager.go b/cri/ocicni/cni_manager.go index d2a284c7e..99451eb23 100644 --- a/cri/ocicni/cni_manager.go +++ b/cri/ocicni/cni_manager.go @@ -85,6 +85,9 @@ func (c *CniManager) TearDownPodNetwork(podNetwork *ocicni.PodNetwork) error { // if netNSPath is not found, should return the error of IsNotExist. if _, err = os.Stat(podNetwork.NetNS); err != nil { + if os.IsNotExist(err) { + return nil + } return err } return errors.Wrapf(err, "failed to destroy network for sandbox %q", podNetwork.ID) diff --git a/cri/ocicni/interface.go b/cri/ocicni/interface.go index ce90ee3bc..9b10e4253 100644 --- a/cri/ocicni/interface.go +++ b/cri/ocicni/interface.go @@ -21,4 +21,19 @@ type CniMgr interface { // Status returns error if the network plugin is in error state. 
Status() error + + // NewNetNS creates a new persistent network namespace and returns the + // namespace path, without switching to it. + NewNetNS() (string, error) + + // RemoveNetNS unmounts the network namespace and removes its mount point. + RemoveNetNS(path string) error + + // CloseNetNS cleans up this instance of the network namespace; if this instance + // is the last user, the namespace will be destroyed. + CloseNetNS(path string) error + + // RecoverNetNS recreates the persistent network namespace if it does not exist. + // Otherwise, it does nothing. + RecoverNetNS(path string) error } diff --git a/cri/ocicni/netns.go b/cri/ocicni/netns.go new file mode 100644 index 000000000..3e96c157a --- /dev/null +++ b/cri/ocicni/netns.go @@ -0,0 +1,166 @@ +package ocicni + +import ( + "crypto/rand" + "fmt" + "os" + "path" + "runtime" + "strings" + "sync" + + "github.com/containernetworking/plugins/pkg/ns" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +const nsRunDir = "/var/run/netns" + +// NewNetNS creates a new persistent network namespace and returns the +// namespace path, without switching to it. +func (c *CniManager) NewNetNS() (string, error) { + return createNS("") +} + +// RemoveNetNS unmounts the netns and removes its mount point; a missing path or one outside /var/run/netns is a no-op. +func (c *CniManager) RemoveNetNS(path string) error { + if _, err := os.Stat(path); err != nil { + if os.IsNotExist(err) { + return nil + } + return errors.Wrap(err, "failed to stat netns") + } + if strings.HasPrefix(path, nsRunDir) { + if err := unix.Unmount(path, 0); err != nil { + return errors.Wrapf(err, "failed to unmount NS: at %s", path) + } + + if err := os.Remove(path); err != nil { + return errors.Wrapf(err, "failed to remove ns path %s", path) + } + } + + return nil +} + +// CloseNetNS cleans up this instance of the network namespace; if this instance +// is the last user, the namespace will be destroyed. +func (c *CniManager) CloseNetNS(path string) error { + netns, err := ns.GetNS(path) + + if err != nil { + if _, ok := err.(ns.NSPathNotExistErr); ok { + return 
nil + } + if _, ok := err.(ns.NSPathNotNSErr); ok { + if err := os.RemoveAll(path); err != nil { + return errors.Wrapf(err, "failed to remove netns path %s", path) + } + return nil + } + return errors.Wrapf(err, "failed to get netns path %s", path) + } + if err := netns.Close(); err != nil { + return errors.Wrapf(err, " failed to clean up netns path %s", path) + } + return nil +} + +// RecoverNetNS recreates the persistent network namespace if it does not exist. +// Otherwise, it does nothing. +func (c *CniManager) RecoverNetNS(path string) error { + _, err := ns.GetNS(path) + + // net ns already exists + if err == nil { + return nil + } + + _, err = createNS(path) + return err +} + +// getCurrentThreadNetNSPath returns the netns path of the calling OS thread; copied from pkg/ns. +func getCurrentThreadNetNSPath() string { + // /proc/self/ns/net returns the namespace of the main thread, not + // of whatever thread this goroutine is running on. Make sure we + // use the thread's net namespace since the thread is switching around + return fmt.Sprintf("/proc/%d/task/%d/ns/net", os.Getpid(), unix.Gettid()) +} + +// createNS creates and mounts the network namespace at the given path, or creates a brand new one if the path is empty. +// It is partially copied from https://github.com/containernetworking/plugins/blob/master/pkg/testutils/netns_linux.go +// NOTE: DO NOT open the nsPath like the above repo does, or pouchd will hold a reference to the created network namespace +// and will fail to remove the netns when stopping the pod sandbox. 
+func createNS(nsPath string) (res string, err error) { + if err = os.MkdirAll(nsRunDir, 0755); err != nil { + return "", err + } + + // if the ns path is not given, create an empty file + if nsPath == "" { + b := make([]byte, 16) + if _, err := rand.Reader.Read(b); err != nil { + return "", errors.Wrap(err, "failed to generate random netns name") + } + + nsName := fmt.Sprintf("cni-%x-%x-%x-%x-%x", b[0:4], b[4:6], b[6:8], b[8:10], b[10:]) + nsPath = path.Join(nsRunDir, nsName) + } + + if _, err := os.Stat(nsPath); err != nil { + if os.IsNotExist(err) { + mountPointFd, err := os.Create(nsPath) + if err != nil { + return "", err + } + mountPointFd.Close() + } + } + + // Ensure the mount point is cleaned up on errors + defer func() { + if err != nil { + os.RemoveAll(nsPath) + } + }() + + var wg sync.WaitGroup + wg.Add(1) + + // do namespace work in a dedicated goroutine, so that we can safely + // Lock/Unlock OSThread without upsetting the lock/unlock state of + // the caller of this function + go (func() { + defer wg.Done() + runtime.LockOSThread() + + var origNS ns.NetNS + origNS, err = ns.GetNS(getCurrentThreadNetNSPath()) + if err != nil { + return + } + defer origNS.Close() + + // create a new netns on the current thread + err = unix.Unshare(unix.CLONE_NEWNET) + if err != nil { + return + } + defer origNS.Set() + + // bind mount the new netns from the current thread onto the mount point + err = unix.Mount(getCurrentThreadNetNSPath(), nsPath, "none", unix.MS_BIND, "") + if err != nil { + return + } + })() + wg.Wait() + + if err != nil { + unix.Unmount(nsPath, unix.MNT_DETACH) + return "", errors.Wrapf(err, "failed to create namespace %s", nsPath) + } + + return nsPath, nil +} diff --git a/cri/v1alpha2/cri.go b/cri/v1alpha2/cri.go index 13f25e190..27357dcc1 100644 --- a/cri/v1alpha2/cri.go +++ b/cri/v1alpha2/cri.go @@ -32,7 +32,6 @@ import ( util_metrics "github.com/alibaba/pouch/pkg/utils/metrics" "github.com/alibaba/pouch/version" - 
"github.com/cri-o/ocicni/pkg/ocicni" "github.com/pkg/errors" "github.com/sirupsen/logrus" ) @@ -63,7 +62,6 @@ const ( nameDelimiter = "_" namespaceModeHost = "host" - namespaceModeNone = "none" // resolvConfPath is the abs path of resolv.conf on host or container. resolvConfPath = "/etc/resolv.conf" @@ -237,8 +235,6 @@ func (c *CriManager) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, err } - // Step 2: Create the sandbox container. - // prepare the sandboxID and store it. id, err := c.generateSandboxID(ctx) if err != nil { @@ -262,6 +258,35 @@ func (c *CriManager) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox } }() + // Step 2: Setup networking for the sandbox. + + // If it is in host network, no need to configure the network of sandbox. + if sandboxNetworkMode(config) != runtime.NamespaceMode_NODE { + sandboxMeta.NetNS, err = c.CniMgr.NewNetNS() + if err != nil { + return nil, err + } + defer func() { + if retErr != nil { + if err := c.CniMgr.RemoveNetNS(sandboxMeta.NetNS); err != nil { + logrus.Errorf("failed to remove net ns for sandbox %q: %v", id, err) + } + } + }() + if err := c.setupPodNetwork(id, sandboxMeta.NetNS, config); err != nil { + return nil, err + } + defer func() { + if retErr != nil { + if err := c.teardownNetwork(id, sandboxMeta.NetNS, config); err != nil { + logrus.Errorf("failed to teardown pod network for sandbox %q: %v", id, err) + } + } + }() + } + + // Step 3: Create the sandbox container. + // applies the runtime of container specified by the caller. 
if err := c.applySandboxRuntimeHandler(sandboxMeta, r.GetRuntimeHandler(), config.Annotations); err != nil { return nil, err @@ -272,7 +297,7 @@ func (c *CriManager) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, err } - createConfig, err := makeSandboxPouchConfig(config, sandboxMeta.Runtime, image) + createConfig, err := makeSandboxPouchConfig(config, sandboxMeta, image) if err != nil { return nil, fmt.Errorf("failed to make sandbox pouch config for pod %q: %v", config.Metadata.Name, err) @@ -301,7 +326,7 @@ func (c *CriManager) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox } }() - // Step 3: Start the sandbox container. + // Step 4: Start the sandbox container. err = c.ContainerMgr.Start(ctx, id, &apitypes.ContainerStartOptions{}) if err != nil { return nil, fmt.Errorf("failed to start sandbox container for pod %q: %v", config.Metadata.Name, err) @@ -327,16 +352,6 @@ func (c *CriManager) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, fmt.Errorf("failed to setup sandbox files: %v", err) } - // Step 4: Setup networking for the sandbox. - networkNamespaceMode := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetNetwork() - // If it is in host network, no need to configure the network of sandbox. - if networkNamespaceMode != runtime.NamespaceMode_NODE { - err = c.setupPodNetwork(ctx, id, config) - if err != nil { - return nil, err - } - } - metrics.PodSuccessActionsCounter.WithLabelValues(label).Inc() return &runtime.RunPodSandboxResponse{PodSandboxId: id}, nil @@ -345,7 +360,7 @@ func (c *CriManager) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox // StartPodSandbox restart a sandbox pod which was stopped by accident // and we should reconfigure it with network plugin which will make sure it reacquire its original network configuration, // like IP address. 
-func (c *CriManager) StartPodSandbox(ctx context.Context, r *runtime.StartPodSandboxRequest) (*runtime.StartPodSandboxResponse, error) { +func (c *CriManager) StartPodSandbox(ctx context.Context, r *runtime.StartPodSandboxRequest) (_ *runtime.StartPodSandboxResponse, retErr error) { label := util_metrics.ActionStartLabel defer func(start time.Time) { metrics.PodActionsCounter.WithLabelValues(label).Inc() @@ -354,15 +369,50 @@ func (c *CriManager) StartPodSandbox(ctx context.Context, r *runtime.StartPodSan podSandboxID := r.GetPodSandboxId() + sandbox, err := c.ContainerMgr.Get(ctx, podSandboxID) + if err != nil { + return nil, fmt.Errorf("failed to get container %q: %v", podSandboxID, err) + } + + res, err := c.SandboxStore.Get(podSandboxID) + if err != nil { + return nil, fmt.Errorf("failed to get metadata of %q from SandboxStore: %v", podSandboxID, err) + } + sandboxMeta := res.(*metatypes.SandboxMeta) + + ip, _ := c.CniMgr.GetPodNetworkStatus(sandboxMeta.NetNS) + + if mgr.IsNetNS(sandbox.HostConfig.NetworkMode) && ip == "" { + if err := c.CniMgr.RecoverNetNS(sandboxMeta.NetNS); err != nil { + return nil, fmt.Errorf("failed to recover netns %s for sandbox %q: %v", sandboxMeta.NetNS, podSandboxID, err) + } + defer func() { + if retErr != nil { + if err := c.CniMgr.RemoveNetNS(sandboxMeta.NetNS); err != nil { + logrus.Errorf("failed to remove net ns for sandbox %q: %v", podSandboxID, err) + } + } + }() + + if err = c.setupPodNetwork(podSandboxID, sandboxMeta.NetNS, sandboxMeta.Config); err != nil { + return nil, err + } + defer func() { + if retErr != nil { + if err := c.teardownNetwork(podSandboxID, sandboxMeta.NetNS, sandboxMeta.Config); err != nil { + logrus.Errorf("failed to teardown pod network for sandbox %q: %v", podSandboxID, err) + } + } + }() + } + // start PodSandbox. 
startErr := c.ContainerMgr.Start(ctx, podSandboxID, &apitypes.ContainerStartOptions{}) if startErr != nil { return nil, fmt.Errorf("failed to start podSandbox %q: %v", podSandboxID, startErr) } - - var err error defer func() { - if err != nil { + if retErr != nil { stopErr := c.ContainerMgr.Stop(ctx, podSandboxID, defaultStopTimeout) if stopErr != nil { logrus.Errorf("failed to stop sandbox %q: %v", podSandboxID, stopErr) @@ -370,19 +420,9 @@ func (c *CriManager) StartPodSandbox(ctx context.Context, r *runtime.StartPodSan } }() - // get the sandbox's meta data. - res, err := c.SandboxStore.Get(podSandboxID) - if err != nil { - return nil, fmt.Errorf("failed to get metadata of %q from SandboxStore: %v", podSandboxID, err) - } - sandboxMeta := res.(*metatypes.SandboxMeta) - - // setup networking for the sandbox. - networkNamespaceMode := sandboxMeta.Config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetNetwork() - // If it is in host network, no need to configure the network of sandbox. - if networkNamespaceMode != runtime.NamespaceMode_NODE { - err = c.setupPodNetwork(ctx, podSandboxID, sandboxMeta.Config) - if err != nil { + // legacy container using /proc/$pid/ns/net as the sandbox netns. + if mgr.IsNone(sandbox.HostConfig.NetworkMode) { + if err = c.setupPodNetwork(podSandboxID, containerNetns(sandbox), sandboxMeta.Config); err != nil { return nil, err } } @@ -416,7 +456,7 @@ func (c *CriManager) StopPodSandbox(ctx context.Context, r *runtime.StopPodSandb containers, err := c.ContainerMgr.List(ctx, opts) if err != nil { - return nil, fmt.Errorf("failed to stop sandbox %q: %v", podSandboxID, err) + return nil, fmt.Errorf("failed to get the containers belong to sandbox %q: %v", podSandboxID, err) } // Stop all containers in the sandbox. 
@@ -428,38 +468,10 @@ func (c *CriManager) StopPodSandbox(ctx context.Context, r *runtime.StopPodSandb logrus.Infof("success to stop container %q of sandbox %q", container.ID, podSandboxID) } - container, err := c.ContainerMgr.Get(ctx, podSandboxID) - if err != nil { - return nil, err - } - metadata, err := parseSandboxName(container.Name) - if err != nil { - return nil, fmt.Errorf("failed to parse metadata of sandbox %q from container name: %v", podSandboxID, err) - } - - securityContext := sandboxMeta.Config.GetLinux().GetSecurityContext() - hostNet := securityContext.GetNamespaceOptions().GetNetwork() == runtime.NamespaceMode_NODE - // Teardown network of the pod, if it is not in host network mode. - if !hostNet { - sandbox, err := c.ContainerMgr.Get(ctx, podSandboxID) - if err != nil { - return nil, fmt.Errorf("failed to get sandbox %q: %v", podSandboxID, err) - } - - netNSPath := containerNetns(sandbox) - err = c.CniMgr.TearDownPodNetwork(&ocicni.PodNetwork{ - Name: metadata.GetName(), - Namespace: metadata.GetNamespace(), - ID: podSandboxID, - NetNS: netNSPath, - PortMappings: toCNIPortMappings(sandboxMeta.Config.GetPortMappings()), - }) - if err != nil { - if !os.IsNotExist(err) { - return nil, err - } - logrus.Warnf("failed to find network namespace file %s of sandbox %s which may have been already stopped", netNSPath, podSandboxID) + if sandboxNetworkMode(sandboxMeta.Config) != runtime.NamespaceMode_NODE { + if err = c.teardownNetwork(podSandboxID, sandboxMeta.NetNS, sandboxMeta.Config); err != nil { + logrus.Warnf("failed to find network namespace file %s of sandbox %s which may have been already stopped", sandboxMeta.NetNS, podSandboxID) } } @@ -469,6 +481,16 @@ func (c *CriManager) StopPodSandbox(ctx context.Context, r *runtime.StopPodSandb return nil, fmt.Errorf("failed to stop sandbox %q: %v", podSandboxID, err) } + // after container stop, no one refer the net namespace, do the clean up job. 
+ if sandboxNetworkMode(sandboxMeta.Config) != runtime.NamespaceMode_NODE { + if err := c.CniMgr.CloseNetNS(sandboxMeta.NetNS); err != nil { + return nil, fmt.Errorf("failed to close net ns %s of sandbox %q: %v", sandboxMeta.NetNS, podSandboxID, err) + } + if err := c.CniMgr.RemoveNetNS(sandboxMeta.NetNS); err != nil { + return nil, fmt.Errorf("failed to remove net ns %s of sandbox %q: %v", sandboxMeta.NetNS, podSandboxID, err) + } + } + metrics.PodSuccessActionsCounter.WithLabelValues(label).Inc() return &runtime.StopPodSandboxResponse{}, nil diff --git a/cri/v1alpha2/cri_utils.go b/cri/v1alpha2/cri_utils.go index 1ee387240..10b80b93c 100644 --- a/cri/v1alpha2/cri_utils.go +++ b/cri/v1alpha2/cri_utils.go @@ -249,9 +249,6 @@ func modifySandboxNamespaceOptions(nsOpts *runtime.NamespaceOption, hostConfig * if nsOpts.GetIpc() == runtime.NamespaceMode_NODE { hostConfig.IpcMode = namespaceModeHost } - if nsOpts.GetNetwork() == runtime.NamespaceMode_NODE { - hostConfig.NetworkMode = namespaceModeHost - } } func applySandboxSecurityContext(lc *runtime.LinuxPodSandboxConfig, config *apitypes.ContainerConfig, hc *apitypes.HostConfig) error { @@ -287,9 +284,6 @@ func applySandboxSecurityContext(lc *runtime.LinuxPodSandboxConfig, config *apit // applySandboxLinuxOptions applies LinuxPodSandboxConfig to pouch's HostConfig and ContainerCreateConfig. func applySandboxLinuxOptions(hc *apitypes.HostConfig, lc *runtime.LinuxPodSandboxConfig, createConfig *apitypes.ContainerCreateConfig, image string) error { - // apply the sandbox network_mode, "none" is default. - hc.NetworkMode = namespaceModeNone - if lc == nil { return nil } @@ -338,7 +332,7 @@ func (c *CriManager) applySandboxAnnotations(sandboxMeta *metatypes.SandboxMeta, } // makeSandboxPouchConfig returns apitypes.ContainerCreateConfig based on runtime.PodSandboxConfig. 
-func makeSandboxPouchConfig(config *runtime.PodSandboxConfig, runtimehandler, image string) (*apitypes.ContainerCreateConfig, error) { +func makeSandboxPouchConfig(config *runtime.PodSandboxConfig, sandboxMeta *metatypes.SandboxMeta, image string) (*apitypes.ContainerCreateConfig, error) { // Merge annotations and labels because pouch supports only labels. labels := makeLabels(config.GetLabels(), config.GetAnnotations()) // Apply a label to distinguish sandboxes from regular containers. @@ -346,9 +340,15 @@ func makeSandboxPouchConfig(config *runtime.PodSandboxConfig, runtimehandler, im hc := &apitypes.HostConfig{} + if sandboxMeta.NetNS == "" { + hc.NetworkMode = namespaceModeHost + } else { + hc.NetworkMode = fmt.Sprintf("netns:%s", sandboxMeta.NetNS) + } + // Apply runtime options. // NOTE: whether to add UntrustedWorkload - hc.Runtime = runtimehandler + hc.Runtime = sandboxMeta.Runtime createConfig := &apitypes.ContainerCreateConfig{ ContainerConfig: apitypes.ContainerConfig{ @@ -543,18 +543,9 @@ func setupSandboxFiles(sandboxRootDir string, config *runtime.PodSandboxConfig) return nil } -// setupPodNetwork sets up the network of PodSandbox and return the netnsPath of PodSandbox +// setupPodNetwork sets up the network of PodSandbox // and do nothing when networkNamespaceMode equals runtime.NamespaceMode_NODE. 
-func (c *CriManager) setupPodNetwork(ctx context.Context, id string, config *runtime.PodSandboxConfig) error { - container, err := c.ContainerMgr.Get(ctx, id) - if err != nil { - return err - } - netnsPath := containerNetns(container) - if netnsPath == "" { - return fmt.Errorf("failed to find network namespace path for sandbox %q", id) - } - +func (c *CriManager) setupPodNetwork(id, netnsPath string, config *runtime.PodSandboxConfig) error { return c.CniMgr.SetUpPodNetwork(&ocicni.PodNetwork{ Name: config.GetMetadata().GetName(), Namespace: config.GetMetadata().GetNamespace(), @@ -564,6 +555,22 @@ func (c *CriManager) setupPodNetwork(ctx context.Context, id string, config *run }) } +// teardownNetwork tears down the network of PodSandbox. +// Callers must skip it when networkNamespaceMode equals runtime.NamespaceMode_NODE. +func (c *CriManager) teardownNetwork(id, netnsPath string, config *runtime.PodSandboxConfig) error { + return c.CniMgr.TearDownPodNetwork(&ocicni.PodNetwork{ + Name: config.GetMetadata().GetName(), + Namespace: config.GetMetadata().GetNamespace(), + ID: id, + NetNS: netnsPath, + PortMappings: toCNIPortMappings(config.GetPortMappings()), + }) +} + +func sandboxNetworkMode(config *runtime.PodSandboxConfig) runtime.NamespaceMode { + return config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetNetwork() +} + // Container related tool functions. 
func makeContainerName(s *runtime.PodSandboxConfig, c *runtime.ContainerConfig) string { diff --git a/daemon/mgr/container.go b/daemon/mgr/container.go index c6d5b01fd..717886f11 100644 --- a/daemon/mgr/container.go +++ b/daemon/mgr/container.go @@ -479,7 +479,8 @@ func (mgr *ContainerManager) Create(ctx context.Context, name string, config *ty if len(config.NetworkingConfig.EndpointsConfig) > 0 { container.NetworkSettings.Networks = config.NetworkingConfig.EndpointsConfig } - if container.NetworkSettings.Networks == nil && !IsContainer(config.HostConfig.NetworkMode) { + if container.NetworkSettings.Networks == nil && + !IsContainer(config.HostConfig.NetworkMode) && !IsNetNS(config.HostConfig.NetworkMode) { container.NetworkSettings.Networks = make(map[string]*types.EndpointSettings) container.NetworkSettings.Networks[config.HostConfig.NetworkMode] = new(types.EndpointSettings) } @@ -683,6 +684,11 @@ func (mgr *ContainerManager) prepareContainerNetwork(ctx context.Context, c *Con return nil } + // network is prepared by upper system. do nothing here. + if IsNetNS(networkMode) { + return nil + } + // initialise host network mode if IsHost(networkMode) { hostname, err := os.Hostname() diff --git a/daemon/mgr/network_utils.go b/daemon/mgr/network_utils.go index 9b0092be5..a08454cba 100644 --- a/daemon/mgr/network_utils.go +++ b/daemon/mgr/network_utils.go @@ -30,9 +30,15 @@ func IsBridge(mode string) bool { return mode == "bridge" } +// IsNetNS is used to check if network mode is netns mode. +func IsNetNS(mode string) bool { + parts := strings.SplitN(mode, ":", 2) + return len(parts) > 1 && parts[0] == "netns" +} + // IsUserDefined is used to check if network mode is user-created. 
func IsUserDefined(mode string) bool { - return !IsBridge(mode) && !IsContainer(mode) && !IsHost(mode) && !IsNone(mode) + return !IsBridge(mode) && !IsContainer(mode) && !IsHost(mode) && !IsNone(mode) && !IsNetNS(mode) } // IsDefault indicates whether container uses the default network stack. diff --git a/daemon/mgr/spec_linux.go b/daemon/mgr/spec_linux.go index 91c33123b..2ce109b1e 100644 --- a/daemon/mgr/spec_linux.go +++ b/daemon/mgr/spec_linux.go @@ -425,9 +425,12 @@ func setupNetworkNamespace(ctx context.Context, c *Container, specWrapper *SpecW } ns.Path = fmt.Sprintf("/proc/%d/ns/net", origContainer.State.Pid) + } else if IsNetNS(networkMode) { + ns.Path = strings.SplitN(networkMode, ":", 2)[1] } else if IsHost(networkMode) { ns.Path = c.NetworkSettings.SandboxKey } + setNamespace(s, ns) for _, ns := range s.Linux.Namespaces { diff --git a/vendor/github.com/containernetworking/plugins/LICENSE b/vendor/github.com/containernetworking/plugins/LICENSE new file mode 100644 index 000000000..8dada3eda --- /dev/null +++ b/vendor/github.com/containernetworking/plugins/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/github.com/containernetworking/plugins/pkg/ns/README.md b/vendor/github.com/containernetworking/plugins/pkg/ns/README.md new file mode 100644 index 000000000..c0f5cf2e8 --- /dev/null +++ b/vendor/github.com/containernetworking/plugins/pkg/ns/README.md @@ -0,0 +1,40 @@ +### Namespaces, Threads, and Go +On Linux each OS thread can have a different network namespace. Go's thread scheduling model switches goroutines between OS threads based on OS thread load and whether the goroutine would block other goroutines. This can result in a goroutine switching network namespaces without notice and lead to errors in your code. + +### Namespace Switching +Switching namespaces with the `ns.Set()` method is not recommended without additional strategies to prevent unexpected namespace changes when your goroutines switch OS threads. + +Go provides the `runtime.LockOSThread()` function to ensure a specific goroutine executes on its current OS thread and prevents any other goroutine from running in that thread until the locked one exits. Careful usage of `LockOSThread()` and goroutines can provide good control over which network namespace a given goroutine executes in. + +For example, you cannot rely on the `ns.Set()` namespace being the current namespace after the `Set()` call unless you do two things. First, the goroutine calling `Set()` must have previously called `LockOSThread()`. 
Second, you must ensure `runtime.UnlockOSThread()` is not called somewhere in-between. You also cannot rely on the initial network namespace remaining the current network namespace if any other code in your program switches namespaces, unless you have already called `LockOSThread()` in that goroutine. Note that `LockOSThread()` prevents the Go scheduler from optimally scheduling goroutines for best performance, so `LockOSThread()` should only be used in small, isolated goroutines that release the lock quickly. + +### Do() The Recommended Thing +The `ns.Do()` method provides **partial** control over network namespaces for you by implementing these strategies. All code dependent on a particular network namespace (including the root namespace) should be wrapped in the `ns.Do()` method to ensure the correct namespace is selected for the duration of your code. For example: + +```go +targetNs, err := ns.NewNS() +if err != nil { + return err +} +err = targetNs.Do(func(hostNs ns.NetNS) error { + dummy := &netlink.Dummy{ + LinkAttrs: netlink.LinkAttrs{ + Name: "dummy0", + }, + } + return netlink.LinkAdd(dummy) +}) +``` + +Note this requirement to wrap every network call is very onerous - any libraries you call might call out to network services such as DNS, and all such calls need to be protected after you call `ns.Do()`. The CNI plugins all exit very soon after calling `ns.Do()` which helps to minimize the problem. + +Also: If the runtime spawns a new OS thread, it will inherit the network namespace of the parent thread, which may have been temporarily switched, and thus the new OS thread will be permanently "stuck in the wrong namespace". + +In short, **there is no safe way to change network namespaces from within a long-lived, multithreaded Go process**. If your daemon process needs to be namespace aware, consider spawning a separate process (like a CNI plugin) for each namespace. 
+ +### Further Reading + - https://github.com/golang/go/wiki/LockOSThread + - http://morsmachine.dk/go-scheduler + - https://github.com/containernetworking/cni/issues/262 + - https://golang.org/pkg/runtime/ + - https://www.weave.works/blog/linux-namespaces-and-go-don-t-mix diff --git a/vendor/github.com/containernetworking/plugins/pkg/ns/ns_linux.go b/vendor/github.com/containernetworking/plugins/pkg/ns/ns_linux.go new file mode 100644 index 000000000..4ce989467 --- /dev/null +++ b/vendor/github.com/containernetworking/plugins/pkg/ns/ns_linux.go @@ -0,0 +1,305 @@ +// Copyright 2015-2017 CNI authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ns + +import ( + "crypto/rand" + "fmt" + "os" + "path" + "runtime" + "sync" + "syscall" + + "golang.org/x/sys/unix" +) + +// Returns an object representing the current OS thread's network namespace +func GetCurrentNS() (NetNS, error) { + return GetNS(getCurrentThreadNetNSPath()) +} + +func getCurrentThreadNetNSPath() string { + // /proc/self/ns/net returns the namespace of the main thread, not + // of whatever thread this goroutine is running on. 
Make sure we + // use the thread's net namespace since the thread is switching around + return fmt.Sprintf("/proc/%d/task/%d/ns/net", os.Getpid(), unix.Gettid()) +} + +// Creates a new persistent network namespace and returns an object +// representing that namespace, without switching to it +func NewNS() (NetNS, error) { + const nsRunDir = "/var/run/netns" + + b := make([]byte, 16) + _, err := rand.Reader.Read(b) + if err != nil { + return nil, fmt.Errorf("failed to generate random netns name: %v", err) + } + + err = os.MkdirAll(nsRunDir, 0755) + if err != nil { + return nil, err + } + + // create an empty file at the mount point + nsName := fmt.Sprintf("cni-%x-%x-%x-%x-%x", b[0:4], b[4:6], b[6:8], b[8:10], b[10:]) + nsPath := path.Join(nsRunDir, nsName) + mountPointFd, err := os.Create(nsPath) + if err != nil { + return nil, err + } + mountPointFd.Close() + + // Ensure the mount point is cleaned up on errors; if the namespace + // was successfully mounted this will have no effect because the file + // is in-use + defer os.RemoveAll(nsPath) + + var wg sync.WaitGroup + wg.Add(1) + + // do namespace work in a dedicated goroutine, so that we can safely + // Lock/Unlock OSThread without upsetting the lock/unlock state of + // the caller of this function + var fd *os.File + go (func() { + defer wg.Done() + runtime.LockOSThread() + + var origNS NetNS + origNS, err = GetNS(getCurrentThreadNetNSPath()) + if err != nil { + return + } + defer origNS.Close() + + // create a new netns on the current thread + err = unix.Unshare(unix.CLONE_NEWNET) + if err != nil { + return + } + defer origNS.Set() + + // bind mount the new netns from the current thread onto the mount point + err = unix.Mount(getCurrentThreadNetNSPath(), nsPath, "none", unix.MS_BIND, "") + if err != nil { + return + } + + fd, err = os.Open(nsPath) + if err != nil { + return + } + })() + wg.Wait() + + if err != nil { + unix.Unmount(nsPath, unix.MNT_DETACH) + return nil, fmt.Errorf("failed to create namespace: 
%v", err) + } + + return &netNS{file: fd, mounted: true}, nil +} + +func (ns *netNS) Close() error { + if err := ns.errorIfClosed(); err != nil { + return err + } + + if err := ns.file.Close(); err != nil { + return fmt.Errorf("Failed to close %q: %v", ns.file.Name(), err) + } + ns.closed = true + + if ns.mounted { + if err := unix.Unmount(ns.file.Name(), unix.MNT_DETACH); err != nil { + return fmt.Errorf("Failed to unmount namespace %s: %v", ns.file.Name(), err) + } + if err := os.RemoveAll(ns.file.Name()); err != nil { + return fmt.Errorf("Failed to clean up namespace %s: %v", ns.file.Name(), err) + } + ns.mounted = false + } + + return nil +} + +func (ns *netNS) Set() error { + if err := ns.errorIfClosed(); err != nil { + return err + } + + if err := unix.Setns(int(ns.Fd()), unix.CLONE_NEWNET); err != nil { + return fmt.Errorf("Error switching to ns %v: %v", ns.file.Name(), err) + } + + return nil +} + +type NetNS interface { + // Executes the passed closure in this object's network namespace, + // attempting to restore the original namespace before returning. + // However, since each OS thread can have a different network namespace, + // and Go's thread scheduling is highly variable, callers cannot + // guarantee any specific namespace is set unless operations that + // require that namespace are wrapped with Do(). Also, no code called + // from Do() should call runtime.UnlockOSThread(), or the risk + // of executing code in an incorrect namespace will be greater. See + // https://github.com/golang/go/wiki/LockOSThread for further details. + Do(toRun func(NetNS) error) error + + // Sets the current network namespace to this object's network namespace. + // Note that since Go's thread scheduling is highly variable, callers + // cannot guarantee the requested namespace will be the current namespace + // after this function is called; to ensure this wrap operations that + // require the namespace with Do() instead. 
+ Set() error + + // Returns the filesystem path representing this object's network namespace + Path() string + + // Returns a file descriptor representing this object's network namespace + Fd() uintptr + + // Cleans up this instance of the network namespace; if this instance + // is the last user the namespace will be destroyed + Close() error +} + +type netNS struct { + file *os.File + mounted bool + closed bool +} + +// netNS implements the NetNS interface +var _ NetNS = &netNS{} + +const ( + // https://github.com/torvalds/linux/blob/master/include/uapi/linux/magic.h + NSFS_MAGIC = 0x6e736673 + PROCFS_MAGIC = 0x9fa0 +) + +type NSPathNotExistErr struct{ msg string } + +func (e NSPathNotExistErr) Error() string { return e.msg } + +type NSPathNotNSErr struct{ msg string } + +func (e NSPathNotNSErr) Error() string { return e.msg } + +func IsNSorErr(nspath string) error { + stat := syscall.Statfs_t{} + if err := syscall.Statfs(nspath, &stat); err != nil { + if os.IsNotExist(err) { + err = NSPathNotExistErr{msg: fmt.Sprintf("failed to Statfs %q: %v", nspath, err)} + } else { + err = fmt.Errorf("failed to Statfs %q: %v", nspath, err) + } + return err + } + + switch stat.Type { + case PROCFS_MAGIC, NSFS_MAGIC: + return nil + default: + return NSPathNotNSErr{msg: fmt.Sprintf("unknown FS magic on %q: %x", nspath, stat.Type)} + } +} + +// Returns an object representing the namespace referred to by @path +func GetNS(nspath string) (NetNS, error) { + err := IsNSorErr(nspath) + if err != nil { + return nil, err + } + + fd, err := os.Open(nspath) + if err != nil { + return nil, err + } + + return &netNS{file: fd}, nil +} + +func (ns *netNS) Path() string { + return ns.file.Name() +} + +func (ns *netNS) Fd() uintptr { + return ns.file.Fd() +} + +func (ns *netNS) errorIfClosed() error { + if ns.closed { + return fmt.Errorf("%q has already been closed", ns.file.Name()) + } + return nil +} + +func (ns *netNS) Do(toRun func(NetNS) error) error { + if err := ns.errorIfClosed(); err 
!= nil { + return err + } + + containedCall := func(hostNS NetNS) error { + threadNS, err := GetCurrentNS() + if err != nil { + return fmt.Errorf("failed to open current netns: %v", err) + } + defer threadNS.Close() + + // switch to target namespace + if err = ns.Set(); err != nil { + return fmt.Errorf("error switching to ns %v: %v", ns.file.Name(), err) + } + defer threadNS.Set() // switch back + + return toRun(hostNS) + } + + // save a handle to current network namespace + hostNS, err := GetCurrentNS() + if err != nil { + return fmt.Errorf("Failed to open current namespace: %v", err) + } + defer hostNS.Close() + + var wg sync.WaitGroup + wg.Add(1) + + var innerError error + go func() { + defer wg.Done() + runtime.LockOSThread() + innerError = containedCall(hostNS) + }() + wg.Wait() + + return innerError +} + +// WithNetNSPath executes the passed closure under the given network +// namespace, restoring the original namespace afterwards. +func WithNetNSPath(nspath string, toRun func(NetNS) error) error { + ns, err := GetNS(nspath) + if err != nil { + return err + } + defer ns.Close() + return ns.Do(toRun) +} diff --git a/vendor/vendor.json b/vendor/vendor.json index 70fa6b514..a72946e31 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -633,6 +633,14 @@ "revisionTime": "2018-06-21T05:42:04Z", "tree": true }, + { + "checksumSHA1": "umfzSifCgODa2RhhkS8/cjw0WjU=", + "path": "github.com/containernetworking/plugins/pkg/ns", + "revision": "a62711a5da7a2dc2eb93eac47e103738ad923fd6", + "revisionTime": "2019-03-15T16:54:57Z", + "version": "v0.7", + "versionExact": "v0.7.5" + }, { "checksumSHA1": "JOgERxEIEiFk4AN0IielftEFKmY=", "path": "github.com/contiv/executor",