Skip to content

Commit

Permalink
libcontainer: remove all mount logic from nsexec
Browse files Browse the repository at this point in the history
With open_tree(OPEN_TREE_CLONE), it is possible to implement both the
id-mapped mounts and bind-mount source file descriptor logic entirely in
Go without requiring any complicated handling from nsexec.

However, implementing it the naive way (do the OPEN_TREE_CLONE in the
host namespace before the rootfs is set up -- which is what the existing
implementation did) exposes issues in how mount ordering (in particular
when handling mount sources from inside the container rootfs, but also
in relation to mount propagation) was handled for idmapped mounts and
bind-mount sources. In order to solve this problem completely, it is
necessary to spawn a thread which joins the container mount namespace
and provides mountfds when requested by the rootfs setup code (ensuring
that the mount order and mount propagation of the source of the
bind-mount are handled correctly). While the need to join the mount
namespace leads to other complicated (such as with the usage of
/proc/self -- fixed in a later patch) the resulting code is still
reasonable and is the only real way to solve the issue.

This allows us to reduce the amount of C code we have in nsexec, as well
as simplifying a whole host of places that were made more complicated
with the addition of id-mapped mounts and the bind sourcefd logic.
Because we join the container namespace, we can continue to use regular
O_PATH file descriptors for non-id-mapped bind-mount sources (which
means we don't have to raise the kernel requirement for that case).

In addition, we can easily add support for id-mappings that don't match
the container's user namespace. The approach taken here is to use Go's
officially supported mechanism for spawning a process in a user
namespace, but (ab)use PTRACE_TRACEME to avoid actually having to exec a
different process. The most efficient way to implement this would be to
do clone() in cgo directly to run a function that just does
kill(getpid(), SIGSTOP) -- we can always switch to that if it turns out
this approach is too slow. It should be noted that the included
micro-benchmark seems to indicate this is Fast Enough(TM):

  goos: linux
  goarch: amd64
  pkg: github.com/opencontainers/runc/libcontainer/userns
  cpu: Intel(R) Core(TM) i5-10210U CPU @ 1.60GHz
  BenchmarkSpawnProc
  BenchmarkSpawnProc-8        1670            770065 ns/op

Fixes: fda12ab ("Support idmap mounts on volumes")
Fixes: 9c44407 ("Open bind mount sources from the host userns")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
  • Loading branch information
cyphar committed Dec 14, 2023
1 parent 99f7fa1 commit ba0b5e2
Show file tree
Hide file tree
Showing 14 changed files with 617 additions and 687 deletions.
161 changes: 3 additions & 158 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package libcontainer

import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
Expand Down Expand Up @@ -629,112 +628,6 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
return c.newSetnsProcess(p, cmd, comm)
}

// shouldSendMountSources says whether the child process must setup bind mounts with
// the source pre-opened (O_PATH) in the host user namespace.
// See https://github.com/opencontainers/runc/issues/2484
func (c *Container) shouldSendMountSources() bool {
// Passing the mount sources via SCM_RIGHTS is only necessary when
// both userns and mntns are active.
if !c.config.Namespaces.Contains(configs.NEWUSER) ||
!c.config.Namespaces.Contains(configs.NEWNS) {
return false
}

// nsexec.c send_mountsources() requires setns(mntns) capabilities
// CAP_SYS_CHROOT and CAP_SYS_ADMIN.
if c.config.RootlessEUID {
return false
}

// We need to send sources if there are non-idmap bind-mounts.
for _, m := range c.config.Mounts {
if m.IsBind() && !m.IsIDMapped() {
return true
}
}

return false
}

// shouldSendIdmapSources says whether the child process must setup idmap mounts with
// the mount_setattr already done in the host user namespace.
func (c *Container) shouldSendIdmapSources() bool {
// nsexec.c mount_setattr() requires CAP_SYS_ADMIN in:
// * the user namespace the filesystem was mounted in;
// * the user namespace we're trying to idmap the mount to;
// * the owning user namespace of the mount namespace you're currently located in.
//
// See the comment from Christian Brauner:
// https://github.com/opencontainers/runc/pull/3717#discussion_r1103607972
//
// Let's just rule out rootless, we don't have those permission in the
// rootless case.
if c.config.RootlessEUID {
return false
}

// For the time being we require userns to be in use.
if !c.config.Namespaces.Contains(configs.NEWUSER) {
return false
}

// We need to send sources if there are idmap bind-mounts.
for _, m := range c.config.Mounts {
if m.IsBind() && m.IsIDMapped() {
return true
}
}

return false
}

func (c *Container) sendMountSources(cmd *exec.Cmd, comm *processComm) error {
if !c.shouldSendMountSources() {
return nil
}

return c.sendFdsSources(cmd, comm, "_LIBCONTAINER_MOUNT_FDS", func(m *configs.Mount) bool {
return m.IsBind() && !m.IsIDMapped()
})
}

func (c *Container) sendIdmapSources(cmd *exec.Cmd, comm *processComm) error {
if !c.shouldSendIdmapSources() {
return nil
}

return c.sendFdsSources(cmd, comm, "_LIBCONTAINER_IDMAP_FDS", func(m *configs.Mount) bool {
return m.IsBind() && m.IsIDMapped()
})
}

func (c *Container) sendFdsSources(cmd *exec.Cmd, comm *processComm, envVar string, condition func(*configs.Mount) bool) error {
// Elements on these slices will be paired with mounts (see StartInitialization() and
// prepareRootfs()). These slices MUST have the same size as c.config.Mounts.
fds := make([]int, len(c.config.Mounts))
for i, m := range c.config.Mounts {
if !condition(m) {
// The -1 fd is ignored later.
fds[i] = -1
continue
}

// The fd passed here will not be used: nsexec.c will overwrite it with
// dup3(). We just need to allocate a fd so that we know the number to
// pass in the environment variable. The fd must not be closed before
// cmd.Start(), so we reuse initSockChild because the lifecycle of that
// fd is already taken care of.
cmd.ExtraFiles = append(cmd.ExtraFiles, comm.initSockChild)
fds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1
}
fdsJSON, err := json.Marshal(fds)
if err != nil {
return fmt.Errorf("Error creating %v: %w", envVar, err)
}
cmd.Env = append(cmd.Env, envVar+"="+string(fdsJSON))
return nil
}

func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
Expand All @@ -743,16 +636,10 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm)
nsMaps[ns.Type] = ns.Path
}
}
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard)
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
if err != nil {
return nil, err
}
if err := c.sendMountSources(cmd, comm); err != nil {
return nil, err
}
if err := c.sendIdmapSources(cmd, comm); err != nil {
return nil, err
}

init := &initProcess{
cmd: cmd,
Expand All @@ -776,7 +663,7 @@ func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm
}
// for setns process, we don't have to set cloneflags as the process namespaces
// will only be set via setns syscall
data, err := c.bootstrapData(0, state.NamespacePaths, initSetns)
data, err := c.bootstrapData(0, state.NamespacePaths)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -1165,7 +1052,7 @@ type netlinkError struct{ error }
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) {
func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (_ io.Reader, Err error) {
// create the netlink message
r := nl.NewNetlinkRequest(int(InitMsg), 0)

Expand Down Expand Up @@ -1267,48 +1154,6 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa
Value: c.config.RootlessEUID,
})

// Bind mount source to open.
if it == initStandard && c.shouldSendMountSources() {
var mounts []byte
for _, m := range c.config.Mounts {
if m.IsBind() && !m.IsIDMapped() {
if strings.IndexByte(m.Source, 0) >= 0 {
return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
}
mounts = append(mounts, []byte(m.Source)...)
}
mounts = append(mounts, byte(0))
}

r.AddData(&Bytemsg{
Type: MountSourcesAttr,
Value: mounts,
})
}

// Idmap mount sources to open.
if it == initStandard && c.shouldSendIdmapSources() {
var mounts []byte
for _, m := range c.config.Mounts {
if m.IsBind() && m.IsIDMapped() {
// While other parts of the code check this too (like
// libcontainer/specconv/spec_linux.go) we do it here also because some libcontainer
// users don't use those functions.
if strings.IndexByte(m.Source, 0) >= 0 {
return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
}

mounts = append(mounts, []byte(m.Source)...)
}
mounts = append(mounts, byte(0))
}

r.AddData(&Bytemsg{
Type: IdmapSourcesAttr,
Value: mounts,
})
}

// write boottime and monotonic time ns offsets.
if c.config.TimeOffsets != nil {
var offsetSpec bytes.Buffer
Expand Down
4 changes: 2 additions & 2 deletions libcontainer/criu_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -618,8 +618,8 @@ func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
// because during initial container creation mounts are
// set up in the order they are configured.
if m.Device == "bind" {
if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFD string) error {
return mountViaFDs(m.Source, nil, m.Destination, dstFD, "", unix.MS_BIND|unix.MS_REC, "")
if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFd string) error {
return mountViaFds(m.Source, nil, m.Destination, dstFd, "", unix.MS_BIND|unix.MS_REC, "")
}); err != nil {
return err
}
Expand Down
15 changes: 0 additions & 15 deletions libcontainer/factory_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,18 +214,3 @@ func validateID(id string) error {

return nil
}

func parseFdsFromEnv(envVar string) ([]int, error) {
fdsJSON := os.Getenv(envVar)
if fdsJSON == "" {
// Always return the nil slice if no fd is present.
return nil, nil
}

var fds []int
if err := json.Unmarshal([]byte(fdsJSON), &fds); err != nil {
return nil, fmt.Errorf("Error unmarshalling %v: %w", envVar, err)
}

return fds, nil
}
34 changes: 2 additions & 32 deletions libcontainer/init_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,18 +47,6 @@ type network struct {
TempVethPeerName string `json:"temp_veth_peer_name"`
}

type mountFds struct {
// sourceFds are the fds to use as source when mounting.
// The slice size should be the same as container mounts, as it will be
// paired with them.
// The value -1 is used when no fd is needed for the mount.
// Can't have a valid fd in the same position that other slices in this struct.
// We need to use only one of these fds on any single mount.
sourceFds []int
// Idem sourceFds, but fds of already created idmap mounts, to use with unix.MoveMount().
idmapFds []int
}

// initConfig is used for transferring parameters from Exec() to Init()
type initConfig struct {
Args []string `json:"args"`
Expand Down Expand Up @@ -189,18 +177,6 @@ func startInitialization() (retErr error) {
defer pidfdSocket.Close()
}

// Get mount files (O_PATH).
mountSrcFds, err := parseFdsFromEnv("_LIBCONTAINER_MOUNT_FDS")
if err != nil {
return err
}

// Get idmap fds.
idmapFds, err := parseFdsFromEnv("_LIBCONTAINER_IDMAP_FDS")
if err != nil {
return err
}

// Get runc-dmz fds.
var dmzExe *os.File
if dmzFdStr := os.Getenv("_LIBCONTAINER_DMZEXEFD"); dmzFdStr != "" {
Expand Down Expand Up @@ -232,21 +208,16 @@ func startInitialization() (retErr error) {
}

// If init succeeds, it will not return, hence none of the defers will be called.
return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifofd, logFD, dmzExe)
}

func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error {
func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket *os.File, fifoFd, logFd int, dmzExe *os.File) error {
if err := populateProcessEnvironment(config.Env); err != nil {
return err
}

switch t {
case initSetns:
// mount and idmap fds must be nil in this case. We don't mount while doing runc exec.
if mountFds.sourceFds != nil || mountFds.idmapFds != nil {
return errors.New("mount and idmap fds must be nil; can't mount from exec")
}

i := &linuxSetnsInit{
pipe: pipe,
consoleSocket: consoleSocket,
Expand All @@ -266,7 +237,6 @@ func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSock
fifoFd: fifoFd,
logFd: logFd,
dmzExe: dmzExe,
mountFds: mountFds,
}
return i.Init()
}
Expand Down
4 changes: 1 addition & 3 deletions libcontainer/message_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,7 @@ const (
RootlessEUIDAttr uint16 = 27287
UidmapPathAttr uint16 = 27288
GidmapPathAttr uint16 = 27289
MountSourcesAttr uint16 = 27290
IdmapSourcesAttr uint16 = 27291
TimeOffsetsAttr uint16 = 27292
TimeOffsetsAttr uint16 = 27290
)

type Int32msg struct {
Expand Down
Loading

0 comments on commit ba0b5e2

Please sign in to comment.