From 38767e297c0d9d7d3d6966d48dd778f53c9b05db Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Thu, 3 Aug 2023 16:59:21 +1000
Subject: [PATCH 01/11] gha: stash away a procfs mount

This is needed to make sure that our userns tests work in GitHub Actions
if the host /proc is masked.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 .github/workflows/test.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 56bae35f36f..8d35c69c339 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -78,6 +78,15 @@ jobs:
       with:
         bats-version: 1.9.0
 
+    - name: procfs mount
+      run: |
+        # Get the list of mounts to help with debugging.
+        cat /proc/self/mounts
+        # Create a procfs mount that is not masked, to ensure that container
+        # procfs mounts will succeed.
+        sudo mkdir -p /tmp/.procfs-stashed-mount
+        sudo unshare -pf mount -t proc -o subset=pid proc /tmp/.procfs-stashed-mount
+
     - name: unit test
       if: matrix.rootless != 'rootless'
       run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest

From b35718b4f6b28793389f027e510fb8fee8993ddc Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Fri, 4 Aug 2023 17:57:35 +1000
Subject: [PATCH 02/11] makefile: quote TESTFLAGS when passing to containerised
 make

Otherwise TESTFLAGS="-run FooBar" will result in TESTFLAGS=-run being
executed in the container.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index c72dd2f414d..0d48fe8c521 100644
--- a/Makefile
+++ b/Makefile
@@ -105,7 +105,7 @@ unittest: runcimage
 		-t --privileged --rm \
 		-v /lib/modules:/lib/modules:ro \
 		-v $(CURDIR):/go/src/$(PROJECT) \
-		$(RUNC_IMAGE) make localunittest TESTFLAGS=$(TESTFLAGS)
+		$(RUNC_IMAGE) make localunittest TESTFLAGS="$(TESTFLAGS)"
 
 localunittest: all
 	$(GO) test -timeout 3m -tags "$(BUILDTAGS)" $(TESTFLAGS) -v ./...
@@ -115,7 +115,7 @@ integration: runcimage
 		-t --privileged --rm \
 		-v /lib/modules:/lib/modules:ro \
 		-v $(CURDIR):/go/src/$(PROJECT) \
-		$(RUNC_IMAGE) make localintegration TESTPATH=$(TESTPATH)
+		$(RUNC_IMAGE) make localintegration TESTPATH="$(TESTPATH)"
 
 localintegration: all
 	bats -t tests/integration$(TESTPATH)

From dbdc562b20791d5ed0f484246c4107083031db43 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 8 Aug 2023 11:33:06 +1000
Subject: [PATCH 03/11] libcontainer: sync: cleanup synchronisation code

This includes quite a few cleanups and improvements to the way we do
syncrhonisation. The core behaviour is unchanged, but switching to
embedding json.RawMessage into the synchronisation structure will allow
us to do more complicated synchronisation operations in future patches.

The file descriptor passing through the syncrhonisation system feature
will be used as part of the idmapped-mount and bind-mount-source
features when switching that code to use the new mount API outside of
nsexec.c.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 contrib/cmd/recvtty/recvtty.go          |   4 +-
 libcontainer/criu_linux.go              |   2 +-
 libcontainer/init_linux.go              |  62 +++++------
 libcontainer/integration/execin_test.go |   4 +-
 libcontainer/process_linux.go           |  83 ++++++++------
 libcontainer/rootfs_linux.go            |   3 +-
 libcontainer/setns_init_linux.go        |   2 -
 libcontainer/sync.go                    | 141 +++++++++++++++---------
 libcontainer/utils/cmsg.go              |  81 +++++++++-----
 libcontainer/utils/utils_unix.go        |   2 +-
 tty.go                                  |   2 +-
 11 files changed, 223 insertions(+), 163 deletions(-)

diff --git a/contrib/cmd/recvtty/recvtty.go b/contrib/cmd/recvtty/recvtty.go
index 35c293de642..532a4c33904 100644
--- a/contrib/cmd/recvtty/recvtty.go
+++ b/contrib/cmd/recvtty/recvtty.go
@@ -98,7 +98,7 @@ func handleSingle(path string, noStdin bool) error {
 	defer socket.Close()
 
 	// Get the master file descriptor from runC.
-	master, err := utils.RecvFd(socket)
+	master, err := utils.RecvFile(socket)
 	if err != nil {
 		return err
 	}
@@ -171,7 +171,7 @@ func handleNull(path string) error {
 			defer socket.Close()
 
 			// Get the master file descriptor from runC.
-			master, err := utils.RecvFd(socket)
+			master, err := utils.RecvFile(socket)
 			if err != nil {
 				return
 			}
diff --git a/libcontainer/criu_linux.go b/libcontainer/criu_linux.go
index 31c8a900eb1..ec6228399d8 100644
--- a/libcontainer/criu_linux.go
+++ b/libcontainer/criu_linux.go
@@ -1185,7 +1185,7 @@ func (c *Container) criuNotifications(resp *criurpc.CriuResp, process *Process,
 		defer master.Close()
 
 		// While we can access console.master, using the API is a good idea.
-		if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil {
+		if err := utils.SendFile(process.ConsoleSocket, master); err != nil {
 			return err
 		}
 	case "status-ready":
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index 42cae1ccb65..39f121cd909 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -5,9 +5,9 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
-	"io"
 	"net"
 	"os"
+	"runtime"
 	"runtime/debug"
 	"strconv"
 	"strings"
@@ -102,11 +102,8 @@ func StartInitialization() (retErr error) {
 	defer func() {
 		// We have an error during the initialization of the container's init,
 		// send it back to the parent process in the form of an initError.
-		if err := writeSync(pipe, procError); err != nil {
-			fmt.Fprintln(os.Stderr, retErr)
-			return
-		}
-		if err := utils.WriteJSON(pipe, &initError{Message: retErr.Error()}); err != nil {
+		ierr := initError{Message: retErr.Error()}
+		if err := writeSyncArg(pipe, procError, ierr); err != nil {
 			fmt.Fprintln(os.Stderr, retErr)
 			return
 		}
@@ -317,7 +314,6 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error {
 	if err != nil {
 		return err
 	}
-
 	// After we return from here, we don't need the console anymore.
 	defer pty.Close()
 
@@ -339,9 +335,11 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error {
 		}
 	}
 	// While we can access console.master, using the API is a good idea.
-	if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil {
+	if err := utils.SendRawFd(socket, pty.Name(), pty.Fd()); err != nil {
 		return err
 	}
+	runtime.KeepAlive(pty)
+
 	// Now, dup over all the things.
 	return dupStdio(slavePath)
 }
@@ -349,12 +347,11 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error {
 // syncParentReady sends to the given pipe a JSON payload which indicates that
 // the init is ready to Exec the child process. It then waits for the parent to
 // indicate that it is cleared to Exec.
-func syncParentReady(pipe io.ReadWriter) error {
+func syncParentReady(pipe *os.File) error {
 	// Tell parent.
 	if err := writeSync(pipe, procReady); err != nil {
 		return err
 	}
-
 	// Wait for parent to give the all-clear.
 	return readSync(pipe, procRun)
 }
@@ -362,44 +359,37 @@ func syncParentReady(pipe io.ReadWriter) error {
 // syncParentHooks sends to the given pipe a JSON payload which indicates that
 // the parent should execute pre-start hooks. It then waits for the parent to
 // indicate that it is cleared to resume.
-func syncParentHooks(pipe io.ReadWriter) error {
+func syncParentHooks(pipe *os.File) error {
 	// Tell parent.
 	if err := writeSync(pipe, procHooks); err != nil {
 		return err
 	}
-
 	// Wait for parent to give the all-clear.
 	return readSync(pipe, procResume)
 }
 
-// syncParentSeccomp sends to the given pipe a JSON payload which
-// indicates that the parent should pick up the seccomp fd with pidfd_getfd()
-// and send it to the seccomp agent over a unix socket. It then waits for
-// the parent to indicate that it is cleared to resume and closes the seccompFd.
-// If the seccompFd is -1, there isn't anything to sync with the parent, so it
-// returns no error.
-func syncParentSeccomp(pipe io.ReadWriter, seccompFd int) error {
-	if seccompFd == -1 {
+// syncParentSeccomp sends the fd associated with the seccomp file descriptor
+// to the parent, and wait for the parent to do pidfd_getfd() to grab a copy.
+func syncParentSeccomp(pipe *os.File, seccompFd int) error {
+	if seccompFd >= 0 {
 		return nil
 	}
-
-	// Tell parent.
-	if err := writeSyncWithFd(pipe, procSeccomp, seccompFd); err != nil {
-		unix.Close(seccompFd)
+	defer unix.Close(seccompFd)
+
+	// Tell parent to grab our fd.
+	//
+	// Notably, we do not use writeSyncFile here because a container might have
+	// an SCMP_ACT_NOTIFY action on sendmsg(2) so we need to use the smallest
+	// possible number of system calls here because all of those syscalls
+	// cannot be used with SCMP_ACT_NOTIFY as a result (any syscall we use here
+	// before the parent gets the file descriptor would deadlock "runc init" if
+	// we allowed it for SCMP_ACT_NOTIFY). See seccomp.InitSeccomp() for more
+	// details.
+	if err := writeSyncArg(pipe, procSeccomp, seccompFd); err != nil {
 		return err
 	}
-
-	// Wait for parent to give the all-clear.
-	if err := readSync(pipe, procSeccompDone); err != nil {
-		unix.Close(seccompFd)
-		return fmt.Errorf("sync parent seccomp: %w", err)
-	}
-
-	if err := unix.Close(seccompFd); err != nil {
-		return fmt.Errorf("close seccomp fd: %w", err)
-	}
-
-	return nil
+	// Wait for parent to tell us they've grabbed the seccompfd.
+	return readSync(pipe, procSeccompDone)
 }
 
 // setupUser changes the groups, gid, and uid for the user inside the container
diff --git a/libcontainer/integration/execin_test.go b/libcontainer/integration/execin_test.go
index 58519a419c0..c5c324130c6 100644
--- a/libcontainer/integration/execin_test.go
+++ b/libcontainer/integration/execin_test.go
@@ -277,9 +277,9 @@ func TestExecInTTY(t *testing.T) {
 
 		done := make(chan (error))
 		go func() {
-			f, err := utils.RecvFd(parent)
+			f, err := utils.RecvFile(parent)
 			if err != nil {
-				done <- fmt.Errorf("RecvFd: %w", err)
+				done <- fmt.Errorf("RecvFile: %w", err)
 				return
 			}
 			c, err := console.ConsoleFromFile(f)
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
index 48861406dba..1f5e37bd166 100644
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@@ -9,6 +9,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"runtime"
 	"strconv"
 	"time"
 
@@ -171,14 +172,27 @@ func (p *setnsProcess) start() (retErr error) {
 			panic("unexpected procHooks in setns")
 		case procSeccomp:
 			if p.config.Config.Seccomp.ListenerPath == "" {
-				return errors.New("listenerPath is not set")
+				return errors.New("seccomp listenerPath is not set")
 			}
-
-			seccompFd, err := recvSeccompFd(uintptr(p.pid()), uintptr(sync.Fd))
+			if sync.Arg == nil {
+				return fmt.Errorf("sync %q is missing an argument", sync.Type)
+			}
+			var srcFd int
+			if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil {
+				return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err)
+			}
+			seccompFd, err := pidGetFd(p.pid(), srcFd)
 			if err != nil {
+				return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err)
+			}
+			defer seccompFd.Close()
+			// We have a copy, the child can keep working. We don't need to
+			// wait for the seccomp notify listener to get the fd before we
+			// permit the child to continue because the child will happily wait
+			// for the listener if it hits SCMP_ACT_NOTIFY.
+			if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
 				return err
 			}
-			defer unix.Close(seccompFd)
 
 			bundle, annotations := utils.Annotations(p.config.Config.Labels)
 			containerProcessState := &specs.ContainerProcessState{
@@ -199,15 +213,10 @@ func (p *setnsProcess) start() (retErr error) {
 				containerProcessState, seccompFd); err != nil {
 				return err
 			}
-
-			// Sync with child.
-			if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
-				return err
-			}
-			return nil
 		default:
 			return errors.New("invalid JSON payload from child")
 		}
+		return nil
 	})
 
 	if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
@@ -460,14 +469,27 @@ func (p *initProcess) start() (retErr error) {
 		switch sync.Type {
 		case procSeccomp:
 			if p.config.Config.Seccomp.ListenerPath == "" {
-				return errors.New("listenerPath is not set")
+				return errors.New("seccomp listenerPath is not set")
 			}
-
-			seccompFd, err := recvSeccompFd(uintptr(childPid), uintptr(sync.Fd))
+			var srcFd int
+			if sync.Arg == nil {
+				return fmt.Errorf("sync %q is missing an argument", sync.Type)
+			}
+			if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil {
+				return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err)
+			}
+			seccompFd, err := pidGetFd(p.pid(), srcFd)
 			if err != nil {
+				return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err)
+			}
+			defer seccompFd.Close()
+			// We have a copy, the child can keep working. We don't need to
+			// wait for the seccomp notify listener to get the fd before we
+			// permit the child to continue because the child will happily wait
+			// for the listener if it hits SCMP_ACT_NOTIFY.
+			if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
 				return err
 			}
-			defer unix.Close(seccompFd)
 
 			s, err := p.container.currentOCIState()
 			if err != nil {
@@ -488,11 +510,6 @@ func (p *initProcess) start() (retErr error) {
 				containerProcessState, seccompFd); err != nil {
 				return err
 			}
-
-			// Sync with child.
-			if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
-				return err
-			}
 		case procReady:
 			// set rlimits, this has to be done here because we lose permissions
 			// to raise the limits once we enter a user-namespace
@@ -591,7 +608,6 @@ func (p *initProcess) start() (retErr error) {
 		default:
 			return errors.New("invalid JSON payload from child")
 		}
-
 		return nil
 	})
 
@@ -684,22 +700,20 @@ func (p *initProcess) forwardChildLogs() chan error {
 	return logs.ForwardLogs(p.logFilePair.parent)
 }
 
-func recvSeccompFd(childPid, childFd uintptr) (int, error) {
-	pidfd, _, errno := unix.Syscall(unix.SYS_PIDFD_OPEN, childPid, 0, 0)
-	if errno != 0 {
-		return -1, fmt.Errorf("performing SYS_PIDFD_OPEN syscall: %w", errno)
+func pidGetFd(pid, srcFd int) (*os.File, error) {
+	pidFd, err := unix.PidfdOpen(pid, 0)
+	if err != nil {
+		return nil, os.NewSyscallError("pidfd_open", err)
 	}
-	defer unix.Close(int(pidfd))
-
-	seccompFd, _, errno := unix.Syscall(unix.SYS_PIDFD_GETFD, pidfd, childFd, 0)
-	if errno != 0 {
-		return -1, fmt.Errorf("performing SYS_PIDFD_GETFD syscall: %w", errno)
+	defer unix.Close(pidFd)
+	fd, err := unix.PidfdGetfd(pidFd, srcFd, 0)
+	if err != nil {
+		return nil, os.NewSyscallError("pidfd_getfd", err)
 	}
-
-	return int(seccompFd), nil
+	return os.NewFile(uintptr(fd), "[pidfd_getfd]"), nil
 }
 
-func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, fd int) error {
+func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, file *os.File) error {
 	conn, err := net.Dial("unix", listenerPath)
 	if err != nil {
 		return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err)
@@ -716,11 +730,10 @@ func sendContainerProcessState(listenerPath string, state *specs.ContainerProces
 		return fmt.Errorf("cannot marshall seccomp state: %w", err)
 	}
 
-	err = utils.SendFds(socket, b, fd)
-	if err != nil {
+	if err := utils.SendRawFd(socket, string(b), file.Fd()); err != nil {
 		return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err)
 	}
-
+	runtime.KeepAlive(file)
 	return nil
 }
 
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index ac3ba183e12..5386d34184e 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -3,7 +3,6 @@ package libcontainer
 import (
 	"errors"
 	"fmt"
-	"io"
 	"os"
 	"path"
 	"path/filepath"
@@ -64,7 +63,7 @@ func needsSetupDev(config *configs.Config) bool {
 // prepareRootfs sets up the devices, mount points, and filesystems for use
 // inside a new mount namespace. It doesn't set anything as ro. You must call
 // finalizeRootfs after this function to finish setting up the rootfs.
-func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds mountFds) (err error) {
+func prepareRootfs(pipe *os.File, iConfig *initConfig, mountFds mountFds) (err error) {
 	config := iConfig.Config
 	if err := prepareRoot(config); err != nil {
 		return fmt.Errorf("error preparing rootfs: %w", err)
diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go
index ac58190758a..40a47a2e95c 100644
--- a/libcontainer/setns_init_linux.go
+++ b/libcontainer/setns_init_linux.go
@@ -75,7 +75,6 @@ func (l *linuxSetnsInit) Init() error {
 		if err != nil {
 			return err
 		}
-
 		if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
 			return err
 		}
@@ -94,7 +93,6 @@ func (l *linuxSetnsInit) Init() error {
 		if err != nil {
 			return fmt.Errorf("unable to init seccomp: %w", err)
 		}
-
 		if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
 			return err
 		}
diff --git a/libcontainer/sync.go b/libcontainer/sync.go
index 25dc2863071..25507e58ad9 100644
--- a/libcontainer/sync.go
+++ b/libcontainer/sync.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"os"
 
 	"github.com/opencontainers/runc/libcontainer/utils"
 )
@@ -15,15 +16,19 @@ type syncType string
 // during container setup. They come in pairs (with procError being a generic
 // response which is followed by an &initError).
 //
-//	[  child  ] <-> [   parent   ]
+//	     [  child  ] <-> [   parent   ]
 //
-//	procHooks   --> [run hooks]
-//	            <-- procResume
+//	procSeccomp         --> [forward fd to listenerPath]
+//	  file: seccomp fd
+//	                    --- no return synchronisation
 //
-//	procReady   --> [final setup]
-//	            <-- procRun
+//	procHooks --> [run hooks]
+//	          <-- procResume
 //
-//	procSeccomp --> [pick up seccomp fd with pidfd_getfd()]
+//	procReady --> [final setup]
+//	          <-- procRun
+//
+//	procSeccomp --> [grab seccomp fd with pidfd_getfd()]
 //	            <-- procSeccompDone
 const (
 	procError       syncType = "procError"
@@ -35,9 +40,17 @@ const (
 	procSeccompDone syncType = "procSeccompDone"
 )
 
+type syncFlags int
+
+const (
+	syncFlagHasFd syncFlags = (1 << iota)
+)
+
 type syncT struct {
-	Type syncType `json:"type"`
-	Fd   int      `json:"fd"`
+	Type  syncType         `json:"type"`
+	Flags syncFlags        `json:"flags"`
+	Arg   *json.RawMessage `json:"arg,omitempty"`
+	File  *os.File         `json:"-"` // passed oob through SCM_RIGHTS
 }
 
 // initError is used to wrap errors for passing them via JSON,
@@ -50,74 +63,100 @@ func (i initError) Error() string {
 	return i.Message
 }
 
-// writeSync is used to write to a synchronisation pipe. An error is returned
-// if there was a problem writing the payload.
-func writeSync(pipe io.Writer, sync syncType) error {
-	return writeSyncWithFd(pipe, sync, -1)
+func doWriteSync(pipe *os.File, sync syncT) error {
+	sync.Flags &= ^syncFlagHasFd
+	if sync.File != nil {
+		sync.Flags |= syncFlagHasFd
+	}
+	if err := utils.WriteJSON(pipe, sync); err != nil {
+		return fmt.Errorf("writing sync %q: %w", sync.Type, err)
+	}
+	if sync.Flags&syncFlagHasFd != 0 {
+		if err := utils.SendFile(pipe, sync.File); err != nil {
+			return fmt.Errorf("sending file after sync %q: %w", sync.Type, err)
+		}
+	}
+	return nil
+}
+
+func writeSync(pipe *os.File, sync syncType) error {
+	return doWriteSync(pipe, syncT{Type: sync})
 }
 
-// writeSyncWithFd is used to write to a synchronisation pipe. An error is
-// returned if there was a problem writing the payload.
-func writeSyncWithFd(pipe io.Writer, sync syncType, fd int) error {
-	if err := utils.WriteJSON(pipe, syncT{sync, fd}); err != nil {
-		return fmt.Errorf("writing syncT %q: %w", string(sync), err)
+func writeSyncArg(pipe *os.File, sync syncType, arg interface{}) error {
+	argJSON, err := json.Marshal(arg)
+	if err != nil {
+		return fmt.Errorf("writing sync %q: marshal argument failed: %w", sync, err)
 	}
-	return nil
+	argJSONMsg := json.RawMessage(argJSON)
+	return doWriteSync(pipe, syncT{Type: sync, Arg: &argJSONMsg})
 }
 
-// readSync is used to read from a synchronisation pipe. An error is returned
-// if we got an initError, the pipe was closed, or we got an unexpected flag.
-func readSync(pipe io.Reader, expected syncType) error {
-	var procSync syncT
-	if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
+func doReadSync(pipe *os.File) (syncT, error) {
+	var sync syncT
+	if err := json.NewDecoder(pipe).Decode(&sync); err != nil {
 		if errors.Is(err, io.EOF) {
-			return errors.New("parent closed synchronisation channel")
+			return sync, err
 		}
-		return fmt.Errorf("failed reading error from parent: %w", err)
+		return sync, fmt.Errorf("reading from parent failed: %w", err)
 	}
-
-	if procSync.Type == procError {
+	if sync.Type == procError {
 		var ierr initError
-
-		if err := json.NewDecoder(pipe).Decode(&ierr); err != nil {
-			return fmt.Errorf("failed reading error from parent: %w", err)
+		if sync.Arg == nil {
+			return sync, errors.New("procError missing error payload")
 		}
+		if err := json.Unmarshal(*sync.Arg, &ierr); err != nil {
+			return sync, fmt.Errorf("unmarshal procError failed: %w", err)
+		}
+		return sync, &ierr
+	}
+	if sync.Flags&syncFlagHasFd != 0 {
+		file, err := utils.RecvFile(pipe)
+		if err != nil {
+			return sync, fmt.Errorf("receiving fd from sync %q failed: %w", sync.Type, err)
+		}
+		sync.File = file
+	}
+	return sync, nil
+}
 
-		return &ierr
+func readSyncFull(pipe *os.File, expected syncType) (syncT, error) {
+	sync, err := doReadSync(pipe)
+	if err != nil {
+		return sync, err
+	}
+	if sync.Type != expected {
+		return sync, fmt.Errorf("unexpected synchronisation flag: got %q, expected %q", sync.Type, expected)
 	}
+	return sync, nil
+}
 
-	if procSync.Type != expected {
-		return errors.New("invalid synchronisation flag from parent")
+func readSync(pipe *os.File, expected syncType) error {
+	sync, err := readSyncFull(pipe, expected)
+	if err != nil {
+		return err
+	}
+	if sync.Arg != nil {
+		return fmt.Errorf("sync %q had unexpected argument passed: %q", expected, string(*sync.Arg))
+	}
+	if sync.File != nil {
+		_ = sync.File.Close()
+		return fmt.Errorf("sync %q had unexpected file passed", sync.Type)
 	}
 	return nil
 }
 
 // parseSync runs the given callback function on each syncT received from the
 // child. It will return once io.EOF is returned from the given pipe.
-func parseSync(pipe io.Reader, fn func(*syncT) error) error {
-	dec := json.NewDecoder(pipe)
+func parseSync(pipe *os.File, fn func(*syncT) error) error {
 	for {
-		var sync syncT
-		if err := dec.Decode(&sync); err != nil {
+		sync, err := doReadSync(pipe)
+		if err != nil {
 			if errors.Is(err, io.EOF) {
 				break
 			}
 			return err
 		}
-
-		// We handle this case outside fn for cleanliness reasons.
-		var ierr *initError
-		if sync.Type == procError {
-			if err := dec.Decode(&ierr); err != nil && !errors.Is(err, io.EOF) {
-				return fmt.Errorf("error decoding proc error from init: %w", err)
-			}
-			if ierr != nil {
-				return ierr
-			}
-			// Programmer error.
-			panic("No error following JSON procError payload.")
-		}
-
 		if err := fn(&sync); err != nil {
 			return err
 		}
diff --git a/libcontainer/utils/cmsg.go b/libcontainer/utils/cmsg.go
index fd9326cb59a..c4c122e41fe 100644
--- a/libcontainer/utils/cmsg.go
+++ b/libcontainer/utils/cmsg.go
@@ -19,13 +19,14 @@ package utils
 import (
 	"fmt"
 	"os"
+	"runtime"
 
 	"golang.org/x/sys/unix"
 )
 
-// MaxNameLen is the maximum length of the name of a file descriptor being
-// sent using SendFd. The name of the file handle returned by RecvFd will never
-// be larger than this value.
+// MaxNameLen is the maximum length of the name of a file descriptor being sent
+// using SendFd. The name of the file handle returned by RecvFile will never be
+// larger than this value.
 const MaxNameLen = 4096
 
 // oobSpace is the size of the oob slice required to store a single FD. Note
@@ -33,26 +34,21 @@ const MaxNameLen = 4096
 // so sizeof(fd) = 4.
 var oobSpace = unix.CmsgSpace(4)
 
-// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
+// RecvFile waits for a file descriptor to be sent over the given AF_UNIX
 // socket. The file name of the remote file descriptor will be recreated
 // locally (it is sent as non-auxiliary data in the same payload).
-func RecvFd(socket *os.File) (*os.File, error) {
-	// For some reason, unix.Recvmsg uses the length rather than the capacity
-	// when passing the msg_controllen and other attributes to recvmsg.  So we
-	// have to actually set the length.
+func RecvFile(socket *os.File) (_ *os.File, Err error) {
 	name := make([]byte, MaxNameLen)
 	oob := make([]byte, oobSpace)
 
 	sockfd := socket.Fd()
-	n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0)
+	n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC)
 	if err != nil {
 		return nil, err
 	}
-
 	if n >= MaxNameLen || oobn != oobSpace {
 		return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
 	}
-
 	// Truncate.
 	name = name[:n]
 	oob = oob[:oobn]
@@ -61,36 +57,61 @@ func RecvFd(socket *os.File) (*os.File, error) {
 	if err != nil {
 		return nil, err
 	}
-	if len(scms) != 1 {
-		return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
+
+	// We cannot control how many SCM_RIGHTS we receive, and upon receiving
+	// them all of the descriptors are installed in our fd table, so we need to
+	// parse all of the SCM_RIGHTS we received in order to close all of the
+	// descriptors on error.
+	var fds []int
+	defer func() {
+		for i, fd := range fds {
+			// Only close 0 if err != nil, and close everything else.
+			if i != 0 || err != nil {
+				_ = unix.Close(fd)
+			}
+		}
+	}()
+	var lastErr error
+	for _, scm := range scms {
+		if scm.Header.Type == unix.SCM_RIGHTS {
+			scmFds, err := unix.ParseUnixRights(&scm)
+			if err != nil {
+				lastErr = err
+			} else {
+				fds = append(fds, scmFds...)
+			}
+		}
+	}
+	if lastErr != nil {
+		return nil, lastErr
 	}
-	scm := scms[0]
 
-	fds, err := unix.ParseUnixRights(&scm)
-	if err != nil {
-		return nil, err
+	// We do this after collecting the fds to make sure we close them all when
+	// returning an error here.
+	if len(scms) != 1 {
+		return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
 	}
 	if len(fds) != 1 {
 		return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
 	}
-	fd := uintptr(fds[0])
-
-	return os.NewFile(fd, string(name)), nil
+	return os.NewFile(uintptr(fds[0]), string(name)), nil
 }
 
-// SendFd sends a file descriptor over the given AF_UNIX socket. In
-// addition, the file.Name() of the given file will also be sent as
-// non-auxiliary data in the same payload (allowing to send contextual
-// information for a file descriptor).
-func SendFd(socket *os.File, name string, fd uintptr) error {
+// SendFile sends a file over the given AF_UNIX socket. file.Name() is also
+// included so that if the other end uses RecvFile, the file will have the same
+// name information.
+func SendFile(socket *os.File, file *os.File) error {
+	name := file.Name()
 	if len(name) >= MaxNameLen {
 		return fmt.Errorf("sendfd: filename too long: %s", name)
 	}
-	return SendFds(socket, []byte(name), int(fd))
+	err := SendRawFd(socket, name, file.Fd())
+	runtime.KeepAlive(file)
+	return err
 }
 
-// SendFds sends a list of files descriptor and msg over the given AF_UNIX socket.
-func SendFds(socket *os.File, msg []byte, fds ...int) error {
-	oob := unix.UnixRights(fds...)
-	return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0)
+// SendRawFd sends a specific file descriptor over the given AF_UNIX socket.
+func SendRawFd(socket *os.File, msg string, fd uintptr) error {
+	oob := unix.UnixRights(int(fd))
+	return unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0)
 }
diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go
index 6b9a7be038f..ca520b63b36 100644
--- a/libcontainer/utils/utils_unix.go
+++ b/libcontainer/utils/utils_unix.go
@@ -91,7 +91,7 @@ func CloseExecFrom(minFd int) error {
 }
 
 // NewSockPair returns a new unix socket pair
-func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
+func NewSockPair(name string) (parent, child *os.File, err error) {
 	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
 	if err != nil {
 		return nil, nil, err
diff --git a/tty.go b/tty.go
index fba3e025bc0..c101aacb78b 100644
--- a/tty.go
+++ b/tty.go
@@ -100,7 +100,7 @@ func (t *tty) initHostConsole() error {
 }
 
 func (t *tty) recvtty(socket *os.File) (Err error) {
-	f, err := utils.RecvFd(socket)
+	f, err := utils.RecvFile(socket)
 	if err != nil {
 		return err
 	}

From a61cfd1b12ba9683439682d585bdc3f2105ce1d8 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 8 Aug 2023 11:42:24 +1000
Subject: [PATCH 04/11] libcontainer: seccomp: pass around *os.File for
 notifyfd

*os.File is correctly tracked by the garbage collector, and there's no
need to use raw file descriptors for this code.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/init_linux.go                    |  8 ++---
 libcontainer/seccomp/patchbpf/enosys_linux.go | 13 ++++---
 libcontainer/seccomp/seccomp_linux.go         | 35 +++++++++----------
 libcontainer/seccomp/seccomp_unsupported.go   |  7 ++--
 4 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index 39f121cd909..cb9f549460b 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -370,11 +370,11 @@ func syncParentHooks(pipe *os.File) error {
 
 // syncParentSeccomp sends the fd associated with the seccomp file descriptor
 // to the parent, and wait for the parent to do pidfd_getfd() to grab a copy.
-func syncParentSeccomp(pipe *os.File, seccompFd int) error {
-	if seccompFd >= 0 {
+func syncParentSeccomp(pipe *os.File, seccompFd *os.File) error {
+	if seccompFd == nil {
 		return nil
 	}
-	defer unix.Close(seccompFd)
+	defer seccompFd.Close()
 
 	// Tell parent to grab our fd.
 	//
@@ -385,7 +385,7 @@ func syncParentSeccomp(pipe *os.File, seccompFd int) error {
 	// before the parent gets the file descriptor would deadlock "runc init" if
 	// we allowed it for SCMP_ACT_NOTIFY). See seccomp.InitSeccomp() for more
 	// details.
-	if err := writeSyncArg(pipe, procSeccomp, seccompFd); err != nil {
+	if err := writeSyncArg(pipe, procSeccomp, seccompFd.Fd()); err != nil {
 		return err
 	}
 	// Wait for parent to tell us they've grabbed the seccompfd.
diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go
index 7fc9fd662c3..40e51533410 100644
--- a/libcontainer/seccomp/patchbpf/enosys_linux.go
+++ b/libcontainer/seccomp/patchbpf/enosys_linux.go
@@ -690,17 +690,17 @@ func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err erro
 // patches said filter to handle -ENOSYS in a much nicer manner than the
 // default libseccomp default action behaviour, and loads the patched filter
 // into the kernel for the current process.
-func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, error) {
+func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (*os.File, error) {
 	// Generate a patched filter.
 	fprog, err := enosysPatchFilter(config, filter)
 	if err != nil {
-		return -1, fmt.Errorf("error patching filter: %w", err)
+		return nil, fmt.Errorf("error patching filter: %w", err)
 	}
 
 	// Get the set of libseccomp flags set.
 	seccompFlags, noNewPrivs, err := filterFlags(config, filter)
 	if err != nil {
-		return -1, fmt.Errorf("unable to fetch seccomp filter flags: %w", err)
+		return nil, fmt.Errorf("unable to fetch seccomp filter flags: %w", err)
 	}
 
 	// Set no_new_privs if it was requested, though in runc we handle
@@ -708,15 +708,14 @@ func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int,
 	if noNewPrivs {
 		logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path")
 		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
-			return -1, fmt.Errorf("error enabling no_new_privs bit: %w", err)
+			return nil, fmt.Errorf("error enabling no_new_privs bit: %w", err)
 		}
 	}
 
 	// Finally, load the filter.
 	fd, err := sysSeccompSetFilter(seccompFlags, fprog)
 	if err != nil {
-		return -1, fmt.Errorf("error loading seccomp filter: %w", err)
+		return nil, fmt.Errorf("error loading seccomp filter: %w", err)
 	}
-
-	return fd, nil
+	return os.NewFile(uintptr(fd), "[seccomp filter]"), nil
 }
diff --git a/libcontainer/seccomp/seccomp_linux.go b/libcontainer/seccomp/seccomp_linux.go
index fed02bcedc4..2c64ebbbadd 100644
--- a/libcontainer/seccomp/seccomp_linux.go
+++ b/libcontainer/seccomp/seccomp_linux.go
@@ -6,6 +6,7 @@ package seccomp
 import (
 	"errors"
 	"fmt"
+	"os"
 
 	libseccomp "github.com/seccomp/libseccomp-golang"
 	"github.com/sirupsen/logrus"
@@ -27,17 +28,16 @@ const (
 )
 
 // InitSeccomp installs the seccomp filters to be used in the container as
-// specified in config.
-// Returns the seccomp file descriptor if any of the filters include a
-// SCMP_ACT_NOTIFY action, otherwise returns -1.
-func InitSeccomp(config *configs.Seccomp) (int, error) {
+// specified in config. Returns the seccomp file descriptor if any of the
+// filters include a SCMP_ACT_NOTIFY action.
+func InitSeccomp(config *configs.Seccomp) (*os.File, error) {
 	if config == nil {
-		return -1, errors.New("cannot initialize Seccomp - nil config passed")
+		return nil, errors.New("cannot initialize Seccomp - nil config passed")
 	}
 
 	defaultAction, err := getAction(config.DefaultAction, config.DefaultErrnoRet)
 	if err != nil {
-		return -1, errors.New("error initializing seccomp - invalid default action")
+		return nil, errors.New("error initializing seccomp - invalid default action")
 	}
 
 	// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
@@ -45,7 +45,7 @@ func InitSeccomp(config *configs.Seccomp) (int, error) {
 	for _, call := range config.Syscalls {
 		if call.Action == configs.Notify {
 			if apiLevel < 6 {
-				return -1, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel)
+				return nil, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel)
 			}
 
 			// We can't allow the write syscall to notify to the seccomp agent.
@@ -61,36 +61,36 @@ func InitSeccomp(config *configs.Seccomp) (int, error) {
 			// agent allows those syscalls to proceed, initialization works just fine and the agent can
 			// handle future read()/close() syscalls as it wanted.
 			if call.Name == "write" {
-				return -1, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall")
+				return nil, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall")
 			}
 		}
 	}
 
 	// See comment on why write is not allowed. The same reason applies, as this can mean handling write too.
 	if defaultAction == libseccomp.ActNotify {
-		return -1, errors.New("SCMP_ACT_NOTIFY cannot be used as default action")
+		return nil, errors.New("SCMP_ACT_NOTIFY cannot be used as default action")
 	}
 
 	filter, err := libseccomp.NewFilter(defaultAction)
 	if err != nil {
-		return -1, fmt.Errorf("error creating filter: %w", err)
+		return nil, fmt.Errorf("error creating filter: %w", err)
 	}
 
 	// Add extra architectures
 	for _, arch := range config.Architectures {
 		scmpArch, err := libseccomp.GetArchFromString(arch)
 		if err != nil {
-			return -1, fmt.Errorf("error validating Seccomp architecture: %w", err)
+			return nil, fmt.Errorf("error validating Seccomp architecture: %w", err)
 		}
 		if err := filter.AddArch(scmpArch); err != nil {
-			return -1, fmt.Errorf("error adding architecture to seccomp filter: %w", err)
+			return nil, fmt.Errorf("error adding architecture to seccomp filter: %w", err)
 		}
 	}
 
 	// Add extra flags.
 	for _, flag := range config.Flags {
 		if err := setFlag(filter, flag); err != nil {
-			return -1, err
+			return nil, err
 		}
 	}
 
@@ -110,25 +110,24 @@ func InitSeccomp(config *configs.Seccomp) (int, error) {
 
 	// Unset no new privs bit
 	if err := filter.SetNoNewPrivsBit(false); err != nil {
-		return -1, fmt.Errorf("error setting no new privileges: %w", err)
+		return nil, fmt.Errorf("error setting no new privileges: %w", err)
 	}
 
 	// Add a rule for each syscall
 	for _, call := range config.Syscalls {
 		if call == nil {
-			return -1, errors.New("encountered nil syscall while initializing Seccomp")
+			return nil, errors.New("encountered nil syscall while initializing Seccomp")
 		}
 
 		if err := matchCall(filter, call, defaultAction); err != nil {
-			return -1, err
+			return nil, err
 		}
 	}
 
 	seccompFd, err := patchbpf.PatchAndLoad(config, filter)
 	if err != nil {
-		return -1, fmt.Errorf("error loading seccomp filter into kernel: %w", err)
+		return nil, fmt.Errorf("error loading seccomp filter into kernel: %w", err)
 	}
-
 	return seccompFd, nil
 }
 
diff --git a/libcontainer/seccomp/seccomp_unsupported.go b/libcontainer/seccomp/seccomp_unsupported.go
index 885529dc7d0..b08a3498687 100644
--- a/libcontainer/seccomp/seccomp_unsupported.go
+++ b/libcontainer/seccomp/seccomp_unsupported.go
@@ -5,6 +5,7 @@ package seccomp
 
 import (
 	"errors"
+	"os"
 
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runtime-spec/specs-go"
@@ -13,11 +14,11 @@ import (
 var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
 
 // InitSeccomp does nothing because seccomp is not supported.
-func InitSeccomp(config *configs.Seccomp) (int, error) {
+func InitSeccomp(config *configs.Seccomp) (*os.File, error) {
 	if config != nil {
-		return -1, ErrSeccompNotEnabled
+		return nil, ErrSeccompNotEnabled
 	}
-	return -1, nil
+	return nil, nil
 }
 
 // FlagSupported tells if a provided seccomp flag is supported.

From 3ceb838730c9e84ab1163113fe760253f7b54798 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 2 Aug 2023 12:33:14 +1000
Subject: [PATCH 05/11] rootfs: use empty src for MS_REMOUNT

The kernel ignores these arguments, and passing them can lead to
confusing error messages (the old source is irrelevant for MS_REMOUNT),
as well as causing issues for a future patch where we switch to
move_mount(2).

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/rootfs_linux.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index 5386d34184e..f65df8197d2 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -1105,7 +1105,7 @@ func writeSystemProperty(key, value string) error {
 func remount(m mountEntry, rootfs string, noMountFallback bool) error {
 	return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error {
 		flags := uintptr(m.Flags | unix.MS_REMOUNT)
-		err := mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, flags, "")
+		err := mountViaFDs("", nil, m.Destination, dstFD, m.Device, flags, "")
 		if err == nil {
 			return nil
 		}
@@ -1129,7 +1129,7 @@ func remount(m mountEntry, rootfs string, noMountFallback bool) error {
 		}
 		// ... and retry the mount with flags found above.
 		flags |= uintptr(int(s.Flags) & checkflags)
-		return mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, flags, "")
+		return mountViaFDs("", nil, m.Destination, dstFD, m.Device, flags, "")
 	})
 }
 

From 39e4217f1a9c1106f96088b9831de5a40c905208 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 1 Aug 2023 20:30:53 +1000
Subject: [PATCH 06/11] nsexec: remove cgroupns special-casing

The original implementation of cgroupns had additional synchronisation
to "ensure" that the process is in the correct cgroup before unsharing
the cgroupns. This behaviour was actually never necessary, and after
commit 5110bd2fc026 ("nsenter: remove cgroupns sync mechanism") there is
no synchronisation at all, meaning that CLONE_NEWCGROUP should not get
any special treatment.

Fixes: 5110bd2fc026 ("nsenter: remove cgroupns sync mechanism")
Fixes: df3fa115f974 ("Add support for cgroup namespace")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/nsenter/nsexec.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index 22b6ea1cd21..c0b9fc367e9 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -1151,7 +1151,7 @@ void nsexec(void)
 			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
 			 * was broken, so we'll just do it the long way anyway.
 			 */
-			try_unshare(config.cloneflags & ~CLONE_NEWCGROUP, "remaining namespaces (except cgroupns)");
+			try_unshare(config.cloneflags, "remaining namespaces");
 
 			/* Ask our parent to send the mount sources fds. */
 			if (config.mountsources) {
@@ -1281,10 +1281,6 @@ void nsexec(void)
 					bail("setgroups failed");
 			}
 
-			if (config.cloneflags & CLONE_NEWCGROUP) {
-				try_unshare(CLONE_NEWCGROUP, "cgroup namespace");
-			}
-
 			write_log(DEBUG, "signal completion to stage-0");
 			s = SYNC_CHILD_FINISH;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s))

From d84e1502538dee4b4c67bbd3b2343158c8191f86 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Fri, 4 Aug 2023 11:59:31 +1000
Subject: [PATCH 07/11] nsexec: migrate memfd /proc/self/exe logic to Go code

This allow us to remove the amount of C code in runc quite
substantially, as well as removing a whole execve(2) from the nsexec
path because we no longer spawn "runc init" only to re-exec "runc init"
after doing the clone.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/container_linux.go      | 177 ++++++++-
 libcontainer/nsenter/cloned_binary.c | 567 ---------------------------
 libcontainer/nsenter/nsexec.c        |  11 -
 libcontainer/system/linux.go         |  41 ++
 4 files changed, 198 insertions(+), 598 deletions(-)
 delete mode 100644 libcontainer/nsenter/cloned_binary.c

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 889dadd34d1..9b4bc9c033a 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -45,6 +45,7 @@ type Container struct {
 	state                containerState
 	created              time.Time
 	fifo                 *os.File
+	safeExeFile          *os.File
 }
 
 // State represents a running container's state
@@ -316,6 +317,12 @@ func (c *Container) start(process *Process) (retErr error) {
 	if err != nil {
 		return fmt.Errorf("unable to create new parent process: %w", err)
 	}
+	// This is no longer needed after the process has been spawned. We also
+	// want to make sure that (especially in the case of O_TMPFILE descriptors)
+	// that we use a new copy for each execution, because an attacker
+	// overwriting our copy would be just as bad as overwiting the host runc
+	// binary if we re-use the copy.
+	defer c.clearSafeExe()
 
 	logsDone := parent.forwardChildLogs()
 	if logsDone != nil {
@@ -441,6 +448,125 @@ func (c *Container) includeExecFifo(cmd *exec.Cmd) error {
 	return nil
 }
 
+func (c *Container) clearSafeExe() {
+	if c.safeExeFile != nil {
+		_ = c.safeExeFile.Close()
+		c.safeExeFile = nil
+	}
+}
+
+// makeSafeExe makes a copy of /proc/self/exe that is safe to use when
+// executing inside a container. On modern kernels, this is a locked executable
+// memfd that contains a copy of /proc/self/exe. The returned string is a
+// /proc/self/fd/... path that can be used directly with "os/exec".Command().
+// For more details on why this is necessary, see CVE-2019-5736.
+func (c *Container) makeSafeExe() (path string, Err error) {
+	if c.safeExeFile == nil {
+		var err error
+		var sealFn func(**os.File) error
+
+		// Close safeExeFile if we fail to make it properly.
+		defer func() {
+			if c.safeExeFile != nil && Err != nil {
+				c.clearSafeExe()
+			}
+		}()
+
+		// First, try an executable memfd (supported since Linux 3.17).
+		c.safeExeFile, err = system.ExecutableMemfd("runc_cloned:/proc/self/exe", unix.MFD_ALLOW_SEALING|unix.MFD_CLOEXEC)
+		if err != nil {
+			logrus.Debugf("memfd cloned binary failed, falling back to O_TMPFILE: %v", err)
+		} else {
+			sealFn = func(f **os.File) error {
+				if err := (*f).Chmod(0o511); err != nil {
+					return fmt.Errorf("chmod memfd: %w", err)
+				}
+				// Try to set the newer memfd sealing flags, but we ignore
+				// errors because they are not needed and we want to continue
+				// to work on older kernels.
+				fd := (*f).Fd()
+				// F_SEAL_FUTURE_WRITE -- Linux 5.1
+				_, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, unix.F_SEAL_FUTURE_WRITE)
+				// F_SEAL_EXEC -- Linux 6.3
+				const F_SEAL_EXEC = 0x2000 //nolint:revive // this matches the unix.* name
+				_, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, F_SEAL_EXEC)
+				// Apply all original memfd seals.
+				_, err := unix.FcntlInt(fd, unix.F_ADD_SEALS, unix.F_SEAL_SEAL|unix.F_SEAL_SHRINK|unix.F_SEAL_GROW|unix.F_SEAL_WRITE)
+				return os.NewSyscallError("fcntl(F_ADD_SEALS)", err)
+			}
+		}
+
+		// Try to fallback to O_TMPFILE (supported since Linux 3.11).
+		if c.safeExeFile == nil {
+			var stat unix.Stat_t
+			c.safeExeFile, err = os.OpenFile(c.root, unix.O_TMPFILE|unix.O_RDWR|unix.O_EXCL|unix.O_CLOEXEC, 0o700)
+			if err != nil {
+				logrus.Debugf("O_TMPFILE cloned binary failed, falling back to mktemp(): %v", err)
+			} else if err := unix.Fstat(int(c.safeExeFile.Fd()), &stat); err != nil || stat.Nlink != 0 {
+				logrus.Debugf("O_TMPFILE cloned binary has non-zero nlink, falling back to mktemp(): %v", err)
+				c.clearSafeExe()
+			}
+
+			// Finally, fallback to a classic temporary file we unlink.
+			if c.safeExeFile == nil {
+				c.safeExeFile, err = os.CreateTemp(c.root, "runc.")
+				if err != nil {
+					return "", fmt.Errorf("could not clone binary: %w", err)
+				}
+				// Unlink the file and verify it was unlinked.
+				if err := os.Remove(c.safeExeFile.Name()); err != nil {
+					return "", fmt.Errorf("unlinking classic tmpfile: %w", err)
+				}
+				if err := unix.Fstat(int(c.safeExeFile.Fd()), &stat); err != nil {
+					return "", fmt.Errorf("classic tmpfile fstat: %w", err)
+				} else if stat.Nlink != 0 {
+					return "", fmt.Errorf("classic tmpfile %s has non-zero nlink after unlink", c.safeExeFile.Name())
+				}
+			}
+			sealFn = func(f **os.File) error {
+				if err := (*f).Chmod(0o511); err != nil {
+					return fmt.Errorf("chmod tmpfile: %w", err)
+				}
+				// When sealing an O_TMPFILE-style descriptor we need to
+				// re-open the path as O_PATH to clear the existing write
+				// handle we have.
+				opath, err := os.OpenFile(fmt.Sprintf("/proc/self/fd/%d", (*f).Fd()), unix.O_PATH|unix.O_CLOEXEC, 0)
+				if err != nil {
+					return fmt.Errorf("reopen tmpfile: %w", err)
+				}
+				_ = (*f).Close()
+				*f = opath
+				return nil
+			}
+		}
+
+		// Copy the contents of /proc/self/exe to the cloned fd.
+		srcExeFile, err := os.Open("/proc/self/exe")
+		if err != nil {
+			return "", fmt.Errorf("cannot open current process exe: %w", err)
+		}
+		defer srcExeFile.Close()
+		stat, err := srcExeFile.Stat()
+		if err != nil {
+			return "", fmt.Errorf("checking /proc/self/exe size: %w", err)
+		}
+		exeSize := stat.Size()
+
+		copied, err := system.Copy(c.safeExeFile, srcExeFile)
+		if err != nil {
+			return "", fmt.Errorf("copy binary: %w", err)
+		} else if copied != exeSize {
+			return "", fmt.Errorf("copied binary size mismatch: %d != %d", copied, exeSize)
+		}
+
+		// Seal the descriptor.
+		if err := sealFn(&c.safeExeFile); err != nil {
+			return "", fmt.Errorf("could not seal fd: %w", err)
+		}
+	}
+	return fmt.Sprintf("/proc/self/fd/%d", c.safeExeFile.Fd()), nil
+}
+
 func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 	parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
 	if err != nil {
@@ -454,24 +580,12 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 	}
 	logFilePair := filePair{parentLogPipe, childLogPipe}
 
-	cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
-	if !p.Init {
-		return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
-	}
-
-	// We only set up fifoFd if we're not doing a `runc exec`. The historic
-	// reason for this is that previously we would pass a dirfd that allowed
-	// for container rootfs escape (and not doing it in `runc exec` avoided
-	// that problem), but we no longer do that. However, there's no need to do
-	// this for `runc exec` so we just keep it this way to be safe.
-	if err := c.includeExecFifo(cmd); err != nil {
-		return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
+	exePath, err := c.makeSafeExe()
+	if err != nil {
+		return nil, fmt.Errorf("unable to create safe /proc/self/exe clone: %w", err)
 	}
-	return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
-}
 
-func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd {
-	cmd := exec.Command("/proc/self/exe", "init")
+	cmd := exec.Command(exePath, "init")
 	cmd.Args[0] = os.Args[0]
 	cmd.Stdin = p.Stdin
 	cmd.Stdout = p.Stdout
@@ -500,13 +614,36 @@ func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLog
 		"_LIBCONTAINER_LOGLEVEL="+p.LogLevel,
 	)
 
-	// NOTE: when running a container with no PID namespace and the parent process spawning the container is
-	// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
-	// even with the parent still running.
+	// Due to a Go stdlib bug, we need to add c.safeExeFile to the set of
+	// ExtraFiles otherwise it is possible for the stdlib to clobber the fd
+	// during forkAndExecInChild1 and replace it with some other file that
+	// might be malicious. This is less than ideal (because the descriptor will
+	// be non-O_CLOEXEC) however we have protections in "runc init" to stop us
+	// from leaking extra file descriptors.
+	//
+	// See <https://github.com/golang/go/issues/61751>.
+	cmd.ExtraFiles = append(cmd.ExtraFiles, c.safeExeFile)
+
+	// NOTE: when running a container with no PID namespace and the parent
+	//       process spawning the container is PID1 the pdeathsig is being
+	//       delivered to the container's init process by the kernel for some
+	//       reason even with the parent still running.
 	if c.config.ParentDeathSignal > 0 {
 		cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
 	}
-	return cmd
+
+	if p.Init {
+		// We only set up fifoFd if we're not doing a `runc exec`. The historic
+		// reason for this is that previously we would pass a dirfd that allowed
+		// for container rootfs escape (and not doing it in `runc exec` avoided
+		// that problem), but we no longer do that. However, there's no need to do
+		// this for `runc exec` so we just keep it this way to be safe.
+		if err := c.includeExecFifo(cmd); err != nil {
+			return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
+		}
+		return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
+	}
+	return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
 }
 
 // shouldSendMountSources says whether the child process must setup bind mounts with
diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c
deleted file mode 100644
index a7f992fddd7..00000000000
--- a/libcontainer/nsenter/cloned_binary.c
+++ /dev/null
@@ -1,567 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
-/*
- * Copyright (C) 2019 Aleksa Sarai <cyphar@cyphar.com>
- * Copyright (C) 2019 SUSE LLC
- *
- * This work is dual licensed under the following licenses. You may use,
- * redistribute, and/or modify the work under the conditions of either (or
- * both) licenses.
- *
- * === Apache-2.0 ===
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * === LGPL-2.1-or-later ===
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see
- * <https://www.gnu.org/licenses/>.
- *
- */
-
-#define _GNU_SOURCE
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <string.h>
-#include <limits.h>
-#include <fcntl.h>
-#include <errno.h>
-
-#include <sched.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/statfs.h>
-#include <sys/vfs.h>
-#include <sys/mman.h>
-#include <sys/mount.h>
-#include <sys/sendfile.h>
-#include <sys/socket.h>
-#include <sys/syscall.h>
-#include <sys/wait.h>
-
-#include "ipc.h"
-#include "log.h"
-
-/* Use our own wrapper for memfd_create. */
-#ifndef SYS_memfd_create
-#  ifdef __NR_memfd_create
-#    define SYS_memfd_create __NR_memfd_create
-#  else
-/* These values come from <https://fedora.juszkiewicz.com.pl/syscalls.html>. */
-#    warning "libc is outdated -- using hard-coded SYS_memfd_create"
-#    if defined(__x86_64__)
-#      define SYS_memfd_create 319
-#    elif defined(__i386__)
-#      define SYS_memfd_create 356
-#    elif defined(__ia64__)
-#      define SYS_memfd_create 1340
-#    elif defined(__arm__)
-#      define SYS_memfd_create 385
-#    elif defined(__aarch64__)
-#      define SYS_memfd_create 279
-#    elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__)
-#      define SYS_memfd_create 360
-#    elif defined(__s390__) || defined(__s390x__)
-#      define SYS_memfd_create 350
-#    else
-#      warning "unknown architecture -- cannot hard-code SYS_memfd_create"
-#    endif
-#  endif
-#endif
-
-/* memfd_create(2) flags -- copied from <linux/memfd.h>. */
-#ifndef MFD_CLOEXEC
-#  define MFD_CLOEXEC       0x0001U
-#  define MFD_ALLOW_SEALING 0x0002U
-#endif
-#ifndef MFD_EXEC
-#  define MFD_EXEC          0x0010U
-#endif
-
-int memfd_create(const char *name, unsigned int flags)
-{
-#ifdef SYS_memfd_create
-	return syscall(SYS_memfd_create, name, flags);
-#else
-	errno = ENOSYS;
-	return -1;
-#endif
-}
-
-/* This comes directly from <linux/fcntl.h>. */
-#ifndef F_LINUX_SPECIFIC_BASE
-#  define F_LINUX_SPECIFIC_BASE 1024
-#endif
-#ifndef F_ADD_SEALS
-#  define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
-#  define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
-#endif
-#ifndef F_SEAL_SEAL
-#  define F_SEAL_SEAL          0x0001	/* prevent further seals from being set */
-#  define F_SEAL_SHRINK        0x0002	/* prevent file from shrinking */
-#  define F_SEAL_GROW          0x0004	/* prevent file from growing */
-#  define F_SEAL_WRITE         0x0008	/* prevent writes */
-#endif
-#ifndef F_SEAL_FUTURE_WRITE
-#  define F_SEAL_FUTURE_WRITE  0x0010	/* prevent future writes while mapped */
-#endif
-#ifndef F_SEAL_EXEC
-#  define F_SEAL_EXEC          0x0020	/* prevent chmod modifying exec bits */
-#endif
-
-#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY"
-#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
-/*
- * There are newer memfd seals (such as F_SEAL_FUTURE_WRITE and F_SEAL_EXEC),
- * which we use opportunistically. However, this set is the original set of
- * memfd seals, and we require them all to be set to trust our /proc/self/exe
- * if it is a memfd.
- */
-#define RUNC_MEMFD_MIN_SEALS \
-	(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
-
-static void *must_realloc(void *ptr, size_t size)
-{
-	void *old = ptr;
-	do {
-		ptr = realloc(old, size);
-	} while (!ptr);
-	return ptr;
-}
-
-/*
- * Verify whether we are currently in a self-cloned program (namely, is
- * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather
- * for shmem files), and we want to be sure it's actually sealed.
- */
-static int is_self_cloned(void)
-{
-	int fd, seals = 0, is_cloned = false;
-	struct stat statbuf = { };
-	struct statfs fsbuf = { };
-
-	fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
-	if (fd < 0) {
-		write_log(ERROR, "cannot open runc binary for reading: open /proc/self/exe: %m");
-		return -ENOTRECOVERABLE;
-	}
-
-	/*
-	 * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
-	 * this, because you cannot write to a sealed memfd no matter what.
-	 */
-	seals = fcntl(fd, F_GET_SEALS);
-	if (seals >= 0) {
-		write_log(DEBUG, "checking /proc/self/exe memfd seals: 0x%x", seals);
-		is_cloned = (seals & RUNC_MEMFD_MIN_SEALS) == RUNC_MEMFD_MIN_SEALS;
-		if (is_cloned)
-			goto out;
-	}
-
-	/*
-	 * All other forms require CLONED_BINARY_ENV, since they are potentially
-	 * writeable (or we can't tell if they're fully safe) and thus we must
-	 * check the environment as an extra layer of defence.
-	 */
-	if (!getenv(CLONED_BINARY_ENV)) {
-		is_cloned = false;
-		goto out;
-	}
-
-	/*
-	 * Is the binary on a read-only filesystem? We can't detect bind-mounts in
-	 * particular (in-kernel they are identical to regular mounts) but we can
-	 * at least be sure that it's read-only. In addition, to make sure that
-	 * it's *our* bind-mount we check CLONED_BINARY_ENV.
-	 */
-	if (fstatfs(fd, &fsbuf) >= 0)
-		is_cloned |= (fsbuf.f_flags & MS_RDONLY);
-
-	/*
-	 * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
-	 * which appears to have a borked backport of F_GET_SEALS. Either way,
-	 * having a file which has no hardlinks indicates that we aren't using
-	 * a host-side "runc" binary and this is something that a container
-	 * cannot fake (because unlinking requires being able to resolve the
-	 * path that you want to unlink).
-	 */
-	if (fstat(fd, &statbuf) >= 0)
-		is_cloned |= (statbuf.st_nlink == 0);
-
-out:
-	close(fd);
-	return is_cloned;
-}
-
-/* Read a given file into a new buffer, and providing the length. */
-static char *read_file(char *path, size_t *length)
-{
-	int fd;
-	char buf[4096], *copy = NULL;
-
-	if (!length)
-		return NULL;
-
-	fd = open(path, O_RDONLY | O_CLOEXEC);
-	if (fd < 0)
-		return NULL;
-
-	*length = 0;
-	for (;;) {
-		ssize_t n;
-
-		n = read(fd, buf, sizeof(buf));
-		if (n < 0)
-			goto error;
-		if (!n)
-			break;
-
-		copy = must_realloc(copy, (*length + n) * sizeof(*copy));
-		memcpy(copy + *length, buf, n);
-		*length += n;
-	}
-	close(fd);
-	return copy;
-
-error:
-	close(fd);
-	free(copy);
-	return NULL;
-}
-
-/*
- * A poor-man's version of "xargs -0". Basically parses a given block of
- * NUL-delimited data, within the given length and adds a pointer to each entry
- * to the array of pointers.
- */
-static int parse_xargs(char *data, int data_length, char ***output)
-{
-	int num = 0;
-	char *cur = data;
-
-	if (!data || *output != NULL)
-		return -1;
-
-	while (cur < data + data_length) {
-		num++;
-		*output = must_realloc(*output, (num + 1) * sizeof(**output));
-		(*output)[num - 1] = cur;
-		cur += strlen(cur) + 1;
-	}
-	(*output)[num] = NULL;
-	return num;
-}
-
-/*
- * "Parse" out argv from /proc/self/cmdline.
- * This is necessary because we are running in a context where we don't have a
- * main() that we can just get the arguments from.
- */
-static int fetchve(char ***argv)
-{
-	char *cmdline = NULL;
-	size_t cmdline_size;
-
-	cmdline = read_file("/proc/self/cmdline", &cmdline_size);
-	if (!cmdline)
-		goto error;
-
-	if (parse_xargs(cmdline, cmdline_size, argv) <= 0)
-		goto error;
-
-	return 0;
-
-error:
-	free(cmdline);
-	return -EINVAL;
-}
-
-enum {
-	EFD_NONE = 0,
-	EFD_MEMFD,
-	EFD_FILE,
-};
-
-/*
- * This comes from <linux/fcntl.h>. We can't hard-code __O_TMPFILE because it
- * changes depending on the architecture. If we don't have O_TMPFILE we always
- * have the mkostemp(3) fallback.
- */
-#ifndef O_TMPFILE
-#  if defined(__O_TMPFILE) && defined(O_DIRECTORY)
-#    define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
-#  endif
-#endif
-
-static inline bool is_memfd_unsupported_error(int err)
-{
-	/*
-	 * - ENOSYS is obviously an "unsupported" error.
-	 *
-	 * - EINVAL could be hit if MFD_EXEC is not supported (pre-6.3 kernel),
-	 *   but it can also be hit if vm.memfd_noexec=2 (in kernels without
-	 *   [1] applied) and the flags does not contain MFD_EXEC. However,
-	 *   there was a bug in the original 6.3 implementation of
-	 *   vm.memfd_noexec=2, which meant that MFD_EXEC would work even in
-	 *   the "strict" mode. Because we try MFD_EXEC first, we won't get
-	 *   EINVAL in the vm.memfd_noexec=2 case (which means we don't need to
-	 *   figure out whether to log the message about memfd_create).
-	 *
-	 * - EACCES is returned in kernels that contain [1] in the
-	 *   vm.memfd_noexec=2 case.
-	 *
-	 * At time of writing, [1] is not in Linus's tree and it't not clear if
-	 * it will be backported to stable, so what exact versions apply here
-	 * is unclear. But the bug is present in 6.3-6.5 at the very least.
-	 *
-	 * [1]: https://lore.kernel.org/all/20230705063315.3680666-2-jeffxu@google.com/
-	 */
-	if (err == EACCES)
-		write_log(INFO,
-			  "memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE");
-	return err == ENOSYS || err == EINVAL || err == EACCES;
-}
-
-static int make_execfd(int *fdtype)
-{
-	int fd = -1;
-	char template[PATH_MAX] = { 0 };
-	char *prefix = getenv("_LIBCONTAINER_STATEDIR");
-
-	if (!prefix || *prefix != '/')
-		prefix = "/tmp";
-	if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
-		return -1;
-
-	/*
-	 * Now try memfd, it's much nicer than actually creating a file in STATEDIR
-	 * since it's easily detected thanks to sealing and also doesn't require
-	 * assumptions about STATEDIR.
-	 */
-	*fdtype = EFD_MEMFD;
-	/*
-	 * On newer kernels we should set MFD_EXEC to indicate we need +x
-	 * permissions. Otherwise an admin with vm.memfd_noexec=1 would subtly
-	 * break runc. vm.memfd_noexec=2 is a little bit more complicated, see the
-	 * comment in is_memfd_unsupported_error() -- the upshot is that doing it
-	 * this way works, but only because of two overlapping bugs in the sysctl
-	 * implementation.
-	 */
-	fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING);
-	if (fd < 0 && is_memfd_unsupported_error(errno))
-		fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
-	if (fd >= 0)
-		return fd;
-	if (!is_memfd_unsupported_error(errno))
-		goto error;
-
-#ifdef O_TMPFILE
-	/*
-	 * Try O_TMPFILE to avoid races where someone might snatch our file. Note
-	 * that O_EXCL isn't actually a security measure here (since you can just
-	 * fd re-open it and clear O_EXCL).
-	 */
-	*fdtype = EFD_FILE;
-	fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
-	if (fd >= 0) {
-		struct stat statbuf = { };
-		bool working_otmpfile = false;
-
-		/*
-		 * open(2) ignores unknown O_* flags -- yeah, I was surprised when I
-		 * found this out too. As a result we can't check for EINVAL. However,
-		 * if we get nlink != 0 (or EISDIR) then we know that this kernel
-		 * doesn't support O_TMPFILE.
-		 */
-		if (fstat(fd, &statbuf) >= 0)
-			working_otmpfile = (statbuf.st_nlink == 0);
-
-		if (working_otmpfile)
-			return fd;
-
-		/* Pretend that we got EISDIR since O_TMPFILE failed. */
-		close(fd);
-		errno = EISDIR;
-	}
-	if (errno != EISDIR)
-		goto error;
-#endif /* defined(O_TMPFILE) */
-
-	/*
-	 * Our final option is to create a temporary file the old-school way, and
-	 * then unlink it so that nothing else sees it by accident.
-	 */
-	*fdtype = EFD_FILE;
-	fd = mkostemp(template, O_CLOEXEC);
-	if (fd >= 0) {
-		if (unlink(template) >= 0)
-			return fd;
-		close(fd);
-	}
-
-error:
-	*fdtype = EFD_NONE;
-	return -1;
-}
-
-static int seal_execfd(int *fd, int fdtype)
-{
-	switch (fdtype) {
-	case EFD_MEMFD:{
-			/*
-			 * Try to seal with newer seals, but we ignore errors because older
-			 * kernels don't support some of them. For container security only
-			 * RUNC_MEMFD_MIN_SEALS are strictly required, but the rest are
-			 * nice-to-haves. We apply RUNC_MEMFD_MIN_SEALS at the end because it
-			 * contains F_SEAL_SEAL.
-			 */
-			int __attribute__((unused)) _err1 = fcntl(*fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE);	// Linux 5.1
-			int __attribute__((unused)) _err2 = fcntl(*fd, F_ADD_SEALS, F_SEAL_EXEC);	// Linux 6.3
-			return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_MIN_SEALS);
-		}
-	case EFD_FILE:{
-			/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
-			int newfd;
-			char fdpath[PATH_MAX] = { 0 };
-
-			if (fchmod(*fd, 0100) < 0)
-				return -1;
-
-			if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
-				return -1;
-
-			newfd = open(fdpath, O_PATH | O_CLOEXEC);
-			if (newfd < 0)
-				return -1;
-
-			close(*fd);
-			*fd = newfd;
-			return 0;
-		}
-	default:
-		break;
-	}
-	return -1;
-}
-
-static ssize_t fd_to_fd(int outfd, int infd)
-{
-	ssize_t total = 0;
-	char buffer[4096];
-
-	for (;;) {
-		ssize_t nread, nwritten = 0;
-
-		nread = read(infd, buffer, sizeof(buffer));
-		if (nread < 0)
-			return -1;
-		if (!nread)
-			break;
-
-		do {
-			ssize_t n = write(outfd, buffer + nwritten, nread - nwritten);
-			if (n < 0)
-				return -1;
-			nwritten += n;
-		} while (nwritten < nread);
-
-		total += nwritten;
-	}
-
-	return total;
-}
-
-static int clone_binary(void)
-{
-	int binfd, execfd;
-	struct stat statbuf = { };
-	size_t sent = 0;
-	int fdtype = EFD_NONE;
-
-	execfd = make_execfd(&fdtype);
-	if (execfd < 0 || fdtype == EFD_NONE)
-		return -ENOTRECOVERABLE;
-
-	binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
-	if (binfd < 0)
-		goto error;
-
-	if (fstat(binfd, &statbuf) < 0)
-		goto error_binfd;
-
-	while (sent < statbuf.st_size) {
-		int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent);
-		if (n < 0) {
-			/* sendfile can fail so we fallback to a dumb user-space copy. */
-			n = fd_to_fd(execfd, binfd);
-			if (n < 0)
-				goto error_binfd;
-		}
-		sent += n;
-	}
-	close(binfd);
-	if (sent != statbuf.st_size)
-		goto error;
-
-	if (seal_execfd(&execfd, fdtype) < 0)
-		goto error;
-
-	return execfd;
-
-error_binfd:
-	close(binfd);
-error:
-	close(execfd);
-	return -EIO;
-}
-
-/* Get cheap access to the environment. */
-extern char **environ;
-
-int ensure_cloned_binary(void)
-{
-	int execfd;
-	char **argv = NULL;
-
-	/* Check that we're not self-cloned, and if we are then bail. */
-	int cloned = is_self_cloned();
-	if (cloned > 0 || cloned == -ENOTRECOVERABLE)
-		return cloned;
-
-	if (fetchve(&argv) < 0)
-		return -EINVAL;
-
-	execfd = clone_binary();
-	if (execfd < 0)
-		return -EIO;
-
-	if (putenv(CLONED_BINARY_ENV "=1"))
-		goto error;
-
-	fexecve(execfd, argv, environ);
-error:
-	close(execfd);
-	return -ENOEXEC;
-}
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index c0b9fc367e9..d94e2cbac04 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -525,9 +525,6 @@ void join_namespaces(char *nslist)
 	free(namespaces);
 }
 
-/* Defined in cloned_binary.c. */
-extern int ensure_cloned_binary(void);
-
 static inline int sane_kill(pid_t pid, int signum)
 {
 	if (pid > 0)
@@ -771,14 +768,6 @@ void nsexec(void)
 		return;
 	}
 
-	/*
-	 * We need to re-exec if we are not in a cloned binary. This is necessary
-	 * to ensure that containers won't be able to access the host binary
-	 * through /proc/self/exe. See CVE-2019-5736.
-	 */
-	if (ensure_cloned_binary() < 0)
-		bail("could not ensure we are a cloned binary");
-
 	/*
 	 * Inform the parent we're past initial setup.
 	 * For the other side of this, see initWaiter.
diff --git a/libcontainer/system/linux.go b/libcontainer/system/linux.go
index e1d6eb18034..200d375e840 100644
--- a/libcontainer/system/linux.go
+++ b/libcontainer/system/linux.go
@@ -4,10 +4,14 @@
 package system
 
 import (
+	"errors"
+	"fmt"
+	"io"
 	"os"
 	"os/exec"
 	"unsafe"
 
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
@@ -102,3 +106,40 @@ func GetSubreaper() (int, error) {
 
 	return int(i), nil
 }
+
+func ExecutableMemfd(comment string, flags int) (*os.File, error) {
+	// Try to use MFD_EXEC first. On pre-6.3 kernels we get -EINVAL for this
+	// flag. On post-6.3 kernels, with vm.memfd_noexec=1 this ensures we get an
+	// executable memfd. For vm.memfd_noexec=2 this is a bit more complicated.
+	// The original vm.memfd_noexec=2 implementation incorrectly silently
+	// allowed MFD_EXEC[1] -- this should be fixed in 6.6. On 6.6 and newer
+	// kernels, we will get -EACCES if we try to use MFD_EXEC with
+	// vm.memfd_noexec=2 (for 6.3-6.5, -EINVAL was the intended return value).
+	//
+	// The upshot is we only need to retry without MFD_EXEC on -EINVAL because
+	// it just so happens that passing MFD_EXEC bypasses vm.memfd_noexec=2 on
+	// kernels where -EINVAL is actually a security denial.
+	memfd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC)
+	if errors.Is(err, unix.EINVAL) {
+		memfd, err = unix.MemfdCreate(comment, flags)
+	}
+	if err != nil {
+		if errors.Is(err, unix.EACCES) {
+			logrus.Infof("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE")
+		}
+		err := os.NewSyscallError("memfd_create", err)
+		return nil, fmt.Errorf("failed to create executable memfd: %w", err)
+	}
+	return os.NewFile(uintptr(memfd), "/memfd:"+comment), nil
+}
+
+// Copy is a wrapper around io.Copy that continues the copy despite EINTR.
+func Copy(dst io.Writer, src io.Reader) (written int64, err error) {
+	for {
+		n, err := io.Copy(dst, src)
+		written += n
+		if !errors.Is(err, unix.EINTR) {
+			return written, err
+		}
+	}
+}

From ffce2c0ef6820a0cf45dbd5987d7b1a584d9641c Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Sun, 6 Aug 2023 13:59:06 +1000
Subject: [PATCH 08/11] configs: fix idmapped mounts json field names

In the runc state JSON we always use snake_case. This is a no-op change,
but it will cause any existing container state files to be incorrectly
parsed. Luckily, commit fbf183c6f8c4 ("Add uid and gid mappings to
mounts") has never been in a runc release so we can change this before a
1.2.z release.

Fixes: fbf183c6f8c4 ("Add uid and gid mappings to mounts")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/configs/mount_linux.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcontainer/configs/mount_linux.go b/libcontainer/configs/mount_linux.go
index a4275df3a03..6d4106de0c6 100644
--- a/libcontainer/configs/mount_linux.go
+++ b/libcontainer/configs/mount_linux.go
@@ -34,13 +34,13 @@ type Mount struct {
 	// Note that, the underlying filesystem should support this feature to be
 	// used.
 	// Every mount point could have its own mapping.
-	UIDMappings []IDMap `json:"uidMappings,omitempty"`
+	UIDMappings []IDMap `json:"uid_mappings,omitempty"`
 
 	// GIDMappings is used to changing file group owners w/o calling chown.
 	// Note that, the underlying filesystem should support this feature to be
 	// used.
 	// Every mount point could have its own mapping.
-	GIDMappings []IDMap `json:"gidMappings,omitempty"`
+	GIDMappings []IDMap `json:"gid_mappings,omitempty"`
 }
 
 func (m *Mount) IsBind() bool {

From acaf423d7154b6707de9a56a24ac61117d7f9b98 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 1 Aug 2023 20:07:49 +1000
Subject: [PATCH 09/11] libcontainer: remove all mount logic from nsexec

With open_tree(OPEN_TREE_CLONE), it is possible to implement both the
id-mapped mounts and bind-mount source file descriptor logic entirely in
Go without requiring any complicated handling from nsexec.

This allows us to remove the amount of C code we have in nsexec, as well
as simplifying a whole host of places that were made more complicated
with the addition of id-mapped mounts and the bind sourcefd logic. The
one downside of this is that the bind sourcefd feature now depends on
Linux 5.4.

In addition, we can easily add support for id-mappings that don't match
the container's user namespace. The approach taken here is to use Go's
officially supported mechanism for spawning a process in a user
namespace, but (ab)use PTRACE_TRACEME to avoid actually having to exec a
different process. The most efficient way to implement this would be to
do clone() in cgo directly to run a function that just does
kill(getpid(), SIGSTOP) -- we can always switch to that if it turns out
this approach is too slow.

This also includes a partial fix for a bug in the handling of idmap
mounts and mount propagation. In short, the issue is that because we do
OPEN_TREE_CLONE on the host, the RootPropagation flag does not apply
(nor any other mount propagation flags configured in config.json) and
thus recursive bind-mounts can and will be leaked to the host. Because
this patch switches the feature from 9c444070ec7b ("Open bind mount
sources from the host userns") to use OPEN_TREE_CLONE, this resulted in
all bind-mounts having this behaviour. The partial fix here is to try to
emulate the behaviour of RootPropagation for bind-mounts from the host.

It turns out that bind-mounts inside containers were broken when using
the fd-passing feature from 9c444070ec7b ("Open bind mount sources from
the host userns") because the file descriptor opens were done before we
start doing any mounts in rootfs_linux.go. The solution for this is more
involved and is fixed in a separate patch. At the very least, this patch
doesn't worsen the mount propagation situation.

Fixes: fda12ab1014e ("Support idmap mounts on volumes")
Fixes: 9c444070ec7b ("Open bind mount sources from the host userns")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/configs/mount_linux.go        |   8 +
 libcontainer/container_linux.go            | 287 ++++++++++-----------
 libcontainer/factory_linux.go              |  15 --
 libcontainer/init_linux.go                 |  34 +--
 libcontainer/message_linux.go              |   2 -
 libcontainer/mount_linux.go                |  52 +++-
 libcontainer/nsenter/nsexec.c              | 268 -------------------
 libcontainer/rootfs_linux.go               |  77 ++----
 libcontainer/standard_init_linux.go        |  13 +-
 libcontainer/userns/usernsfd_linux.go      | 134 ++++++++++
 libcontainer/userns/usernsfd_linux_test.go |  52 ++++
 11 files changed, 409 insertions(+), 533 deletions(-)
 create mode 100644 libcontainer/userns/usernsfd_linux.go
 create mode 100644 libcontainer/userns/usernsfd_linux_test.go

diff --git a/libcontainer/configs/mount_linux.go b/libcontainer/configs/mount_linux.go
index 6d4106de0c6..0cec2e5aed4 100644
--- a/libcontainer/configs/mount_linux.go
+++ b/libcontainer/configs/mount_linux.go
@@ -6,6 +6,14 @@ type Mount struct {
 	// Source path for the mount.
 	Source string `json:"source"`
 
+	// SourceFd is a open_tree(2)-style file descriptor created with the new
+	// mount API. If non-nil, this indicates that SourceFd is a configured
+	// mountfd that should be used for the mount into the container.
+	//
+	// Note that you cannot use /proc/self/fd/$n-style paths with
+	// open_tree(2)-style file descriptors.
+	SourceFd *int
+
 	// Destination path for the mount inside the container.
 	Destination string `json:"destination"`
 
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 9b4bc9c033a..932a415cb8a 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -2,7 +2,6 @@ package libcontainer
 
 import (
 	"bytes"
-	"encoding/json"
 	"errors"
 	"fmt"
 	"io"
@@ -16,6 +15,7 @@ import (
 	"sync"
 	"time"
 
+	"github.com/moby/sys/mountinfo"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/sirupsen/logrus"
 	"github.com/vishvananda/netlink/nl"
@@ -26,6 +26,7 @@ import (
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/intelrdt"
 	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/opencontainers/runc/libcontainer/userns"
 	"github.com/opencontainers/runc/libcontainer/utils"
 )
 
@@ -646,108 +647,149 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 	return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
 }
 
-// shouldSendMountSources says whether the child process must setup bind mounts with
-// the source pre-opened (O_PATH) in the host user namespace.
-// See https://github.com/opencontainers/runc/issues/2484
-func (c *Container) shouldSendMountSources() bool {
-	// Passing the mount sources via SCM_RIGHTS is only necessary when
-	// both userns and mntns are active.
-	if !c.config.Namespaces.Contains(configs.NEWUSER) ||
-		!c.config.Namespaces.Contains(configs.NEWNS) {
-		return false
-	}
-
-	// nsexec.c send_mountsources() requires setns(mntns) capabilities
-	// CAP_SYS_CHROOT and CAP_SYS_ADMIN.
-	if c.config.RootlessEUID {
-		return false
-	}
-
-	// We need to send sources if there are non-idmap bind-mounts.
-	for _, m := range c.config.Mounts {
-		if m.IsBind() && !m.IsIDMapped() {
-			return true
+// remapMountSources tries to remap any applicable mount sources in the
+// container configuration to use open_tree(2)-style mountfds. This allows
+// containers to have bind-mounts from source directories the container process
+// cannot resolve (#2576) as well as apply MOUNT_ATTR_IDMAP to mounts (which
+// requires privileges the container process might not have).
+//
+// The core idea is that we create a mountfd with the right configuration, and
+// then replace the source with a reference to the file descriptor. Ideally
+// this would be /proc/self/fd/<the-fd-runc-init-will-see>, but we cannot use
+// new mount API file descriptors as bind-mount sources (they run afoul of the
+// check_mnt() checks that stop us from doing bind-mounts from an fd in a
+// different namespace). So instead, we set the source to \x00<file-descriptor>
+// which is an invalid path under Linux.
+func (c *Container) remapMountSources(cmd *exec.Cmd) (Err error) {
+	nsHandles := new(userns.Handles)
+	defer nsHandles.Release()
+	for i, m := range c.config.Mounts {
+		var mountFile *os.File
+		if m.IsBind() {
+			flags := uint(unix.OPEN_TREE_CLONE | unix.O_CLOEXEC)
+			if m.Flags&unix.MS_REC == unix.MS_REC {
+				flags |= unix.AT_RECURSIVE
+			}
+			mountFd, err := unix.OpenTree(unix.AT_FDCWD, m.Source, flags)
+			if err != nil {
+				// For non-id-mapped mounts, this functionality is optional. We
+				// can just fallback to letting the rootfs code use the
+				// original source as a mountpoint.
+				if !m.IsIDMapped() {
+					logrus.Debugf("remap mount sources: skipping remap of %s due to failure: open_tree(OPEN_TREE_CLONE): %v", m.Source, err)
+					continue
+				}
+				return &os.PathError{Op: "open_tree(OPEN_TREE_CLONE)", Path: m.Source, Err: err}
+			}
+			mountFile = os.NewFile(uintptr(mountFd), m.Source+" (open_tree)")
+			// Only close the file if the remapping failed -- otherwise we keep
+			// the file open to be passed to "runc init".
+			defer func() {
+				if Err != nil {
+					_ = mountFile.Close()
+				}
+			}()
 		}
-	}
 
-	return false
-}
-
-// shouldSendIdmapSources says whether the child process must setup idmap mounts with
-// the mount_setattr already done in the host user namespace.
-func (c *Container) shouldSendIdmapSources() bool {
-	// nsexec.c mount_setattr() requires CAP_SYS_ADMIN in:
-	// * the user namespace the filesystem was mounted in;
-	// * the user namespace we're trying to idmap the mount to;
-	// * the owning user namespace of the mount namespace you're currently located in.
-	//
-	// See the comment from Christian Brauner:
-	//	https://github.com/opencontainers/runc/pull/3717#discussion_r1103607972
-	//
-	// Let's just rule out rootless, we don't have those permission in the
-	// rootless case.
-	if c.config.RootlessEUID {
-		return false
-	}
-
-	// For the time being we require userns to be in use.
-	if !c.config.Namespaces.Contains(configs.NEWUSER) {
-		return false
-	}
-
-	// We need to send sources if there are idmap bind-mounts.
-	for _, m := range c.config.Mounts {
-		if m.IsBind() && m.IsIDMapped() {
-			return true
+		if m.IsIDMapped() {
+			if mountFile == nil {
+				return fmt.Errorf("remap mount sources: invalid mount source %s: id-mapping of non-bind-mounts is not supported", m.Source)
+			}
+			usernsFile, err := nsHandles.Get(userns.Mapping{
+				UIDMappings: m.UIDMappings,
+				GIDMappings: m.GIDMappings,
+			})
+			if err != nil {
+				return fmt.Errorf("remap mount sources: failed to create userns for %s id-mapping: %w", m.Source, err)
+			}
+			defer usernsFile.Close()
+			if err := unix.MountSetattr(int(mountFile.Fd()), "", unix.AT_EMPTY_PATH, &unix.MountAttr{
+				Attr_set:  unix.MOUNT_ATTR_IDMAP,
+				Userns_fd: uint64(usernsFile.Fd()),
+			}); err != nil {
+				return fmt.Errorf("remap mount sources: failed to set IDMAP_SOURCE_ATTR on %s: %w", m.Source, err)
+			}
 		}
-	}
-
-	return false
-}
 
-func (c *Container) sendMountSources(cmd *exec.Cmd, messageSockPair filePair) error {
-	if !c.shouldSendMountSources() {
-		return nil
-	}
-
-	return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_MOUNT_FDS", func(m *configs.Mount) bool {
-		return m.IsBind() && !m.IsIDMapped()
-	})
-}
-
-func (c *Container) sendIdmapSources(cmd *exec.Cmd, messageSockPair filePair) error {
-	if !c.shouldSendIdmapSources() {
-		return nil
-	}
+		if mountFile != nil {
+			// We need to emulate the propagation behaviour when bind-mounting
+			// from a root filesystem with config.RootPropagation applied. This
+			// is needed because existing runc users expect bind-mounts to act
+			// this way (the "classic" method), regardless of the mount API
+			// used to create the bind-mount.
+			//
+			// NOTE: This explicitly does not handle configurations where there
+			// is a bind-mount source of a path from inside the container
+			// rootfs. We could do this in rootfs_linux.go but this would
+			// require doing criu-style shennanigans to try to figure out how
+			// to recreate the state in /proc/self/mountinfo. For now, it's
+			// much simpler to implement this based purely on RootPropagation.
+
+			// Same logic as prepareRoot().
+			propFlags := unix.MS_SLAVE | unix.MS_REC
+			if c.config.RootPropagation != 0 {
+				propFlags = c.config.RootPropagation
+			}
 
-	return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_IDMAP_FDS", func(m *configs.Mount) bool {
-		return m.IsBind() && m.IsIDMapped()
-	})
-}
+			// The one thing to consider is whether the RootPropagation is
+			// MS_REC and whether the mount source is the same as /. If it is
+			// MS_REC then we apply the propagation flags (nix MS_REC)
+			// recursively. Otherwise we apply the propagation flags
+			// non-recursively if m.Source is part of the / mount. We do
+			// nothing if neither is the case.
+			var setattrFlags uint
+			if propFlags&unix.MS_REC == unix.MS_REC {
+				// If the RootPropagation is recursive, then any bind-mount
+				// from the host should inherit the propagation setting of /.
+				logrus.Debugf("remap mount sources: applying recursive RootPropagation of 0x%x to %q bind-mount propagation", propFlags, m.Source)
+				setattrFlags = unix.AT_RECURSIVE
+			} else {
+				// If the RootPropagation is not recursive, only bind-mounts
+				// from paths within the / mount should inherit the propagation
+				// setting /.
+				info, err := mountinfo.GetMounts(mountinfo.ParentsFilter(m.Source))
+				if err != nil {
+					return fmt.Errorf("remap mount sources: failed to parse /proc/self/mountinfo: %w", err)
+				}
+				// If there is only a single entry with a mountpoint path of /,
+				// the source was a child of /. Otherwise, do not set the
+				// propagation setting of bind-mounts.
+				if len(info) == 1 && info[0].Mountpoint == "/" {
+					logrus.Debugf("remap mount sources: applying non-recursive RootPropagation of 0x%x to child of / %q bind-mount propagation", propFlags, m.Source)
+				} else {
+					logrus.Debugf("remap mount sources: using default mount propagation for %q bind-mount", m.Source)
+					propFlags = 0
+				}
+			}
+			if propFlags != 0 {
+				if err := unix.MountSetattr(int(mountFile.Fd()), "", unix.AT_EMPTY_PATH|setattrFlags, &unix.MountAttr{
+					Propagation: uint64(propFlags &^ unix.MS_REC),
+				}); err != nil {
+					return fmt.Errorf("remap mount sources: failed to set mount propagation of %q bind-mount to 0x%x: %w", m.Source, propFlags, err)
+				}
+			}
 
-func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envVar string, condition func(*configs.Mount) bool) error {
-	// Elements on these slices will be paired with mounts (see StartInitialization() and
-	// prepareRootfs()). These slices MUST have the same size as c.config.Mounts.
-	fds := make([]int, len(c.config.Mounts))
-	for i, m := range c.config.Mounts {
-		if !condition(m) {
-			// The -1 fd is ignored later.
-			fds[i] = -1
-			continue
+			// We have to leak these file descriptors to "runc init", which
+			// will mean that Go will set them as ~O_CLOEXEC during ForkExec,
+			// meaning that by default these would be leaked to the container
+			// process.
+			//
+			// While this is a bit scary, we have several processes to make
+			// sure this never leaks to the actual container (the SET_DUMPABLE
+			// bit, setting everything to O_CLOEXEC at the end of init, etc) --
+			// and in addition, OPEN_TREE_CLONE file descriptors are completely
+			// safe to leak to the container because they are in an anonymous
+			// mount namespace so they cannot be used to escape the container
+			// a-la CVE-2016-9962.
+			cmd.ExtraFiles = append(cmd.ExtraFiles, mountFile)
+			fd := stdioFdCount + len(cmd.ExtraFiles) - 1
+			logrus.Debugf("remapping mount source %s to fd %d", m.Source, fd)
+			c.config.Mounts[i].SourceFd = &fd
+			// Prepend \x00 to m.Source to make sure that attempts to operate
+			// on it in rootfs_linux.go fail.
+			c.config.Mounts[i].Source = "\x00" + m.Source
 		}
-
-		// The fd passed here will not be used: nsexec.c will overwrite it with dup3(). We just need
-		// to allocate a fd so that we know the number to pass in the environment variable. The fd
-		// must not be closed before cmd.Start(), so we reuse messageSockPair.child because the
-		// lifecycle of that fd is already taken care of.
-		cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child)
-		fds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1
 	}
-	fdsJSON, err := json.Marshal(fds)
-	if err != nil {
-		return fmt.Errorf("Error creating %v: %w", envVar, err)
-	}
-	cmd.Env = append(cmd.Env, envVar+"="+string(fdsJSON))
 	return nil
 }
 
@@ -759,14 +801,11 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l
 			nsMaps[ns.Type] = ns.Path
 		}
 	}
-	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard)
-	if err != nil {
-		return nil, err
-	}
-	if err := c.sendMountSources(cmd, messageSockPair); err != nil {
+	if err := c.remapMountSources(cmd); err != nil {
 		return nil, err
 	}
-	if err := c.sendIdmapSources(cmd, messageSockPair); err != nil {
+	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
+	if err != nil {
 		return nil, err
 	}
 
@@ -793,7 +832,7 @@ func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair,
 	}
 	// for setns process, we don't have to set cloneflags as the process namespaces
 	// will only be set via setns syscall
-	data, err := c.bootstrapData(0, state.NamespacePaths, initSetns)
+	data, err := c.bootstrapData(0, state.NamespacePaths)
 	if err != nil {
 		return nil, err
 	}
@@ -1183,7 +1222,7 @@ type netlinkError struct{ error }
 // such as one that uses nsenter package to bootstrap the container's
 // init process correctly, i.e. with correct namespaces, uid/gid
 // mapping etc.
-func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) {
+func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (_ io.Reader, Err error) {
 	// create the netlink message
 	r := nl.NewNetlinkRequest(int(InitMsg), 0)
 
@@ -1285,48 +1324,6 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa
 		Value: c.config.RootlessEUID,
 	})
 
-	// Bind mount source to open.
-	if it == initStandard && c.shouldSendMountSources() {
-		var mounts []byte
-		for _, m := range c.config.Mounts {
-			if m.IsBind() && !m.IsIDMapped() {
-				if strings.IndexByte(m.Source, 0) >= 0 {
-					return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
-				}
-				mounts = append(mounts, []byte(m.Source)...)
-			}
-			mounts = append(mounts, byte(0))
-		}
-
-		r.AddData(&Bytemsg{
-			Type:  MountSourcesAttr,
-			Value: mounts,
-		})
-	}
-
-	// Idmap mount sources to open.
-	if it == initStandard && c.shouldSendIdmapSources() {
-		var mounts []byte
-		for _, m := range c.config.Mounts {
-			if m.IsBind() && m.IsIDMapped() {
-				// While other parts of the code check this too (like
-				// libcontainer/specconv/spec_linux.go) we do it here also because some libcontainer
-				// users don't use those functions.
-				if strings.IndexByte(m.Source, 0) >= 0 {
-					return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
-				}
-
-				mounts = append(mounts, []byte(m.Source)...)
-			}
-			mounts = append(mounts, byte(0))
-		}
-
-		r.AddData(&Bytemsg{
-			Type:  IdmapSourcesAttr,
-			Value: mounts,
-		})
-	}
-
 	return bytes.NewReader(r.Serialize()), nil
 }
 
diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go
index 4cb4a88bf0d..54a7bec91a1 100644
--- a/libcontainer/factory_linux.go
+++ b/libcontainer/factory_linux.go
@@ -214,18 +214,3 @@ func validateID(id string) error {
 
 	return nil
 }
-
-func parseFdsFromEnv(envVar string) ([]int, error) {
-	fdsJSON := os.Getenv(envVar)
-	if fdsJSON == "" {
-		// Always return the nil slice if no fd is present.
-		return nil, nil
-	}
-
-	var fds []int
-	if err := json.Unmarshal([]byte(fdsJSON), &fds); err != nil {
-		return nil, fmt.Errorf("Error unmarshalling %v: %w", envVar, err)
-	}
-
-	return fds, nil
-}
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index cb9f549460b..d72ca8a3dc9 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -47,18 +47,6 @@ type network struct {
 	TempVethPeerName string `json:"temp_veth_peer_name"`
 }
 
-type mountFds struct {
-	// sourceFds are the fds to use as source when mounting.
-	// The slice size should be the same as container mounts, as it will be
-	// paired with them.
-	// The value -1 is used when no fd is needed for the mount.
-	// Can't have a valid fd in the same position that other slices in this struct.
-	// We need to use only one of these fds on any single mount.
-	sourceFds []int
-	// Idem sourceFds, but fds of already created idmap mounts, to use with unix.MoveMount().
-	idmapFds []int
-}
-
 // initConfig is used for transferring parameters from Exec() to Init()
 type initConfig struct {
 	Args             []string              `json:"args"`
@@ -136,18 +124,6 @@ func StartInitialization() (retErr error) {
 		return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE: %w", err)
 	}
 
-	// Get mount files (O_PATH).
-	mountSrcFds, err := parseFdsFromEnv("_LIBCONTAINER_MOUNT_FDS")
-	if err != nil {
-		return err
-	}
-
-	// Get idmap fds.
-	idmapFds, err := parseFdsFromEnv("_LIBCONTAINER_IDMAP_FDS")
-	if err != nil {
-		return err
-	}
-
 	// clear the current process's environment to clean any libcontainer
 	// specific env vars.
 	os.Clearenv()
@@ -163,10 +139,10 @@ func StartInitialization() (retErr error) {
 	}()
 
 	// If init succeeds, it will not return, hence none of the defers will be called.
-	return containerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
+	return containerInit(it, pipe, consoleSocket, fifofd, logPipeFd)
 }
 
-func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds mountFds) error {
+func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int) error {
 	var config *initConfig
 	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
 		return err
@@ -176,11 +152,6 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
 	}
 	switch t {
 	case initSetns:
-		// mount and idmap fds must be nil in this case. We don't mount while doing runc exec.
-		if mountFds.sourceFds != nil || mountFds.idmapFds != nil {
-			return errors.New("mount and idmap fds must be nil; can't mount from exec")
-		}
-
 		i := &linuxSetnsInit{
 			pipe:          pipe,
 			consoleSocket: consoleSocket,
@@ -196,7 +167,6 @@ func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, lo
 			config:        config,
 			fifoFd:        fifoFd,
 			logFd:         logFd,
-			mountFds:      mountFds,
 		}
 		return i.Init()
 	}
diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go
index 17db81a29f3..d78e0328a20 100644
--- a/libcontainer/message_linux.go
+++ b/libcontainer/message_linux.go
@@ -21,8 +21,6 @@ const (
 	RootlessEUIDAttr uint16 = 27287
 	UidmapPathAttr   uint16 = 27288
 	GidmapPathAttr   uint16 = 27289
-	MountSourcesAttr uint16 = 27290
-	IdmapSourcesAttr uint16 = 27291
 )
 
 type Int32msg struct {
diff --git a/libcontainer/mount_linux.go b/libcontainer/mount_linux.go
index da11da68ea0..8d25f7911f8 100644
--- a/libcontainer/mount_linux.go
+++ b/libcontainer/mount_linux.go
@@ -1,9 +1,12 @@
 package libcontainer
 
 import (
+	"errors"
 	"io/fs"
+	"os"
 	"strconv"
 
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
@@ -59,22 +62,57 @@ func mount(source, target, fstype string, flags uintptr, data string) error {
 
 // mountViaFDs is a unix.Mount wrapper which uses srcFD instead of source,
 // and dstFD instead of target, unless those are empty.
-// If srcFD is different than nil, its path (i.e. "/proc/self/fd/NN") will be
-// constructed by this function.
-// dstFD argument, if non-empty, is expected to be in the form of a path to an
-// opened file descriptor on procfs (i.e. "/proc/self/fd/NN").
+//
+// If srcFD is non-nil and flags contains MS_BIND, mountViaFDs will attempt to
+// do a move_mount(2) as srcFD is assumed to be a "new mount API" file
+// descriptor that cannot always be mounted using the old mount API (such as
+// with bind-mounts).
+//
+// The dstFD argument, if non-empty, is expected to be in the form of a path to
+// an opened file descriptor on procfs (i.e. "/proc/self/fd/NN").
 //
 // If case an FD is used instead of a source or a target path, the
 // corresponding path is only used to add context to an error in case
 // the mount operation has failed.
 func mountViaFDs(source string, srcFD *int, target, dstFD, fstype string, flags uintptr, data string) error {
+	dst := target
+	if dstFD != "" {
+		dst = dstFD
+	}
 	src := source
 	if srcFD != nil {
 		src = "/proc/self/fd/" + strconv.Itoa(*srcFD)
 	}
-	dst := target
-	if dstFD != "" {
-		dst = dstFD
+
+	// Do a move_mount(2) if the source is a mountfd. For MS_REMOUNT,
+	// move_mount(2) doesn't make sense (though we should never get into that
+	// case).
+	if srcFD != nil && flags&unix.MS_REMOUNT == 0 {
+		if err := unix.MoveMount(*srcFD, "", unix.AT_FDCWD, dstFD, unix.MOVE_MOUNT_F_EMPTY_PATH|unix.MOVE_MOUNT_T_SYMLINKS); err == nil {
+			// all done
+			return nil
+		} else if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.ENOSYS) {
+			// fall back to old-school mount -- even if it's almost certainly
+			// not going to work...
+			logrus.Debugf("move_mount of srcfd-based mount failed, falling back to classic mount")
+		} else {
+			// Prettify the source argument so the user knows what
+			// /proc/self/fd/... refers to here.
+			srcLink, err := os.Readlink(src)
+			if err != nil {
+				srcLink = "<unknown source path>"
+			}
+			return &mountError{
+				op:     "move_mount",
+				source: srcLink,
+				srcFD:  srcFD,
+				target: target,
+				dstFD:  dstFD,
+				flags:  flags,
+				data:   data,
+				err:    err,
+			}
+		}
 	}
 	if err := unix.Mount(src, dst, fstype, flags, data); err != nil {
 		return &mountError{
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index d94e2cbac04..45d5c291e4f 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -44,10 +44,6 @@ enum sync_t {
 	SYNC_RECVPID_ACK = 0x43,	/* PID was correctly received by parent. */
 	SYNC_GRANDCHILD = 0x44,	/* The grandchild is ready to run. */
 	SYNC_CHILD_FINISH = 0x45,	/* The child or grandchild has finished. */
-	SYNC_MOUNTSOURCES_PLS = 0x46,	/* Tell parent to send mount sources by SCM_RIGHTS. */
-	SYNC_MOUNTSOURCES_ACK = 0x47,	/* All mount sources have been sent. */
-	SYNC_MOUNT_IDMAP_PLS = 0x48,	/* Tell parent to mount idmap sources. */
-	SYNC_MOUNT_IDMAP_ACK = 0x49,	/* All idmap mounts have been done. */
 };
 
 #define STAGE_SETUP  -1
@@ -96,14 +92,6 @@ struct nlconfig_t {
 	size_t uidmappath_len;
 	char *gidmappath;
 	size_t gidmappath_len;
-
-	/* Mount sources opened outside the container userns. */
-	char *mountsources;
-	size_t mountsources_len;
-
-	/* Idmap sources opened outside the container userns which will be id mapped. */
-	char *idmapsources;
-	size_t idmapsources_len;
 };
 
 /*
@@ -120,8 +108,6 @@ struct nlconfig_t {
 #define ROOTLESS_EUID_ATTR	27287
 #define UIDMAPPATH_ATTR		27288
 #define GIDMAPPATH_ATTR		27289
-#define MOUNT_SOURCES_ATTR	27290
-#define IDMAP_SOURCES_ATTR	27291
 
 /*
  * Use the raw syscall for versions of glibc which don't include a function for
@@ -437,14 +423,6 @@ static void nl_parse(int fd, struct nlconfig_t *config)
 		case SETGROUP_ATTR:
 			config->is_setgroup = readint8(current);
 			break;
-		case MOUNT_SOURCES_ATTR:
-			config->mountsources = current;
-			config->mountsources_len = payload_len;
-			break;
-		case IDMAP_SOURCES_ATTR:
-			config->idmapsources = current;
-			config->idmapsources_len = payload_len;
-			break;
 		default:
 			bail("unknown netlink message type %d", nlattr->nla_type);
 		}
@@ -533,115 +511,6 @@ static inline int sane_kill(pid_t pid, int signum)
 		return 0;
 }
 
-/* receive_fd_sources parses env_var as an array of fd numbers and, for each element that is
- * not -1, it receives an fd via SCM_RIGHTS and dup3 it to the fd requested in
- * the element of the env var.
- */
-void receive_fd_sources(int sockfd, const char *env_var)
-{
-	char *fds, *endp;
-	long new_fd;
-
-	// This env var must be a json array of ints.
-	fds = getenv(env_var);
-
-	if (fds[0] != '[') {
-		bail("malformed %s env var: missing '['", env_var);
-	}
-	fds++;
-
-	for (endp = fds; *endp != ']'; fds = endp + 1) {
-		new_fd = strtol(fds, &endp, 10);
-		if (endp == fds) {
-			bail("malformed %s env var: not a number", env_var);
-		}
-		if (*endp == '\0') {
-			bail("malformed %s env var: missing ]", env_var);
-		}
-		// The list contains -1 when no fd is needed. Ignore them.
-		if (new_fd == -1) {
-			continue;
-		}
-
-		if (new_fd == LONG_MAX || new_fd < 0 || new_fd > INT_MAX) {
-			bail("malformed %s env var: fds out of range", env_var);
-		}
-
-		int recv_fd = receive_fd(sockfd);
-		if (dup3(recv_fd, new_fd, O_CLOEXEC) < 0) {
-			bail("cannot dup3 fd %d to %ld", recv_fd, new_fd);
-		}
-		if (close(recv_fd) < 0) {
-			bail("cannot close fd %d", recv_fd);
-		}
-	}
-}
-
-void receive_mountsources(int sockfd)
-{
-	receive_fd_sources(sockfd, "_LIBCONTAINER_MOUNT_FDS");
-}
-
-void send_mountsources(int sockfd, pid_t child, char *mountsources, size_t mountsources_len)
-{
-	char proc_path[PATH_MAX];
-	int host_mntns_fd;
-	int container_mntns_fd;
-	int fd;
-	int ret;
-
-	// container_linux.go shouldSendMountSources() decides if mount sources
-	// should be pre-opened (O_PATH) and passed via SCM_RIGHTS
-	if (mountsources == NULL)
-		return;
-
-	host_mntns_fd = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
-	if (host_mntns_fd == -1)
-		bail("failed to get current mount namespace");
-
-	if (snprintf(proc_path, PATH_MAX, "/proc/%d/ns/mnt", child) < 0)
-		bail("failed to get mount namespace path");
-
-	container_mntns_fd = open(proc_path, O_RDONLY | O_CLOEXEC);
-	if (container_mntns_fd == -1)
-		bail("failed to get container mount namespace");
-
-	if (setns(container_mntns_fd, CLONE_NEWNS) < 0)
-		bail("failed to setns to container mntns");
-
-	char *mountsources_end = mountsources + mountsources_len;
-	while (mountsources < mountsources_end) {
-		if (mountsources[0] == '\0') {
-			mountsources++;
-			continue;
-		}
-
-		fd = open(mountsources, O_PATH | O_CLOEXEC);
-		if (fd < 0)
-			bail("failed to open mount source %s", mountsources);
-
-		write_log(DEBUG, "~> sending fd for: %s", mountsources);
-		if (send_fd(sockfd, fd) < 0)
-			bail("failed to send fd %d via unix socket %d", fd, sockfd);
-
-		ret = close(fd);
-		if (ret != 0)
-			bail("failed to close mount source fd %d", fd);
-
-		mountsources += strlen(mountsources) + 1;
-	}
-
-	if (setns(host_mntns_fd, CLONE_NEWNS) < 0)
-		bail("failed to setns to host mntns");
-
-	ret = close(host_mntns_fd);
-	if (ret != 0)
-		bail("failed to close host mount namespace fd %d", host_mntns_fd);
-	ret = close(container_mntns_fd);
-	if (ret != 0)
-		bail("failed to close container mount namespace fd %d", container_mntns_fd);
-}
-
 void try_unshare(int flags, const char *msg)
 {
 	write_log(DEBUG, "unshare %s", msg);
@@ -661,89 +530,6 @@ void try_unshare(int flags, const char *msg)
 	bail("failed to unshare %s", msg);
 }
 
-void send_idmapsources(int sockfd, pid_t pid, char *idmap_src, int idmap_src_len)
-{
-	char proc_user_path[PATH_MAX];
-
-	/* Open the userns fd only once.
-	 * Currently we only support idmap mounts that use the same mapping than
-	 * the userns. This is validated in libcontainer/configs/validate/validator.go,
-	 * so if we reached here, we know the mapping for the idmap is the same
-	 * as the userns. This is why we just open the userns_fd once from the
-	 * PID of the child process that has the userns already applied.
-	 */
-	int ret = snprintf(proc_user_path, sizeof(proc_user_path), "/proc/%d/ns/user", pid);
-	if (ret < 0 || (size_t)ret >= sizeof(proc_user_path)) {
-		sane_kill(pid, SIGKILL);
-		bail("failed to create userns path string");
-	}
-
-	int userns_fd = open(proc_user_path, O_RDONLY | O_CLOEXEC | O_NOCTTY);
-	if (userns_fd < 0) {
-		sane_kill(pid, SIGKILL);
-		bail("failed to get user namespace fd");
-	}
-
-	char *idmap_end = idmap_src + idmap_src_len;
-	while (idmap_src < idmap_end) {
-		if (idmap_src[0] == '\0') {
-			idmap_src++;
-			continue;
-		}
-
-		int fd_tree = sys_open_tree(-EBADF, idmap_src,
-					    OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
-					    AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT);
-		if (fd_tree < 0) {
-			sane_kill(pid, SIGKILL);
-			if (errno == ENOSYS) {
-				bail("open_tree(2) failed, the kernel doesn't support ID-mapped mounts");
-			} else if (errno == EINVAL) {
-				bail("open_tree(2) failed with path: %s, the kernel doesn't support ID-mapped mounts",
-				     idmap_src);
-			} else {
-				bail("open_tree(2) failed with path: %s", idmap_src);
-			}
-		}
-
-		struct mount_attr attr = {
-			.attr_set = MOUNT_ATTR_IDMAP,
-			.userns_fd = userns_fd,
-		};
-
-		ret = sys_mount_setattr(fd_tree, "", AT_EMPTY_PATH, &attr, sizeof(attr));
-		if (ret < 0) {
-			sane_kill(pid, SIGKILL);
-			if (errno == ENOSYS)
-				bail("mount_setattr(2) failed, the kernel doesn't support ID-mapped mounts");
-			else if (errno == EINVAL)
-				bail("mount_setattr(2) failed with path: %s, maybe the filesystem doesn't support ID-mapped mounts", idmap_src);
-			else
-				bail("mount_setattr(2) failed with path: %s", idmap_src);
-		}
-
-		write_log(DEBUG, "~> sending idmap source: %s with mapping from: %s", idmap_src, proc_user_path);
-		send_fd(sockfd, fd_tree);
-
-		if (close(fd_tree) < 0) {
-			sane_kill(pid, SIGKILL);
-			bail("error closing fd_tree");
-		}
-
-		idmap_src += strlen(idmap_src) + 1;
-	}
-
-	if (close(userns_fd) < 0) {
-		sane_kill(pid, SIGKILL);
-		bail("error closing userns fd");
-	}
-}
-
-void receive_idmapsources(int sockfd)
-{
-	receive_fd_sources(sockfd, "_LIBCONTAINER_IDMAP_FDS");
-}
-
 void nsexec(void)
 {
 	int pipenum;
@@ -966,28 +752,6 @@ void nsexec(void)
 						sane_kill(stage2_pid, SIGKILL);
 						bail("failed to sync with runc: write(pid-JSON)");
 					}
-					break;
-				case SYNC_MOUNTSOURCES_PLS:
-					write_log(DEBUG, "stage-1 requested to open mount sources");
-					send_mountsources(syncfd, stage1_pid, config.mountsources,
-							  config.mountsources_len);
-
-					s = SYNC_MOUNTSOURCES_ACK;
-					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-						sane_kill(stage1_pid, SIGKILL);
-						bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)");
-					}
-					break;
-				case SYNC_MOUNT_IDMAP_PLS:
-					write_log(DEBUG, "stage-1 requested to open idmap sources");
-					send_idmapsources(syncfd, stage1_pid, config.idmapsources,
-							  config.idmapsources_len);
-					s = SYNC_MOUNT_IDMAP_ACK;
-					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-						sane_kill(stage1_pid, SIGKILL);
-						bail("failed to sync with child: write(SYNC_MOUNT_IDMAP_ACK)");
-					}
-
 					break;
 				case SYNC_CHILD_FINISH:
 					write_log(DEBUG, "stage-1 complete");
@@ -1142,38 +906,6 @@ void nsexec(void)
 			 */
 			try_unshare(config.cloneflags, "remaining namespaces");
 
-			/* Ask our parent to send the mount sources fds. */
-			if (config.mountsources) {
-				write_log(DEBUG, "request stage-0 to send mount sources");
-				s = SYNC_MOUNTSOURCES_PLS;
-				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with parent: write(SYNC_MOUNTSOURCES_PLS)");
-
-				/* Receive and install all mount sources fds. */
-				receive_mountsources(syncfd);
-
-				/* Parent finished to send the mount sources fds. */
-				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with parent: read(SYNC_MOUNTSOURCES_ACK)");
-				if (s != SYNC_MOUNTSOURCES_ACK)
-					bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s);
-			}
-
-			if (config.idmapsources) {
-				write_log(DEBUG, "request stage-0 to send idmap sources");
-				s = SYNC_MOUNT_IDMAP_PLS;
-				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with parent: write(SYNC_MOUNT_IDMAP_PLS)");
-
-				/* Receive and install all idmap fds. */
-				receive_idmapsources(syncfd);
-
-				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with parent: read(SYNC_MOUNT_IDMAP_ACK)");
-				if (s != SYNC_MOUNT_IDMAP_ACK)
-					bail("failed to sync with parent: SYNC_MOUNT_IDMAP_ACK: got %u", s);
-			}
-
 			/*
 			 * TODO: What about non-namespace clone flags that we're dropping here?
 			 *
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index f65df8197d2..e08956e11e8 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -63,20 +63,12 @@ func needsSetupDev(config *configs.Config) bool {
 // prepareRootfs sets up the devices, mount points, and filesystems for use
 // inside a new mount namespace. It doesn't set anything as ro. You must call
 // finalizeRootfs after this function to finish setting up the rootfs.
-func prepareRootfs(pipe *os.File, iConfig *initConfig, mountFds mountFds) (err error) {
+func prepareRootfs(pipe *os.File, iConfig *initConfig) (err error) {
 	config := iConfig.Config
 	if err := prepareRoot(config); err != nil {
 		return fmt.Errorf("error preparing rootfs: %w", err)
 	}
 
-	if mountFds.sourceFds != nil && len(mountFds.sourceFds) != len(config.Mounts) {
-		return fmt.Errorf("malformed mountFds slice. Expected size: %v, got: %v", len(config.Mounts), len(mountFds.sourceFds))
-	}
-
-	if mountFds.idmapFds != nil && len(mountFds.idmapFds) != len(config.Mounts) {
-		return fmt.Errorf("malformed idmapFds slice: expected size: %v, got: %v", len(config.Mounts), len(mountFds.idmapFds))
-	}
-
 	mountConfig := &mountConfig{
 		root:            config.Rootfs,
 		label:           config.MountLabel,
@@ -85,24 +77,31 @@ func prepareRootfs(pipe *os.File, iConfig *initConfig, mountFds mountFds) (err e
 		cgroupns:        config.Namespaces.Contains(configs.NEWCGROUP),
 		noMountFallback: config.NoMountFallback,
 	}
-	for i, m := range config.Mounts {
+	for _, m := range config.Mounts {
 		entry := mountEntry{Mount: m}
-		// Just before the loop we checked that if not empty, len(mountFds.sourceFds) == len(config.Mounts).
-		// Therefore, we can access mountFds.sourceFds[i] without any concerns.
-		if mountFds.sourceFds != nil && mountFds.sourceFds[i] != -1 {
-			entry.srcFD = &mountFds.sourceFds[i]
+		if m.SourceFd != nil {
+			// m.SourceFd indicates that we have an open_tree()-like mountfd
+			// which was passed to us from the parent runc process, which we
+			// have to operate on using the new mount API.
+			// TODO: Remove mountEntry and srcFD.
+			entry.srcFD = m.SourceFd
+			// As an extra precaution, mark the descriptor as O_CLOEXEC. In
+			// theory this could be raced against and leaked in other ways, but
+			// we have other protections. We ignore errors.
+			_, _ = unix.FcntlInt(uintptr(*m.SourceFd), unix.F_SETFD, unix.O_CLOEXEC)
 		}
-
-		// We validated before we can access mountFds.idmapFds[i].
-		if mountFds.idmapFds != nil && mountFds.idmapFds[i] != -1 {
+		if err := mountToRootfs(mountConfig, entry); err != nil {
+			// Clean up error formatting for \x00 prefix entries to tell the
+			// user what the original mount source was.
+			srcName := fmt.Sprintf("%q", m.Source)
 			if entry.srcFD != nil {
-				return fmt.Errorf("malformed mountFds and idmapFds slice, entry: %v has fds in both slices", i)
+				if link, err := os.Readlink(fmt.Sprintf("/proc/self/fd/%d", *entry.srcFD)); err != nil {
+					srcName = fmt.Sprintf("<unknown source path>[fd=%d]", *entry.srcFD)
+				} else {
+					srcName = fmt.Sprintf("%q[fd=%d]", link, *entry.srcFD)
+				}
 			}
-			entry.srcFD = &mountFds.idmapFds[i]
-		}
-
-		if err := mountToRootfs(mountConfig, entry); err != nil {
-			return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err)
+			return fmt.Errorf("error mounting %s to rootfs at %q: %w", srcName, m.Destination, err)
 		}
 	}
 
@@ -479,35 +478,9 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 		if err := prepareBindMount(m, rootfs); err != nil {
 			return err
 		}
-
-		if m.IsBind() && m.IsIDMapped() {
-			if m.srcFD == nil {
-				return fmt.Errorf("error creating mount %+v: idmapFD is invalid, should point to a valid fd", m)
-			}
-			if err := unix.MoveMount(*m.srcFD, "", -1, dest, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil {
-				return fmt.Errorf("error on unix.MoveMount %+v: %w", m, err)
-			}
-
-			// In nsexec.c, we did not set the propagation field of mount_attr struct.
-			// So, let's deal with these flags right now!
-			if err := utils.WithProcfd(rootfs, dest, func(dstFD string) error {
-				for _, pflag := range m.PropagationFlags {
-					// When using mount for setting propagations flags, the source, file
-					// system type and data arguments are ignored:
-					// https://man7.org/linux/man-pages/man2/mount.2.html
-					// We also ignore procfd because we want to act on dest.
-					if err := mountViaFDs("", nil, dest, dstFD, "", uintptr(pflag), ""); err != nil {
-						return err
-					}
-				}
-				return nil
-			}); err != nil {
-				return fmt.Errorf("change mount propagation through procfd: %w", err)
-			}
-		} else {
-			if err := mountPropagate(m, rootfs, mountLabel); err != nil {
-				return err
-			}
+		// open_tree()-related shenanigans are all handled in mountViaFDs.
+		if err := mountPropagate(m, rootfs, mountLabel); err != nil {
+			return err
 		}
 		// bind mount won't change mount options, we need remount to make mount options effective.
 		// first check that we have non-default options required before attempting a remount
diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go
index f3d04282362..3d97b06d1b5 100644
--- a/libcontainer/standard_init_linux.go
+++ b/libcontainer/standard_init_linux.go
@@ -25,7 +25,6 @@ type linuxStandardInit struct {
 	parentPid     int
 	fifoFd        int
 	logFd         int
-	mountFds      mountFds
 	config        *initConfig
 }
 
@@ -86,17 +85,7 @@ func (l *linuxStandardInit) Init() error {
 	// initialises the labeling system
 	selinux.GetEnabled()
 
-	// We don't need the mount nor idmap fds after prepareRootfs() nor if it fails.
-	err := prepareRootfs(l.pipe, l.config, l.mountFds)
-	for _, m := range append(l.mountFds.sourceFds, l.mountFds.idmapFds...) {
-		if m == -1 {
-			continue
-		}
-		if err := unix.Close(m); err != nil {
-			return fmt.Errorf("unable to close mountFds fds: %w", err)
-		}
-	}
-
+	err := prepareRootfs(l.pipe, l.config)
 	if err != nil {
 		return err
 	}
diff --git a/libcontainer/userns/usernsfd_linux.go b/libcontainer/userns/usernsfd_linux.go
new file mode 100644
index 00000000000..0f518c5d5fe
--- /dev/null
+++ b/libcontainer/userns/usernsfd_linux.go
@@ -0,0 +1,134 @@
+package userns
+
+import (
+	"fmt"
+	"os"
+	"sort"
+	"strings"
+	"sync"
+	"syscall"
+
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+type Mapping struct {
+	UIDMappings []configs.IDMap
+	GIDMappings []configs.IDMap
+}
+
+func (m Mapping) toSys() (uids, gids []syscall.SysProcIDMap) {
+	for _, uid := range m.UIDMappings {
+		uids = append(uids, syscall.SysProcIDMap{
+			ContainerID: uid.ContainerID,
+			HostID:      uid.HostID,
+			Size:        uid.Size,
+		})
+	}
+	for _, gid := range m.GIDMappings {
+		gids = append(gids, syscall.SysProcIDMap{
+			ContainerID: gid.ContainerID,
+			HostID:      gid.HostID,
+			Size:        gid.Size,
+		})
+	}
+	return
+}
+
+// id returns a unique identifier for this mapping, agnostic of the order of
+// the uid and gid mappings (because the order doesn't matter to the kernel).
+// The set of userns handles is indexed using this ID.
+func (m Mapping) id() string {
+	var uids, gids []string
+	for _, idmap := range m.UIDMappings {
+		uids = append(uids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size))
+	}
+	for _, idmap := range m.GIDMappings {
+		gids = append(gids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size))
+	}
+	// We don't care about the sort order -- just sort them.
+	sort.Strings(uids)
+	sort.Strings(gids)
+	return "uid=" + strings.Join(uids, ",") + ";gid=" + strings.Join(gids, ",")
+}
+
+type Handles struct {
+	m     sync.Mutex
+	procs map[string]*os.Process
+}
+
+// Release kills all of the child processes created with this Handle, releasing
+// the resources used to create existing userns handles. The kernel guarantees
+// all existing files returned from Get() will continue to work even after
+// calling Release(). The same Handles can be re-used after calling Release().
+func (hs *Handles) Release() {
+	hs.m.Lock()
+	defer hs.m.Unlock()
+
+	for _, proc := range hs.procs {
+		// We can't do anything about errors here, and in the case of runc we
+		// don't care because in the worst case we end up with zombies that
+		// pid1 can deal with.
+		_ = proc.Kill()
+		_, _ = proc.Wait()
+	}
+	hs.procs = make(map[string]*os.Process)
+}
+
+func spawnProc(req Mapping) (*os.Process, error) {
+	// We need to spawn a subprocess with the requested mappings, which is
+	// unfortunately quite expensive. The "safe" way of doing this is natively
+	// with Go (and then spawning something like "sleep infinity"), but
+	// execve() is a waste of cycles because we just need some process to have
+	// the right mapping, we don't care what it's executing. The "unsafe"
+	// option of doing a clone() behind the back of Go is probably okay in
+	// theory as long as we just do kill(getpid(), SIGSTOP). However, if we
+	// tell Go to put the new process into PTRACE_TRACEME mode, we can avoid
+	// the exec and not have to faff around with the mappings.
+	//
+	// Note that Go's stdlib does not support newuidmap, but in the case of
+	// id-mapped mounts, it seems incredibly unlikely that the user will be
+	// requesting us to do a remapping as an unprivileged user with mappings
+	// they have privileges over.
+	logrus.Debugf("spawning dummy process for id-mapping %s", req.id())
+	uidMappings, gidMappings := req.toSys()
+	return os.StartProcess("/proc/self/exe", []string{"runc", "--help"}, &os.ProcAttr{
+		Sys: &syscall.SysProcAttr{
+			Cloneflags:                 unix.CLONE_NEWUSER,
+			UidMappings:                uidMappings,
+			GidMappings:                gidMappings,
+			GidMappingsEnableSetgroups: false,
+			// Put the process into PTRACE_TRACEME mode to allow us to get the
+			// userns without having a proper execve() target.
+			Ptrace: true,
+		},
+	})
+}
+
+// Get returns a handle to a /proc/$pid/ns/user nsfs file with the requested
+// mapping. The processes spawned to produce userns nsfds are cached, so if
+// equivalent user namespace mappings are requested, the same user namespace
+// will be returned. The caller is responsible for closing the returned file
+// descriptor.
+func (hs *Handles) Get(req Mapping) (f *os.File, err error) {
+	hs.m.Lock()
+	defer hs.m.Unlock()
+
+	if hs.procs == nil {
+		hs.procs = make(map[string]*os.Process)
+	}
+
+	proc, ok := hs.procs[req.id()]
+	if !ok {
+		proc, err = spawnProc(req)
+		if err != nil {
+			return nil, fmt.Errorf("failed to spawn dummy process for map %s: %w", req.id(), err)
+		}
+		// TODO: Maybe we should just cache handles to /proc/$pid/ns/user, so we
+		//       can kill the process immediately?
+		hs.procs[req.id()] = proc
+	}
+	return os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid))
+}
diff --git a/libcontainer/userns/usernsfd_linux_test.go b/libcontainer/userns/usernsfd_linux_test.go
new file mode 100644
index 00000000000..c28f55d8eff
--- /dev/null
+++ b/libcontainer/userns/usernsfd_linux_test.go
@@ -0,0 +1,52 @@
+package userns
+
+import (
+	"os"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+func BenchmarkSpawnProc(b *testing.B) {
+	if os.Geteuid() != 0 {
+		b.Skip("spawning user namespaced processes requires root")
+	}
+
+	// We can re-use the mapping as we call spawnProc() directly.
+	mapping := Mapping{
+		UIDMappings: []configs.IDMap{
+			{ContainerID: 0, HostID: 1337, Size: 142},
+			{ContainerID: 150, HostID: 0, Size: 1},
+			{ContainerID: 442, HostID: 1111, Size: 12},
+			{ContainerID: 1000, HostID: 9999, Size: 92},
+			{ContainerID: 9999, HostID: 1000000, Size: 4},
+			// Newer kernels support more than 5 entries, but stick to 5 here.
+		},
+		GIDMappings: []configs.IDMap{
+			{ContainerID: 1, HostID: 2337, Size: 142},
+			{ContainerID: 145, HostID: 6, Size: 1},
+			{ContainerID: 200, HostID: 1000, Size: 12},
+			{ContainerID: 1000, HostID: 9888, Size: 92},
+			{ContainerID: 8999, HostID: 1000000, Size: 4},
+			// Newer kernels support more than 5 entries, but stick to 5 here.
+		},
+	}
+
+	procs := make([]*os.Process, 0, b.N)
+	b.Cleanup(func() {
+		for _, proc := range procs {
+			if proc != nil {
+				_ = proc.Kill()
+				_, _ = proc.Wait()
+			}
+		}
+	})
+
+	for i := 0; i < b.N; i++ {
+		proc, err := spawnProc(mapping)
+		if err != nil {
+			b.Error(err)
+		}
+		procs = append(procs, proc)
+	}
+}

From 18ce9de4e3e0c64d94ebd98bb5b6034876988782 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Sat, 5 Aug 2023 18:04:50 +1000
Subject: [PATCH 10/11] idmap: allow arbitrary idmap mounts regardless of
 userns configuration

With the rework of nsexec.c to handle MOUNT_ATTR_IDMAP in our Go code we
can now handle arbitrary mappings without issue, so remove the primary
artificial limit of mappings (must use the same mapping as the
container's userns) and add some tests.

We still only support idmap mounts for bind-mounts because configuring
mappings for other filesystems would require switching our entire mount
machinery to the new mount API. The current design would easily allow
for this but we would need to convert new mount options entirely to the
fsopen/fsconfig/fsmount API. This can be done in the future.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/configs/validate/validator.go    |  38 +-
 .../configs/validate/validator_test.go        |  81 ++--
 tests/integration/idmap.bats                  | 353 +++++++++++++-----
 3 files changed, 287 insertions(+), 185 deletions(-)

diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go
index 196b431dba1..d4ca4bd707e 100644
--- a/libcontainer/configs/validate/validator.go
+++ b/libcontainer/configs/validate/validator.go
@@ -259,24 +259,11 @@ func checkIDMapMounts(config *configs.Config, m *configs.Mount) error {
 	}
 
 	if !m.IsBind() {
-		return fmt.Errorf("gidMappings/uidMappings is supported only for mounts with the option 'bind'")
+		return fmt.Errorf("id-mapped mounts are only supported for bind-mounts")
 	}
 	if config.RootlessEUID {
-		return fmt.Errorf("gidMappings/uidMappings is not supported when runc is being launched with EUID != 0, needs CAP_SYS_ADMIN on the runc parent's user namespace")
+		return fmt.Errorf("id-mapped mounts are not supported for rootless containers")
 	}
-	if len(config.UIDMappings) == 0 || len(config.GIDMappings) == 0 {
-		return fmt.Errorf("not yet supported to use gidMappings/uidMappings in a mount without also using a user namespace")
-	}
-	if !sameMapping(config.UIDMappings, m.UIDMappings) {
-		return fmt.Errorf("not yet supported for the mount uidMappings to be different than user namespace uidMapping")
-	}
-	if !sameMapping(config.GIDMappings, m.GIDMappings) {
-		return fmt.Errorf("not yet supported for the mount gidMappings to be different than user namespace gidMapping")
-	}
-	if !filepath.IsAbs(m.Source) {
-		return fmt.Errorf("mount source not absolute")
-	}
-
 	return nil
 }
 
@@ -298,27 +285,6 @@ func mounts(config *configs.Config) error {
 	return nil
 }
 
-// sameMapping checks if the mappings are the same. If the mappings are the same
-// but in different order, it returns false.
-func sameMapping(a, b []configs.IDMap) bool {
-	if len(a) != len(b) {
-		return false
-	}
-
-	for i := range a {
-		if a[i].ContainerID != b[i].ContainerID {
-			return false
-		}
-		if a[i].HostID != b[i].HostID {
-			return false
-		}
-		if a[i].Size != b[i].Size {
-			return false
-		}
-	}
-	return true
-}
-
 func isHostNetNS(path string) (bool, error) {
 	const currentProcessNetns = "/proc/self/ns/net"
 
diff --git a/libcontainer/configs/validate/validator_test.go b/libcontainer/configs/validate/validator_test.go
index 58aae7a68f8..18f6814a380 100644
--- a/libcontainer/configs/validate/validator_test.go
+++ b/libcontainer/configs/validate/validator_test.go
@@ -436,22 +436,7 @@ func TestValidateIDMapMounts(t *testing.T) {
 			},
 		},
 		{
-			name:  "idmap mount without userns mappings",
-			isErr: true,
-			config: &configs.Config{
-				Mounts: []*configs.Mount{
-					{
-						Source:      "/abs/path/",
-						Destination: "/abs/path/",
-						Flags:       unix.MS_BIND,
-						UIDMappings: mapping,
-						GIDMappings: mapping,
-					},
-				},
-			},
-		},
-		{
-			name:  "idmap mounts with different userns and mount mappings",
+			name:  "idmap mounts without abs dest path",
 			isErr: true,
 			config: &configs.Config{
 				UIDMappings: mapping,
@@ -459,54 +444,40 @@ func TestValidateIDMapMounts(t *testing.T) {
 				Mounts: []*configs.Mount{
 					{
 						Source:      "/abs/path/",
-						Destination: "/abs/path/",
+						Destination: "./rel/path/",
 						Flags:       unix.MS_BIND,
-						UIDMappings: []configs.IDMap{
-							{
-								ContainerID: 10,
-								HostID:      10,
-								Size:        1,
-							},
-						},
+						UIDMappings: mapping,
 						GIDMappings: mapping,
 					},
 				},
 			},
 		},
 		{
-			name:  "idmap mounts with different userns and mount mappings",
-			isErr: true,
+			name: "simple idmap mount",
 			config: &configs.Config{
 				UIDMappings: mapping,
 				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
-						Source:      "/abs/path/",
+						Source:      "/another-abs/path/",
 						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND,
 						UIDMappings: mapping,
-						GIDMappings: []configs.IDMap{
-							{
-								ContainerID: 10,
-								HostID:      10,
-								Size:        1,
-							},
-						},
+						GIDMappings: mapping,
 					},
 				},
 			},
 		},
 		{
-			name:  "idmap mounts without abs source path",
-			isErr: true,
+			name: "idmap mount with more flags",
 			config: &configs.Config{
 				UIDMappings: mapping,
 				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
-						Source:      "./rel/path/",
+						Source:      "/another-abs/path/",
 						Destination: "/abs/path/",
-						Flags:       unix.MS_BIND,
+						Flags:       unix.MS_BIND | unix.MS_RDONLY,
 						UIDMappings: mapping,
 						GIDMappings: mapping,
 					},
@@ -514,15 +485,12 @@ func TestValidateIDMapMounts(t *testing.T) {
 			},
 		},
 		{
-			name:  "idmap mounts without abs dest path",
-			isErr: true,
+			name: "idmap mount without userns mappings",
 			config: &configs.Config{
-				UIDMappings: mapping,
-				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
 						Source:      "/abs/path/",
-						Destination: "./rel/path/",
+						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND,
 						UIDMappings: mapping,
 						GIDMappings: mapping,
@@ -530,35 +498,46 @@ func TestValidateIDMapMounts(t *testing.T) {
 				},
 			},
 		},
-
 		{
-			name: "simple idmap mount",
+			name: "idmap mounts with different userns and mount mappings",
 			config: &configs.Config{
 				UIDMappings: mapping,
 				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
-						Source:      "/another-abs/path/",
+						Source:      "/abs/path/",
 						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND,
-						UIDMappings: mapping,
+						UIDMappings: []configs.IDMap{
+							{
+								ContainerID: 10,
+								HostID:      10,
+								Size:        1,
+							},
+						},
 						GIDMappings: mapping,
 					},
 				},
 			},
 		},
 		{
-			name: "idmap mount with more flags",
+			name: "idmap mounts with different userns and mount mappings",
 			config: &configs.Config{
 				UIDMappings: mapping,
 				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
-						Source:      "/another-abs/path/",
+						Source:      "/abs/path/",
 						Destination: "/abs/path/",
-						Flags:       unix.MS_BIND | unix.MS_RDONLY,
+						Flags:       unix.MS_BIND,
 						UIDMappings: mapping,
-						GIDMappings: mapping,
+						GIDMappings: []configs.IDMap{
+							{
+								ContainerID: 10,
+								HostID:      10,
+								Size:        1,
+							},
+						},
 					},
 				},
 			},
diff --git a/tests/integration/idmap.bats b/tests/integration/idmap.bats
index 375191bf705..0dac40b21e7 100644
--- a/tests/integration/idmap.bats
+++ b/tests/integration/idmap.bats
@@ -2,6 +2,9 @@
 
 load helpers
 
+OVERFLOW_UID="$(cat /proc/sys/kernel/overflowuid)"
+OVERFLOW_GID="$(cat /proc/sys/kernel/overflowgid)"
+
 function setup() {
 	requires root
 	requires_kernel 5.12
@@ -9,152 +12,306 @@ function setup() {
 
 	setup_debian
 
-	# Prepare source folders for bind mount
-	mkdir -p source-{1,2}/
-	touch source-{1,2}/foo.txt
+	# Prepare source folders for mounts.
+	mkdir -p source-{1,2,multi{1,2,3}}/
+	touch source-{1,2,multi{1,2,3}}/foo.txt
+	touch source-multi{1,2,3}/{bar,baz}.txt
 
-	# Use other owner for source-2
+	# Change the owners for everything other than source-1.
 	chown 1:1 source-2/foo.txt
 
-	mkdir -p rootfs/{proc,sys,tmp}
-	mkdir -p rootfs/tmp/mount-{1,2}
-	mkdir -p rootfs/mnt/bind-mount-{1,2}
-
-	update_config ' .linux.namespaces += [{"type": "user"}]
-		| .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
-		| .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
-		| .process.args += ["-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"]
-		| .mounts += [
-					{
-						"source": "source-1/",
-						"destination": "/tmp/mount-1",
-						"options": ["bind"],
-						"uidMappings": [ {
-						                  "containerID": 0,
-						                  "hostID": 100000,
-						                  "size": 65536
-						                }
-						],
-						"gidMappings": [        {
-						                  "containerID": 0,
-						                  "hostID": 100000,
-						                  "size": 65536
-						                }
-						]
-					}
-				] '
+	chown 100:211 source-multi1/foo.txt
+	chown 101:222 source-multi1/bar.txt
+	chown 102:233 source-multi1/baz.txt
+
+	# Same gids as multi1, different uids.
+	chown 200:211 source-multi2/foo.txt
+	chown 201:222 source-multi2/bar.txt
+	chown 202:233 source-multi2/baz.txt
+
+	# 1000 uids, 500 gids
+	chown 5000528:6000491 source-multi3/foo.txt
+	chown 5000133:6000337 source-multi3/bar.txt
+	chown 5000999:6000444 source-multi3/baz.txt
 }
 
 function teardown() {
 	teardown_bundle
 }
 
-@test "simple idmap mount" {
+function setup_idmap_userns() {
+	update_config '.linux.namespaces += [{"type": "user"}]
+		| .linux.uidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}]
+		| .linux.gidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}]'
+}
+
+function setup_bind_mount() {
+	mountname="${1:-1}"
+	update_config '.mounts += [
+			{
+				"source": "source-'"$mountname"'/",
+				"destination": "/tmp/bind-mount-'"$mountname"'",
+				"options": ["bind"]
+			}
+		]'
+}
+
+function setup_idmap_single_mount() {
+	uidmap="$1"       # ctr:host:size
+	gidmap="${2:-$1}" # ctr:host:size
+	mountname="${3:-1}"
+	destname="${4:-$mountname}"
+
+	read -r uid_containerID uid_hostID uid_size <<<"$(tr : ' ' <<<"$uidmap")"
+	read -r gid_containerID gid_hostID gid_size <<<"$(tr : ' ' <<<"$gidmap")"
+
+	update_config '.mounts += [
+			{
+				"source": "source-'"$mountname"'/",
+				"destination": "/tmp/mount-'"$destname"'",
+				"options": ["bind"],
+				"uidMappings": [{"containerID": '"$uid_containerID"', "hostID": '"$uid_hostID"', "size": '"$uid_size"'}],
+				"gidMappings": [{"containerID": '"$gid_containerID"', "hostID": '"$gid_hostID"', "size": '"$gid_size"'}]
+			}
+		]'
+}
+
+function setup_idmap_basic_mount() {
+	mountname="${1:-1}"
+	setup_idmap_single_mount 0:100000:65536 0:100000:65536 "$mountname"
+}
+
+@test "simple idmap mount [userns]" {
+	setup_idmap_userns
+	setup_idmap_basic_mount
+
+	update_config '.process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"]'
+
 	runc run test_debian
 	[ "$status" -eq 0 ]
 	[[ "$output" == *"=0=0="* ]]
 }
 
-@test "write to an idmap mount" {
-	update_config ' .process.args = ["sh", "-c", "touch /tmp/mount-1/bar && stat -c =%u=%g= /tmp/mount-1/bar"]'
+@test "simple idmap mount [no userns]" {
+	setup_idmap_basic_mount
+
+	update_config '.process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"]'
+
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=100000=100000="* ]]
+}
+
+@test "write to an idmap mount [userns]" {
+	setup_idmap_userns
+	setup_idmap_basic_mount
+
+	update_config '.process.args = ["sh", "-c", "touch /tmp/mount-1/bar && stat -c =%u=%g= /tmp/mount-1/bar"]'
+
 	runc run test_debian
 	[ "$status" -eq 0 ]
 	[[ "$output" == *"=0=0="* ]]
 }
 
-@test "idmap mount with propagation flag" {
-	update_config ' .process.args = ["sh", "-c", "findmnt -o PROPAGATION /tmp/mount-1"]'
+@test "write to an idmap mount [no userns]" {
+	setup_idmap_basic_mount
+
+	update_config '.process.args = ["sh", "-c", "touch /tmp/mount-1/bar && stat -c =%u=%g= /tmp/mount-1/bar"]'
+
+	runc run test_debian
+	# The write must fail because the user is unmapped.
+	[ "$status" -ne 0 ]
+	[[ "$output" == *"Value too large for defined data type"* ]] # ERANGE
+}
+
+@test "idmap mount with propagation flag [userns]" {
+	setup_idmap_userns
+	setup_idmap_basic_mount
+
+	update_config '.process.args = ["sh", "-c", "findmnt -o PROPAGATION /tmp/mount-1"]'
 	# Add the shared option to the idmap mount
-	update_config ' .mounts |= map((select(.source == "source-1/") | .options += ["shared"]) // .)'
+	update_config '.mounts |= map((select(.source == "source-1/") | .options += ["shared"]) // .)'
 
 	runc run test_debian
 	[ "$status" -eq 0 ]
 	[[ "$output" == *"shared"* ]]
 }
 
-@test "idmap mount with bind mount" {
-	update_config '   .mounts += [
-					{
-						"source": "source-2/",
-						"destination": "/tmp/mount-2",
-						"options": ["bind"]
-					}
-				] '
+@test "idmap mount with bind mount [userns]" {
+	setup_idmap_userns
+	setup_idmap_basic_mount
+	setup_bind_mount
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/{,bind-}mount-1/foo.txt"]'
 
 	runc run test_debian
 	[ "$status" -eq 0 ]
-	[[ "$output" == *"=0=0="* ]]
+	[[ "$output" == *"=/tmp/mount-1/foo.txt:0=0="* ]]
+	[[ "$output" == *"=/tmp/bind-mount-1/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
 }
 
-@test "two idmap mounts with two bind mounts" {
-	update_config '   .process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt /tmp/mount-2/foo.txt"]
-			| .mounts += [
-					{
-						"source": "source-1/",
-						"destination": "/mnt/bind-mount-1",
-						"options": ["bind"]
-					},
-					{
-						"source": "source-2/",
-						"destination": "/mnt/bind-mount-2",
-						"options": ["bind"]
-					},
-					{
-						"source": "source-2/",
-						"destination": "/tmp/mount-2",
-						"options": ["bind"],
-						"uidMappings": [ {
-						                  "containerID": 0,
-						                  "hostID": 100000,
-						                  "size": 65536
-						                }
-						],
-						"gidMappings": [        {
-						                  "containerID": 0,
-						                  "hostID": 100000,
-						                  "size": 65536
-						                }
-						]
-					}
-				] '
+@test "idmap mount with bind mount [no userns]" {
+	setup_idmap_basic_mount
+	setup_bind_mount
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/{,bind-}mount-1/foo.txt"]'
 
 	runc run test_debian
 	[ "$status" -eq 0 ]
-	[[ "$output" == *"=0=0="* ]]
-	# source-2/ is owned by 1:1, so we expect this with the idmap mount too.
-	[[ "$output" == *"=1=1="* ]]
+	[[ "$output" == *"=/tmp/mount-1/foo.txt:100000=100000="* ]]
+	[[ "$output" == *"=/tmp/bind-mount-1/foo.txt:0=0="* ]]
+}
+
+@test "two idmap mounts (same mapping) with two bind mounts [userns]" {
+	setup_idmap_userns
+
+	setup_idmap_basic_mount 1
+	setup_bind_mount 1
+	setup_bind_mount 2
+	setup_idmap_basic_mount 2
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-[12]/foo.txt"]'
+
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-1/foo.txt:0=0="* ]]
+	[[ "$output" == *"=/tmp/mount-2/foo.txt:1=1="* ]]
+}
+
+@test "same idmap mount (different mappings) [userns]" {
+	setup_idmap_userns
+
+	# Mount the same directory with different mappings. Make sure we also use
+	# different mappings for uids and gids.
+	setup_idmap_single_mount 100:100000:100 200:100000:100 multi1
+	setup_idmap_single_mount 100:101000:100 200:102000:100 multi1 multi1-alt
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1{,-alt}/{foo,bar,baz}.txt"]'
+
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:0=11="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:1=22="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:2=33="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/foo.txt:1000=2011="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/bar.txt:1001=2022="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/baz.txt:1002=2033="* ]]
 }
 
-@test "idmap mount without gidMappings fails" {
-	update_config ' .mounts |= map((select(.source == "source-1/") | del(.gidMappings) ) // .)'
+@test "same idmap mount (different mappings) [no userns]" {
+	# Mount the same directory with different mappings. Make sure we also use
+	# different mappings for uids and gids.
+	setup_idmap_single_mount 100:100000:100 200:100000:100 multi1
+	setup_idmap_single_mount 100:101000:100 200:102000:100 multi1 multi1-alt
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1{,-alt}/{foo,bar,baz}.txt"]'
 
 	runc run test_debian
-	[ "$status" -eq 1 ]
-	[[ "${output}" == *"invalid mount"* ]]
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:100000=100011="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:100001=100022="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:100002=100033="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/foo.txt:101000=102011="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/bar.txt:101001=102022="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/baz.txt:101002=102033="* ]]
 }
 
-@test "idmap mount without uidMappings fails" {
-	update_config ' .mounts |= map((select(.source == "source-1/") | del(.uidMappings) ) // .)'
+@test "multiple idmap mounts (different mappings) [userns]" {
+	setup_idmap_userns
+
+	# Make sure we use different mappings for uids and gids.
+	setup_idmap_single_mount 100:101100:3 200:101900:50 multi1
+	setup_idmap_single_mount 200:102200:3 200:102900:100 multi2
+	setup_idmap_single_mount 5000000:103000:1000 6000000:103000:500 multi3
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi[123]/{foo,bar,baz}.txt"]'
 
 	runc run test_debian
-	[ "$status" -eq 1 ]
-	[[ "${output}" == *"invalid mount"* ]]
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:1100=1911="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:1101=1922="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:1102=1933="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/foo.txt:2200=2911="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/bar.txt:2201=2922="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/baz.txt:2202=2933="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/foo.txt:3528=3491="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/bar.txt:3133=3337="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/baz.txt:3999=3444="* ]]
 }
 
-@test "idmap mount without bind fails" {
-	update_config ' .mounts |= map((select(.source == "source-1/") | .options = [""]) // .)'
+@test "multiple idmap mounts (different mappings) [no userns]" {
+	# Make sure we use different mappings for uids and gids.
+	setup_idmap_single_mount 100:1100:3 200:1900:50 multi1
+	setup_idmap_single_mount 200:2200:3 200:2900:100 multi2
+	setup_idmap_single_mount 5000000:3000:1000 6000000:3000:500 multi3
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi[123]/{foo,bar,baz}.txt"]'
 
 	runc run test_debian
-	[ "$status" -eq 1 ]
-	[[ "${output}" == *"invalid mount"* ]]
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:1100=1911="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:1101=1922="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:1102=1933="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/foo.txt:2200=2911="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/bar.txt:2201=2922="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/baz.txt:2202=2933="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/foo.txt:3528=3491="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/bar.txt:3133=3337="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/baz.txt:3999=3444="* ]]
 }
 
-@test "idmap mount with different mapping than userns fails" {
-	# Let's modify the containerID to be 1, instead of 0 as it is in the
-	# userns config.
-	update_config ' .mounts |= map((select(.source == "source-1/") | .uidMappings[0]["containerID"] = 1 ) // .)'
+@test "idmap mount (complicated mapping) [userns]" {
+	setup_idmap_userns
 
+	update_config '.mounts += [
+			{
+				"source": "source-multi1/",
+				"destination": "/tmp/mount-multi1",
+				"options": ["bind"],
+				"uidMappings": [
+					{"containerID": 100, "hostID": 101000, "size": 1},
+					{"containerID": 101, "hostID": 102000, "size": 1},
+					{"containerID": 102, "hostID": 103000, "size": 1}
+				],
+				"gidMappings": [
+					{"containerID": 210, "hostID": 101100, "size": 10},
+					{"containerID": 220, "hostID": 102200, "size": 10},
+					{"containerID": 230, "hostID": 103300, "size": 10}
+				]
+			}
+		]'
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1/{foo,bar,baz}.txt"]'
 	runc run test_debian
-	[ "$status" -eq 1 ]
-	[[ "${output}" == *"invalid mount"* ]]
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:1000=1101="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:2000=2202="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:3000=3303="* ]]
+}
+
+@test "idmap mount (complicated mapping) [no userns]" {
+	update_config '.mounts += [
+			{
+				"source": "source-multi1/",
+				"destination": "/tmp/mount-multi1",
+				"options": ["bind"],
+				"uidMappings": [
+					{"containerID": 100, "hostID": 1000, "size": 1},
+					{"containerID": 101, "hostID": 2000, "size": 1},
+					{"containerID": 102, "hostID": 3000, "size": 1}
+				],
+				"gidMappings": [
+					{"containerID": 210, "hostID": 1100, "size": 10},
+					{"containerID": 220, "hostID": 2200, "size": 10},
+					{"containerID": 230, "hostID": 3300, "size": 10}
+				]
+			}
+		]'
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1/{foo,bar,baz}.txt"]'
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:1000=1101="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:2000=2202="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:3000=3303="* ]]
 }

From 7a3c81b27953d8475ca512c3e5b26dc6cafb289b Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 8 Aug 2023 15:37:29 +1000
Subject: [PATCH 11/11] squash! libcontainer: remove all mount logic from
 nsexec

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/container_linux.go | 151 ----------------------
 libcontainer/criu_linux.go      |   4 +-
 libcontainer/mount_linux.go     | 216 ++++++++++++++++++++++----------
 libcontainer/process_linux.go   | 143 ++++++++++++++++++++-
 libcontainer/rootfs_linux.go    | 114 ++++++++++-------
 libcontainer/sync.go            |   7 ++
 6 files changed, 363 insertions(+), 272 deletions(-)

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 932a415cb8a..11ea28afcad 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -15,7 +15,6 @@ import (
 	"sync"
 	"time"
 
-	"github.com/moby/sys/mountinfo"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/sirupsen/logrus"
 	"github.com/vishvananda/netlink/nl"
@@ -26,7 +25,6 @@ import (
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/intelrdt"
 	"github.com/opencontainers/runc/libcontainer/system"
-	"github.com/opencontainers/runc/libcontainer/userns"
 	"github.com/opencontainers/runc/libcontainer/utils"
 )
 
@@ -647,152 +645,6 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 	return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
 }
 
-// remapMountSources tries to remap any applicable mount sources in the
-// container configuration to use open_tree(2)-style mountfds. This allows
-// containers to have bind-mounts from source directories the container process
-// cannot resolve (#2576) as well as apply MOUNT_ATTR_IDMAP to mounts (which
-// requires privileges the container process might not have).
-//
-// The core idea is that we create a mountfd with the right configuration, and
-// then replace the source with a reference to the file descriptor. Ideally
-// this would be /proc/self/fd/<the-fd-runc-init-will-see>, but we cannot use
-// new mount API file descriptors as bind-mount sources (they run afoul of the
-// check_mnt() checks that stop us from doing bind-mounts from an fd in a
-// different namespace). So instead, we set the source to \x00<file-descriptor>
-// which is an invalid path under Linux.
-func (c *Container) remapMountSources(cmd *exec.Cmd) (Err error) {
-	nsHandles := new(userns.Handles)
-	defer nsHandles.Release()
-	for i, m := range c.config.Mounts {
-		var mountFile *os.File
-		if m.IsBind() {
-			flags := uint(unix.OPEN_TREE_CLONE | unix.O_CLOEXEC)
-			if m.Flags&unix.MS_REC == unix.MS_REC {
-				flags |= unix.AT_RECURSIVE
-			}
-			mountFd, err := unix.OpenTree(unix.AT_FDCWD, m.Source, flags)
-			if err != nil {
-				// For non-id-mapped mounts, this functionality is optional. We
-				// can just fallback to letting the rootfs code use the
-				// original source as a mountpoint.
-				if !m.IsIDMapped() {
-					logrus.Debugf("remap mount sources: skipping remap of %s due to failure: open_tree(OPEN_TREE_CLONE): %v", m.Source, err)
-					continue
-				}
-				return &os.PathError{Op: "open_tree(OPEN_TREE_CLONE)", Path: m.Source, Err: err}
-			}
-			mountFile = os.NewFile(uintptr(mountFd), m.Source+" (open_tree)")
-			// Only close the file if the remapping failed -- otherwise we keep
-			// the file open to be passed to "runc init".
-			defer func() {
-				if Err != nil {
-					_ = mountFile.Close()
-				}
-			}()
-		}
-
-		if m.IsIDMapped() {
-			if mountFile == nil {
-				return fmt.Errorf("remap mount sources: invalid mount source %s: id-mapping of non-bind-mounts is not supported", m.Source)
-			}
-			usernsFile, err := nsHandles.Get(userns.Mapping{
-				UIDMappings: m.UIDMappings,
-				GIDMappings: m.GIDMappings,
-			})
-			if err != nil {
-				return fmt.Errorf("remap mount sources: failed to create userns for %s id-mapping: %w", m.Source, err)
-			}
-			defer usernsFile.Close()
-			if err := unix.MountSetattr(int(mountFile.Fd()), "", unix.AT_EMPTY_PATH, &unix.MountAttr{
-				Attr_set:  unix.MOUNT_ATTR_IDMAP,
-				Userns_fd: uint64(usernsFile.Fd()),
-			}); err != nil {
-				return fmt.Errorf("remap mount sources: failed to set IDMAP_SOURCE_ATTR on %s: %w", m.Source, err)
-			}
-		}
-
-		if mountFile != nil {
-			// We need to emulate the propagation behaviour when bind-mounting
-			// from a root filesystem with config.RootPropagation applied. This
-			// is needed because existing runc users expect bind-mounts to act
-			// this way (the "classic" method), regardless of the mount API
-			// used to create the bind-mount.
-			//
-			// NOTE: This explicitly does not handle configurations where there
-			// is a bind-mount source of a path from inside the container
-			// rootfs. We could do this in rootfs_linux.go but this would
-			// require doing criu-style shennanigans to try to figure out how
-			// to recreate the state in /proc/self/mountinfo. For now, it's
-			// much simpler to implement this based purely on RootPropagation.
-
-			// Same logic as prepareRoot().
-			propFlags := unix.MS_SLAVE | unix.MS_REC
-			if c.config.RootPropagation != 0 {
-				propFlags = c.config.RootPropagation
-			}
-
-			// The one thing to consider is whether the RootPropagation is
-			// MS_REC and whether the mount source is the same as /. If it is
-			// MS_REC then we apply the propagation flags (nix MS_REC)
-			// recursively. Otherwise we apply the propagation flags
-			// non-recursively if m.Source is part of the / mount. We do
-			// nothing if neither is the case.
-			var setattrFlags uint
-			if propFlags&unix.MS_REC == unix.MS_REC {
-				// If the RootPropagation is recursive, then any bind-mount
-				// from the host should inherit the propagation setting of /.
-				logrus.Debugf("remap mount sources: applying recursive RootPropagation of 0x%x to %q bind-mount propagation", propFlags, m.Source)
-				setattrFlags = unix.AT_RECURSIVE
-			} else {
-				// If the RootPropagation is not recursive, only bind-mounts
-				// from paths within the / mount should inherit the propagation
-				// setting /.
-				info, err := mountinfo.GetMounts(mountinfo.ParentsFilter(m.Source))
-				if err != nil {
-					return fmt.Errorf("remap mount sources: failed to parse /proc/self/mountinfo: %w", err)
-				}
-				// If there is only a single entry with a mountpoint path of /,
-				// the source was a child of /. Otherwise, do not set the
-				// propagation setting of bind-mounts.
-				if len(info) == 1 && info[0].Mountpoint == "/" {
-					logrus.Debugf("remap mount sources: applying non-recursive RootPropagation of 0x%x to child of / %q bind-mount propagation", propFlags, m.Source)
-				} else {
-					logrus.Debugf("remap mount sources: using default mount propagation for %q bind-mount", m.Source)
-					propFlags = 0
-				}
-			}
-			if propFlags != 0 {
-				if err := unix.MountSetattr(int(mountFile.Fd()), "", unix.AT_EMPTY_PATH|setattrFlags, &unix.MountAttr{
-					Propagation: uint64(propFlags &^ unix.MS_REC),
-				}); err != nil {
-					return fmt.Errorf("remap mount sources: failed to set mount propagation of %q bind-mount to 0x%x: %w", m.Source, propFlags, err)
-				}
-			}
-
-			// We have to leak these file descriptors to "runc init", which
-			// will mean that Go will set them as ~O_CLOEXEC during ForkExec,
-			// meaning that by default these would be leaked to the container
-			// process.
-			//
-			// While this is a bit scary, we have several processes to make
-			// sure this never leaks to the actual container (the SET_DUMPABLE
-			// bit, setting everything to O_CLOEXEC at the end of init, etc) --
-			// and in addition, OPEN_TREE_CLONE file descriptors are completely
-			// safe to leak to the container because they are in an anonymous
-			// mount namespace so they cannot be used to escape the container
-			// a-la CVE-2016-9962.
-			cmd.ExtraFiles = append(cmd.ExtraFiles, mountFile)
-			fd := stdioFdCount + len(cmd.ExtraFiles) - 1
-			logrus.Debugf("remapping mount source %s to fd %d", m.Source, fd)
-			c.config.Mounts[i].SourceFd = &fd
-			// Prepend \x00 to m.Source to make sure that attempts to operate
-			// on it in rootfs_linux.go fail.
-			c.config.Mounts[i].Source = "\x00" + m.Source
-		}
-	}
-	return nil
-}
-
 func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
 	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
 	nsMaps := make(map[configs.NamespaceType]string)
@@ -801,9 +653,6 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l
 			nsMaps[ns.Type] = ns.Path
 		}
 	}
-	if err := c.remapMountSources(cmd); err != nil {
-		return nil, err
-	}
 	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
 	if err != nil {
 		return nil, err
diff --git a/libcontainer/criu_linux.go b/libcontainer/criu_linux.go
index ec6228399d8..7b209aa07af 100644
--- a/libcontainer/criu_linux.go
+++ b/libcontainer/criu_linux.go
@@ -618,8 +618,8 @@ func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
 			// because during initial container creation mounts are
 			// set up in the order they are configured.
 			if m.Device == "bind" {
-				if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFD string) error {
-					return mountViaFDs(m.Source, nil, m.Destination, dstFD, "", unix.MS_BIND|unix.MS_REC, "")
+				if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFd string) error {
+					return mountViaFds(m.Source, nil, m.Destination, dstFd, "", unix.MS_BIND|unix.MS_REC, "")
 				}); err != nil {
 					return err
 				}
diff --git a/libcontainer/mount_linux.go b/libcontainer/mount_linux.go
index 8d25f7911f8..ce71f63ad68 100644
--- a/libcontainer/mount_linux.go
+++ b/libcontainer/mount_linux.go
@@ -2,24 +2,46 @@ package libcontainer
 
 import (
 	"errors"
+	"fmt"
 	"io/fs"
 	"os"
 	"strconv"
 
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/userns"
 )
 
+// mountSourceType indicates what type of file descriptor is being returned. It
+// is used to tell rootfs_linux.go whether or not to use move_mount(2) to
+// install the mount.
+type mountSourceType string
+
+const (
+	// An open_tree(2)-style file descriptor that needs to be installed using
+	// move_mount(2) to install.
+	mountSourceOpenTree mountSourceType = "open_tree"
+	// A plain file descriptor that can be mounted through /proc/self/fd.
+	mountSourcePlain mountSourceType = "plain-open"
+)
+
+type mountSource struct {
+	Type mountSourceType `json:"source_type"`
+	file *os.File        `json:"-"`
+}
+
 // mountError holds an error from a failed mount or unmount operation.
 type mountError struct {
-	op     string
-	source string
-	srcFD  *int
-	target string
-	dstFD  string
-	flags  uintptr
-	data   string
-	err    error
+	op      string
+	source  string
+	srcFile *mountSource
+	target  string
+	dstFd   string
+	flags   uintptr
+	data    string
+	err     error
 }
 
 // Error provides a string error representation.
@@ -28,13 +50,14 @@ func (e *mountError) Error() string {
 
 	if e.source != "" {
 		out += "src=" + e.source + ", "
-		if e.srcFD != nil {
-			out += "srcFD=" + strconv.Itoa(*e.srcFD) + ", "
+		if e.srcFile != nil {
+			out += "srcType=" + string(e.srcFile.Type) + ", "
+			out += "srcFd=" + strconv.Itoa(int(e.srcFile.file.Fd())) + ", "
 		}
 	}
 	out += "dst=" + e.target
-	if e.dstFD != "" {
-		out += ", dstFD=" + e.dstFD
+	if e.dstFd != "" {
+		out += ", dstFd=" + e.dstFd
 	}
 
 	if e.flags != uintptr(0) {
@@ -57,73 +80,58 @@ func (e *mountError) Unwrap() error {
 // mount is a simple unix.Mount wrapper, returning an error with more context
 // in case it failed.
 func mount(source, target, fstype string, flags uintptr, data string) error {
-	return mountViaFDs(source, nil, target, "", fstype, flags, data)
+	return mountViaFds(source, nil, target, "", fstype, flags, data)
 }
 
-// mountViaFDs is a unix.Mount wrapper which uses srcFD instead of source,
-// and dstFD instead of target, unless those are empty.
+// mountViaFds is a unix.Mount wrapper which uses srcFile instead of source,
+// and dstFd instead of target, unless those are empty.
 //
-// If srcFD is non-nil and flags contains MS_BIND, mountViaFDs will attempt to
-// do a move_mount(2) as srcFD is assumed to be a "new mount API" file
-// descriptor that cannot always be mounted using the old mount API (such as
-// with bind-mounts).
+// If srcFile is non-nil and flags does not contain MS_REMOUNT, mountViaFds
+// will mount it according to the mountSourceType of the file descriptor.
 //
-// The dstFD argument, if non-empty, is expected to be in the form of a path to
+// The dstFd argument, if non-empty, is expected to be in the form of a path to
 // an opened file descriptor on procfs (i.e. "/proc/self/fd/NN").
 //
-// If case an FD is used instead of a source or a target path, the
-// corresponding path is only used to add context to an error in case
-// the mount operation has failed.
-func mountViaFDs(source string, srcFD *int, target, dstFD, fstype string, flags uintptr, data string) error {
+// If a file descriptor is used instead of a source or a target path, the
+// corresponding path is only used to add context to an error in case the mount
+// operation has failed.
+func mountViaFds(source string, srcFile *mountSource, target, dstFd, fstype string, flags uintptr, data string) error {
+	// MS_REMOUNT and srcFile don't make sense together.
+	if srcFile != nil && flags&unix.MS_REMOUNT != 0 {
+		logrus.Debugf("mount source passed along with MS_REMOUNT -- ignoring srcFile")
+		srcFile = nil
+	}
+
 	dst := target
-	if dstFD != "" {
-		dst = dstFD
+	if dstFd != "" {
+		dst = dstFd
 	}
 	src := source
-	if srcFD != nil {
-		src = "/proc/self/fd/" + strconv.Itoa(*srcFD)
-	}
-
-	// Do a move_mount(2) if the source is a mountfd. For MS_REMOUNT,
-	// move_mount(2) doesn't make sense (though we should never get into that
-	// case).
-	if srcFD != nil && flags&unix.MS_REMOUNT == 0 {
-		if err := unix.MoveMount(*srcFD, "", unix.AT_FDCWD, dstFD, unix.MOVE_MOUNT_F_EMPTY_PATH|unix.MOVE_MOUNT_T_SYMLINKS); err == nil {
-			// all done
-			return nil
-		} else if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.ENOSYS) {
-			// fall back to old-school mount -- even if it's almost certainly
-			// not going to work...
-			logrus.Debugf("move_mount of srcfd-based mount failed, falling back to classic mount")
-		} else {
-			// Prettify the source argument so the user knows what
-			// /proc/self/fd/... refers to here.
-			srcLink, err := os.Readlink(src)
-			if err != nil {
-				srcLink = "<unknown source path>"
-			}
-			return &mountError{
-				op:     "move_mount",
-				source: srcLink,
-				srcFD:  srcFD,
-				target: target,
-				dstFD:  dstFD,
-				flags:  flags,
-				data:   data,
-				err:    err,
-			}
-		}
+	if srcFile != nil {
+		src = "/proc/self/fd/" + strconv.Itoa(int(srcFile.file.Fd()))
+	}
+
+	var op string
+	var err error
+	if srcFile != nil && srcFile.Type == mountSourceOpenTree {
+		op = "move_mount"
+		err = unix.MoveMount(int(srcFile.file.Fd()), "",
+			unix.AT_FDCWD, dstFd,
+			unix.MOVE_MOUNT_F_EMPTY_PATH|unix.MOVE_MOUNT_T_SYMLINKS)
+	} else {
+		op = "mount"
+		err = unix.Mount(src, dst, fstype, flags, data)
 	}
-	if err := unix.Mount(src, dst, fstype, flags, data); err != nil {
+	if err != nil {
 		return &mountError{
-			op:     "mount",
-			source: source,
-			srcFD:  srcFD,
-			target: target,
-			dstFD:  dstFD,
-			flags:  flags,
-			data:   data,
-			err:    err,
+			op:      op,
+			source:  source,
+			srcFile: srcFile,
+			target:  target,
+			dstFd:   dstFd,
+			flags:   flags,
+			data:    data,
+			err:     err,
 		}
 	}
 	return nil
@@ -159,3 +167,75 @@ func syscallMode(i fs.FileMode) (o uint32) {
 	// No mapping for Go's ModeTemporary (plan9 only).
 	return
 }
+
+// mountFd creates an open_tree(2)-like mount fd from the provided
+// configuration. This function must be called from within the container's
+// mount namespace.
+func mountFd(nsHandles *userns.Handles, m *configs.Mount) (mountSource, error) {
+	if !m.IsBind() {
+		return mountSource{}, errors.New("new mount api: only bind-mounts are supported")
+	}
+	if nsHandles == nil {
+		nsHandles = new(userns.Handles)
+		defer nsHandles.Release()
+	}
+
+	var mountFile *os.File
+	var sourceType mountSourceType
+
+	if m.IsBind() {
+		// We only need to use OPEN_TREE_CLONE in the case where we need to use
+		// mount_setattr(2). We are currently in the container namespace and
+		// there is no risk of an opened directory being used to escape the
+		// container. OPEN_TREE_CLONE is more expensive than open(2) because it
+		// requires doing mounts inside a new anonymous mount namespace.
+		if m.IsIDMapped() {
+			flags := uint(unix.OPEN_TREE_CLONE | unix.O_CLOEXEC)
+			if m.Flags&unix.MS_REC == unix.MS_REC {
+				flags |= unix.AT_RECURSIVE
+			}
+			mountFd, err := unix.OpenTree(unix.AT_FDCWD, m.Source, flags)
+			if err != nil {
+				return mountSource{}, &os.PathError{Op: "open_tree(OPEN_TREE_CLONE)", Path: m.Source, Err: err}
+			}
+			mountFile = os.NewFile(uintptr(mountFd), m.Source)
+			sourceType = mountSourceOpenTree
+		} else {
+			var err error
+			mountFile, err = os.OpenFile(m.Source, unix.O_PATH|unix.O_CLOEXEC, 0)
+			if err != nil {
+				return mountSource{}, err
+			}
+			sourceType = mountSourcePlain
+		}
+	}
+
+	if m.IsIDMapped() {
+		if mountFile == nil {
+			return mountSource{}, fmt.Errorf("invalid mount source %q: id-mapping of non-bind-mounts is not supported", m.Source)
+		}
+		if sourceType != mountSourceOpenTree {
+			// should never happen
+			return mountSource{}, fmt.Errorf("invalid mount source %q: id-mapped target mistakenly opened without OPEN_TREE_CLONE", m.Source)
+		}
+
+		usernsFile, err := nsHandles.Get(userns.Mapping{
+			UIDMappings: m.UIDMappings,
+			GIDMappings: m.GIDMappings,
+		})
+		if err != nil {
+			return mountSource{}, fmt.Errorf("failed to create userns for %s id-mapping: %w", m.Source, err)
+		}
+		defer usernsFile.Close()
+		if err := unix.MountSetattr(int(mountFile.Fd()), "", unix.AT_EMPTY_PATH, &unix.MountAttr{
+			Attr_set:  unix.MOUNT_ATTR_IDMAP,
+			Userns_fd: uint64(usernsFile.Fd()),
+		}); err != nil {
+			return mountSource{}, fmt.Errorf("failed to set IDMAP_SOURCE_ATTR on %s: %w", m.Source, err)
+		}
+	}
+	return mountSource{
+		Type: sourceType,
+		file: mountFile,
+	}, nil
+}
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
index 1f5e37bd166..fb5cdc432b6 100644
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@@ -1,6 +1,7 @@
 package libcontainer
 
 import (
+	"context"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -13,16 +14,18 @@ import (
 	"strconv"
 	"time"
 
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/intelrdt"
 	"github.com/opencontainers/runc/libcontainer/logs"
 	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/opencontainers/runc/libcontainer/userns"
 	"github.com/opencontainers/runc/libcontainer/utils"
-	"github.com/opencontainers/runtime-spec/specs-go"
-	"github.com/sirupsen/logrus"
-	"golang.org/x/sys/unix"
 )
 
 type parentProcess interface {
@@ -168,8 +171,11 @@ func (p *setnsProcess) start() (retErr error) {
 			// This shouldn't happen.
 			panic("unexpected procReady in setns")
 		case procHooks:
-			// This shouldn't happen.
+			// this shouldn't happen.
 			panic("unexpected procHooks in setns")
+		case procMountPlease:
+			// this shouldn't happen.
+			panic("unexpected procMountPlease in setns")
 		case procSeccomp:
 			if p.config.Config.Seccomp.ListenerPath == "" {
 				return errors.New("seccomp listenerPath is not set")
@@ -361,6 +367,95 @@ func (p *initProcess) waitForChildExit(childPid int) error {
 	return nil
 }
 
+type mountSourceRequestFn func(*configs.Mount) (mountSource, error)
+
+// goCreateMountSources spawns a goroutine which creates open_tree(2)-style
+// mountfds based on the requested configs.Mount configuration. The returned
+// requestFn and cancelFn are used to interact with the goroutine.
+func (p *initProcess) goCreateMountSources(ctx context.Context) (mountSourceRequestFn, context.CancelFunc, error) {
+	type response struct {
+		src mountSource
+		err error
+	}
+
+	errCh := make(chan error, 1)
+	requestCh := make(chan *configs.Mount)
+	responseCh := make(chan response)
+
+	ctx, cancelFn := context.WithCancel(ctx)
+	go func() {
+		// We lock this thread because we need to setns(2) here. There is no
+		// UnlockOSThread() here, to ensure that the Go runtime will kill this
+		// thread once this goroutine returns (ensuring no other goroutines run
+		// in this context).
+		runtime.LockOSThread()
+
+		// Detach from the shared fs of the rest of the Go process.
+		if err := unix.Unshare(unix.CLONE_FS); err != nil {
+			err = os.NewSyscallError("unshare(CLONE_FS)", err)
+			errCh <- fmt.Errorf("mount source thread: %w", err)
+		}
+
+		// Attach to the container's mount namespace.
+		nsFd, err := os.Open(fmt.Sprintf("/proc/%d/ns/mnt", p.pid()))
+		if err != nil {
+			errCh <- fmt.Errorf("mount source thread: open container mntns: %w", err)
+			return
+		}
+		defer nsFd.Close()
+		if err := unix.Setns(int(nsFd.Fd()), unix.CLONE_NEWNS); err != nil {
+			err = os.NewSyscallError("setns", err)
+			errCh <- fmt.Errorf("mount source thread: join container mntns: %w", err)
+			return
+		}
+
+		// No errors during setup!
+		errCh <- nil
+		logrus.Debugf("mount source thread: successfully running in container mntns")
+
+		nsHandles := new(userns.Handles)
+		defer nsHandles.Release()
+		for {
+			select {
+			case m := <-requestCh:
+				src, err := mountFd(nsHandles, m)
+				logrus.Debugf("mount source thread: handling request for %q: %v %v", m.Source, src, err)
+				responseCh <- response{
+					src: src,
+					err: err,
+				}
+			case <-ctx.Done():
+				close(requestCh)
+				close(responseCh)
+				return
+			}
+		}
+	}()
+
+	// Check for setup errors.
+	err := <-errCh
+	close(errCh)
+	if err != nil {
+		cancelFn()
+		return nil, nil, err
+	}
+
+	requestFn := func(m *configs.Mount) (mountSource, error) {
+		select {
+		case requestCh <- m:
+			select {
+			case resp := <-responseCh:
+				return resp.src, resp.err
+			case <-ctx.Done():
+				return mountSource{}, errors.New("receive mount source failed: channel closed")
+			}
+		case <-ctx.Done():
+			return mountSource{}, errors.New("send mount source request failed: channel closed")
+		}
+	}
+	return requestFn, cancelFn, nil
+}
+
 func (p *initProcess) start() (retErr error) {
 	defer p.messageSockPair.parent.Close() //nolint: errcheck
 	err := p.cmd.Start()
@@ -451,6 +546,17 @@ func (p *initProcess) start() (retErr error) {
 		return fmt.Errorf("error waiting for our first child to exit: %w", err)
 	}
 
+	// Spin up a goroutine to handle remapping mount requests by runc init. There is no point doing this for rootless containers becuase
+	var mountRequest mountSourceRequestFn
+	if !p.container.config.RootlessEUID {
+		request, cancel, err := p.goCreateMountSources(context.Background())
+		if err != nil {
+			return fmt.Errorf("error spawning mount remapping thread: %w", err)
+		}
+		defer cancel()
+		mountRequest = request
+	}
+
 	if err := p.createNetworkInterfaces(); err != nil {
 		return fmt.Errorf("error creating network interfaces: %w", err)
 	}
@@ -467,6 +573,35 @@ func (p *initProcess) start() (retErr error) {
 
 	ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
 		switch sync.Type {
+		case procMountPlease:
+			if mountRequest == nil {
+				return fmt.Errorf("cannot fulfil mount requests as a rootless user")
+			}
+			var m *configs.Mount
+			if sync.Arg == nil {
+				return fmt.Errorf("sync %q is missing an argument", sync.Type)
+			}
+			if err := json.Unmarshal(*sync.Arg, &m); err != nil {
+				return fmt.Errorf("sync %q passed invalid mount arg: %w", sync.Type, err)
+			}
+			mnt, err := mountRequest(m)
+			if err != nil {
+				return fmt.Errorf("failed to fulfil mount request: %w", err)
+			}
+			defer mnt.file.Close()
+
+			arg, err := json.Marshal(mnt)
+			if err != nil {
+				return fmt.Errorf("sync %q failed to marshal mountSource: %w", sync.Type, err)
+			}
+			argMsg := json.RawMessage(arg)
+			if err := doWriteSync(p.messageSockPair.parent, syncT{
+				Type: procMountFd,
+				Arg:  &argMsg,
+				File: mnt.file,
+			}); err != nil {
+				return err
+			}
 		case procSeccomp:
 			if p.config.Config.Seccomp.ListenerPath == "" {
 				return errors.New("seccomp listenerPath is not set")
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index e08956e11e8..fb7e917832c 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -1,6 +1,7 @@
 package libcontainer
 
 import (
+	"encoding/json"
 	"errors"
 	"fmt"
 	"os"
@@ -13,16 +14,17 @@ import (
 	securejoin "github.com/cyphar/filepath-securejoin"
 	"github.com/moby/sys/mountinfo"
 	"github.com/mrunalp/fileutils"
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/opencontainers/selinux/go-selinux/label"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/devices"
 	"github.com/opencontainers/runc/libcontainer/userns"
 	"github.com/opencontainers/runc/libcontainer/utils"
-	"github.com/opencontainers/runtime-spec/specs-go"
-	"github.com/opencontainers/selinux/go-selinux/label"
-	"github.com/sirupsen/logrus"
-	"golang.org/x/sys/unix"
 )
 
 const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
@@ -40,12 +42,12 @@ type mountConfig struct {
 // mountEntry contains mount data specific to a mount point.
 type mountEntry struct {
 	*configs.Mount
-	srcFD *int
+	srcFile *mountSource
 }
 
 func (m *mountEntry) src() string {
-	if m.srcFD != nil {
-		return "/proc/self/fd/" + strconv.Itoa(*m.srcFD)
+	if m.srcFile != nil {
+		return "/proc/self/fd/" + strconv.Itoa(int(m.srcFile.file.Fd()))
 	}
 	return m.Source
 }
@@ -79,29 +81,48 @@ func prepareRootfs(pipe *os.File, iConfig *initConfig) (err error) {
 	}
 	for _, m := range config.Mounts {
 		entry := mountEntry{Mount: m}
-		if m.SourceFd != nil {
-			// m.SourceFd indicates that we have an open_tree()-like mountfd
-			// which was passed to us from the parent runc process, which we
-			// have to operate on using the new mount API.
-			// TODO: Remove mountEntry and srcFD.
-			entry.srcFD = m.SourceFd
-			// As an extra precaution, mark the descriptor as O_CLOEXEC. In
-			// theory this could be raced against and leaked in other ways, but
-			// we have other protections. We ignore errors.
-			_, _ = unix.FcntlInt(uintptr(*m.SourceFd), unix.F_SETFD, unix.O_CLOEXEC)
+		// Figure out whether we need to request runc to give us an
+		// open_tree(2)-style mountfd. For idmapped mounts, this is always
+		// necessary. For bind-mounts, this is only necessary if we cannot
+		// resolve the parent mount (this is only hit if you are running in a
+		// userns).
+		wantSourceFile := m.IsIDMapped()
+		if m.IsBind() {
+			if _, err := os.Stat(m.Source); err != nil {
+				wantSourceFile = true
+			}
 		}
-		if err := mountToRootfs(mountConfig, entry); err != nil {
-			// Clean up error formatting for \x00 prefix entries to tell the
-			// user what the original mount source was.
-			srcName := fmt.Sprintf("%q", m.Source)
-			if entry.srcFD != nil {
-				if link, err := os.Readlink(fmt.Sprintf("/proc/self/fd/%d", *entry.srcFD)); err != nil {
-					srcName = fmt.Sprintf("<unknown source path>[fd=%d]", *entry.srcFD)
-				} else {
-					srcName = fmt.Sprintf("%q[fd=%d]", link, *entry.srcFD)
-				}
+		if wantSourceFile {
+			// Request a source file from the host.
+			if err := writeSyncArg(pipe, procMountPlease, m); err != nil {
+				return fmt.Errorf("failed to request mountfd for %q: %w", m.Source, err)
+			}
+			sync, err := readSyncFull(pipe, procMountFd)
+			if err != nil {
+				return fmt.Errorf("mountfd request for %q failed: %w", m.Source, err)
+			}
+			if sync.File == nil {
+				return fmt.Errorf("mountfd request for %q: response missing attached fd", m.Source)
 			}
-			return fmt.Errorf("error mounting %s to rootfs at %q: %w", srcName, m.Destination, err)
+			defer sync.File.Close()
+			// Sanity-check to make sure we didn't get the wrong fd back. This
+			// should never happen.
+			if sync.File.Name() != m.Source {
+				return fmt.Errorf("returned mountfd for %q doesn't match requested mount configuration: mountfd path is %q", m.Source, sync.File.Name())
+			}
+			// Unmarshal the procMountFd argument (the file is sync.File).
+			var src mountSource
+			if sync.Arg == nil {
+				return fmt.Errorf("sync %q is missing an argument", sync.Type)
+			}
+			if err := json.Unmarshal(*sync.Arg, &src); err != nil {
+				return fmt.Errorf("invalid mount fd response argument %q: %w", string(*sync.Arg), err)
+			}
+			src.file = sync.File
+			entry.srcFile = &src
+		}
+		if err := mountToRootfs(mountConfig, entry); err != nil {
+			return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err)
 		}
 	}
 
@@ -282,7 +303,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
 			if err := os.MkdirAll(subsystemPath, 0o755); err != nil {
 				return err
 			}
-			if err := utils.WithProcfd(c.root, b.Destination, func(dstFD string) error {
+			if err := utils.WithProcfd(c.root, b.Destination, func(dstFd string) error {
 				flags := defaultMountFlags
 				if m.Flags&unix.MS_RDONLY != 0 {
 					flags = flags | unix.MS_RDONLY
@@ -295,7 +316,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
 					data = cgroups.CgroupNamePrefix + data
 					source = "systemd"
 				}
-				return mountViaFDs(source, nil, b.Destination, dstFD, "cgroup", uintptr(flags), data)
+				return mountViaFds(source, nil, b.Destination, dstFd, "cgroup", uintptr(flags), data)
 			}); err != nil {
 				return err
 			}
@@ -326,8 +347,8 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
 	if err := os.MkdirAll(dest, 0o755); err != nil {
 		return err
 	}
-	err = utils.WithProcfd(c.root, m.Destination, func(dstFD string) error {
-		return mountViaFDs(m.Source, nil, m.Destination, dstFD, "cgroup2", uintptr(m.Flags), m.Data)
+	err = utils.WithProcfd(c.root, m.Destination, func(dstFd string) error {
+		return mountViaFds(m.Source, nil, m.Destination, dstFd, "cgroup2", uintptr(m.Flags), m.Data)
 	})
 	if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) {
 		return err
@@ -348,7 +369,6 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
 		bindM.Source = c.cgroup2Path
 	}
 	// mountToRootfs() handles remounting for MS_RDONLY.
-	// No need to set mountEntry.srcFD here, because mountToRootfs() calls utils.WithProcfd() by itself in mountPropagate().
 	err = mountToRootfs(c, mountEntry{Mount: bindM})
 	if c.rootlessCgroups && errors.Is(err, unix.ENOENT) {
 		// ENOENT (for `src = c.cgroup2Path`) happens when rootless runc is being executed
@@ -393,15 +413,15 @@ func doTmpfsCopyUp(m mountEntry, rootfs, mountLabel string) (Err error) {
 		}
 	}()
 
-	return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) (Err error) {
+	return utils.WithProcfd(rootfs, m.Destination, func(dstFd string) (Err error) {
 		// Copy the container data to the host tmpdir. We append "/" to force
 		// CopyDirectory to resolve the symlink rather than trying to copy the
 		// symlink itself.
-		if err := fileutils.CopyDirectory(dstFD+"/", tmpDir); err != nil {
-			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFD, tmpDir, err)
+		if err := fileutils.CopyDirectory(dstFd+"/", tmpDir); err != nil {
+			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFd, tmpDir, err)
 		}
 		// Now move the mount into the container.
-		if err := mountViaFDs(tmpDir, nil, m.Destination, dstFD, "", unix.MS_MOVE, ""); err != nil {
+		if err := mountViaFds(tmpDir, nil, m.Destination, dstFd, "", unix.MS_MOVE, ""); err != nil {
 			return fmt.Errorf("tmpcopyup: failed to move mount: %w", err)
 		}
 		return nil
@@ -478,7 +498,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 		if err := prepareBindMount(m, rootfs); err != nil {
 			return err
 		}
-		// open_tree()-related shenanigans are all handled in mountViaFDs.
+		// open_tree()-related shenanigans are all handled in mountViaFds.
 		if err := mountPropagate(m, rootfs, mountLabel); err != nil {
 			return err
 		}
@@ -704,8 +724,8 @@ func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error {
 	if f != nil {
 		_ = f.Close()
 	}
-	return utils.WithProcfd(rootfs, dest, func(dstFD string) error {
-		return mountViaFDs(node.Path, nil, dest, dstFD, "bind", unix.MS_BIND, "")
+	return utils.WithProcfd(rootfs, dest, func(dstFd string) error {
+		return mountViaFds(node.Path, nil, dest, dstFd, "bind", unix.MS_BIND, "")
 	})
 }
 
@@ -1076,9 +1096,9 @@ func writeSystemProperty(key, value string) error {
 }
 
 func remount(m mountEntry, rootfs string, noMountFallback bool) error {
-	return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error {
+	return utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
 		flags := uintptr(m.Flags | unix.MS_REMOUNT)
-		err := mountViaFDs("", nil, m.Destination, dstFD, m.Device, flags, "")
+		err := mountViaFds("", nil, m.Destination, dstFd, m.Device, flags, "")
 		if err == nil {
 			return nil
 		}
@@ -1102,7 +1122,7 @@ func remount(m mountEntry, rootfs string, noMountFallback bool) error {
 		}
 		// ... and retry the mount with flags found above.
 		flags |= uintptr(int(s.Flags) & checkflags)
-		return mountViaFDs("", nil, m.Destination, dstFD, m.Device, flags, "")
+		return mountViaFds("", nil, m.Destination, dstFd, m.Device, flags, "")
 	})
 }
 
@@ -1125,17 +1145,17 @@ func mountPropagate(m mountEntry, rootfs string, mountLabel string) error {
 	// mutating underneath us, we verify that we are actually going to mount
 	// inside the container with WithProcfd() -- mounting through a procfd
 	// mounts on the target.
-	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error {
-		return mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, uintptr(flags), data)
+	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
+		return mountViaFds(m.Source, m.srcFile, m.Destination, dstFd, m.Device, uintptr(flags), data)
 	}); err != nil {
 		return err
 	}
 	// We have to apply mount propagation flags in a separate WithProcfd() call
 	// because the previous call invalidates the passed procfd -- the mount
 	// target needs to be re-opened.
-	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error {
+	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
 		for _, pflag := range m.PropagationFlags {
-			if err := mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(pflag), ""); err != nil {
+			if err := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(pflag), ""); err != nil {
 				return err
 			}
 		}
diff --git a/libcontainer/sync.go b/libcontainer/sync.go
index 25507e58ad9..2cb659ff94e 100644
--- a/libcontainer/sync.go
+++ b/libcontainer/sync.go
@@ -18,6 +18,11 @@ type syncType string
 //
 //	     [  child  ] <-> [   parent   ]
 //
+//	procMountPlease      --> [open_tree(2) and configure mount]
+//	  Arg: configs.Mount
+//	                     <-- procMountFd
+//	                           file: mountfd
+//
 //	procSeccomp         --> [forward fd to listenerPath]
 //	  file: seccomp fd
 //	                    --- no return synchronisation
@@ -36,6 +41,8 @@ const (
 	procRun         syncType = "procRun"
 	procHooks       syncType = "procHooks"
 	procResume      syncType = "procResume"
+	procMountPlease syncType = "procMountPlease"
+	procMountFd     syncType = "procMountFd"
 	procSeccomp     syncType = "procSeccomp"
 	procSeccompDone syncType = "procSeccompDone"
 )