diff --git a/libcontainer/apparmor/apparmor_linux.go b/libcontainer/apparmor/apparmor_linux.go index 8b1483c7de7..35a28890786 100644 --- a/libcontainer/apparmor/apparmor_linux.go +++ b/libcontainer/apparmor/apparmor_linux.go @@ -4,6 +4,7 @@ import ( "errors" "fmt" "os" + "runtime" "sync" "github.com/opencontainers/runc/libcontainer/utils" @@ -26,12 +27,19 @@ func isEnabled() bool { } func setProcAttr(attr, value string) error { - // Under AppArmor you can only change your own attr, so use /proc/self/ - // instead of /proc// like libapparmor does - attrPath := "/proc/self/attr/apparmor/" + attr + // We need to lock ourselves to the current thread, to make sure that this + // thread stays alive while we are using /proc/thread-self. Otherwise, Go + // could switch our running thread and kill the original thread underneath + // us at any point where we might yield to other goroutines. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + // Under AppArmor you can only change your own attr, so use + // /proc/thread-self/ instead of /proc// like libapparmor does. + attrPath := utils.ProcThreadSelf("attr/apparmor/" + attr) if _, err := os.Stat(attrPath); errors.Is(err, os.ErrNotExist) { // fall back to the old convention - attrPath = "/proc/self/attr/" + attr + attrPath = utils.ProcThreadSelf("attr/" + attr) } f, err := os.OpenFile(attrPath, os.O_WRONLY, 0) diff --git a/libcontainer/cgroups/cgroups_test.go b/libcontainer/cgroups/cgroups_test.go index b31412f5a04..2b825657b46 100644 --- a/libcontainer/cgroups/cgroups_test.go +++ b/libcontainer/cgroups/cgroups_test.go @@ -5,6 +5,8 @@ import ( ) func TestParseCgroups(t *testing.T) { + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. cgroups, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { t.Fatal(err) diff --git a/libcontainer/cgroups/file.go b/libcontainer/cgroups/file.go index 0cdaf747849..e71e97b01b3 100644 --- a/libcontainer/cgroups/file.go +++ b/libcontainer/cgroups/file.go @@ -10,6 +10,8 @@ import ( "strings" "sync" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -147,7 +149,7 @@ func openFile(dir, file string, flags int) (*os.File, error) { // TODO: if such usage will ever be common, amend this // to reopen cgroupFd and retry openat2. fdStr := strconv.Itoa(cgroupFd) - fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr) + fdDest, _ := os.Readlink(utils.ProcThreadSelf("fd/" + fdStr)) if fdDest != cgroupfsDir { // Wrap the error so it is clear that cgroupFd // is opened to an unexpected/wrong directory. diff --git a/libcontainer/cgroups/fs2/defaultpath.go b/libcontainer/cgroups/fs2/defaultpath.go index 9c949c91f08..7252356675d 100644 --- a/libcontainer/cgroups/fs2/defaultpath.go +++ b/libcontainer/cgroups/fs2/defaultpath.go @@ -55,6 +55,8 @@ func _defaultDirPath(root, cgPath, cgParent, cgName string) (string, error) { return filepath.Join(root, innerPath), nil } + // we don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. ownCgroup, err := parseCgroupFile("/proc/self/cgroup") if err != nil { return "", err diff --git a/libcontainer/cgroups/v1_utils.go b/libcontainer/cgroups/v1_utils.go index 8524c468434..0701b0cbf5e 100644 --- a/libcontainer/cgroups/v1_utils.go +++ b/libcontainer/cgroups/v1_utils.go @@ -5,10 +5,13 @@ import ( "fmt" "os" "path/filepath" + "runtime" "strings" "sync" "syscall" + "github.com/opencontainers/runc/libcontainer/utils" + securejoin "github.com/cyphar/filepath-securejoin" "github.com/moby/sys/mountinfo" "golang.org/x/sys/unix" @@ -99,11 +102,32 @@ func tryDefaultPath(cgroupPath, subsystem string) string { // expensive), so it is assumed that cgroup mounts are not being changed. func readCgroupMountinfo() ([]*mountinfo.Info, error) { readMountinfoOnce.Do(func() { - cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts( + // We need to lock ourselves to the current thread, to make sure that + // this thread stays alive while we are using /proc/thread-self. + // Otherwise, Go could switch our running thread and kill the original + // thread underneath us at any point where we might yield to other + // goroutines. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + // mountinfo.GetMounts uses /proc/self/mountinfo, however in runc we + // have threads that join different namespaces and thus the output of + // the thread-group's mountinfo can depend on Go scheduling. Instead, + // use thread-self as we know the current thread has the "correct" + // mount namespace (we use LockOSThread for threads which change + // namespaces). + threadSelfMountinfo, err := os.Open(utils.ProcThreadSelf("mountinfo")) + if err != nil { + readMountinfoErr = err + return + } + defer threadSelfMountinfo.Close() + + cgroupMountinfo, readMountinfoErr = mountinfo.GetMountsFromReader( + threadSelfMountinfo, mountinfo.FSTypeFilter("cgroup"), ) }) - return cgroupMountinfo, readMountinfoErr } @@ -196,6 +220,8 @@ func getCgroupMountsV1(all bool) ([]Mount, error) { return nil, err } + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. allSubsystems, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return nil, err @@ -214,6 +240,9 @@ func GetOwnCgroup(subsystem string) (string, error) { if IsCgroup2UnifiedMode() { return "", errUnified } + + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. cgroups, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return "", err diff --git a/libcontainer/configs/namespaces_linux.go b/libcontainer/configs/namespaces_linux.go index 5062432f8c3..b947eedca8b 100644 --- a/libcontainer/configs/namespaces_linux.go +++ b/libcontainer/configs/namespaces_linux.go @@ -59,6 +59,8 @@ func IsNamespaceSupported(ns NamespaceType) bool { if nsFile == "" { return false } + // We don't need to use /proc/thread-self here because the list of + // namespace types is unrelated to the thread. _, err := os.Stat("/proc/self/ns/" + nsFile) // a namespace is supported if it exists and we have permissions to read it supported = err == nil diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 75411e8dead..38ebf661692 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -463,6 +463,8 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { } func (c *Container) commandTemplate(p *Process, comm *processComm) *exec.Cmd { + // We don't need to use /proc/thread-self here because the exe mm of a + // thread-group is guaranteed to be the same for all threads by definition. cmd := exec.Command("/proc/self/exe", "init") cmd.Args[0] = os.Args[0] cmd.Stdin = p.Stdin diff --git a/libcontainer/criu_linux.go b/libcontainer/criu_linux.go index 7b209aa07af..16e3b04543c 100644 --- a/libcontainer/criu_linux.go +++ b/libcontainer/criu_linux.go @@ -549,7 +549,7 @@ func (c *Container) makeCriuRestoreMountpoints(m *configs.Mount) error { if err != nil { return err } - if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil { + if err := checkProcMount(c.config.Rootfs, dest, nil); err != nil { return err } if err := os.MkdirAll(dest, 0o755); err != nil { diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index a760ebad26d..b1ea5f95b61 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -463,6 +463,8 @@ func setupUser(config *initConfig) error { return err } + // We don't need to use /proc/thread-self here because setgroups is a + // per-userns file and thus is global to all threads in a thread-group. setgroups, err := os.ReadFile("/proc/self/setgroups") if err != nil && !os.IsNotExist(err) { return err diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 9795964caf2..8ed8adda070 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -9,6 +9,7 @@ import ( "os/exec" "path/filepath" "reflect" + "runtime" "strconv" "strings" "syscall" @@ -18,6 +19,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" @@ -1685,6 +1687,24 @@ func TestFdLeaksSystemd(t *testing.T) { testFdLeaks(t, true) } +func fdList(t *testing.T) []string { + // We need to lock ourselves to the current thread, to make sure that this + // thread stays alive while we are using /proc/thread-self. Otherwise, Go + // could switch our running thread and kill the original thread underneath + // us at any point where we might yield to other goroutines. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + pfd, err := os.Open(utils.ProcThreadSelf("fd")) + ok(t, err) + defer pfd.Close() + + fds, err := pfd.Readdirnames(-1) + ok(t, err) + + return fds +} + func testFdLeaks(t *testing.T, systemd bool) { if testing.Short() { return @@ -1699,21 +1719,12 @@ func testFdLeaks(t *testing.T, systemd bool) { // - /sys/fs/cgroup dirfd opened by prepareOpenat2 in libct/cgroups; // - dbus connection opened by getConnection in libct/cgroups/systemd. _ = runContainerOk(t, config, "true") - - pfd, err := os.Open("/proc/self/fd") - ok(t, err) - defer pfd.Close() - fds0, err := pfd.Readdirnames(0) - ok(t, err) - _, err = pfd.Seek(0, 0) - ok(t, err) + fds0 := fdList(t) _ = runContainerOk(t, config, "true") + fds1 := fdList(t) - fds1, err := pfd.Readdirnames(0) - ok(t, err) - - if len(fds1) == len(fds0) { + if reflect.DeepEqual(fds0, fds1) { return } // Show the extra opened files. @@ -1730,7 +1741,7 @@ next_fd: continue next_fd } } - dst, _ := os.Readlink("/proc/self/fd/" + fd1) + dst, _ := os.Readlink(utils.ProcThreadSelf("fd/" + fd1)) for _, ex := range excludedPaths { if ex == dst { continue next_fd diff --git a/libcontainer/mount_linux.go b/libcontainer/mount_linux.go index ce71f63ad68..2b20f7a1763 100644 --- a/libcontainer/mount_linux.go +++ b/libcontainer/mount_linux.go @@ -5,6 +5,7 @@ import ( "fmt" "io/fs" "os" + "runtime" "strconv" "github.com/sirupsen/logrus" @@ -12,6 +13,7 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/userns" + "github.com/opencontainers/runc/libcontainer/utils" ) // mountSourceType indicates what type of file descriptor is being returned. It @@ -23,7 +25,7 @@ const ( // An open_tree(2)-style file descriptor that needs to be installed using // move_mount(2) to install. mountSourceOpenTree mountSourceType = "open_tree" - // A plain file descriptor that can be mounted through /proc/self/fd. + // A plain file descriptor that can be mounted through /proc/thread-self/fd. mountSourcePlain mountSourceType = "plain-open" ) @@ -90,7 +92,7 @@ func mount(source, target, fstype string, flags uintptr, data string) error { // will mount it according to the mountSourceType of the file descriptor. // // The dstFd argument, if non-empty, is expected to be in the form of a path to -// an opened file descriptor on procfs (i.e. "/proc/self/fd/NN"). +// an opened file descriptor on procfs (i.e. "/proc/thread-self/fd/NN"). // // If a file descriptor is used instead of a source or a target path, the // corresponding path is only used to add context to an error in case the mount @@ -101,19 +103,32 @@ func mountViaFds(source string, srcFile *mountSource, target, dstFd, fstype stri logrus.Debugf("mount source passed along with MS_REMOUNT -- ignoring srcFile") srcFile = nil } - dst := target if dstFd != "" { dst = dstFd } src := source + isMoveMount := srcFile != nil && srcFile.Type == mountSourceOpenTree if srcFile != nil { - src = "/proc/self/fd/" + strconv.Itoa(int(srcFile.file.Fd())) + // If we're going to use the /proc/thread-self/... path for classic + // mount(2), we need to lock ourselves to the current thread, to make + // sure that this thread stays alive while we are using + // /proc/thread-self. Otherwise, Go could switch our running thread and + // kill the original thread underneath us at any point where we might + // yield to other goroutines. + // + // This isn't needed for move_mount(2) because in that case the thread + // is only used for error info. + if !isMoveMount { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + } + src = utils.ProcThreadSelf("fd/" + strconv.Itoa(int(srcFile.file.Fd()))) } var op string var err error - if srcFile != nil && srcFile.Type == mountSourceOpenTree { + if isMoveMount { op = "move_mount" err = unix.MoveMount(int(srcFile.file.Fd()), "", unix.AT_FDCWD, dstFd, diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 11c4ef48274..d3076868065 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -45,11 +45,25 @@ type mountEntry struct { srcFile *mountSource } -func (m *mountEntry) src() string { +func (m *mountEntry) srcStat() (os.FileInfo, error) { if m.srcFile != nil { - return "/proc/self/fd/" + strconv.Itoa(int(m.srcFile.file.Fd())) + return m.srcFile.file.Stat() } - return m.Source + return os.Stat(m.Source) +} + +func (m *mountEntry) srcStatfs() (unix.Statfs_t, error) { + var st unix.Statfs_t + if m.srcFile != nil { + if err := unix.Fstatfs(int(m.srcFile.file.Fd()), &st); err != nil { + return st, os.NewSyscallError("fstatfs", err) + } + } else { + if err := unix.Statfs(m.Source, &st); err != nil { + return st, &os.PathError{Op: "statfs", Path: m.Source, Err: err} + } + } + return st, nil } // needsSetupDev returns true if /dev needs to be set up. @@ -247,8 +261,7 @@ func cleanupTmp(tmpdir string) { } func prepareBindMount(m mountEntry, rootfs string) error { - source := m.src() - stat, err := os.Stat(source) + fi, err := m.srcStat() if err != nil { // error out if the source of a bind mount does not exist as we will be // unable to bind anything to it. @@ -262,14 +275,10 @@ func prepareBindMount(m mountEntry, rootfs string) error { if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil { return err } - if err := checkProcMount(rootfs, dest, source); err != nil { + if err := checkProcMount(rootfs, dest, &m); err != nil { return err } - if err := createIfNotExists(dest, stat.IsDir()); err != nil { - return err - } - - return nil + return createIfNotExists(dest, fi.IsDir()) } func mountCgroupV1(m *configs.Mount, c *mountConfig) error { @@ -527,7 +536,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { } return mountCgroupV1(m.Mount, c) default: - if err := checkProcMount(rootfs, dest, m.Source); err != nil { + if err := checkProcMount(rootfs, dest, &m); err != nil { return err } if err := os.MkdirAll(dest, 0o755); err != nil { @@ -543,6 +552,8 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { return nil, err } + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. cgroupPaths, err := cgroups.ParseCgroupFile("/proc/self/cgroup") if err != nil { return nil, err @@ -574,8 +585,8 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { // checkProcMount checks to ensure that the mount destination is not over the top of /proc. // dest is required to be an abs path and have any symlinks resolved before calling this function. // -// if source is nil, don't stat the filesystem. This is used for restore of a checkpoint. -func checkProcMount(rootfs, dest, source string) error { +// If m is nil, don't stat the filesystem. This is used for restore of a checkpoint. +func checkProcMount(rootfs, dest string, m *mountEntry) error { const procPath = "/proc" path, err := filepath.Rel(filepath.Join(rootfs, procPath), dest) if err != nil { @@ -587,19 +598,23 @@ func checkProcMount(rootfs, dest, source string) error { } if path == "." { // an empty source is pasted on restore - if source == "" { + if m == nil { return nil } - // only allow a mount on-top of proc if it's source is "proc" - isproc, err := isProc(source) - if err != nil { - return err - } - // pass if the mount is happening on top of /proc and the source of - // the mount is a proc filesystem - if isproc { + // only allow a mount on-top of proc if its source is a procfs mount ... + if m.Device == "proc" { return nil } + // ... or if the mount source is a bind-mount of a procfs mount. + if m.Flags&unix.MS_BIND != 0 { + st, err := m.srcStatfs() + if err != nil { + return err + } + if st.Type == unix.PROC_SUPER_MAGIC { + return nil + } + } return fmt.Errorf("%q cannot be mounted because it is not of type proc", dest) } @@ -630,15 +645,10 @@ func checkProcMount(rootfs, dest, source string) error { return fmt.Errorf("%q cannot be mounted because it is inside /proc", dest) } -func isProc(path string) (bool, error) { - var s unix.Statfs_t - if err := unix.Statfs(path, &s); err != nil { - return false, &os.PathError{Op: "statfs", Path: path, Err: err} - } - return s.Type == unix.PROC_SUPER_MAGIC, nil -} - func setupDevSymlinks(rootfs string) error { + // In theory, these should be links to /proc/thread-self, but systems + // expect these to be /proc/self and this matches how most distributions + // work. links := [][2]string{ {"/proc/self/fd", "/dev/fd"}, {"/proc/self/fd/0", "/dev/stdin"}, @@ -1103,10 +1113,9 @@ func remount(m mountEntry, rootfs string, noMountFallback bool) error { return nil } // Check if the source has flags set according to noMountFallback - src := m.src() - var s unix.Statfs_t - if err := unix.Statfs(src, &s); err != nil { - return &os.PathError{Op: "statfs", Path: src, Err: err} + s, err := m.srcStatfs() + if err != nil { + return err } var checkflags int if noMountFallback { diff --git a/libcontainer/rootfs_linux_test.go b/libcontainer/rootfs_linux_test.go index 8709a5e47f7..1727fe648ef 100644 --- a/libcontainer/rootfs_linux_test.go +++ b/libcontainer/rootfs_linux_test.go @@ -4,45 +4,55 @@ import ( "testing" "github.com/opencontainers/runc/libcontainer/configs" + + "golang.org/x/sys/unix" ) func TestCheckMountDestOnProc(t *testing.T) { dest := "/rootfs/proc/sys" - err := checkProcMount("/rootfs", dest, "") + err := checkProcMount("/rootfs", dest, nil) if err == nil { t.Fatal("destination inside proc should return an error") } } func TestCheckMountDestOnProcChroot(t *testing.T) { + var m mountEntry + m.Source = "/proc" + m.Flags = unix.MS_BIND + dest := "/rootfs/proc/" - err := checkProcMount("/rootfs", dest, "/proc") + err := checkProcMount("/rootfs", dest, &m) if err != nil { - t.Fatal("destination inside proc when using chroot should not return an error") + t.Fatalf("destination inside proc when using chroot should not return an error: %v", err) } } func TestCheckMountDestInSys(t *testing.T) { dest := "/rootfs//sys/fs/cgroup" - err := checkProcMount("/rootfs", dest, "") + err := checkProcMount("/rootfs", dest, nil) if err != nil { - t.Fatal("destination inside /sys should not return an error") + t.Fatalf("destination inside /sys should not return an error: %v", err) } } func TestCheckMountDestFalsePositive(t *testing.T) { dest := "/rootfs/sysfiles/fs/cgroup" - err := checkProcMount("/rootfs", dest, "") + err := checkProcMount("/rootfs", dest, nil) if err != nil { t.Fatal(err) } } func TestCheckMountDestNsLastPid(t *testing.T) { + var m mountEntry + m.Source = "/dev/null" + m.Flags = unix.MS_BIND + dest := "/rootfs/proc/sys/kernel/ns_last_pid" - err := checkProcMount("/rootfs", dest, "/proc") + err := checkProcMount("/rootfs", dest, &m) if err != nil { - t.Fatal("/proc/sys/kernel/ns_last_pid should not return an error") + t.Fatalf("/proc/sys/kernel/ns_last_pid should not return an error: %v", err) } } diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 64db2c71061..4907c36f0de 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "os/exec" + "runtime" "strconv" "github.com/opencontainers/runtime-spec/specs-go" @@ -17,6 +18,7 @@ import ( "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" ) type linuxStandardInit struct { @@ -224,11 +226,18 @@ func (l *linuxStandardInit) Init() error { return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err} } + // We need to lock ourselves to the current thread, to make sure that this + // thread stays alive while we are using /proc/thread-self. Otherwise, Go + // could switch our running thread and kill the original thread underneath + // us at any point where we might yield to other goroutines. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + fifoPath := utils.ProcThreadSelf("fd/" + strconv.Itoa(l.fifoFd)) + // Wait for the FIFO to be opened on the other side before exec-ing the // user process. We open it through /proc/self/fd/$fd, because the fd that // was given to us was an O_PATH fd to the fifo itself. Linux allows us to // re-open an O_PATH fd through /proc. - fifoPath := "/proc/self/fd/" + strconv.Itoa(l.fifoFd) fd, err := unix.Open(fifoPath, unix.O_WRONLY|unix.O_CLOEXEC, 0) if err != nil { return &os.PathError{Op: "open exec fifo", Path: fifoPath, Err: err} diff --git a/libcontainer/user/lookup_unix.go b/libcontainer/user/lookup_unix.go index f95c1409fce..aa22f12db83 100644 --- a/libcontainer/user/lookup_unix.go +++ b/libcontainer/user/lookup_unix.go @@ -149,9 +149,13 @@ func CurrentUserSubGIDs() ([]SubID, error) { } func CurrentProcessUIDMap() ([]IDMap, error) { + // We don't need to use /proc/thread-self here because uid_map is a + // per-userns file and thus is global to all threads in a thread-group. return ParseIDMapFile("/proc/self/uid_map") } func CurrentProcessGIDMap() ([]IDMap, error) { + // We don't need to use /proc/thread-self here because gid_map is a + // per-userns file and thus is global to all threads in a thread-group. return ParseIDMapFile("/proc/self/gid_map") } diff --git a/libcontainer/userns/usernsfd_linux.go b/libcontainer/userns/usernsfd_linux.go index 0f518c5d5fe..f3e0e79ebd5 100644 --- a/libcontainer/userns/usernsfd_linux.go +++ b/libcontainer/userns/usernsfd_linux.go @@ -94,6 +94,8 @@ func spawnProc(req Mapping) (*os.Process, error) { // they have privileges over. logrus.Debugf("spawning dummy process for id-mapping %s", req.id()) uidMappings, gidMappings := req.toSys() + // We don't need to use /proc/thread-self here because the exe mm of a + // thread-group is guaranteed to be the same for all threads by definition. return os.StartProcess("/proc/self/exe", []string{"runc", "--help"}, &os.ProcAttr{ Sys: &syscall.SysProcAttr{ Cloneflags: unix.CLONE_NEWUSER, diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go index 74d9d20c7f1..1b523d8ac54 100644 --- a/libcontainer/utils/utils.go +++ b/libcontainer/utils/utils.go @@ -3,15 +3,12 @@ package utils import ( "encoding/binary" "encoding/json" - "fmt" "io" "os" "path/filepath" - "strconv" "strings" "unsafe" - securejoin "github.com/cyphar/filepath-securejoin" "golang.org/x/sys/unix" ) @@ -102,39 +99,6 @@ func stripRoot(root, path string) string { return CleanPath("/" + path) } -// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) -// corresponding to the unsafePath resolved within the root. Before passing the -// fd, this path is verified to have been inside the root -- so operating on it -// through the passed fdpath should be safe. Do not access this path through -// the original path strings, and do not attempt to use the pathname outside of -// the passed closure (the file handle will be freed once the closure returns). -func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { - // Remove the root then forcefully resolve inside the root. - unsafePath = stripRoot(root, unsafePath) - path, err := securejoin.SecureJoin(root, unsafePath) - if err != nil { - return fmt.Errorf("resolving path inside rootfs failed: %w", err) - } - - // Open the target path. - fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) - if err != nil { - return fmt.Errorf("open o_path procfd: %w", err) - } - defer fh.Close() - - // Double-check the path is the one we expected. - procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd())) - if realpath, err := os.Readlink(procfd); err != nil { - return fmt.Errorf("procfd verification failed: %w", err) - } else if realpath != path { - return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) - } - - // Run the closure. - return fn(procfd) -} - // SearchLabels searches through a list of key=value pairs for a given key, // returning its value, and the binary flag telling whether the key exist. func SearchLabels(labels []string, key string) (string, bool) { diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index e5f11523d1d..96064dff608 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -7,9 +7,12 @@ import ( "fmt" "math" "os" + "runtime" "strconv" "sync" + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -57,7 +60,14 @@ func CloseExecFrom(minFd int) error { return os.NewSyscallError("close_range", err) } - fdDir, err := os.Open("/proc/self/fd") + // We need to lock ourselves to the current thread, to make sure that this + // thread stays alive while we are using /proc/thread-self. Otherwise, Go + // could switch our running thread and kill the original thread underneath + // us at any point where we might yield to other goroutines. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + fdDir, err := os.Open(ProcThreadSelf("fd")) if err != nil { return err } @@ -98,3 +108,102 @@ func NewSockPair(name string) (parent, child *os.File, err error) { } return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil } + +// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) +// corresponding to the unsafePath resolved within the root. Before passing the +// fd, this path is verified to have been inside the root -- so operating on it +// through the passed fdpath should be safe. Do not access this path through +// the original path strings, and do not attempt to use the pathname outside of +// the passed closure (the file handle will be freed once the closure returns). +func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { + // Remove the root then forcefully resolve inside the root. + unsafePath = stripRoot(root, unsafePath) + path, err := securejoin.SecureJoin(root, unsafePath) + if err != nil { + return fmt.Errorf("resolving path inside rootfs failed: %w", err) + } + + // We need to lock ourselves to the current thread, to make sure that this + // thread stays alive while the closure uses /proc/thread-self. Otherwise, + // Go could switch our running thread and kill the original thread + // underneath us at any point where we might yield to other goroutines. + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + // Open the target path. + fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("open o_path procfd: %w", err) + } + defer fh.Close() + + // Double-check the path is the one we expected. + procfd := ProcThreadSelf("fd/" + strconv.Itoa(int(fh.Fd()))) + if realpath, err := os.Readlink(procfd); err != nil { + return fmt.Errorf("procfd verification failed: %w", err) + } else if realpath != path { + return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) + } + + // We don't need to worry about LockOSThread-ing when using this path, + // because the usage of this path in a syscall is atomic with respect to + // this thread's lifecycle and (unlike per-tid procfs files) once opened or + // operated on it has nothing to do with the thread lifetime. + err = fn(procfd) + // Make sure the fd stays alive until after the closure returns. + runtime.KeepAlive(fh) + return err +} + +var ( + haveProcThreadSelf bool + haveProcThreadSelfOnce sync.Once +) + +// ProcThreadSelf returns a string that is equivalent to +// /proc/thread-self/, with a graceful fallback on older kernels where +// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin, +// meaning that the passed string needs to be trusted. +// +// In addition, it is almost certainly advisable to use runtime.LockOSThread() +// when dealing with this path because it references the current thread, and +// the Go runtime can change the currently executing thread (and kill the +// original thread) whenever the Go routine yields to the scheduler. This is +// particularly important if you open the referenced procfs file and plan to +// operate on it several times (not locking the OS thread could result in +// pull-your-hair-out-frustrating bugs -- I figured this out the hard way). +func ProcThreadSelf(path string) string { + haveProcThreadSelfOnce.Do(func() { + if _, err := os.Stat("/proc/thread-self/"); err == nil { + haveProcThreadSelf = true + } else { + logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/", err) + } + }) + + if haveProcThreadSelf { + return "/proc/thread-self/" + path + } + + // Pre-3.17 kernels did not have /proc/thread-self, so do it manually. + threadSelf := "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/" + if _, err := os.Stat(threadSelf); err != nil { + // Unfortunately, this code is called from rootfs_linux.go where we are + // running inside the pid namespace of the container but /proc is the + // host's procfs. Unfortunately there is no real way to get the correct + // tid to use here (the kernel age means we cannot do things like set + // up a private fsopen("proc") -- even scanning NSpid in all of the + // tasks in /proc/self/task/status requires Linux 4.1). + // + // So, we just have to assume that /proc/self is acceptable in this one + // specific case. + if os.Getpid() == 1 { + logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err) + } else { + // This should never happen, but the fallback should work in most cases... + logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err) + } + threadSelf = "/proc/self/" + } + return threadSelf + path +} diff --git a/utils_linux.go b/utils_linux.go index 0f787cb3387..7ca606dafae 100644 --- a/utils_linux.go +++ b/utils_linux.go @@ -225,7 +225,7 @@ func (r *runner) run(config *specs.Process) (int, error) { } baseFd := 3 + len(process.ExtraFiles) for i := baseFd; i < baseFd+r.preserveFDs; i++ { - _, err = os.Stat("/proc/self/fd/" + strconv.Itoa(i)) + _, err = os.Stat(utils.ProcThreadSelf("fd/" + strconv.Itoa(i))) if err != nil { return -1, fmt.Errorf("unable to stat preserved-fd %d (of %d): %w", i-baseFd, r.preserveFDs, err) }