From cc33f92529ed0e2b6fadad2fdb03080d295429c2 Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Thu, 14 Feb 2019 00:00:10 +0100 Subject: [PATCH 1/8] Revert "Merge pull request #11 from seemethere/apply_patches_1806" This reverts commit a592beb5bc4c4092b1b1bac971afed27687340c5, reversing changes made to 69663f0bd4b60df09991c08812a60108003fa340. Signed-off-by: Sebastiaan van Stijn --- libcontainer/nsenter/cloned_binary.c | 238 --------------------------- libcontainer/nsenter/nsexec.c | 11 -- 2 files changed, 249 deletions(-) delete mode 100644 libcontainer/nsenter/cloned_binary.c diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c deleted file mode 100644 index 2410eaa3a96..00000000000 --- a/libcontainer/nsenter/cloned_binary.c +++ /dev/null @@ -1,238 +0,0 @@ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -/* flags for memfd_create(2) (unsigned int), taken from linux/memfd.h */ -#define MFD_CLOEXEC 0x0001U -#define MFD_ALLOW_SEALING 0x0002U - -/* Use our own wrapper for memfd_create. */ -#if !defined(__NR_memfd_create) -# ifdef __i386__ -# define __NR_memfd_create 356 -# elif __x86_64__ -# define __NR_memfd_create 319 -# elif __powerpc64__ -# define __NR_memfd_create 360 -# elif __aarch64__ -# define __NR_memfd_create 279 -# elif __arm__ -# define __NR_memfd_create 385 -# endif -#endif /* !__NR_memfd_create */ - -#if !defined(SYS_memfd_create) && defined(__NR_memfd_create) -# define SYS_memfd_create __NR_memfd_create -#endif -#ifndef SYS_memfd_create -# error "memfd_create(2) syscall not supported by this glibc version" -#endif -int memfd_create(const char *name, unsigned int flags) -{ - return syscall(SYS_memfd_create, name, flags); -} - -/* This comes directly from . */ -#ifndef F_LINUX_SPECIFIC_BASE -# define F_LINUX_SPECIFIC_BASE 1024 -#endif -#ifndef F_ADD_SEALS -# define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) -# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) -#endif -#ifndef F_SEAL_SEAL -# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ -# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ -# define F_SEAL_GROW 0x0004 /* prevent file from growing */ -# define F_SEAL_WRITE 0x0008 /* prevent writes */ -#endif - - -#define OUR_MEMFD_COMMENT "runc_cloned:/proc/self/exe" -#define OUR_MEMFD_SEALS \ - (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) - -static void *must_realloc(void *ptr, size_t size) -{ - void *old = ptr; - do { - ptr = realloc(old, size); - } while(!ptr); - return ptr; -} - -/* - * Verify whether we are currently in a self-cloned program (namely, is - * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather - * for shmem files), and we want to be sure it's actually sealed. - */ -static int is_self_cloned(void) -{ - int fd, seals; - - fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC); - if (fd < 0) - return -ENOTRECOVERABLE; - - seals = fcntl(fd, F_GET_SEALS); - close(fd); - return seals == OUR_MEMFD_SEALS; -} - -/* - * Basic wrapper around mmap(2) that gives you the file length so you can - * safely treat it as an ordinary buffer. Only gives you read access. - */ -static char *read_file(char *path, size_t *length) -{ - int fd; - char buf[4096], *copy = NULL; - - if (!length) - return NULL; - - fd = open(path, O_RDONLY | O_CLOEXEC); - if (fd < 0) - return NULL; - - *length = 0; - for (;;) { - int n; - - n = read(fd, buf, sizeof(buf)); - if (n < 0) - goto error; - if (!n) - break; - - copy = must_realloc(copy, (*length + n) * sizeof(*copy)); - memcpy(copy + *length, buf, n); - *length += n; - } - close(fd); - return copy; - -error: - close(fd); - free(copy); - return NULL; -} - -/* - * A poor-man's version of "xargs -0". Basically parses a given block of - * NUL-delimited data, within the given length and adds a pointer to each entry - * to the array of pointers. - */ -static int parse_xargs(char *data, int data_length, char ***output) -{ - int num = 0; - char *cur = data; - - if (!data || *output != NULL) - return -1; - - while (cur < data + data_length) { - num++; - *output = must_realloc(*output, (num + 1) * sizeof(**output)); - (*output)[num - 1] = cur; - cur += strlen(cur) + 1; - } - (*output)[num] = NULL; - return num; -} - -/* - * "Parse" out argv and envp from /proc/self/cmdline and /proc/self/environ. - * This is necessary because we are running in a context where we don't have a - * main() that we can just get the arguments from. - */ -static int fetchve(char ***argv, char ***envp) -{ - char *cmdline = NULL, *environ = NULL; - size_t cmdline_size, environ_size; - - cmdline = read_file("/proc/self/cmdline", &cmdline_size); - if (!cmdline) - goto error; - environ = read_file("/proc/self/environ", &environ_size); - if (!environ) - goto error; - - if (parse_xargs(cmdline, cmdline_size, argv) <= 0) - goto error; - if (parse_xargs(environ, environ_size, envp) <= 0) - goto error; - - return 0; - -error: - free(environ); - free(cmdline); - return -EINVAL; -} - -#define SENDFILE_MAX 0x7FFFF000 /* sendfile(2) is limited to 2GB. */ -static int clone_binary(void) -{ - int binfd, memfd, err; - ssize_t sent = 0; - - memfd = memfd_create(OUR_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); - if (memfd < 0) - return -ENOTRECOVERABLE; - - binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); - if (binfd < 0) - goto error; - - sent = sendfile(memfd, binfd, NULL, SENDFILE_MAX); - close(binfd); - if (sent < 0) - goto error; - - err = fcntl(memfd, F_ADD_SEALS, OUR_MEMFD_SEALS); - if (err < 0) - goto error; - - return memfd; - -error: - close(memfd); - return -EIO; -} - -int ensure_cloned_binary(void) -{ - int execfd; - char **argv = NULL, **envp = NULL; - - /* Check that we're not self-cloned, and if we are then bail. */ - int cloned = is_self_cloned(); - if (cloned > 0 || cloned == -ENOTRECOVERABLE) - return cloned; - - if (fetchve(&argv, &envp) < 0) - return -EINVAL; - - execfd = clone_binary(); - if (execfd < 0) - return -EIO; - - fexecve(execfd, argv, envp); - return -ENOEXEC; -} diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index b6b58b817f1..2c69cee5d64 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -527,9 +527,6 @@ void join_namespaces(char *nslist) free(namespaces); } -/* Defined in cloned_binary.c. */ -int ensure_cloned_binary(void); - void nsexec(void) { int pipenum; @@ -545,14 +542,6 @@ void nsexec(void) if (pipenum == -1) return; - /* - * We need to re-exec if we are not in a cloned binary. This is necessary - * to ensure that containers won't be able to access the host binary - * through /proc/self/exe. See CVE-2019-5736. - */ - if (ensure_cloned_binary() < 0) - bail("could not ensure we are a cloned binary"); - /* Parse all of the netlink configuration. */ nl_parse(pipenum, &config); From f87d8bbb1f0f006dcbe6fd0b0938a92fd295130e Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 9 Jan 2019 13:40:01 +1100 Subject: [PATCH 2/8] nsenter: clone /proc/self/exe to avoid exposing host binary to container There are quite a few circumstances where /proc/self/exe pointing to a pretty important container binary is a _bad_ thing, so to avoid this we have to make a copy (preferably doing self-clean-up and not being writeable). We require memfd_create(2) -- though there is an O_TMPFILE fallback -- but we can always extend this to use a scratch MNT_DETACH overlayfs or tmpfs. The main downside to this approach is no page-cache sharing for the runc binary (which overlayfs would give us) but this is far less complicated. This is only done during nsenter so that it happens transparently to the Go code, and any libcontainer users benefit from it. This also makes ExtraFiles and --preserve-fds handling trivial (because we don't need to worry about it). Fixes: CVE-2019-5736 Co-developed-by: Christian Brauner Signed-off-by: Aleksa Sarai (cherry picked from commit 0a8e4117e7f715d5fbeef398405813ce8e88558b) Signed-off-by: Sebastiaan van Stijn --- libcontainer/nsenter/cloned_binary.c | 268 +++++++++++++++++++++++++++ libcontainer/nsenter/nsexec.c | 11 ++ 2 files changed, 279 insertions(+) create mode 100644 libcontainer/nsenter/cloned_binary.c diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c new file mode 100644 index 00000000000..c8a42c23f73 --- /dev/null +++ b/libcontainer/nsenter/cloned_binary.c @@ -0,0 +1,268 @@ +/* + * Copyright (C) 2019 Aleksa Sarai + * Copyright (C) 2019 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* Use our own wrapper for memfd_create. */ +#if !defined(SYS_memfd_create) && defined(__NR_memfd_create) +# define SYS_memfd_create __NR_memfd_create +#endif +#ifdef SYS_memfd_create +# define HAVE_MEMFD_CREATE +/* memfd_create(2) flags -- copied from . */ +# ifndef MFD_CLOEXEC +# define MFD_CLOEXEC 0x0001U +# define MFD_ALLOW_SEALING 0x0002U +# endif +int memfd_create(const char *name, unsigned int flags) +{ + return syscall(SYS_memfd_create, name, flags); +} +#endif + +/* This comes directly from . */ +#ifndef F_LINUX_SPECIFIC_BASE +# define F_LINUX_SPECIFIC_BASE 1024 +#endif +#ifndef F_ADD_SEALS +# define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) +#endif +#ifndef F_SEAL_SEAL +# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +# define F_SEAL_GROW 0x0004 /* prevent file from growing */ +# define F_SEAL_WRITE 0x0008 /* prevent writes */ +#endif + +#define RUNC_SENDFILE_MAX 0x7FFFF000 /* sendfile(2) is limited to 2GB. */ +#ifdef HAVE_MEMFD_CREATE +# define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" +# define RUNC_MEMFD_SEALS \ + (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) +#endif + +static void *must_realloc(void *ptr, size_t size) +{ + void *old = ptr; + do { + ptr = realloc(old, size); + } while(!ptr); + return ptr; +} + +/* + * Verify whether we are currently in a self-cloned program (namely, is + * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather + * for shmem files), and we want to be sure it's actually sealed. + */ +static int is_self_cloned(void) +{ + int fd, ret, is_cloned = 0; + + fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC); + if (fd < 0) + return -ENOTRECOVERABLE; + +#ifdef HAVE_MEMFD_CREATE + ret = fcntl(fd, F_GET_SEALS); + is_cloned = (ret == RUNC_MEMFD_SEALS); +#else + struct stat statbuf = {0}; + ret = fstat(fd, &statbuf); + if (ret >= 0) + is_cloned = (statbuf.st_nlink == 0); +#endif + close(fd); + return is_cloned; +} + +/* + * Basic wrapper around mmap(2) that gives you the file length so you can + * safely treat it as an ordinary buffer. Only gives you read access. + */ +static char *read_file(char *path, size_t *length) +{ + int fd; + char buf[4096], *copy = NULL; + + if (!length) + return NULL; + + fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return NULL; + + *length = 0; + for (;;) { + int n; + + n = read(fd, buf, sizeof(buf)); + if (n < 0) + goto error; + if (!n) + break; + + copy = must_realloc(copy, (*length + n) * sizeof(*copy)); + memcpy(copy + *length, buf, n); + *length += n; + } + close(fd); + return copy; + +error: + close(fd); + free(copy); + return NULL; +} + +/* + * A poor-man's version of "xargs -0". Basically parses a given block of + * NUL-delimited data, within the given length and adds a pointer to each entry + * to the array of pointers. + */ +static int parse_xargs(char *data, int data_length, char ***output) +{ + int num = 0; + char *cur = data; + + if (!data || *output != NULL) + return -1; + + while (cur < data + data_length) { + num++; + *output = must_realloc(*output, (num + 1) * sizeof(**output)); + (*output)[num - 1] = cur; + cur += strlen(cur) + 1; + } + (*output)[num] = NULL; + return num; +} + +/* + * "Parse" out argv and envp from /proc/self/cmdline and /proc/self/environ. + * This is necessary because we are running in a context where we don't have a + * main() that we can just get the arguments from. + */ +static int fetchve(char ***argv, char ***envp) +{ + char *cmdline = NULL, *environ = NULL; + size_t cmdline_size, environ_size; + + cmdline = read_file("/proc/self/cmdline", &cmdline_size); + if (!cmdline) + goto error; + environ = read_file("/proc/self/environ", &environ_size); + if (!environ) + goto error; + + if (parse_xargs(cmdline, cmdline_size, argv) <= 0) + goto error; + if (parse_xargs(environ, environ_size, envp) <= 0) + goto error; + + return 0; + +error: + free(environ); + free(cmdline); + return -EINVAL; +} + +static int clone_binary(void) +{ + int binfd, memfd; + ssize_t sent = 0; + +#ifdef HAVE_MEMFD_CREATE + memfd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); +#else + memfd = open("/tmp", O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0711); +#endif + if (memfd < 0) + return -ENOTRECOVERABLE; + + binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); + if (binfd < 0) + goto error; + + sent = sendfile(memfd, binfd, NULL, RUNC_SENDFILE_MAX); + close(binfd); + if (sent < 0) + goto error; + +#ifdef HAVE_MEMFD_CREATE + int err = fcntl(memfd, F_ADD_SEALS, RUNC_MEMFD_SEALS); + if (err < 0) + goto error; +#else + /* Need to re-open "memfd" as read-only to avoid execve(2) giving -EXTBUSY. */ + int newfd; + char *fdpath = NULL; + + if (asprintf(&fdpath, "/proc/self/fd/%d", memfd) < 0) + goto error; + newfd = open(fdpath, O_RDONLY | O_CLOEXEC); + free(fdpath); + if (newfd < 0) + goto error; + + close(memfd); + memfd = newfd; +#endif + return memfd; + +error: + close(memfd); + return -EIO; +} + +int ensure_cloned_binary(void) +{ + int execfd; + char **argv = NULL, **envp = NULL; + + /* Check that we're not self-cloned, and if we are then bail. */ + int cloned = is_self_cloned(); + if (cloned > 0 || cloned == -ENOTRECOVERABLE) + return cloned; + + if (fetchve(&argv, &envp) < 0) + return -EINVAL; + + execfd = clone_binary(); + if (execfd < 0) + return -EIO; + + fexecve(execfd, argv, envp); + return -ENOEXEC; +} diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 2c69cee5d64..1c4c792fe64 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -527,6 +527,9 @@ void join_namespaces(char *nslist) free(namespaces); } +/* Defined in cloned_binary.c. */ +extern int ensure_cloned_binary(void); + void nsexec(void) { int pipenum; @@ -542,6 +545,14 @@ void nsexec(void) if (pipenum == -1) return; + /* + * We need to re-exec if we are not in a cloned binary. This is necessary + * to ensure that containers won't be able to access the host binary + * through /proc/self/exe. See CVE-2019-5736. + */ + if (ensure_cloned_binary() < 0) + bail("could not ensure we are a cloned binary"); + /* Parse all of the netlink configuration. */ nl_parse(pipenum, &config); From 497c526127d2aa82e2c9d051a28a253900fed025 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 14 Feb 2019 15:56:26 +0100 Subject: [PATCH 3/8] nsexec (CVE-2019-5736): avoid parsing environ My first attempt to simplify this and make it less costly focussed on the way constructors are called. I was under the impression that the ELF specification mandated that arg, argv, and actually even envp need to be passed to functions located in the .init_arry section (aka "constructors"). Actually, the specifications is (cf. [2]): SHT_INIT_ARRAY This section contains an array of pointers to initialization functions, as described in ``Initialization and Termination Functions'' in Chapter 5. Each pointer in the array is taken as a parameterless procedure with a void return. which means that this becomes a libc specific decision. Glibc passes down those args, musl doesn't. So this approach can't work. However, we can at least remove the environment parsing part based on POSIX since [1] mandates that there should be an environ variable defined in unistd.h which provides access to the environment. See also the relevant Open Group specification [1]. [1]: http://pubs.opengroup.org/onlinepubs/9699919799/ [2]: http://www.sco.com/developers/gabi/latest/ch4.sheader.html#init_array Fixes: CVE-2019-5736 Signed-off-by: Christian Brauner (cherry picked from commit bb7d8b1f41f7bf0399204d54009d6da57c3cc775) Signed-off-by: Sebastiaan van Stijn --- libcontainer/nsenter/cloned_binary.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c index c8a42c23f73..c97dfcb70d3 100644 --- a/libcontainer/nsenter/cloned_binary.c +++ b/libcontainer/nsenter/cloned_binary.c @@ -169,31 +169,25 @@ static int parse_xargs(char *data, int data_length, char ***output) } /* - * "Parse" out argv and envp from /proc/self/cmdline and /proc/self/environ. + * "Parse" out argv from /proc/self/cmdline. * This is necessary because we are running in a context where we don't have a * main() that we can just get the arguments from. */ -static int fetchve(char ***argv, char ***envp) +static int fetchve(char ***argv) { - char *cmdline = NULL, *environ = NULL; - size_t cmdline_size, environ_size; + char *cmdline = NULL; + size_t cmdline_size; cmdline = read_file("/proc/self/cmdline", &cmdline_size); if (!cmdline) goto error; - environ = read_file("/proc/self/environ", &environ_size); - if (!environ) - goto error; if (parse_xargs(cmdline, cmdline_size, argv) <= 0) goto error; - if (parse_xargs(environ, environ_size, envp) <= 0) - goto error; return 0; error: - free(environ); free(cmdline); return -EINVAL; } @@ -246,23 +240,26 @@ static int clone_binary(void) return -EIO; } +/* Get cheap access to the environment. */ +extern char **environ; + int ensure_cloned_binary(void) { int execfd; - char **argv = NULL, **envp = NULL; + char **argv = NULL; /* Check that we're not self-cloned, and if we are then bail. */ int cloned = is_self_cloned(); if (cloned > 0 || cloned == -ENOTRECOVERABLE) return cloned; - if (fetchve(&argv, &envp) < 0) + if (fetchve(&argv) < 0) return -EINVAL; execfd = clone_binary(); if (execfd < 0) return -EIO; - fexecve(execfd, argv, envp); + fexecve(execfd, argv, environ); return -ENOEXEC; } From 7b7e23d61f75e605deceb01b56de65e013ee7328 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 16 Feb 2019 01:18:14 +1100 Subject: [PATCH 4/8] nsenter: cloned_binary: detect and handle short copies For a variety of reasons, sendfile(2) can end up doing a short-copy so we need to just loop until we hit the binary size. Since /proc/self/exe is tautologically our own binary, there's no chance someone is going to modify it underneath us (or changing the size). Signed-off-by: Aleksa Sarai (cherry picked from commit 5b775bf297c47a6bc50e36da89d1ec74a6fa01dc) Signed-off-by: Sebastiaan van Stijn --- libcontainer/nsenter/cloned_binary.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c index c97dfcb70d3..104bab4fb4d 100644 --- a/libcontainer/nsenter/cloned_binary.c +++ b/libcontainer/nsenter/cloned_binary.c @@ -64,7 +64,6 @@ int memfd_create(const char *name, unsigned int flags) # define F_SEAL_WRITE 0x0008 /* prevent writes */ #endif -#define RUNC_SENDFILE_MAX 0x7FFFF000 /* sendfile(2) is limited to 2GB. */ #ifdef HAVE_MEMFD_CREATE # define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" # define RUNC_MEMFD_SEALS \ @@ -195,7 +194,8 @@ static int fetchve(char ***argv) static int clone_binary(void) { int binfd, memfd; - ssize_t sent = 0; + struct stat statbuf = {}; + size_t sent = 0; #ifdef HAVE_MEMFD_CREATE memfd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); @@ -209,9 +209,17 @@ static int clone_binary(void) if (binfd < 0) goto error; - sent = sendfile(memfd, binfd, NULL, RUNC_SENDFILE_MAX); + if (fstat(binfd, &statbuf) < 0) + goto error_binfd; + + while (sent < statbuf.st_size) { + int n = sendfile(memfd, binfd, NULL, statbuf.st_size - sent); + if (n < 0) + goto error_binfd; + sent += n; + } close(binfd); - if (sent < 0) + if (sent != statbuf.st_size) goto error; #ifdef HAVE_MEMFD_CREATE @@ -235,6 +243,8 @@ static int clone_binary(void) #endif return memfd; +error_binfd: + close(binfd); error: close(memfd); return -EIO; From 22abfb24956425f6c5b1d42f4c217daed7702193 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 16 Feb 2019 04:34:27 +1100 Subject: [PATCH 5/8] nsenter: cloned_binary: expand and add pre-3.11 fallbacks In order to get around the memfd_create(2) requirement, 0a8e4117e7f7 ("nsenter: clone /proc/self/exe to avoid exposing host binary to container") added an O_TMPFILE fallback. However, this fallback was flawed in two ways: * It required O_TMPFILE which is relatively new (having been added to Linux 3.11). * The fallback choice was made at compile-time, not runtime. This results in several complications when it comes to running binaries on different machines to the ones they were built on. The easiest way to resolve these things is to have fallbacks work in a more procedural way (though it does make the code unfortunately more complicated) and to add a new fallback that uses mkotemp(3). Signed-off-by: Aleksa Sarai (cherry picked from commit 2429d59352b81f6b9cc79b5ed26780c5fe6ba4ec) Signed-off-by: Sebastiaan van Stijn --- libcontainer/nsenter/cloned_binary.c | 193 ++++++++++++++++++++------- 1 file changed, 146 insertions(+), 47 deletions(-) diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c index 104bab4fb4d..24b79895147 100644 --- a/libcontainer/nsenter/cloned_binary.c +++ b/libcontainer/nsenter/cloned_binary.c @@ -36,18 +36,21 @@ #if !defined(SYS_memfd_create) && defined(__NR_memfd_create) # define SYS_memfd_create __NR_memfd_create #endif -#ifdef SYS_memfd_create -# define HAVE_MEMFD_CREATE /* memfd_create(2) flags -- copied from . */ -# ifndef MFD_CLOEXEC -# define MFD_CLOEXEC 0x0001U -# define MFD_ALLOW_SEALING 0x0002U -# endif +#ifndef MFD_CLOEXEC +# define MFD_CLOEXEC 0x0001U +# define MFD_ALLOW_SEALING 0x0002U +#endif int memfd_create(const char *name, unsigned int flags) { +#ifdef SYS_memfd_create return syscall(SYS_memfd_create, name, flags); -} +#else + errno = ENOSYS; + return -1; #endif +} + /* This comes directly from . */ #ifndef F_LINUX_SPECIFIC_BASE @@ -64,11 +67,9 @@ int memfd_create(const char *name, unsigned int flags) # define F_SEAL_WRITE 0x0008 /* prevent writes */ #endif -#ifdef HAVE_MEMFD_CREATE -# define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" -# define RUNC_MEMFD_SEALS \ +#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" +#define RUNC_MEMFD_SEALS \ (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) -#endif static void *must_realloc(void *ptr, size_t size) { @@ -92,23 +93,29 @@ static int is_self_cloned(void) if (fd < 0) return -ENOTRECOVERABLE; -#ifdef HAVE_MEMFD_CREATE + /* First check memfd. */ ret = fcntl(fd, F_GET_SEALS); - is_cloned = (ret == RUNC_MEMFD_SEALS); -#else - struct stat statbuf = {0}; - ret = fstat(fd, &statbuf); - if (ret >= 0) - is_cloned = (statbuf.st_nlink == 0); -#endif + if (ret >= 0) { + is_cloned = (ret == RUNC_MEMFD_SEALS); + } else { + /* + * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6 + * which appears to have a borked backport of F_GET_SEALS. Either way, + * having a file which has no hardlinks indicates that we aren't using + * a host-side "runc" binary and this is something that a container + * cannot fake (because unlinking requires being able to resolve the + * path that you want to unlink). + */ + struct stat statbuf = {}; + if (fstat(fd, &statbuf) >= 0) + is_cloned = (statbuf.st_nlink == 0); + } + close(fd); return is_cloned; } -/* - * Basic wrapper around mmap(2) that gives you the file length so you can - * safely treat it as an ordinary buffer. Only gives you read access. - */ +/* Read a given file into a new buffer, and providing the length. */ static char *read_file(char *path, size_t *length) { int fd; @@ -191,18 +198,127 @@ static int fetchve(char ***argv) return -EINVAL; } +enum { + EFD_NONE = 0, + EFD_MEMFD, + EFD_FILE, +}; + +/* + * This comes from . We can't hard-code __O_TMPFILE because it + * changes depending on the architecture. If we don't have O_TMPFILE we always + * have the mkostemp(3) fallback. + */ +#ifndef O_TMPFILE +# if defined(__O_TMPFILE) && defined(O_DIRECTORY) +# define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) +# endif +#endif + +static int make_execfd(int *fdtype) +{ + int fd; + char template[] = "/tmp/runc-cloned-binary.XXXXXX"; + + /* + * Try memfd first, it's much nicer since it's easily detected thanks to + * sealing and also doesn't require assumptions like /tmp. + */ + *fdtype = EFD_MEMFD; + fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); + if (fd >= 0) + return fd; + if (errno != ENOSYS) + goto err; + +#ifdef O_TMPFILE + /* + * Try O_TMPFILE to avoid races where someone might snatch our file. Note + * that O_EXCL isn't actually a security measure here (since you can just + * fd re-open it and clear O_EXCL). + */ + *fdtype = EFD_FILE; + fd = open("/tmp", O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); + if (fd >= 0) { + struct stat statbuf = {}; + bool working_otmpfile = false; + + /* + * open(2) ignores unknown O_* flags -- yeah, I was surprised when I + * found this out too. As a result we can't check for EINVAL. However, + * if we get nlink != 0 (or EISDIR) then we know that this kernel + * doesn't support O_TMPFILE. + */ + if (fstat(fd, &statbuf) >= 0) + working_otmpfile = (statbuf.st_nlink == 0); + + if (working_otmpfile) + return fd; + + /* Pretend that we got EISDIR since O_TMPFILE failed. */ + close(fd); + errno = EISDIR; + } + if (errno != EISDIR) + goto err; +#endif /* defined(O_TMPFILE) */ + + /* + * Our final option is to create a temporary file the old-school way, and + * then unlink it so that nothing else sees it by accident. + */ + *fdtype = EFD_FILE; + fd = mkostemp(template, O_CLOEXEC); + if (fd >= 0) { + if (unlink(template) >= 0) + return fd; + close(fd); + } + +err: + *fdtype = EFD_NONE; + return -1; +} + +static int seal_execfd(int *fd, int fdtype) +{ + switch (fdtype) { + case EFD_MEMFD: + return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS); + case EFD_FILE: { + /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */ + int newfd; + char fdpath[PATH_MAX] = {0}; + + if (fchmod(*fd, 0100) < 0) + return -1; + + if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0) + return -1; + + newfd = open(fdpath, O_PATH | O_CLOEXEC); + if (newfd < 0) + return -1; + + close(*fd); + *fd = newfd; + return 0; + } + default: + break; + } + return -1; +} + static int clone_binary(void) { int binfd, memfd; struct stat statbuf = {}; size_t sent = 0; + int fdtype = EFD_NONE; -#ifdef HAVE_MEMFD_CREATE - memfd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); -#else - memfd = open("/tmp", O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0711); -#endif - if (memfd < 0) + memfd = make_execfd(&fdtype); + if (memfd < 0 || fdtype == EFD_NONE) return -ENOTRECOVERABLE; binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); @@ -222,25 +338,8 @@ static int clone_binary(void) if (sent != statbuf.st_size) goto error; -#ifdef HAVE_MEMFD_CREATE - int err = fcntl(memfd, F_ADD_SEALS, RUNC_MEMFD_SEALS); - if (err < 0) - goto error; -#else - /* Need to re-open "memfd" as read-only to avoid execve(2) giving -EXTBUSY. */ - int newfd; - char *fdpath = NULL; - - if (asprintf(&fdpath, "/proc/self/fd/%d", memfd) < 0) - goto error; - newfd = open(fdpath, O_RDONLY | O_CLOEXEC); - free(fdpath); - if (newfd < 0) + if (seal_execfd(&memfd, fdtype) < 0) goto error; - - close(memfd); - memfd = newfd; -#endif return memfd; error_binfd: From e8ffbab24d369b4167c9ce33c394159b9655ab1a Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 19 Feb 2019 22:33:09 +1100 Subject: [PATCH 6/8] nsenter: cloned_binary: use the runc statedir for O_TMPFILE Writing a file to tmpfs actually incurs a memcg penalty, and thus the benefit of being able to disable memfd_create(2) with _LIBCONTAINER_DISABLE_MEMFD_CLONE is fairly minimal -- though it should be noted that quite a few distributions don't use tmpfs for /tmp (and instead have it as a regular directory or subvolume of the host filesystem). Since runc must have write access to the state directory anyway (and the state directory is usually not on a tmpfs) we can use that instead of /tmp -- avoiding potential memcg costs with no real downside. Signed-off-by: Aleksa Sarai (cherry picked from commit af9da0a45082783f6005b252488943b5ee2e2138) Signed-off-by: Sebastiaan van Stijn --- libcontainer/container_linux.go | 1 + libcontainer/nsenter/cloned_binary.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index db2242e2696..4bdfd5778e4 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -487,6 +487,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe) cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), + fmt.Sprintf("_LIBCONTAINER_STATEDIR=%s", c.root), ) // NOTE: when running a container with no PID namespace and the parent process spawning the container is // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c index 24b79895147..548ff9f2425 100644 --- a/libcontainer/nsenter/cloned_binary.c +++ b/libcontainer/nsenter/cloned_binary.c @@ -217,8 +217,14 @@ enum { static int make_execfd(int *fdtype) { - int fd; - char template[] = "/tmp/runc-cloned-binary.XXXXXX"; + int fd = -1; + char template[PATH_MAX] = {0}; + char *prefix = secure_getenv("_LIBCONTAINER_STATEDIR"); + + if (!prefix || *prefix != '/') + prefix = "/tmp"; + if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) + return -1; /* * Try memfd first, it's much nicer since it's easily detected thanks to @@ -238,7 +244,7 @@ static int make_execfd(int *fdtype) * fd re-open it and clear O_EXCL). */ *fdtype = EFD_FILE; - fd = open("/tmp", O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); + fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); if (fd >= 0) { struct stat statbuf = {}; bool working_otmpfile = false; From 293faf43accca3aafd1c4c4be8431368a19be5c8 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 26 Feb 2019 20:16:17 +1100 Subject: [PATCH 7/8] nsenter: cloned_binary: try to ro-bind /proc/self/exe before copying The usage of memfd_create(2) and other copying techniques is quite wasteful, despite attempts to minimise it with _LIBCONTAINER_STATEDIR. memfd_create(2) added ~10M of memory usage to the cgroup associated with the container, which can result in some setups getting OOM'd (or just hogging the hosts' memory when you have lots of created-but-not-started containers sticking around). The easiest way of solving this is by creating a read-only bind-mount of the binary, opening that read-only bindmount, and then umounting it to ensure that the host won't accidentally be re-mounted read-write. This avoids all copying and cleans up naturally like the other techniques used. Unfortunately, like the O_TMPFILE fallback, this requires being able to create a file inside _LIBCONTAINER_STATEDIR (since bind-mounting over the most obvious path -- /proc/self/exe -- is a *very bad idea*). Unfortunately detecting this isn't fool-proof -- on a system with a read-only root filesystem (that might become read-write during "runc init" execution), we cannot tell whether we have already done an ro remount. As a partial mitigation, we store a _LIBCONTAINER_CLONED_BINARY environment variable which is checked *alongside* the protection being present. Signed-off-by: Aleksa Sarai (cherry picked from commit 16612d74de5f84977e50a9c8ead7f0e9e13b8628) Signed-off-by: Sebastiaan van Stijn --- libcontainer/nsenter/cloned_binary.c | 157 ++++++++++++++++++++++----- 1 file changed, 131 insertions(+), 26 deletions(-) diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c index 548ff9f2425..1381324bcea 100644 --- a/libcontainer/nsenter/cloned_binary.c +++ b/libcontainer/nsenter/cloned_binary.c @@ -27,8 +27,10 @@ #include #include +#include #include #include +#include #include #include @@ -67,6 +69,7 @@ int memfd_create(const char *name, unsigned int flags) # define F_SEAL_WRITE 0x0008 /* prevent writes */ #endif +#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY" #define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" #define RUNC_MEMFD_SEALS \ (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) @@ -88,29 +91,56 @@ static void *must_realloc(void *ptr, size_t size) static int is_self_cloned(void) { int fd, ret, is_cloned = 0; + struct stat statbuf = {}; + struct statfs fsbuf = {}; fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC); if (fd < 0) return -ENOTRECOVERABLE; - /* First check memfd. */ + /* + * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for + * this, because you cannot write to a sealed memfd no matter what (so + * sharing it isn't a bad thing -- and an admin could bind-mount a sealed + * memfd to /usr/bin/runc to allow re-use). + */ ret = fcntl(fd, F_GET_SEALS); if (ret >= 0) { is_cloned = (ret == RUNC_MEMFD_SEALS); - } else { - /* - * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6 - * which appears to have a borked backport of F_GET_SEALS. Either way, - * having a file which has no hardlinks indicates that we aren't using - * a host-side "runc" binary and this is something that a container - * cannot fake (because unlinking requires being able to resolve the - * path that you want to unlink). - */ - struct stat statbuf = {}; - if (fstat(fd, &statbuf) >= 0) - is_cloned = (statbuf.st_nlink == 0); + goto out; } + /* + * All other forms require CLONED_BINARY_ENV, since they are potentially + * writeable (or we can't tell if they're fully safe) and thus we must + * check the environment as an extra layer of defence. + */ + if (!getenv(CLONED_BINARY_ENV)) { + is_cloned = false; + goto out; + } + + /* + * Is the binary on a read-only filesystem? We can't detect bind-mounts in + * particular (in-kernel they are identical to regular mounts) but we can + * at least be sure that it's read-only. In addition, to make sure that + * it's *our* bind-mount we check CLONED_BINARY_ENV. + */ + if (fstatfs(fd, &fsbuf) >= 0) + is_cloned |= (fsbuf.f_flags & MS_RDONLY); + + /* + * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6 + * which appears to have a borked backport of F_GET_SEALS. Either way, + * having a file which has no hardlinks indicates that we aren't using + * a host-side "runc" binary and this is something that a container + * cannot fake (because unlinking requires being able to resolve the + * path that you want to unlink). + */ + if (fstat(fd, &statbuf) >= 0) + is_cloned |= (statbuf.st_nlink == 0); + +out: close(fd); return is_cloned; } @@ -227,15 +257,16 @@ static int make_execfd(int *fdtype) return -1; /* - * Try memfd first, it's much nicer since it's easily detected thanks to - * sealing and also doesn't require assumptions like /tmp. + * Now try memfd, it's much nicer than actually creating a file in STATEDIR + * since it's easily detected thanks to sealing and also doesn't require + * assumptions about STATEDIR. */ *fdtype = EFD_MEMFD; fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); if (fd >= 0) return fd; - if (errno != ENOSYS) - goto err; + if (errno != ENOSYS && errno != EINVAL) + goto error; #ifdef O_TMPFILE /* @@ -266,7 +297,7 @@ static int make_execfd(int *fdtype) errno = EISDIR; } if (errno != EISDIR) - goto err; + goto error; #endif /* defined(O_TMPFILE) */ /* @@ -281,7 +312,7 @@ static int make_execfd(int *fdtype) close(fd); } -err: +error: *fdtype = EFD_NONE; return -1; } @@ -316,15 +347,83 @@ static int seal_execfd(int *fd, int fdtype) return -1; } +static int try_bindfd(void) +{ + int fd, ret = -1; + char template[PATH_MAX] = {0}; + char *prefix = secure_getenv("_LIBCONTAINER_STATEDIR"); + + if (!prefix || *prefix != '/') + prefix = "/tmp"; + if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) + return ret; + + /* + * We need somewhere to mount it, mounting anything over /proc/self is a + * BAD idea on the host -- even if we do it temporarily. + */ + fd = mkstemp(template); + if (fd < 0) + return ret; + close(fd); + + /* + * For obvious reasons this won't work in rootless mode because we haven't + * created a userns+mntns -- but getting that to work will be a bit + * complicated and it's only worth doing if someone actually needs it. + */ + ret = -EPERM; + if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0) + goto out; + if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0) + goto out_umount; + + + /* Get read-only handle that we're sure can't be made read-write. */ + ret = open(template, O_PATH | O_CLOEXEC); + +out_umount: + /* + * Make sure the MNT_DETACH works, otherwise we could get remounted + * read-write and that would be quite bad (the fd would be made read-write + * too, invalidating the protection). + */ + if (umount2(template, MNT_DETACH) < 0) { + if (ret >= 0) + close(ret); + ret = -ENOTRECOVERABLE; + } + +out: + /* + * We don't care about unlink errors, the worst that happens is that + * there's an empty file left around in STATEDIR. + */ + unlink(template); + return ret; +} + static int clone_binary(void) { - int binfd, memfd; + int binfd, execfd; struct stat statbuf = {}; size_t sent = 0; int fdtype = EFD_NONE; - memfd = make_execfd(&fdtype); - if (memfd < 0 || fdtype == EFD_NONE) + /* + * Before we resort to copying, let's try creating an ro-binfd in one shot + * by getting a handle for a read-only bind-mount of the execfd. + */ + execfd = try_bindfd(); + if (execfd >= 0) + return execfd; + + /* + * Dammit, that didn't work -- time to copy the binary to a safe place we + * can seal the contents. + */ + execfd = make_execfd(&fdtype); + if (execfd < 0 || fdtype == EFD_NONE) return -ENOTRECOVERABLE; binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); @@ -335,7 +434,7 @@ static int clone_binary(void) goto error_binfd; while (sent < statbuf.st_size) { - int n = sendfile(memfd, binfd, NULL, statbuf.st_size - sent); + int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent); if (n < 0) goto error_binfd; sent += n; @@ -344,14 +443,15 @@ static int clone_binary(void) if (sent != statbuf.st_size) goto error; - if (seal_execfd(&memfd, fdtype) < 0) + if (seal_execfd(&execfd, fdtype) < 0) goto error; - return memfd; + + return execfd; error_binfd: close(binfd); error: - close(memfd); + close(execfd); return -EIO; } @@ -375,6 +475,11 @@ int ensure_cloned_binary(void) if (execfd < 0) return -EIO; + if (putenv(CLONED_BINARY_ENV "=1")) + goto error; + fexecve(execfd, argv, environ); +error: + close(execfd); return -ENOEXEC; } From e89ebf34d44f2b15fb15bf8dbb79dea630a7b3a6 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Fri, 1 Mar 2019 18:23:54 +1100 Subject: [PATCH 8/8] nsenter: cloned_binary: userspace copy fallback if sendfile fails There are some circumstances where sendfile(2) can fail (one example is that AppArmor appears to block writing to deleted files with sendfile(2) under some circumstances) and so we need to have a userspace fallback. It's fairly trivial (and handles short-writes). Signed-off-by: Aleksa Sarai (cherry picked from commit 2d4a37b427167907ef2402586a8e8e2931a22490) Signed-off-by: Sebastiaan van Stijn --- libcontainer/nsenter/cloned_binary.c | 37 +++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c index 1381324bcea..b410e295170 100644 --- a/libcontainer/nsenter/cloned_binary.c +++ b/libcontainer/nsenter/cloned_binary.c @@ -160,7 +160,7 @@ static char *read_file(char *path, size_t *length) *length = 0; for (;;) { - int n; + ssize_t n; n = read(fd, buf, sizeof(buf)); if (n < 0) @@ -403,6 +403,33 @@ static int try_bindfd(void) return ret; } +static ssize_t fd_to_fd(int outfd, int infd) +{ + ssize_t total = 0; + char buffer[4096]; + + for (;;) { + ssize_t nread, nwritten = 0; + + nread = read(infd, buffer, sizeof(buffer)); + if (nread < 0) + return -1; + if (!nread) + break; + + do { + ssize_t n = write(outfd, buffer + nwritten, nread - nwritten); + if (n < 0) + return -1; + nwritten += n; + } while(nwritten < nread); + + total += nwritten; + } + + return total; +} + static int clone_binary(void) { int binfd, execfd; @@ -435,8 +462,12 @@ static int clone_binary(void) while (sent < statbuf.st_size) { int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent); - if (n < 0) - goto error_binfd; + if (n < 0) { + /* sendfile can fail so we fallback to a dumb user-space copy. */ + n = fd_to_fd(execfd, binfd); + if (n < 0) + goto error_binfd; + } sent += n; } close(binfd);