From 7020692c995d2b2a25146f0aa57ef26324206755 Mon Sep 17 00:00:00 2001 From: Grzegorz Uriasz Date: Mon, 20 Mar 2023 06:24:06 +0100 Subject: [PATCH] Use the close_range syscall for sanitizing the jailer process Closes all open FDs in a single syscall. In case close_range is not available then fallback to the old approach. In case the fallback fails then discover open FDs by reading from /proc/self/fds. Fixes a bug where firecracker would end up spending minutes starting which arises when running on systems with OPEN_MAX set to a very high number. Signed-off-by: Grzegorz Uriasz --- Cargo.lock | 29 ++++++++- src/jailer/Cargo.toml | 1 + src/jailer/src/main.rs | 144 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 157 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 02483fe4cef5..482a15c816dc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -587,6 +587,7 @@ name = "jailer" version = "1.3.0" dependencies = [ "libc", + "nix 0.26.2", "regex", "thiserror", "utils", @@ -764,6 +765,20 @@ dependencies = [ "memoffset 0.6.5", ] +[[package]] +name = "nix" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" +dependencies = [ + "bitflags", + "cfg-if", + "libc", + "memoffset 0.7.1", + "pin-utils", + "static_assertions", +] + [[package]] name = "nom" version = "7.1.1" @@ -823,6 +838,12 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "plotters" version = "0.3.4" @@ -1110,6 +1131,12 @@ dependencies = [ "versionize_derive", ] +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "subtle" version = "2.4.1" @@ -1212,7 +1239,7 @@ dependencies = [ "bitflags", "cfg-if", "libc", - "nix", + "nix 0.23.1", "thiserror", "userfaultfd-sys", ] diff --git a/src/jailer/Cargo.toml b/src/jailer/Cargo.toml index 517067b8f58f..2882ec032d6d 100644 --- a/src/jailer/Cargo.toml +++ b/src/jailer/Cargo.toml @@ -10,6 +10,7 @@ license = "Apache-2.0" [dependencies] libc = "0.2.117" +nix = { version = "0.26.2", features = ["dir"] } regex = { version = "1.5.5", default-features = false, features = ["std"] } thiserror = "1.0.32" diff --git a/src/jailer/src/main.rs b/src/jailer/src/main.rs index ceff5a12a607..4bcbc87d7e89 100644 --- a/src/jailer/src/main.rs +++ b/src/jailer/src/main.rs @@ -6,6 +6,7 @@ mod chroot; mod env; mod resource_limits; use std::ffi::{CString, NulError, OsString}; +use std::os::unix::prelude::AsRawFd; use std::path::{Path, PathBuf}; use std::{env as p_env, fs, io, process, result}; @@ -235,21 +236,91 @@ pub fn readln_special>(file_path: &T) -> Result { Ok(line) } -fn sanitize_process() { - // First thing to do is make sure we don't keep any inherited FDs - // other that IN, OUT and ERR. +fn close_fds_by_close_range() -> Option<()> { + // First try using the close_range syscall to close all open FDs in the range of 4..UINT_MAX + // SAFETY: if the syscall is not available then ENOSYS will be returned + if unsafe { + libc::syscall( + libc::SYS_close_range, + 3, + libc::c_uint::MAX, + libc::CLOSE_RANGE_UNSHARE, + ) + } == 0 + { + return Some(()); + } + return None; +} + +fn close_fds_by_iteration() -> Option<()> { + // Probably the kernel does not support close_range, kernel < 5.9 // SAFETY: Always safe. let fd_limit = i32::try_from(unsafe { libc::sysconf(libc::_SC_OPEN_MAX) }).unwrap(); - // Close all file descriptors excluding 0 (STDIN), 1 (STDOUT) and 2 (STDERR). - for fd in 3..fd_limit { - // SAFETY: Safe because close() cannot fail when passed a valid parameter. - unsafe { - libc::close(fd); + + // Check if the fd_limit is small enough to iterate through - 65536 sounds sane + if fd_limit <= 2_i32.pow(16) { + // Do not trust values bellow 1024 + for fd in 3..fd_limit.max(2_i32.pow(10)) { + // SAFETY: Safe because close() cannot fail when passed a valid parameter. + unsafe { + libc::close(fd); + } + } + return Some(()); + } + return None; +} + +fn close_fds_by_reading_proc() -> Option<()> { + // If both methods failed then fallback to reading /proc + // This means that we are on kernel < 5.9 and OPEN_MAX is set to a pathological size + // We can't use std::fs::ReadDir here as under the hood we need access to the dirfd in order to + // not close it twice + if let Ok(mut dir) = nix::dir::Dir::open( + "/proc/self/fd", + nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_NOATIME, + nix::sys::stat::Mode::empty(), + ) { + let dirfd = dir.as_raw_fd(); + + let mut c = dir.iter(); + while let Some(Ok(path)) = c.next() { + let file_name = path.file_name(); + let fd_str = file_name.to_str().unwrap_or("0"); + let fd = fd_str.parse::().unwrap_or(0); + + if fd > 2 && fd != dirfd { + // SAFETY: Safe because close() cannot fail when passed a valid parameter. + unsafe { libc::close(fd) }; + } } + + return Some(()); + } + + return None; +} + +// Closes all FDs other than 0 (STDIN), 1 (STDOUT) and 2 (STDERR) +fn close_inherited_fds() -> Option<()> { + // Try variuos methods until one of them works + if close_fds_by_close_range().is_none() + && close_fds_by_iteration().is_none() + && close_fds_by_reading_proc().is_none() + { + return None; } + return Some(()); +} +fn sanitize_process() -> Option<()> { + // First thing to do is make sure we don't keep any inherited FDs + // other that IN, OUT and ERR. + close_inherited_fds()?; // Cleanup environment variables clean_env_vars(); + Some(()) } fn clean_env_vars() { @@ -275,7 +346,9 @@ pub fn to_cstring>(path: T) -> Result { } fn main() { - sanitize_process(); + if sanitize_process().is_none() { + panic!("Unable to sanitize the process") + } let mut arg_parser = build_arg_parser(); @@ -321,28 +394,31 @@ fn main() { mod tests { #![allow(clippy::undocumented_unsafe_blocks)] use std::env; + use std::ffi::CStr; use std::fs::File; use std::os::unix::io::IntoRawFd; - use utils::arg_parser; + use utils::{arg_parser, rand}; use super::*; - #[test] - fn test_sanitize_process() { + fn run_close_fds_test(test_fn: fn() -> Option<()>) { let n = 100; - let tmp_dir_path = "/tmp/jailer/tests/sanitize_process"; - assert!(fs::create_dir_all(tmp_dir_path).is_ok()); + let tmp_dir_path = format!( + "/tmp/jailer/tests/close_fds/_{}", + rand::rand_alphanumerics(4).into_string().unwrap() + ); + assert!(fs::create_dir_all(&tmp_dir_path).is_ok()); let mut fds = Vec::new(); for i in 0..n { - let maybe_file = File::create(format!("{}/{}", tmp_dir_path, i)); + let maybe_file = File::create(format!("{}/{}", &tmp_dir_path, i)); assert!(maybe_file.is_ok()); fds.push(maybe_file.unwrap().into_raw_fd()); } - sanitize_process(); + assert_eq!(test_fn(), Some(())); for fd in fds { let is_fd_opened = unsafe { libc::fcntl(fd, libc::F_GETFD) } == 0; @@ -352,6 +428,42 @@ mod tests { assert!(fs::remove_dir_all(tmp_dir_path).is_ok()); } + #[test] + fn test_fds_close_range() { + // SAFETY: Always safe + let mut n = unsafe { std::mem::zeroed() }; + // SAFETY: We check if the uname call succeeded + assert_eq!(unsafe { libc::uname(&mut n) }, 0); + // SAFETY: Always safe + let release = unsafe { CStr::from_ptr(n.release.as_ptr()) } + .to_string_lossy() + .into_owned(); + // Parse the major and minor version of the kernel + let mut r = release.split("."); + let major: i32 = str::parse(r.next().unwrap()).unwrap(); + let minor: i32 = str::parse(r.next().unwrap()).unwrap(); + + // Skip this test if we're running on a too old kernel + if major > 5 || (major == 5 && minor >= 9) { + run_close_fds_test(close_fds_by_close_range); + } + } + + #[test] + fn test_fds_iteration() { + run_close_fds_test(close_fds_by_iteration); + } + + #[test] + fn test_fds_proc() { + run_close_fds_test(close_fds_by_reading_proc); + } + + #[test] + fn test_sanitize_process() { + run_close_fds_test(sanitize_process); + } + #[test] fn test_clean_env_vars() { let env_vars: [&str; 5] = ["VAR1", "VAR2", "VAR3", "VAR4", "VAR5"];