From caeb01c48e3200254a2f26506a4cc4059cf62130 Mon Sep 17 00:00:00 2001 From: Zhang Tianyang Date: Fri, 5 Apr 2024 16:25:09 +0800 Subject: [PATCH] bugfix: residual fds of netlink socket and tun Signed-off-by: Zhang Tianyang --- vmm/sandbox/src/cloud_hypervisor/mod.rs | 31 ++++++------ vmm/sandbox/src/device.rs | 20 +++----- vmm/sandbox/src/network/link.rs | 7 +-- vmm/sandbox/src/qemu/devices/vsock.rs | 8 +-- vmm/sandbox/src/qemu/mod.rs | 29 ++++++----- vmm/sandbox/src/stratovirt/devices/vsock.rs | 8 +-- vmm/sandbox/src/stratovirt/mod.rs | 27 ++++++----- vmm/sandbox/src/utils.rs | 54 ++++++++++++++------- 8 files changed, 102 insertions(+), 82 deletions(-) diff --git a/vmm/sandbox/src/cloud_hypervisor/mod.rs b/vmm/sandbox/src/cloud_hypervisor/mod.rs index 8eeaef27..ac422d79 100644 --- a/vmm/sandbox/src/cloud_hypervisor/mod.rs +++ b/vmm/sandbox/src/cloud_hypervisor/mod.rs @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -use std::{os::unix::io::RawFd, process::Stdio, time::Duration}; +use std::{os::fd::OwnedFd, process::Stdio, time::Duration}; use anyhow::anyhow; use async_trait::async_trait; @@ -72,7 +72,8 @@ pub struct CloudHypervisorVM { wait_chan: Option>, #[serde(skip)] client: Option, - fds: Vec, + #[serde(skip)] + fds: Vec, pids: Pids, } @@ -142,7 +143,7 @@ impl CloudHypervisorVM { Ok(pid) } - fn append_fd(&mut self, fd: RawFd) -> usize { + fn append_fd(&mut self, fd: OwnedFd) -> usize { self.fds.push(fd); self.fds.len() - 1 + 3 } @@ -175,17 +176,19 @@ impl VM for CloudHypervisorVM { params.push("-vv".to_string()); } - let mut cmd = tokio::process::Command::new(&self.config.path); - cmd.args(params.as_slice()); - - set_cmd_fd(&mut cmd, self.fds.to_vec())?; - set_cmd_netns(&mut cmd, self.netns.to_string())?; - cmd.stdout(Stdio::piped()); - cmd.stderr(Stdio::piped()); - info!("start cloud hypervisor with cmdline: {:?}", cmd); - let child = cmd - .spawn() - .map_err(|e| anyhow!("failed to spawn cloud hypervisor command: {}", e))?; + // Drop cmd immediately to let the fds in pre_exec be closed. + let child = { + let mut cmd = tokio::process::Command::new(&self.config.path); + cmd.args(params.as_slice()); + + set_cmd_fd(&mut cmd, self.fds.drain(..).collect())?; + set_cmd_netns(&mut cmd, self.netns.to_string())?; + cmd.stdout(Stdio::piped()); + cmd.stderr(Stdio::piped()); + info!("start cloud hypervisor with cmdline: {:?}", cmd); + cmd.spawn() + .map_err(|e| anyhow!("failed to spawn cloud hypervisor command: {}", e))? + }; let pid = child.id(); self.pids.vmm_pid = pid; let pid_file = format!("{}/pid", self.base_dir); diff --git a/vmm/sandbox/src/device.rs b/vmm/sandbox/src/device.rs index 09d39e8f..38fd8a3e 100644 --- a/vmm/sandbox/src/device.rs +++ b/vmm/sandbox/src/device.rs @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -use std::os::unix::io::RawFd; +use std::os::fd::OwnedFd; use containerd_sandbox::error::{Error, Result}; @@ -182,41 +182,38 @@ impl Transport { } } -#[derive(Debug, Clone)] +#[derive(Debug)] pub enum DeviceInfo { Block(BlockDeviceInfo), - #[allow(dead_code)] Tap(TapDeviceInfo), - #[allow(dead_code)] Physical(PhysicalDeviceInfo), - #[allow(dead_code)] VhostUser(VhostUserDeviceInfo), Char(CharDeviceInfo), } -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct BlockDeviceInfo { pub id: String, pub path: String, pub read_only: bool, } -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct TapDeviceInfo { pub id: String, pub index: u32, pub name: String, pub mac_address: String, - pub fds: Vec, + pub fds: Vec, } -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct PhysicalDeviceInfo { pub id: String, pub bdf: String, } -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct VhostUserDeviceInfo { pub id: String, pub socket_path: String, @@ -224,7 +221,7 @@ pub struct VhostUserDeviceInfo { pub r#type: String, } -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct CharDeviceInfo { pub id: String, pub chardev_id: String, @@ -235,6 +232,5 @@ pub struct CharDeviceInfo { #[derive(Debug, Clone, Eq, PartialEq)] pub enum CharBackendType { Pipe(String), - #[allow(dead_code)] Socket(String), } diff --git a/vmm/sandbox/src/network/link.rs b/vmm/sandbox/src/network/link.rs index 5ddc2785..c33cdad2 100644 --- a/vmm/sandbox/src/network/link.rs +++ b/vmm/sandbox/src/network/link.rs @@ -324,11 +324,11 @@ impl NetworkInterface { Ok(()) } - pub async fn attach_to(&self, sandbox: &mut KuasarSandbox) -> Result<()> { + pub async fn attach_to(&mut self, sandbox: &mut KuasarSandbox) -> Result<()> { let id = format!("intf-{}", self.index); match &self.r#type { LinkType::Veth => { - if let Some(intf) = &self.twin { + if let Some(intf) = self.twin.as_mut() { sandbox .vm .attach(DeviceInfo::Tap(TapDeviceInfo { @@ -336,7 +336,7 @@ impl NetworkInterface { index: self.index, name: intf.name.to_string(), mac_address: self.mac_address.to_string(), - fds: intf.fds.iter().map(|fd| fd.as_raw_fd()).collect(), + fds: intf.fds.drain(..).collect(), })) .await?; } else { @@ -470,6 +470,7 @@ fn get_bdf_for_eth(if_name: &str) -> Result { e ) })?; + close(sock).unwrap_or_default(); Ok(bdf.to_string()) } diff --git a/vmm/sandbox/src/qemu/devices/vsock.rs b/vmm/sandbox/src/qemu/devices/vsock.rs index 7833dedf..6bc13f11 100644 --- a/vmm/sandbox/src/qemu/devices/vsock.rs +++ b/vmm/sandbox/src/qemu/devices/vsock.rs @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -use std::os::unix::io::{IntoRawFd, RawFd}; +use std::os::fd::{AsRawFd, OwnedFd}; use anyhow::anyhow; use containerd_sandbox::error::Result; @@ -60,7 +60,7 @@ impl VSockDevice { } } -pub async fn find_context_id() -> Result<(RawFd, u64)> { +pub async fn find_context_id() -> Result<(OwnedFd, u64)> { // TODO make sure if this thread_rng is enough, if we should new a seedable rng everytime. let vsock_file = tokio::fs::OpenOptions::new() .read(true) @@ -68,10 +68,10 @@ pub async fn find_context_id() -> Result<(RawFd, u64)> { .mode(0o666) .open(VHOST_VSOCK_DEV_PATH) .await?; - let vsockfd = vsock_file.into_std().await.into_raw_fd(); + let vsockfd = OwnedFd::from(vsock_file.into_std().await); for _i in 0..IOCTL_TRY_TIMES { let cid = thread_rng().gen_range(3..i32::MAX as u64); - let res = unsafe { set_vhost_guest_cid(vsockfd, &cid) }; + let res = unsafe { set_vhost_guest_cid(vsockfd.as_raw_fd(), &cid) }; match res { Ok(_) => return Ok((vsockfd, cid)), Err(_) => continue, diff --git a/vmm/sandbox/src/qemu/mod.rs b/vmm/sandbox/src/qemu/mod.rs index 9b8ddcec..855457eb 100644 --- a/vmm/sandbox/src/qemu/mod.rs +++ b/vmm/sandbox/src/qemu/mod.rs @@ -16,7 +16,10 @@ limitations under the License. use std::{ collections::HashMap, - os::unix::io::{AsRawFd, FromRawFd, RawFd}, + os::{ + fd::OwnedFd, + unix::io::{AsRawFd, FromRawFd, RawFd}, + }, time::{Duration, SystemTime}, }; @@ -82,7 +85,8 @@ pub struct QemuVM { devices: Vec>, #[serde(skip)] hot_attached_devices: Vec>, - fds: Vec, + #[serde(skip)] + fds: Vec, console_socket: String, agent_socket: String, netns: String, @@ -101,8 +105,6 @@ impl VM for QemuVM { debug!("start vm {}", self.id); let wait_chan = self.launch().await?; self.wait_chan = Some(wait_chan); - // close the fds after launch qemu - self.fds = vec![]; let start_time = SystemTime::now(); loop { match self.create_client().await { @@ -342,17 +344,17 @@ impl QemuVM { self.devices.push(Box::new(device)); } - fn append_fd(&mut self, fd: RawFd) -> usize { + fn append_fd(&mut self, fd: OwnedFd) -> usize { self.fds.push(fd); self.fds.len() - 1 + 3 } - async fn launch(&self) -> Result> { + async fn launch(&mut self) -> Result> { let mut params = self.config.to_cmdline_params("-"); for d in self.devices.iter() { params.extend(d.to_cmdline_params("-")); } - let fds = self.fds.to_vec(); + let fds: Vec = self.fds.drain(..).collect(); let path = self.config.path.to_string(); // pid file should not be empty let pid_file = self.config.pid_file.to_string(); @@ -367,20 +369,17 @@ impl QemuVM { spawn_blocking(move || -> Result<()> { let mut cmd = unshare::Command::new(&*path_clone); cmd.args(params.as_slice()); - for (i, &x) in fds.iter().enumerate() { - cmd.file_descriptor( - (3 + i) as RawFd, - Fd::from_file(unsafe { std::fs::File::from_raw_fd(x) }), - ); + let pipe_writer2 = pipe_writer.try_clone()?; + cmd.stdout(unshare::Stdio::from_file(pipe_writer)); + cmd.stderr(unshare::Stdio::from_file(pipe_writer2)); + for (i, x) in fds.into_iter().enumerate() { + cmd.file_descriptor((3 + i) as RawFd, Fd::from_file(std::fs::File::from(x))); } if !netns.is_empty() { let netns_fd = nix::fcntl::open(&*netns, OFlag::O_CLOEXEC, Mode::empty()) .map_err(|e| anyhow!("failed to open netns {}", e))?; cmd.set_namespace(&netns_fd, unshare::Namespace::Net)?; } - let pipe_writer2 = pipe_writer.try_clone()?; - cmd.stdout(unshare::Stdio::from_file(pipe_writer)); - cmd.stderr(unshare::Stdio::from_file(pipe_writer2)); let mut child = cmd .spawn() .map_err(|e| anyhow!("failed to spawn qemu command: {}", e))?; diff --git a/vmm/sandbox/src/stratovirt/devices/vsock.rs b/vmm/sandbox/src/stratovirt/devices/vsock.rs index 2239f011..f441fc9f 100644 --- a/vmm/sandbox/src/stratovirt/devices/vsock.rs +++ b/vmm/sandbox/src/stratovirt/devices/vsock.rs @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -use std::os::unix::io::{IntoRawFd, RawFd}; +use std::os::fd::{AsRawFd, OwnedFd}; use anyhow::anyhow; use containerd_sandbox::error::Result; @@ -62,7 +62,7 @@ impl VSockDevice { } } -pub async fn find_context_id() -> Result<(RawFd, u64)> { +pub async fn find_context_id() -> Result<(OwnedFd, u64)> { // TODO make sure if this thread_rng is enough, if we should new a seedable rng everytime. let vsock_file = tokio::fs::OpenOptions::new() .read(true) @@ -70,10 +70,10 @@ pub async fn find_context_id() -> Result<(RawFd, u64)> { .mode(0o666) .open(VHOST_VSOCK_DEV_PATH) .await?; - let vsockfd = vsock_file.into_std().await.into_raw_fd(); + let vsockfd = OwnedFd::from(vsock_file.into_std().await); for _i in 0..IOCTL_TRY_TIMES { let cid = thread_rng().gen_range(3..i32::MAX as u64); - let res = unsafe { set_vhost_guest_cid(vsockfd, &cid) }; + let res = unsafe { set_vhost_guest_cid(vsockfd.as_raw_fd(), &cid) }; match res { Ok(_) => return Ok((vsockfd, cid)), Err(_) => continue, diff --git a/vmm/sandbox/src/stratovirt/mod.rs b/vmm/sandbox/src/stratovirt/mod.rs index 787cfff1..14e0620c 100644 --- a/vmm/sandbox/src/stratovirt/mod.rs +++ b/vmm/sandbox/src/stratovirt/mod.rs @@ -16,7 +16,10 @@ limitations under the License. use std::{ collections::HashMap, - os::unix::io::{AsRawFd, FromRawFd, RawFd}, + os::{ + fd::OwnedFd, + unix::io::{AsRawFd, FromRawFd, RawFd}, + }, time::{Duration, SystemTime}, }; @@ -87,7 +90,8 @@ pub struct StratoVirtVM { devices: Vec>, #[serde(skip)] hot_attached_devices: Vec>, - fds: Vec, + #[serde(skip)] + fds: Vec, console_socket: String, agent_socket: String, netns: String, @@ -309,17 +313,17 @@ impl StratoVirtVM { self.devices.push(Box::new(device)); } - fn append_fd(&mut self, fd: RawFd) -> usize { + fn append_fd(&mut self, fd: OwnedFd) -> usize { self.fds.push(fd); self.fds.len() - 1 + 3 } - async fn launch(&self) -> Result> { + async fn launch(&mut self) -> Result> { let mut params = self.config.to_cmdline_params("-"); for d in self.devices.iter() { params.extend(d.to_cmdline_params("-")); } - let fds = self.fds.to_vec(); + let fds: Vec = self.fds.drain(..).collect(); let path = self.config.path.to_string(); // pid file should not be empty let pid_file = self.config.pid_file.to_string(); @@ -335,11 +339,11 @@ impl StratoVirtVM { let mut cmd = unshare::Command::new(&*path_clone); cmd.args(params.as_slice()); - for (i, &x) in fds.iter().enumerate() { - cmd.file_descriptor( - (3 + i) as RawFd, - Fd::from_file(unsafe { std::fs::File::from_raw_fd(x) }), - ); + let pipe_writer2 = pipe_writer.try_clone()?; + cmd.stdout(unshare::Stdio::from_file(pipe_writer)); + cmd.stderr(unshare::Stdio::from_file(pipe_writer2)); + for (i, x) in fds.into_iter().enumerate() { + cmd.file_descriptor((3 + i) as RawFd, Fd::from_file(std::fs::File::from(x))); } if !netns.is_empty() { @@ -347,9 +351,6 @@ impl StratoVirtVM { .map_err(|e| anyhow!("failed to open netns {}", e))?; cmd.set_namespace(&netns_fd, unshare::Namespace::Net)?; } - let pipe_writer2 = pipe_writer.try_clone()?; - cmd.stdout(unshare::Stdio::from_file(pipe_writer)); - cmd.stderr(unshare::Stdio::from_file(pipe_writer2)); let mut child = cmd .spawn() .map_err(|e| anyhow!("failed to spawn stratovirt command: {}", e))?; diff --git a/vmm/sandbox/src/utils.rs b/vmm/sandbox/src/utils.rs index 37f11261..729473bf 100644 --- a/vmm/sandbox/src/utils.rs +++ b/vmm/sandbox/src/utils.rs @@ -15,9 +15,13 @@ limitations under the License. */ use std::{ - os::unix::{ - io::RawFd, - prelude::{AsRawFd, FromRawFd, OwnedFd}, + mem, + os::{ + fd::IntoRawFd, + unix::{ + io::RawFd, + prelude::{AsRawFd, FromRawFd, OwnedFd}, + }, }, path::Path, str::FromStr, @@ -32,10 +36,11 @@ use containerd_sandbox::{ }; use log::{error, info}; use nix::{ - fcntl::{open, OFlag}, - libc::{dup2, fcntl, kill, setns, FD_CLOEXEC, F_GETFD, F_SETFD}, + fcntl::{fcntl, open, FdFlag, OFlag, F_GETFD, F_SETFD}, + libc::{kill, setns, FD_CLOEXEC}, sched::CloneFlags, sys::stat::Mode, + unistd::dup2, }; use time::OffsetDateTime; use tokio::{ @@ -450,24 +455,39 @@ pub fn set_cmd_netns(cmd: &mut Command, netns: String) -> Result<()> { Ok(()) } -pub fn set_cmd_fd(cmd: &mut Command, fds: Vec) -> Result<()> { +pub fn set_cmd_fd(cmd: &mut Command, mut fds: Vec) -> Result<()> { unsafe { cmd.pre_exec(move || { - for (i, &fd) in fds.iter().enumerate() { - let dest_fd = (3 + i) as RawFd; - let src_fd = fd; - - if src_fd == dest_fd { - let flags = fcntl(src_fd, F_GETFD); - if flags < 0 || fcntl(src_fd, F_SETFD, flags & !FD_CLOEXEC) < 0 { + for (i, fd) in mem::take(&mut fds).into_iter().enumerate() { + let new_fd = (3 + i) as RawFd; + + // Closing the fd when its lifecycle finished is unsafe, so transfer it into RawFD + // to let its closing not be influenced by rust lifecycle management. + let old_fd = fd.into_raw_fd(); + + if old_fd == new_fd { + // old_fd equals new_fd means the index is in the right place, so child process + // could used it directly. In this case, should remove CLOEXEC flag to avoid + // closing it after execve. + let flags = fcntl(old_fd, F_GETFD)?; + if flags < 0 { let e = std::io::Error::last_os_error(); - eprintln!("failed to call fnctl: {}", e); + eprintln!("failed to get fnctl F_GETFD: {}", e); return Err(e); + } else if let Err(e) = fcntl( + old_fd, + F_SETFD(FdFlag::from_bits_truncate(flags & !FD_CLOEXEC)), + ) { + eprintln!("failed to call fnctl F_SETFD: {}", e); + return Err(e.into()); } - } else if dup2(src_fd, dest_fd) < 0 { - let e = std::io::Error::last_os_error(); + } else if let Err(e) = dup2(old_fd, new_fd) { + // If not equals, old_fd will be closed after execve with CLOEXEC flag, + // which is also safe. eprintln!("failed to call dup2: {}", e); - return Err(e); + return Err(e.into()); + } else { + // dup2 succeeds, do nothing } } Ok(())