Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

open pidfd in child process and send to the parent via SOCK_SEQPACKET+CMSG #113939

Merged
merged 1 commit into from
Aug 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
221 changes: 131 additions & 90 deletions library/std/src/sys/unix/process/process_unix.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,6 @@ use core::ffi::NonZero_c_int;
#[cfg(target_os = "linux")]
use crate::os::linux::process::PidFd;

#[cfg(target_os = "linux")]
use crate::sys::weak::raw_syscall;

#[cfg(any(
target_os = "macos",
target_os = "watchos",
Expand Down Expand Up @@ -91,6 +88,11 @@ impl Command {
if let Some(ret) = self.posix_spawn(&theirs, envp.as_ref())? {
return Ok((ret, ours));
}

#[cfg(target_os = "linux")]
let (input, output) = sys::net::Socket::new_pair(libc::AF_UNIX, libc::SOCK_SEQPACKET)?;

#[cfg(not(target_os = "linux"))]
let (input, output) = sys::pipe::anon_pipe()?;

// Whatever happens after the fork is almost for sure going to touch or
Expand All @@ -104,12 +106,16 @@ impl Command {
// The child calls `mem::forget` to leak the lock, which is crucial because
// releasing a lock is not async-signal-safe.
let env_lock = sys::os::env_read_lock();
let (pid, pidfd) = unsafe { self.do_fork()? };
let pid = unsafe { self.do_fork()? };

if pid == 0 {
crate::panic::always_abort();
mem::forget(env_lock); // avoid non-async-signal-safe unlocking
drop(input);
#[cfg(target_os = "linux")]
if self.get_create_pidfd() {
self.send_pidfd(&output);
}
let Err(err) = unsafe { self.do_exec(theirs, envp.as_ref()) };
let errno = err.raw_os_error().unwrap_or(libc::EINVAL) as u32;
let errno = errno.to_be_bytes();
Expand All @@ -133,6 +139,12 @@ impl Command {
drop(env_lock);
drop(output);

#[cfg(target_os = "linux")]
let pidfd = if self.get_create_pidfd() { self.recv_pidfd(&input) } else { -1 };

#[cfg(not(target_os = "linux"))]
let pidfd = -1;

// Safety: On Linux the pidfd was received from the child over the
// `SOCK_SEQPACKET` socket via `recv_pidfd` (otherwise it is -1),
// so it's valid and otherwise unowned.
let mut p = unsafe { Process::new(pid, pidfd) };
Expand Down Expand Up @@ -160,6 +172,7 @@ impl Command {
}
Ok(..) => {
// pipe I/O up to PIPE_BUF bytes should be atomic
// similarly SOCK_SEQPACKET messages should arrive whole
assert!(p.wait().is_ok(), "wait() should either return Ok or panic");
panic!("short read on the CLOEXEC pipe")
}
Expand All @@ -185,28 +198,27 @@ impl Command {
);

#[cfg(any(target_os = "tvos", target_os = "watchos"))]
unsafe fn do_fork(&mut self) -> Result<(pid_t, pid_t), io::Error> {
unsafe fn do_fork(&mut self) -> Result<pid_t, io::Error> {
return Err(Self::ERR_APPLE_TV_WATCH_NO_FORK_EXEC);
}

// Attempts to fork the process. If successful, returns Ok(0)
// in the child, and Ok(child_pid) in the parent.
#[cfg(not(any(
target_os = "linux",
target_os = "watchos",
target_os = "tvos",
all(target_os = "nto", target_env = "nto71"),
)))]
unsafe fn do_fork(&mut self) -> Result<(pid_t, pid_t), io::Error> {
cvt(libc::fork()).map(|res| (res, -1))
unsafe fn do_fork(&mut self) -> Result<pid_t, io::Error> {
cvt(libc::fork())
}

// On QNX Neutrino, fork can fail with EBADF in case "another thread might have opened
// or closed a file descriptor while the fork() was occurring".
// Documentation says "... or try calling fork() again". This is what we do here.
// See also https://www.qnx.com/developers/docs/7.1/#com.qnx.doc.neutrino.lib_ref/topic/f/fork.html
#[cfg(all(target_os = "nto", target_env = "nto71"))]
unsafe fn do_fork(&mut self) -> Result<(pid_t, pid_t), io::Error> {
unsafe fn do_fork(&mut self) -> Result<pid_t, io::Error> {
use crate::sys::os::errno;

let mut delay = MIN_FORKSPAWN_SLEEP;
Expand All @@ -229,91 +241,11 @@ impl Command {
delay *= 2;
continue;
} else {
return cvt(r).map(|res| (res, -1));
return cvt(r);
}
}
}

// Attempts to fork the process. If successful, returns Ok((0, -1))
// in the child, and Ok((child_pid, child_pidfd)) in the parent.
//
// `clone3` is only attempted when the command asked for a pidfd
// (`get_create_pidfd()`) and an earlier call has not already reported
// `ENOSYS`; in every other case this is a plain `fork()` and the
// pidfd slot stays at -1.
#[cfg(target_os = "linux")]
unsafe fn do_fork(&mut self) -> Result<(pid_t, pid_t), io::Error> {
use crate::sync::atomic::{AtomicBool, Ordering};

// Process-wide flag, cleared once `clone3` has failed with `ENOSYS`
// so later spawns skip the attempt.
static HAS_CLONE3: AtomicBool = AtomicBool::new(true);
const CLONE_PIDFD: u64 = 0x00001000;

// Argument block handed to the `clone3` syscall; declared locally
// with a C layout and all-`u64` fields.
#[repr(C)]
struct clone_args {
flags: u64,
pidfd: u64,
child_tid: u64,
parent_tid: u64,
exit_signal: u64,
stack: u64,
stack_size: u64,
tls: u64,
set_tid: u64,
set_tid_size: u64,
cgroup: u64,
}

// Declares `clone3` through the `raw_syscall!` helper instead of a libc wrapper.
raw_syscall! {
fn clone3(cl_args: *mut clone_args, len: libc::size_t) -> libc::c_long
}

// Bypassing libc for `clone3` can make further libc calls unsafe,
// so we use it sparingly for now. See #89522 for details.
// Some tools (e.g. sandboxing tools) may also expect `fork`
// rather than `clone3`.
let want_clone3_pidfd = self.get_create_pidfd();

// If we fail to create a pidfd for any reason, this will
// stay as -1, which indicates an error.
let mut pidfd: pid_t = -1;

// Attempt to use the `clone3` syscall, which supports more arguments
// (in particular, the ability to create a pidfd). If this fails,
// we will fall through this block to a call to `fork()`
if want_clone3_pidfd && HAS_CLONE3.load(Ordering::Relaxed) {
let mut args = clone_args {
flags: CLONE_PIDFD,
// Address of the local `pidfd`, passed as an integer; presumably the
// kernel stores the new pidfd there (see clone3(2)) — confirm.
pidfd: &mut pidfd as *mut pid_t as u64,
child_tid: 0,
parent_tid: 0,
exit_signal: libc::SIGCHLD as u64,
stack: 0,
stack_size: 0,
tls: 0,
set_tid: 0,
set_tid_size: 0,
cgroup: 0,
};

let args_ptr = &mut args as *mut clone_args;
let args_size = crate::mem::size_of::<clone_args>();

let res = cvt(clone3(args_ptr, args_size));
match res {
Ok(n) => return Ok((n as pid_t, pidfd)),
Err(e) => match e.raw_os_error() {
// Multiple threads can race to execute this store,
// but that's fine - that just means that multiple threads
// will have tried and failed to execute the same syscall,
// with no other side effects.
Some(libc::ENOSYS) => HAS_CLONE3.store(false, Ordering::Relaxed),
// Fallback to fork if `EPERM` is returned. (e.g. blocked by seccomp)
Some(libc::EPERM) => {}
// Any other error is reported to the caller without trying `fork`.
_ => return Err(e),
},
}
}

// Generally, we just call `fork`. If we get here after wanting `clone3`,
// then the syscall does not exist or we do not have permission to call it.
cvt(libc::fork()).map(|res| (res, pidfd))
}

pub fn exec(&mut self, default: Stdio) -> io::Error {
let envp = self.capture_env();

Expand Down Expand Up @@ -722,6 +654,115 @@ impl Command {
Ok(Some(p))
}
}

/// Runs in the forked child: opens a pidfd referring to the child itself and
/// passes it to the parent over `sock` as `SCM_RIGHTS` ancillary data.
///
/// A 0-length message is always sent, even when `pidfd_open` fails, so the
/// parent observes a consistent sequence of `SOCK_SEQPACKET` messages. The
/// control buffer is attached to the message only when there is a pidfd to
/// transfer.
///
/// Aborts the process (`rtabort!`) if the message cannot be sent.
#[cfg(target_os = "linux")]
fn send_pidfd(&self, sock: &crate::sys::net::Socket) {
    use crate::io::IoSlice;
    use crate::os::fd::RawFd;
    use crate::sys::cvt_r;
    use libc::{CMSG_DATA, CMSG_FIRSTHDR, CMSG_LEN, CMSG_SPACE, SCM_RIGHTS, SOL_SOCKET};

    unsafe {
        let child_pid = libc::getpid();
        // pidfd_open sets CLOEXEC by default
        let pidfd = libc::syscall(libc::SYS_pidfd_open, child_pid, 0);

        let fds: [c_int; 1] = [pidfd as RawFd];

        const SCM_MSG_LEN: usize = mem::size_of::<[c_int; 1]>();

        // Byte buffer sized for one control message carrying a single fd,
        // aligned like a `cmsghdr` by way of the union.
        #[repr(C)]
        union Cmsg {
            buf: [u8; unsafe { CMSG_SPACE(SCM_MSG_LEN as u32) as usize }],
            _align: libc::cmsghdr,
        }

        let mut cmsg: Cmsg = mem::zeroed();

        // 0-length message to send through the socket so we can pass along the fd
        let mut iov = [IoSlice::new(b"")];
        let mut msg: libc::msghdr = mem::zeroed();

        msg.msg_iov = &mut iov as *mut _ as *mut _;
        msg.msg_iovlen = 1;

        // only attach cmsg if we successfully acquired the pidfd
        if pidfd >= 0 {
            // The control buffer is wired up here rather than unconditionally:
            // a zero-initialized header has `cmsg_len == 0`, which is not a
            // valid control message and may be rejected by the kernel with
            // EINVAL, turning a missing pidfd into an abort of the child.
            msg.msg_controllen = mem::size_of_val(&cmsg.buf) as _;
            msg.msg_control = &mut cmsg.buf as *mut _ as *mut _;

            let hdr = CMSG_FIRSTHDR(&mut msg as *mut _ as *mut _);
            (*hdr).cmsg_level = SOL_SOCKET;
            (*hdr).cmsg_type = SCM_RIGHTS;
            (*hdr).cmsg_len = CMSG_LEN(SCM_MSG_LEN as _) as _;
            let data = CMSG_DATA(hdr);
            crate::ptr::copy_nonoverlapping(
                fds.as_ptr().cast::<u8>(),
                data as *mut _,
                SCM_MSG_LEN,
            );
        }

        // we send the 0-length message even if we failed to acquire the pidfd
        // so we get a consistent SEQPACKET order
        match cvt_r(|| libc::sendmsg(sock.as_raw(), &msg, 0)) {
            Ok(0) => {}
            _ => rtabort!("failed to communicate with parent process"),
        }
    }
}

/// Runs in the parent: receives the pidfd that the child passed with
/// `send_pidfd` as `SCM_RIGHTS` ancillary data on `sock`.
///
/// Returns the received file descriptor, or -1 when `recvmsg` fails or the
/// message does not carry exactly one `SOL_SOCKET`/`SCM_RIGHTS` control
/// message of the expected length (e.g. the child could not open a pidfd).
#[cfg(target_os = "linux")]
fn recv_pidfd(&self, sock: &crate::sys::net::Socket) -> pid_t {
    use crate::io::IoSliceMut;
    use crate::sys::cvt_r;

    use libc::{CMSG_DATA, CMSG_FIRSTHDR, CMSG_LEN, CMSG_SPACE, SCM_RIGHTS, SOL_SOCKET};

    unsafe {
        const SCM_MSG_LEN: usize = mem::size_of::<[c_int; 1]>();

        // Receive buffer sized for one control message carrying a single fd,
        // aligned like a `cmsghdr` by way of the union.
        #[repr(C)]
        union Cmsg {
            _buf: [u8; unsafe { CMSG_SPACE(SCM_MSG_LEN as u32) as usize }],
            _align: libc::cmsghdr,
        }
        let mut cmsg: Cmsg = mem::zeroed();
        // 0-length read to get the fd
        let mut iov = [IoSliceMut::new(&mut [])];

        let mut msg: libc::msghdr = mem::zeroed();

        msg.msg_iov = &mut iov as *mut _ as *mut _;
        msg.msg_iovlen = 1;
        msg.msg_controllen = mem::size_of::<Cmsg>() as _;
        msg.msg_control = &mut cmsg as *mut _ as *mut _;

        // No timeout is needed: the parent has already dropped its copy of the
        // child's end of the socket, so only the child keeps the remote side
        // open. If the child is killed before sending, the socket disconnects,
        // which leads to a 0-length read here and a return from this function.
        //
        // MSG_CMSG_CLOEXEC makes the received descriptor close-on-exec from the
        // moment it is installed, so it cannot leak into other children spawned
        // by this process.
        match cvt_r(|| libc::recvmsg(sock.as_raw(), &mut msg, libc::MSG_CMSG_CLOEXEC)) {
            Err(_) => return -1,
            Ok(_) => {}
        }

        // Anything other than the single expected SCM_RIGHTS message means
        // no pidfd was delivered.
        let hdr = CMSG_FIRSTHDR(&mut msg as *mut _ as *mut _);
        if hdr.is_null()
            || (*hdr).cmsg_level != SOL_SOCKET
            || (*hdr).cmsg_type != SCM_RIGHTS
            || (*hdr).cmsg_len != CMSG_LEN(SCM_MSG_LEN as _) as _
        {
            return -1;
        }
        let data = CMSG_DATA(hdr);

        let mut fds = [-1 as c_int];

        crate::ptr::copy_nonoverlapping(
            data as *const _,
            fds.as_mut_ptr().cast::<u8>(),
            SCM_MSG_LEN,
        );

        fds[0]
    }
}
}

////////////////////////////////////////////////////////////////////////////////
Expand Down
25 changes: 25 additions & 0 deletions library/std/src/sys/unix/process/process_unix/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,28 @@ fn test_command_fork_no_unwind() {
|| signal == libc::SIGSEGV
);
}

/// Spawning with `create_pidfd(true)` must always succeed; the resulting
/// pidfd is only asserted on when the running kernel supports `pidfd_open`.
#[test]
#[cfg(target_os = "linux")]
fn test_command_pidfd() {
    use crate::os::fd::RawFd;
    use crate::os::linux::process::{ChildExt, CommandExt};
    use crate::process::Command;

    // Probe for kernel support by opening a pidfd for the current process.
    let self_pid = crate::process::id();
    let probe = unsafe { libc::syscall(libc::SYS_pidfd_open, self_pid, 0) };
    let kernel_has_pidfd_open = probe >= 0;
    if kernel_has_pidfd_open {
        // The probe descriptor is not needed beyond the check itself.
        unsafe { libc::close(probe as RawFd) };
    }

    // The spawn is attempted unconditionally so the creation path is exercised.
    let mut cmd = Command::new("echo");
    cmd.create_pidfd(true);
    let child = cmd.spawn().unwrap();

    // The result is only checked when the kernel is known to support pidfds.
    if kernel_has_pidfd_open {
        assert!(child.pidfd().is_ok());
    }
}
Loading