From 411eda0602fdfef64fb5cf2cec73d769b6414961 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 30 Aug 2023 14:48:41 +0000 Subject: [PATCH 1/2] userfaultfd-sys: expose USERFAULTFD_IOC ioctl number Expose the USERFAULTFD_IOC ioctl number for the `/dev/userfaultfd` device. This device is only present on kernels >= 6.1, but we expose it unconditionally so that userfaultfd-sys has the same exports for all kernels. Signed-off-by: Babis Chalios --- userfaultfd-sys/src/consts.c | 4 ++++ userfaultfd-sys/wrapper.h | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/userfaultfd-sys/src/consts.c b/userfaultfd-sys/src/consts.c index e19ca74..df78dbb 100644 --- a/userfaultfd-sys/src/consts.c +++ b/userfaultfd-sys/src/consts.c @@ -69,3 +69,7 @@ const __u32 _const_UFFDIO_ZEROPAGE = UFFDIO_ZEROPAGE; #ifdef UFFDIO_WRITEPROTECT const __u32 _const_UFFDIO_WRITEPROTECT = UFFDIO_WRITEPROTECT; #endif + +#ifdef USERFAULTFD_IOC +const __u32 _const_USERFAULTFD_IOC = USERFAULTFD_IOC; +#endif diff --git a/userfaultfd-sys/wrapper.h b/userfaultfd-sys/wrapper.h index 42ab19c..7cffde4 100644 --- a/userfaultfd-sys/wrapper.h +++ b/userfaultfd-sys/wrapper.h @@ -4,3 +4,9 @@ // userfaultfd-sys has the same exports on all kernels #define UFFD_USER_MODE_ONLY 1 #endif + + +#ifndef USERFAULTFD_IOC +// Similarly, the ioctl() for `/dev/userfaultfd` is introduced with Linux 6.1. +#define USERFAULTFD_IOC 0xAA +#endif From 89ff11242c7c48653b707ba241b7b47b02080d5c Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 30 Aug 2023 14:51:55 +0000 Subject: [PATCH 2/2] use /dev/userfaultfd when present Linux kernel 6.1 introduces a /dev/userfaultfd pseudo-device which allows creating UFFD objects by issuing the USERFAULTFD_IOC_NEW ioctl() to the device [1]. This way, a process does not need to have the CAP_SYS_PTRACE capability for creating UFFD objects that can handle kernel-triggered page faults. UFFDs created through this interface can always handle kernel-triggered page faults. 
Access to the device is granted through normal filesystem permissions to the device file. This commit changes the way we create Uffd objects, to first try to create the file descriptor through /dev/userfaultfd. If the file exists we will try to create the descriptor using an ioctl on this device. If the device does not exist, we will fall back to the syscall. Signed-off-by: Babis Chalios --- CHANGELOG.md | 12 ++++++++++ src/builder.rs | 60 +++++++++++++++++++++++++++++++++++++++----------- src/error.rs | 6 +++++ src/raw.rs | 15 +++++++++++++ 4 files changed, 80 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cec9502..b0797ad 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,18 @@ - Added `Uffd::read_events` that can read multiple events from the userfaultfd file descriptor. - Updated `bitflags` dependency to `2.2.1`. +- Use `/dev/userfaultfd` as the default API for creating userfaultfd file descriptors. + + Since Linux 5.11 a process can select if it wants to handle page faults triggered in kernel space + or not. Under this mechanism, processes that wish to handle those need to have the `CAP_SYS_PTRACE` + capability. `CAP_SYS_PTRACE` allows a process to do much more than create userfault fds, so with + 6.1 Linux introduces `/dev/userfaultfd`, a special character device that allows creating + userfault file descriptors using the `USERFAULTFD_IOC_NEW` `ioctl`. Access to this device is + granted via file system permissions and does not require `CAP_SYS_PTRACE` to handle kernel + triggered page faults. + + We now default to using `/dev/userfaultfd` for creating the descriptors and only if that file is + not present, we fall back to using the syscall. 
### 0.3.1 (2021-02-17) diff --git a/src/builder.rs b/src/builder.rs index 2a46c3c..a200148 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -3,6 +3,11 @@ use crate::raw; use crate::{IoctlFlags, Uffd}; use bitflags::bitflags; use nix::errno::Errno; +use std::fs::{File, OpenOptions}; +use std::io::ErrorKind; +use std::os::fd::AsRawFd; + +const UFFD_DEVICE_PATH: &str = "/dev/userfaultfd"; cfg_if::cfg_if! { if #[cfg(any(feature = "linux5_7", feature = "linux4_14"))] { @@ -115,6 +120,47 @@ impl UffdBuilder { self } + fn uffd_from_dev(&self, file: &mut File, flags: i32) -> Result { + match unsafe { raw::new_uffd(file.as_raw_fd(), flags) } { + Err(err) => Err(err.into()), + Ok(fd) => Ok(Uffd { fd }), + } + } + + fn uffd_from_syscall(&self, flags: i32) -> Result { + let fd = match Errno::result(unsafe { raw::userfaultfd(flags) }) { + Ok(fd) => fd, + // setting the USER_MODE_ONLY flag on kernel pre-5.11 causes it to return EINVAL. + // If the user asks for the flag, we first try with it set, and if kernel gives + // EINVAL we try again without the flag set. + Err(Errno::EINVAL) if self.user_mode_only => Errno::result(unsafe { + raw::userfaultfd(flags & !raw::UFFD_USER_MODE_ONLY as i32) + })?, + Err(e) => return Err(e.into()), + }; + + // Wrap the fd up so that a failure in this function body closes it with the drop. + Ok(Uffd { fd }) + } + + // Try to get a UFFD file descriptor using `/dev/userfaultfd`. If that fails + // fall back to calling the system call. + fn open_file_descriptor(&self, flags: i32) -> Result { + // If `/dev/userfaultfd` exists we'll try to get the file descriptor from it. If the file + // doesn't exist we will fall back to calling the system call. This means, that if the + // device exists but the calling process does not have access rights to it, this will fail, + // i.e. we will not fall back to calling the system call. 
+ match OpenOptions::new() + .read(true) + .write(true) + .open(UFFD_DEVICE_PATH) + { + Ok(mut file) => self.uffd_from_dev(&mut file, flags), + Err(err) if err.kind() == ErrorKind::NotFound => self.uffd_from_syscall(flags), + Err(err) => Err(Error::OpenDevUserfaultfd(err)), + } + } + /// Create a `Uffd` object with the current settings of this builder. pub fn create(&self) -> Result { // first do the syscall to get the file descriptor @@ -130,19 +176,7 @@ impl UffdBuilder { flags |= raw::UFFD_USER_MODE_ONLY as i32; } - let fd = match Errno::result(unsafe { raw::userfaultfd(flags) }) { - Ok(fd) => fd, - // setting the USER_MODE_ONLY flag on kernel pre-5.11 causes it to return EINVAL. - // If the user asks for the flag, we first try with it set, and if kernel gives - // EINVAL we try again without the flag set. - Err(Errno::EINVAL) if self.user_mode_only => Errno::result(unsafe { - raw::userfaultfd(flags & !raw::UFFD_USER_MODE_ONLY as i32) - })?, - Err(e) => return Err(e.into()), - }; - - // Wrap the fd up so that a failure in this function body closes it with the drop. - let uffd = Uffd { fd }; + let uffd = self.open_file_descriptor(flags)?; // then do the UFFDIO_API ioctl to set up and ensure features and other ioctls are available let mut api = raw::uffdio_api { diff --git a/src/error.rs b/src/error.rs index 5cd8926..f66806a 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,3 +1,5 @@ +use std::io; + use crate::IoctlFlags; use nix::errno::Errno; use thiserror::Error; @@ -47,6 +49,10 @@ pub enum Error { /// Zeropage ioctl failure with `errno` value. 
#[error("Zeropage failed: {0}")] ZeropageFailed(Errno), + + /// Could not open /dev/userfaultfd even though it exists + #[error("Error accessing /dev/userfaultfd: {0}")] + OpenDevUserfaultfd(io::Error), } impl From for Error { diff --git a/src/raw.rs b/src/raw.rs index 332c459..28039c7 100644 --- a/src/raw.rs +++ b/src/raw.rs @@ -23,3 +23,18 @@ nix::ioctl_readwrite!( _UFFDIO_WRITEPROTECT, uffdio_writeprotect ); + +// ioctls for /dev/userfaultfd + +// This is the `/dev/userfaultfd` ioctl() for creating a new userfault file descriptor. +// It is a "bad" ioctl in the sense that it is defined as an _IO: +// https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/userfaultfd.h#L17, +// aka `nix::ioctl_none`, however it does receive an integer argument: +// https://elixir.bootlin.com/linux/latest/source/fs/userfaultfd.c#L2186. That is the same argument +// that the userfaultfd() system call receives. +nix::ioctl_write_int_bad!( + /// Create a new userfault file descriptor from the `/dev/userfaultfd` + /// device. This receives the same arguments as the userfaultfd system call. + new_uffd, + nix::request_code_none!(USERFAULTFD_IOC, 0x00) +);