Use clone instead of fork on pvf (#2477)

@mrcnski Done the change on the prepare worker, once the prepare worker part is good I'll do the same for the execute worker. This is based on https://github.com/koute/polkavm/blob/11beebd06276ce9b84f335350138479e714f6caf/crates/polkavm/src/sandbox/linux.rs#L711. ## TODO - [x] Add a check for this capability at startup - [x] Add prdoc mentioning the new Secure Validator Mode (optional) requirement. ## Related Closes #2162 --------- Co-authored-by: Marcin S <marcin@realemail.net>
paritytech · Jan 21, 2024 · 21ef949 · 21ef949
1 parent caa987d
commit 21ef949
Show file tree

Hide file tree

Showing 20 changed files with 860 additions and 394 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/polkadot/node/core/pvf/common/Cargo.toml b/polkadot/node/core/pvf/common/Cargo.toml
@@ -33,6 +33,7 @@ sp-tracing = { path = "../../../../../substrate/primitives/tracing" }
 
 [target.'cfg(target_os = "linux")'.dependencies]
 landlock = "0.3.0"
+nix = { version = "0.27.1", features = ["sched"] }
 seccompiler = "0.4.0"
 
 [dev-dependencies]

diff --git a/polkadot/node/core/pvf/common/src/execute.rs b/polkadot/node/core/pvf/common/src/execute.rs
@@ -92,10 +92,11 @@ pub enum JobError {
 	TimedOut,
 	#[error("An unexpected panic has occurred in the execution job: {0}")]
 	Panic(String),
+	/// Some error occurred when interfacing with the kernel.
+	#[error("Error interfacing with the kernel: {0}")]
+	Kernel(String),
 	#[error("Could not spawn the requested thread: {0}")]
 	CouldNotSpawnThread(String),
 	#[error("An error occurred in the CPU time monitor thread: {0}")]
 	CpuTimeMonitorThread(String),
-	#[error("Could not set pdeathsig: {0}")]
-	CouldNotSetPdeathsig(String),
 }
diff --git a/polkadot/node/core/pvf/common/src/executor_interface.rs b/polkadot/node/core/pvf/common/src/executor_interface.rs
@@ -140,15 +140,18 @@ pub unsafe fn create_runtime_from_artifact_bytes(
 	executor_params: &ExecutorParams,
 ) -> Result<WasmtimeRuntime, WasmError> {
 	let mut config = DEFAULT_CONFIG.clone();
-	config.semantics = params_to_wasmtime_semantics(executor_params);
+	config.semantics = params_to_wasmtime_semantics(executor_params).0;
 
 	sc_executor_wasmtime::create_runtime_from_artifact_bytes::<HostFunctions>(
 		compiled_artifact_blob,
 		config,
 	)
 }
 
-pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> Semantics {
+/// Takes the default config and overwrites any settings with existing executor parameters.
+///
+/// Returns the semantics as well as the stack limit (since we are guaranteed to have it).
+pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> (Semantics, DeterministicStackLimit) {
 	let mut sem = DEFAULT_CONFIG.semantics.clone();
 	let mut stack_limit = sem
 		.deterministic_stack_limit
@@ -169,8 +172,8 @@ pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> Semantics {
 			ExecutorParam::PvfExecTimeout(_, _) => (), /* Not used here */
 		}
 	}
-	sem.deterministic_stack_limit = Some(stack_limit);
-	sem
+	sem.deterministic_stack_limit = Some(stack_limit.clone());
+	(sem, stack_limit)
 }
 
 /// Runs the prevalidation on the given code. Returns a [`RuntimeBlob`] if it succeeds.
@@ -187,7 +190,7 @@ pub fn prepare(
 	blob: RuntimeBlob,
 	executor_params: &ExecutorParams,
 ) -> Result<Vec<u8>, sc_executor_common::error::WasmError> {
-	let semantics = params_to_wasmtime_semantics(executor_params);
+	let (semantics, _) = params_to_wasmtime_semantics(executor_params);
 	sc_executor_wasmtime::prepare_runtime_artifact(blob, &semantics)
 }
 

diff --git a/polkadot/node/core/pvf/common/src/lib.rs b/polkadot/node/core/pvf/common/src/lib.rs
@@ -57,6 +57,8 @@ pub struct SecurityStatus {
 	pub can_enable_seccomp: bool,
 	/// Whether we are able to unshare the user namespace and change the filesystem root.
 	pub can_unshare_user_namespace_and_change_root: bool,
+	/// Whether we are able to call `clone` with all sandboxing flags.
+	pub can_do_secure_clone: bool,
 }
 
 /// A handshake with information for the worker.

diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs
@@ -18,14 +18,19 @@
 
 pub mod security;
 
-use crate::{framed_recv_blocking, WorkerHandshake, LOG_TARGET};
+use crate::{framed_recv_blocking, SecurityStatus, WorkerHandshake, LOG_TARGET};
 use cpu_time::ProcessTime;
 use futures::never::Never;
 use parity_scale_codec::Decode;
 use std::{
 	any::Any,
-	fmt, io,
-	os::unix::net::UnixStream,
+	fmt::{self},
+	fs::File,
+	io::{self, Read, Write},
+	os::{
+		fd::{AsRawFd, FromRawFd, RawFd},
+		unix::net::UnixStream,
+	},
 	path::PathBuf,
 	sync::mpsc::{Receiver, RecvTimeoutError},
 	time::Duration,
@@ -78,7 +83,7 @@ macro_rules! decl_worker_main {
 
 				"--check-can-enable-landlock" => {
 					#[cfg(target_os = "linux")]
-					let status = if let Err(err) = security::landlock::check_is_fully_enabled() {
+					let status = if let Err(err) = security::landlock::check_can_fully_enable() {
 						// Write the error to stderr, log it on the host-side.
 						eprintln!("{}", err);
 						-1
@@ -91,7 +96,7 @@ macro_rules! decl_worker_main {
 				},
 				"--check-can-enable-seccomp" => {
 					#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
-					let status = if let Err(err) = security::seccomp::check_is_fully_enabled() {
+					let status = if let Err(err) = security::seccomp::check_can_fully_enable() {
 						// Write the error to stderr, log it on the host-side.
 						eprintln!("{}", err);
 						-1
@@ -107,7 +112,7 @@ macro_rules! decl_worker_main {
 					let cache_path_tempdir = std::path::Path::new(&args[2]);
 					#[cfg(target_os = "linux")]
 					let status = if let Err(err) =
-						security::change_root::check_is_fully_enabled(&cache_path_tempdir)
+						security::change_root::check_can_fully_enable(&cache_path_tempdir)
 					{
 						// Write the error to stderr, log it on the host-side.
 						eprintln!("{}", err);
@@ -119,6 +124,21 @@ macro_rules! decl_worker_main {
 					let status = -1;
 					std::process::exit(status)
 				},
+				"--check-can-do-secure-clone" => {
+					#[cfg(target_os = "linux")]
+					// SAFETY: new process is spawned within a single threaded process. This
+					// invariant is enforced by tests.
+					let status = if let Err(err) = unsafe { security::clone::check_can_fully_clone() } {
+						// Write the error to stderr, log it on the host-side.
+						eprintln!("{}", err);
+						-1
+					} else {
+						0
+					};
+					#[cfg(not(target_os = "linux"))]
+					let status = -1;
+					std::process::exit(status)
+				},
 
 				"test-sleep" => {
 					std::thread::sleep(std::time::Duration::from_secs(5));
@@ -171,6 +191,84 @@ macro_rules! decl_worker_main {
 	};
 }
 
+//taken from the os_pipe crate. Copied here to reduce one dependency and
+// because its type-safe abstractions do not play well with nix's clone
+#[cfg(not(target_os = "macos"))]
+pub fn pipe2_cloexec() -> io::Result<(libc::c_int, libc::c_int)> {
+	let mut fds: [libc::c_int; 2] = [0; 2];
+	let res = unsafe { libc::pipe2(fds.as_mut_ptr(), libc::O_CLOEXEC) };
+	if res != 0 {
+		return Err(io::Error::last_os_error())
+	}
+	Ok((fds[0], fds[1]))
+}
+
+#[cfg(target_os = "macos")]
+pub fn pipe2_cloexec() -> io::Result<(libc::c_int, libc::c_int)> {
+	let mut fds: [libc::c_int; 2] = [0; 2];
+	let res = unsafe { libc::pipe(fds.as_mut_ptr()) };
+	if res != 0 {
+		return Err(io::Error::last_os_error())
+	}
+	let res = unsafe { libc::fcntl(fds[0], libc::F_SETFD, libc::FD_CLOEXEC) };
+	if res != 0 {
+		return Err(io::Error::last_os_error())
+	}
+	let res = unsafe { libc::fcntl(fds[1], libc::F_SETFD, libc::FD_CLOEXEC) };
+	if res != 0 {
+		return Err(io::Error::last_os_error())
+	}
+	Ok((fds[0], fds[1]))
+}
+
+/// A wrapper around a file descriptor used to encapsulate and restrict
+/// functionality for pipe operations.
+pub struct PipeFd {
+	file: File,
+}
+
+impl AsRawFd for PipeFd {
+	/// Returns the raw file descriptor associated with this `PipeFd`
+	fn as_raw_fd(&self) -> RawFd {
+		self.file.as_raw_fd()
+	}
+}
+
+impl FromRawFd for PipeFd {
+	/// Creates a new `PipeFd` instance from a raw file descriptor.
+	///
+	/// # Safety
+	///
+	/// The fd passed in must be an owned file descriptor; in particular, it must be open.
+	unsafe fn from_raw_fd(fd: RawFd) -> Self {
+		PipeFd { file: File::from_raw_fd(fd) }
+	}
+}
+
+impl Read for PipeFd {
+	fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+		self.file.read(buf)
+	}
+
+	fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
+		self.file.read_to_end(buf)
+	}
+}
+
+impl Write for PipeFd {
+	fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
+		self.file.write(buf)
+	}
+
+	fn flush(&mut self) -> io::Result<()> {
+		self.file.flush()
+	}
+
+	fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
+		self.file.write_all(buf)
+	}
+}
+
 /// Some allowed overhead that we account for in the "CPU time monitor" thread's sleeps, on the
 /// child process.
 pub const JOB_TIMEOUT_OVERHEAD: Duration = Duration::from_millis(50);
@@ -192,14 +290,12 @@ impl fmt::Display for WorkerKind {
 	}
 }
 
-// Some fields are only used for logging, and dead-code analysis ignores Debug.
-#[allow(dead_code)]
 #[derive(Debug)]
 pub struct WorkerInfo {
-	pid: u32,
-	kind: WorkerKind,
-	version: Option<String>,
-	worker_dir_path: PathBuf,
+	pub pid: u32,
+	pub kind: WorkerKind,
+	pub version: Option<String>,
+	pub worker_dir_path: PathBuf,
 }
 
 // NOTE: The worker version must be passed in so that we accurately get the version of the worker,
@@ -218,7 +314,7 @@ pub fn run_worker<F>(
 	worker_version: Option<&str>,
 	mut event_loop: F,
 ) where
-	F: FnMut(UnixStream, PathBuf) -> io::Result<Never>,
+	F: FnMut(UnixStream, &WorkerInfo, SecurityStatus) -> io::Result<Never>,
 {
 	#[cfg_attr(not(target_os = "linux"), allow(unused_mut))]
 	let mut worker_info = WorkerInfo {
@@ -250,11 +346,8 @@ pub fn run_worker<F>(
 	}
 
 	// Make sure that we can read the worker dir path, and log its contents.
-	let entries = || -> Result<Vec<_>, io::Error> {
-		std::fs::read_dir(&worker_info.worker_dir_path)?
-			.map(|res| res.map(|e| e.file_name()))
-			.collect()
-	}();
+	let entries: io::Result<Vec<_>> = std::fs::read_dir(&worker_info.worker_dir_path)
+		.and_then(|d| d.map(|res| res.map(|e| e.file_name())).collect());
 	match entries {
 		Ok(entries) =>
 			gum::trace!(target: LOG_TARGET, ?worker_info, "content of worker dir: {:?}", entries),
@@ -284,6 +377,22 @@ pub fn run_worker<F>(
 	{
 		gum::trace!(target: LOG_TARGET, ?security_status, "Enabling security features");
 
+		// First, make sure env vars were cleared, to match the environment we perform the checks
+		// within. (In theory, running checks with different env vars could result in different
+		// outcomes of the checks.)
+		if !security::check_env_vars_were_cleared(&worker_info) {
+			let err = "not all env vars were cleared when spawning the process";
+			gum::error!(
+				target: LOG_TARGET,
+				?worker_info,
+				"{}",
+				err
+			);
+			if security_status.secure_validator_mode {
+				worker_shutdown(worker_info, err);
+			}
+		}
+
 		// Call based on whether we can change root. Error out if it should work but fails.
 		//
 		// NOTE: This should not be called in a multi-threaded context (i.e. inside the tokio
@@ -337,23 +446,10 @@ pub fn run_worker<F>(
 				}
 			}
 		}
-
-		if !security::check_env_vars_were_cleared(&worker_info) {
-			let err = "not all env vars were cleared when spawning the process";
-			gum::error!(
-				target: LOG_TARGET,
-				?worker_info,
-				"{}",
-				err
-			);
-			if security_status.secure_validator_mode {
-				worker_shutdown(worker_info, err);
-			}
-		}
 	}
 
 	// Run the main worker loop.
-	let err = event_loop(stream, worker_info.worker_dir_path.clone())
+	let err = event_loop(stream, &worker_info, security_status)
 		// It's never `Ok` because it's `Ok(Never)`.
 		.unwrap_err();
 

diff --git a/polkadot/node/core/pvf/common/src/worker/security/change_root.rs b/polkadot/node/core/pvf/common/src/worker/security/change_root.rs
@@ -54,8 +54,7 @@ pub fn enable_for_worker(worker_info: &WorkerInfo) -> Result<()> {
 ///
 /// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
 ///       "CLONE_NEWUSER requires that the calling process is not threaded."
-#[cfg(target_os = "linux")]
-pub fn check_is_fully_enabled(tempdir: &Path) -> Result<()> {
+pub fn check_can_fully_enable(tempdir: &Path) -> Result<()> {
 	let worker_dir_path = tempdir.to_owned();
 	try_restrict(&WorkerInfo {
 		pid: std::process::id(),
@@ -69,7 +68,6 @@ pub fn check_is_fully_enabled(tempdir: &Path) -> Result<()> {
 ///
 /// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
 ///       "CLONE_NEWUSER requires that the calling process is not threaded."
-#[cfg(target_os = "linux")]
 fn try_restrict(worker_info: &WorkerInfo) -> Result<()> {
 	// TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723
 	macro_rules! cstr_ptr {
@@ -78,12 +76,6 @@ fn try_restrict(worker_info: &WorkerInfo) -> Result<()> {
 		};
 	}
 
-	gum::trace!(
-		target: LOG_TARGET,
-		?worker_info,
-		"unsharing the user namespace and calling pivot_root",
-	);
-
 	let worker_dir_path_c = CString::new(worker_info.worker_dir_path.as_os_str().as_bytes())
 		.expect("on unix; the path will never contain 0 bytes; qed");