From 146e6e232e36279b3876ce9414856df6bf060349 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Fri, 26 May 2023 18:10:46 -0400 Subject: [PATCH 01/17] Begin adding landlock + test --- Cargo.lock | 13 +++ node/core/pvf/common/Cargo.toml | 6 ++ .../common/src/{worker.rs => worker/mod.rs} | 24 ++++-- node/core/pvf/common/src/worker/security.rs | 79 +++++++++++++++++++ 4 files changed, 116 insertions(+), 6 deletions(-) rename node/core/pvf/common/src/{worker.rs => worker/mod.rs} (94%) create mode 100644 node/core/pvf/common/src/worker/security.rs diff --git a/Cargo.lock b/Cargo.lock index 8c4df5edb58d..2ccbac182193 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4034,6 +4034,17 @@ dependencies = [ "kvdb", ] +[[package]] +name = "landlock" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "520baa32708c4e957d2fc3a186bc5bd8d26637c33137f399ddfc202adb240068" +dependencies = [ + "enumflags2", + "libc", + "thiserror", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -7517,8 +7528,10 @@ dependencies = [ name = "polkadot-node-core-pvf-common" version = "0.9.41" dependencies = [ + "assert_matches", "cpu-time", "futures", + "landlock", "libc", "parity-scale-codec", "polkadot-parachain", diff --git a/node/core/pvf/common/Cargo.toml b/node/core/pvf/common/Cargo.toml index de9fa10804c7..56695fbd4ea8 100644 --- a/node/core/pvf/common/Cargo.toml +++ b/node/core/pvf/common/Cargo.toml @@ -22,5 +22,11 @@ sc-executor-wasmtime = { git = "https://github.com/paritytech/substrate", branch sp-core = { git = "https://github.com/paritytech/substrate", branch = "master" } sp-tracing = { git = "https://github.com/paritytech/substrate", branch = "master" } +[target.'cfg(target_os = "linux")'.dependencies] +landlock = "0.2.0" + +[dev-dependencies] +assert_matches = "1.4.0" + [build-dependencies] substrate-build-script-utils = { git = "https://github.com/paritytech/substrate", branch = "master" } diff --git a/node/core/pvf/common/src/worker.rs b/node/core/pvf/common/src/worker/mod.rs similarity index 94% rename from node/core/pvf/common/src/worker.rs rename to node/core/pvf/common/src/worker/mod.rs index debe18985b37..e80aaee46f13 100644 --- a/node/core/pvf/common/src/worker.rs +++ b/node/core/pvf/common/src/worker/mod.rs @@ -16,6 +16,8 @@ //! Functionality common to both prepare and execute workers. +pub mod security; + use crate::LOG_TARGET; use cpu_time::ProcessTime; use futures::never::Never; @@ -203,7 +205,7 @@ pub mod thread { }; /// Contains the outcome of waiting on threads, or `Pending` if none are ready. - #[derive(Clone, Copy)] + #[derive(Debug, Clone, Copy)] pub enum WaitOutcome { Finished, TimedOut, @@ -224,8 +226,9 @@ pub mod thread { Arc::new((Mutex::new(WaitOutcome::Pending), Condvar::new())) } - /// Runs a thread, afterwards notifying the threads waiting on the condvar. Catches panics and - /// resumes them after triggering the condvar, so that the waiting thread is notified on panics. + /// Runs a worker thread. Will first enable security features, and afterwards notify the threads waiting on the + /// condvar. Catches panics during execution and resumes the panics after triggering the condvar, so that the + /// waiting thread is notified on panics. pub fn spawn_worker_thread( name: &str, f: F, @@ -237,9 +240,18 @@ pub mod thread { F: Send + 'static + panic::UnwindSafe, R: Send + 'static, { - thread::Builder::new() - .name(name.into()) - .spawn(move || cond_notify_on_done(f, cond, outcome)) + thread::Builder::new().name(name.into()).spawn(move || { + cond_notify_on_done( + || { + #[cfg(target_os = "linux")] + let _ = crate::worker::security::landlock::try_restrict_thread(); + + f() + }, + cond, + outcome, + ) + }) } /// Runs a worker thread with the given stack size. See [`spawn_worker_thread`]. diff --git a/node/core/pvf/common/src/worker/security.rs b/node/core/pvf/common/src/worker/security.rs new file mode 100644 index 000000000000..716aee7205c3 --- /dev/null +++ b/node/core/pvf/common/src/worker/security.rs @@ -0,0 +1,79 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! Functionality for securing workers. +//! +//! This is needed because workers are used to compile and execute untrusted code (PVFs). This means +//! it is possible for erroneous disputes and slashing to occur, whether intentional (as a result of +//! a malicious attacker) or not (a bug or operator error occurred). + +#[cfg(target_os = "linux")] +pub mod landlock { + use landlock::{ + path_beneath_rules, Access, AccessFs, Ruleset, RulesetAttr, RulesetCreatedAttr, + RulesetError, RulesetStatus, ABI, + }; + + /// Returns to what degree landlock is enabled on the current Linux environment. + /// + /// This is a separate check so it can run outside a worker thread, so the results can be sent via telemetry. + pub fn check_enabled() {} + + /// Tries to restrict the current thread with the following landlock access controls: + /// + /// 1. all global filesystem access + /// 2. ... more may be supported in the future. + pub fn try_restrict_thread() -> Result { + let abi = ABI::V2; + let status = Ruleset::new() + .handle_access(AccessFs::from_all(abi))? + .create()? + .restrict_self()?; + Ok(status.ruleset) + } + + #[cfg(test)] + mod tests { + use super::*; + use crate::worker::thread::{self, WaitOutcome}; + use assert_matches::assert_matches; + use std::sync::Arc; + + #[test] + fn restricted_thread_cannot_access_fs() { + // This would be nice: . + if !check_enabled() { + return + } + + let cond = thread::get_condvar(); + + // Use the same method we use to spawn workers in production. + let handle = thread::spawn_worker_thread( + "test_restricted_thread_cannot_access_fs", + || true, + Arc::clone(&cond), + WaitOutcome::Finished, + ) + .unwrap(); + + let outcome = thread::wait_for_threads(cond); + + assert_matches!(outcome, WaitOutcome::Finished); + assert_matches!(handle.join(), Ok(true)); + } + } +} From 950add020543dc95721b465ec6a216d4449b912b Mon Sep 17 00:00:00 2001 From: Marcin S Date: Sun, 28 May 2023 10:13:09 -0400 Subject: [PATCH 02/17] Move PVF implementer's guide section to own page, document security --- node/core/pvf/common/src/worker/security.rs | 4 +- roadmap/implementers-guide/src/SUMMARY.md | 1 + .../src/node/utility/candidate-validation.md | 80 +---------- .../src/node/utility/pvf-host-and-workers.md | 130 ++++++++++++++++++ 4 files changed, 134 insertions(+), 81 deletions(-) create mode 100644 roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md diff --git a/node/core/pvf/common/src/worker/security.rs b/node/core/pvf/common/src/worker/security.rs index 716aee7205c3..0fd35bbc0d0d 100644 --- a/node/core/pvf/common/src/worker/security.rs +++ b/node/core/pvf/common/src/worker/security.rs @@ -16,9 +16,7 @@ //! Functionality for securing workers. //! -//! This is needed because workers are used to compile and execute untrusted code (PVFs). This means -//! it is possible for erroneous disputes and slashing to occur, whether intentional (as a result of -//! a malicious attacker) or not (a bug or operator error occurred). +//! This is needed because workers are used to compile and execute untrusted code (PVFs). #[cfg(target_os = "linux")] pub mod landlock { diff --git a/roadmap/implementers-guide/src/SUMMARY.md b/roadmap/implementers-guide/src/SUMMARY.md index 41b52cf2299f..45d1ecb614c9 100644 --- a/roadmap/implementers-guide/src/SUMMARY.md +++ b/roadmap/implementers-guide/src/SUMMARY.md @@ -60,6 +60,7 @@ - [Utility Subsystems](node/utility/README.md) - [Availability Store](node/utility/availability-store.md) - [Candidate Validation](node/utility/candidate-validation.md) + - [PVF Host and Workers](node/utility/pvf-host-and-workers.md) - [Provisioner](node/utility/provisioner.md) - [Network Bridge](node/utility/network-bridge.md) - [Gossip Support](node/utility/gossip-support.md) diff --git a/roadmap/implementers-guide/src/node/utility/candidate-validation.md b/roadmap/implementers-guide/src/node/utility/candidate-validation.md index a238ff511bc5..4a1d02be5560 100644 --- a/roadmap/implementers-guide/src/node/utility/candidate-validation.md +++ b/roadmap/implementers-guide/src/node/utility/candidate-validation.md @@ -44,86 +44,10 @@ Once we have all parameters, we can spin up a background task to perform the val * The collator signature is valid * The PoV provided matches the `pov_hash` field of the descriptor +For more details please see [PVF Host and Workers](pvf-host-and-workers.md). + ### Checking Validation Outputs If we can assume the presence of the relay-chain state (that is, during processing [`CandidateValidationMessage`][CVM]`::ValidateFromChainState`) we can run all the checks that the relay-chain would run at the inclusion time thus confirming that the candidate will be accepted. -### PVF Host - -The PVF host is responsible for handling requests to prepare and execute PVF -code blobs. - -One high-level goal is to make PVF operations as deterministic as possible, to -reduce the rate of disputes. Disputes can happen due to e.g. a job timing out on -one machine, but not another. While we do not yet have full determinism, there -are some dispute reduction mechanisms in place right now. - -#### Retrying execution requests - -If the execution request fails during **preparation**, we will retry if it is -possible that the preparation error was transient (e.g. if the error was a panic -or time out). We will only retry preparation if another request comes in after -15 minutes, to ensure any potential transient conditions had time to be -resolved. We will retry up to 5 times. - -If the actual **execution** of the artifact fails, we will retry once if it was -a possibly transient error, to allow the conditions that led to the error to -hopefully resolve. We use a more brief delay here (1 second as opposed to 15 -minutes for preparation (see above)), because a successful execution must happen -in a short amount of time. - -We currently know of the following specific cases that will lead to a retried -execution request: - -1. **OOM:** The host might have been temporarily low on memory due to other - processes running on the same machine. **NOTE:** This case will lead to - voting against the candidate (and possibly a dispute) if the retry is still - not successful. -2. **Artifact missing:** The prepared artifact might have been deleted due to - operator error or some bug in the system. -3. **Panic:** The worker thread panicked for some indeterminate reason, which - may or may not be independent of the candidate or PVF. - -#### Preparation timeouts - -We use timeouts for both preparation and execution jobs to limit the amount of -time they can take. As the time for a job can vary depending on the machine and -load on the machine, this can potentially lead to disputes where some validators -successfuly execute a PVF and others don't. - -One dispute mitigation we have in place is a more lenient timeout for -preparation during execution than during pre-checking. The rationale is that the -PVF has already passed pre-checking, so we know it should be valid, and we allow -it to take longer than expected, as this is likely due to an issue with the -machine and not the PVF. - -#### CPU clock timeouts - -Another timeout-related mitigation we employ is to measure the time taken by -jobs using CPU time, rather than wall clock time. This is because the CPU time -of a process is less variable under different system conditions. When the -overall system is under heavy load, the wall clock time of a job is affected -more than the CPU time. - -#### Internal errors - -In general, for errors not raising a dispute we have to be very careful. This is -only sound, if we either: - -1. Ruled out that error in pre-checking. If something is not checked in - pre-checking, even if independent of the candidate and PVF, we must raise a - dispute. -2. We are 100% confident that it is a hardware/local issue: Like corrupted file, - etc. - -Reasoning: Otherwise it would be possible to register a PVF where candidates can -not be checked, but we don't get a dispute - so nobody gets punished. Second, we -end up with a finality stall that is not going to resolve! - -There are some error conditions where we can't be sure whether the candidate is -really invalid or some internal glitch occurred, e.g. panics. Whenever we are -unsure, we can never treat an error as internal as we would abstain from voting. -So we will first retry the candidate, and if the issue persists we are forced to -vote invalid. - [CVM]: ../../types/overseer-protocol.md#validationrequesttype diff --git a/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md b/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md new file mode 100644 index 000000000000..fe4192d7be07 --- /dev/null +++ b/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md @@ -0,0 +1,130 @@ +# PVF Host and Workers + +The PVF host is responsible for handling requests to prepare and execute PVF +code blobs, which it sends to PVF workers running in their own child processes. +(This architecture is unlikely to change, but for more implementation details +please look at the code.) + +This system has two high-levels goals that we will touch on here: *determinism* +and *security*. + +## Determinism + +One high-level goal is to make PVF operations as deterministic as possible, to +reduce the rate of disputes. Disputes can happen due to e.g. a job timing out on +one machine, but not another. While we do not have full determinism, there are +some dispute reduction mechanisms in place right now. + +### Retrying execution requests + +If the execution request fails during **preparation**, we will retry if it is +possible that the preparation error was transient (e.g. if the error was a panic +or time out). We will only retry preparation if another request comes in after +15 minutes, to ensure any potential transient conditions had time to be +resolved. We will retry up to 5 times. + +If the actual **execution** of the artifact fails, we will retry once if it was +a possibly transient error, to allow the conditions that led to the error to +hopefully resolve. We use a more brief delay here (1 second as opposed to 15 +minutes for preparation (see above)), because a successful execution must happen +in a short amount of time. + +We currently know of the following specific cases that will lead to a retried +execution request: + +1. **OOM:** The host might have been temporarily low on memory due to other + processes running on the same machine. **NOTE:** This case will lead to + voting against the candidate (and possibly a dispute) if the retry is still + not successful. +2. **Artifact missing:** The prepared artifact might have been deleted due to + operator error or some bug in the system. +3. **Panic:** The worker thread panicked for some indeterminate reason, which + may or may not be independent of the candidate or PVF. + +### Preparation timeouts + +We use timeouts for both preparation and execution jobs to limit the amount of +time they can take. As the time for a job can vary depending on the machine and +load on the machine, this can potentially lead to disputes where some validators +successfuly execute a PVF and others don't. + +One dispute mitigation we have in place is a more lenient timeout for +preparation during execution than during pre-checking. The rationale is that the +PVF has already passed pre-checking, so we know it should be valid, and we allow +it to take longer than expected, as this is likely due to an issue with the +machine and not the PVF. + +### CPU clock timeouts + +Another timeout-related mitigation we employ is to measure the time taken by +jobs using CPU time, rather than wall clock time. This is because the CPU time +of a process is less variable under different system conditions. When the +overall system is under heavy load, the wall clock time of a job is affected +more than the CPU time. + +### Internal errors + +In general, for errors not raising a dispute we have to be very careful. This is +only sound, if we either: + +1. Ruled out that error in pre-checking. If something is not checked in + pre-checking, even if independent of the candidate and PVF, we must raise a + dispute. +2. We are 100% confident that it is a hardware/local issue: Like corrupted file, + etc. + +Reasoning: Otherwise it would be possible to register a PVF where candidates can +not be checked, but we don't get a dispute - so nobody gets punished. Second, we +end up with a finality stall that is not going to resolve! + +There are some error conditions where we can't be sure whether the candidate is +really invalid or some internal glitch occurred, e.g. panics. Whenever we are +unsure, we can never treat an error as internal as we would abstain from voting. +So we will first retry the candidate, and if the issue persists we are forced to +vote invalid. + +## Security + +With [on-demand parachains](https://github.com/orgs/paritytech/projects/67), it +is much easier to submit PVFs to the chain for preparation and execution. This +makes it easier for erroneous disputes and slashing to occur, whether +intentional (as a result of a malicious attacker) or not (a bug or operator +error occurred). + +Therefore, another goal of ours is to harden our security around PVFs, in order +to protect the economic interests of validators and increase overall confidence +in the system. + +### Possible attacks / threat model + +Webassembly is already sandboxed, but there have already been reported multiple +CVEs enabling remote code execution. See e.g. these two advisories from [Mar +2023](https://github.com/bytecodealliance/wasmtime/security/advisories/GHSA-ff4p-7xrq-q5r8) +and [Jul +2022](https://github.com/bytecodealliance/wasmtime/security/advisories/GHSA-7f6x-jwh5-m9r4). + +So what are we actually worried about? Things that come to mind: + +1. **Consensus faults** - If an attacker can get some source of randomness they + could vote against with 50% chance and cause unresolvable disputes. +2. **Targeted slashes** - An attacker can target certain validators (e.g. some + validators running on vulnerable hardware) and make them vote invalid and get + them slashed. +3. **Mass slashes** - With some source of randomness they can do an untargeted + attack. I.e. a baddie can do significant economic damage by voting against + with 1/3 chance, without even stealing keys or completely replacing the + binary. +4. **Stealing keys** - That would be pretty bad. Should not be possible with + sandboxing. We should at least not allow filesystem-access or network access. +5. **Taking control over the validator.** E.g. replacing the `polkadot` binary + with a `polkadot-evil` binary. Should again not be possible with the above + sandboxing in place. +6. **Intercepting and manipulating packages** - Effect very similar to the + above, hard to do without also being able to do 4 or 5. + +### Restricting file-system access + +A basic security mechanism is to make sure that any thread directly interfacing +with untrusted code does not have access to the file-system. This provides some +protection against attackers accessing sensitive data or modifying data on the +host machine. From e48605a2bfff7be3fcfd6f3a6b65bf3de336cf6a Mon Sep 17 00:00:00 2001 From: Marcin S Date: Sun, 28 May 2023 13:20:57 -0400 Subject: [PATCH 03/17] Implement test --- Cargo.lock | 1 + node/core/pvf/common/Cargo.toml | 1 + node/core/pvf/common/src/worker/security.rs | 67 ++++++++++++++++----- 3 files changed, 54 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2ccbac182193..3c070a35d437 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7541,6 +7541,7 @@ dependencies = [ "sp-core", "sp-tracing", "substrate-build-script-utils", + "tempfile", "tokio", "tracing-gum", ] diff --git a/node/core/pvf/common/Cargo.toml b/node/core/pvf/common/Cargo.toml index 56695fbd4ea8..bbcccd6937ec 100644 --- a/node/core/pvf/common/Cargo.toml +++ b/node/core/pvf/common/Cargo.toml @@ -27,6 +27,7 @@ landlock = "0.2.0" [dev-dependencies] assert_matches = "1.4.0" +tempfile = "3.3.0" [build-dependencies] substrate-build-script-utils = { git = "https://github.com/paritytech/substrate", branch = "master" } diff --git a/node/core/pvf/common/src/worker/security.rs b/node/core/pvf/common/src/worker/security.rs index 0fd35bbc0d0d..b61d9eb96553 100644 --- a/node/core/pvf/common/src/worker/security.rs +++ b/node/core/pvf/common/src/worker/security.rs @@ -28,7 +28,10 @@ pub mod landlock { /// Returns to what degree landlock is enabled on the current Linux environment. /// /// This is a separate check so it can run outside a worker thread, so the results can be sent via telemetry. - pub fn check_enabled() {} + pub fn check_enabled() -> bool { + // TODO: + true + } /// Tries to restrict the current thread with the following landlock access controls: /// @@ -46,9 +49,13 @@ pub mod landlock { #[cfg(test)] mod tests { use super::*; - use crate::worker::thread::{self, WaitOutcome}; use assert_matches::assert_matches; - use std::sync::Arc; + use std::{ + fs, + io::{ErrorKind, Read, Write}, + sync::Arc, + thread, + }; #[test] fn restricted_thread_cannot_access_fs() { @@ -57,21 +64,51 @@ pub mod landlock { return } - let cond = thread::get_condvar(); + // Restricted thread cannot read from FS. + let handle = thread::spawn(|| { + // Write to a tmp file, this should succeed before landlock is applied. + let text = "foo"; + let tmpfile = tempfile::NamedTempFile::new().unwrap(); + let path = tmpfile.path(); + fs::write(path, text).unwrap(); + let s = fs::read_to_string(path).unwrap(); + assert_eq!(s, text); + + let status = super::try_restrict_thread().unwrap(); + if let RulesetStatus::NotEnforced = status { + panic!("Ruleset should be enforced since we checked if landlock is enabled"); + } + + // Try to read from the tmp file after landlock. + let result = fs::read_to_string(path); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + }); + + assert_matches!(handle.join(), Ok(())); + + // Restricted thread cannot write to FS. + let handle = thread::spawn(|| { + let text = "foo"; + let tmpfile = tempfile::NamedTempFile::new().unwrap(); + let path = tmpfile.path(); - // Use the same method we use to spawn workers in production. - let handle = thread::spawn_worker_thread( - "test_restricted_thread_cannot_access_fs", - || true, - Arc::clone(&cond), - WaitOutcome::Finished, - ) - .unwrap(); + let status = super::try_restrict_thread().unwrap(); + if let RulesetStatus::NotEnforced = status { + panic!("Ruleset should be enforced since we checked if landlock is enabled"); + } - let outcome = thread::wait_for_threads(cond); + // Try to write to the tmp file after landlock. + let result = fs::write(path, text); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + }); - assert_matches!(outcome, WaitOutcome::Finished); - assert_matches!(handle.join(), Ok(true)); + assert_matches!(handle.join(), Ok(())); } } } From d1af7ee2386738c5d2c7eafb368f89f4a154ac1b Mon Sep 17 00:00:00 2001 From: Marcin S Date: Sun, 28 May 2023 13:36:11 -0400 Subject: [PATCH 04/17] Add some docs --- node/core/pvf/common/src/worker/security.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/node/core/pvf/common/src/worker/security.rs b/node/core/pvf/common/src/worker/security.rs index b61d9eb96553..8d1a5d516ddd 100644 --- a/node/core/pvf/common/src/worker/security.rs +++ b/node/core/pvf/common/src/worker/security.rs @@ -18,6 +18,16 @@ //! //! This is needed because workers are used to compile and execute untrusted code (PVFs). +/// The [landlock] docs say it best: +/// +/// > "Landlock is a security feature available since Linux 5.13. The goal is to enable to restrict +/// ambient rights (e.g., global filesystem access) for a set of processes by creating safe security +/// sandboxes as new security layers in addition to the existing system-wide access-controls. This +/// kind of sandbox is expected to help mitigate the security impact of bugs, unexpected or +/// malicious behaviors in applications. Landlock empowers any process, including unprivileged ones, +/// to securely restrict themselves." +/// +/// [landlock]: https://docs.rs/landlock/latest/landlock/index.html #[cfg(target_os = "linux")] pub mod landlock { use landlock::{ @@ -28,15 +38,21 @@ pub mod landlock { /// Returns to what degree landlock is enabled on the current Linux environment. /// /// This is a separate check so it can run outside a worker thread, so the results can be sent via telemetry. - pub fn check_enabled() -> bool { + pub fn check_enabled() -> RulesetStatus { // TODO: - true + RulesetStatus::FullyEnforced } /// Tries to restrict the current thread with the following landlock access controls: /// /// 1. all global filesystem access /// 2. ... more may be supported in the future. + /// + /// If landlock is not supported in the current environment this is simply a noop. + /// + /// # Returns + /// + /// The status of the restriction (whether it was fully, partially, or not-at-all enforced). pub fn try_restrict_thread() -> Result { let abi = ABI::V2; let status = Ruleset::new() From 3e5b6cd93c8a19e7cedec207319b08711a7ed6fa Mon Sep 17 00:00:00 2001 From: Marcin S Date: Wed, 31 May 2023 10:26:06 -0400 Subject: [PATCH 05/17] Do some cleanup --- node/core/pvf/common/src/worker/security.rs | 49 ++++++++++++--------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/node/core/pvf/common/src/worker/security.rs b/node/core/pvf/common/src/worker/security.rs index 8d1a5d516ddd..8fe6c9888bf9 100644 --- a/node/core/pvf/common/src/worker/security.rs +++ b/node/core/pvf/common/src/worker/security.rs @@ -30,17 +30,29 @@ /// [landlock]: https://docs.rs/landlock/latest/landlock/index.html #[cfg(target_os = "linux")] pub mod landlock { - use landlock::{ - path_beneath_rules, Access, AccessFs, Ruleset, RulesetAttr, RulesetCreatedAttr, - RulesetError, RulesetStatus, ABI, - }; + use landlock::{Access, AccessFs, Ruleset, RulesetAttr, RulesetError, RulesetStatus, ABI}; - /// Returns to what degree landlock is enabled on the current Linux environment. + /// Version of landlock ABI. Use the latest version supported by our reference kernel version. /// - /// This is a separate check so it can run outside a worker thread, so the results can be sent via telemetry. - pub fn check_enabled() -> RulesetStatus { - // TODO: - RulesetStatus::FullyEnforced + /// - Reference kernel version: 5.15 + /// - V1: 5.13 + /// - V2: 5.19 + const LANDLOCK_ABI: ABI = ABI::V1; + + // TODO: + /// Returns to what degree landlock is enabled on the current Linux environment. + pub fn get_status() -> Result> { + match std::thread::spawn(|| try_restrict_thread()).join() { + Ok(Ok(status)) => Ok(status), + Ok(Err(ruleset_err)) => Err(ruleset_err.into()), + Err(_err) => Err("a panic occurred in try_restrict_thread".into()), + } + } + + /// Returns a single bool indicating whether landlock is fully enabled on the current Linux + /// environment. + pub fn is_fully_enabled() -> bool { + matches!(get_status(), Ok(RulesetStatus::FullyEnforced)) } /// Tries to restrict the current thread with the following landlock access controls: @@ -54,9 +66,8 @@ pub mod landlock { /// /// The status of the restriction (whether it was fully, partially, or not-at-all enforced). pub fn try_restrict_thread() -> Result { - let abi = ABI::V2; let status = Ruleset::new() - .handle_access(AccessFs::from_all(abi))? + .handle_access(AccessFs::from_all(LANDLOCK_ABI))? .create()? .restrict_self()?; Ok(status.ruleset) @@ -65,18 +76,12 @@ pub mod landlock { #[cfg(test)] mod tests { use super::*; - use assert_matches::assert_matches; - use std::{ - fs, - io::{ErrorKind, Read, Write}, - sync::Arc, - thread, - }; + use std::{fs, io::ErrorKind, thread}; #[test] fn restricted_thread_cannot_access_fs() { - // This would be nice: . - if !check_enabled() { + // TODO: This would be nice: . + if !is_fully_enabled() { return } @@ -103,7 +108,7 @@ pub mod landlock { )); }); - assert_matches!(handle.join(), Ok(())); + assert!(handle.join().is_ok()); // Restricted thread cannot write to FS. let handle = thread::spawn(|| { @@ -124,7 +129,7 @@ pub mod landlock { )); }); - assert_matches!(handle.join(), Ok(())); + assert!(handle.join().is_ok()); } } } From 39f2495133dfd72d9d03c74ef439eab57eed0d76 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Wed, 31 May 2023 10:34:29 -0400 Subject: [PATCH 06/17] Fix typo --- node/core/pvf/common/src/worker/security.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/node/core/pvf/common/src/worker/security.rs b/node/core/pvf/common/src/worker/security.rs index 8fe6c9888bf9..de02c3f743c6 100644 --- a/node/core/pvf/common/src/worker/security.rs +++ b/node/core/pvf/common/src/worker/security.rs @@ -34,9 +34,11 @@ pub mod landlock { /// Version of landlock ABI. Use the latest version supported by our reference kernel version. /// - /// - Reference kernel version: 5.15 + /// - Reference kernel version: 5.16+ /// - V1: 5.13 /// - V2: 5.19 + /// + /// Please update the above if it is out-of-date. const LANDLOCK_ABI: ABI = ABI::V1; // TODO: From e555165421a94c85f7bf9a4ce10261a20ef4cd29 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Wed, 31 May 2023 13:21:29 -0400 Subject: [PATCH 07/17] Warn on host startup if landlock is not supported --- node/core/pvf/common/src/worker/security.rs | 7 +++-- node/core/pvf/src/host.rs | 29 +++++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/node/core/pvf/common/src/worker/security.rs b/node/core/pvf/common/src/worker/security.rs index de02c3f743c6..a2c792004f2f 100644 --- a/node/core/pvf/common/src/worker/security.rs +++ b/node/core/pvf/common/src/worker/security.rs @@ -30,7 +30,10 @@ /// [landlock]: https://docs.rs/landlock/latest/landlock/index.html #[cfg(target_os = "linux")] pub mod landlock { - use landlock::{Access, AccessFs, Ruleset, RulesetAttr, RulesetError, RulesetStatus, ABI}; + // Export for checking the status. + pub use landlock::RulesetStatus; + + use landlock::{Access, AccessFs, Ruleset, RulesetAttr, RulesetError, ABI}; /// Version of landlock ABI. Use the latest version supported by our reference kernel version. /// @@ -39,7 +42,7 @@ pub mod landlock { /// - V2: 5.19 /// /// Please update the above if it is out-of-date. - const LANDLOCK_ABI: ABI = ABI::V1; + pub const LANDLOCK_ABI: ABI = ABI::V1; // TODO: /// Returns to what degree landlock is enabled on the current Linux environment. diff --git a/node/core/pvf/src/host.rs b/node/core/pvf/src/host.rs index 67f4a66e9748..f5fd4c2496cb 100644 --- a/node/core/pvf/src/host.rs +++ b/node/core/pvf/src/host.rs @@ -140,6 +140,7 @@ struct ExecutePvfInputs { } /// Configuration for the validation host. +#[derive(Debug)] pub struct Config { /// The root directory where the prepared artifacts can be stored. pub cache_path: PathBuf, @@ -189,6 +190,11 @@ impl Config { /// In that case all pending requests will be canceled, dropping the result senders and new ones /// will be rejected. pub fn start(config: Config, metrics: Metrics) -> (ValidationHost, impl Future) { + gum::debug!(target: LOG_TARGET, ?config, "starting PVF validation host"); + + // Run checks for supported security features once per host startup. + warn_if_no_landlock(); + let (to_host_tx, to_host_rx) = mpsc::channel(10); let validation_host = ValidationHost { to_host_tx }; @@ -854,6 +860,29 @@ fn pulse_every(interval: std::time::Duration) -> impl futures::Stream .map(|_| ()) } +/// Check if landlock is supported and emit a warning if not. +fn warn_if_no_landlock() { + #[cfg(target_os = "linux")] + { + use polkadot_node_core_pvf_common::worker::security::landlock; + let status = landlock::get_status(); + if !matches!(status, Ok(landlock::RulesetStatus::FullyEnforced)) { + let abi = landlock::LANDLOCK_ABI as u8; + gum::warn!( + target: LOG_TARGET, + ?status, + %abi, + "Could not fully enable landlock, a Linux kernel security feature. Running validation of PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." + ); + } + } + #[cfg(not(target_os = "linux"))] + gum::warn!( + target: LOG_TARGET, + "Could not enable landlock, a Linux kernel security feature. Running validation of PVF code has a higher risk of compromising this machine. Consider running on Linux with landlock support for maximum security." + ); +} + #[cfg(test)] pub(crate) mod tests { use super::*; From 30178c9cc83e473a999b9d9d3ec3ac40ee3430bc Mon Sep 17 00:00:00 2001 From: Marcin S Date: Wed, 31 May 2023 13:37:09 -0400 Subject: [PATCH 08/17] Clarify docs a bit --- node/core/pvf/common/src/worker/security.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/node/core/pvf/common/src/worker/security.rs b/node/core/pvf/common/src/worker/security.rs index a2c792004f2f..7c3fb931b13d 100644 --- a/node/core/pvf/common/src/worker/security.rs +++ b/node/core/pvf/common/src/worker/security.rs @@ -38,10 +38,11 @@ pub mod landlock { /// Version of landlock ABI. Use the latest version supported by our reference kernel version. /// /// - Reference kernel version: 5.16+ - /// - V1: 5.13 - /// - V2: 5.19 + /// - ABI V1: 5.13 + /// - ABI V2: 5.19 /// - /// Please update the above if it is out-of-date. + /// Please update these docs if they are out-of-date, and bump the ABI version to the minimum + /// ABI supported by the reference kernel version. pub const LANDLOCK_ABI: ABI = ABI::V1; // TODO: From c0962849225b634515a1f4881b8221f89f3fb979 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Wed, 31 May 2023 13:41:54 -0400 Subject: [PATCH 09/17] Minor improvements --- node/core/pvf/common/src/worker/security.rs | 8 ++++---- node/core/pvf/src/host.rs | 4 ++-- .../src/node/utility/pvf-host-and-workers.md | 9 +++------ 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/node/core/pvf/common/src/worker/security.rs b/node/core/pvf/common/src/worker/security.rs index 7c3fb931b13d..cd511d0625d8 100644 --- a/node/core/pvf/common/src/worker/security.rs +++ b/node/core/pvf/common/src/worker/security.rs @@ -101,8 +101,8 @@ pub mod landlock { let s = fs::read_to_string(path).unwrap(); assert_eq!(s, text); - let status = super::try_restrict_thread().unwrap(); - if let RulesetStatus::NotEnforced = status { + let status = try_restrict_thread().unwrap(); + if !matches!(status, RulesetStatus::FullyEnforced) { panic!("Ruleset should be enforced since we checked if landlock is enabled"); } @@ -122,8 +122,8 @@ pub mod landlock { let tmpfile = tempfile::NamedTempFile::new().unwrap(); let path = tmpfile.path(); - let status = super::try_restrict_thread().unwrap(); - if let RulesetStatus::NotEnforced = status { + let status = try_restrict_thread().unwrap(); + if !matches!(status, RulesetStatus::FullyEnforced) { panic!("Ruleset should be enforced since we checked if landlock is enabled"); } diff --git a/node/core/pvf/src/host.rs b/node/core/pvf/src/host.rs index f5fd4c2496cb..4e0cc90af6a7 100644 --- a/node/core/pvf/src/host.rs +++ b/node/core/pvf/src/host.rs @@ -872,14 +872,14 @@ fn warn_if_no_landlock() { target: LOG_TARGET, ?status, %abi, - "Could not fully enable landlock, a Linux kernel security feature. Running validation of PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." + "Cannot fully enable landlock, a Linux kernel security feature. Running validation of PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." ); } } #[cfg(not(target_os = "linux"))] gum::warn!( target: LOG_TARGET, - "Could not enable landlock, a Linux kernel security feature. Running validation of PVF code has a higher risk of compromising this machine. Consider running on Linux with landlock support for maximum security." + "Cannot enable landlock, a Linux kernel security feature. Running validation of PVF code has a higher risk of compromising this machine. Consider running on Linux with landlock support for maximum security." ); } diff --git a/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md b/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md index fe4192d7be07..017b7fc025cc 100644 --- a/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md +++ b/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md @@ -2,8 +2,6 @@ The PVF host is responsible for handling requests to prepare and execute PVF code blobs, which it sends to PVF workers running in their own child processes. -(This architecture is unlikely to change, but for more implementation details -please look at the code.) This system has two high-levels goals that we will touch on here: *determinism* and *security*. @@ -98,10 +96,9 @@ in the system. ### Possible attacks / threat model Webassembly is already sandboxed, but there have already been reported multiple -CVEs enabling remote code execution. See e.g. these two advisories from [Mar -2023](https://github.com/bytecodealliance/wasmtime/security/advisories/GHSA-ff4p-7xrq-q5r8) -and [Jul -2022](https://github.com/bytecodealliance/wasmtime/security/advisories/GHSA-7f6x-jwh5-m9r4). +CVEs enabling remote code execution. See e.g. these two advisories from +[Mar 2023](https://github.com/bytecodealliance/wasmtime/security/advisories/GHSA-ff4p-7xrq-q5r8) +and [Jul 2022](https://github.com/bytecodealliance/wasmtime/security/advisories/GHSA-7f6x-jwh5-m9r4). So what are we actually worried about? Things that come to mind: From 41d8d1a8c70725a97937a6e37b15827aab711942 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Thu, 1 Jun 2023 15:49:05 -0400 Subject: [PATCH 10/17] Add some docs about determinism --- node/core/pvf/common/src/worker/security.rs | 32 +++++++++++++++++---- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/node/core/pvf/common/src/worker/security.rs b/node/core/pvf/common/src/worker/security.rs index cd511d0625d8..6b6fb946a214 100644 --- a/node/core/pvf/common/src/worker/security.rs +++ b/node/core/pvf/common/src/worker/security.rs @@ -35,18 +35,36 @@ pub mod landlock { use landlock::{Access, AccessFs, Ruleset, RulesetAttr, RulesetError, ABI}; - /// Version of landlock ABI. Use the latest version supported by our reference kernel version. + /// Landlock ABI version. Use the latest version supported by our reference kernel version. + /// + /// # Versions (June 2023) /// /// - Reference kernel version: 5.16+ /// - ABI V1: 5.13 /// - ABI V2: 5.19 /// - /// Please update these docs if they are out-of-date, and bump the ABI version to the minimum + /// Please update these values if they are out-of-date, and bump the ABI version to the minimum /// ABI supported by the reference kernel version. + /// + /// # Determinism + /// + /// You may wonder whether we could always use the latest ABI instead of the ABI supported by + /// the reference kernel version. It seems plausible, since landlock provides a best-effort + /// approach to enabling sandboxing. For example, if the reference version only supported V1 and + /// we were on V2, then landlock would use V2 if it was supported on the current machine, and + /// just fall back to V1 if not. + /// + /// The issue with this is indeterminacy. If half of validators were on V2 and half were on V1, + /// they may have different semantics on some PVFs. So a malicious PVF now has a new attack + /// vector: they can exploit this indeterminism between landlock ABIs! But this is exactly the + /// kind of thing we want to prevent! So, we have to stick only to the latest ABI supported by + /// our reference kernel version (in this example ABI V1). And if a validator's machine does not + /// *fully* support it, we can't let them run as a secure validator. pub const LANDLOCK_ABI: ABI = ABI::V1; // TODO: - /// Returns to what degree landlock is enabled on the current Linux environment. + /// Returns to what degree landlock is enabled with the given ABI on the current Linux + /// environment. pub fn get_status() -> Result> { match std::thread::spawn(|| try_restrict_thread()).join() { Ok(Ok(status)) => Ok(status), @@ -55,8 +73,12 @@ pub mod landlock { } } - /// Returns a single bool indicating whether landlock is fully enabled on the current Linux - /// environment. + /// Returns a single bool indicating whether the given landlock ABI is fully enabled on the + /// current Linux environment. + /// + /// NOTE: Secure validators *should* have this *fully* enabled for the reference ABI. Even having + /// this partially enabled (which landlock does in a best-effort capacity) may lead to + /// indeterminism. See "Determinism" under [`LANDLOCK_ABI`]. pub fn is_fully_enabled() -> bool { matches!(get_status(), Ok(RulesetStatus::FullyEnforced)) } From 6524c8181aab838607203ce11dd1e8e228ef71c3 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Thu, 1 Jun 2023 18:21:29 -0400 Subject: [PATCH 11/17] Address review comments (mainly add warning on landlock error) --- node/core/pvf/common/src/worker/mod.rs | 20 ++++----- node/core/pvf/common/src/worker/security.rs | 49 ++++++++++++++++----- node/core/pvf/execute-worker/src/lib.rs | 46 +++++++++++++++---- node/core/pvf/prepare-worker/src/lib.rs | 33 +++++++++++--- node/core/pvf/src/host.rs | 2 +- 5 files changed, 112 insertions(+), 38 deletions(-) diff --git a/node/core/pvf/common/src/worker/mod.rs b/node/core/pvf/common/src/worker/mod.rs index e80aaee46f13..458dd3157e2a 100644 --- a/node/core/pvf/common/src/worker/mod.rs +++ b/node/core/pvf/common/src/worker/mod.rs @@ -229,6 +229,11 @@ pub mod thread { /// Runs a worker thread. Will first enable security features, and afterwards notify the threads waiting on the /// condvar. Catches panics during execution and resumes the panics after triggering the condvar, so that the /// waiting thread is notified on panics. + /// + /// # Returns + /// + /// Returns the thread's join handle. Calling `.join()` on it returns the result of executing + /// `f()`, as well as whether we were able to enable sandboxing. pub fn spawn_worker_thread( name: &str, f: F, @@ -240,18 +245,9 @@ pub mod thread { F: Send + 'static + panic::UnwindSafe, R: Send + 'static, { - thread::Builder::new().name(name.into()).spawn(move || { - cond_notify_on_done( - || { - #[cfg(target_os = "linux")] - let _ = crate::worker::security::landlock::try_restrict_thread(); - - f() - }, - cond, - outcome, - ) - }) + thread::Builder::new() + .name(name.into()) + .spawn(move || cond_notify_on_done(f, cond, outcome)) } /// Runs a worker thread with the given stack size. See [`spawn_worker_thread`]. diff --git a/node/core/pvf/common/src/worker/security.rs b/node/core/pvf/common/src/worker/security.rs index 6b6fb946a214..cafa7d459124 100644 --- a/node/core/pvf/common/src/worker/security.rs +++ b/node/core/pvf/common/src/worker/security.rs @@ -18,6 +18,28 @@ //! //! This is needed because workers are used to compile and execute untrusted code (PVFs). +/// To what degree landlock is enabled. It's a separate struct from `RulesetStatus` because that is +/// only available on Linux, plus this has a nicer name. +pub enum LandlockStatus { + FullyEnforced, + PartiallyEnforced, + NotEnforced, + /// Thread panicked, we don't know what the status is. + Unavailable, +} + +impl LandlockStatus { + #[cfg(target_os = "linux")] + pub fn from_ruleset_status(ruleset_status: ::landlock::RulesetStatus) -> Self { + use ::landlock::RulesetStatus::*; + match ruleset_status { + FullyEnforced => LandlockStatus::FullyEnforced, + PartiallyEnforced => LandlockStatus::PartiallyEnforced, + NotEnforced => LandlockStatus::NotEnforced, + } + } +} + /// The [landlock] docs say it best: /// /// > "Landlock is a security feature available since Linux 5.13. The goal is to enable to restrict @@ -30,10 +52,7 @@ /// [landlock]: https://docs.rs/landlock/latest/landlock/index.html #[cfg(target_os = "linux")] pub mod landlock { - // Export for checking the status. - pub use landlock::RulesetStatus; - - use landlock::{Access, AccessFs, Ruleset, RulesetAttr, RulesetError, ABI}; + use landlock::{Access, AccessFs, Ruleset, RulesetAttr, RulesetError, RulesetStatus, ABI}; /// Landlock ABI version. Use the latest version supported by our reference kernel version. /// @@ -73,14 +92,22 @@ pub mod landlock { } } - /// Returns a single bool indicating whether the given landlock ABI is fully enabled on the - /// current Linux environment. + /// Basaed on the given `status`, returns a single bool indicating whether the given landlock + /// ABI is fully enabled on the current Linux environment. + /// + /// NOTE: Secure validators must be *fully* enabled. See "Determinism" in [`LANDLOCK_ABI`]. + pub fn status_is_fully_enabled( + status: &Result>, + ) -> bool { + matches!(status, Ok(RulesetStatus::FullyEnforced)) + } + + /// Runs a check for landlock and returns a single bool indicating whether the given landlock + /// ABI is fully enabled on the current Linux environment. /// - /// NOTE: Secure validators *should* have this *fully* enabled for the reference ABI. Even having - /// this partially enabled (which landlock does in a best-effort capacity) may lead to - /// indeterminism. See "Determinism" under [`LANDLOCK_ABI`]. - pub fn is_fully_enabled() -> bool { - matches!(get_status(), Ok(RulesetStatus::FullyEnforced)) + /// NOTE: Secure validators must be *fully* enabled. See "Determinism" in [`LANDLOCK_ABI`]. + pub fn check_is_fully_enabled() -> bool { + status_is_fully_enabled(&get_status()) } /// Tries to restrict the current thread with the following landlock access controls: diff --git a/node/core/pvf/execute-worker/src/lib.rs b/node/core/pvf/execute-worker/src/lib.rs index 0ac39aafb0c9..26cf063f6989 100644 --- a/node/core/pvf/execute-worker/src/lib.rs +++ b/node/core/pvf/execute-worker/src/lib.rs @@ -30,7 +30,9 @@ use polkadot_node_core_pvf_common::{ execute::{Handshake, Response}, framed_recv, framed_send, worker::{ - bytes_to_path, cpu_time_monitor_loop, stringify_panic_payload, + bytes_to_path, cpu_time_monitor_loop, + security::LandlockStatus, + stringify_panic_payload, thread::{self, WaitOutcome}, worker_event_loop, }, @@ -136,11 +138,22 @@ pub fn worker_entrypoint(socket_path: &str, node_version: Option<&str>) { let execute_thread = thread::spawn_worker_thread_with_stack_size( "execute thread", move || { - validate_using_artifact( - &compiled_artifact_blob, - ¶ms, - executor_2, - cpu_time_start, + // Try to enable landlock. + #[cfg(target_os = "linux")] + let landlock_status = polkadot_node_core_pvf_common::worker::security::landlock::try_restrict_thread() + .map(LandlockStatus::from_ruleset_status) + .map_err(|e| e.to_string()); + #[cfg(not(target_os = "linux"))] + let landlock_status: Result = Ok(LandlockStatus::NotEnforced); + + ( + validate_using_artifact( + &compiled_artifact_blob, + ¶ms, + executor_2, + cpu_time_start, + ), + landlock_status, ) }, Arc::clone(&condvar), @@ -153,9 +166,24 @@ pub fn worker_entrypoint(socket_path: &str, node_version: Option<&str>) { let response = match outcome { WaitOutcome::Finished => { let _ = cpu_time_monitor_tx.send(()); - execute_thread - .join() - .unwrap_or_else(|e| Response::Panic(stringify_panic_payload(e))) + let (result, landlock_status) = execute_thread.join().unwrap_or_else(|e| { + ( + Response::Panic(stringify_panic_payload(e)), + Ok(LandlockStatus::Unavailable), + ) + }); + + // Log if landlock threw an error. + if let Err(err) = landlock_status { + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "error enabling landlock: {}", + err + ); + } + + result }, // If the CPU thread is not selected, we signal it to end, the join handle is // dropped and the thread will finish in the background. diff --git a/node/core/pvf/prepare-worker/src/lib.rs b/node/core/pvf/prepare-worker/src/lib.rs index 8f36ef397cfb..be8db1e269a4 100644 --- a/node/core/pvf/prepare-worker/src/lib.rs +++ b/node/core/pvf/prepare-worker/src/lib.rs @@ -34,7 +34,9 @@ use polkadot_node_core_pvf_common::{ prepare::{MemoryStats, PrepareStats}, pvf::PvfPrepData, worker::{ - bytes_to_path, cpu_time_monitor_loop, stringify_panic_payload, + bytes_to_path, cpu_time_monitor_loop, + security::LandlockStatus, + stringify_panic_payload, thread::{self, WaitOutcome}, worker_event_loop, }, @@ -151,13 +153,21 @@ pub fn worker_entrypoint(socket_path: &str, node_version: Option<&str>) { let prepare_thread = thread::spawn_worker_thread( "prepare thread", move || { + // Try to enable landlock. + #[cfg(target_os = "linux")] + let landlock_status = polkadot_node_core_pvf_common::worker::security::landlock::try_restrict_thread() + .map(LandlockStatus::from_ruleset_status) + .map_err(|e| e.to_string()); + #[cfg(not(target_os = "linux"))] + let landlock_status: Result = Ok(LandlockStatus::NotEnforced); + let result = prepare_artifact(pvf, cpu_time_start); // Get the `ru_maxrss` stat. If supported, call getrusage for the thread. #[cfg(target_os = "linux")] let result = result.map(|(artifact, elapsed)| (artifact, elapsed, get_max_rss_thread())); - result + (result, landlock_status) }, Arc::clone(&condvar), WaitOutcome::Finished, @@ -170,13 +180,16 @@ pub fn worker_entrypoint(socket_path: &str, node_version: Option<&str>) { let _ = cpu_time_monitor_tx.send(()); match prepare_thread.join().unwrap_or_else(|err| { - Err(PrepareError::Panic(stringify_panic_payload(err))) + ( + Err(PrepareError::Panic(stringify_panic_payload(err))), + Ok(LandlockStatus::Unavailable), + ) }) { - Err(err) => { + (Err(err), _) => { // Serialized error will be written into the socket. Err(err) }, - Ok(ok) => { + (Ok(ok), landlock_status) => { #[cfg(not(target_os = "linux"))] let (artifact, cpu_time_elapsed) = ok; #[cfg(target_os = "linux")] @@ -192,6 +205,16 @@ pub fn worker_entrypoint(socket_path: &str, node_version: Option<&str>) { max_rss: extract_max_rss_stat(max_rss, worker_pid), }; + // Log if landlock threw an error. + if let Err(err) = landlock_status { + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "error enabling landlock: {}", + err + ); + } + // Write the serialized artifact into a temp file. // // PVF host only keeps artifacts statuses in its memory, successfully diff --git a/node/core/pvf/src/host.rs b/node/core/pvf/src/host.rs index 4e0cc90af6a7..516aefabaa94 100644 --- a/node/core/pvf/src/host.rs +++ b/node/core/pvf/src/host.rs @@ -866,7 +866,7 @@ fn warn_if_no_landlock() { { use polkadot_node_core_pvf_common::worker::security::landlock; let status = landlock::get_status(); - if !matches!(status, Ok(landlock::RulesetStatus::FullyEnforced)) { + if !landlock::status_is_fully_enabled(&status) { let abi = landlock::LANDLOCK_ABI as u8; gum::warn!( target: LOG_TARGET, From a9b2dfd50fce73be46409eeb4d80e189642a0b2e Mon Sep 17 00:00:00 2001 From: Marcin S Date: Fri, 2 Jun 2023 08:06:58 -0400 Subject: [PATCH 12/17] Update node/core/pvf/src/host.rs Co-authored-by: Andrei Sandu <54316454+sandreim@users.noreply.github.com> --- node/core/pvf/src/host.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node/core/pvf/src/host.rs b/node/core/pvf/src/host.rs index 516aefabaa94..677983793d75 100644 --- a/node/core/pvf/src/host.rs +++ b/node/core/pvf/src/host.rs @@ -872,7 +872,7 @@ fn warn_if_no_landlock() { target: LOG_TARGET, ?status, %abi, - "Cannot fully enable landlock, a Linux kernel security feature. Running validation of PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." + "Cannot fully enable landlock, a Linux kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." ); } } From b9d8fc133fa18bece24f5017ad31e38be1716365 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Fri, 2 Jun 2023 08:07:08 -0400 Subject: [PATCH 13/17] Update node/core/pvf/src/host.rs Co-authored-by: Andrei Sandu <54316454+sandreim@users.noreply.github.com> --- node/core/pvf/src/host.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node/core/pvf/src/host.rs b/node/core/pvf/src/host.rs index 677983793d75..1a743d9d0959 100644 --- a/node/core/pvf/src/host.rs +++ b/node/core/pvf/src/host.rs @@ -879,7 +879,7 @@ fn warn_if_no_landlock() { #[cfg(not(target_os = "linux"))] gum::warn!( target: LOG_TARGET, - "Cannot enable landlock, a Linux kernel security feature. Running validation of PVF code has a higher risk of compromising this machine. Consider running on Linux with landlock support for maximum security." + "Cannot enable landlock, a Linux kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with landlock support for maximum security." ); } From e9b5c172b8ea0bf613dfacf07b4400230052871b Mon Sep 17 00:00:00 2001 From: Marcin S Date: Fri, 2 Jun 2023 08:56:08 -0400 Subject: [PATCH 14/17] Fix unused fn --- node/core/pvf/common/src/worker/security.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node/core/pvf/common/src/worker/security.rs b/node/core/pvf/common/src/worker/security.rs index cafa7d459124..a770a34408d0 100644 --- a/node/core/pvf/common/src/worker/security.rs +++ b/node/core/pvf/common/src/worker/security.rs @@ -136,7 +136,7 @@ pub mod landlock { #[test] fn restricted_thread_cannot_access_fs() { // TODO: This would be nice: . - if !is_fully_enabled() { + if !check_is_fully_enabled() { return } From 08d98e91effc22da10f5ae0fc088e3d0e213e8ff Mon Sep 17 00:00:00 2001 From: Marcin S Date: Fri, 2 Jun 2023 12:17:00 -0400 Subject: [PATCH 15/17] Update ABI docs to reflect latest discussions --- node/core/pvf/common/src/worker/security.rs | 28 +++++++++++---------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/node/core/pvf/common/src/worker/security.rs b/node/core/pvf/common/src/worker/security.rs index a770a34408d0..3cf67c3d57b1 100644 --- a/node/core/pvf/common/src/worker/security.rs +++ b/node/core/pvf/common/src/worker/security.rs @@ -54,31 +54,33 @@ impl LandlockStatus { pub mod landlock { use landlock::{Access, AccessFs, Ruleset, RulesetAttr, RulesetError, RulesetStatus, ABI}; - /// Landlock ABI version. Use the latest version supported by our reference kernel version. + /// Landlock ABI version. We use ABI V1 because: /// - /// # Versions (June 2023) + /// 1. It is supported by our reference kernel version. + /// 2. Later versions do not (yet) provide additional security. /// - /// - Reference kernel version: 5.16+ - /// - ABI V1: 5.13 - /// - ABI V2: 5.19 + /// # Versions (June 2023) /// - /// Please update these values if they are out-of-date, and bump the ABI version to the minimum - /// ABI supported by the reference kernel version. + /// - Polkadot reference kernel version: 5.16+ + /// - ABI V1: 5.13 - introduces landlock, including full restrictions on file reads + /// - ABI V2: 5.19 - adds ability to configure file renaming (not used by us) /// /// # Determinism /// - /// You may wonder whether we could always use the latest ABI instead of the ABI supported by - /// the reference kernel version. It seems plausible, since landlock provides a best-effort + /// You may wonder whether we could always use the latest ABI instead of only the ABI supported + /// by the reference kernel version. It seems plausible, since landlock provides a best-effort /// approach to enabling sandboxing. For example, if the reference version only supported V1 and /// we were on V2, then landlock would use V2 if it was supported on the current machine, and /// just fall back to V1 if not. /// /// The issue with this is indeterminacy. If half of validators were on V2 and half were on V1, /// they may have different semantics on some PVFs. So a malicious PVF now has a new attack - /// vector: they can exploit this indeterminism between landlock ABIs! But this is exactly the - /// kind of thing we want to prevent! So, we have to stick only to the latest ABI supported by - /// our reference kernel version (in this example ABI V1). And if a validator's machine does not - /// *fully* support it, we can't let them run as a secure validator. + /// vector: they can exploit this indeterminism between landlock ABIs! + /// + /// On the other hand we do want validators to be as secure as possible and protect their keys + /// from attackers. And, the risk with indeterminacy is low and there are other indeterminacy + /// vectors anyway. So we will only upgrade to a new ABI if either the reference kernel version + /// supports it or if it introduces some new feature that is beneficial to security. pub const LANDLOCK_ABI: ABI = ABI::V1; // TODO: From 2d72e31ba4a6c92b2b16ca62cf09d3fa5f985799 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Sun, 4 Jun 2023 12:41:00 -0400 Subject: [PATCH 16/17] Remove outdated notes --- node/core/pvf/common/src/worker/security.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/node/core/pvf/common/src/worker/security.rs b/node/core/pvf/common/src/worker/security.rs index 3cf67c3d57b1..5ba42915238c 100644 --- a/node/core/pvf/common/src/worker/security.rs +++ b/node/core/pvf/common/src/worker/security.rs @@ -96,8 +96,6 @@ pub mod landlock { /// Basaed on the given `status`, returns a single bool indicating whether the given landlock /// ABI is fully enabled on the current Linux environment. - /// - /// NOTE: Secure validators must be *fully* enabled. See "Determinism" in [`LANDLOCK_ABI`]. pub fn status_is_fully_enabled( status: &Result>, ) -> bool { @@ -106,8 +104,6 @@ pub mod landlock { /// Runs a check for landlock and returns a single bool indicating whether the given landlock /// ABI is fully enabled on the current Linux environment. - /// - /// NOTE: Secure validators must be *fully* enabled. See "Determinism" in [`LANDLOCK_ABI`]. pub fn check_is_fully_enabled() -> bool { status_is_fully_enabled(&get_status()) } From e079ce511a47c1fbf7d7112940644947375631c6 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Wed, 5 Jul 2023 17:41:53 +0200 Subject: [PATCH 17/17] Try to trigger new test-linux-oldkernel-stable job Job introduced in https://github.com/paritytech/polkadot/pull/7371. --- node/core/pvf/src/host.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/node/core/pvf/src/host.rs b/node/core/pvf/src/host.rs index bedfd308959c..3ca4ea43de1b 100644 --- a/node/core/pvf/src/host.rs +++ b/node/core/pvf/src/host.rs @@ -876,6 +876,7 @@ fn warn_if_no_landlock() { ); } } + #[cfg(not(target_os = "linux"))] gum::warn!( target: LOG_TARGET,