Skip to content
This repository has been archived by the owner on Nov 15, 2023. It is now read-only.

Candidate Validation/PVF: more fidelity of error metrics #6479

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion node/core/backing/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1192,7 +1192,7 @@ fn backing_works_after_failed_validation() {
tx,
)
) if pov == pov && c.descriptor() == candidate.descriptor() && timeout == BACKING_EXECUTION_TIMEOUT && c.commitments_hash == candidate.commitments.hash() => {
tx.send(Err(ValidationFailed("Internal test error".into()))).unwrap();
tx.send(Err(ValidationFailed::Other("Internal test error".into()))).unwrap();
}
);

Expand Down
4 changes: 1 addition & 3 deletions node/core/candidate-validation/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,11 @@ parity-scale-codec = { version = "3.1.5", default-features = false, features = [

polkadot-primitives = { path = "../../../primitives" }
polkadot-parachain = { path = "../../../parachain" }
polkadot-node-core-pvf = { path = "../pvf" }
polkadot-node-primitives = { path = "../../primitives" }
polkadot-node-subsystem = {path = "../../subsystem" }
polkadot-node-subsystem-util = { path = "../../subsystem-util" }

[target.'cfg(not(any(target_os = "android", target_os = "unknown")))'.dependencies]
polkadot-node-core-pvf = { path = "../pvf" }

[dev-dependencies]
sp-keyring = { git = "https://github.com/paritytech/substrate", branch = "master" }
futures = { version = "0.3.21", features = ["thread-pool"] }
Expand Down
54 changes: 34 additions & 20 deletions node/core/candidate-validation/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
#![warn(missing_docs)]

use polkadot_node_core_pvf::{
InvalidCandidate as WasmInvalidCandidate, PrepareError, Pvf, ValidationError, ValidationHost,
DeterministicError, InvalidCandidate as WasmInvalidCandidate, NonDeterministicError,
PrepareError, Pvf, ValidationError, ValidationHost,
};
use polkadot_node_primitives::{
BlockData, InvalidCandidate, PoV, ValidationResult, POV_BOMB_LIMIT, VALIDATION_CODE_BOMB_LIMIT,
Expand Down Expand Up @@ -321,7 +322,7 @@ where
match validation_backend.precheck_pvf(validation_code).await {
Ok(_) => PreCheckOutcome::Valid,
Err(prepare_err) =>
if prepare_err.is_deterministic() {
if matches!(prepare_err, PrepareError::Deterministic(_)) {
PreCheckOutcome::Invalid
} else {
PreCheckOutcome::Failed
Expand Down Expand Up @@ -433,7 +434,7 @@ where
Ok(None)
},
AssumptionCheckOutcome::BadRequest =>
Err(ValidationFailed("Assumption Check: Bad request".into())),
Err(ValidationFailed::Other("Assumption Check: Bad request".into())),
}
}

Expand Down Expand Up @@ -483,7 +484,7 @@ where
Ok(true) => {},
Ok(false) => return Ok(ValidationResult::Invalid(InvalidCandidate::InvalidOutputs)),
Err(RuntimeRequestFailed) =>
return Err(ValidationFailed("Check Validation Outputs: Bad request".into())),
return Err(ValidationFailed::Other("Check Validation Outputs: Bad request".into())),
}
}

Expand Down Expand Up @@ -560,18 +561,30 @@ async fn validate_candidate_exhaustive(
}

match result {
Err(ValidationError::InternalError(e)) => Err(ValidationFailed(e)),

Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::HardTimeout)) =>
Ok(ValidationResult::Invalid(InvalidCandidate::Timeout)),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::WorkerReportedError(e))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(e))),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(
"ambiguous worker death".to_string(),
))),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::PrepareError(e))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(e))),
// Internal errors.
Err(ValidationError::InternalPrepareError(e)) => Err(ValidationFailed::Prepare(e)),
Err(ValidationError::InternalExecuteError(e)) => Err(ValidationFailed::Execute(e)),
Err(ValidationError::InternalOtherError(e)) => Err(ValidationFailed::Other(e)),

// Invalid candidate errors.
Err(ValidationError::InvalidCandidate(e)) => match e {
// Execution errors.
WasmInvalidCandidate::HardTimeout =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionTimeout)),
WasmInvalidCandidate::WorkerReportedError(e) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(e))),
WasmInvalidCandidate::AmbiguousWorkerDeath =>
Ok(ValidationResult::Invalid(InvalidCandidate::AmbiguousWorkerDeath)),

// Prepare errors.
WasmInvalidCandidate::PrepareError(DeterministicError::Prevalidation(e)) =>
Ok(ValidationResult::Invalid(InvalidCandidate::PrevalidationError(e.to_string()))),
WasmInvalidCandidate::PrepareError(DeterministicError::Preparation(e)) =>
Ok(ValidationResult::Invalid(InvalidCandidate::CompilationError(e.to_string()))),
WasmInvalidCandidate::PrepareError(e) =>
// General preparation error.
Ok(ValidationResult::Invalid(InvalidCandidate::PreparationError(e.to_string()))),
},

Ok(res) =>
if res.head_data.hash() != candidate_receipt.descriptor.para_head {
Expand Down Expand Up @@ -654,24 +667,25 @@ impl ValidationBackend for ValidationHost {

let (tx, rx) = oneshot::channel();
if let Err(err) = self.execute_pvf(pvf, timeout, encoded_params, priority, tx).await {
return Err(ValidationError::InternalError(format!(
return Err(ValidationError::InternalOtherError(format!(
"cannot send pvf to the validation host: {:?}",
err
)))
}

rx.await
.map_err(|_| ValidationError::InternalError("validation was cancelled".into()))?
.map_err(|_| ValidationError::InternalOtherError("validation was cancelled".into()))?
}

async fn precheck_pvf(&mut self, pvf: Pvf) -> Result<Duration, PrepareError> {
let (tx, rx) = oneshot::channel();
if let Err(_) = self.precheck_pvf(pvf, tx).await {
// Return an IO error if there was an error communicating with the host.
return Err(PrepareError::IoErr)
return Err(PrepareError::NonDeterministic(NonDeterministicError::IoErr))
}

let precheck_result = rx.await.or(Err(PrepareError::IoErr))?;
let precheck_result =
rx.await.or(Err(PrepareError::NonDeterministic(NonDeterministicError::IoErr)))?;

precheck_result
}
Expand Down
27 changes: 16 additions & 11 deletions node/core/candidate-validation/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.

use super::{ValidationFailed, ValidationResult};
use super::{InvalidCandidate, ValidationFailed, ValidationResult};
use polkadot_node_subsystem_util::metrics::{self, prometheus};

#[derive(Clone)]
Expand All @@ -32,17 +32,22 @@ pub struct Metrics(Option<MetricsInner>);
impl Metrics {
pub fn on_validation_event(&self, event: &Result<ValidationResult, ValidationFailed>) {
if let Some(metrics) = &self.0 {
match event {
Ok(ValidationResult::Valid(_, _)) => {
metrics.validation_requests.with_label_values(&["valid"]).inc();
let label = match event {
Ok(ValidationResult::Valid(_, _)) => &["valid"],
Ok(ValidationResult::Invalid(err)) => match err {
InvalidCandidate::PrevalidationError(_) => &["invalid (prevalidation)"],
InvalidCandidate::CompilationError(_) => &["invalid (compilation)"],
InvalidCandidate::PreparationTimeout => &["invalid (preparation timeout)"],
InvalidCandidate::ExecutionTimeout => &["invalid (execution timeout)"],
InvalidCandidate::PreparationError(_) => &["invalid (misc preparation)"],
InvalidCandidate::ExecutionError(_) => &["invalid (misc execution)"],
_ => &["invalid (misc)"],
},
Ok(ValidationResult::Invalid(_)) => {
metrics.validation_requests.with_label_values(&["invalid"]).inc();
},
Err(_) => {
metrics.validation_requests.with_label_values(&["validation failure"]).inc();
},
}
Err(ValidationFailed::Prepare(_)) => &["internal failure (preparation)"],
Err(ValidationFailed::Execute(_)) => &["internal failure (execution)"],
Err(ValidationFailed::Other(_)) => &["internal failure (misc)"],
};
metrics.validation_requests.with_label_values(label).inc();
}
}

Expand Down
33 changes: 24 additions & 9 deletions node/core/candidate-validation/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use super::*;
use ::test_helpers::{dummy_hash, make_valid_candidate_descriptor};
use assert_matches::assert_matches;
use futures::executor;
use polkadot_node_core_pvf::PrepareError;
use polkadot_node_core_pvf::{DeterministicError, NonDeterministicError, PrepareError};
use polkadot_node_subsystem::messages::AllMessages;
use polkadot_node_subsystem_test_helpers as test_helpers;
use polkadot_node_subsystem_util::reexports::SubsystemContext;
Expand Down Expand Up @@ -491,7 +491,7 @@ fn candidate_validation_bad_return_is_invalid() {
))
.unwrap();

assert_matches!(v, ValidationResult::Invalid(InvalidCandidate::Timeout));
assert_matches!(v, ValidationResult::Invalid(InvalidCandidate::ExecutionTimeout));
}

#[test]
Expand Down Expand Up @@ -607,7 +607,7 @@ fn candidate_validation_multiple_ambiguous_errors_is_invalid() {
))
.unwrap();

assert_matches!(v, ValidationResult::Invalid(InvalidCandidate::ExecutionError(_)));
assert_matches!(v, ValidationResult::Invalid(InvalidCandidate::AmbiguousWorkerDeath));
}

#[test]
Expand Down Expand Up @@ -650,7 +650,7 @@ fn candidate_validation_timeout_is_internal_error() {
&Default::default(),
));

assert_matches!(v, Ok(ValidationResult::Invalid(InvalidCandidate::Timeout)));
assert_matches!(v, Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionTimeout)));
}

#[test]
Expand Down Expand Up @@ -1048,10 +1048,25 @@ fn precheck_properly_classifies_outcomes() {
executor::block_on(test_fut);
};

inner(Err(PrepareError::Prevalidation("foo".to_owned())), PreCheckOutcome::Invalid);
inner(Err(PrepareError::Preparation("bar".to_owned())), PreCheckOutcome::Invalid);
inner(Err(PrepareError::Panic("baz".to_owned())), PreCheckOutcome::Invalid);
inner(
Err(PrepareError::Deterministic(DeterministicError::Prevalidation("foo".to_owned()))),
PreCheckOutcome::Invalid,
);
inner(
Err(PrepareError::Deterministic(DeterministicError::Preparation("bar".to_owned()))),
PreCheckOutcome::Invalid,
);
inner(
Err(PrepareError::Deterministic(DeterministicError::Panic("baz".to_owned()))),
PreCheckOutcome::Invalid,
);

inner(Err(PrepareError::TimedOut), PreCheckOutcome::Failed);
inner(Err(PrepareError::IoErr), PreCheckOutcome::Failed);
inner(
Err(PrepareError::NonDeterministic(NonDeterministicError::TimedOut)),
PreCheckOutcome::Failed,
);
inner(
Err(PrepareError::NonDeterministic(NonDeterministicError::IoErr)),
PreCheckOutcome::Failed,
);
}
2 changes: 1 addition & 1 deletion node/core/dispute-coordinator/src/participation/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,7 @@ fn cast_invalid_vote_if_validation_fails_or_is_invalid() {
AllMessages::CandidateValidation(
CandidateValidationMessage::ValidateFromExhaustive(_, _, _, _, timeout, tx)
) if timeout == APPROVAL_EXECUTION_TIMEOUT => {
tx.send(Ok(ValidationResult::Invalid(InvalidCandidate::Timeout))).unwrap();
tx.send(Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionTimeout))).unwrap();
},
"overseer did not receive candidate validation message",
);
Expand Down
2 changes: 1 addition & 1 deletion node/core/pvf/src/artifacts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ impl Artifacts {
let now = SystemTime::now();

let mut to_remove = vec![];
for (k, v) in self.artifacts.iter() {
for (k, v) in &self.artifacts {
if let ArtifactState::Prepared { last_time_needed, .. } = *v {
if now
.duration_since(last_time_needed)
Expand Down
81 changes: 52 additions & 29 deletions node/core/pvf/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,50 @@ pub type PrepareResult = Result<Duration, PrepareError>;
/// An error that occurred during the prepare part of the PVF pipeline.
#[derive(Debug, Clone, Encode, Decode)]
pub enum PrepareError {
/// An error that should trigger reliably. See [`DeterministicError`].
Deterministic(DeterministicError),
/// An error that may happen spuriously. See [`NonDeterministicError`].
NonDeterministic(NonDeterministicError),
}
Comment on lines +27 to +31
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO prepare error being deterministic or not is a property that's not necessarily needed everywhere this enum is used.

pub fn is_deterministic(&self) -> bool {

looked much better to me.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These variants do end up being useful actually, there's several places where only "deterministic" (or not) errors are allowed, and having PrepareError in such places didn't quite seem correct.

If we pursue this further, I would probably rename to PrepareFailedError and PrepareInvalidError or something (not totally sure, haven't looked at this in a while).


impl fmt::Display for PrepareError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use PrepareError::*;
match self {
Deterministic(err) => write!(f, "{}", err),
NonDeterministic(err) => write!(f, "{}", err),
}
}
}

/// A deterministic error, i.e. one that should trigger reliably. Those errors depend on the PVF
/// itself and the sc-executor/wasmtime logic.
#[derive(Debug, Clone, Encode, Decode)]
pub enum DeterministicError {
/// During the prevalidation stage of preparation an issue was found with the PVF.
Prevalidation(String),
/// Compilation failed for the given PVF.
Preparation(String),
/// An unexpected panic has occured in the preparation worker.
Panic(String),
}

impl fmt::Display for DeterministicError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use DeterministicError::*;
match self {
Prevalidation(err) => write!(f, "prevalidation: {}", err),
Preparation(err) => write!(f, "preparation: {}", err),
Panic(err) => write!(f, "panic: {}", err),
}
}
}

/// Non-deterministic errors can happen spuriously. Typically, they occur due to resource
/// starvation, e.g. under heavy load or memory pressure. Those errors are typically transient but
/// may persist e.g. if the node is run by overwhelmingly underpowered machine.
#[derive(Debug, Clone, Encode, Decode)]
pub enum NonDeterministicError {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get the naming of deterministic and non deterministic. However, I think this could be improved. I don't have a better terminology at hand right now, but I think it could be better.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it's awkward. Maybe InternalPrepareError and InvalidPrepareError? IDK.

/// Failed to prepare the PVF due to the time limit.
TimedOut,
/// An IO error occurred while receiving the result from the worker process. This state is reported by the
Expand All @@ -43,29 +81,10 @@ pub enum PrepareError {
RenameTmpFileErr(String),
}

impl PrepareError {
/// Returns whether this is a deterministic error, i.e. one that should trigger reliably. Those
/// errors depend on the PVF itself and the sc-executor/wasmtime logic.
///
/// Non-deterministic errors can happen spuriously. Typically, they occur due to resource
/// starvation, e.g. under heavy load or memory pressure. Those errors are typically transient
/// but may persist e.g. if the node is run by overwhelmingly underpowered machine.
pub fn is_deterministic(&self) -> bool {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did you remove this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I figured that now we can just match on the Deterministic variant directly, so this function seemed extraneous.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah for sure, but calling this method is less code 🤣 But yeah, no strong argument here.

use PrepareError::*;
match self {
Prevalidation(_) | Preparation(_) | Panic(_) => true,
TimedOut | IoErr | CreateTmpFileErr(_) | RenameTmpFileErr(_) => false,
}
}
}

impl fmt::Display for PrepareError {
impl fmt::Display for NonDeterministicError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use PrepareError::*;
use NonDeterministicError::*;
match self {
Prevalidation(err) => write!(f, "prevalidation: {}", err),
Preparation(err) => write!(f, "preparation: {}", err),
Panic(err) => write!(f, "panic: {}", err),
TimedOut => write!(f, "prepare: timeout"),
IoErr => write!(f, "prepare: io error while receiving response"),
CreateTmpFileErr(err) => write!(f, "prepare: error creating tmp file: {}", err),
Expand All @@ -79,16 +98,20 @@ impl fmt::Display for PrepareError {
pub enum ValidationError {
/// The error was raised because the candidate is invalid.
InvalidCandidate(InvalidCandidate),
/// This error is raised due to inability to serve the request.
InternalError(String),
/// This error is raised due to inability to serve the request during preparation.
InternalPrepare(NonDeterministicError),
/// This error is raised due to inability to serve the request during execution.
InternalExecute(String),
/// This error is raised due to inability to serve the request for some other reason.
InternalOther(String),
}

/// A description of an error raised during executing a PVF and can be attributed to the combination
/// of the candidate [`polkadot_parachain::primitives::ValidationParams`] and the PVF.
#[derive(Debug, Clone)]
pub enum InvalidCandidate {
/// PVF preparation ended up with a deterministic error.
PrepareError(String),
PrepareError(DeterministicError),
/// The failure is reported by the execution worker. The string contains the error message.
WorkerReportedError(String),
/// The worker has died during validation of a candidate. That may fall in one of the following
Expand Down Expand Up @@ -123,12 +146,12 @@ impl From<PrepareError> for ValidationError {
// We treat the deterministic errors as `InvalidCandidate`. Should those occur they could
// potentially trigger disputes.
//
// All non-deterministic errors are qualified as `InternalError`s and will not trigger
// All non-deterministic errors are qualified as `InternalPrepareError`s and will not trigger
// disputes.
if error.is_deterministic() {
ValidationError::InvalidCandidate(InvalidCandidate::PrepareError(error.to_string()))
} else {
ValidationError::InternalError(error.to_string())
match error {
PrepareError::Deterministic(error) =>
ValidationError::InvalidCandidate(InvalidCandidate::PrepareError(error)),
PrepareError::NonDeterministic(error) => ValidationError::InternalPrepareError(error),
}
}
}
Expand Down
Loading