Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pvf thiserror #2958

Merged
merged 8 commits into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 21 additions & 51 deletions polkadot/node/core/pvf/common/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

use crate::prepare::{PrepareSuccess, PrepareWorkerSuccess};
use parity_scale_codec::{Decode, Encode};
use std::fmt;

/// Result of PVF preparation from a worker, with checksum of the compiled PVF and stats of the
/// preparation if successful.
Expand All @@ -32,35 +31,43 @@ pub type PrecheckResult = Result<(), PrepareError>;

/// An error that occurred during the prepare part of the PVF pipeline.
// Codec indexes are intended to stabilize pre-encoded payloads (see `OOM_PAYLOAD`)
#[derive(Debug, Clone, Encode, Decode)]
#[derive(thiserror::Error, Debug, Clone, Encode, Decode)]
pub enum PrepareError {
/// During the prevalidation stage of preparation an issue was found with the PVF.
#[codec(index = 0)]
#[error("prepare: prevalidation error: {0}")]
Prevalidation(String),
/// Compilation failed for the given PVF.
#[codec(index = 1)]
#[error("prepare: preparation error: {0}")]
Preparation(String),
/// Instantiation of the WASM module instance failed.
#[codec(index = 2)]
#[error("prepare: runtime construction: {0}")]
RuntimeConstruction(String),
/// An unexpected error has occurred in the preparation job.
#[codec(index = 3)]
#[error("prepare: job error: {0}")]
JobError(String),
/// Failed to prepare the PVF due to the time limit.
#[codec(index = 4)]
#[error("prepare: timeout")]
TimedOut,
/// An IO error occurred. This state is reported by either the validation host or by the
/// worker.
#[codec(index = 5)]
#[error("prepare: io error while receiving response: {0}")]
IoErr(String),
/// The temporary file for the artifact could not be created at the given cache path. This
/// state is reported by the validation host (not by the worker).
#[codec(index = 6)]
#[error("prepare: error creating tmp file: {0}")]
CreateTmpFile(String),
/// The response from the worker is received, but the file cannot be renamed (moved) to the
/// final destination location. This state is reported by the validation host (not by the
/// worker).
#[codec(index = 7)]
#[error("prepare: error renaming tmp file ({src:?} -> {dest:?}): {err}")]
RenameTmpFile {
err: String,
// Unfortunately `PathBuf` doesn't implement `Encode`/`Decode`, so we do a fallible
Expand All @@ -70,17 +77,21 @@ pub enum PrepareError {
},
/// Memory limit reached
#[codec(index = 8)]
#[error("prepare: out of memory")]
OutOfMemory,
/// The response from the worker is received, but the worker cache could not be cleared. The
/// worker has to be killed to avoid jobs having access to data from other jobs. This state is
/// reported by the validation host (not by the worker).
#[codec(index = 9)]
#[error("prepare: error clearing worker cache: {0}")]
ClearWorkerDir(String),
/// The preparation job process died, due to OOM, a seccomp violation, or some other factor.
JobDied { err: String, job_pid: i32 },
#[codec(index = 10)]
#[error("prepare: prepare job with pid {job_pid} died: {err}")]
JobDied { err: String, job_pid: i32 },
/// Some error occurred when interfacing with the kernel.
#[codec(index = 11)]
#[error("prepare: error interfacing with the kernel: {0}")]
Kernel(String),
}

Expand Down Expand Up @@ -109,74 +120,33 @@ impl PrepareError {
}
}

impl fmt::Display for PrepareError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use PrepareError::*;
match self {
Prevalidation(err) => write!(f, "prevalidation: {}", err),
Preparation(err) => write!(f, "preparation: {}", err),
RuntimeConstruction(err) => write!(f, "runtime construction: {}", err),
JobError(err) => write!(f, "panic: {}", err),
TimedOut => write!(f, "prepare: timeout"),
IoErr(err) => write!(f, "prepare: io error while receiving response: {}", err),
JobDied { err, job_pid } =>
write!(f, "prepare: prepare job with pid {job_pid} died: {err}"),
CreateTmpFile(err) => write!(f, "prepare: error creating tmp file: {}", err),
RenameTmpFile { err, src, dest } =>
write!(f, "prepare: error renaming tmp file ({:?} -> {:?}): {}", src, dest, err),
OutOfMemory => write!(f, "prepare: out of memory"),
ClearWorkerDir(err) => write!(f, "prepare: error clearing worker cache: {}", err),
Kernel(err) => write!(f, "prepare: error interfacing with the kernel: {}", err),
}
}
}

/// Some internal error occurred.
///
/// Should only ever be used for validation errors independent of the candidate and PVF, or for
/// errors we ruled out during pre-checking (so preparation errors are fine).
#[derive(Debug, Clone, Encode, Decode)]
#[derive(thiserror::Error, Debug, Clone, Encode, Decode)]
pub enum InternalValidationError {
/// Some communication error occurred with the host.
#[error("validation: some communication error occurred with the host: {0}")]
HostCommunication(String),
/// Host could not create a hard link to the artifact path.
#[error("validation: host could not create a hard link to the artifact path: {0}")]
CouldNotCreateLink(String),
/// Could not find or open compiled artifact file.
#[error("validation: could not find or open compiled artifact file: {0}")]
CouldNotOpenFile(String),
/// Host could not clear the worker cache after a job.
#[error("validation: host could not clear the worker cache ({path:?}) after a job: {err}")]
CouldNotClearWorkerDir {
err: String,
// Unfortunately `PathBuf` doesn't implement `Encode`/`Decode`, so we do a fallible
// conversion to `Option<String>`.
path: Option<String>,
},
/// Some error occurred when interfacing with the kernel.
#[error("validation: error interfacing with the kernel: {0}")]
Kernel(String),

/// Some non-deterministic preparation error occurred.
#[error("validation: prepare: {0}")]
NonDeterministicPrepareError(PrepareError),
}

impl fmt::Display for InternalValidationError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use InternalValidationError::*;
match self {
HostCommunication(err) =>
write!(f, "validation: some communication error occurred with the host: {}", err),
CouldNotCreateLink(err) => write!(
f,
"validation: host could not create a hard link to the artifact path: {}",
err
),
CouldNotOpenFile(err) =>
write!(f, "validation: could not find or open compiled artifact file: {}", err),
CouldNotClearWorkerDir { err, path } => write!(
f,
"validation: host could not clear the worker cache ({:?}) after a job: {}",
path, err
),
Kernel(err) => write!(f, "validation: error interfacing with the kernel: {}", err),
NonDeterministicPrepareError(err) => write!(f, "validation: prepare: {}", err),
}
}
}
2 changes: 1 addition & 1 deletion polkadot/node/core/pvf/execute-worker/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,7 @@ fn recv_child_response(received_data: &mut io::BufReader<&[u8]>) -> io::Result<J
JobResult::decode(&mut response_bytes.as_slice()).map_err(|e| {
io::Error::new(
io::ErrorKind::Other,
format!("execute pvf recv_child_response: decode error: {:?}", e),
format!("execute pvf recv_child_response: decode error: {}", e),
)
})
}
Expand Down
27 changes: 15 additions & 12 deletions polkadot/node/core/pvf/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
use polkadot_node_core_pvf_common::error::{InternalValidationError, PrepareError};

/// A error raised during validation of the candidate.
#[derive(Debug, Clone)]
#[derive(thiserror::Error, Debug, Clone)]
pub enum ValidationError {
/// Deterministic preparation issue. In practice, most of the problems should be caught by
/// prechecking, so this may be a sign of internal conditions.
Expand All @@ -27,35 +27,42 @@ pub enum ValidationError {
/// pre-checking enabled only valid runtimes should ever get enacted, so we can be
/// reasonably sure that this is some local problem on the current node. However, as this
/// particular error *seems* to indicate a deterministic error, we raise a warning.
#[error("candidate validation: {0}")]
Preparation(PrepareError),
/// The error was raised because the candidate is invalid. Should vote against.
Invalid(InvalidCandidate),
#[error("candidate validation: {0}")]
Invalid(#[from] InvalidCandidate),
/// Possibly transient issue that may resolve after retries. Should vote against when retries
/// fail.
PossiblyInvalid(PossiblyInvalidError),
#[error("candidate validation: {0}")]
PossiblyInvalid(#[from] PossiblyInvalidError),
/// Preparation or execution issue caused by an internal condition. Should not vote against.
Internal(InternalValidationError),
#[error("candidate validation: internal: {0}")]
Internal(#[from] InternalValidationError),
}

/// A description of an error raised during executing a PVF and can be attributed to the combination
/// of the candidate [`polkadot_parachain_primitives::primitives::ValidationParams`] and the PVF.
#[derive(Debug, Clone)]
#[derive(thiserror::Error, Debug, Clone)]
pub enum InvalidCandidate {
/// The candidate is reported to be invalid by the execution worker. The string contains the
/// error message.
#[error("invalid: worker reported: {0}")]
WorkerReportedInvalid(String),
/// PVF execution (compilation is not included) took more time than was allotted.
#[error("invalid: hard timeout")]
HardTimeout,
}

/// Possibly transient issue that may resolve after retries.
#[derive(Debug, Clone)]
#[derive(thiserror::Error, Debug, Clone)]
pub enum PossiblyInvalidError {
/// The worker process (not the job) has died during validation of a candidate.
///
/// It's unlikely that this is caused by malicious code since workers spawn separate job
/// processes, and those job processes are sandboxed. But, it is possible. We retry in this
/// case, and if the error persists, we assume it's caused by the candidate and vote against.
#[error("possibly invalid: ambiguous worker death")]
AmbiguousWorkerDeath,
/// The job process (not the worker) has died for one of the following reasons:
///
Expand All @@ -69,22 +76,18 @@ pub enum PossiblyInvalidError {
/// (c) Some other reason, perhaps transient or perhaps caused by malicious code.
///
/// We cannot treat this as an internal error because malicious code may have caused this.
#[error("possibly invalid: ambiguous job death: {0}")]
AmbiguousJobDeath(String),
/// An unexpected error occurred in the job process and we can't be sure whether the candidate
/// is really invalid or some internal glitch occurred. Whenever we are unsure, we can never
/// treat an error as internal as we would abstain from voting. This is bad because if the
/// issue was due to the candidate, then all validators would abstain, stalling finality on the
/// chain. So we will first retry the candidate, and if the issue persists we are forced to
/// vote invalid.
#[error("possibly invalid: job error: {0}")]
JobError(String),
}

impl From<InternalValidationError> for ValidationError {
fn from(error: InternalValidationError) -> Self {
Self::Internal(error)
}
}

impl From<PrepareError> for ValidationError {
fn from(error: PrepareError) -> Self {
// Here we need to classify the errors into two errors: deterministic and non-deterministic.
Expand Down
2 changes: 1 addition & 1 deletion polkadot/node/core/pvf/src/execute/queue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ fn handle_job_finish(
?artifact_id,
?worker,
worker_rip = idle_worker.is_none(),
"execution worker concluded, error occurred: {:?}",
"execution worker concluded, error occurred: {}",
err
);
} else {
Expand Down
3 changes: 2 additions & 1 deletion polkadot/node/core/pvf/src/host.rs
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,8 @@ async fn handle_precheck_pvf(
///
/// If the prepare job failed previously, we may retry it under certain conditions.
///
/// When preparing for execution, we use a more lenient timeout ([`LENIENT_PREPARATION_TIMEOUT`])
/// When preparing for execution, we use a more lenient timeout
/// ([`DEFAULT_LENIENT_PREPARATION_TIMEOUT`](polkadot_primitives::executor_params::DEFAULT_LENIENT_PREPARATION_TIMEOUT))
/// than when prechecking.
async fn handle_execute_pvf(
artifacts: &mut Artifacts,
Expand Down
4 changes: 2 additions & 2 deletions polkadot/node/core/pvf/src/prepare/queue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ pub struct FromQueue {
/// Identifier of an artifact.
pub(crate) artifact_id: ArtifactId,
/// Outcome of the PVF processing. [`Ok`] indicates that compiled artifact
/// is successfully stored on disk. Otherwise, an [error](crate::error::PrepareError)
/// is supplied.
/// is successfully stored on disk. Otherwise, an
/// [error](polkadot_node_core_pvf_common::error::PrepareError) is supplied.
pub(crate) result: PrepareResult,
}

Expand Down
2 changes: 1 addition & 1 deletion polkadot/node/core/pvf/src/prepare/worker_interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ pub async fn start_work(
gum::warn!(
target: LOG_TARGET,
worker_pid = %pid,
"failed to recv a prepare response: {:?}",
"failed to recv a prepare response: {}",
err,
);
Outcome::IoErr(err.to_string())
Expand Down