This repository has been archived by the owner on Nov 15, 2023. It is now read-only.

PVF: Don't dispute on missing artifact #7011

Merged: 7 commits, merged Apr 20, 2023
Changes from 3 commits
50 changes: 33 additions & 17 deletions node/core/candidate-validation/src/lib.rs
@@ -691,38 +691,54 @@ trait ValidationBackend {

Contributor:

Updating the candidate-validation tests wouldn't hurt.

 /// Tries executing a PVF. Will retry once if an error is encountered that may have been
 /// transient.
 ///
 /// NOTE: Should retry only on errors that are a result of execution itself, and not of
 /// preparation.
 async fn validate_candidate_with_retry(
     &mut self,
     raw_validation_code: Vec<u8>,
     exec_timeout: Duration,
     params: ValidationParams,
     executor_params: ExecutorParams,
 ) -> Result<WasmValidationResult, ValidationError> {
-    // Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap.
     let prep_timeout = pvf_prep_timeout(&executor_params, PvfPrepTimeoutKind::Lenient);
+    // Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap.
     let pvf = PvfPrepData::from_code(raw_validation_code, executor_params, prep_timeout);

     let mut validation_result =
         self.validate_candidate(pvf.clone(), exec_timeout, params.encode()).await;

-    // If we get an AmbiguousWorkerDeath error, retry once after a brief delay, on the
-    // assumption that the conditions that caused this error may have been transient. Note that
-    // this error is only a result of execution itself and not of preparation.
-    if let Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)) =
-        validation_result
-    {
-        // Wait a brief delay before retrying.
-        futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await;
+    // Allow one retry for each kind of error.
+    let mut num_internal_retries_left = 1;
Contributor Author:

Could make this higher, since this kind of error is probably the most likely to be transient.

+    let mut num_awd_retries_left = 1;
+    loop {
+        match validation_result {
+            Err(ValidationError::InvalidCandidate(
+                WasmInvalidCandidate::AmbiguousWorkerDeath,
+            )) if num_awd_retries_left > 0 => num_awd_retries_left -= 1,
+            Err(ValidationError::InternalError(_)) if num_internal_retries_left > 0 =>
+                num_internal_retries_left -= 1,
+            _ => break,
+        }

+        // If we got a possibly transient error, retry once after a brief delay, on the assumption
+        // that the conditions that caused this error may have resolved on their own.
+        {
+            // Wait a brief delay before retrying.
+            futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await;

-        gum::warn!(
-            target: LOG_TARGET,
-            ?pvf,
-            "Re-trying failed candidate validation due to AmbiguousWorkerDeath."
-        );
+            gum::warn!(
+                target: LOG_TARGET,
+                ?pvf,
+                "Re-trying failed candidate validation due to possible transient error: {:?}",
+                validation_result
+            );

-        // Encode the params again when re-trying. We expect the retry case to be relatively
-        // rare, and we want to avoid unconditionally cloning data.
-        validation_result = self.validate_candidate(pvf, exec_timeout, params.encode()).await;
+            // Encode the params again when re-trying. We expect the retry case to be relatively
+            // rare, and we want to avoid unconditionally cloning data.
+            validation_result =
+                self.validate_candidate(pvf.clone(), exec_timeout, params.encode()).await;
+        }
+    }

     validation_result
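The new retry logic can be distilled into a standalone sketch: one retry budget per possibly-transient error kind, with deterministic errors never retried. The `Error` enum and the `attempt` closure below are simplified stand-ins for `ValidationError` and `validate_candidate`, and the `PVF_EXECUTION_RETRY_DELAY` wait is omitted.

```rust
// Minimal sketch of the retry loop above, with hypothetical stand-in types.
#[derive(Debug, Clone, PartialEq)]
enum Error {
    AmbiguousWorkerDeath, // execution failure that may be transient
    Internal(String),     // internal failure that may be transient
    InvalidCandidate,     // deterministic: retrying would not help
}

fn validate_with_retry(mut attempt: impl FnMut() -> Result<u32, Error>) -> Result<u32, Error> {
    let mut result = attempt();
    // Allow one retry for each kind of possibly-transient error.
    let mut awd_retries_left = 1;
    let mut internal_retries_left = 1;
    loop {
        match &result {
            Err(Error::AmbiguousWorkerDeath) if awd_retries_left > 0 => awd_retries_left -= 1,
            Err(Error::Internal(_)) if internal_retries_left > 0 => internal_retries_left -= 1,
            _ => break, // success, deterministic error, or retries exhausted
        }
        result = attempt();
    }
    result
}

fn main() {
    // One transient failure, then success: the retry recovers it.
    let mut calls = 0;
    let result = validate_with_retry(|| {
        calls += 1;
        if calls == 1 { Err(Error::AmbiguousWorkerDeath) } else { Ok(42) }
    });
    assert_eq!(result, Ok(42));
    assert_eq!(calls, 2);
    println!("ok");
}
```

The point of separate counters is that an `AmbiguousWorkerDeath` retry does not consume the internal-error retry budget, and vice versa.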
2 changes: 1 addition & 1 deletion node/core/pvf/src/execute/mod.rs
@@ -23,5 +23,5 @@
 mod queue;
 mod worker;

-pub use queue::{start, ToQueue};
+pub use queue::{start, PendingExecutionRequest, ToQueue};
 pub use worker::{worker_entrypoint, Response as ExecuteResponse};
22 changes: 14 additions & 8 deletions node/core/pvf/src/execute/queue.rs
@@ -50,13 +50,17 @@ slotmap::new_key_type! { struct Worker; }

 #[derive(Debug)]
 pub enum ToQueue {
-    Enqueue {
-        artifact: ArtifactPathId,
-        exec_timeout: Duration,
-        params: Vec<u8>,
-        executor_params: ExecutorParams,
-        result_tx: ResultSender,
-    },
+    Enqueue { artifact: ArtifactPathId, pending_execution_request: PendingExecutionRequest },
 }

+/// An execution request that should execute the PVF (known in the context) and send the results
+/// to the given result sender.
+#[derive(Debug)]
+pub struct PendingExecutionRequest {
+    pub exec_timeout: Duration,
+    pub params: Vec<u8>,
+    pub executor_params: ExecutorParams,
+    pub result_tx: ResultSender,
+}

 struct ExecuteJob {
@@ -259,7 +263,9 @@ async fn purge_dead(metrics: &Metrics, workers: &mut Workers) {
}

 fn handle_to_queue(queue: &mut Queue, to_queue: ToQueue) {
-    let ToQueue::Enqueue { artifact, exec_timeout, params, executor_params, result_tx } = to_queue;
+    let ToQueue::Enqueue { artifact, pending_execution_request } = to_queue;
+    let PendingExecutionRequest { exec_timeout, params, executor_params, result_tx } =
+        pending_execution_request;
     gum::debug!(
         target: LOG_TARGET,
         validation_code_hash = ?artifact.id.code_hash,
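The queue refactor can be shown in isolation: the variant's payload fields move into a named struct, and the handler destructures in two steps. This is a minimal sketch with placeholder types (`String` standing in for `ArtifactPathId`; `ExecutorParams` and `ResultSender` omitted), not the crate's actual definitions.

```rust
// Sketch of bundling an enum variant's fields into a named struct so the
// request can be passed around and destructured as a unit.
use std::time::Duration;

#[derive(Debug)]
struct PendingExecutionRequest {
    exec_timeout: Duration,
    params: Vec<u8>,
}

#[derive(Debug)]
enum ToQueue {
    Enqueue { artifact: String, pending_execution_request: PendingExecutionRequest },
}

fn handle_to_queue(to_queue: ToQueue) -> (String, Duration, usize) {
    // Destructure in two steps, as handle_to_queue does in the diff.
    let ToQueue::Enqueue { artifact, pending_execution_request } = to_queue;
    let PendingExecutionRequest { exec_timeout, params } = pending_execution_request;
    (artifact, exec_timeout, params.len())
}

fn main() {
    let msg = ToQueue::Enqueue {
        artifact: "0xabc".to_string(),
        pending_execution_request: PendingExecutionRequest {
            exec_timeout: Duration::from_secs(2),
            params: vec![1, 2, 3],
        },
    };
    let (artifact, timeout, n_params) = handle_to_queue(msg);
    assert_eq!(artifact, "0xabc");
    assert_eq!(timeout, Duration::from_secs(2));
    assert_eq!(n_params, 3);
    println!("ok");
}
```

Grouping the fields means callers elsewhere (for example, code that defers a request until an artifact is ready) can hold one `PendingExecutionRequest` value instead of four loose fields.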
15 changes: 14 additions & 1 deletion node/core/pvf/src/execute/worker.rs
@@ -261,6 +261,13 @@ impl Response {
             Self::InvalidCandidate(format!("{}: {}", ctx, msg))
         }
     }
+    fn format_internal(ctx: &'static str, msg: &str) -> Self {
+        if msg.is_empty() {
+            Self::InternalError(ctx.to_string())
+        } else {
+            Self::InternalError(format!("{}: {}", ctx, msg))
+        }
+    }
 }
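A self-contained sketch of the two formatting helpers, using a pared-down `Response` enum (the real one has more variants):

```rust
// Simplified Response with only the two variants the helpers produce.
#[derive(Debug, PartialEq)]
enum Response {
    InvalidCandidate(String),
    InternalError(String),
}

impl Response {
    // Existing helper: attributes the failure to the candidate.
    fn format_invalid(ctx: &'static str, msg: &str) -> Self {
        if msg.is_empty() {
            Self::InvalidCandidate(ctx.to_string())
        } else {
            Self::InvalidCandidate(format!("{}: {}", ctx, msg))
        }
    }
    // New helper from this diff: same formatting, internal-error variant.
    fn format_internal(ctx: &'static str, msg: &str) -> Self {
        if msg.is_empty() {
            Self::InternalError(ctx.to_string())
        } else {
            Self::InternalError(format!("{}: {}", ctx, msg))
        }
    }
}

fn main() {
    assert_eq!(
        Response::format_internal("execute: missing file", "No such file or directory"),
        Response::InternalError("execute: missing file: No such file or directory".to_string())
    );
    // An empty message yields just the context string.
    assert_eq!(
        Response::format_internal("execute: missing file", ""),
        Response::InternalError("execute: missing file".to_string())
    );
    println!("ok");
}
```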

/// The entrypoint that the spawned execute worker should start with. The `socket_path` specifies
@@ -359,7 +366,13 @@ fn validate_using_artifact(
     // [`executor_intf::prepare`].
     executor.execute(artifact_path.as_ref(), params)
 } {
-    Err(err) => return Response::format_invalid("execute", &err),
+    Err(err) =>
+        return if err.contains("failed to open file: No such file or directory") {
Member:

This really requires a refactor changing the error type to something sensible, like an enum. Matching on a string is far too error-prone: something could change the message, localize it, and so on.

Contributor:

This very likely comes from Substrate, which is full of string errors. I agree that matching against strings is a no-go, but otherwise we'd have to halt the PR.

Contributor:

> This really requires a refactor changing the error type to something sensible, like an enum

Voted up; I've already raised that concern somewhere... Many errors coming from Substrate are not sensible at all. I also agree that string matching does no good.

@mrcnski, a (probably stupid) idea: until we have an enum error from Substrate, would it be better not to rely on its string errors, but to check for the file's existence ourselves? It should be simple enough. Of course, it introduces a race condition, but that's still better than parsing strings. Besides, nobody guarantees that the file persists between the moment it is opened and the moment it is read, so that kind of race condition already exists anyway.
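The suggestion above could look roughly like this: classify the failure by checking the artifact path directly rather than parsing the error string. The function name and return values here are hypothetical, purely for illustration.

```rust
// Hypothetical sketch: decide whether an execution error is node-side
// (missing artifact) or attributable to the candidate, without string matching.
use std::path::Path;

fn classify_execute_error(artifact_path: &Path) -> &'static str {
    if !artifact_path.exists() {
        // Artifact vanished from disk: a node-side problem, not the candidate's fault.
        "internal: missing artifact"
    } else {
        "invalid candidate"
    }
}

fn main() {
    assert_eq!(
        classify_execute_error(Path::new("/definitely/not/a/real/artifact.bin")),
        "internal: missing artifact"
    );
    // The current directory exists, so the error stays attributed to the candidate.
    assert_eq!(classify_execute_error(Path::new(".")), "invalid candidate");
    println!("ok");
}
```

As the comment notes, the check is racy (the file can disappear between the open and the check), but the same race already exists between opening and reading the artifact.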

Contributor Author:

Yeah, it comes from Substrate. Didn't think about localization. 😬 I considered just treating RuntimeConstruction itself as an internal error, but it seems it's also used for a case where wasm runs out of memory, which would be a problem with the PVF itself (link).

Checking for the file existence seems sensible to me...

Member:

We agreed that problems with the PVF itself are also no reason to raise a dispute, since we have pre-checking enabled. Basically, any error that is independent of the candidate at hand should not be cause for a dispute.

Member:

Let's in any case create a ticket for fixing those string errors - or at least the one in question right now.

Contributor Author:

Looks like "output exceeds bounds of wasm memory" should not be an issue with runtime construction, but is actually indicative of a malicious PVF. And I think this would get past pre-checking, since that just compiles and doesn't execute.

https://github.com/paritytech/substrate/blob/master/client/executor/wasmtime/src/runtime.rs#L772-L780

// Do a length check before allocating. The returned output should not be bigger than the
// available WASM memory. Otherwise, a malicious parachain can trigger a large allocation,
// potentially causing memory exhaustion.
//
// Get the size of the WASM memory in bytes.
let memory_size = ctx.as_context().data().memory().data_size(ctx);
if checked_range(output_ptr as usize, output_len as usize, memory_size).is_none() {
    Err(WasmError::Other("output exceeds bounds of wasm memory".into()))?
}

(I used WasmError::Other here to match the other errors in the file, without realizing it gets converted to RuntimeConstruction. 🤷‍♂️)

Anyway, basically, this one "output exceeds bounds of wasm memory" case is deterministic and we should definitely vote against. If we gave it a new separate enum in Substrate, then we could treat the existing RuntimeConstruction as an internal error.
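The quoted bounds check relies on a `checked_range` helper; a plausible implementation (signature inferred from the snippet, not copied from Substrate) rejects regions that overflow or extend past linear memory:

```rust
// Plausible sketch of the checked_range helper used in the quoted snippet:
// returns Some(range) only if [offset, offset + len) fits within `max` bytes.
fn checked_range(offset: usize, len: usize, max: usize) -> Option<std::ops::Range<usize>> {
    // checked_add guards against offset + len wrapping around.
    let end = offset.checked_add(len)?;
    if end <= max {
        Some(offset..end)
    } else {
        None
    }
}

fn main() {
    let memory_size = 64 * 1024; // one 64 KiB wasm page
    // An in-bounds output region is accepted.
    assert_eq!(checked_range(0, 1024, memory_size), Some(0..1024));
    // A region past the end of memory is rejected, triggering the WasmError above.
    assert_eq!(checked_range(memory_size, 1, memory_size), None);
    // An overflowing offset is also rejected rather than wrapping.
    assert_eq!(checked_range(usize::MAX, 2, memory_size), None);
    println!("ok");
}
```

This is what makes the "output exceeds bounds of wasm memory" error deterministic: it depends only on the candidate's declared output region versus the memory size, not on node-local conditions.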

Contributor Author:

I raised a small fix for the output-bounds case here, but I'm still not confident that RuntimeConstruction is always a transient error, so I don't think we should rely on it. Unless we treat it as a new "possibly transient" case, meaning we retry and dispute if it happens again. 🤷‍♂️

For the file-not-found case, there is not a clear way to fix the error story on the Substrate side. Just having another check here for file existence should be enough.

Member:

I am fine with the check, but please reference the Substrate issue in a comment. That way, readers understand why we did it this way, and we can reevaluate once the issue is fixed.

+            // Raise an internal error if the file is missing.
+            Response::format_internal("execute: missing file", &err)
+        } else {
+            Response::format_invalid("execute", &err)
+        },
     Ok(d) => d,
 };
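The classification this hunk introduces reduces to a small predicate, sketched here standalone. The `classify` function and its return strings are illustrative; the real code returns `Response` values via `format_internal` / `format_invalid`, and the string match is the stopgap discussed in the thread above.

```rust
// Standalone sketch of the classification added in this hunk: a missing-artifact
// error becomes an internal error (no dispute), anything else stays an
// invalid-candidate verdict.
fn classify(err: &str) -> &'static str {
    // Stopgap: matching on Substrate's error string until a structured error exists.
    if err.contains("failed to open file: No such file or directory") {
        "internal"
    } else {
        "invalid"
    }
}

fn main() {
    assert_eq!(classify("failed to open file: No such file or directory"), "internal");
    assert_eq!(classify("wasm trap: unreachable"), "invalid");
    println!("ok");
}
```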
