diff --git a/beacon_node/beacon_chain/src/data_availability_checker/error.rs b/beacon_node/beacon_chain/src/data_availability_checker/error.rs
index dbfa00e6e2..cfdb3cfe91 100644
--- a/beacon_node/beacon_chain/src/data_availability_checker/error.rs
+++ b/beacon_node/beacon_chain/src/data_availability_checker/error.rs
@@ -10,7 +10,6 @@ pub enum Error {
         blob_commitment: KzgCommitment,
         block_commitment: KzgCommitment,
     },
-    UnableToDetermineImportRequirement,
     Unexpected,
     SszTypes(ssz_types::Error),
     MissingBlobs,
@@ -44,7 +43,6 @@ impl Error {
             | Error::Unexpected
             | Error::ParentStateMissing(_)
             | Error::BlockReplayError(_)
-            | Error::UnableToDetermineImportRequirement
             | Error::RebuildingStateCaches(_)
             | Error::SlotClockError => ErrorCategory::Internal,
             Error::InvalidBlobs { .. }
diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs
index 82d06c20f8..b7ab8021af 100644
--- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs
+++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs
@@ -1,5 +1,5 @@
 use crate::metrics;
-use crate::network_beacon_processor::{NetworkBeaconProcessor, FUTURE_SLOT_TOLERANCE};
+use crate::network_beacon_processor::NetworkBeaconProcessor;
 use crate::sync::BatchProcessResult;
 use crate::sync::{
     manager::{BlockProcessType, SyncMessage},
@@ -9,6 +9,7 @@ use beacon_chain::block_verification_types::{AsBlock, RpcBlock};
 use beacon_chain::data_availability_checker::AvailabilityCheckError;
 use beacon_chain::data_availability_checker::MaybeAvailableBlock;
 use beacon_chain::data_column_verification::verify_kzg_for_data_column_list;
+use beacon_chain::ExecutionPayloadError;
 use beacon_chain::{
     validator_monitor::get_slot_delay_ms, AvailabilityProcessingStatus, BeaconChainError,
     BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError, NotifyExecutionLayer,
@@ -44,6 +45,45 @@ struct ChainSegmentFailed {
     peer_action: Option<PeerAction>,
 }
 
+#[derive(Debug, PartialEq, Eq)]
+pub enum ErrorCategory {
+    /// Internal errors (not caused by peers).
+    ///
+    /// An internal no-retry error is permanent and block processing should not be
+    /// re-attempted.
+    Internal { retry: bool },
+    /// Errors caused by faulty / malicious peers.
+    ///
+    /// No-retry errors are deterministic against the block root: re-downloading data
+    /// keyed by block root MUST result in the same no-retry error (e.g. invalid parent,
+    /// invalid state root, etc.).
+    ///
+    /// Where applicable, the error also indicates which block component index is malicious.
+    Malicious { retry: bool, index: usize },
+}
+
+impl ErrorCategory {
+    // Helper functions for readability in large match statements.
+    pub fn internal_no_retry() -> Self {
+        Self::Internal { retry: false }
+    }
+    pub fn internal_retry() -> Self {
+        Self::Internal { retry: true }
+    }
+    pub fn malicious_no_retry() -> Self {
+        Self::Malicious {
+            retry: false,
+            index: 0,
+        }
+    }
+    pub fn malicious_retry() -> Self {
+        Self::Malicious {
+            retry: true,
+            index: 0,
+        }
+    }
+}
+
 impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
     /// Returns an async closure which processes a beacon block received via RPC.
    ///
@@ -709,130 +749,109 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
     /// Helper function to handle a `BlockError` from `process_chain_segment`
     fn handle_failed_chain_segment(&self, error: BlockError) -> Result<(), ChainSegmentFailed> {
-        match error {
-            BlockError::ParentUnknown { parent_root, .. } => {
-                // blocks should be sequential and all parents should exist
-                Err(ChainSegmentFailed {
-                    message: format!("Block has an unknown parent: {}", parent_root),
-                    // Peers are faulty if they send non-sequential blocks.
-                    peer_action: Some(PeerAction::LowToleranceError),
-                })
+        let result = match &error {
+            // Blocks should be sequential and all parents should exist.
+            BlockError::ParentUnknown { .. } => Err(ErrorCategory::malicious_retry()),
+            // A peer may craft a block that is at a future slot. It's possible that the
+            // slot will eventually no longer be in the future, but since serving a block
+            // from a future slot over RPC is a malicious action, we will not retry.
+            BlockError::FutureSlot { .. } => Err(ErrorCategory::malicious_no_retry()),
+
+            // Okay to reprocess blocks that are finalized.
+            BlockError::GenesisBlock | BlockError::WouldRevertFinalizedSlot { .. } => Ok(()),
+
+            // Since we request blocks by slot, a retry may return a different (valid) block; retry all invalid-block conditions.
+            BlockError::StateRootMismatch { .. }
+            | BlockError::NotFinalizedDescendant { .. }
+            | BlockError::BlockSlotLimitReached
+            | BlockError::IncorrectBlockProposer { .. }
+            | BlockError::UnknownValidator { .. }
+            | BlockError::BlockIsNotLaterThanParent { .. }
+            | BlockError::PerBlockProcessingError(_)
+            | BlockError::InconsistentFork(_)
+            | BlockError::WeakSubjectivityConflict
+            | BlockError::ProposalSignatureInvalid
+            | BlockError::InvalidSignature
+            | BlockError::NonLinearParentRoots
+            | BlockError::NonLinearSlots => Err(ErrorCategory::malicious_retry()),
+
+            // This can happen for many reasons: head sync can download the same block multiple
+            // times, and parent lookups can download blocks before range sync.
+            BlockError::DuplicateFullyImported { .. } => Ok(()),
+            BlockError::DuplicateImportStatusUnknown { .. } => {
+                // This is unreachable because RPC blocks do not undergo gossip verification, and
+                // this error can *only* come from gossip verification.
+                Err(ErrorCategory::internal_no_retry())
             }
-            BlockError::DuplicateFullyImported(_)
-            | BlockError::DuplicateImportStatusUnknown(..) => {
-                // This can happen for many reasons. Head sync's can download multiples and parent
-                // lookups can download blocks before range sync
-                Ok(())
-            }
-            BlockError::FutureSlot {
-                present_slot,
-                block_slot,
-            } => {
-                if present_slot + FUTURE_SLOT_TOLERANCE >= block_slot {
-                    // The block is too far in the future, drop it.
-                    warn!(
-                        self.log, "Block is ahead of our slot clock";
-                        "msg" => "block for future slot rejected, check your time",
-                        "present_slot" => present_slot,
-                        "block_slot" => block_slot,
-                        "FUTURE_SLOT_TOLERANCE" => FUTURE_SLOT_TOLERANCE,
-                    );
-                } else {
-                    // The block is in the future, but not too far.
-                    debug!(
-                        self.log, "Block is slightly ahead of our slot clock. Ignoring.";
-                        "present_slot" => present_slot,
-                        "block_slot" => block_slot,
-                        "FUTURE_SLOT_TOLERANCE" => FUTURE_SLOT_TOLERANCE,
-                    );
+            // TODO: review `ExecutionPayloadError` variants
+            BlockError::ExecutionPayloadError(e) => Err(match e {
+                // The peer has nothing to do with this error, do not penalize them.
+                ExecutionPayloadError::NoExecutionConnection => ErrorCategory::internal_no_retry(),
+                // The peer has nothing to do with this error, do not penalize them.
+                ExecutionPayloadError::RequestFailed(_) => ErrorCategory::internal_retry(),
+                // The execution payload is invalid.
+                ExecutionPayloadError::RejectedByExecutionEngine { .. }
+                | ExecutionPayloadError::InvalidPayloadTimestamp { .. }
+                | ExecutionPayloadError::InvalidTerminalPoWBlock { .. }
+                | ExecutionPayloadError::InvalidActivationEpoch { .. }
+                | ExecutionPayloadError::InvalidTerminalBlockHash { .. } => {
+                    ErrorCategory::malicious_retry()
                 }
-
-                Err(ChainSegmentFailed {
-                    message: format!(
-                        "Block with slot {} is higher than the current slot {}",
-                        block_slot, present_slot
-                    ),
-                    // Peers are faulty if they send blocks from the future.
-                    peer_action: Some(PeerAction::LowToleranceError),
-                })
-            }
-            BlockError::WouldRevertFinalizedSlot { .. } => {
-                debug!(self.log, "Finalized or earlier block processed";);
-                Ok(())
-            }
-            BlockError::GenesisBlock => {
-                debug!(self.log, "Genesis block was processed");
-                Ok(())
-            }
-            BlockError::BeaconChainError(e) => {
-                warn!(
-                    self.log, "BlockProcessingFailure";
-                    "msg" => "unexpected condition in processing block.",
-                    "outcome" => ?e,
-                );
-
-                Err(ChainSegmentFailed {
-                    message: format!("Internal error whilst processing block: {:?}", e),
-                    // Do not penalize peers for internal errors.
-                    peer_action: None,
-                })
-            }
-            ref err @ BlockError::ExecutionPayloadError(ref epe) => {
-                if !epe.penalize_peer() {
-                    // These errors indicate an issue with the EL and not the `ChainSegment`.
-                    // Pause the syncing while the EL recovers
-                    debug!(self.log,
-                        "Execution layer verification failed";
-                        "outcome" => "pausing sync",
-                        "err" => ?err
-                    );
-                    Err(ChainSegmentFailed {
-                        message: format!("Execution layer offline. Reason: {:?}", err),
-                        // Do not penalize peers for internal errors.
-                        peer_action: None,
-                    })
-                } else {
-                    debug!(self.log,
-                        "Invalid execution payload";
-                        "error" => ?err
-                    );
-                    Err(ChainSegmentFailed {
-                        message: format!(
-                            "Peer sent a block containing invalid execution payload. Reason: {:?}",
-                            err
-                        ),
-                        peer_action: Some(PeerAction::LowToleranceError),
-                    })
+                // Do not penalize the peer since it's not their fault that *we're* optimistic.
+                ExecutionPayloadError::UnverifiedNonOptimisticCandidate => {
+                    ErrorCategory::internal_retry()
                 }
+            }),
+            // Should retry? Probably
+            BlockError::ParentExecutionPayloadInvalid { .. } => {
+                Err(ErrorCategory::malicious_retry())
             }
-            ref err @ BlockError::ParentExecutionPayloadInvalid { ref parent_root } => {
-                warn!(
-                    self.log,
-                    "Failed to sync chain built on invalid parent";
-                    "parent_root" => ?parent_root,
-                    "advice" => "check execution node for corruption then restart it and Lighthouse",
-                );
-                Err(ChainSegmentFailed {
-                    message: format!("Peer sent invalid block. Reason: {err:?}"),
-                    // We need to penalise harshly in case this represents an actual attack. In case
-                    // of a faulty EL it will usually require manual intervention to fix anyway, so
-                    // it's not too bad if we drop most of our peers.
-                    peer_action: Some(PeerAction::LowToleranceError),
-                })
-            }
-            other => {
-                debug!(
-                    self.log, "Invalid block received";
-                    "msg" => "peer sent invalid block",
-                    "outcome" => %other,
-                );
-
-                Err(ChainSegmentFailed {
-                    message: format!("Peer sent invalid block. Reason: {:?}", other),
-                    // Do not penalize peers for internal errors.
-                    peer_action: None,
-                })
+            // TODO: Review AvailabilityCheckError variants
+            BlockError::AvailabilityCheck(e) => Err(match e {
+                AvailabilityCheckError::SszTypes(_)
+                | AvailabilityCheckError::StoreError(_)
+                | AvailabilityCheckError::Unexpected
+                | AvailabilityCheckError::ParentStateMissing(_)
+                | AvailabilityCheckError::BlockReplayError(_)
+                | AvailabilityCheckError::RebuildingStateCaches(_)
+                | AvailabilityCheckError::SlotClockError => ErrorCategory::internal_retry(),
+                AvailabilityCheckError::InvalidColumn(index, _) => ErrorCategory::Malicious {
+                    retry: true,
+                    index: *index as usize,
+                },
+                AvailabilityCheckError::InvalidBlobs { .. }
+                | AvailabilityCheckError::MissingBlobs
+                | AvailabilityCheckError::MissingCustodyColumns
+                | AvailabilityCheckError::DecodeError(_)
+                | AvailabilityCheckError::ReconstructColumnsError { .. }
+                | AvailabilityCheckError::BlobIndexInvalid(_)
+                | AvailabilityCheckError::DataColumnIndexInvalid(_)
+                | AvailabilityCheckError::KzgCommitmentMismatch { .. } => {
+                    ErrorCategory::malicious_retry()
+                } // Do not use a fallback match, handle all errors explicitly
+            }),
+            // A proposer producing a slashable block is neither the peer's fault nor ours. Mark
+            // it as internal (don't penalize the peer) and retry: the serving peer could have
+            // sent the wrong block for this slot, one which happens not to be slashable.
+            BlockError::Slashable => Err(ErrorCategory::internal_retry()),
+            // TODO: should `BeaconChainError` be retried?
+            BlockError::BeaconChainError(_) | BlockError::InternalError(_) => {
+                Err(ErrorCategory::internal_no_retry())
             }
-        }
+            // Unreachable: this error is only produced during gossip verification.
+            BlockError::BlobNotRequired(_) => Err(ErrorCategory::malicious_retry()),
+        };
+        // No fallback arm is used above; every error variant is handled explicitly.
+
+        // Pass the retry and malicious signals to range sync so it can retry and score peers properly.
+        result.map_err(|err| ChainSegmentFailed {
+            message: format!("{:?}", error),
+            // Only penalize the peer when the error is malicious; internal errors carry no penalty.
+            peer_action: if matches!(err, ErrorCategory::Malicious { .. }) {
+                Some(PeerAction::LowToleranceError)
+            } else {
+                None
+            },
+        })
     }
 }
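Illustrative sketch (not part of the patch): how the `ErrorCategory` added above collapses into the retry / peer-scoring decision taken at the end of `handle_failed_chain_segment`. The `should_penalize` and `should_retry` helpers are hypothetical names used only for this illustration; they do not exist in the diff or in Lighthouse.

#[derive(Debug, PartialEq, Eq)]
pub enum ErrorCategory {
    Internal { retry: bool },
    Malicious { retry: bool, index: usize },
}

impl ErrorCategory {
    /// Hypothetical helper: only malicious categories translate into a peer penalty,
    /// mirroring the `matches!(err, ErrorCategory::Malicious { .. })` check above.
    pub fn should_penalize(&self) -> bool {
        matches!(self, Self::Malicious { .. })
    }

    /// Hypothetical helper: both internal and malicious errors may allow a retry.
    pub fn should_retry(&self) -> bool {
        match self {
            Self::Internal { retry } | Self::Malicious { retry, .. } => *retry,
        }
    }
}

fn main() {
    // e.g. `BlockError::ParentUnknown` maps to `malicious_retry()` in the new match.
    let err = ErrorCategory::Malicious { retry: true, index: 0 };
    assert!(err.should_penalize() && err.should_retry());

    // e.g. `ExecutionPayloadError::NoExecutionConnection` maps to `internal_no_retry()`.
    let err = ErrorCategory::Internal { retry: false };
    assert!(!err.should_penalize() && !err.should_retry());
}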