Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove all batches related to a peer on disconnect #5969

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
92a6e77
Remove all batches related to a peer on disconnect
pawanjay176 Jun 20, 2024
9d90e39
Cleanup map entries after disconnect
pawanjay176 Jun 21, 2024
70af7d1
Allow lookups to continue in case of disconnections
pawanjay176 Jun 21, 2024
312be2c
Pretty response types
pawanjay176 Jun 21, 2024
a38caf4
fmt
pawanjay176 Jun 21, 2024
c7fd21d
Fix lints
pawanjay176 Jun 24, 2024
a8f64f2
Remove lookup if it cannot progress
pawanjay176 Jun 24, 2024
bc10fb2
Fix tests
pawanjay176 Jun 24, 2024
cd17f9f
Remove poll_close on rpc behaviour
pawanjay176 Jun 24, 2024
6ead176
Remove redundant test
pawanjay176 Jun 24, 2024
3e1c41a
Fix issue raised by lion
pawanjay176 Jun 24, 2024
dfaf238
Revert pretty response types
pawanjay176 Jun 24, 2024
b930c7c
Cleanup
pawanjay176 Jun 24, 2024
62167fb
Fix test
pawanjay176 Jun 24, 2024
0c8efc1
Merge remote-tracking branch 'origin/release-v5.2.1' into rpc-error-o…
michaelsproul Jun 25, 2024
26614f1
Apply suggestions from joao
pawanjay176 Jun 25, 2024
ec90bf0
Fix log
pawanjay176 Jun 25, 2024
e79c71a
update request status on no peers found
pawanjay176 Jun 25, 2024
caee7c8
Do not remove lookup after peer disconnection
pawanjay176 Jun 25, 2024
804f36d
Add comments about expected event api
dapplion Jun 25, 2024
9b2e9e0
Update single_block_lookup.rs
dapplion Jun 25, 2024
cd68550
Update mod.rs
dapplion Jun 25, 2024
5b0fbd9
Merge branch 'rpc-error-on-disconnect-revert' into 5969-review
dapplion Jun 25, 2024
59468f8
Merge pull request #10 from dapplion/5969-review
pawanjay176 Jun 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 0 additions & 31 deletions beacon_node/lighthouse_network/src/rpc/handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -352,37 +352,6 @@ where
!matches!(self.state, HandlerState::Deactivated)
}

// NOTE: This function gets polled to completion upon a connection close.
fn poll_close(&mut self, _: &mut Context<'_>) -> Poll<Option<Self::ToBehaviour>> {
// Inform the network behaviour of any failed requests

while let Some(substream_id) = self.outbound_substreams.keys().next().cloned() {
let outbound_info = self
.outbound_substreams
.remove(&substream_id)
.expect("The value must exist for a key");
// If the state of the connection is closing, we do not need to report this case to
// the behaviour, as the connection has just closed non-gracefully
if matches!(outbound_info.state, OutboundSubstreamState::Closing(_)) {
continue;
}

// Register this request as an RPC Error
return Poll::Ready(Some(HandlerEvent::Err(HandlerErr::Outbound {
error: RPCError::Disconnected,
proto: outbound_info.proto,
id: outbound_info.req_id,
})));
}

// Also handle any events that are awaiting to be sent to the behaviour
if !self.events_out.is_empty() {
return Poll::Ready(Some(self.events_out.remove(0)));
}

Poll::Ready(None)
}

fn poll(
&mut self,
cx: &mut Context<'_>,
Expand Down
94 changes: 1 addition & 93 deletions beacon_node/lighthouse_network/tests/rpc_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
mod common;

use common::Protocol;
use lighthouse_network::rpc::{methods::*, RPCError};
use lighthouse_network::rpc::methods::*;
use lighthouse_network::{rpc::max_rpc_size, NetworkEvent, ReportSource, Request, Response};
use slog::{debug, warn, Level};
use ssz::Encode;
Expand Down Expand Up @@ -1012,98 +1012,6 @@ fn test_tcp_blocks_by_root_chunked_rpc_terminates_correctly() {
})
}

#[test]
fn test_disconnect_triggers_rpc_error() {
// set up the logging. The level and enabled logging or not
let log_level = Level::Debug;
let enable_logging = false;

let log = common::build_log(log_level, enable_logging);
let spec = E::default_spec();

let rt = Arc::new(Runtime::new().unwrap());
// get sender/receiver
rt.block_on(async {
let (mut sender, mut receiver) = common::build_node_pair(
Arc::downgrade(&rt),
&log,
ForkName::Base,
&spec,
Protocol::Tcp,
)
.await;

// BlocksByRoot Request
let rpc_request = Request::BlocksByRoot(BlocksByRootRequest::new(
// Must have at least one root for the request to create a stream
vec![Hash256::from_low_u64_be(0)],
&spec,
));

// build the sender future
let sender_future = async {
loop {
match sender.next_event().await {
NetworkEvent::PeerConnectedOutgoing(peer_id) => {
// Send a STATUS message
debug!(log, "Sending RPC");
sender
.send_request(peer_id, 42, rpc_request.clone())
.unwrap();
}
NetworkEvent::RPCFailed { error, id: 42, .. } => match error {
RPCError::Disconnected => return,
other => panic!("received unexpected error {:?}", other),
},
other => {
warn!(log, "Ignoring other event {:?}", other);
}
}
}
};

// determine messages to send (PeerId, RequestId). If some, indicates we still need to send
// messages
let mut sending_peer = None;
let receiver_future = async {
loop {
// this future either drives the sending/receiving or times out allowing messages to be
// sent in the timeout
match futures::future::select(
Box::pin(receiver.next_event()),
Box::pin(tokio::time::sleep(Duration::from_secs(1))),
)
.await
{
futures::future::Either::Left((ev, _)) => match ev {
NetworkEvent::RequestReceived { peer_id, .. } => {
sending_peer = Some(peer_id);
}
other => {
warn!(log, "Ignoring other event {:?}", other);
}
},
futures::future::Either::Right((_, _)) => {} // The timeout hit, send messages if required
}

// if we need to send messages send them here. This will happen after a delay
if let Some(peer_id) = sending_peer.take() {
warn!(log, "Receiver got request, disconnecting peer");
receiver.__hard_disconnect_testing_only(peer_id);
}
}
};

tokio::select! {
_ = sender_future => {}
_ = receiver_future => {}
_ = sleep(Duration::from_secs(30)) => {
panic!("Future timed out");
}
}
})
}

/// Establishes a pair of nodes and disconnects the pair based on the selected protocol via an RPC
/// Goodbye message.
fn goodbye_test(log_level: Level, enable_logging: bool, protocol: Protocol) {
Expand Down
38 changes: 36 additions & 2 deletions beacon_node/network/src/sync/backfill_sync/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -307,15 +307,49 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
/// A peer has disconnected.
/// If the peer has active batches, those are considered failed and re-requested.
#[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
pub fn peer_disconnected(&mut self, peer_id: &PeerId) -> Result<(), BackFillError> {
pub fn peer_disconnected(
&mut self,
peer_id: &PeerId,
network: &mut SyncNetworkContext<T>,
) -> Result<(), BackFillError> {
if matches!(
self.state(),
BackFillState::Failed | BackFillState::NotRequired
) {
return Ok(());
}

self.active_requests.remove(peer_id);
if let Some(batch_ids) = self.active_requests.remove(peer_id) {
// fail the batches.
for id in batch_ids {
if let Some(batch) = self.batches.get_mut(&id) {
match batch.download_failed(false) {
Ok(BatchOperationOutcome::Failed { blacklist: _ }) => {
self.fail_sync(BackFillError::BatchDownloadFailed(id))?;
}
Ok(BatchOperationOutcome::Continue) => {}
Err(e) => {
self.fail_sync(BackFillError::BatchInvalidState(id, e.0))?;
}
}
// If we have run out of peers in which to retry this batch, the backfill state
// transitions to a paused state.
// We still need to reset the state for all the affected batches, so we should not
// short circuit early.
if self.retry_batch_download(network, id).is_err() {
debug!(
self.log,
"Batch could not be retried";
"batch_id" => id,
"error" => "no synced peers"
);
}
} else {
debug!(self.log, "Batch not found while removing peer";
"peer" => %peer_id, "batch" => id)
}
}
}

// Remove the peer from the participation list
self.participating_peers.remove(peer_id);
Expand Down
42 changes: 26 additions & 16 deletions beacon_node/network/src/sync/block_lookups/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
//! Implements block lookup sync.
//!
//! Block lookup sync is triggered when a peer claims to have imported a block we don't know about.
//! For example, a peer attesting to a head block root that is not in our fork-choice. Lookup sync
//! is recursive in nature, as we may discover that this attested head block root has a parent that
//! is also unknown to us.
//!
//! Block lookup is implemented as an event-driven state machine. It sends events to the network and
//! beacon processor, and expects some set of events back. A discrepancy in the expected event API
//! will result in lookups getting "stuck". A lookup becomes stuck when there is no future event
//! that will trigger the lookup to make progress. There's a fallback mechanism that drops lookups
//! that live for too long, logging the line "Notify the devs a sync lookup is stuck".
//!
//! The expected event API is documented in the code paths that are making assumptions with the
//! comment prefix "Lookup sync event safety:"
//!
//! Block lookup sync attempts to not re-download or re-process data that we already have. Block
//! components are cached temporarily in multiple places before they are imported into fork-choice.
//! Therefore, block lookup sync must peek these caches correctly to decide when to skip a download
//! or consider a lookup complete. These caches are read from the `SyncNetworkContext` and its state
//! returned to this module as `LookupRequestResult` variants.

use self::parent_chain::{compute_parent_chains, NodeChain};
pub use self::single_block_lookup::DownloadResult;
use self::single_block_lookup::{LookupRequestError, LookupResult, SingleBlockLookup};
Expand Down Expand Up @@ -410,21 +432,9 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
/* Error responses */

pub fn peer_disconnected(&mut self, peer_id: &PeerId) {
self.single_block_lookups.retain(|_, lookup| {
for (_, lookup) in self.single_block_lookups.iter_mut() {
lookup.remove_peer(peer_id);

// Note: this condition should be removed in the future. It's not strictly necessary to drop a
// lookup if there are no peers left. Lookup should only be dropped if it can not make progress
if lookup.has_no_peers() {
debug!(self.log,
"Dropping single lookup after peer disconnection";
"block_root" => ?lookup.block_root()
);
false
} else {
true
}
});
}
}

/* Processing responses */
Expand Down Expand Up @@ -787,12 +797,12 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
};

if stuck_lookup.id == ancestor_stuck_lookup.id {
warn!(self.log, "Notify the devs, a sync lookup is stuck";
warn!(self.log, "Notify the devs a sync lookup is stuck";
"block_root" => ?stuck_lookup.block_root(),
"lookup" => ?stuck_lookup,
);
} else {
warn!(self.log, "Notify the devs, a sync lookup is stuck";
warn!(self.log, "Notify the devs a sync lookup is stuck";
"block_root" => ?stuck_lookup.block_root(),
"lookup" => ?stuck_lookup,
"ancestor_block_root" => ?ancestor_stuck_lookup.block_root(),
Expand Down
38 changes: 32 additions & 6 deletions beacon_node/network/src/sync/block_lookups/single_block_lookup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,21 +197,36 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
}

let Some(peer_id) = self.use_rand_available_peer() else {
// Allow lookup to not have any peers. In that case do nothing. If the lookup does
// not have peers for some time, it will be dropped.
// Allow lookup to not have any peers and do nothing. This is an optimization to not
// lose progress of lookups created from a block with unknown parent before we receive
// attestations for said block.
// Lookup sync event safety: If a lookup requires peers to make progress, and does
// not receive any new peers for some time it will be dropped. If it receives a new
// peer it must attempt to make progress.
R::request_state_mut(self)
.get_state_mut()
.update_awaiting_download_status("no peers");
return Ok(());
};

let request = R::request_state_mut(self);
match request.make_request(id, peer_id, downloaded_block_expected_blobs, cx)? {
LookupRequestResult::RequestSent(req_id) => {
// Lookup sync event safety: If make_request returns `RequestSent`, we are
// guaranteed that `BlockLookups::on_download_response` will be called exactly
// with this `req_id`.
request.get_state_mut().on_download_start(req_id)?
}
LookupRequestResult::NoRequestNeeded => {
// Lookup sync event safety: Advances this request to the terminal `Processed`
// state. If all requests reach this state, the request is marked as completed
// in `Self::continue_requests`.
request.get_state_mut().on_completed_request()?
}
// Sync will receive a future event to make progress on the request, do nothing now
LookupRequestResult::Pending(reason) => {
// Lookup sync event safety: Refer to the code paths constructing
// `LookupRequestResult::Pending`
request
.get_state_mut()
.update_awaiting_download_status(reason);
Expand All @@ -222,16 +237,28 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
// Otherwise, attempt to progress awaiting processing
// If this request is awaiting a parent lookup to be processed, do not send for processing.
// The request will be rejected with unknown parent error.
//
// TODO: The condition `block_is_processed || Block` can be dropped after checking for
// unknown parent root when import RPC blobs
} else if !awaiting_parent
&& (block_is_processed || matches!(R::response_type(), ResponseType::Block))
{
// maybe_start_processing returns Some if state == AwaitingProcess. This pattern is
// useful to conditionally access the result data.
if let Some(result) = request.get_state_mut().maybe_start_processing() {
// Lookup sync event safety: If `send_for_processing` returns Ok() we are guaranteed
// that `BlockLookups::on_processing_result` will be called exactly once with this
// lookup_id
return R::send_for_processing(id, result, cx);
}
// Lookup sync event safety: If the request is not in `AwaitingDownload` or
// `AwaitingProcessing` state it is guaranteed to receive some event to make progress.
}

// Lookup sync event safety: If a lookup is awaiting a parent we are guaranteed to either:
// (1) attempt to make progress with `BlockLookups::continue_child_lookups` if the parent
// lookup completes, or (2) get dropped if the parent fails and is dropped.

Ok(())
}

Expand All @@ -246,10 +273,9 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
self.peers.insert(peer_id)
}

/// Remove peer from available peers. Return true if there are no more available peers and all
/// requests are not expecting any future event (AwaitingDownload).
pub fn remove_peer(&mut self, peer_id: &PeerId) -> bool {
self.peers.remove(peer_id)
/// Remove peer from available peers.
pub fn remove_peer(&mut self, peer_id: &PeerId) {
self.peers.remove(peer_id);
}

/// Returns true if this lookup has zero peers
Expand Down
Loading