collator-protocol: Handle unknown validator heads #5538

Merged · 7 commits · Sep 2, 2024
5 changes: 3 additions & 2 deletions Cargo.lock

(Generated file; diff not rendered.)

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1178,7 +1178,7 @@ sc-transaction-pool-api = { path = "substrate/client/transaction-pool/api", default-features = false }
 sc-utils = { path = "substrate/client/utils", default-features = false }
 scale-info = { version = "2.11.1", default-features = false }
 schemars = { version = "0.8.13", default-features = false }
-schnellru = { version = "0.2.1" }
+schnellru = { version = "0.2.3" }
 schnorrkel = { version = "0.11.4", default-features = false }
 seccompiler = { version = "0.4.0" }
 secp256k1 = { version = "0.28.0", default-features = false }
1 change: 1 addition & 0 deletions polkadot/node/network/collator-protocol/Cargo.toml
@@ -14,6 +14,7 @@ bitvec = { features = ["alloc"], workspace = true }
 futures = { workspace = true }
 futures-timer = { workspace = true }
 gum = { workspace = true, default-features = true }
+schnellru.workspace = true
 
 sp-core = { workspace = true, default-features = true }
 sp-runtime = { workspace = true, default-features = true }
78 changes: 57 additions & 21 deletions polkadot/node/network/collator-protocol/src/collator_side/mod.rs
@@ -23,6 +23,7 @@ use bitvec::{bitvec, vec::BitVec};
 use futures::{
     channel::oneshot, future::Fuse, pin_mut, select, stream::FuturesUnordered, FutureExt, StreamExt,
 };
+use schnellru::{ByLength, LruMap};
 use sp_core::Pair;
 
 use polkadot_node_network_protocol::{
@@ -42,7 +43,7 @@ use polkadot_node_subsystem::{
         CollatorProtocolMessage, NetworkBridgeEvent, NetworkBridgeTxMessage, ParentHeadData,
         RuntimeApiMessage,
     },
-    overseer, CollatorProtocolSenderTrait, FromOrchestra, OverseerSignal, PerLeafSpan,
+    overseer, FromOrchestra, OverseerSignal, PerLeafSpan,
 };
 use polkadot_node_subsystem_util::{
     backing_implicit_view::View as ImplicitView,
@@ -201,6 +202,11 @@ struct PeerData {
     view: View,
     /// Network protocol version.
     version: CollationVersion,
+    /// Unknown heads in the view.
+    ///
+    /// This can happen when the validator is faster at importing a block and sending out its
+    /// `View` than the collator is able to import a block.
+    unknown_heads: LruMap<Hash, (), ByLength>,
 }
 
 /// A type wrapping a collation and its designated core index.
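
The new `unknown_heads` field is a bounded LRU cache from the `schnellru` crate. A minimal, self-contained sketch of the small slice of its API used here (here `Hash` is a stand-in type for the node's 32-byte block hash, not the real primitive):

```rust
use schnellru::{ByLength, LruMap};

type Hash = [u8; 32]; // stand-in for the node's 32-byte block-hash type

fn main() {
    // Bounded by entry count: once more than 10 heads are stashed,
    // the least recently used one is evicted.
    let mut unknown_heads: LruMap<Hash, (), ByLength> = LruMap::new(ByLength::new(10));

    let head = [1u8; 32];
    // `insert` returns false only if the limiter rejects the entry.
    unknown_heads.insert(head, ());

    // Later, when the head becomes an active leaf, `remove` tells us
    // whether this peer had reported it while it was still unknown.
    assert!(unknown_heads.remove(&head).is_some());
    assert!(unknown_heads.remove(&head).is_none());
}
```

The capacity of 10 chosen at connect time (in a later hunk) bounds the stash per peer; as the diff's comment notes, a collator lagging further behind than that is assumed unable to keep up anyway.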
@@ -1199,9 +1205,10 @@ async fn handle_peer_view_change<Context>(
     peer_id: PeerId,
     view: View,
 ) {
-    let PeerData { view: current, version } = match state.peer_data.get_mut(&peer_id) {
-        Some(peer_data) => peer_data,
-        None => return,
+    let Some(PeerData { view: current, version, unknown_heads }) =
+        state.peer_data.get_mut(&peer_id)
+    else {
+        return
     };
 
     let added: Vec<Hash> = view.difference(&*current).cloned().collect();
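
This and the following binding are rewritten in Rust's `let`-`else` form (stable since 1.65), replacing a `match` whose only job was an early exit. A minimal, generic illustration of the shape, independent of the subsystem types:

```rust
use std::collections::HashMap;

fn describe(values: &HashMap<u32, String>, key: u32) -> String {
    // Diverge (return, continue, break, ...) when the pattern fails to match;
    // otherwise the bound names stay in scope for the rest of the function.
    let Some(value) = values.get(&key) else {
        return format!("{key}: unknown");
    };
    format!("{key}: {value}")
}

fn main() {
    let mut values = HashMap::new();
    values.insert(1, "one".to_string());
    assert_eq!(describe(&values, 1), "1: one");
    assert_eq!(describe(&values, 2), "2: unknown");
}
```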
@@ -1229,15 +1236,18 @@
                     new_leaf = ?added,
                     "New leaf in peer's view is unknown",
                 );
+
+                unknown_heads.insert(added, ());
+
                 continue
             },
         };
 
         for block_hash in block_hashes {
-            let per_relay_parent = match state.per_relay_parent.get_mut(block_hash) {
-                Some(per_relay_parent) => per_relay_parent,
-                None => continue,
+            let Some(per_relay_parent) = state.per_relay_parent.get_mut(block_hash) else {
+                continue
             };
+
             advertise_collation(
                 ctx,
                 *block_hash,
@@ -1283,10 +1293,13 @@ async fn handle_network_msg<Context>(
                 return Ok(())
             },
         };
-            state
-                .peer_data
-                .entry(peer_id)
-                .or_insert_with(|| PeerData { view: View::default(), version });
+            state.peer_data.entry(peer_id).or_insert_with(|| PeerData {
+                view: View::default(),
+                version,
+                // Unlikely that the collator is falling 10 blocks behind and if so, it probably is
+                // not able to keep up anyway.
+                unknown_heads: LruMap::new(ByLength::new(10)),
+            });
 
             if let Some(authority_ids) = maybe_authority {
                 gum::trace!(
@@ -1311,7 +1324,7 @@
         },
         OurViewChange(view) => {
             gum::trace!(target: LOG_TARGET, ?view, "Own view change");
-            handle_our_view_change(ctx.sender(), state, view).await?;
+            handle_our_view_change(ctx, state, view).await?;
         },
         PeerMessage(remote, msg) => {
             handle_incoming_peer_message(ctx, runtime, state, remote, msg).await?;
@@ -1333,21 +1346,19 @@
 }
 
 /// Handles our view changes.
-async fn handle_our_view_change<Sender>(
-    sender: &mut Sender,
+#[overseer::contextbounds(CollatorProtocol, prefix = crate::overseer)]
+async fn handle_our_view_change<Context>(
+    ctx: &mut Context,
     state: &mut State,
     view: OurView,
-) -> Result<()>
-where
-    Sender: CollatorProtocolSenderTrait,
-{
+) -> Result<()> {
     let current_leaves = state.active_leaves.clone();
 
     let removed = current_leaves.iter().filter(|(h, _)| !view.contains(h));
     let added = view.iter().filter(|h| !current_leaves.contains_key(h));
 
     for leaf in added {
-        let mode = prospective_parachains_mode(sender, *leaf).await?;
+        let mode = prospective_parachains_mode(ctx.sender(), *leaf).await?;
 
         if let Some(span) = view.span_per_head().get(leaf).cloned() {
             let per_leaf_span = PerLeafSpan::new(span, "collator-side");
@@ -1360,19 +1371,44 @@
         if mode.is_enabled() {
             if let Some(ref mut implicit_view) = state.implicit_view {
                 implicit_view
-                    .activate_leaf(sender, *leaf)
+                    .activate_leaf(ctx.sender(), *leaf)
                     .await
                     .map_err(Error::ImplicitViewFetchError)?;
 
                 let allowed_ancestry = implicit_view
                     .known_allowed_relay_parents_under(leaf, state.collating_on)
                     .unwrap_or_default();
 
+                // Get the peers that already reported us this head, but we didn't know it at this
+                // point.
+                let peers = state
+                    .peer_data
+                    .iter_mut()
+                    .filter_map(|(id, data)| {
+                        data.unknown_heads.remove(leaf).map(|_| (id, data.version))
+                    })
+                    .collect::<Vec<_>>();
+
                 for block_hash in allowed_ancestry {
-                    state
+                    let per_relay_parent = state
                         .per_relay_parent
                         .entry(*block_hash)
                         .or_insert_with(|| PerRelayParent::new(mode));
+
+                    // Announce relevant collations to these peers.
+                    for (peer_id, peer_version) in &peers {
+                        advertise_collation(
+                            ctx,
+                            *block_hash,
+                            per_relay_parent,
+                            &peer_id,
+                            *peer_version,
+                            &state.peer_ids,
+                            &mut state.advertisement_timeouts,
+                            &state.metrics,
+                        )
+                        .await;
+                    }
                 }
             }
         }

Review thread on the `advertise_collation` call above:

Contributor: I understand the race condition, but I still don't get why the collations are not advertised when calling `distribute_collation` after we built on a known relay parent.

Contributor: This is quite strange for local testing, but in practice it could happen that the collation is not advertised for other reasons. The collator has to be really unlucky.
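
Taken together, the change is a stash-and-drain pattern: stash a peer's head when it is still unknown, then drain and advertise once that head activates. A condensed, hypothetical sketch of the two halves; `unknown_heads` and the overall flow mirror the diff, while `Collator`, `Peer`, `advertise`, and the `u64`/`u32` key types are simplified stand-ins for the subsystem's real state and `advertise_collation`:

```rust
use schnellru::{ByLength, LruMap};
use std::collections::{HashMap, HashSet};

type Hash = u64; // simplified stand-in for the real block-hash type

struct Peer {
    unknown_heads: LruMap<Hash, (), ByLength>,
}

struct Collator {
    peers: HashMap<u32, Peer>,
    known_blocks: HashSet<Hash>,
}

impl Collator {
    // Half 1: a validator's view announces a head we have not imported yet.
    fn on_peer_view_change(&mut self, peer_id: u32, head: Hash) {
        let Some(peer) = self.peers.get_mut(&peer_id) else { return };
        if !self.known_blocks.contains(&head) {
            // Stash instead of dropping: our block import is probably just behind.
            peer.unknown_heads.insert(head, ());
        }
    }

    // Half 2: the head becomes one of our active leaves; drain matching stashes
    // and advertise to exactly the peers that were waiting for it.
    fn on_our_view_change(&mut self, leaf: Hash) {
        self.known_blocks.insert(leaf);
        let waiting: Vec<u32> = self
            .peers
            .iter_mut()
            .filter_map(|(id, peer)| peer.unknown_heads.remove(&leaf).map(|_| *id))
            .collect();
        for peer_id in waiting {
            advertise(peer_id, leaf);
        }
    }
}

fn advertise(peer_id: u32, leaf: Hash) {
    println!("advertising collation for {leaf} to peer {peer_id}");
}

fn main() {
    let mut collator = Collator { peers: HashMap::new(), known_blocks: HashSet::new() };
    collator.peers.insert(7, Peer { unknown_heads: LruMap::new(ByLength::new(10)) });
    collator.on_peer_view_change(7, 42); // head 42 not imported yet: stashed
    collator.on_our_view_change(42); // prints: advertising collation for 42 to peer 7
}
```

The LRU bound keeps this race fix from becoming an unbounded per-peer buffer: a malicious or badly lagging validator can never pin more than the configured number of stashed heads.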