From 062ba235844e5231be512903d09bcd3dfe827d27 Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Thu, 30 Nov 2023 17:27:11 +0100 Subject: [PATCH 01/11] Update backlog metric when packet clearing is triggered --- crates/relayer/src/link/relay_path.rs | 18 ++++++++++++++ crates/telemetry/src/state.rs | 34 +++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/crates/relayer/src/link/relay_path.rs b/crates/relayer/src/link/relay_path.rs index b055bf1268..a3349f332a 100644 --- a/crates/relayer/src/link/relay_path.rs +++ b/crates/relayer/src/link/relay_path.rs @@ -1108,6 +1108,15 @@ impl RelayPath { unreceived_packets(self.dst_chain(), self.src_chain(), &self.path_id) .map_err(LinkError::supervisor)?; + // Use queried unreceived packets to update the backlog metric + ibc_telemetry::global().update_backlog( + sequences.iter().map(|&sequence| sequence.into()).collect(), + &self.src_chain().id(), + self.src_channel_id(), + self.src_port_id(), + &self.dst_chain().id(), + ); + let query_height = opt_query_height.unwrap_or(src_response_height); // Skip: no relevant events found. @@ -1170,6 +1179,15 @@ impl RelayPath { return Ok(()); }; + // Use queried unreceived acks to update the backlog metric + ibc_telemetry::global().update_backlog( + sequences.iter().map(|&sequence| sequence.into()).collect(), + &self.dst_chain().id(), + self.dst_channel_id(), + self.dst_port_id(), + &self.src_chain().id(), + ); + let query_height = opt_query_height.unwrap_or(src_response_height); // Skip: no relevant events found. diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index 4abdb93740..d398fc8ee1 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -968,6 +968,40 @@ impl TelemetryState { self.backlog_size.observe(&cx, total, labels); } + /// Inserts in the backlog a new event for the given sequence number. + /// This happens when the relayer observed a new SendPacket event. + pub fn update_backlog( + &self, + sequences: Vec, + chain_id: &ChainId, + channel_id: &ChannelId, + port_id: &PortId, + counterparty_chain_id: &ChainId, + ) { + // Unique identifier for a chain/channel/port. + let path_uid: PathIdentifier = PathIdentifier::new( + chain_id.to_string(), + channel_id.to_string(), + port_id.to_string(), + ); + + // Remove any sequence number from the backlog which isn't in the list of queried pending packets + // as they might have been relayed without the Hermes instance observing it + if let Some(path_backlog) = self.backlogs.get(&path_uid) { + let _ = path_backlog.iter().map(|entry| { + if !sequences.contains(entry.value()) { + self.backlog_remove( + *entry.value(), + chain_id, + channel_id, + port_id, + counterparty_chain_id, + ) + } + }); + } + } + /// Evicts from the backlog the event for the given sequence number. /// Removing events happens when the relayer observed either an acknowledgment /// or a timeout for a packet sequence number, which means that the corresponding From 0f7431fb996c02eb6648f1a1c63ae54df5b1b76d Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Thu, 7 Dec 2023 09:18:30 +0100 Subject: [PATCH 02/11] Fix update backlog and add unit tests --- crates/telemetry/src/state.rs | 133 +++++++++++++++++++++++++++++++--- 1 file changed, 122 insertions(+), 11 deletions(-) diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index d398fc8ee1..fca453e20d 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -988,17 +988,15 @@ impl TelemetryState { // Remove any sequence number from the backlog which isn't in the list of queried pending packets // as they might have been relayed without the Hermes instance observing it if let Some(path_backlog) = self.backlogs.get(&path_uid) { - let _ = path_backlog.iter().map(|entry| { - if !sequences.contains(entry.value()) { - self.backlog_remove( - *entry.value(), - chain_id, - channel_id, - port_id, - counterparty_chain_id, - ) - } - }); + let backlog = path_backlog.value(); + let keys_to_remove: Vec = backlog + .iter() + .filter(|entry| !sequences.contains(entry.key())) + .map(|entry| *entry.key()) + .collect(); + for key in keys_to_remove.iter() { + self.backlog_remove(*key, chain_id, channel_id, port_id, counterparty_chain_id) + } } } @@ -1201,3 +1199,116 @@ impl AggregatorSelector for CustomAggregatorSelector { } } } + +//#[cfg(feature = "telemetry")] +mod tests { + use prometheus::proto::Metric; + + use super::*; + + #[test] + fn insert_remove_backlog() { + let state = TelemetryState::new( + Range { + start: 0, + end: 5000, + }, + 5, + Range { + start: 0, + end: 5000, + }, + 5, + ); + + let chain_id = ChainId::from_string("chain-test"); + let counterparty_chain_id = ChainId::from_string("counterpartychain-test"); + let channel_id = ChannelId::new(0); + let port_id = PortId::transfer(); + + state.backlog_insert(1, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(2, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(3, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(4, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(5, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_remove(3, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_remove(1, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + + let metrics = state.exporter.registry().gather().clone(); + let backlog_size = metrics + .iter() + .find(|metric| metric.get_name() == "backlog_size") + .unwrap(); + assert!( + assert_metric_value(backlog_size.get_metric(), 3), + "expected backlog_size to be 3" + ); + let backlog_oldest_sequence = metrics + .iter() + .find(|&metric| metric.get_name() == "backlog_oldest_sequence") + .unwrap(); + assert!( + assert_metric_value(backlog_oldest_sequence.get_metric(), 2), + "expected backlog_oldest_sequence to be 2" + ); + } + + #[test] + fn update_backlog() { + let state = TelemetryState::new( + Range { + start: 0, + end: 5000, + }, + 5, + Range { + start: 0, + end: 5000, + }, + 5, + ); + + let chain_id = ChainId::from_string("chain-test"); + let counterparty_chain_id = ChainId::from_string("counterpartychain-test"); + let channel_id = ChannelId::new(0); + let port_id = PortId::transfer(); + + state.backlog_insert(1, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(2, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(3, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(4, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(5, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + + state.update_backlog( + vec![5], + &chain_id, + &channel_id, + &port_id, + &counterparty_chain_id, + ); + + let metrics = state.exporter.registry().gather().clone(); + let backlog_size = metrics + .iter() + .find(|&metric| metric.get_name() == "backlog_size") + .unwrap(); + assert!( + assert_metric_value(backlog_size.get_metric(), 1), + "expected backlog_size to be 1" + ); + let backlog_oldest_sequence = metrics + .iter() + .find(|&metric| metric.get_name() == "backlog_oldest_sequence") + .unwrap(); + assert!( + assert_metric_value(backlog_oldest_sequence.get_metric(), 5), + "expected backlog_oldest_sequence to be 5" + ); + } + + fn assert_metric_value(metric: &[Metric], expected: u64) -> bool { + metric + .iter() + .any(|m| m.get_gauge().get_value() as u64 == expected) + } +} From 00b2b0e2241c9911a893488027155301b6d3bd43 Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Fri, 8 Dec 2023 17:37:02 +0100 Subject: [PATCH 03/11] Uncomment conditional compilation from telemetry tests --- crates/telemetry/src/state.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index 6d06d169df..4a82a856e1 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -1201,7 +1201,7 @@ impl AggregatorSelector for CustomAggregatorSelector { } } -//#[cfg(feature = "telemetry")] +#[cfg(feature = "telemetry")] mod tests { use prometheus::proto::Metric; From 2d13f6b2e91d29e409ae3b9e5beac9dc727ef479 Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Fri, 8 Dec 2023 17:39:51 +0100 Subject: [PATCH 04/11] Add changelog entry --- .../bug-fixes/ibc-telemetry/3723-fix-backlog-metrics.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .changelog/unreleased/bug-fixes/ibc-telemetry/3723-fix-backlog-metrics.md diff --git a/.changelog/unreleased/bug-fixes/ibc-telemetry/3723-fix-backlog-metrics.md b/.changelog/unreleased/bug-fixes/ibc-telemetry/3723-fix-backlog-metrics.md new file mode 100644 index 0000000000..902288b2c8 --- /dev/null +++ b/.changelog/unreleased/bug-fixes/ibc-telemetry/3723-fix-backlog-metrics.md @@ -0,0 +1,2 @@ +- Update the values of `backlog` metrics when clearing packets. + ([\#3723](https://github.com/informalsystems/hermes/issues/3723)) \ No newline at end of file From 8acaf5db9099aad13896011f2fbfe9075b8d338b Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Fri, 8 Dec 2023 17:44:13 +0100 Subject: [PATCH 05/11] Update guide section regarding 'backlog_*' metrics --- guide/src/documentation/telemetry/operators.md | 1 + 1 file changed, 1 insertion(+) diff --git a/guide/src/documentation/telemetry/operators.md b/guide/src/documentation/telemetry/operators.md index c550232beb..0f5fc87f0d 100644 --- a/guide/src/documentation/telemetry/operators.md +++ b/guide/src/documentation/telemetry/operators.md @@ -132,6 +132,7 @@ If this metric is increasing, it signals that the packet queue is increasing and - If the `backlog_oldest_sequence` remains unchanged for more than a few minutes, that means that the packet with the respective sequence number is likely blocked and cannot be relayed. To understand for how long the packet is block, Hermes will populate `backlog_oldest_timestamp` with the local time when it first observed the `backlog_oldest_sequence` that is blocked. +- __NOTE__: The Hermes instance might miss the acknowledgment of an observed IBC packets relayed, this will cause the `backlog_*` metrics to contain an invalid value. In order to minimise this issue, whenever the Hermes instance clears packets the `backlog_*` metrics will be updated using the queried pending packets. ## How efficient and how secure is the IBC status on each network? From cd53ee66b986c67227ed523684f66bf662d7626c Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Wed, 13 Dec 2023 14:32:08 +0100 Subject: [PATCH 06/11] Correctly update the backlog when querying commitments on chain --- crates/relayer/src/chain/counterparty.rs | 14 ++++++++++++++ crates/relayer/src/link/relay_path.rs | 18 ------------------ 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/crates/relayer/src/chain/counterparty.rs b/crates/relayer/src/chain/counterparty.rs index be9425aae2..6cb3a34bd6 100644 --- a/crates/relayer/src/chain/counterparty.rs +++ b/crates/relayer/src/chain/counterparty.rs @@ -32,6 +32,7 @@ use crate::channel::ChannelError; use crate::client_state::IdentifiedAnyClientState; use crate::path::PathIdentifiers; use crate::supervisor::Error; +use crate::telemetry; pub fn counterparty_chain_from_connection( src_chain: &impl ChainHandle, @@ -502,6 +503,19 @@ pub fn unreceived_packets( &path.counterparty_channel_id, )?; + telemetry!( + update_backlog, + commit_sequences + .iter() + .map(|s| std::convert::Into::::into(*s)) + .collect::>() + .clone(), + &counterparty_chain.id(), + &path.counterparty_channel_id, + &path.counterparty_port_id, + &chain.id() + ); + let packet_seq_nrs = unreceived_packets_sequences(chain, &path.port_id, &path.channel_id, commit_sequences)?; diff --git a/crates/relayer/src/link/relay_path.rs b/crates/relayer/src/link/relay_path.rs index a3349f332a..b055bf1268 100644 --- a/crates/relayer/src/link/relay_path.rs +++ b/crates/relayer/src/link/relay_path.rs @@ -1108,15 +1108,6 @@ impl RelayPath { unreceived_packets(self.dst_chain(), self.src_chain(), &self.path_id) .map_err(LinkError::supervisor)?; - // Use queried unreceived packets to update the backlog metric - ibc_telemetry::global().update_backlog( - sequences.iter().map(|&sequence| sequence.into()).collect(), - &self.src_chain().id(), - self.src_channel_id(), - self.src_port_id(), - &self.dst_chain().id(), - ); - let query_height = opt_query_height.unwrap_or(src_response_height); // Skip: no relevant events found. @@ -1179,15 +1170,6 @@ impl RelayPath { return Ok(()); }; - // Use queried unreceived acks to update the backlog metric - ibc_telemetry::global().update_backlog( - sequences.iter().map(|&sequence| sequence.into()).collect(), - &self.dst_chain().id(), - self.dst_channel_id(), - self.dst_port_id(), - &self.src_chain().id(), - ); - let query_height = opt_query_height.unwrap_or(src_response_height); // Skip: no relevant events found. From abbab87f5d3d2495ecd76536bc66d5f1e4570c92 Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Thu, 14 Dec 2023 10:45:40 +0100 Subject: [PATCH 07/11] Update backlog_oldest_timestamp to backlog_latest_update_timestamp --- crates/telemetry/src/state.rs | 75 +++++++++---------- guide/src/assets/grafana_template.json | 2 +- .../documentation/telemetry/integration.md | 8 +- .../src/documentation/telemetry/operators.md | 6 +- 4 files changed, 41 insertions(+), 50 deletions(-) diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index 4a82a856e1..b35ed8e84b 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -170,9 +170,9 @@ pub struct TelemetryState { /// SendPacket events were relayed. backlog_oldest_sequence: ObservableGauge, - /// Record the timestamp related to `backlog_oldest_sequence`. + /// Record the timestamp of the last time the `backlog_*` metrics have been updated. /// The timestamp is the time passed since since the unix epoch in seconds. - backlog_oldest_timestamp: ObservableGauge, + backlog_latest_update_timestamp: ObservableGauge, /// Records the length of the backlog, i.e., how many packets are pending. backlog_size: ObservableGauge, @@ -350,10 +350,10 @@ impl TelemetryState { .with_description("Sequence number of the oldest SendPacket event in the backlog") .init(), - backlog_oldest_timestamp: meter - .u64_observable_gauge("backlog_oldest_timestamp") + backlog_latest_update_timestamp: meter + .u64_observable_gauge("backlog_latest_update_timestamp") .with_unit(Unit::new("seconds")) - .with_description("Local timestamp for the oldest SendPacket event in the backlog") + .with_description("Local timestamp for the last time the backlog metrics have been updated") .init(), backlog_size: meter @@ -457,7 +457,7 @@ impl TelemetryState { } self.backlog_oldest_sequence.observe(&cx, 0, labels); - self.backlog_oldest_timestamp.observe(&cx, 0, labels); + self.backlog_latest_update_timestamp.observe(&cx, 0, labels); self.backlog_size.observe(&cx, 0, labels); } @@ -922,8 +922,7 @@ impl TelemetryState { }; // Update the backlog with the incoming data and retrieve the oldest values - let (oldest_sn, oldest_ts, total) = if let Some(path_backlog) = self.backlogs.get(&path_uid) - { + let (oldest_sn, total) = if let Some(path_backlog) = self.backlogs.get(&path_uid) { // Avoid having the inner backlog map growing more than a given threshold, by removing // the oldest sequence number entry. if path_backlog.len() > BACKLOG_RESET_THRESHOLD { @@ -935,20 +934,11 @@ impl TelemetryState { // Return the oldest event information to be recorded in telemetry if let Some(min) = path_backlog.iter().map(|v| *v.key()).min() { - if let Some(oldest) = path_backlog.get(&min) { - (min, *oldest.value(), path_backlog.len() as u64) - } else { - // Timestamp was not found, this should not happen, record a 0 ts. - (min, 0, path_backlog.len() as u64) - } + (min, path_backlog.len() as u64) } else { // We just inserted a new key/value, so this else branch is unlikely to activate, // but it can happen in case of concurrent updates to the backlog. - ( - EMPTY_BACKLOG_SYMBOL, - EMPTY_BACKLOG_SYMBOL, - EMPTY_BACKLOG_SYMBOL, - ) + (EMPTY_BACKLOG_SYMBOL, EMPTY_BACKLOG_SYMBOL) } } else { // If there is no inner backlog for this path, create a new map to store it. @@ -958,13 +948,13 @@ impl TelemetryState { self.backlogs.insert(path_uid, new_path_backlog); // Return the current event information to be recorded in telemetry - (seq_nr, timestamp, 1) + (seq_nr, 1) }; // Update metrics to reflect the new state of the backlog self.backlog_oldest_sequence.observe(&cx, oldest_sn, labels); - self.backlog_oldest_timestamp - .observe(&cx, oldest_ts, labels); + self.backlog_latest_update_timestamp + .observe(&cx, timestamp, labels); self.backlog_size.observe(&cx, total, labels); } @@ -985,18 +975,19 @@ impl TelemetryState { port_id.to_string(), ); - // Remove any sequence number from the backlog which isn't in the list of queried pending packets - // as they might have been relayed without the Hermes instance observing it - if let Some(path_backlog) = self.backlogs.get(&path_uid) { - let backlog = path_backlog.value(); - let keys_to_remove: Vec = backlog - .iter() - .filter(|entry| !sequences.contains(entry.key())) - .map(|entry| *entry.key()) - .collect(); - for key in keys_to_remove.iter() { + // This condition is done in order to avoid having an incorrect `backlog_latest_update_timestamp`. + // If the sequences is an empty vector by removing the entries using `backlog_remove` the `backlog_latest_update_timestamp` + // will only be updated if the current backlog is not empty. + // If the sequences is not empty, then it is possible to simple remove the backlog for that path and insert the sequences. + if sequences.is_empty() { + for key in sequences.iter() { self.backlog_remove(*key, chain_id, channel_id, port_id, counterparty_chain_id) } + } else { + self.backlogs.remove(&path_uid); + for key in sequences.iter() { + self.backlog_insert(*key, chain_id, channel_id, port_id, counterparty_chain_id) + } } } @@ -1028,16 +1019,20 @@ impl TelemetryState { KeyValue::new("port", port_id.to_string()), ]; + // Retrieve local timestamp when this SendPacket event was recorded. + let now = Time::now(); + let timestamp = match now.duration_since(Time::unix_epoch()) { + Ok(ts) => ts.as_secs(), + Err(_) => 0, + }; + if let Some(path_backlog) = self.backlogs.get(&path_uid) { if path_backlog.remove(&seq_nr).is_some() { + // If the entry was removed update the latest update timestamp. + self.backlog_latest_update_timestamp + .observe(&cx, timestamp, labels); // The oldest pending sequence number is the minimum key in the inner (path) backlog. if let Some(min_key) = path_backlog.iter().map(|v| *v.key()).min() { - if let Some(oldest) = path_backlog.get(&min_key) { - self.backlog_oldest_timestamp - .observe(&cx, *oldest.value(), labels); - } else { - self.backlog_oldest_timestamp.observe(&cx, 0, labels); - } self.backlog_oldest_sequence.observe(&cx, min_key, labels); self.backlog_size .observe(&cx, path_backlog.len() as u64, labels); @@ -1045,8 +1040,6 @@ impl TelemetryState { // No mimimum found, update the metrics to reflect an empty backlog self.backlog_oldest_sequence .observe(&cx, EMPTY_BACKLOG_SYMBOL, labels); - self.backlog_oldest_timestamp - .observe(&cx, EMPTY_BACKLOG_SYMBOL, labels); self.backlog_size.observe(&cx, EMPTY_BACKLOG_SYMBOL, labels); } } @@ -1188,7 +1181,7 @@ impl AggregatorSelector for CustomAggregatorSelector { match descriptor.name() { "wallet_balance" => Some(Arc::new(last_value())), "backlog_oldest_sequence" => Some(Arc::new(last_value())), - "backlog_oldest_timestamp" => Some(Arc::new(last_value())), + "backlog_latest_update_timestamp" => Some(Arc::new(last_value())), "backlog_size" => Some(Arc::new(last_value())), // Prometheus' supports only collector for histogram, sum, and last value aggregators. // https://docs.rs/opentelemetry-prometheus/0.10.0/src/opentelemetry_prometheus/lib.rs.html#411-418 diff --git a/guide/src/assets/grafana_template.json b/guide/src/assets/grafana_template.json index 2d7e91454a..80846ea8cf 100644 --- a/guide/src/assets/grafana_template.json +++ b/guide/src/assets/grafana_template.json @@ -958,7 +958,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "backlog_oldest_timestamp{job=\"hermes\"}", + "expr": "backlog_latest_update_timestamp{job=\"hermes\"}", "format": "table", "hide": false, "instant": true, diff --git a/guide/src/documentation/telemetry/integration.md b/guide/src/documentation/telemetry/integration.md index 451f95b9a7..a33af5f656 100644 --- a/guide/src/documentation/telemetry/integration.md +++ b/guide/src/documentation/telemetry/integration.md @@ -22,10 +22,10 @@ acknowledgment_packets_confirmed_total{dst_chain="ibc-1",dst_channel="channel-0" # TYPE backlog_oldest_sequence gauge backlog_oldest_sequence{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 0 backlog_oldest_sequence{chain="ibc-1",channel="channel-0",counterparty="ibc-0",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 0 -# HELP backlog_oldest_timestamp Local timestamp for the oldest SendPacket event in the backlog -# TYPE backlog_oldest_timestamp gauge -backlog_oldest_timestamp{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 0 -backlog_oldest_timestamp{chain="ibc-1",channel="channel-0",counterparty="ibc-0",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 0 +# HELP backlog_latest_update_timestamp Local timestamp for the last time the backlog metrics have been updated +# TYPE backlog_latest_update_timestamp gauge +backlog_latest_update_timestamp{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 0 +backlog_latest_update_timestamp{chain="ibc-1",channel="channel-0",counterparty="ibc-0",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 0 # HELP backlog_size Total number of SendPacket events in the backlog # TYPE backlog_size gauge backlog_size{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 0 diff --git a/guide/src/documentation/telemetry/operators.md b/guide/src/documentation/telemetry/operators.md index 0f5fc87f0d..96204843c4 100644 --- a/guide/src/documentation/telemetry/operators.md +++ b/guide/src/documentation/telemetry/operators.md @@ -121,7 +121,7 @@ Since Hermes v1, we also introduced 3 metrics that sketch the backlog status of | Name | Description | OpenTelemetry type | Configuration Dependencies | | -------------------------- | -------------------------------------------------------------- | ------------------- | -------------------------- | | `backlog_oldest_sequence` | Sequence number of the oldest SendPacket event in the backlog | `u64` ValueRecorder | Packet workers enabled | -| `backlog_oldest_timestamp` | Local timestamp for the oldest SendPacket event in the backlog | `u64` ValueRecorder | Packet workers enabled | +| `backlog_latest_update_timestamp` | Local timestamp for the last time the backlog metrics have been updated | `u64` ValueRecorder | Packet workers enabled | | `backlog_size` | Total number of SendPacket events in the backlog | `u64` ValueRecorder | Packet workers enabled | @@ -129,9 +129,7 @@ Notes: - The `backlog_size` defines how many IBC packets users sent and were not yet relayed (i.e., received on the destination network, or timed-out). If this metric is increasing, it signals that the packet queue is increasing and there may be some errors in the Hermes logs that need your attention. -- If the `backlog_oldest_sequence` remains unchanged for more than a few minutes, that means that the packet with the respective sequence number is likely blocked -and cannot be relayed. To understand for how long the packet is block, Hermes will populate `backlog_oldest_timestamp` with the local time when it first observed -the `backlog_oldest_sequence` that is blocked. +- The `backlog_latest_update_timestamp` is used to get information on the reliability of the `backlog_*` metrics. If the timestamp doesn't change it means there might be an issue with the metrics. - __NOTE__: The Hermes instance might miss the acknowledgment of an observed IBC packets relayed, this will cause the `backlog_*` metrics to contain an invalid value. In order to minimise this issue, whenever the Hermes instance clears packets the `backlog_*` metrics will be updated using the queried pending packets. ## How efficient and how secure is the IBC status on each network? From 92bb5be7e646000a35352a5f6c7a5222db76608d Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Thu, 14 Dec 2023 10:47:53 +0100 Subject: [PATCH 08/11] Update changelog entry --- .../bug-fixes/ibc-telemetry/3723-fix-backlog-metrics.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.changelog/unreleased/bug-fixes/ibc-telemetry/3723-fix-backlog-metrics.md b/.changelog/unreleased/bug-fixes/ibc-telemetry/3723-fix-backlog-metrics.md index 902288b2c8..fa95766987 100644 --- a/.changelog/unreleased/bug-fixes/ibc-telemetry/3723-fix-backlog-metrics.md +++ b/.changelog/unreleased/bug-fixes/ibc-telemetry/3723-fix-backlog-metrics.md @@ -1,2 +1,4 @@ - Update the values of `backlog` metrics when clearing packets. + Change the `backlog_oldest_timestamp` to `backlog_latest_update_timestamp` + which shows the last time the `backlog` metrics have been updated. ([\#3723](https://github.com/informalsystems/hermes/issues/3723)) \ No newline at end of file From 1fe4cdba42ed861a30feab8ee39e617810e00da0 Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Thu, 14 Dec 2023 11:04:01 +0100 Subject: [PATCH 09/11] Fix bug in backlog update --- crates/telemetry/src/state.rs | 67 +++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 3 deletions(-) diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index b35ed8e84b..7f638c93dd 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -980,8 +980,16 @@ impl TelemetryState { // will only be updated if the current backlog is not empty. // If the sequences is not empty, then it is possible to simple remove the backlog for that path and insert the sequences. if sequences.is_empty() { - for key in sequences.iter() { - self.backlog_remove(*key, chain_id, channel_id, port_id, counterparty_chain_id) + if let Some(path_backlog) = self.backlogs.get(&path_uid) { + let current_keys: Vec = path_backlog + .value() + .iter() + .map(|entry| *entry.key()) + .collect(); + + for key in current_keys.iter() { + self.backlog_remove(*key, chain_id, channel_id, port_id, counterparty_chain_id) + } } } else { self.backlogs.remove(&path_uid); @@ -1194,7 +1202,7 @@ impl AggregatorSelector for CustomAggregatorSelector { } } -#[cfg(feature = "telemetry")] +#[cfg(all(feature = "telemetry", test))] mod tests { use prometheus::proto::Metric; @@ -1300,6 +1308,59 @@ mod tests { ); } + #[test] + fn update_backlog_empty() { + let state = TelemetryState::new( + Range { + start: 0, + end: 5000, + }, + 5, + Range { + start: 0, + end: 5000, + }, + 5, + ); + + let chain_id = ChainId::from_string("chain-test"); + let counterparty_chain_id = ChainId::from_string("counterpartychain-test"); + let channel_id = ChannelId::new(0); + let port_id = PortId::transfer(); + + state.backlog_insert(1, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(2, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(3, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(4, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(5, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + + state.update_backlog( + vec![], + &chain_id, + &channel_id, + &port_id, + &counterparty_chain_id, + ); + + let metrics = state.exporter.registry().gather().clone(); + let backlog_size = metrics + .iter() + .find(|&metric| metric.get_name() == "backlog_size") + .unwrap(); + assert!( + assert_metric_value(backlog_size.get_metric(), 0), + "expected backlog_size to be 0" + ); + let backlog_oldest_sequence = metrics + .iter() + .find(|&metric| metric.get_name() == "backlog_oldest_sequence") + .unwrap(); + assert!( + assert_metric_value(backlog_oldest_sequence.get_metric(), 0), + "expected backlog_oldest_sequence to be 0" + ); + } + fn assert_metric_value(metric: &[Metric], expected: u64) -> bool { metric .iter() From 802633479153f6a90b61a2e7b2044acc7761b083 Mon Sep 17 00:00:00 2001 From: Romain Ruetschi Date: Thu, 14 Dec 2023 15:47:33 +0100 Subject: [PATCH 10/11] Guard tests module by test cfg Signed-off-by: Romain Ruetschi --- crates/telemetry/src/state.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index 7f638c93dd..b2aa9cd346 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -1203,6 +1203,7 @@ impl AggregatorSelector for CustomAggregatorSelector { } #[cfg(all(feature = "telemetry", test))] +#[cfg(test)] mod tests { use prometheus::proto::Metric; From 69896f5418db7f2a8f044ea8c70be4d63c345825 Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Thu, 14 Dec 2023 15:53:21 +0100 Subject: [PATCH 11/11] Update from GitHub suggestions --- crates/relayer/src/chain/counterparty.rs | 2 +- crates/telemetry/src/state.rs | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/relayer/src/chain/counterparty.rs b/crates/relayer/src/chain/counterparty.rs index 6cb3a34bd6..b99699d3b0 100644 --- a/crates/relayer/src/chain/counterparty.rs +++ b/crates/relayer/src/chain/counterparty.rs @@ -507,7 +507,7 @@ pub fn unreceived_packets( update_backlog, commit_sequences .iter() - .map(|s| std::convert::Into::::into(*s)) + .map(|s| u64::from(*s)) .collect::>() .clone(), &counterparty_chain.id(), diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index b2aa9cd346..b0aff925df 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -1202,7 +1202,6 @@ impl AggregatorSelector for CustomAggregatorSelector { } } -#[cfg(all(feature = "telemetry", test))] #[cfg(test)] mod tests { use prometheus::proto::Metric;