From e30d17c1876e6f3815ce853dd2988dc06f44938f Mon Sep 17 00:00:00 2001 From: Luca Joss <43531661+ljoss17@users.noreply.github.com> Date: Wed, 6 Dec 2023 09:17:00 +0100 Subject: [PATCH 1/8] Improve `broadcast_errors` metric (#3719) * Use short description for 'broadcast_errors' metric * Add changelog entry * Fix hyperlink in doc comment Signed-off-by: Romain Ruetschi * Fix hyperlink in doc comment --------- Signed-off-by: Romain Ruetschi Co-authored-by: Romain Ruetschi --- .../3720-fix-broadcasting-errors-metric.md | 3 + crates/telemetry/src/broadcast_error.rs | 429 ++++++++++++++++++ crates/telemetry/src/lib.rs | 1 + crates/telemetry/src/state.rs | 9 +- 4 files changed, 438 insertions(+), 4 deletions(-) create mode 100644 .changelog/unreleased/bug-fixes/ibc-telemetry/3720-fix-broadcasting-errors-metric.md create mode 100644 crates/telemetry/src/broadcast_error.rs diff --git a/.changelog/unreleased/bug-fixes/ibc-telemetry/3720-fix-broadcasting-errors-metric.md b/.changelog/unreleased/bug-fixes/ibc-telemetry/3720-fix-broadcasting-errors-metric.md new file mode 100644 index 0000000000..3da82edaf4 --- /dev/null +++ b/.changelog/unreleased/bug-fixes/ibc-telemetry/3720-fix-broadcasting-errors-metric.md @@ -0,0 +1,3 @@ +- Fix the issue where `broadcast_errors` metric would not correctly batch + the same errors together. + together ([\#3720](https://github.com/informalsystems/hermes/issues/3720)) \ No newline at end of file diff --git a/crates/telemetry/src/broadcast_error.rs b/crates/telemetry/src/broadcast_error.rs new file mode 100644 index 0000000000..1d092f08d6 --- /dev/null +++ b/crates/telemetry/src/broadcast_error.rs @@ -0,0 +1,429 @@ +//! The BroadcastError is used by the telemetry in order to correctly batch +//! together the error reports from ibc-go or Cosmos SDK. +//! When a broadcast error is received by Hermes it will contain a code and +//! a description, but the code description depends on the source of the error. +//! For example Cosmos SDK error code 13 is "insufficient fee", and error +//! code 13 for Ibc Go is "invalid packet". +//! The description might contain some variables for example error 32 would be: +//! "account sequence mismatch, expected 1234, got 1235: incorrect account sequence" +//! The BroadcastError will reduce the description to simple: "incorrect account sequence" +//! +//! Cosmos SDK errors: +//! Ibc Go errors: + +pub struct BroadcastError { + pub code: u32, + pub description: String, +} + +impl BroadcastError { + pub fn new(code: u32, description: &str) -> Self { + let short_description = get_short_description(code, description); + Self { + code, + description: short_description, + } + } +} + +fn get_short_description(code: u32, description: &str) -> String { + match code { + 2 => { + let sdk_error = "tx parse error"; + let ibc_go_error = "channel already exists"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 3 => { + let sdk_error = "invalid sequence"; + let ibc_go_error = "channel not found"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 4 => { + let sdk_error = "unauthorized"; + let ibc_go_error = "invalid channel"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 5 => { + let sdk_error = "insufficient funds"; + let ibc_go_error = "invalid channel state"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 6 => { + let sdk_error = "unknown request"; + let ibc_go_error = "invalid channel ordering"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 7 => { + let sdk_error = "invalid address"; + let ibc_go_error = "invalid counterparty channel"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 8 => { + let sdk_error = "invalid pubkey"; + let ibc_go_error = "invalid channel capability"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 9 => { + let sdk_error = "unknown address"; + let ibc_go_error = "channel capability not found"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 10 => { + let sdk_error = "invalid coins"; + let ibc_go_error = "sequence send not found"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 11 => { + let sdk_error = "out of gas"; + let ibc_go_error = "sequence receive not found"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 12 => { + let sdk_error = "memo too large"; + let ibc_go_error = "sequence acknowledgement not found"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 13 => { + let sdk_error = "insufficient fee"; + let ibc_go_error = "invalid packet"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 14 => { + let sdk_error = "maximum number of signatures exceeded"; + let ibc_go_error = "packet timeout"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 15 => { + let sdk_error = "no signatures supplied"; + let ibc_go_error = "too many connection hops"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 16 => { + let sdk_error = "failed to marshal JSON bytes"; + let ibc_go_error = "invalid acknowledgement"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 17 => { + let sdk_error = "failed to unmarshal JSON bytes"; + let ibc_go_error = "acknowledgement for packet already exists"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 18 => { + let sdk_error = "invalid request"; + let ibc_go_error = "invalid channel identifier"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 19 => { + let sdk_error = "tx already in mempool"; + let ibc_go_error = "packet already received"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 20 => { + let sdk_error = "mempool is full"; + let ibc_go_error = "packet commitment not found"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 21 => { + let sdk_error = "tx too large"; + let ibc_go_error = "packet sequence is out of order"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 22 => { + let sdk_error = "key not found"; + let ibc_go_error = "packet messages are redundant"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 23 => { + let sdk_error = "invalid account password"; + let ibc_go_error = "message is redundant, no-op will be performed"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 24 => { + let sdk_error = "tx intended signer does not match the given signer"; + let ibc_go_error = "invalid channel version"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 25 => { + let sdk_error = "invalid gas adjustment"; + let ibc_go_error = "packet has not been sent"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 26 => { + let sdk_error = "invalid height"; + let ibc_go_error = "invalid packet timeout"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else if description.contains(ibc_go_error) { + Some(ibc_go_error.to_owned()) + } else { + None + } + } + 27 => { + let sdk_error = "invalid version"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else { + None + } + } + 28 => { + let sdk_error = "invalid chain-id"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else { + None + } + } + 29 => { + let sdk_error = "invalid type"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else { + None + } + } + 30 => { + let sdk_error = "tx timeout height"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else { + None + } + } + 31 => { + let sdk_error = "unknown extension options"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else { + None + } + } + 32 => { + let sdk_error = "incorrect account sequence"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else { + None + } + } + 33 => { + let sdk_error = "failed packing protobuf message to Any"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else { + None + } + } + 34 => { + let sdk_error = "failed unpacking protobuf message from Any"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else { + None + } + } + 35 => { + let sdk_error = "internal logic error"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else { + None + } + } + 36 => { + let sdk_error = "conflict"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else { + None + } + } + 37 => { + let sdk_error = "feature not supported"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else { + None + } + } + 38 => { + let sdk_error = "not found"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else { + None + } + } + 39 => { + let sdk_error = "Internal IO error"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else { + None + } + } + 40 => { + let sdk_error = "error in app.toml"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else { + None + } + } + 41 => { + let sdk_error = "invalid gas limit"; + if description.contains(sdk_error) { + Some(sdk_error.to_owned()) + } else { + None + } + } + _ => None, + } + .unwrap_or_else(|| "unknown error".to_owned()) +} diff --git a/crates/telemetry/src/lib.rs b/crates/telemetry/src/lib.rs index aa2c8498ce..a26c7177e8 100644 --- a/crates/telemetry/src/lib.rs +++ b/crates/telemetry/src/lib.rs @@ -1,3 +1,4 @@ +pub mod broadcast_error; pub mod encoder; mod path_identifier; pub mod server; diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index 4abdb93740..ab27df1fb8 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -22,7 +22,7 @@ use ibc_relayer_types::{ use tendermint::Time; -use crate::path_identifier::PathIdentifier; +use crate::{broadcast_error::BroadcastError, path_identifier::PathIdentifier}; const EMPTY_BACKLOG_SYMBOL: u64 = 0; const BACKLOG_CAPACITY: usize = 1000; @@ -1082,13 +1082,14 @@ impl TelemetryState { /// Add an error and its description to the list of errors observed after broadcasting /// a Tx with a specific account. - pub fn broadcast_errors(&self, address: &String, error_code: u32, error_description: &String) { + pub fn broadcast_errors(&self, address: &String, error_code: u32, error_description: &str) { let cx = Context::current(); + let broadcast_error = BroadcastError::new(error_code, error_description); let labels = &[ KeyValue::new("account", address.to_string()), - KeyValue::new("error_code", error_code.to_string()), - KeyValue::new("error_description", error_description.to_string()), + KeyValue::new("error_code", broadcast_error.code.to_string()), + KeyValue::new("error_description", broadcast_error.description), ]; self.broadcast_errors.add(&cx, 1, labels); From dcfc85a97874c5e56fc3aec6229b8fa58619ecfd Mon Sep 17 00:00:00 2001 From: Romain Ruetschi Date: Wed, 13 Dec 2023 17:51:04 +0100 Subject: [PATCH 2/8] Fetch the light block at trusted_height + 1 when detecting misbehavior (#3727) * fetch the light block at trusted_height+1 when detecting misbehaviour * Add changelog entry * Fix typo --------- Co-authored-by: beer-1 <147697694+beer-1@users.noreply.github.com> --- .../bug-fixes/ibc-relayer-cli/3697-fix-evidence-report.md | 4 ++++ crates/relayer/src/light_client/tendermint.rs | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 .changelog/unreleased/bug-fixes/ibc-relayer-cli/3697-fix-evidence-report.md diff --git a/.changelog/unreleased/bug-fixes/ibc-relayer-cli/3697-fix-evidence-report.md b/.changelog/unreleased/bug-fixes/ibc-relayer-cli/3697-fix-evidence-report.md new file mode 100644 index 0000000000..232aa6c15a --- /dev/null +++ b/.changelog/unreleased/bug-fixes/ibc-relayer-cli/3697-fix-evidence-report.md @@ -0,0 +1,4 @@ +- Fix a bug in the `evidence` command which would sometimes + prevent the detected misbehaviour evidence from being submitted, + instead erroring out with a validator set hash mismatch. + ([\#3697](https://github.com/informalsystems/hermes/pull/3697)) diff --git a/crates/relayer/src/light_client/tendermint.rs b/crates/relayer/src/light_client/tendermint.rs index e3ca6ef0ba..f8efc16a45 100644 --- a/crates/relayer/src/light_client/tendermint.rs +++ b/crates/relayer/src/light_client/tendermint.rs @@ -173,7 +173,8 @@ impl super::LightClient for LightClient { provider: self.peer_id, }; - let trusted_block = self.fetch(update_header.trusted_height)?; + // Get the light block at trusted_height + 1 from chain. + let trusted_block = self.fetch(update_header.trusted_height.increment())?; if trusted_block.validators.hash() != update_header.trusted_validator_set.hash() { return Err(Error::misbehaviour(format!( "mismatch between the trusted validator set of the update \ From 000e9f7e8bc485cb44226d4e7b179299d17b54da Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Thu, 14 Dec 2023 13:56:38 +0100 Subject: [PATCH 3/8] Initialise v1.7.4 release --- .../3697-fix-evidence-report.md | 0 .../3720-fix-broadcasting-errors-metric.md | 0 .changelog/v1.7.4/summary.md | 5 +++++ CHANGELOG.md | 20 +++++++++++++++++++ Cargo.lock | 16 +++++++-------- crates/chain-registry/Cargo.toml | 4 ++-- crates/relayer-cli/Cargo.toml | 12 +++++------ crates/relayer-rest/Cargo.toml | 6 +++--- crates/relayer-rest/tests/mock.rs | 2 +- crates/relayer-types/Cargo.toml | 2 +- crates/relayer/Cargo.toml | 8 ++++---- crates/relayer/src/lib.rs | 2 +- crates/telemetry/Cargo.toml | 4 ++-- guide/README.md | 2 +- guide/src/SUMMARY.md | 2 +- guide/src/templates/hermes-version.md | 2 +- tools/integration-test/Cargo.toml | 2 +- tools/test-framework/Cargo.toml | 8 ++++---- 18 files changed, 61 insertions(+), 36 deletions(-) rename .changelog/{unreleased => v1.7.4}/bug-fixes/ibc-relayer-cli/3697-fix-evidence-report.md (100%) rename .changelog/{unreleased => v1.7.4}/bug-fixes/ibc-telemetry/3720-fix-broadcasting-errors-metric.md (100%) create mode 100644 .changelog/v1.7.4/summary.md diff --git a/.changelog/unreleased/bug-fixes/ibc-relayer-cli/3697-fix-evidence-report.md b/.changelog/v1.7.4/bug-fixes/ibc-relayer-cli/3697-fix-evidence-report.md similarity index 100% rename from .changelog/unreleased/bug-fixes/ibc-relayer-cli/3697-fix-evidence-report.md rename to .changelog/v1.7.4/bug-fixes/ibc-relayer-cli/3697-fix-evidence-report.md diff --git a/.changelog/unreleased/bug-fixes/ibc-telemetry/3720-fix-broadcasting-errors-metric.md b/.changelog/v1.7.4/bug-fixes/ibc-telemetry/3720-fix-broadcasting-errors-metric.md similarity index 100% rename from .changelog/unreleased/bug-fixes/ibc-telemetry/3720-fix-broadcasting-errors-metric.md rename to .changelog/v1.7.4/bug-fixes/ibc-telemetry/3720-fix-broadcasting-errors-metric.md diff --git a/.changelog/v1.7.4/summary.md b/.changelog/v1.7.4/summary.md new file mode 100644 index 0000000000..073befef6a --- /dev/null +++ b/.changelog/v1.7.4/summary.md @@ -0,0 +1,5 @@ +*December , 2023* + +This release improves the monitoring of Hermes instances by fixing the `broadcast_errors` metric so +that it correctly batches the same errors together. It also improves the metrics `backlog_*` by +updating them whenever Hermes queries pending packets. diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a4c21d3bc..9888948b5f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,25 @@ # CHANGELOG +## v1.7.4 + +*December , 2023* + +This release improves the monitoring of Hermes instances by fixing the `broadcast_errors` metric so +that it correctly batches the same errors together. It also improves the metrics `backlog_*` by +updating them whenever Hermes queries pending packets. + +### BUG FIXES + +- [Relayer CLI](relayer-cli) + - Fix a bug in the `evidence` command which would sometimes + prevent the detected misbehaviour evidence from being submitted, + instead erroring out with a validator set hash mismatch. + ([\#3697](https://github.com/informalsystems/hermes/pull/3697)) +- [Telemetry & Metrics](telemetry) + - Fix the issue where `broadcast_errors` metric would not correctly batch + the same errors together. + together ([\#3720](https://github.com/informalsystems/hermes/issues/3720)) + ## v1.7.3 *November 29th, 2023* diff --git a/Cargo.lock b/Cargo.lock index 9804d7c8d7..41544350e5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1323,7 +1323,7 @@ dependencies = [ [[package]] name = "ibc-chain-registry" -version = "0.26.3" +version = "0.26.4" dependencies = [ "async-trait", "flex-error", @@ -1342,7 +1342,7 @@ dependencies = [ [[package]] name = "ibc-integration-test" -version = "0.26.3" +version = "0.26.4" dependencies = [ "http", "ibc-relayer", @@ -1376,7 +1376,7 @@ dependencies = [ [[package]] name = "ibc-relayer" -version = "0.26.3" +version = "0.26.4" dependencies = [ "anyhow", "async-stream", @@ -1443,7 +1443,7 @@ dependencies = [ [[package]] name = "ibc-relayer-cli" -version = "1.7.3" +version = "1.7.4" dependencies = [ "abscissa_core", "clap", @@ -1484,7 +1484,7 @@ dependencies = [ [[package]] name = "ibc-relayer-rest" -version = "0.26.3" +version = "0.26.4" dependencies = [ "axum", "crossbeam-channel 0.5.8", @@ -1499,7 +1499,7 @@ dependencies = [ [[package]] name = "ibc-relayer-types" -version = "0.26.3" +version = "0.26.4" dependencies = [ "bytes", "derive_more", @@ -1530,7 +1530,7 @@ dependencies = [ [[package]] name = "ibc-telemetry" -version = "0.26.3" +version = "0.26.4" dependencies = [ "axum", "dashmap", @@ -1549,7 +1549,7 @@ dependencies = [ [[package]] name = "ibc-test-framework" -version = "0.26.3" +version = "0.26.4" dependencies = [ "color-eyre", "crossbeam-channel 0.5.8", diff --git a/crates/chain-registry/Cargo.toml b/crates/chain-registry/Cargo.toml index 1d8a714993..12553ba8af 100644 --- a/crates/chain-registry/Cargo.toml +++ b/crates/chain-registry/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ibc-chain-registry" -version = "0.26.3" +version = "0.26.4" edition = "2021" license = "Apache-2.0" keywords = ["cosmos", "ibc", "relayer", "chain", "registry"] @@ -12,7 +12,7 @@ description = """ """ [dependencies] -ibc-relayer-types = { version = "0.26.3", path = "../relayer-types" } +ibc-relayer-types = { version = "0.26.4", path = "../relayer-types" } ibc-proto = { version = "0.38.0", features = ["serde"] } tendermint-rpc = { version = "0.34.0", features = ["http-client", "websocket-client"] } diff --git a/crates/relayer-cli/Cargo.toml b/crates/relayer-cli/Cargo.toml index 98c3272e84..d23e7729fb 100644 --- a/crates/relayer-cli/Cargo.toml +++ b/crates/relayer-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ibc-relayer-cli" -version = "1.7.3" +version = "1.7.4" edition = "2021" license = "Apache-2.0" readme = "README.md" @@ -25,11 +25,11 @@ telemetry = ["ibc-relayer/telemetry", "ibc-telemetry"] rest-server = ["ibc-relayer-rest"] [dependencies] -ibc-relayer-types = { version = "0.26.3", path = "../relayer-types" } -ibc-relayer = { version = "0.26.3", path = "../relayer" } -ibc-telemetry = { version = "0.26.3", path = "../telemetry", optional = true } -ibc-relayer-rest = { version = "0.26.3", path = "../relayer-rest", optional = true } -ibc-chain-registry = { version = "0.26.3" , path = "../chain-registry" } +ibc-relayer-types = { version = "0.26.4", path = "../relayer-types" } +ibc-relayer = { version = "0.26.4", path = "../relayer" } +ibc-telemetry = { version = "0.26.4", path = "../telemetry", optional = true } +ibc-relayer-rest = { version = "0.26.4", path = "../relayer-rest", optional = true } +ibc-chain-registry = { version = "0.26.4" , path = "../chain-registry" } clap = { version = "3.2", features = ["cargo"] } clap_complete = "3.2" diff --git a/crates/relayer-rest/Cargo.toml b/crates/relayer-rest/Cargo.toml index 5edaaa07c8..6e07128396 100644 --- a/crates/relayer-rest/Cargo.toml +++ b/crates/relayer-rest/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ibc-relayer-rest" -version = "0.26.3" +version = "0.26.4" authors = ["Informal Systems "] edition = "2021" license = "Apache-2.0" @@ -14,8 +14,8 @@ description = """ """ [dependencies] -ibc-relayer-types = { version = "0.26.3", path = "../relayer-types" } -ibc-relayer = { version = "0.26.3", path = "../relayer" } +ibc-relayer-types = { version = "0.26.4", path = "../relayer-types" } +ibc-relayer = { version = "0.26.4", path = "../relayer" } crossbeam-channel = "0.5" serde = "1.0" diff --git a/crates/relayer-rest/tests/mock.rs b/crates/relayer-rest/tests/mock.rs index fdcc710e37..c17cb8b12e 100644 --- a/crates/relayer-rest/tests/mock.rs +++ b/crates/relayer-rest/tests/mock.rs @@ -64,7 +64,7 @@ async fn version() { let rest_api_version = VersionInfo { name: "ibc-relayer-rest".to_string(), - version: "0.26.3".to_string(), + version: "0.26.4".to_string(), }; let result: JsonResult<_, ()> = JsonResult::Success(vec![version.clone(), rest_api_version]); diff --git a/crates/relayer-types/Cargo.toml b/crates/relayer-types/Cargo.toml index 6092d7041a..5571886e12 100644 --- a/crates/relayer-types/Cargo.toml +++ b/crates/relayer-types/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ibc-relayer-types" -version = "0.26.3" +version = "0.26.4" edition = "2021" license = "Apache-2.0" readme = "README.md" diff --git a/crates/relayer/Cargo.toml b/crates/relayer/Cargo.toml index 37f16f3add..e7a8bd779e 100644 --- a/crates/relayer/Cargo.toml +++ b/crates/relayer/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ibc-relayer" -version = "0.26.3" +version = "0.26.4" edition = "2021" license = "Apache-2.0" readme = "README.md" @@ -21,8 +21,8 @@ telemetry = ["ibc-telemetry"] [dependencies] ibc-proto = { version = "0.38.0", features = ["serde"] } -ibc-telemetry = { version = "0.26.3", path = "../telemetry", optional = true } -ibc-relayer-types = { version = "0.26.3", path = "../relayer-types", features = ["mocks"] } +ibc-telemetry = { version = "0.26.4", path = "../telemetry", optional = true } +ibc-relayer-types = { version = "0.26.4", path = "../relayer-types", features = ["mocks"] } subtle-encoding = "0.5" humantime-serde = "1.1.1" @@ -104,7 +104,7 @@ version = "0.34.0" default-features = false [dev-dependencies] -ibc-relayer-types = { version = "0.26.3", path = "../relayer-types", features = ["mocks"] } +ibc-relayer-types = { version = "0.26.4", path = "../relayer-types", features = ["mocks"] } serial_test = "2.0.0" env_logger = "0.10.0" tracing-subscriber = { version = "0.3.14", features = ["fmt", "env-filter", "json"] } diff --git a/crates/relayer/src/lib.rs b/crates/relayer/src/lib.rs index 968651a0f6..0bca2140c0 100644 --- a/crates/relayer/src/lib.rs +++ b/crates/relayer/src/lib.rs @@ -16,7 +16,7 @@ //! //! For the IBC relayer binary, please see [Hermes] (`ibc-relayer-cli` crate). //! -//! [Hermes]: https://docs.rs/ibc-relayer-cli/1.7.2/ +//! [Hermes]: https://docs.rs/ibc-relayer-cli/1.7.4/ extern crate alloc; diff --git a/crates/telemetry/Cargo.toml b/crates/telemetry/Cargo.toml index 43fe2f4304..b1cf12dd6c 100644 --- a/crates/telemetry/Cargo.toml +++ b/crates/telemetry/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ibc-telemetry" -version = "0.26.3" +version = "0.26.4" edition = "2021" license = "Apache-2.0" readme = "README.md" @@ -13,7 +13,7 @@ description = """ """ [dependencies] -ibc-relayer-types = { version = "0.26.3", path = "../relayer-types" } +ibc-relayer-types = { version = "0.26.4", path = "../relayer-types" } once_cell = "1.17.0" opentelemetry = { version = "0.19.0", features = ["metrics"] } diff --git a/guide/README.md b/guide/README.md index fa91a9112b..b3234796fd 100644 --- a/guide/README.md +++ b/guide/README.md @@ -10,7 +10,7 @@ mdBook is a utility to create modern online books from Markdown files. This guide should be permanently deployed at its latest stable version at [hermes.informal.systems](https://hermes.informal.systems). -Current version: `v1.7.3`. +Current version: `v1.7.4`. The version of this guide is aligned with the [versioning of the ibc crates](../README.md). diff --git a/guide/src/SUMMARY.md b/guide/src/SUMMARY.md index b7340998c1..e85b810e6b 100644 --- a/guide/src/SUMMARY.md +++ b/guide/src/SUMMARY.md @@ -1,6 +1,6 @@ # Summary -# Hermes v1.7.3 +# Hermes v1.7.4 --- - [Introduction](./index.md) diff --git a/guide/src/templates/hermes-version.md b/guide/src/templates/hermes-version.md index 5f152d81a5..a9bbd79f98 100644 --- a/guide/src/templates/hermes-version.md +++ b/guide/src/templates/hermes-version.md @@ -1 +1 @@ -v1.7.3 +v1.7.4 diff --git a/tools/integration-test/Cargo.toml b/tools/integration-test/Cargo.toml index b5a570bc4e..111922cb24 100644 --- a/tools/integration-test/Cargo.toml +++ b/tools/integration-test/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ibc-integration-test" -version = "0.26.3" +version = "0.26.4" edition = "2021" rust-version = "1.70" license = "Apache-2.0" diff --git a/tools/test-framework/Cargo.toml b/tools/test-framework/Cargo.toml index 9c8b96877a..de362af182 100644 --- a/tools/test-framework/Cargo.toml +++ b/tools/test-framework/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ibc-test-framework" -version = "0.26.3" +version = "0.26.4" edition = "2021" license = "Apache-2.0" readme = "README.md" @@ -14,9 +14,9 @@ description = """ """ [dependencies] -ibc-relayer-types = { version = "=0.26.3", path = "../../crates/relayer-types" } -ibc-relayer = { version = "=0.26.3", path = "../../crates/relayer" } -ibc-relayer-cli = { version = "=1.7.3", path = "../../crates/relayer-cli" } +ibc-relayer-types = { version = "=0.26.4", path = "../../crates/relayer-types" } +ibc-relayer = { version = "=0.26.4", path = "../../crates/relayer" } +ibc-relayer-cli = { version = "=1.7.4", path = "../../crates/relayer-cli" } ibc-proto = { version = "0.38.0", features = ["serde"] } tendermint-rpc = { version = "0.34.0", features = ["http-client", "websocket-client"] } From 40ea57c8bc47d1c330654b6b8ae0331929c7cf47 Mon Sep 17 00:00:00 2001 From: Luca Joss <43531661+ljoss17@users.noreply.github.com> Date: Thu, 14 Dec 2023 15:46:01 +0100 Subject: [PATCH 4/8] Avoid retrieving a worker which is being removed by the idle worker clean-up process (#3725) * prevent returning stopping workers * clear only the specific worker * Add ibc-go v8 to integration tests (#3694) * Add simapp v8 to CI jobs * Add simapp v8 to nix flake * Handle CLI breaking changes of Cosmos SDK v0.50 in test bootstrap * Handle genesis config 'voting_perdiod' and CLI 'query txs' changes for Cosmos SDK v0.50.1 * Use 'MsgSubmitProposal' instead of deprecated 'UpgradeProposal' to initiate a chain upgrade * Update 'tx upgrade-chain' CLI template * Fix 'tx chain-upgrade' tests * Update chain upgrade for compatibility between different ibc-go versions * Improve assertion for client upgrade tests * Update ibc-proto-rs to v0.39.0 * Add changelog entry * Fix and improve guide section for client upgrade * Wait before querying client state for client upgrade tests * Apply suggestions from code review Co-authored-by: Romain Ruetschi Signed-off-by: Luca Joss <43531661+ljoss17@users.noreply.github.com> * Rename method 'ibc_version()' to 'version_specs' * Extract the verification of legacy version in a method * Implement 'FromStr' instead of 'TryFrom' for 'ProposalStatus' * Fix cargo-doc warning * Add changelog entry for CLI * Change the '--gov-account' flag as optional but will fail if trying to upgrade a chain with ibc-go v8+ * Update guide * Move and update changelog entry for CLI * Return a 'Specs' struct in 'version_specs()' method * Fix clippy errors --------- Signed-off-by: Luca Joss <43531661+ljoss17@users.noreply.github.com> Co-authored-by: Romain Ruetschi * Update Gaia used in tests from v12 to v13 & v14 (#3700) * Update Gaia v12 to v13 and v14 in tests * Fix Celestia CI job * Replace Gaia v12 with v13 and v14 in multi-chain tests (#3701) * Add changelog entry --------- Signed-off-by: Luca Joss <43531661+ljoss17@users.noreply.github.com> Co-authored-by: beer-1 <147697694+beer-1@users.noreply.github.com> Co-authored-by: Romain Ruetschi --- .../3703-avoid-returning-stopped-worker.md | 3 +++ crates/relayer/src/worker/map.rs | 12 +++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 .changelog/unreleased/bug-fixes/ibc-relayer/3703-avoid-returning-stopped-worker.md diff --git a/.changelog/unreleased/bug-fixes/ibc-relayer/3703-avoid-returning-stopped-worker.md b/.changelog/unreleased/bug-fixes/ibc-relayer/3703-avoid-returning-stopped-worker.md new file mode 100644 index 0000000000..c2ab504b5d --- /dev/null +++ b/.changelog/unreleased/bug-fixes/ibc-relayer/3703-avoid-returning-stopped-worker.md @@ -0,0 +1,3 @@ +- Avoid retrieving a worker which is being removed by the idle worker clean-up + process. + process ([\#3703](https://github.com/informalsystems/hermes/issues/3703)) \ No newline at end of file diff --git a/crates/relayer/src/worker/map.rs b/crates/relayer/src/worker/map.rs index 65d80d1af1..c5305f52cb 100644 --- a/crates/relayer/src/worker/map.rs +++ b/crates/relayer/src/worker/map.rs @@ -125,7 +125,17 @@ impl WorkerMap { config: &Config, ) -> &WorkerHandle { if self.workers.contains_key(&object) { - &self.workers[&object] + if self.workers[&object].shutdown_stopped_tasks() { + self.remove_stopped( + self.workers[&object].id(), + self.workers[&object].object().clone(), + ); + + let worker = self.spawn_worker(src, dst, &object, config); + self.workers.entry(object).or_insert(worker) + } else { + &self.workers[&object] + } } else { let worker = self.spawn_worker(src, dst, &object, config); self.workers.entry(object).or_insert(worker) From 29dcd399b773cef2a90f70658124c29b4d13e917 Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Thu, 14 Dec 2023 16:08:51 +0100 Subject: [PATCH 5/8] Add PR 3703 to patch release --- .../ibc-relayer/3703-avoid-returning-stopped-worker.md | 0 .changelog/v1.7.4/summary.md | 5 ++++- CHANGELOG.md | 9 ++++++++- 3 files changed, 12 insertions(+), 2 deletions(-) rename .changelog/{unreleased => v1.7.4}/bug-fixes/ibc-relayer/3703-avoid-returning-stopped-worker.md (100%) diff --git a/.changelog/unreleased/bug-fixes/ibc-relayer/3703-avoid-returning-stopped-worker.md b/.changelog/v1.7.4/bug-fixes/ibc-relayer/3703-avoid-returning-stopped-worker.md similarity index 100% rename from .changelog/unreleased/bug-fixes/ibc-relayer/3703-avoid-returning-stopped-worker.md rename to .changelog/v1.7.4/bug-fixes/ibc-relayer/3703-avoid-returning-stopped-worker.md diff --git a/.changelog/v1.7.4/summary.md b/.changelog/v1.7.4/summary.md index 073befef6a..bfb5550397 100644 --- a/.changelog/v1.7.4/summary.md +++ b/.changelog/v1.7.4/summary.md @@ -1,5 +1,8 @@ -*December , 2023* +*December 15th, 2023* This release improves the monitoring of Hermes instances by fixing the `broadcast_errors` metric so that it correctly batches the same errors together. It also improves the metrics `backlog_*` by updating them whenever Hermes queries pending packets. + +A fix avoiding packets being discarded if the idle worker clean-up is removing the worker at the same +time the packets are received. \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 9888948b5f..1ce4e162e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,14 +2,21 @@ ## v1.7.4 -*December , 2023* +*December 15th, 2023* This release improves the monitoring of Hermes instances by fixing the `broadcast_errors` metric so that it correctly batches the same errors together. It also improves the metrics `backlog_*` by updating them whenever Hermes queries pending packets. +A fix avoiding packets being discarded if the idle worker clean-up is removing the worker at the same +time the packets are received. + ### BUG FIXES +- [Relayer Library](relayer) + - Avoid retrieving a worker which is being removed by the idle worker clean-up + process. + process ([\#3703](https://github.com/informalsystems/hermes/issues/3703)) - [Relayer CLI](relayer-cli) - Fix a bug in the `evidence` command which would sometimes prevent the detected misbehaviour evidence from being submitted, From fe781b84ec0e10f8ebe77cb6dad6efa093bd9bb4 Mon Sep 17 00:00:00 2001 From: Luca Joss <43531661+ljoss17@users.noreply.github.com> Date: Thu, 14 Dec 2023 17:01:53 +0100 Subject: [PATCH 6/8] Improve `backlog` metrics (#3722) * Update backlog metric when packet clearing is triggered * Fix update backlog and add unit tests * Uncomment conditional compilation from telemetry tests * Add changelog entry * Update guide section regarding 'backlog_*' metrics * Correctly update the backlog when querying commitments on chain * Update backlog_oldest_timestamp to backlog_latest_update_timestamp * Update changelog entry * Fix bug in backlog update * Guard tests module by test cfg Signed-off-by: Romain Ruetschi * Update from GitHub suggestions --------- Signed-off-by: Romain Ruetschi Co-authored-by: Romain Ruetschi --- .../ibc-telemetry/3723-fix-backlog-metrics.md | 4 + crates/relayer/src/chain/counterparty.rs | 14 + crates/telemetry/src/state.rs | 261 +++++++++++++++--- guide/src/assets/grafana_template.json | 2 +- .../documentation/telemetry/integration.md | 8 +- .../src/documentation/telemetry/operators.md | 7 +- 6 files changed, 256 insertions(+), 40 deletions(-) create mode 100644 .changelog/v1.7.4/bug-fixes/ibc-telemetry/3723-fix-backlog-metrics.md diff --git a/.changelog/v1.7.4/bug-fixes/ibc-telemetry/3723-fix-backlog-metrics.md b/.changelog/v1.7.4/bug-fixes/ibc-telemetry/3723-fix-backlog-metrics.md new file mode 100644 index 0000000000..fa95766987 --- /dev/null +++ b/.changelog/v1.7.4/bug-fixes/ibc-telemetry/3723-fix-backlog-metrics.md @@ -0,0 +1,4 @@ +- Update the values of `backlog` metrics when clearing packets. + Change the `backlog_oldest_timestamp` to `backlog_latest_update_timestamp` + which shows the last time the `backlog` metrics have been updated. + ([\#3723](https://github.com/informalsystems/hermes/issues/3723)) \ No newline at end of file diff --git a/crates/relayer/src/chain/counterparty.rs b/crates/relayer/src/chain/counterparty.rs index be9425aae2..b99699d3b0 100644 --- a/crates/relayer/src/chain/counterparty.rs +++ b/crates/relayer/src/chain/counterparty.rs @@ -32,6 +32,7 @@ use crate::channel::ChannelError; use crate::client_state::IdentifiedAnyClientState; use crate::path::PathIdentifiers; use crate::supervisor::Error; +use crate::telemetry; pub fn counterparty_chain_from_connection( src_chain: &impl ChainHandle, @@ -502,6 +503,19 @@ pub fn unreceived_packets( &path.counterparty_channel_id, )?; + telemetry!( + update_backlog, + commit_sequences + .iter() + .map(|s| u64::from(*s)) + .collect::>() + .clone(), + &counterparty_chain.id(), + &path.counterparty_channel_id, + &path.counterparty_port_id, + &chain.id() + ); + let packet_seq_nrs = unreceived_packets_sequences(chain, &path.port_id, &path.channel_id, commit_sequences)?; diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index ab27df1fb8..b0aff925df 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -170,9 +170,9 @@ pub struct TelemetryState { /// SendPacket events were relayed. backlog_oldest_sequence: ObservableGauge, - /// Record the timestamp related to `backlog_oldest_sequence`. + /// Record the timestamp of the last time the `backlog_*` metrics have been updated. /// The timestamp is the time passed since since the unix epoch in seconds. - backlog_oldest_timestamp: ObservableGauge, + backlog_latest_update_timestamp: ObservableGauge, /// Records the length of the backlog, i.e., how many packets are pending. backlog_size: ObservableGauge, @@ -350,10 +350,10 @@ impl TelemetryState { .with_description("Sequence number of the oldest SendPacket event in the backlog") .init(), - backlog_oldest_timestamp: meter - .u64_observable_gauge("backlog_oldest_timestamp") + backlog_latest_update_timestamp: meter + .u64_observable_gauge("backlog_latest_update_timestamp") .with_unit(Unit::new("seconds")) - .with_description("Local timestamp for the oldest SendPacket event in the backlog") + .with_description("Local timestamp for the last time the backlog metrics have been updated") .init(), backlog_size: meter @@ -457,7 +457,7 @@ impl TelemetryState { } self.backlog_oldest_sequence.observe(&cx, 0, labels); - self.backlog_oldest_timestamp.observe(&cx, 0, labels); + self.backlog_latest_update_timestamp.observe(&cx, 0, labels); self.backlog_size.observe(&cx, 0, labels); } @@ -922,8 +922,7 @@ impl TelemetryState { }; // Update the backlog with the incoming data and retrieve the oldest values - let (oldest_sn, oldest_ts, total) = if let Some(path_backlog) = self.backlogs.get(&path_uid) - { + let (oldest_sn, total) = if let Some(path_backlog) = self.backlogs.get(&path_uid) { // Avoid having the inner backlog map growing more than a given threshold, by removing // the oldest sequence number entry. if path_backlog.len() > BACKLOG_RESET_THRESHOLD { @@ -935,20 +934,11 @@ impl TelemetryState { // Return the oldest event information to be recorded in telemetry if let Some(min) = path_backlog.iter().map(|v| *v.key()).min() { - if let Some(oldest) = path_backlog.get(&min) { - (min, *oldest.value(), path_backlog.len() as u64) - } else { - // Timestamp was not found, this should not happen, record a 0 ts. - (min, 0, path_backlog.len() as u64) - } + (min, path_backlog.len() as u64) } else { // We just inserted a new key/value, so this else branch is unlikely to activate, // but it can happen in case of concurrent updates to the backlog. - ( - EMPTY_BACKLOG_SYMBOL, - EMPTY_BACKLOG_SYMBOL, - EMPTY_BACKLOG_SYMBOL, - ) + (EMPTY_BACKLOG_SYMBOL, EMPTY_BACKLOG_SYMBOL) } } else { // If there is no inner backlog for this path, create a new map to store it. @@ -958,16 +948,57 @@ impl TelemetryState { self.backlogs.insert(path_uid, new_path_backlog); // Return the current event information to be recorded in telemetry - (seq_nr, timestamp, 1) + (seq_nr, 1) }; // Update metrics to reflect the new state of the backlog self.backlog_oldest_sequence.observe(&cx, oldest_sn, labels); - self.backlog_oldest_timestamp - .observe(&cx, oldest_ts, labels); + self.backlog_latest_update_timestamp + .observe(&cx, timestamp, labels); self.backlog_size.observe(&cx, total, labels); } + /// Inserts in the backlog a new event for the given sequence number. + /// This happens when the relayer observed a new SendPacket event. + pub fn update_backlog( + &self, + sequences: Vec, + chain_id: &ChainId, + channel_id: &ChannelId, + port_id: &PortId, + counterparty_chain_id: &ChainId, + ) { + // Unique identifier for a chain/channel/port. + let path_uid: PathIdentifier = PathIdentifier::new( + chain_id.to_string(), + channel_id.to_string(), + port_id.to_string(), + ); + + // This condition is done in order to avoid having an incorrect `backlog_latest_update_timestamp`. + // If the sequences is an empty vector by removing the entries using `backlog_remove` the `backlog_latest_update_timestamp` + // will only be updated if the current backlog is not empty. + // If the sequences is not empty, then it is possible to simple remove the backlog for that path and insert the sequences. + if sequences.is_empty() { + if let Some(path_backlog) = self.backlogs.get(&path_uid) { + let current_keys: Vec = path_backlog + .value() + .iter() + .map(|entry| *entry.key()) + .collect(); + + for key in current_keys.iter() { + self.backlog_remove(*key, chain_id, channel_id, port_id, counterparty_chain_id) + } + } + } else { + self.backlogs.remove(&path_uid); + for key in sequences.iter() { + self.backlog_insert(*key, chain_id, channel_id, port_id, counterparty_chain_id) + } + } + } + /// Evicts from the backlog the event for the given sequence number. /// Removing events happens when the relayer observed either an acknowledgment /// or a timeout for a packet sequence number, which means that the corresponding @@ -996,16 +1027,20 @@ impl TelemetryState { KeyValue::new("port", port_id.to_string()), ]; + // Retrieve local timestamp when this SendPacket event was recorded. + let now = Time::now(); + let timestamp = match now.duration_since(Time::unix_epoch()) { + Ok(ts) => ts.as_secs(), + Err(_) => 0, + }; + if let Some(path_backlog) = self.backlogs.get(&path_uid) { if path_backlog.remove(&seq_nr).is_some() { + // If the entry was removed update the latest update timestamp. + self.backlog_latest_update_timestamp + .observe(&cx, timestamp, labels); // The oldest pending sequence number is the minimum key in the inner (path) backlog. if let Some(min_key) = path_backlog.iter().map(|v| *v.key()).min() { - if let Some(oldest) = path_backlog.get(&min_key) { - self.backlog_oldest_timestamp - .observe(&cx, *oldest.value(), labels); - } else { - self.backlog_oldest_timestamp.observe(&cx, 0, labels); - } self.backlog_oldest_sequence.observe(&cx, min_key, labels); self.backlog_size .observe(&cx, path_backlog.len() as u64, labels); @@ -1013,8 +1048,6 @@ impl TelemetryState { // No mimimum found, update the metrics to reflect an empty backlog self.backlog_oldest_sequence .observe(&cx, EMPTY_BACKLOG_SYMBOL, labels); - self.backlog_oldest_timestamp - .observe(&cx, EMPTY_BACKLOG_SYMBOL, labels); self.backlog_size.observe(&cx, EMPTY_BACKLOG_SYMBOL, labels); } } @@ -1156,7 +1189,7 @@ impl AggregatorSelector for CustomAggregatorSelector { match descriptor.name() { "wallet_balance" => Some(Arc::new(last_value())), "backlog_oldest_sequence" => Some(Arc::new(last_value())), - "backlog_oldest_timestamp" => Some(Arc::new(last_value())), + "backlog_latest_update_timestamp" => Some(Arc::new(last_value())), "backlog_size" => Some(Arc::new(last_value())), // Prometheus' supports only collector for histogram, sum, and last value aggregators. // https://docs.rs/opentelemetry-prometheus/0.10.0/src/opentelemetry_prometheus/lib.rs.html#411-418 @@ -1168,3 +1201,169 @@ impl AggregatorSelector for CustomAggregatorSelector { } } } + +#[cfg(test)] +mod tests { + use prometheus::proto::Metric; + + use super::*; + + #[test] + fn insert_remove_backlog() { + let state = TelemetryState::new( + Range { + start: 0, + end: 5000, + }, + 5, + Range { + start: 0, + end: 5000, + }, + 5, + ); + + let chain_id = ChainId::from_string("chain-test"); + let counterparty_chain_id = ChainId::from_string("counterpartychain-test"); + let channel_id = ChannelId::new(0); + let port_id = PortId::transfer(); + + state.backlog_insert(1, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(2, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(3, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(4, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(5, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_remove(3, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_remove(1, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + + let metrics = state.exporter.registry().gather().clone(); + let backlog_size = metrics + .iter() + .find(|metric| metric.get_name() == "backlog_size") + .unwrap(); + assert!( + assert_metric_value(backlog_size.get_metric(), 3), + "expected backlog_size to be 3" + ); + let backlog_oldest_sequence = metrics + .iter() + .find(|&metric| metric.get_name() == "backlog_oldest_sequence") + .unwrap(); + assert!( + assert_metric_value(backlog_oldest_sequence.get_metric(), 2), + "expected backlog_oldest_sequence to be 2" + ); + } + + #[test] + fn update_backlog() { + let state = TelemetryState::new( + Range { + start: 0, + end: 5000, + }, + 5, + Range { + start: 0, + end: 5000, + }, + 5, + ); + + let chain_id = ChainId::from_string("chain-test"); + let counterparty_chain_id = ChainId::from_string("counterpartychain-test"); + let channel_id = ChannelId::new(0); + let port_id = PortId::transfer(); + + state.backlog_insert(1, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(2, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(3, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(4, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(5, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + + state.update_backlog( + vec![5], + &chain_id, + &channel_id, + &port_id, + &counterparty_chain_id, + ); + + let metrics = state.exporter.registry().gather().clone(); + let backlog_size = metrics + .iter() + .find(|&metric| metric.get_name() == "backlog_size") + .unwrap(); + assert!( + assert_metric_value(backlog_size.get_metric(), 1), + "expected backlog_size to be 1" + ); + let backlog_oldest_sequence = metrics + .iter() + .find(|&metric| metric.get_name() == "backlog_oldest_sequence") + .unwrap(); + assert!( + assert_metric_value(backlog_oldest_sequence.get_metric(), 5), + "expected backlog_oldest_sequence to be 5" + ); + } + + #[test] + fn update_backlog_empty() { + let state = TelemetryState::new( + Range { + start: 0, + end: 5000, + }, + 5, + Range { + start: 0, + end: 5000, + }, + 5, + ); + + let chain_id = ChainId::from_string("chain-test"); + let counterparty_chain_id = ChainId::from_string("counterpartychain-test"); + let channel_id = ChannelId::new(0); + let port_id = PortId::transfer(); + + state.backlog_insert(1, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(2, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(3, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(4, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + state.backlog_insert(5, &chain_id, &channel_id, &port_id, &counterparty_chain_id); + + state.update_backlog( + vec![], + &chain_id, + &channel_id, + &port_id, + &counterparty_chain_id, + ); + + let metrics = state.exporter.registry().gather().clone(); + let backlog_size = metrics + .iter() + .find(|&metric| metric.get_name() == "backlog_size") + .unwrap(); + assert!( + assert_metric_value(backlog_size.get_metric(), 0), + "expected backlog_size to be 0" + ); + let backlog_oldest_sequence = metrics + .iter() + .find(|&metric| metric.get_name() == "backlog_oldest_sequence") + .unwrap(); + assert!( + assert_metric_value(backlog_oldest_sequence.get_metric(), 0), + "expected backlog_oldest_sequence to be 0" + ); + } + + fn assert_metric_value(metric: &[Metric], expected: u64) -> bool { + metric + .iter() + .any(|m| m.get_gauge().get_value() as u64 == expected) + } +} diff --git a/guide/src/assets/grafana_template.json b/guide/src/assets/grafana_template.json index 2d7e91454a..80846ea8cf 100644 --- a/guide/src/assets/grafana_template.json +++ b/guide/src/assets/grafana_template.json @@ -958,7 +958,7 @@ }, "editorMode": "builder", "exemplar": false, - "expr": "backlog_oldest_timestamp{job=\"hermes\"}", + "expr": "backlog_latest_update_timestamp{job=\"hermes\"}", "format": "table", "hide": false, "instant": true, diff --git a/guide/src/documentation/telemetry/integration.md b/guide/src/documentation/telemetry/integration.md index 451f95b9a7..a33af5f656 100644 --- a/guide/src/documentation/telemetry/integration.md +++ b/guide/src/documentation/telemetry/integration.md @@ -22,10 +22,10 @@ acknowledgment_packets_confirmed_total{dst_chain="ibc-1",dst_channel="channel-0" # TYPE backlog_oldest_sequence gauge backlog_oldest_sequence{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 0 backlog_oldest_sequence{chain="ibc-1",channel="channel-0",counterparty="ibc-0",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 0 -# HELP backlog_oldest_timestamp Local timestamp for the oldest SendPacket event in the backlog -# TYPE backlog_oldest_timestamp gauge -backlog_oldest_timestamp{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 0 -backlog_oldest_timestamp{chain="ibc-1",channel="channel-0",counterparty="ibc-0",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 0 +# HELP backlog_latest_update_timestamp Local timestamp for the last time the backlog metrics have been updated +# TYPE backlog_latest_update_timestamp gauge +backlog_latest_update_timestamp{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 0 +backlog_latest_update_timestamp{chain="ibc-1",channel="channel-0",counterparty="ibc-0",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 0 # HELP backlog_size Total number of SendPacket events in the backlog # TYPE backlog_size gauge backlog_size{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 0 diff --git a/guide/src/documentation/telemetry/operators.md b/guide/src/documentation/telemetry/operators.md index c550232beb..96204843c4 100644 --- a/guide/src/documentation/telemetry/operators.md +++ b/guide/src/documentation/telemetry/operators.md @@ -121,7 +121,7 @@ Since Hermes v1, we also introduced 3 metrics that sketch the backlog status of | Name | Description | OpenTelemetry type | Configuration Dependencies | | -------------------------- | -------------------------------------------------------------- | ------------------- | -------------------------- | | `backlog_oldest_sequence` | Sequence number of the oldest SendPacket event in the backlog | `u64` ValueRecorder | Packet workers enabled | -| `backlog_oldest_timestamp` | Local timestamp for the oldest SendPacket event in the backlog | `u64` ValueRecorder | Packet workers enabled | +| `backlog_latest_update_timestamp` | Local timestamp for the last time the backlog metrics have been updated | `u64` ValueRecorder | Packet workers enabled | | `backlog_size` | Total number of SendPacket events in the backlog | `u64` ValueRecorder | Packet workers enabled | @@ -129,9 +129,8 @@ Notes: - The `backlog_size` defines how many IBC packets users sent and were not yet relayed (i.e., received on the destination network, or timed-out). If this metric is increasing, it signals that the packet queue is increasing and there may be some errors in the Hermes logs that need your attention. -- If the `backlog_oldest_sequence` remains unchanged for more than a few minutes, that means that the packet with the respective sequence number is likely blocked -and cannot be relayed. To understand for how long the packet is block, Hermes will populate `backlog_oldest_timestamp` with the local time when it first observed -the `backlog_oldest_sequence` that is blocked. +- The `backlog_latest_update_timestamp` is used to get information on the reliability of the `backlog_*` metrics. If the timestamp doesn't change it means there might be an issue with the metrics. +- __NOTE__: The Hermes instance might miss the acknowledgment of an observed IBC packets relayed, this will cause the `backlog_*` metrics to contain an invalid value. In order to minimise this issue, whenever the Hermes instance clears packets the `backlog_*` metrics will be updated using the queried pending packets. ## How efficient and how secure is the IBC status on each network? From e246ac816e66e17d58422775b911a5899c3651d8 Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Thu, 14 Dec 2023 17:18:03 +0100 Subject: [PATCH 7/8] Update changelog --- .changelog/v1.7.4/summary.md | 3 +-- CHANGELOG.md | 12 ++++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.changelog/v1.7.4/summary.md b/.changelog/v1.7.4/summary.md index bfb5550397..d1571730af 100644 --- a/.changelog/v1.7.4/summary.md +++ b/.changelog/v1.7.4/summary.md @@ -4,5 +4,4 @@ This release improves the monitoring of Hermes instances by fixing the `broadcas that it correctly batches the same errors together. It also improves the metrics `backlog_*` by updating them whenever Hermes queries pending packets. -A fix avoiding packets being discarded if the idle worker clean-up is removing the worker at the same -time the packets are received. \ No newline at end of file +Some additional stability has been added in the idle worker clean-up and the `evidence` command. \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ce4e162e9..6415a31fc8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,12 +4,13 @@ *December 15th, 2023* +Special thanks to Yun Yeo (@beer-1) for his contributions ([#3697] and [#3703]). + This release improves the monitoring of Hermes instances by fixing the `broadcast_errors` metric so that it correctly batches the same errors together. It also improves the metrics `backlog_*` by updating them whenever Hermes queries pending packets. -A fix avoiding packets being discarded if the idle worker clean-up is removing the worker at the same -time the packets are received. +Some additional stability has been added in the idle worker clean-up and the `evidence` command. ### BUG FIXES @@ -26,6 +27,13 @@ time the packets are received. - Fix the issue where `broadcast_errors` metric would not correctly batch the same errors together. together ([\#3720](https://github.com/informalsystems/hermes/issues/3720)) + - Update the values of `backlog` metrics when clearing packets. + Change the `backlog_oldest_timestamp` to `backlog_latest_update_timestamp` + which shows the last time the `backlog` metrics have been updated. + ([\#3723](https://github.com/informalsystems/hermes/issues/3723)) + +[#3697]: https://github.com/informalsystems/hermes/issues/3697 +[#3703]: https://github.com/informalsystems/hermes/issues/3703 ## v1.7.3 From 44786816f09b3ed469b105da0ad93bbea3f005cf Mon Sep 17 00:00:00 2001 From: Romain Ruetschi Date: Fri, 15 Dec 2023 09:42:41 +0100 Subject: [PATCH 8/8] Update release summary --- .changelog/v1.7.4/summary.md | 4 +++- CHANGELOG.md | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.changelog/v1.7.4/summary.md b/.changelog/v1.7.4/summary.md index d1571730af..00b15e27f1 100644 --- a/.changelog/v1.7.4/summary.md +++ b/.changelog/v1.7.4/summary.md @@ -4,4 +4,6 @@ This release improves the monitoring of Hermes instances by fixing the `broadcas that it correctly batches the same errors together. It also improves the metrics `backlog_*` by updating them whenever Hermes queries pending packets. -Some additional stability has been added in the idle worker clean-up and the `evidence` command. \ No newline at end of file +This release also improves the reliability of the idle worker clean-up and +fixes a bug within the `evidence` command which would sometimes prevent +the misbehaviour evidence from being reported. diff --git a/CHANGELOG.md b/CHANGELOG.md index 6415a31fc8..eef66be240 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,9 @@ This release improves the monitoring of Hermes instances by fixing the `broadcas that it correctly batches the same errors together. It also improves the metrics `backlog_*` by updating them whenever Hermes queries pending packets. -Some additional stability has been added in the idle worker clean-up and the `evidence` command. +This release also improves the reliability of the idle worker clean-up and +fixes a bug within the `evidence` command which would sometimes prevent +the misbehaviour evidence from being reported. ### BUG FIXES