From 41b79555137806a777a79c0290d126727ee0f547 Mon Sep 17 00:00:00 2001 From: Techno Freak Date: Sat, 10 Feb 2024 18:25:57 +0300 Subject: [PATCH 1/8] feat: add simulate errors metrics --- crates/relayer/src/chain/cosmos/estimate.rs | 19 ++++++++++++++++- crates/telemetry/src/state.rs | 23 +++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/crates/relayer/src/chain/cosmos/estimate.rs b/crates/relayer/src/chain/cosmos/estimate.rs index 87a18a26b4..25d00329a5 100644 --- a/crates/relayer/src/chain/cosmos/estimate.rs +++ b/crates/relayer/src/chain/cosmos/estimate.rs @@ -15,6 +15,7 @@ use crate::config::types::Memo; use crate::error::Error; use crate::keyring::Secp256k1KeyPair; use crate::util::pretty::PrettyFee; +use crate::telemetry; pub async fn estimate_tx_fees( config: &TxConfig, @@ -51,6 +52,7 @@ pub async fn estimate_tx_fees( &config.rpc_address, &config.chain_id, tx, + account, ) .await?; @@ -63,6 +65,7 @@ async fn estimate_fee_with_tx( rpc_address: &Url, chain_id: &ChainId, tx: Tx, + account: &Account, ) -> Result { let estimated_gas = { crate::time!( @@ -72,7 +75,7 @@ async fn estimate_fee_with_tx( } ); - estimate_gas_with_tx(gas_config, grpc_address, tx).await + estimate_gas_with_tx(gas_config, grpc_address, tx, account).await }?; if estimated_gas > gas_config.max_gas { @@ -112,6 +115,7 @@ async fn estimate_gas_with_tx( gas_config: &GasConfig, grpc_address: &Uri, tx: Tx, + account: &Account, ) -> Result { let simulated_gas = send_tx_simulate(grpc_address, tx) .await @@ -147,6 +151,12 @@ async fn estimate_gas_with_tx( e.detail() ); + telemetry!( + simulate_errors, + &account.address.to_string(), + true, + ); + Ok(gas_config.default_gas) } @@ -155,6 +165,13 @@ async fn estimate_gas_with_tx( "failed to simulate tx. propagating error to caller: {}", e.detail() ); + + telemetry!( + simulate_errors, + &account.address.to_string(), + false, + ); + // Propagate the error, the retrying mechanism at caller may catch & retry. Err(e) } diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index bf48381d1f..fb084dbfd4 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -201,6 +201,9 @@ pub struct TelemetryState { /// Number of errors observed by Hermes when broadcasting a Tx broadcast_errors: Counter, + /// Number of errors observed by Hermes when simulating a Tx + simulate_errors: Counter, + /// The EIP-1559 base fee queried dynamic_gas_queried_fees: ObservableGauge, @@ -394,6 +397,13 @@ impl TelemetryState { ) .init(), + simulate_errors: meter + .u64_counter("simulate_errors") + .with_description( + "Number of errors observed by Hermes when simulating a Tx", + ) + .init(), + dynamic_gas_queried_fees: meter .f64_observable_gauge("dynamic_gas_queried_fees") .with_description("The EIP-1559 base fee queried") @@ -1160,6 +1170,19 @@ impl TelemetryState { self.broadcast_errors.add(&cx, 1, labels); } + /// Add an error and its description to the list of errors observed after simulating + /// a Tx with a specific account. + pub fn simulate_errors(&self, address: &String, recoverable: bool) { + let cx = Context::current(); + + let labels = &[ + KeyValue::new("account", address.to_string()), + KeyValue::new("recoverable", recoverable.to_string()), + ]; + + self.simulate_errors.add(&cx, 1, labels); + } + pub fn dynamic_gas_queried_fees(&self, chain_id: &ChainId, amount: f64) { let cx = Context::current(); From 7b73f9e1e4ed02548418089b0618d149e9aa0876 Mon Sep 17 00:00:00 2001 From: Techno Freak Date: Sat, 10 Feb 2024 21:20:51 +0300 Subject: [PATCH 2/8] feat: add error message --- crates/relayer/src/chain/cosmos/estimate.rs | 12 ++++++++++++ crates/telemetry/src/state.rs | 3 ++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/crates/relayer/src/chain/cosmos/estimate.rs b/crates/relayer/src/chain/cosmos/estimate.rs index 25d00329a5..a8df073511 100644 --- a/crates/relayer/src/chain/cosmos/estimate.rs +++ b/crates/relayer/src/chain/cosmos/estimate.rs @@ -155,6 +155,7 @@ async fn estimate_gas_with_tx( simulate_errors, &account.address.to_string(), true, + get_error_text(&e), ); Ok(gas_config.default_gas) @@ -170,6 +171,7 @@ async fn estimate_gas_with_tx( simulate_errors, &account.address.to_string(), false, + get_error_text(&e), ); // Propagate the error, the retrying mechanism at caller may catch & retry. @@ -192,3 +194,13 @@ fn can_recover_from_simulation_failure(e: &Error) -> bool { _ => false, } } + + +fn get_error_text(e: &Error) -> String { + use crate::error::ErrorDetail::*; + + match e.detail() { + GrpcStatus(detail) => detail.status.code().to_string(), + detail => detail.to_string(), + } +} diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index fb084dbfd4..465a367044 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -1172,12 +1172,13 @@ impl TelemetryState { /// Add an error and its description to the list of errors observed after simulating /// a Tx with a specific account. - pub fn simulate_errors(&self, address: &String, recoverable: bool) { + pub fn simulate_errors(&self, address: &String, recoverable: bool, error_description: String) { let cx = Context::current(); let labels = &[ KeyValue::new("account", address.to_string()), KeyValue::new("recoverable", recoverable.to_string()), + KeyValue::new("error_description", error_description.to_owned()), ]; self.simulate_errors.add(&cx, 1, labels); From 4285a9fe75035067c9906cd128153f8bd1d3e172 Mon Sep 17 00:00:00 2001 From: Techno Freak Date: Sat, 10 Feb 2024 21:27:42 +0300 Subject: [PATCH 3/8] chore: add docs --- guide/src/documentation/telemetry/operators.md | 1 + 1 file changed, 1 insertion(+) diff --git a/guide/src/documentation/telemetry/operators.md b/guide/src/documentation/telemetry/operators.md index 2496b6e91a..dd0d137d2e 100644 --- a/guide/src/documentation/telemetry/operators.md +++ b/guide/src/documentation/telemetry/operators.md @@ -142,6 +142,7 @@ If this metric is increasing, it signals that the packet queue is increasing and | `cleared_send_packet_count_total`  | Number of SendPacket events received during the initial and periodic clearing, per chain, counterparty chain, channel and port | `u64` Counter | Packet workers enabled, and periodic packet clearing or clear on start enabled | | `cleared_acknowledgment_count_total` | Number of WriteAcknowledgement events received during the initial and periodic clearing, per chain, counterparty chain, channel and port | `u64` Counter | Packet workers enabled, and periodic packet clearing or clear on start enabled | | `broadcast_errors_total` | Number of errors observed by Hermes when broadcasting a Tx, per error type and account | `u64` Counter | Packet workers enabled | +| `simulate_errors_total` | Number of errors observed by Hermes when simulating a Tx, per error type, account and whether the error is recoverable or not | `u64` Counter | Packet workers enabled | | `filtered_packets` | Number of ICS-20 packets filtered because the memo and/or the receiver fields were exceeding the configured limits | `u64` Counter | Packet workers enabled, and `ics20_max_memo_size` and/or `ics20_max_receiver_size` enabled | Notes: From ad8929a314438f70ef01c2de1ea391639d8b9aa0 Mon Sep 17 00:00:00 2001 From: Techno Freak Date: Sat, 10 Feb 2024 21:30:58 +0300 Subject: [PATCH 4/8] chore: add unclog entry --- .../unreleased/features/3845-add-simulate-errors-metric.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .changelog/unreleased/features/3845-add-simulate-errors-metric.md diff --git a/.changelog/unreleased/features/3845-add-simulate-errors-metric.md b/.changelog/unreleased/features/3845-add-simulate-errors-metric.md new file mode 100644 index 0000000000..8bea10c844 --- /dev/null +++ b/.changelog/unreleased/features/3845-add-simulate-errors-metric.md @@ -0,0 +1,2 @@ +- Added simulate errors metric for Prometheus + ([\#3845](https://github.com/informalsystems/hermes/issues/3845)) \ No newline at end of file From dcba98d49b2cdce1eabb2e35bef307660f5a26b7 Mon Sep 17 00:00:00 2001 From: Techno Freak Date: Sat, 10 Feb 2024 21:45:43 +0300 Subject: [PATCH 5/8] chore: cargo fmt --- crates/relayer/src/chain/cosmos/estimate.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/relayer/src/chain/cosmos/estimate.rs b/crates/relayer/src/chain/cosmos/estimate.rs index a8df073511..79772e92c8 100644 --- a/crates/relayer/src/chain/cosmos/estimate.rs +++ b/crates/relayer/src/chain/cosmos/estimate.rs @@ -14,8 +14,8 @@ use crate::chain::cosmos::types::gas::GasConfig; use crate::config::types::Memo; use crate::error::Error; use crate::keyring::Secp256k1KeyPair; -use crate::util::pretty::PrettyFee; use crate::telemetry; +use crate::util::pretty::PrettyFee; pub async fn estimate_tx_fees( config: &TxConfig, @@ -195,7 +195,6 @@ fn can_recover_from_simulation_failure(e: &Error) -> bool { } } - fn get_error_text(e: &Error) -> String { use crate::error::ErrorDetail::*; From f45faaf6605d7124419d2998ad1a6371b8ff95d4 Mon Sep 17 00:00:00 2001 From: Sergey <83376337+freak12techno@users.noreply.github.com> Date: Wed, 21 Feb 2024 15:48:29 +0300 Subject: [PATCH 6/8] Update .changelog/unreleased/features/3845-add-simulate-errors-metric.md Co-authored-by: Luca Joss <43531661+ljoss17@users.noreply.github.com> Signed-off-by: Sergey <83376337+freak12techno@users.noreply.github.com> --- .../unreleased/features/3845-add-simulate-errors-metric.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.changelog/unreleased/features/3845-add-simulate-errors-metric.md b/.changelog/unreleased/features/3845-add-simulate-errors-metric.md index 8bea10c844..898cceca1d 100644 --- a/.changelog/unreleased/features/3845-add-simulate-errors-metric.md +++ b/.changelog/unreleased/features/3845-add-simulate-errors-metric.md @@ -1,2 +1,2 @@ -- Added simulate errors metric for Prometheus +- Added a new metric `simulate_errors` for Prometheus ([\#3845](https://github.com/informalsystems/hermes/issues/3845)) \ No newline at end of file From 8b916de2d8cd955b94ef305b582c0c429151add9 Mon Sep 17 00:00:00 2001 From: Sergey Date: Wed, 21 Feb 2024 15:49:53 +0300 Subject: [PATCH 7/8] chore: renamed unreleased file --- .../{ => ibc-telemetry}/3845-add-simulate-errors-metric.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .changelog/unreleased/features/{ => ibc-telemetry}/3845-add-simulate-errors-metric.md (100%) diff --git a/.changelog/unreleased/features/3845-add-simulate-errors-metric.md b/.changelog/unreleased/features/ibc-telemetry/3845-add-simulate-errors-metric.md similarity index 100% rename from .changelog/unreleased/features/3845-add-simulate-errors-metric.md rename to .changelog/unreleased/features/ibc-telemetry/3845-add-simulate-errors-metric.md From f9de23d9178483a5d3d8b03d1514c5d4f05fdf72 Mon Sep 17 00:00:00 2001 From: Romain Ruetschi Date: Wed, 21 Feb 2024 14:45:06 +0100 Subject: [PATCH 8/8] Update changelog entry --- .../3845-add-simulate-errors-metric.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.changelog/unreleased/features/ibc-telemetry/3845-add-simulate-errors-metric.md b/.changelog/unreleased/features/ibc-telemetry/3845-add-simulate-errors-metric.md index 898cceca1d..7e83a36b32 100644 --- a/.changelog/unreleased/features/ibc-telemetry/3845-add-simulate-errors-metric.md +++ b/.changelog/unreleased/features/ibc-telemetry/3845-add-simulate-errors-metric.md @@ -1,2 +1,11 @@ -- Added a new metric `simulate_errors` for Prometheus - ([\#3845](https://github.com/informalsystems/hermes/issues/3845)) \ No newline at end of file +- Added a new Prometheus metric `simulate_errors` for tracking when a transaction simulation fails, with the following labels: + * `recoverable` (can the execution continue if this happened?) + * `account` (account from which the tx was sent) + * `error_description` (description of the error) + ([\#3845](https://github.com/informalsystems/hermes/issues/3845)) + + ``` + # HELP simulate_errors_total Number of errors observed by Hermes when simulating a Tx + # TYPE simulate_errors_total counter + simulate_errors_total{account="osmo17ndx5qfku28ymxgmq6zq4a6d02dvpfjjul0hyh",error_description="Unknown error",recoverable="false",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 4 + ```