From f831d96c74f46f1d265b9bd5e4960af5466fb180 Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Tue, 25 Jun 2024 17:20:16 +0200 Subject: [PATCH 01/24] feat: dynamic_route_provider --- ic-agent/Cargo.toml | 8 + .../dynamic_routing/dynamic_route_provider.rs | 596 ++++++++++++++++++ .../dynamic_routing/health_check.rs | 265 ++++++++ .../dynamic_routing/messages.rs | 11 + .../http_transport/dynamic_routing/mod.rs | 9 + .../http_transport/dynamic_routing/node.rs | 43 ++ .../dynamic_routing/nodes_fetch.rs | 143 +++++ .../snapshot/latency_based_routing.rs | 365 +++++++++++ .../dynamic_routing/snapshot/mod.rs | 3 + .../snapshot/round_robin_routing.rs | 237 +++++++ .../snapshot/routing_snapshot.rs | 10 + .../dynamic_routing/test_utils.rs | 121 ++++ .../dynamic_routing/type_aliases.rs | 12 + ic-agent/src/agent/http_transport/mod.rs | 1 + 14 files changed, 1824 insertions(+) create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/messages.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/mod.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/node.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs diff --git a/ic-agent/Cargo.toml b/ic-agent/Cargo.toml index 1dcacdc7..5864f104 100644 --- a/ic-agent/Cargo.toml +++ b/ic-agent/Cargo.toml @@ -44,6 +44,14 @@ simple_asn1 = "0.6.1" thiserror = { workspace = true } time = { workspace = true } url = "2.1.0" +tokio = {version = "1.24.2", features = ["full"]} +async-trait = "0.1.80" +tracing = "0.1.40" +arc-swap = "1.7.1" +anyhow = "1.0.86" +simple_moving_average = "1.0.2" +tracing-subscriber = "0.3.18" +tokio-util = { version = "0.7.11", features = ["full"] } [dependencies.hyper] version = "1.0.1" diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs new file mode 100644 index 00000000..261a37e0 --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs @@ -0,0 +1,596 @@ +//! An implementation of the [`RouteProvider`](crate::agent::http_transport::route_provider::RouteProvider) for dynamic generation of routing urls. + +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; + +use anyhow::anyhow; +use arc_swap::ArcSwap; +use candid::Principal; +use reqwest::Client; +use tokio::{ + sync::{mpsc, watch}, + time::timeout, +}; +use tokio_util::{sync::CancellationToken, task::TaskTracker}; +use tracing::{error, info, warn}; +use url::Url; + +use crate::{ + agent::http_transport::{ + dynamic_routing::{ + health_check::{HealthCheck, HealthChecker, HealthManagerActor}, + messages::FetchedNodes, + node::Node, + nodes_fetch::{Fetch, NodesFetchActor, NodesFetcher}, + snapshot::routing_snapshot::RoutingSnapshot, + type_aliases::GlobalShared, + }, + route_provider::RouteProvider, + }, + AgentError, +}; + +pub const IC0_SEED_DOMAIN: &str = "ic0.app"; + +const MAINNET_ROOT_SUBNET_ID: &str = + "tdb26-jop6k-aogll-7ltgs-eruif-6kk7m-qpktf-gdiqx-mxtrf-vb5e6-eqe"; + +const FETCH_PERIOD: Duration = Duration::from_secs(5); +const FETCH_RETRY_INTERVAL: Duration = Duration::from_millis(250); +const TIMEOUT_AWAIT_HEALTHY_SEED: Duration = Duration::from_millis(1000); +const HEALTH_CHECK_TIMEOUT: Duration = Duration::from_secs(2); +const HEALTH_CHECK_PERIOD: Duration = Duration::from_secs(1); + +const DYNAMIC_ROUTE_PROVIDER: &str = "DynamicRouteProvider"; + +#[derive(Debug)] +pub struct DynamicRouteProvider { + fetcher: Arc, + fetch_period: Duration, + fetch_retry_interval: Duration, + checker: Arc, + check_period: Duration, + snapshot: GlobalShared, + tracker: TaskTracker, + seeds: Vec, + token: CancellationToken, +} + +impl RouteProvider for DynamicRouteProvider +where + S: RoutingSnapshot + 'static, +{ + fn route(&self) -> Result { + let snapshot = self.snapshot.load(); + let node = snapshot.next().ok_or_else(|| { + AgentError::RouteProviderError("No healthy API nodes found.".to_string()) + })?; + Ok(node.to_routing_url()) + } +} + +impl DynamicRouteProvider +where + S: RoutingSnapshot + 'static, +{ + pub fn new(snapshot: S, seeds: Vec, http_client: Client) -> Self { + let fetcher = Arc::new(NodesFetcher::new( + http_client.clone(), + Principal::from_text(MAINNET_ROOT_SUBNET_ID).unwrap(), + )); + let checker = Arc::new(HealthChecker::new(http_client, HEALTH_CHECK_TIMEOUT)); + Self { + fetcher, + fetch_period: FETCH_PERIOD, + fetch_retry_interval: FETCH_RETRY_INTERVAL, + checker, + check_period: HEALTH_CHECK_PERIOD, + seeds, + snapshot: Arc::new(ArcSwap::from_pointee(snapshot)), + tracker: TaskTracker::new(), + token: CancellationToken::new(), + } + } + + pub fn with_fetcher(mut self, fetcher: Arc) -> Self { + self.fetcher = fetcher; + self + } + + pub fn with_fetch_period(mut self, period: Duration) -> Self { + self.fetch_period = period; + self + } + + pub fn with_checker(mut self, checker: Arc) -> Self { + self.checker = checker; + self + } + + pub fn with_check_period(mut self, period: Duration) -> Self { + self.check_period = period; + self + } + + /// Starts two background tasks: + /// - Task1: NodesFetchActor + /// - Periodically fetches existing API nodes (gets latest nodes topology) and sends discovered nodes to HealthManagerActor. + /// - Task2: HealthManagerActor: + /// - Listens to the fetched nodes messages from the NodesFetchActor. + /// - Starts/stops health check tasks (HealthCheckActors) based on the newly added/removed nodes. + /// - These spawned health check tasks periodically update the snapshot with the latest node health info. + pub async fn run(&self) -> anyhow::Result<()> { + info!("{DYNAMIC_ROUTE_PROVIDER}: start run() "); + // Communication channel between NodesFetchActor and HealthManagerActor. + let (fetch_sender, fetch_receiver) = watch::channel(None); + + // Communication channel with HealthManagerActor to receive info about healthy seeds. + let (init_sender, mut init_receiver) = mpsc::channel(1); + + // Start the receiving part first. + let health_manager_actor = HealthManagerActor::new( + Arc::clone(&self.checker), + self.check_period, + Arc::clone(&self.snapshot), + fetch_receiver, + init_sender, + self.token.clone(), + ); + self.tracker + .spawn(async move { health_manager_actor.run().await }); + + // Dispatch all seed nodes for initial health checks + let start = Instant::now(); + if let Err(err) = fetch_sender.send(Some(FetchedNodes { + nodes: self.seeds.clone(), + })) { + error!("{DYNAMIC_ROUTE_PROVIDER}: failed to send results to HealthManager: {err:?}"); + } + + // Try await healthy seeds. + let found_healthy_seeds = + match timeout(TIMEOUT_AWAIT_HEALTHY_SEED, init_receiver.recv()).await { + Ok(_) => { + info!( + "{DYNAMIC_ROUTE_PROVIDER}: found healthy seeds within {:?}", + start.elapsed() + ); + true + } + Err(_) => { + warn!( + "{DYNAMIC_ROUTE_PROVIDER}: no healthy seeds found within {:?}", + start.elapsed() + ); + false + } + }; + init_receiver.close(); + + let fetch_actor = NodesFetchActor::new( + Arc::clone(&self.fetcher), + self.fetch_period, + self.fetch_retry_interval, + fetch_sender, + Arc::clone(&self.snapshot), + self.token.clone(), + ); + self.tracker.spawn(async move { fetch_actor.run().await }); + info!( + "{DYNAMIC_ROUTE_PROVIDER}: NodesFetchActor and HealthManagerActor started successfully" + ); + + (found_healthy_seeds) + .then(|| ()) + .ok_or_else(|| anyhow!("No healthy seeds found")) + } + + // Kill all running tasks. + pub async fn stop(&self) { + self.token.cancel(); + self.tracker.close(); + self.tracker.wait().await; + warn!("{DYNAMIC_ROUTE_PROVIDER}: gracefully stopped"); + } +} + +#[cfg(test)] +mod tests { + use reqwest::Client; + use std::{ + sync::{Arc, Once}, + time::Duration, + }; + use tracing::Level; + use tracing_subscriber::FmtSubscriber; + + use crate::{ + agent::http_transport::{ + dynamic_routing::{ + dynamic_route_provider::{DynamicRouteProvider, IC0_SEED_DOMAIN}, + node::Node, + snapshot::round_robin_routing::RoundRobinRoutingSnapshot, + test_utils::{ + assert_routed_domains, route_n_times, NodeHealthCheckerMock, NodesFetcherMock, + }, + }, + route_provider::RouteProvider, + }, + AgentError, + }; + + static TRACING_INIT: Once = Once::new(); + + pub fn setup_tracing() { + TRACING_INIT.call_once(|| { + FmtSubscriber::builder().with_max_level(Level::TRACE).init(); + }); + } + + #[tokio::test] + async fn test_routing_with_topology_and_node_health_updates() { + // Setup. + setup_tracing(); + let node_1 = Node::new(IC0_SEED_DOMAIN).unwrap(); + // Set nodes fetching params: topology, fetching periodicity. + let fetcher = Arc::new(NodesFetcherMock::new()); + fetcher.overwrite_nodes(vec![node_1.clone()]); + let fetch_interval = Duration::from_secs(2); + // Set health checking params: healthy nodes, checking periodicity. + let checker = Arc::new(NodeHealthCheckerMock::new()); + let check_interval = Duration::from_secs(1); + // A single healthy node exists in the topology. This node happens to be the seed node. + fetcher.overwrite_nodes(vec![node_1.clone()]); + checker.overwrite_healthy_nodes(vec![node_1.clone()]); + // Configure RouteProvider + let snapshot = RoundRobinRoutingSnapshot::new(); + let client = Client::builder().build().unwrap(); + let route_provider = Arc::new( + DynamicRouteProvider::new(snapshot, vec![node_1.clone()], client) + .with_fetcher(fetcher.clone()) + .with_checker(checker.clone()) + .with_fetch_period(fetch_interval) + .with_check_period(check_interval), + ); + route_provider.run().await.expect("no healthy seeds found"); + + // This time span is required for the snapshot to be fully updated with the new nodes and their health info. + let snapshot_update_duration = fetch_interval + 2 * check_interval; + + // Test 1: multiple route() calls return a single domain=ic0.app. + // Only a single node exists, which is initially healthy. + tokio::time::sleep(snapshot_update_duration).await; + let routed_domains = route_n_times(6, Arc::clone(&route_provider)); + assert_routed_domains(routed_domains, vec![node_1.domain.clone()], 6); + + // Test 2: multiple route() calls return 3 different domains with equal fairness (repetition). + // Two healthy nodes are added to the topology. + let node_2 = Node::new("api1.com").unwrap(); + let node_3 = Node::new("api2.com").unwrap(); + checker.overwrite_healthy_nodes(vec![node_1.clone(), node_2.clone(), node_3.clone()]); + fetcher.overwrite_nodes(vec![node_1.clone(), node_2.clone(), node_3.clone()]); + tokio::time::sleep(snapshot_update_duration).await; + let routed_domains = route_n_times(6, Arc::clone(&route_provider)); + assert_routed_domains( + routed_domains, + vec![ + node_1.clone().domain, + node_2.clone().domain, + node_3.clone().domain, + ], + 2, + ); + + // Test 3: multiple route() calls return 2 different domains with equal fairness (repetition). + // One node is set to unhealthy. + checker.overwrite_healthy_nodes(vec![node_1.clone(), node_3.clone()]); + tokio::time::sleep(snapshot_update_duration).await; + let routed_domains = route_n_times(6, Arc::clone(&route_provider)); + assert_routed_domains( + routed_domains, + vec![node_1.clone().domain, node_3.clone().domain], + 3, + ); + + // Test 4: multiple route() calls return 3 different domains with equal fairness (repetition). + // Unhealthy node is set back to healthy. + checker.overwrite_healthy_nodes(vec![node_1.clone(), node_2.clone(), node_3.clone()]); + tokio::time::sleep(snapshot_update_duration).await; + let routed_domains = route_n_times(6, Arc::clone(&route_provider)); + assert_routed_domains( + routed_domains, + vec![ + node_1.clone().domain, + node_2.clone().domain, + node_3.clone().domain, + ], + 2, + ); + + // Test 5: multiple route() calls return 3 different domains with equal fairness (repetition). + // One healthy node is added, but another one goes unhealthy. + let node_4 = Node::new("api3.com").unwrap(); + checker.overwrite_healthy_nodes(vec![node_2.clone(), node_3.clone(), node_4.clone()]); + fetcher.overwrite_nodes(vec![ + node_1.clone(), + node_2.clone(), + node_3.clone(), + node_4.clone(), + ]); + tokio::time::sleep(snapshot_update_duration).await; + let routed_domains = route_n_times(6, Arc::clone(&route_provider)); + assert_routed_domains( + routed_domains, + vec![ + node_2.clone().domain, + node_3.clone().domain, + node_4.clone().domain, + ], + 2, + ); + + // Test 6: multiple route() calls return a single domain=api1.com. + // One node is set to unhealthy and one is removed from the topology. + checker.overwrite_healthy_nodes(vec![node_2.clone(), node_3.clone()]); + fetcher.overwrite_nodes(vec![node_1.clone(), node_2.clone(), node_4.clone()]); + tokio::time::sleep(snapshot_update_duration).await; + let routed_domains = route_n_times(3, Arc::clone(&route_provider)); + assert_routed_domains(routed_domains, vec![node_2.clone().domain], 3); + + // Teardown. + route_provider.stop().await; + } + + #[tokio::test] + async fn test_route_with_initially_unhealthy_seeds_becoming_healthy() { + // Setup. + setup_tracing(); + let node_1 = Node::new(IC0_SEED_DOMAIN).unwrap(); + let node_2 = Node::new("api1.com").unwrap(); + // Set nodes fetching params: topology, fetching periodicity. + let fetcher = Arc::new(NodesFetcherMock::new()); + let fetch_interval = Duration::from_secs(2); + // Set health checking params: healthy nodes, checking periodicity. + let checker = Arc::new(NodeHealthCheckerMock::new()); + let check_interval = Duration::from_secs(1); + // Two nodes exist, which are initially unhealthy. + fetcher.overwrite_nodes(vec![node_1.clone(), node_2.clone()]); + checker.overwrite_healthy_nodes(vec![]); + // Configure RouteProvider + let snapshot = RoundRobinRoutingSnapshot::new(); + let client = Client::builder().build().unwrap(); + let route_provider = Arc::new( + DynamicRouteProvider::new(snapshot, vec![node_1.clone(), node_2.clone()], client) + .with_fetcher(fetcher.clone()) + .with_checker(checker.clone()) + .with_fetch_period(fetch_interval) + .with_check_period(check_interval), + ); + assert!(route_provider + .run() + .await + .unwrap_err() + .to_string() + .contains("No healthy seeds found")); + + // Test 1: calls to route() return an error, as no healthy seeds exist. + for _ in 0..4 { + tokio::time::sleep(check_interval).await; + let result = route_provider.route(); + assert_eq!( + result.unwrap_err(), + AgentError::RouteProviderError("No healthy API nodes found.".to_string()) + ); + } + + // Test 2: calls to route() return both seeds, as they become healthy. + checker.overwrite_healthy_nodes(vec![node_1.clone(), node_2.clone()]); + tokio::time::sleep(3 * check_interval).await; + let routed_domains = route_n_times(6, Arc::clone(&route_provider)); + assert_routed_domains( + routed_domains, + vec![node_1.domain.clone(), node_2.domain.clone()], + 3, + ); + + // Teardown. + route_provider.stop().await; + } + + #[tokio::test] + async fn test_routing_with_no_healthy_nodes_returns_an_error() { + // Setup. + setup_tracing(); + let node_1 = Node::new(IC0_SEED_DOMAIN).unwrap(); + // Set nodes fetching params: topology, fetching periodicity. + let fetcher = Arc::new(NodesFetcherMock::new()); + let fetch_interval = Duration::from_secs(2); + // Set health checking params: healthy nodes, checking periodicity. + let checker = Arc::new(NodeHealthCheckerMock::new()); + let check_interval = Duration::from_secs(1); + // A single seed node which is initially healthy. + fetcher.overwrite_nodes(vec![node_1.clone()]); + checker.overwrite_healthy_nodes(vec![node_1.clone()]); + // Configure RouteProvider + let snapshot = RoundRobinRoutingSnapshot::new(); + let client = Client::builder().build().unwrap(); + let route_provider = Arc::new( + DynamicRouteProvider::new(snapshot, vec![node_1.clone()], client) + .with_fetcher(fetcher.clone()) + .with_checker(checker.clone()) + .with_fetch_period(fetch_interval) + .with_check_period(check_interval), + ); + route_provider.run().await.expect("no healthy seeds found"); + + // Test 1: multiple route() calls return a single domain=ic0.app, as the seed is healthy. + tokio::time::sleep(2 * check_interval).await; + let routed_domains = route_n_times(3, Arc::clone(&route_provider)); + assert_routed_domains(routed_domains, vec![node_1.clone().domain], 3); + + // Test 2: calls to route() return an error, as no healthy nodes exist. + checker.overwrite_healthy_nodes(vec![]); + tokio::time::sleep(2 * check_interval).await; + for _ in 0..4 { + let result = route_provider.route(); + assert_eq!( + result.unwrap_err(), + AgentError::RouteProviderError("No healthy API nodes found.".to_string()) + ); + } + + // Teardown. + route_provider.stop().await; + } + + #[tokio::test] + async fn test_route_with_no_healthy_seeds_errors() { + // Setup. + setup_tracing(); + let node_1 = Node::new(IC0_SEED_DOMAIN).unwrap(); + // Set nodes fetching params: topology, fetching periodicity. + let fetcher = Arc::new(NodesFetcherMock::new()); + let fetch_interval = Duration::from_secs(2); + // Set health checking params: healthy nodes, checking periodicity. + let checker = Arc::new(NodeHealthCheckerMock::new()); + let check_interval = Duration::from_secs(1); + // No healthy seed nodes present, this should lead to errors. + fetcher.overwrite_nodes(vec![]); + checker.overwrite_healthy_nodes(vec![]); + // Configure RouteProvider + let snapshot = RoundRobinRoutingSnapshot::new(); + let client = Client::builder().build().unwrap(); + let route_provider = Arc::new( + DynamicRouteProvider::new(snapshot, vec![node_1.clone()], client) + .with_fetcher(fetcher.clone()) + .with_checker(checker.clone()) + .with_fetch_period(fetch_interval) + .with_check_period(check_interval), + ); + assert!(route_provider + .run() + .await + .unwrap_err() + .to_string() + .contains("No healthy seeds found")); + + // Test: calls to route() return an error, as no healthy seeds exist. + for _ in 0..4 { + tokio::time::sleep(check_interval).await; + let result = route_provider.route(); + assert_eq!( + result.unwrap_err(), + AgentError::RouteProviderError("No healthy API nodes found.".to_string()) + ); + } + + // Teardown. + route_provider.stop().await; + } + + #[tokio::test] + async fn test_route_with_one_healthy_and_one_unhealthy_seed() { + // Setup. + setup_tracing(); + let node_1 = Node::new(IC0_SEED_DOMAIN).unwrap(); + let node_2 = Node::new("api1.com").unwrap(); + // Set nodes fetching params: topology, fetching periodicity. + let fetcher = Arc::new(NodesFetcherMock::new()); + let fetch_interval = Duration::from_secs(2); + // Set health checking params: healthy nodes, checking periodicity. + let checker = Arc::new(NodeHealthCheckerMock::new()); + let check_interval = Duration::from_secs(1); + // One healthy seed is present, it should be discovered during the initialization time. + fetcher.overwrite_nodes(vec![node_1.clone(), node_2.clone()]); + checker.overwrite_healthy_nodes(vec![node_1.clone()]); + // Configure RouteProvider + let snapshot = RoundRobinRoutingSnapshot::new(); + let client = Client::builder().build().unwrap(); + let route_provider = Arc::new( + DynamicRouteProvider::new(snapshot, vec![node_1.clone(), node_2.clone()], client) + .with_fetcher(fetcher.clone()) + .with_checker(checker.clone()) + .with_fetch_period(fetch_interval) + .with_check_period(check_interval), + ); + route_provider.run().await.expect("no healthy seeds found"); + + // Test 1: calls to route() return only a healthy seed ic0.app. + let routed_domains = route_n_times(3, Arc::clone(&route_provider)); + assert_routed_domains(routed_domains, vec![node_1.clone().domain], 3); + + // Test 2: calls to route() return two healthy seeds, as the unhealthy seed becomes healthy. + checker.overwrite_healthy_nodes(vec![node_1.clone(), node_2.clone()]); + tokio::time::sleep(2 * check_interval).await; + let routed_domains = route_n_times(6, Arc::clone(&route_provider)); + assert_routed_domains( + routed_domains, + vec![node_1.clone().domain, node_2.clone().domain], + 3, + ); + + // Teardown. + route_provider.stop().await; + } + + #[tokio::test] + async fn test_routing_with_an_empty_fetched_list_of_api_nodes() { + // Check resiliency to an empty list of fetched API nodes (this should never happen in normal IC operation). + // Setup. + setup_tracing(); + let node_1 = Node::new(IC0_SEED_DOMAIN).unwrap(); + let node_2 = Node::new("api1.com").unwrap(); + // Set nodes fetching params: topology, fetching periodicity. + let fetcher = Arc::new(NodesFetcherMock::new()); + let fetch_interval = Duration::from_secs(2); + // Set health checking params: healthy nodes, checking periodicity. + let checker = Arc::new(NodeHealthCheckerMock::new()); + let check_interval = Duration::from_secs(1); + // One healthy seed is initially present, but the topology has no node. + fetcher.overwrite_nodes(vec![]); + checker.overwrite_healthy_nodes(vec![node_1.clone()]); + // Configure RouteProvider + let snapshot = RoundRobinRoutingSnapshot::new(); + let client = Client::builder().build().unwrap(); + let route_provider = Arc::new( + DynamicRouteProvider::new(snapshot, vec![node_1.clone()], client) + .with_fetcher(fetcher.clone()) + .with_checker(checker.clone()) + .with_fetch_period(fetch_interval) + .with_check_period(check_interval), + ); + route_provider.run().await.expect("no healthy seeds found"); + + // This time span is required for the snapshot to be fully updated with the new nodes topology and health info. + let snapshot_update_duration = fetch_interval + 2 * check_interval; + + // Test 1: multiple route() calls return a single domain=ic0.app. + // HealthManagerActor shouldn't update the snapshot, if the list of fetched nodes is empty, thus we observe the healthy seed. + tokio::time::sleep(snapshot_update_duration).await; + let routed_domains = route_n_times(3, Arc::clone(&route_provider)); + assert_routed_domains(routed_domains, vec![node_1.clone().domain], 3); + + // Test 2: multiple route() calls should now return 3 different domains with equal fairness (repetition). + // Three nodes are added to the topology, i.e. now the fetched nodes list is non-empty. + let node_2 = Node::new("api1.com").unwrap(); + let node_3 = Node::new("api2.com").unwrap(); + fetcher.overwrite_nodes(vec![node_1.clone(), node_2.clone(), node_3.clone()]); + checker.overwrite_healthy_nodes(vec![node_1.clone(), node_2.clone(), node_3.clone()]); + tokio::time::sleep(snapshot_update_duration).await; + let routed_domains = route_n_times(6, Arc::clone(&route_provider)); + assert_routed_domains( + routed_domains, + vec![ + node_1.clone().domain, + node_2.clone().domain, + node_3.clone().domain, + ], + 2, + ); + + // Teardown. + route_provider.stop().await; + } +} diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs b/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs new file mode 100644 index 00000000..a1565aa7 --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs @@ -0,0 +1,265 @@ +use anyhow::bail; +use async_trait::async_trait; +use http::{Method, StatusCode}; +use reqwest::{Client, Request}; +use std::{ + fmt::Debug, + sync::Arc, + time::{Duration, Instant}, +}; +use tokio::{sync::mpsc, time}; +use tokio_util::{sync::CancellationToken, task::TaskTracker}; +use tracing::{debug, error, info, warn}; +use url::Url; + +use crate::agent::http_transport::dynamic_routing::{ + messages::{FetchedNodes, NodeHealthState}, + node::Node, + snapshot::routing_snapshot::RoutingSnapshot, + type_aliases::{GlobalShared, ReceiverMpsc, ReceiverWatch, SenderMpsc}, +}; + +const CHANNEL_BUFFER: usize = 128; + +#[async_trait] +pub trait HealthCheck: Send + Sync + Debug { + async fn check(&self, node: &Node) -> anyhow::Result; +} + +#[derive(Clone, PartialEq, Debug, Default)] +pub struct HealthCheckStatus { + pub latency: Option, +} + +impl HealthCheckStatus { + pub fn new(latency: Option) -> Self { + Self { latency } + } + + pub fn is_healthy(&self) -> bool { + self.latency.is_some() + } +} + +#[derive(Debug)] +pub struct HealthChecker { + http_client: Client, + timeout: Duration, +} + +impl HealthChecker { + pub fn new(http_client: Client, timeout: Duration) -> Self { + Self { + http_client, + timeout, + } + } +} + +const HEALTH_CHECKER: &str = "HealthChecker"; + +#[async_trait] +impl HealthCheck for HealthChecker { + async fn check(&self, node: &Node) -> anyhow::Result { + // API boundary node exposes /health endpoint and should respond with 204 (No Content) if it's healthy. + let url = Url::parse(&format!("https://{}/health", node.domain))?; + + let mut request = Request::new(Method::GET, url.clone()); + *request.timeout_mut() = Some(self.timeout); + + let start = Instant::now(); + let response = self.http_client.execute(request).await?; + let latency = start.elapsed(); + + if response.status() != StatusCode::NO_CONTENT { + let err_msg = format!( + "{HEALTH_CHECKER}: Unexpected http status code {} for url={url} received", + response.status() + ); + error!(err_msg); + bail!(err_msg); + } + + Ok(HealthCheckStatus::new(Some(latency))) + } +} + +const HEALTH_CHECK_ACTOR: &str = "HealthCheckActor"; + +struct HealthCheckActor { + checker: Arc, + period: Duration, + node: Node, + sender_channel: SenderMpsc, + token: CancellationToken, +} + +impl HealthCheckActor { + pub fn new( + checker: Arc, + period: Duration, + node: Node, + sender_channel: SenderMpsc, + token: CancellationToken, + ) -> Self { + Self { + checker, + period, + node, + sender_channel, + token, + } + } + + pub async fn run(self) { + let mut interval = time::interval(self.period); + loop { + tokio::select! { + _ = interval.tick() => { + let health = self.checker.check(&self.node).await.unwrap_or_default(); + let message = NodeHealthState { + node: self.node.clone(), + health, + }; + // Inform the listener about node's health. It can only fail if the listener was closed/dropped. + self.sender_channel.send(message).await.expect("Failed to send node's health state"); + } + _ = self.token.cancelled() => { + info!("{HEALTH_CHECK_ACTOR}: was gracefully cancelled for node {:?}", self.node); + break; + } + } + } + } +} + +pub const HEALTH_MANAGER_ACTOR: &str = "HealthManagerActor"; + +pub struct HealthManagerActor { + checker: Arc, + period: Duration, + nodes_snapshot: GlobalShared, + fetch_receiver: ReceiverWatch, + check_sender: SenderMpsc, + check_receiver: ReceiverMpsc, + init_sender: SenderMpsc, + token: CancellationToken, + nodes_token: CancellationToken, + nodes_tracker: TaskTracker, + is_initialized: bool, +} + +impl HealthManagerActor +where + S: RoutingSnapshot, +{ + pub fn new( + checker: Arc, + period: Duration, + nodes_snapshot: GlobalShared, + fetch_receiver: ReceiverWatch, + init_sender: SenderMpsc, + token: CancellationToken, + ) -> Self { + let (check_sender, check_receiver) = mpsc::channel(CHANNEL_BUFFER); + + Self { + checker, + period, + nodes_snapshot, + fetch_receiver, + check_sender, + check_receiver, + init_sender, + token, + nodes_token: CancellationToken::new(), + nodes_tracker: TaskTracker::new(), + is_initialized: false, + } + } + + pub async fn run(mut self) { + loop { + tokio::select! { + // Check if a new array of fetched nodes appeared in the channel from NodesFetchService. + result = self.fetch_receiver.changed() => { + if let Err(err) = result { + error!("{HEALTH_MANAGER_ACTOR}: nodes fetch sender has been dropped: {err:?}"); + self.token.cancel(); + continue; + } + // Get the latest value from the channel and mark it as seen. + let Some(FetchedNodes { nodes }) = self.fetch_receiver.borrow_and_update().clone() else { continue }; + self.handle_fetch_update(nodes).await; + } + // Receive health check messages from all running NodeHealthChecker/s. + Some(msg) = self.check_receiver.recv() => { + self.handle_health_update(msg).await; + } + _ = self.token.cancelled() => { + self.stop_all_checks().await; + self.check_receiver.close(); + warn!("{HEALTH_MANAGER_ACTOR}: was gracefully cancelled, all nodes health checks stopped"); + break; + } + } + } + } + + async fn handle_health_update(&mut self, msg: NodeHealthState) { + let current_snapshot = self.nodes_snapshot.load_full(); + let mut new_snapshot = (*current_snapshot).clone(); + if let Err(err) = new_snapshot.update_node(&msg.node, msg.health.clone()) { + error!("{HEALTH_MANAGER_ACTOR}: failed to update snapshot: {err:?}"); + return; + } + self.nodes_snapshot.store(Arc::new(new_snapshot)); + if !self.is_initialized && msg.health.is_healthy() { + self.is_initialized = true; + // If TIMEOUT_AWAIT_HEALTHY_SEED has been exceeded, the receiver was dropped and send would thus fail. We ignore the failure. + let _ = self.init_sender.send(true).await; + } + } + + async fn handle_fetch_update(&mut self, nodes: Vec) { + if nodes.is_empty() { + // This is a bug in the IC registry. There should be at least one API Boundary Node in the registry. + // Updating nodes snapshot with an empty array, would lead to an irrecoverable error, as new nodes couldn't be fetched. + // We avoid such updates and just wait for a non-empty list. + error!("{HEALTH_MANAGER_ACTOR}: list of fetched nodes is empty"); + return; + } + debug!("{HEALTH_MANAGER_ACTOR}: fetched nodes received {:?}", nodes); + let current_snapshot = self.nodes_snapshot.load_full(); + let mut new_snapshot = (*current_snapshot).clone(); + // If the snapshot has changed, store it and restart all node's health checks. + if let Ok(true) = new_snapshot.sync_nodes(&nodes) { + self.nodes_snapshot.store(Arc::new(new_snapshot)); + self.stop_all_checks().await; + self.start_checks(nodes.to_vec()); + } + } + + fn start_checks(&mut self, nodes: Vec) { + // Create a single cancellation token for all started health checks. + self.nodes_token = CancellationToken::new(); + for node in nodes { + debug!("{HEALTH_MANAGER_ACTOR}: starting health check for node {node:?}"); + let actor = HealthCheckActor::new( + Arc::clone(&self.checker), + self.period, + node, + self.check_sender.clone(), + self.nodes_token.clone(), + ); + self.nodes_tracker.spawn(async move { actor.run().await }); + } + } + + async fn stop_all_checks(&self) { + warn!("{HEALTH_MANAGER_ACTOR}: stopping all running health checks"); + self.nodes_token.cancel(); + self.nodes_tracker.close(); + self.nodes_tracker.wait().await; + } +} diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/messages.rs b/ic-agent/src/agent/http_transport/dynamic_routing/messages.rs new file mode 100644 index 00000000..0e3d3935 --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/messages.rs @@ -0,0 +1,11 @@ +use crate::agent::http_transport::dynamic_routing::{health_check::HealthCheckStatus, node::Node}; + +#[derive(Debug, Clone)] +pub struct FetchedNodes { + pub nodes: Vec, +} + +pub struct NodeHealthState { + pub node: Node, + pub health: HealthCheckStatus, +} diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs b/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs new file mode 100644 index 00000000..42dc4e08 --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs @@ -0,0 +1,9 @@ +pub mod dynamic_route_provider; +pub mod health_check; +pub mod messages; +pub mod node; +pub mod nodes_fetch; +pub mod snapshot; +#[cfg(test)] +pub mod test_utils; +pub mod type_aliases; diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/node.rs b/ic-agent/src/agent/http_transport/dynamic_routing/node.rs new file mode 100644 index 00000000..2ba2c989 --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/node.rs @@ -0,0 +1,43 @@ +use url::Url; + +use crate::agent::ApiBoundaryNode; +use anyhow::anyhow; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Node { + pub domain: String, +} + +impl Node { + pub fn new(domain: &str) -> anyhow::Result { + if !is_valid_domain(domain) { + return Err(anyhow!("Invalid domain name {domain}")); + } + Ok(Self { + domain: domain.to_string(), + }) + } +} + +impl Node { + pub fn to_routing_url(&self) -> Url { + Url::parse(&format!("https://{}/api/v2/", self.domain)).expect("failed to parse URL") + } +} + +impl From<&Node> for Url { + fn from(node: &Node) -> Self { + Url::parse(&format!("https://{}", node.domain)).expect("failed to parse URL") + } +} + +impl From<&ApiBoundaryNode> for Node { + fn from(api_bn: &ApiBoundaryNode) -> Self { + Node::new(api_bn.domain.as_str()).unwrap() + } +} + +pub fn is_valid_domain>(domain: S) -> bool { + // TODO + true +} diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs new file mode 100644 index 00000000..3ce8fec4 --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs @@ -0,0 +1,143 @@ +use anyhow::Context; +use async_trait::async_trait; +use candid::Principal; +use reqwest::Client; +use std::{fmt::Debug, sync::Arc, time::Duration}; +use tokio::time::{self, sleep}; +use tokio_util::sync::CancellationToken; +use tracing::{error, warn}; +use url::Url; + +use crate::agent::{ + http_transport::{ + dynamic_routing::{ + health_check::HEALTH_MANAGER_ACTOR, messages::FetchedNodes, node::Node, + snapshot::routing_snapshot::RoutingSnapshot, type_aliases::GlobalShared, + type_aliases::SenderWatch, + }, + reqwest_transport::ReqwestTransport, + }, + Agent, +}; + +const NODES_FETCH_ACTOR: &str = "NodesFetchActor"; + +#[async_trait] +pub trait Fetch: Sync + Send + Debug { + async fn fetch(&self, url: Url) -> anyhow::Result>; +} + +#[derive(Debug)] +pub struct NodesFetcher { + http_client: Client, + subnet_id: Principal, +} + +impl NodesFetcher { + pub fn new(http_client: Client, subnet_id: Principal) -> Self { + Self { + http_client, + subnet_id, + } + } +} + +#[async_trait] +impl Fetch for NodesFetcher { + async fn fetch(&self, url: Url) -> anyhow::Result> { + let transport = ReqwestTransport::create_with_client(url, self.http_client.clone()) + .with_context(|| "Failed to build transport: {err}")?; + let agent = Agent::builder() + .with_transport(transport) + .build() + .with_context(|| "Failed to build an agent: {err}")?; + agent + .fetch_root_key() + .await + .with_context(|| "Failed to fetch root key: {err}")?; + let api_bns = agent + .fetch_api_boundary_nodes_by_subnet_id(self.subnet_id) + .await?; + let nodes: Vec = api_bns.iter().map(|node| node.into()).collect(); + return Ok(nodes); + } +} + +pub struct NodesFetchActor { + fetcher: Arc, + period: Duration, + fetch_retry_interval: Duration, + fetch_sender: SenderWatch, + snapshot: GlobalShared, + token: CancellationToken, +} + +impl NodesFetchActor +where + S: RoutingSnapshot, +{ + pub fn new( + fetcher: Arc, + period: Duration, + retry_interval: Duration, + fetch_sender: SenderWatch, + snapshot: GlobalShared, + token: CancellationToken, + ) -> Self { + Self { + fetcher, + period, + fetch_retry_interval: retry_interval, + fetch_sender, + snapshot, + token, + } + } + + pub async fn run(self) { + let mut interval = time::interval(self.period); + loop { + tokio::select! { + _ = interval.tick() => { + // Retry until success: + // - try to get a healthy node from the snapshot + // - if snapshot is empty, break the cycle and wait for the next fetch cycle + // - using the healthy node, try to fetch new nodes from topology + // - if failure, sleep and retry + // - try send fetched nodes to the listener + // - failure should never happen + loop { + let snapshot = self.snapshot.load(); + if let Some(node) = snapshot.next() { + match self.fetcher.fetch((&node).into()).await { + Ok(nodes) => { + let msg = Some( + FetchedNodes {nodes}); + match self.fetch_sender.send(msg) { + Ok(()) => break, // message sent successfully, exist the loop + Err(err) => { + error!("{NODES_FETCH_ACTOR}: failed to send results to {HEALTH_MANAGER_ACTOR}: {err:?}"); + } + } + }, + Err(err) => { + error!("{NODES_FETCH_ACTOR}: failed to fetch nodes: {err:?}"); + } + }; + } else { + // No healthy nodes in the snapshot, break the cycle and wait for the next fetch cycle + error!("{NODES_FETCH_ACTOR}: no nodes in the snapshot"); + break; + }; + warn!("Retrying to fetch the nodes in {:?}", self.fetch_retry_interval); + sleep(self.fetch_retry_interval).await; + } + } + _ = self.token.cancelled() => { + warn!("{NODES_FETCH_ACTOR}: was gracefully cancelled"); + break; + } + } + } + } +} diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs new file mode 100644 index 00000000..b8328724 --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs @@ -0,0 +1,365 @@ +use std::{collections::HashSet, time::Duration}; + +use rand::Rng; +use simple_moving_average::{SumTreeSMA, SMA}; + +use crate::agent::http_transport::dynamic_routing::{ + health_check::HealthCheckStatus, node::Node, snapshot::routing_snapshot::RoutingSnapshot, +}; + +// Some big value implying that node is unhealthy, should be much bigger than node's latency. +const MAX_LATENCY: Duration = Duration::from_secs(500); + +const WINDOW_SIZE: usize = 15; + +type LatencyMovAvg = SumTreeSMA; + +#[derive(Clone, Debug)] +struct WeightedNode { + node: Node, + latency_mov_avg: LatencyMovAvg, + weight: f64, +} + +#[derive(Default, Debug, Clone)] +pub struct LatencyRoutingSnapshot { + weighted_nodes: Vec, + existing_nodes: HashSet, +} + +impl LatencyRoutingSnapshot { + pub fn new() -> Self { + Self { + weighted_nodes: vec![], + existing_nodes: HashSet::new(), + } + } +} + +// select weight index based on the input number in range [0, 1] +#[inline(always)] +fn weighted_sample(weights: &[f64], number: f64) -> Option { + if number < 0.0 || number > 1.0 { + return None; + } + let sum: f64 = weights.iter().sum(); + let mut weighted_number = number * sum; + for (idx, weight) in weights.iter().enumerate() { + weighted_number -= weight; + if weighted_number <= 0.0 { + return Some(idx); + } + } + None +} + +impl RoutingSnapshot for LatencyRoutingSnapshot { + fn has_nodes(&self) -> bool { + !self.weighted_nodes.is_empty() + } + + fn next(&self) -> Option { + // We select a node based on it's weight, using a stochastic weighted random sampling approach. + let weights = self + .weighted_nodes + .iter() + .map(|n| n.weight) + .collect::>(); + // Generate a random float in the range [0, 1) + let mut rng = rand::thread_rng(); + let rand_num = rng.gen::(); + // Using this random float and an array of weights we get an index of the node. + let idx = weighted_sample(weights.as_slice(), rand_num); + idx.map(|idx| self.weighted_nodes[idx].node.clone()) + } + + fn sync_nodes(&mut self, nodes: &[Node]) -> anyhow::Result { + let new_nodes = HashSet::from_iter(nodes.into_iter().cloned()); + // Find nodes removed from snapshot. + let nodes_removed: Vec<_> = self + .existing_nodes + .difference(&new_nodes) + .cloned() + .collect(); + let has_removed_nodes = !nodes_removed.is_empty(); + // Find nodes added to topology. + let nodes_added: Vec<_> = new_nodes + .difference(&self.existing_nodes) + .cloned() + .collect(); + let has_added_nodes = !nodes_added.is_empty(); + self.existing_nodes.extend(nodes_added.into_iter()); + // NOTE: newly added nodes will appear in the weighted_nodes later. + // This happens after the first node health check round and a consequent update_node() invocation. + for node in nodes_removed.into_iter() { + self.existing_nodes.remove(&node); + let idx = self.weighted_nodes.iter().position(|x| x.node == node); + idx.map(|idx| self.weighted_nodes.swap_remove(idx)); + } + Ok(has_added_nodes || has_removed_nodes) + } + + fn update_node(&mut self, node: &Node, health: HealthCheckStatus) -> anyhow::Result { + if !self.existing_nodes.contains(node) { + return Ok(false); + } + + // If latency is None (meaning Node is unhealthy), we assign some big value + let latency = health.latency.unwrap_or(MAX_LATENCY).as_millis() as f64; + + if let Some(idx) = self.weighted_nodes.iter().position(|x| &x.node == node) { + // Node is already in the array (it is not the first update_node() call). + self.weighted_nodes[idx].latency_mov_avg.add_sample(latency); + let latency_avg = self.weighted_nodes[idx].latency_mov_avg.get_average(); + // As nodes with smaller average latencies are preferred for routing, we use inverted values for weights. + self.weighted_nodes[idx].weight = 1.0 / latency_avg; + } else { + // Node is not yet in array (first update_node() call). + let mut latency_mov_avg = LatencyMovAvg::new(); + latency_mov_avg.add_sample(latency); + let weight = 1.0 / latency_mov_avg.get_average(); + self.weighted_nodes.push(WeightedNode { + latency_mov_avg, + node: node.clone(), + weight, + }) + } + Ok(true) + } +} + +#[cfg(test)] +mod tests { + use std::{collections::HashSet, time::Duration}; + + use simple_moving_average::SMA; + + use crate::agent::http_transport::dynamic_routing::{ + health_check::HealthCheckStatus, + node::Node, + snapshot::{ + latency_based_routing::{ + weighted_sample, LatencyMovAvg, LatencyRoutingSnapshot, WeightedNode, MAX_LATENCY, + }, + routing_snapshot::RoutingSnapshot, + }, + }; + + #[test] + fn test_snapshot_init() { + // Arrange + let snapshot = LatencyRoutingSnapshot::new(); + // Assert + assert!(snapshot.weighted_nodes.is_empty()); + assert!(snapshot.existing_nodes.is_empty()); + assert!(!snapshot.has_nodes()); + assert!(snapshot.next().is_none()); + } + + #[test] + fn test_update_for_non_existing_node_fails() { + // Arrange + let mut snapshot = LatencyRoutingSnapshot::new(); + let node = Node::new("api1.com").unwrap(); + let health = HealthCheckStatus { + latency: Some(Duration::from_secs(1)), + }; + // Act + let is_updated = snapshot + .update_node(&node, health) + .expect("node update failed"); + // Assert + assert!(!is_updated); + assert!(snapshot.weighted_nodes.is_empty()); + assert!(!snapshot.has_nodes()); + assert!(snapshot.next().is_none()); + } + + #[test] + fn test_update_for_existing_node_succeeds() { + // Arrange + let mut snapshot = LatencyRoutingSnapshot::new(); + let node = Node::new("api1.com").unwrap(); + let health = HealthCheckStatus { + latency: Some(Duration::from_secs(1)), + }; + snapshot.existing_nodes.insert(node.clone()); + // Check first update + let is_updated = snapshot + .update_node(&node, health) + .expect("node update failed"); + assert!(is_updated); + assert!(snapshot.has_nodes()); + let weighted_node = snapshot.weighted_nodes.first().unwrap(); + assert_eq!(weighted_node.latency_mov_avg.get_average(), 1000.0); + assert_eq!(weighted_node.weight, 0.001); + assert_eq!(snapshot.next().unwrap(), node); + // Check second update + let health = HealthCheckStatus { + latency: Some(Duration::from_secs(2)), + }; + let is_updated = snapshot + .update_node(&node, health) + .expect("node update failed"); + assert!(is_updated); + let weighted_node = snapshot.weighted_nodes.first().unwrap(); + assert_eq!(weighted_node.latency_mov_avg.get_average(), 1500.0); + assert_eq!(weighted_node.weight, 1.0 / 1500.0); + // Check third update + let health = HealthCheckStatus { + latency: Some(Duration::from_secs(3)), + }; + let is_updated = snapshot + .update_node(&node, health) + .expect("node update failed"); + assert!(is_updated); + let weighted_node = snapshot.weighted_nodes.first().unwrap(); + assert_eq!(weighted_node.latency_mov_avg.get_average(), 2000.0); + assert_eq!(weighted_node.weight, 1.0 / 2000.0); + // Check forth update with none + let health = HealthCheckStatus { latency: None }; + let is_updated = snapshot + .update_node(&node, health) + .expect("node update failed"); + assert!(is_updated); + let weighted_node = snapshot.weighted_nodes.first().unwrap(); + assert_eq!( + weighted_node.latency_mov_avg.get_average(), + (MAX_LATENCY.as_millis() as f64 + 6000.0) / 4.0 + ); + assert_eq!( + weighted_node.weight, + 4.0 / (MAX_LATENCY.as_millis() as f64 + 6000.0) + ); + assert_eq!(snapshot.weighted_nodes.len(), 1); + assert_eq!(snapshot.existing_nodes.len(), 1); + assert_eq!(snapshot.next().unwrap(), node); + } + + #[test] + fn test_sync_node_scenarios() { + // Arrange + let mut snapshot = LatencyRoutingSnapshot::new(); + let node_1 = Node::new("api1.com").unwrap(); + // Sync with node_1 + let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]).unwrap(); + assert!(nodes_changed); + assert!(snapshot.weighted_nodes.is_empty()); + assert_eq!( + snapshot.existing_nodes, + HashSet::from_iter(vec![node_1.clone()]) + ); + // Add node_1 to weighted_nodes manually + snapshot.weighted_nodes.push(WeightedNode { + node: node_1.clone(), + latency_mov_avg: LatencyMovAvg::new(), + weight: 0.0, + }); + // Sync with node_1 again + let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]).unwrap(); + assert!(!nodes_changed); + assert_eq!( + snapshot.existing_nodes, + HashSet::from_iter(vec![node_1.clone()]) + ); + assert_eq!(snapshot.weighted_nodes[0].node, node_1); + // Sync with node_2 + let node_2 = Node::new("api2.com").unwrap(); + let nodes_changed = snapshot.sync_nodes(&[node_2.clone()]).unwrap(); + assert!(nodes_changed); + assert_eq!( + snapshot.existing_nodes, + HashSet::from_iter(vec![node_2.clone()]) + ); + // Make sure node_1 was removed from weighted_nodes too + assert!(snapshot.weighted_nodes.is_empty()); + // Add node_2 to weighted_nodes manually + snapshot.weighted_nodes.push(WeightedNode { + node: node_2.clone(), + latency_mov_avg: LatencyMovAvg::new(), + weight: 0.0, + }); + // Sync with [node_2, node_3] + let node_3 = Node::new("api3.com").unwrap(); + let nodes_changed = snapshot + .sync_nodes(&[node_3.clone(), node_2.clone()]) + .unwrap(); + assert!(nodes_changed); + assert_eq!( + snapshot.existing_nodes, + HashSet::from_iter(vec![node_3.clone(), node_2.clone()]) + ); + assert_eq!(snapshot.weighted_nodes[0].node, node_2); + // Add node_3 to weighted_nodes manually + snapshot.weighted_nodes.push(WeightedNode { + node: node_3.clone(), + latency_mov_avg: LatencyMovAvg::new(), + weight: 0.0, + }); + // Sync with [] + let nodes_changed = snapshot.sync_nodes(&[]).unwrap(); + assert!(nodes_changed); + assert!(snapshot.existing_nodes.is_empty()); + // Make sure all nodes were removed from the healthy_nodes + assert!(snapshot.weighted_nodes.is_empty()); + // Sync with [] again + let nodes_changed = snapshot.sync_nodes(&[]).unwrap(); + assert!(!nodes_changed); + assert!(snapshot.existing_nodes.is_empty()); + } + + #[test] + fn test_weighted_sample() { + // Case 1: empty array + let arr: &[f64] = &[]; + let idx = weighted_sample(arr, 0.5); + assert_eq!(idx, None); + // Case 2: single element in array + let arr: &[f64] = &[1.0]; + let idx = weighted_sample(arr, 0.0); + assert_eq!(idx, Some(0)); + let idx = weighted_sample(arr, 1.0); + assert_eq!(idx, Some(0)); + // check bounds + let idx = weighted_sample(arr, -1.0); + assert_eq!(idx, None); + let idx = weighted_sample(arr, 1.1); + assert_eq!(idx, None); + // Case 3: two elements in array (second element has twice the weight of the first) + let arr: &[f64] = &[1.0, 2.0]; // prefixed_sum = [1.0, 3.0] + let idx = weighted_sample(arr, 0.0); // 0.0 * 3.0 < 1.0 + assert_eq!(idx, Some(0)); + let idx = weighted_sample(arr, 0.33); // 0.33 * 3.0 < 1.0 + assert_eq!(idx, Some(0)); // selection probability ~0.33 + let idx = weighted_sample(arr, 0.35); // 0.35 * 3.0 > 1.0 + assert_eq!(idx, Some(1)); // selection probability ~0.66 + let idx = weighted_sample(arr, 1.0); // 1.0 * 3.0 > 1.0 + assert_eq!(idx, Some(1)); + // check bounds + let idx = weighted_sample(arr, -1.0); + assert_eq!(idx, None); + let idx = weighted_sample(arr, 1.1); + assert_eq!(idx, None); + // Case 4: four elements in array + let arr: &[f64] = &[1.0, 2.0, 1.5, 2.5]; // prefixed_sum = [1.0, 3.0, 4.5, 7.0] + let idx = weighted_sample(arr, 0.14); // 0.14 * 7 < 1.0 + assert_eq!(idx, Some(0)); // probability ~0.14 + let idx = weighted_sample(arr, 0.15); // 0.15 * 7 > 1.0 + assert_eq!(idx, Some(1)); + let idx = weighted_sample(arr, 0.42); // 0.42 * 7 < 3.0 + assert_eq!(idx, Some(1)); // probability ~0.28 + let idx = weighted_sample(arr, 0.43); // 0.43 * 7 > 3.0 + assert_eq!(idx, Some(2)); + let idx = weighted_sample(arr, 0.64); // 0.64 * 7 < 4.5 + assert_eq!(idx, Some(2)); // probability ~0.22 + let idx = weighted_sample(arr, 0.65); // 0.65 * 7 > 4.5 + assert_eq!(idx, Some(3)); + let idx = weighted_sample(arr, 0.99); + assert_eq!(idx, Some(3)); // probability ~0.35 + // check bounds + let idx = weighted_sample(arr, -1.0); + assert_eq!(idx, None); + let idx = weighted_sample(arr, 1.1); + assert_eq!(idx, None); + } +} diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs new file mode 100644 index 00000000..1c63df8b --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs @@ -0,0 +1,3 @@ +pub mod latency_based_routing; +pub mod round_robin_routing; +pub mod routing_snapshot; diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs new file mode 100644 index 00000000..d7dc4995 --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs @@ -0,0 +1,237 @@ +use std::{ + collections::HashSet, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, +}; + +use crate::agent::http_transport::dynamic_routing::{ + health_check::HealthCheckStatus, node::Node, snapshot::routing_snapshot::RoutingSnapshot, +}; + +#[derive(Default, Debug, Clone)] +pub struct RoundRobinRoutingSnapshot { + current_idx: Arc, + healthy_nodes: HashSet, + existing_nodes: HashSet, +} + +impl RoundRobinRoutingSnapshot { + pub fn new() -> Self { + Self { + current_idx: Arc::new(AtomicUsize::new(0)), + healthy_nodes: HashSet::new(), + existing_nodes: HashSet::new(), + } + } +} + +impl RoutingSnapshot for RoundRobinRoutingSnapshot { + fn has_nodes(&self) -> bool { + !self.healthy_nodes.is_empty() + } + + fn next(&self) -> Option { + if self.healthy_nodes.is_empty() { + return None; + } + let prev_idx = self.current_idx.fetch_add(1, Ordering::Relaxed); + self.healthy_nodes + .iter() + .nth(prev_idx % self.healthy_nodes.len()) + .cloned() + } + + fn sync_nodes(&mut self, nodes: &[Node]) -> anyhow::Result { + let new_nodes = HashSet::from_iter(nodes.into_iter().cloned()); + // Find nodes removed from snapshot. + let nodes_removed: Vec<_> = self + .existing_nodes + .difference(&new_nodes) + .cloned() + .collect(); + let has_removed_nodes = !nodes_removed.is_empty(); + // Find nodes added to snapshot. + let nodes_added: Vec<_> = new_nodes + .difference(&self.existing_nodes) + .cloned() + .collect(); + let has_added_nodes = !nodes_added.is_empty(); + // NOTE: newly added nodes will appear in the healthy_nodes later. + // This happens after the first node health check round and a consequent update_node() invocation. + self.existing_nodes.extend(nodes_added); + nodes_removed.iter().for_each(|node| { + self.existing_nodes.remove(node); + self.healthy_nodes.remove(node); + }); + Ok(has_added_nodes || has_removed_nodes) + } + + fn update_node(&mut self, node: &Node, health: HealthCheckStatus) -> anyhow::Result { + if !self.existing_nodes.contains(&node) { + return Ok(false); + } + if health.latency.is_some() { + Ok(self.healthy_nodes.insert(node.clone())) + } else { + Ok(self.healthy_nodes.remove(&node)) + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + use std::{collections::HashSet, sync::atomic::Ordering}; + + use crate::agent::http_transport::dynamic_routing::{ + health_check::HealthCheckStatus, + node::Node, + snapshot::{ + round_robin_routing::RoundRobinRoutingSnapshot, routing_snapshot::RoutingSnapshot, + }, + }; + + #[test] + fn test_snapshot_init() { + // Arrange + let snapshot = RoundRobinRoutingSnapshot::new(); + // Assert + assert!(snapshot.healthy_nodes.is_empty()); + assert!(snapshot.existing_nodes.is_empty()); + assert!(!snapshot.has_nodes()); + assert_eq!(snapshot.current_idx.load(Ordering::SeqCst), 0); + assert!(snapshot.next().is_none()); + } + + #[test] + fn test_update_of_non_existing_node_always_returns_false() { + // Arrange + let mut snapshot = RoundRobinRoutingSnapshot::new(); + // This node is not present in existing_nodes + let node = Node::new("api1.com").unwrap(); + let healthy = HealthCheckStatus { + latency: Some(Duration::from_secs(1)), + }; + let unhealthy = HealthCheckStatus { latency: None }; + // Act 1 + let is_updated = snapshot + .update_node(&node, healthy) + .expect("node update failed"); + // Assert + assert!(!is_updated); + assert!(snapshot.existing_nodes.is_empty()); + assert!(snapshot.next().is_none()); + // Act 2 + let is_updated = snapshot + .update_node(&node, unhealthy) + .expect("node update failed"); + // Assert + assert!(!is_updated); + assert!(snapshot.existing_nodes.is_empty()); + assert!(snapshot.next().is_none()); + } + + #[test] + fn test_update_of_existing_unhealthy_node_with_healthy_node_returns_true() { + // Arrange + let mut snapshot = RoundRobinRoutingSnapshot::new(); + let node = Node::new("api1.com").unwrap(); + // node is present in existing_nodes, but not in healthy_nodes + snapshot.existing_nodes.insert(node.clone()); + let health = HealthCheckStatus { + latency: Some(Duration::from_secs(1)), + }; + // Act + let is_updated = snapshot + .update_node(&node, health) + .expect("node update failed"); + assert!(is_updated); + assert!(snapshot.has_nodes()); + assert_eq!(snapshot.next().unwrap(), node); + assert_eq!(snapshot.current_idx.load(Ordering::SeqCst), 1); + } + + #[test] + fn test_update_of_existing_healthy_node_with_unhealthy_node_returns_true() { + // Arrange + let mut snapshot = RoundRobinRoutingSnapshot::new(); + let node = Node::new("api1.com").unwrap(); + snapshot.existing_nodes.insert(node.clone()); + snapshot.healthy_nodes.insert(node.clone()); + let unhealthy = HealthCheckStatus { latency: None }; + // Act + let is_updated = snapshot + .update_node(&node, unhealthy) + .expect("node update failed"); + assert!(is_updated); + assert!(!snapshot.has_nodes()); + assert!(snapshot.next().is_none()); + } + + #[test] + fn test_sync_node_scenarios() { + // Arrange + let mut snapshot = RoundRobinRoutingSnapshot::new(); + let node_1 = Node::new("api1.com").unwrap(); + // Sync with node_1 + let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]).unwrap(); + assert!(nodes_changed); + assert!(snapshot.healthy_nodes.is_empty()); + assert_eq!( + snapshot.existing_nodes, + HashSet::from_iter(vec![node_1.clone()]) + ); + // Add node_1 to healthy_nodes manually + snapshot.healthy_nodes.insert(node_1.clone()); + // Sync with node_1 again + let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]).unwrap(); + assert!(!nodes_changed); + assert_eq!( + snapshot.existing_nodes, + HashSet::from_iter(vec![node_1.clone()]) + ); + assert_eq!( + snapshot.healthy_nodes, + HashSet::from_iter(vec![node_1.clone()]) + ); + // Sync with node_2 + let node_2 = Node::new("api2.com").unwrap(); + let nodes_changed = snapshot.sync_nodes(&[node_2.clone()]).unwrap(); + assert!(nodes_changed); + assert_eq!( + snapshot.existing_nodes, + HashSet::from_iter(vec![node_2.clone()]) + ); + // Make sure node_1 was removed from healthy nodes + assert!(snapshot.healthy_nodes.is_empty()); + // Add node_2 to healthy_nodes manually + snapshot.healthy_nodes.insert(node_2.clone()); + // Sync with [node_2, node_3] + let node_3 = Node::new("api3.com").unwrap(); + let nodes_changed = snapshot + .sync_nodes(&[node_3.clone(), node_2.clone()]) + .unwrap(); + assert!(nodes_changed); + assert_eq!( + snapshot.existing_nodes, + HashSet::from_iter(vec![node_3.clone(), node_2.clone()]) + ); + assert_eq!( + snapshot.healthy_nodes, + HashSet::from_iter(vec![node_2.clone()]) + ); + snapshot.healthy_nodes.insert(node_3.clone()); + // Sync with [] + let nodes_changed = snapshot.sync_nodes(&[]).unwrap(); + assert!(nodes_changed); + assert!(snapshot.existing_nodes.is_empty()); + // Make sure all nodes were removed from the healthy_nodes + assert!(snapshot.healthy_nodes.is_empty()); + // Sync with [] again + let nodes_changed = snapshot.sync_nodes(&[]).unwrap(); + assert!(!nodes_changed); + assert!(snapshot.existing_nodes.is_empty()); + } +} diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs new file mode 100644 index 00000000..f4f2cc5f --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs @@ -0,0 +1,10 @@ +use std::fmt::Debug; + +use crate::agent::http_transport::dynamic_routing::{health_check::HealthCheckStatus, node::Node}; + +pub trait RoutingSnapshot: Send + Sync + Clone + Debug { + fn has_nodes(&self) -> bool; + fn next(&self) -> Option; + fn sync_nodes(&mut self, nodes: &[Node]) -> anyhow::Result; + fn update_node(&mut self, node: &Node, health: HealthCheckStatus) -> anyhow::Result; +} diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs b/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs new file mode 100644 index 00000000..a1d659c9 --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs @@ -0,0 +1,121 @@ +use std::collections::{HashMap, HashSet}; +use std::time::Duration; +use std::{fmt::Debug, hash::Hash, sync::Arc}; + +use arc_swap::ArcSwap; +use async_trait::async_trait; +use url::Url; + +use crate::agent::http_transport::{ + dynamic_routing::{ + health_check::{HealthCheck, HealthCheckStatus}, + node::Node, + nodes_fetch::Fetch, + type_aliases::GlobalShared, + }, + route_provider::RouteProvider, +}; + +pub fn route_n_times(n: usize, f: Arc) -> Vec { + (0..n) + .map(|_| f.route().unwrap().domain().unwrap().to_string()) + .collect() +} + +pub fn assert_routed_domains(actual: Vec, expected: Vec, expected_repetitions: usize) +where + T: AsRef + Eq + Hash + Debug + Ord, +{ + fn build_count_map(items: &[T]) -> HashMap<&T, usize> + where + T: Eq + Hash, + { + items.iter().fold(HashMap::new(), |mut map, item| { + *map.entry(item).or_insert(0) += 1; + map + }) + } + let count_actual = build_count_map(&actual); + let count_expected = build_count_map(&expected); + + let mut keys_actual = count_actual.keys().collect::>(); + keys_actual.sort(); + let mut keys_expected = count_expected.keys().collect::>(); + keys_expected.sort(); + // Assert all routed domains are present. + assert_eq!(keys_actual, keys_expected); + + // Assert the expected repetition count of each routed domain. + let actual_repetitions = count_actual.values().collect::>(); + assert!(actual_repetitions + .iter() + .all(|&x| x == &expected_repetitions)); +} + +#[derive(Debug)] +pub struct NodesFetcherMock { + // A mocked set of nodes existing in the topology. + pub nodes: GlobalShared>, +} + +#[async_trait] +impl Fetch for NodesFetcherMock { + async fn fetch(&self, _url: Url) -> anyhow::Result> { + let nodes = (*self.nodes.load_full()).clone(); + Ok(nodes) + } +} + +impl Default for NodesFetcherMock { + fn default() -> Self { + Self::new() + } +} + +impl NodesFetcherMock { + pub fn new() -> Self { + Self { + nodes: Arc::new(ArcSwap::from_pointee(vec![])), + } + } + + pub fn overwrite_nodes(&self, nodes: Vec) { + self.nodes.store(Arc::new(nodes)); + } +} + +#[derive(Debug)] +pub struct NodeHealthCheckerMock { + pub healthy_nodes: GlobalShared>, +} + +impl Default for NodeHealthCheckerMock { + fn default() -> Self { + Self::new() + } +} + +#[async_trait] +impl HealthCheck for NodeHealthCheckerMock { + async fn check(&self, node: &Node) -> anyhow::Result { + let nodes = self.healthy_nodes.load_full(); + let latency = match nodes.contains(node) { + true => Some(Duration::from_secs(1)), + false => None, + }; + Ok(HealthCheckStatus { latency }) + } +} + +impl NodeHealthCheckerMock { + pub fn new() -> Self { + Self { + healthy_nodes: Arc::new(ArcSwap::from_pointee(HashSet::new())), + } + } + + pub fn overwrite_healthy_nodes(&self, healthy_nodes: Vec) { + self.healthy_nodes + .store(Arc::new(HashSet::from_iter(healthy_nodes.into_iter()))); + } +} diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs b/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs new file mode 100644 index 00000000..92f922d4 --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs @@ -0,0 +1,12 @@ +use std::sync::Arc; + +use arc_swap::ArcSwap; +use tokio::sync::{mpsc, watch}; + +pub type SenderWatch = watch::Sender>; +pub type ReceiverWatch = watch::Receiver>; + +pub type SenderMpsc = mpsc::Sender; +pub type ReceiverMpsc = mpsc::Receiver; + +pub type GlobalShared = Arc>; diff --git a/ic-agent/src/agent/http_transport/mod.rs b/ic-agent/src/agent/http_transport/mod.rs index 8f2220d6..f938a90d 100644 --- a/ic-agent/src/agent/http_transport/mod.rs +++ b/ic-agent/src/agent/http_transport/mod.rs @@ -30,4 +30,5 @@ const ICP0_SUB_DOMAIN: &str = ".icp0.io"; const ICP_API_SUB_DOMAIN: &str = ".icp-api.io"; #[allow(dead_code)] const LOCALHOST_SUB_DOMAIN: &str = ".localhost"; +pub mod dynamic_routing; pub mod route_provider; From 556ad7914de39e3294ed9d3afe062acc3cc047c9 Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Wed, 26 Jun 2024 10:45:05 +0200 Subject: [PATCH 02/24] chore: add is_valid_domain() --- .../dynamic_routing/dynamic_route_provider.rs | 52 +++++-------------- .../dynamic_routing/health_check.rs | 2 +- .../http_transport/dynamic_routing/node.rs | 20 ++++--- .../dynamic_routing/nodes_fetch.rs | 6 ++- 4 files changed, 32 insertions(+), 48 deletions(-) diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs index 261a37e0..c4cdebd5 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs @@ -263,7 +263,7 @@ mod tests { // Only a single node exists, which is initially healthy. tokio::time::sleep(snapshot_update_duration).await; let routed_domains = route_n_times(6, Arc::clone(&route_provider)); - assert_routed_domains(routed_domains, vec![node_1.domain.clone()], 6); + assert_routed_domains(routed_domains, vec![node_1.domain()], 6); // Test 2: multiple route() calls return 3 different domains with equal fairness (repetition). // Two healthy nodes are added to the topology. @@ -275,11 +275,7 @@ mod tests { let routed_domains = route_n_times(6, Arc::clone(&route_provider)); assert_routed_domains( routed_domains, - vec![ - node_1.clone().domain, - node_2.clone().domain, - node_3.clone().domain, - ], + vec![node_1.domain(), node_2.domain(), node_3.domain()], 2, ); @@ -288,11 +284,7 @@ mod tests { checker.overwrite_healthy_nodes(vec![node_1.clone(), node_3.clone()]); tokio::time::sleep(snapshot_update_duration).await; let routed_domains = route_n_times(6, Arc::clone(&route_provider)); - assert_routed_domains( - routed_domains, - vec![node_1.clone().domain, node_3.clone().domain], - 3, - ); + assert_routed_domains(routed_domains, vec![node_1.domain(), node_3.domain()], 3); // Test 4: multiple route() calls return 3 different domains with equal fairness (repetition). // Unhealthy node is set back to healthy. @@ -301,11 +293,7 @@ mod tests { let routed_domains = route_n_times(6, Arc::clone(&route_provider)); assert_routed_domains( routed_domains, - vec![ - node_1.clone().domain, - node_2.clone().domain, - node_3.clone().domain, - ], + vec![node_1.domain(), node_2.domain(), node_3.domain()], 2, ); @@ -323,11 +311,7 @@ mod tests { let routed_domains = route_n_times(6, Arc::clone(&route_provider)); assert_routed_domains( routed_domains, - vec![ - node_2.clone().domain, - node_3.clone().domain, - node_4.clone().domain, - ], + vec![node_2.domain(), node_3.domain(), node_4.domain()], 2, ); @@ -337,7 +321,7 @@ mod tests { fetcher.overwrite_nodes(vec![node_1.clone(), node_2.clone(), node_4.clone()]); tokio::time::sleep(snapshot_update_duration).await; let routed_domains = route_n_times(3, Arc::clone(&route_provider)); - assert_routed_domains(routed_domains, vec![node_2.clone().domain], 3); + assert_routed_domains(routed_domains, vec![node_2.domain()], 3); // Teardown. route_provider.stop().await; @@ -389,11 +373,7 @@ mod tests { checker.overwrite_healthy_nodes(vec![node_1.clone(), node_2.clone()]); tokio::time::sleep(3 * check_interval).await; let routed_domains = route_n_times(6, Arc::clone(&route_provider)); - assert_routed_domains( - routed_domains, - vec![node_1.domain.clone(), node_2.domain.clone()], - 3, - ); + assert_routed_domains(routed_domains, vec![node_1.domain(), node_2.domain()], 3); // Teardown. route_provider.stop().await; @@ -428,7 +408,7 @@ mod tests { // Test 1: multiple route() calls return a single domain=ic0.app, as the seed is healthy. tokio::time::sleep(2 * check_interval).await; let routed_domains = route_n_times(3, Arc::clone(&route_provider)); - assert_routed_domains(routed_domains, vec![node_1.clone().domain], 3); + assert_routed_domains(routed_domains, vec![node_1.domain()], 3); // Test 2: calls to route() return an error, as no healthy nodes exist. checker.overwrite_healthy_nodes(vec![]); @@ -519,17 +499,13 @@ mod tests { // Test 1: calls to route() return only a healthy seed ic0.app. let routed_domains = route_n_times(3, Arc::clone(&route_provider)); - assert_routed_domains(routed_domains, vec![node_1.clone().domain], 3); + assert_routed_domains(routed_domains, vec![node_1.domain()], 3); // Test 2: calls to route() return two healthy seeds, as the unhealthy seed becomes healthy. checker.overwrite_healthy_nodes(vec![node_1.clone(), node_2.clone()]); tokio::time::sleep(2 * check_interval).await; let routed_domains = route_n_times(6, Arc::clone(&route_provider)); - assert_routed_domains( - routed_domains, - vec![node_1.clone().domain, node_2.clone().domain], - 3, - ); + assert_routed_domains(routed_domains, vec![node_1.domain(), node_2.domain()], 3); // Teardown. route_provider.stop().await; @@ -570,7 +546,7 @@ mod tests { // HealthManagerActor shouldn't update the snapshot, if the list of fetched nodes is empty, thus we observe the healthy seed. tokio::time::sleep(snapshot_update_duration).await; let routed_domains = route_n_times(3, Arc::clone(&route_provider)); - assert_routed_domains(routed_domains, vec![node_1.clone().domain], 3); + assert_routed_domains(routed_domains, vec![node_1.domain()], 3); // Test 2: multiple route() calls should now return 3 different domains with equal fairness (repetition). // Three nodes are added to the topology, i.e. now the fetched nodes list is non-empty. @@ -582,11 +558,7 @@ mod tests { let routed_domains = route_n_times(6, Arc::clone(&route_provider)); assert_routed_domains( routed_domains, - vec![ - node_1.clone().domain, - node_2.clone().domain, - node_3.clone().domain, - ], + vec![node_1.domain(), node_2.domain(), node_3.domain()], 2, ); diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs b/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs index a1565aa7..a9cb7a97 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs @@ -62,7 +62,7 @@ const HEALTH_CHECKER: &str = "HealthChecker"; impl HealthCheck for HealthChecker { async fn check(&self, node: &Node) -> anyhow::Result { // API boundary node exposes /health endpoint and should respond with 204 (No Content) if it's healthy. - let url = Url::parse(&format!("https://{}/health", node.domain))?; + let url = Url::parse(&format!("https://{}/health", node.domain()))?; let mut request = Request::new(Method::GET, url.clone()); *request.timeout_mut() = Some(self.timeout); diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/node.rs b/ic-agent/src/agent/http_transport/dynamic_routing/node.rs index 2ba2c989..1cadc543 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/node.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/node.rs @@ -5,7 +5,7 @@ use anyhow::anyhow; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Node { - pub domain: String, + domain: String, } impl Node { @@ -17,6 +17,10 @@ impl Node { domain: domain.to_string(), }) } + + pub fn domain(&self) -> String { + self.domain.clone() + } } impl Node { @@ -27,17 +31,21 @@ impl Node { impl From<&Node> for Url { fn from(node: &Node) -> Self { + // Parsing can't fail, as the domain was checked at node instantiation. Url::parse(&format!("https://{}", node.domain)).expect("failed to parse URL") } } -impl From<&ApiBoundaryNode> for Node { - fn from(api_bn: &ApiBoundaryNode) -> Self { - Node::new(api_bn.domain.as_str()).unwrap() +impl TryFrom<&ApiBoundaryNode> for Node { + type Error = anyhow::Error; + + fn try_from(value: &ApiBoundaryNode) -> Result { + Node::new(&value.domain) } } pub fn is_valid_domain>(domain: S) -> bool { - // TODO - true + // Prepend scheme to make it a valid URL + let url_string = format!("http://{}", domain.as_ref()); + Url::parse(&url_string).is_ok() } diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs index 3ce8fec4..0c04fc0e 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs @@ -58,7 +58,11 @@ impl Fetch for NodesFetcher { let api_bns = agent .fetch_api_boundary_nodes_by_subnet_id(self.subnet_id) .await?; - let nodes: Vec = api_bns.iter().map(|node| node.into()).collect(); + // If some API BNs have invalid domain names, they are discarded. + let nodes = api_bns + .iter() + .filter_map(|api_node| api_node.try_into().ok()) + .collect(); return Ok(nodes); } } From bfc28fe5f7e2e5fc24936cfa6c354f5efd045d61 Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Wed, 26 Jun 2024 12:51:42 +0200 Subject: [PATCH 03/24] fix: clippy --- ic-agent/Cargo.toml | 17 ++++++++-------- .../dynamic_routing/dynamic_route_provider.rs | 14 +++++++++---- .../dynamic_routing/health_check.rs | 14 +++++++++++++ .../dynamic_routing/messages.rs | 5 +++++ .../http_transport/dynamic_routing/mod.rs | 6 ++++++ .../http_transport/dynamic_routing/node.rs | 5 +++++ .../dynamic_routing/nodes_fetch.rs | 7 +++++++ .../snapshot/latency_based_routing.rs | 9 ++++++--- .../dynamic_routing/snapshot/mod.rs | 3 +++ .../snapshot/round_robin_routing.rs | 20 ++++++++----------- .../snapshot/routing_snapshot.rs | 5 +++++ .../dynamic_routing/type_aliases.rs | 5 +++++ ic-agent/src/agent/http_transport/mod.rs | 3 +++ 13 files changed, 85 insertions(+), 28 deletions(-) diff --git a/ic-agent/Cargo.toml b/ic-agent/Cargo.toml index 5864f104..64fba968 100644 --- a/ic-agent/Cargo.toml +++ b/ic-agent/Cargo.toml @@ -44,14 +44,6 @@ simple_asn1 = "0.6.1" thiserror = { workspace = true } time = { workspace = true } url = "2.1.0" -tokio = {version = "1.24.2", features = ["full"]} -async-trait = "0.1.80" -tracing = "0.1.40" -arc-swap = "1.7.1" -anyhow = "1.0.86" -simple_moving_average = "1.0.2" -tracing-subscriber = "0.3.18" -tokio-util = { version = "0.7.11", features = ["full"] } [dependencies.hyper] version = "1.0.1" @@ -80,9 +72,16 @@ hyper-rustls = { version = "0.26.0", features = [ "webpki-roots", "http2", ], optional = true } -tokio = { version = "1.24.2", features = ["time"] } +tokio = { version = "1.24.2", features = ["full"] } tower = { version = "0.4.13", optional = true } rustls-webpki = "0.101.4" +async-trait = "0.1.80" +tracing = "0.1.40" +arc-swap = "1.7.1" +anyhow = "1.0.86" +simple_moving_average = "1.0.2" +tracing-subscriber = "0.3.18" +tokio-util = { version = "0.7.11", features = ["full"] } [target.'cfg(target_family = "wasm")'.dependencies] getrandom = { version = "0.2", features = ["js"], optional = true } diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs index c4cdebd5..acb3ae9b 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs @@ -32,6 +32,7 @@ use crate::{ AgentError, }; +/// pub const IC0_SEED_DOMAIN: &str = "ic0.app"; const MAINNET_ROOT_SUBNET_ID: &str = @@ -45,6 +46,7 @@ const HEALTH_CHECK_PERIOD: Duration = Duration::from_secs(1); const DYNAMIC_ROUTE_PROVIDER: &str = "DynamicRouteProvider"; +/// #[derive(Debug)] pub struct DynamicRouteProvider { fetcher: Arc, @@ -75,6 +77,7 @@ impl DynamicRouteProvider where S: RoutingSnapshot + 'static, { + /// pub fn new(snapshot: S, seeds: Vec, http_client: Client) -> Self { let fetcher = Arc::new(NodesFetcher::new( http_client.clone(), @@ -94,21 +97,25 @@ where } } + /// pub fn with_fetcher(mut self, fetcher: Arc) -> Self { self.fetcher = fetcher; self } + /// pub fn with_fetch_period(mut self, period: Duration) -> Self { self.fetch_period = period; self } + /// pub fn with_checker(mut self, checker: Arc) -> Self { self.checker = checker; self } + /// pub fn with_check_period(mut self, period: Duration) -> Self { self.check_period = period; self @@ -183,11 +190,11 @@ where ); (found_healthy_seeds) - .then(|| ()) - .ok_or_else(|| anyhow!("No healthy seeds found")) + .then_some(()) + .ok_or(anyhow!("No healthy seeds found")) } - // Kill all running tasks. + /// Kill all running tasks. pub async fn stop(&self) { self.token.cancel(); self.tracker.close(); @@ -517,7 +524,6 @@ mod tests { // Setup. setup_tracing(); let node_1 = Node::new(IC0_SEED_DOMAIN).unwrap(); - let node_2 = Node::new("api1.com").unwrap(); // Set nodes fetching params: topology, fetching periodicity. let fetcher = Arc::new(NodesFetcherMock::new()); let fetch_interval = Duration::from_secs(2); diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs b/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs index a9cb7a97..9ce88b99 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs @@ -21,33 +21,43 @@ use crate::agent::http_transport::dynamic_routing::{ const CHANNEL_BUFFER: usize = 128; +/// #[async_trait] pub trait HealthCheck: Send + Sync + Debug { + /// async fn check(&self, node: &Node) -> anyhow::Result; } +/// #[derive(Clone, PartialEq, Debug, Default)] pub struct HealthCheckStatus { + /// pub latency: Option, } +/// impl HealthCheckStatus { + /// pub fn new(latency: Option) -> Self { Self { latency } } + /// pub fn is_healthy(&self) -> bool { self.latency.is_some() } } +/// #[derive(Debug)] pub struct HealthChecker { http_client: Client, timeout: Duration, } +/// impl HealthChecker { + /// pub fn new(http_client: Client, timeout: Duration) -> Self { Self { http_client, @@ -133,8 +143,10 @@ impl HealthCheckActor { } } +/// pub const HEALTH_MANAGER_ACTOR: &str = "HealthManagerActor"; +/// pub struct HealthManagerActor { checker: Arc, period: Duration, @@ -153,6 +165,7 @@ impl HealthManagerActor where S: RoutingSnapshot, { + /// pub fn new( checker: Arc, period: Duration, @@ -178,6 +191,7 @@ where } } + /// pub async fn run(mut self) { loop { tokio::select! { diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/messages.rs b/ic-agent/src/agent/http_transport/dynamic_routing/messages.rs index 0e3d3935..90e25cee 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/messages.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/messages.rs @@ -1,11 +1,16 @@ use crate::agent::http_transport::dynamic_routing::{health_check::HealthCheckStatus, node::Node}; +/// #[derive(Debug, Clone)] pub struct FetchedNodes { + /// pub nodes: Vec, } +/// pub struct NodeHealthState { + /// pub node: Node, + /// pub health: HealthCheckStatus, } diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs b/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs index 42dc4e08..d450ce67 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs @@ -1,9 +1,15 @@ pub mod dynamic_route_provider; +/// pub mod health_check; +/// pub mod messages; +/// pub mod node; +/// pub mod nodes_fetch; +/// pub mod snapshot; #[cfg(test)] pub mod test_utils; +/// pub mod type_aliases; diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/node.rs b/ic-agent/src/agent/http_transport/dynamic_routing/node.rs index 1cadc543..d74a7cd0 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/node.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/node.rs @@ -3,12 +3,14 @@ use url::Url; use crate::agent::ApiBoundaryNode; use anyhow::anyhow; +/// #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Node { domain: String, } impl Node { + /// pub fn new(domain: &str) -> anyhow::Result { if !is_valid_domain(domain) { return Err(anyhow!("Invalid domain name {domain}")); @@ -18,12 +20,14 @@ impl Node { }) } + /// pub fn domain(&self) -> String { self.domain.clone() } } impl Node { + /// pub fn to_routing_url(&self) -> Url { Url::parse(&format!("https://{}/api/v2/", self.domain)).expect("failed to parse URL") } @@ -44,6 +48,7 @@ impl TryFrom<&ApiBoundaryNode> for Node { } } +/// pub fn is_valid_domain>(domain: S) -> bool { // Prepend scheme to make it a valid URL let url_string = format!("http://{}", domain.as_ref()); diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs index 0c04fc0e..e10b5f55 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs @@ -22,11 +22,14 @@ use crate::agent::{ const NODES_FETCH_ACTOR: &str = "NodesFetchActor"; +/// #[async_trait] pub trait Fetch: Sync + Send + Debug { + /// async fn fetch(&self, url: Url) -> anyhow::Result>; } +/// #[derive(Debug)] pub struct NodesFetcher { http_client: Client, @@ -34,6 +37,7 @@ pub struct NodesFetcher { } impl NodesFetcher { + /// pub fn new(http_client: Client, subnet_id: Principal) -> Self { Self { http_client, @@ -67,6 +71,7 @@ impl Fetch for NodesFetcher { } } +/// pub struct NodesFetchActor { fetcher: Arc, period: Duration, @@ -80,6 +85,7 @@ impl NodesFetchActor where S: RoutingSnapshot, { + /// pub fn new( fetcher: Arc, period: Duration, @@ -98,6 +104,7 @@ where } } + /// pub async fn run(self) { let mut interval = time::interval(self.period); loop { diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs index b8328724..70710cce 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs @@ -21,13 +21,16 @@ struct WeightedNode { weight: f64, } +/// #[derive(Default, Debug, Clone)] pub struct LatencyRoutingSnapshot { weighted_nodes: Vec, existing_nodes: HashSet, } +/// impl LatencyRoutingSnapshot { + /// pub fn new() -> Self { Self { weighted_nodes: vec![], @@ -39,7 +42,7 @@ impl LatencyRoutingSnapshot { // select weight index based on the input number in range [0, 1] #[inline(always)] fn weighted_sample(weights: &[f64], number: f64) -> Option { - if number < 0.0 || number > 1.0 { + if !(0.0..=1.0).contains(&number) { return None; } let sum: f64 = weights.iter().sum(); @@ -74,7 +77,7 @@ impl RoutingSnapshot for LatencyRoutingSnapshot { } fn sync_nodes(&mut self, nodes: &[Node]) -> anyhow::Result { - let new_nodes = HashSet::from_iter(nodes.into_iter().cloned()); + let new_nodes = HashSet::from_iter(nodes.iter().cloned()); // Find nodes removed from snapshot. let nodes_removed: Vec<_> = self .existing_nodes @@ -292,7 +295,7 @@ mod tests { assert_eq!(snapshot.weighted_nodes[0].node, node_2); // Add node_3 to weighted_nodes manually snapshot.weighted_nodes.push(WeightedNode { - node: node_3.clone(), + node: node_3, latency_mov_avg: LatencyMovAvg::new(), weight: 0.0, }); diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs index 1c63df8b..3695f1d3 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs @@ -1,3 +1,6 @@ +/// pub mod latency_based_routing; +/// pub mod round_robin_routing; +/// pub mod routing_snapshot; diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs index d7dc4995..21216c5c 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs @@ -10,6 +10,7 @@ use crate::agent::http_transport::dynamic_routing::{ health_check::HealthCheckStatus, node::Node, snapshot::routing_snapshot::RoutingSnapshot, }; +/// #[derive(Default, Debug, Clone)] pub struct RoundRobinRoutingSnapshot { current_idx: Arc, @@ -18,6 +19,7 @@ pub struct RoundRobinRoutingSnapshot { } impl RoundRobinRoutingSnapshot { + /// pub fn new() -> Self { Self { current_idx: Arc::new(AtomicUsize::new(0)), @@ -44,7 +46,7 @@ impl RoutingSnapshot for RoundRobinRoutingSnapshot { } fn sync_nodes(&mut self, nodes: &[Node]) -> anyhow::Result { - let new_nodes = HashSet::from_iter(nodes.into_iter().cloned()); + let new_nodes = HashSet::from_iter(nodes.iter().cloned()); // Find nodes removed from snapshot. let nodes_removed: Vec<_> = self .existing_nodes @@ -69,13 +71,13 @@ impl RoutingSnapshot for RoundRobinRoutingSnapshot { } fn update_node(&mut self, node: &Node, health: HealthCheckStatus) -> anyhow::Result { - if !self.existing_nodes.contains(&node) { + if !self.existing_nodes.contains(node) { return Ok(false); } if health.latency.is_some() { Ok(self.healthy_nodes.insert(node.clone())) } else { - Ok(self.healthy_nodes.remove(&node)) + Ok(self.healthy_nodes.remove(node)) } } } @@ -192,10 +194,7 @@ mod tests { snapshot.existing_nodes, HashSet::from_iter(vec![node_1.clone()]) ); - assert_eq!( - snapshot.healthy_nodes, - HashSet::from_iter(vec![node_1.clone()]) - ); + assert_eq!(snapshot.healthy_nodes, HashSet::from_iter(vec![node_1])); // Sync with node_2 let node_2 = Node::new("api2.com").unwrap(); let nodes_changed = snapshot.sync_nodes(&[node_2.clone()]).unwrap(); @@ -218,11 +217,8 @@ mod tests { snapshot.existing_nodes, HashSet::from_iter(vec![node_3.clone(), node_2.clone()]) ); - assert_eq!( - snapshot.healthy_nodes, - HashSet::from_iter(vec![node_2.clone()]) - ); - snapshot.healthy_nodes.insert(node_3.clone()); + assert_eq!(snapshot.healthy_nodes, HashSet::from_iter(vec![node_2])); + snapshot.healthy_nodes.insert(node_3); // Sync with [] let nodes_changed = snapshot.sync_nodes(&[]).unwrap(); assert!(nodes_changed); diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs index f4f2cc5f..ef88b8df 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs @@ -2,9 +2,14 @@ use std::fmt::Debug; use crate::agent::http_transport::dynamic_routing::{health_check::HealthCheckStatus, node::Node}; +/// pub trait RoutingSnapshot: Send + Sync + Clone + Debug { + /// fn has_nodes(&self) -> bool; + /// fn next(&self) -> Option; + /// fn sync_nodes(&mut self, nodes: &[Node]) -> anyhow::Result; + /// fn update_node(&mut self, node: &Node, health: HealthCheckStatus) -> anyhow::Result; } diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs b/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs index 92f922d4..33ef77ea 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs @@ -3,10 +3,15 @@ use std::sync::Arc; use arc_swap::ArcSwap; use tokio::sync::{mpsc, watch}; +/// pub type SenderWatch = watch::Sender>; +/// pub type ReceiverWatch = watch::Receiver>; +/// pub type SenderMpsc = mpsc::Sender; +/// pub type ReceiverMpsc = mpsc::Receiver; +/// pub type GlobalShared = Arc>; diff --git a/ic-agent/src/agent/http_transport/mod.rs b/ic-agent/src/agent/http_transport/mod.rs index f938a90d..91959ac0 100644 --- a/ic-agent/src/agent/http_transport/mod.rs +++ b/ic-agent/src/agent/http_transport/mod.rs @@ -30,5 +30,8 @@ const ICP0_SUB_DOMAIN: &str = ".icp0.io"; const ICP_API_SUB_DOMAIN: &str = ".icp-api.io"; #[allow(dead_code)] const LOCALHOST_SUB_DOMAIN: &str = ".localhost"; +/// +#[cfg(not(target_family = "wasm"))] +#[cfg(feature = "reqwest")] pub mod dynamic_routing; pub mod route_provider; From 973b377d61b17dc703954f9fb77b7934fc986003 Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Thu, 27 Jun 2024 10:11:11 +0200 Subject: [PATCH 04/24] chore: use Duration for LatencyMovAvg --- .../snapshot/latency_based_routing.rs | 50 +++++++++++-------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs index 70710cce..cb4826a6 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs @@ -12,7 +12,9 @@ const MAX_LATENCY: Duration = Duration::from_secs(500); const WINDOW_SIZE: usize = 15; -type LatencyMovAvg = SumTreeSMA; +// Algorithmic complexity: add sample - O(log(N)), get average - O(1). +// Space complexity: O(N) +type LatencyMovAvg = SumTreeSMA; #[derive(Clone, Debug)] struct WeightedNode { @@ -108,19 +110,19 @@ impl RoutingSnapshot for LatencyRoutingSnapshot { } // If latency is None (meaning Node is unhealthy), we assign some big value - let latency = health.latency.unwrap_or(MAX_LATENCY).as_millis() as f64; + let latency = health.latency.unwrap_or(MAX_LATENCY); if let Some(idx) = self.weighted_nodes.iter().position(|x| &x.node == node) { // Node is already in the array (it is not the first update_node() call). self.weighted_nodes[idx].latency_mov_avg.add_sample(latency); let latency_avg = self.weighted_nodes[idx].latency_mov_avg.get_average(); // As nodes with smaller average latencies are preferred for routing, we use inverted values for weights. - self.weighted_nodes[idx].weight = 1.0 / latency_avg; + self.weighted_nodes[idx].weight = 1.0 / latency_avg.as_secs_f64(); } else { // Node is not yet in array (first update_node() call). - let mut latency_mov_avg = LatencyMovAvg::new(); + let mut latency_mov_avg = LatencyMovAvg::from_zero(Duration::ZERO); latency_mov_avg.add_sample(latency); - let weight = 1.0 / latency_mov_avg.get_average(); + let weight = 1.0 / latency_mov_avg.get_average().as_secs_f64(); self.weighted_nodes.push(WeightedNode { latency_mov_avg, node: node.clone(), @@ -194,8 +196,11 @@ mod tests { assert!(is_updated); assert!(snapshot.has_nodes()); let weighted_node = snapshot.weighted_nodes.first().unwrap(); - assert_eq!(weighted_node.latency_mov_avg.get_average(), 1000.0); - assert_eq!(weighted_node.weight, 0.001); + assert_eq!( + weighted_node.latency_mov_avg.get_average(), + Duration::from_secs(1) + ); + assert_eq!(weighted_node.weight, 1.0); assert_eq!(snapshot.next().unwrap(), node); // Check second update let health = HealthCheckStatus { @@ -206,8 +211,11 @@ mod tests { .expect("node update failed"); assert!(is_updated); let weighted_node = snapshot.weighted_nodes.first().unwrap(); - assert_eq!(weighted_node.latency_mov_avg.get_average(), 1500.0); - assert_eq!(weighted_node.weight, 1.0 / 1500.0); + assert_eq!( + weighted_node.latency_mov_avg.get_average(), + Duration::from_millis(1500) + ); + assert_eq!(weighted_node.weight, 1.0 / 1.5); // Check third update let health = HealthCheckStatus { latency: Some(Duration::from_secs(3)), @@ -217,8 +225,11 @@ mod tests { .expect("node update failed"); assert!(is_updated); let weighted_node = snapshot.weighted_nodes.first().unwrap(); - assert_eq!(weighted_node.latency_mov_avg.get_average(), 2000.0); - assert_eq!(weighted_node.weight, 1.0 / 2000.0); + assert_eq!( + weighted_node.latency_mov_avg.get_average(), + Duration::from_millis(2000) + ); + assert_eq!(weighted_node.weight, 0.5); // Check forth update with none let health = HealthCheckStatus { latency: None }; let is_updated = snapshot @@ -226,14 +237,9 @@ mod tests { .expect("node update failed"); assert!(is_updated); let weighted_node = snapshot.weighted_nodes.first().unwrap(); - assert_eq!( - weighted_node.latency_mov_avg.get_average(), - (MAX_LATENCY.as_millis() as f64 + 6000.0) / 4.0 - ); - assert_eq!( - weighted_node.weight, - 4.0 / (MAX_LATENCY.as_millis() as f64 + 6000.0) - ); + let avg_latency = Duration::from_secs_f64((MAX_LATENCY.as_secs() as f64 + 6.0) / 4.0); + assert_eq!(weighted_node.latency_mov_avg.get_average(), avg_latency); + assert_eq!(weighted_node.weight, 1.0 / avg_latency.as_secs_f64()); assert_eq!(snapshot.weighted_nodes.len(), 1); assert_eq!(snapshot.existing_nodes.len(), 1); assert_eq!(snapshot.next().unwrap(), node); @@ -255,7 +261,7 @@ mod tests { // Add node_1 to weighted_nodes manually snapshot.weighted_nodes.push(WeightedNode { node: node_1.clone(), - latency_mov_avg: LatencyMovAvg::new(), + latency_mov_avg: LatencyMovAvg::from_zero(Duration::ZERO), weight: 0.0, }); // Sync with node_1 again @@ -279,7 +285,7 @@ mod tests { // Add node_2 to weighted_nodes manually snapshot.weighted_nodes.push(WeightedNode { node: node_2.clone(), - latency_mov_avg: LatencyMovAvg::new(), + latency_mov_avg: LatencyMovAvg::from_zero(Duration::ZERO), weight: 0.0, }); // Sync with [node_2, node_3] @@ -296,7 +302,7 @@ mod tests { // Add node_3 to weighted_nodes manually snapshot.weighted_nodes.push(WeightedNode { node: node_3, - latency_mov_avg: LatencyMovAvg::new(), + latency_mov_avg: LatencyMovAvg::from_zero(Duration::ZERO), weight: 0.0, }); // Sync with [] From 32bc62e7e471f32854d43ee45be8f911b2555c36 Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Fri, 28 Jun 2024 11:54:36 +0200 Subject: [PATCH 05/24] docs: add --- .../dynamic_routing/dynamic_route_provider.rs | 50 +++++++----- .../dynamic_routing/health_check.rs | 76 ++++++++++++------- .../dynamic_routing/messages.rs | 10 +-- .../http_transport/dynamic_routing/mod.rs | 13 ++-- .../http_transport/dynamic_routing/node.rs | 10 +-- .../dynamic_routing/nodes_fetch.rs | 36 +++++---- .../snapshot/latency_based_routing.rs | 36 +++++---- .../dynamic_routing/snapshot/mod.rs | 6 +- .../snapshot/round_robin_routing.rs | 22 +++--- .../dynamic_routing/test_utils.rs | 10 +-- .../dynamic_routing/type_aliases.rs | 17 +++-- 11 files changed, 161 insertions(+), 125 deletions(-) diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs index acb3ae9b..eff84d3b 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs @@ -25,7 +25,7 @@ use crate::{ node::Node, nodes_fetch::{Fetch, NodesFetchActor, NodesFetcher}, snapshot::routing_snapshot::RoutingSnapshot, - type_aliases::GlobalShared, + type_aliases::AtomicSwap, }, route_provider::RouteProvider, }, @@ -46,17 +46,28 @@ const HEALTH_CHECK_PERIOD: Duration = Duration::from_secs(1); const DYNAMIC_ROUTE_PROVIDER: &str = "DynamicRouteProvider"; -/// +/// A dynamic route provider. +/// It spawns the discovery service (`NodesFetchActor`) for fetching the latest nodes topology. +/// It also spawns the `HealthManagerActor`, which orchestrates the health check tasks for each node and updates routing snapshot. #[derive(Debug)] pub struct DynamicRouteProvider { + /// Fetcher for fetching the latest nodes topology. fetcher: Arc, + /// Periodicity of fetching the latest nodes topology. fetch_period: Duration, + /// Interval for retrying fetching the nodes in case of error. fetch_retry_interval: Duration, + /// Health checker for checking the health of the nodes. checker: Arc, + /// Periodicity of checking the health of the nodes. check_period: Duration, - snapshot: GlobalShared, + /// Snapshot of the routing nodes. + routing_snapshot: AtomicSwap, + /// Task tracker for managing the spawned tasks. tracker: TaskTracker, + /// Initial seed nodes, which are used for the initial fetching of the nodes. seeds: Vec, + /// Cancellation token for stopping the spawned tasks. token: CancellationToken, } @@ -65,7 +76,7 @@ where S: RoutingSnapshot + 'static, { fn route(&self) -> Result { - let snapshot = self.snapshot.load(); + let snapshot = self.routing_snapshot.load(); let node = snapshot.next().ok_or_else(|| { AgentError::RouteProviderError("No healthy API nodes found.".to_string()) })?; @@ -77,7 +88,7 @@ impl DynamicRouteProvider where S: RoutingSnapshot + 'static, { - /// + /// Creates a new instance of `DynamicRouteProvider`. pub fn new(snapshot: S, seeds: Vec, http_client: Client) -> Self { let fetcher = Arc::new(NodesFetcher::new( http_client.clone(), @@ -91,31 +102,31 @@ where checker, check_period: HEALTH_CHECK_PERIOD, seeds, - snapshot: Arc::new(ArcSwap::from_pointee(snapshot)), + routing_snapshot: Arc::new(ArcSwap::from_pointee(snapshot)), tracker: TaskTracker::new(), token: CancellationToken::new(), } } - /// + /// Sets the fetcher for fetching the latest nodes topology. pub fn with_fetcher(mut self, fetcher: Arc) -> Self { self.fetcher = fetcher; self } - /// + /// Sets the periodicity of fetching the latest nodes topology. pub fn with_fetch_period(mut self, period: Duration) -> Self { self.fetch_period = period; self } - /// + /// Sets the interval for retrying fetching the nodes in case of error. pub fn with_checker(mut self, checker: Arc) -> Self { self.checker = checker; self } - /// + /// Sets the periodicity of checking the health of the nodes. pub fn with_check_period(mut self, period: Duration) -> Self { self.check_period = period; self @@ -133,14 +144,14 @@ where // Communication channel between NodesFetchActor and HealthManagerActor. let (fetch_sender, fetch_receiver) = watch::channel(None); - // Communication channel with HealthManagerActor to receive info about healthy seeds. + // Communication channel with HealthManagerActor to receive info about healthy seed nodes (used only once). let (init_sender, mut init_receiver) = mpsc::channel(1); // Start the receiving part first. let health_manager_actor = HealthManagerActor::new( Arc::clone(&self.checker), self.check_period, - Arc::clone(&self.snapshot), + Arc::clone(&self.routing_snapshot), fetch_receiver, init_sender, self.token.clone(), @@ -156,7 +167,7 @@ where error!("{DYNAMIC_ROUTE_PROVIDER}: failed to send results to HealthManager: {err:?}"); } - // Try await healthy seeds. + // Try await for healthy seeds. let found_healthy_seeds = match timeout(TIMEOUT_AWAIT_HEALTHY_SEED, init_receiver.recv()).await { Ok(_) => { @@ -174,6 +185,7 @@ where false } }; + // We can close the channel now. init_receiver.close(); let fetch_actor = NodesFetchActor::new( @@ -181,7 +193,7 @@ where self.fetch_period, self.fetch_retry_interval, fetch_sender, - Arc::clone(&self.snapshot), + Arc::clone(&self.routing_snapshot), self.token.clone(), ); self.tracker.spawn(async move { fetch_actor.run().await }); @@ -189,9 +201,9 @@ where "{DYNAMIC_ROUTE_PROVIDER}: NodesFetchActor and HealthManagerActor started successfully" ); - (found_healthy_seeds) - .then_some(()) - .ok_or(anyhow!("No healthy seeds found")) + (found_healthy_seeds).then_some(()).ok_or(anyhow!( + "No healthy seeds found, they may become healthy later ..." + )) } /// Kill all running tasks. @@ -364,7 +376,7 @@ mod tests { .await .unwrap_err() .to_string() - .contains("No healthy seeds found")); + .contains("No healthy seeds found, they may become healthy later ...")); // Test 1: calls to route() return an error, as no healthy seeds exist. for _ in 0..4 { @@ -461,7 +473,7 @@ mod tests { .await .unwrap_err() .to_string() - .contains("No healthy seeds found")); + .contains("No healthy seeds found, they may become healthy later ...")); // Test: calls to route() return an error, as no healthy seeds exist. for _ in 0..4 { diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs b/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs index 9ce88b99..befbd99e 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs @@ -16,48 +16,50 @@ use crate::agent::http_transport::dynamic_routing::{ messages::{FetchedNodes, NodeHealthState}, node::Node, snapshot::routing_snapshot::RoutingSnapshot, - type_aliases::{GlobalShared, ReceiverMpsc, ReceiverWatch, SenderMpsc}, + type_aliases::{AtomicSwap, ReceiverMpsc, ReceiverWatch, SenderMpsc}, }; const CHANNEL_BUFFER: usize = 128; -/// +/// A trait representing a health check of the node. #[async_trait] pub trait HealthCheck: Send + Sync + Debug { - /// + /// Checks the health of the node. async fn check(&self, node: &Node) -> anyhow::Result; } -/// +/// A struct representing the health check status of the node. #[derive(Clone, PartialEq, Debug, Default)] pub struct HealthCheckStatus { - /// - pub latency: Option, + latency: Option, } -/// impl HealthCheckStatus { - /// + /// Creates a new `HealthCheckStatus` instance. pub fn new(latency: Option) -> Self { Self { latency } } - /// + /// Checks if the node is healthy. pub fn is_healthy(&self) -> bool { self.latency.is_some() } + + /// Get the latency of the health check. + pub fn latency(&self) -> Option { + self.latency + } } -/// +/// A struct implementing the `HealthCheck` for the nodes. #[derive(Debug)] pub struct HealthChecker { http_client: Client, timeout: Duration, } -/// impl HealthChecker { - /// + /// Creates a new `HealthChecker` instance. pub fn new(http_client: Client, timeout: Duration) -> Self { Self { http_client, @@ -96,16 +98,22 @@ impl HealthCheck for HealthChecker { const HEALTH_CHECK_ACTOR: &str = "HealthCheckActor"; +/// A struct performing the health check of the node and sending the health status to the listener. struct HealthCheckActor { + /// The health checker. checker: Arc, + /// The period of the health check. period: Duration, + /// The node to check. node: Node, + /// The sender channel (listener) to send the health status. sender_channel: SenderMpsc, + /// The cancellation token of the actor. token: CancellationToken, } impl HealthCheckActor { - pub fn new( + fn new( checker: Arc, period: Duration, node: Node, @@ -121,7 +129,8 @@ impl HealthCheckActor { } } - pub async fn run(self) { + /// Runs the actor. + async fn run(self) { let mut interval = time::interval(self.period); loop { tokio::select! { @@ -143,21 +152,34 @@ impl HealthCheckActor { } } -/// +/// The name of the health manager actor. pub const HEALTH_MANAGER_ACTOR: &str = "HealthManagerActor"; -/// +/// A struct managing the health checks of the nodes. +/// It receives the fetched nodes from the `NodesFetchActor` and starts the health checks for them. +/// It also receives the health status of the nodes from the `HealthCheckActor/s` and updates the routing snapshot. pub struct HealthManagerActor { + /// The health checker. checker: Arc, + /// The period of the health check. period: Duration, - nodes_snapshot: GlobalShared, + /// The routing snapshot, storing the nodes. + routing_snapshot: AtomicSwap, + /// The receiver channel to listen to the fetched nodes messages. fetch_receiver: ReceiverWatch, + /// The sender channel to send the health status of the nodes back to HealthManagerActor. check_sender: SenderMpsc, + /// The receiver channel to receive the health status of the nodes from the `HealthCheckActor/s`. check_receiver: ReceiverMpsc, + /// The sender channel to send the initialization status to DynamicRouteProvider (used only once in the init phase). init_sender: SenderMpsc, + /// The cancellation token of the actor. token: CancellationToken, + /// The cancellation token for all the health checks. nodes_token: CancellationToken, + /// The task tracker of the health checks, waiting for the tasks to exit (graceful termination). nodes_tracker: TaskTracker, + /// The flag indicating if this actor is initialized with healthy nodes. is_initialized: bool, } @@ -165,11 +187,11 @@ impl HealthManagerActor where S: RoutingSnapshot, { - /// + /// Creates a new `HealthManagerActor` instance. pub fn new( checker: Arc, period: Duration, - nodes_snapshot: GlobalShared, + routing_snapshot: AtomicSwap, fetch_receiver: ReceiverWatch, init_sender: SenderMpsc, token: CancellationToken, @@ -179,7 +201,7 @@ where Self { checker, period, - nodes_snapshot, + routing_snapshot, fetch_receiver, check_sender, check_receiver, @@ -191,11 +213,11 @@ where } } - /// + /// Runs the actor. pub async fn run(mut self) { loop { tokio::select! { - // Check if a new array of fetched nodes appeared in the channel from NodesFetchService. + // Process a new array of fetched nodes from NodesFetchActor, if it appeared in the channel. result = self.fetch_receiver.changed() => { if let Err(err) = result { error!("{HEALTH_MANAGER_ACTOR}: nodes fetch sender has been dropped: {err:?}"); @@ -206,7 +228,7 @@ where let Some(FetchedNodes { nodes }) = self.fetch_receiver.borrow_and_update().clone() else { continue }; self.handle_fetch_update(nodes).await; } - // Receive health check messages from all running NodeHealthChecker/s. + // Receive health check messages from all running HealthCheckActor/s. Some(msg) = self.check_receiver.recv() => { self.handle_health_update(msg).await; } @@ -221,13 +243,13 @@ where } async fn handle_health_update(&mut self, msg: NodeHealthState) { - let current_snapshot = self.nodes_snapshot.load_full(); + let current_snapshot = self.routing_snapshot.load_full(); let mut new_snapshot = (*current_snapshot).clone(); if let Err(err) = new_snapshot.update_node(&msg.node, msg.health.clone()) { error!("{HEALTH_MANAGER_ACTOR}: failed to update snapshot: {err:?}"); return; } - self.nodes_snapshot.store(Arc::new(new_snapshot)); + self.routing_snapshot.store(Arc::new(new_snapshot)); if !self.is_initialized && msg.health.is_healthy() { self.is_initialized = true; // If TIMEOUT_AWAIT_HEALTHY_SEED has been exceeded, the receiver was dropped and send would thus fail. We ignore the failure. @@ -244,11 +266,11 @@ where return; } debug!("{HEALTH_MANAGER_ACTOR}: fetched nodes received {:?}", nodes); - let current_snapshot = self.nodes_snapshot.load_full(); + let current_snapshot = self.routing_snapshot.load_full(); let mut new_snapshot = (*current_snapshot).clone(); // If the snapshot has changed, store it and restart all node's health checks. if let Ok(true) = new_snapshot.sync_nodes(&nodes) { - self.nodes_snapshot.store(Arc::new(new_snapshot)); + self.routing_snapshot.store(Arc::new(new_snapshot)); self.stop_all_checks().await; self.start_checks(nodes.to_vec()); } diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/messages.rs b/ic-agent/src/agent/http_transport/dynamic_routing/messages.rs index 90e25cee..5feeae25 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/messages.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/messages.rs @@ -1,16 +1,16 @@ use crate::agent::http_transport::dynamic_routing::{health_check::HealthCheckStatus, node::Node}; -/// +/// Represents a message with fetched nodes. #[derive(Debug, Clone)] pub struct FetchedNodes { - /// + /// The fetched nodes. pub nodes: Vec, } -/// +/// Represents a message with the health state of a node. pub struct NodeHealthState { - /// + /// The node. pub node: Node, - /// + /// The health state of the node. pub health: HealthCheckStatus, } diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs b/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs index d450ce67..9b6b69de 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs @@ -1,15 +1,16 @@ +//! Dynamic routing implementation. pub mod dynamic_route_provider; -/// +/// Health check implementation. pub mod health_check; -/// +/// Messages used in dynamic routing. pub mod messages; -/// +/// Node implementation. pub mod node; -/// +/// Nodes fetch implementation. pub mod nodes_fetch; -/// +/// Routing snapshot implementation. pub mod snapshot; #[cfg(test)] pub mod test_utils; -/// +/// Type aliases used in dynamic routing. pub mod type_aliases; diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/node.rs b/ic-agent/src/agent/http_transport/dynamic_routing/node.rs index d74a7cd0..482dfe89 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/node.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/node.rs @@ -3,14 +3,14 @@ use url::Url; use crate::agent::ApiBoundaryNode; use anyhow::anyhow; -/// +/// Represents a node in the dynamic routing. #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Node { domain: String, } impl Node { - /// + /// Creates a new `Node` instance from the domain name. pub fn new(domain: &str) -> anyhow::Result { if !is_valid_domain(domain) { return Err(anyhow!("Invalid domain name {domain}")); @@ -20,14 +20,14 @@ impl Node { }) } - /// + /// Returns the domain name of the node. pub fn domain(&self) -> String { self.domain.clone() } } impl Node { - /// + /// Converts the node to a routing URL. pub fn to_routing_url(&self) -> Url { Url::parse(&format!("https://{}/api/v2/", self.domain)).expect("failed to parse URL") } @@ -48,7 +48,7 @@ impl TryFrom<&ApiBoundaryNode> for Node { } } -/// +/// Checks if the given domain is a valid URL. pub fn is_valid_domain>(domain: S) -> bool { // Prepend scheme to make it a valid URL let url_string = format!("http://{}", domain.as_ref()); diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs index e10b5f55..9f81a401 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs @@ -12,7 +12,7 @@ use crate::agent::{ http_transport::{ dynamic_routing::{ health_check::HEALTH_MANAGER_ACTOR, messages::FetchedNodes, node::Node, - snapshot::routing_snapshot::RoutingSnapshot, type_aliases::GlobalShared, + snapshot::routing_snapshot::RoutingSnapshot, type_aliases::AtomicSwap, type_aliases::SenderWatch, }, reqwest_transport::ReqwestTransport, @@ -22,14 +22,14 @@ use crate::agent::{ const NODES_FETCH_ACTOR: &str = "NodesFetchActor"; -/// +/// Fetcher of nodes in the topology. #[async_trait] pub trait Fetch: Sync + Send + Debug { - /// + /// Fetches the nodes from the topology. async fn fetch(&self, url: Url) -> anyhow::Result>; } -/// +/// A struct representing the fetcher of the nodes from the topology. #[derive(Debug)] pub struct NodesFetcher { http_client: Client, @@ -37,7 +37,7 @@ pub struct NodesFetcher { } impl NodesFetcher { - /// + /// Creates a new `NodesFetcher` instance. pub fn new(http_client: Client, subnet_id: Principal) -> Self { Self { http_client, @@ -71,13 +71,19 @@ impl Fetch for NodesFetcher { } } -/// +/// A struct representing the actor responsible for fetching existing nodes and communicating it with the listener. pub struct NodesFetchActor { + /// The fetcher object responsible for fetching the nodes. fetcher: Arc, + /// Time period between fetches. period: Duration, + /// The interval to wait before retrying to fetch the nodes in case of failures. fetch_retry_interval: Duration, + /// Communication channel with the listener. fetch_sender: SenderWatch, - snapshot: GlobalShared, + /// The snapshot of the routing table. + routing_snapshot: AtomicSwap, + /// The token to cancel/stop the actor. token: CancellationToken, } @@ -85,13 +91,13 @@ impl NodesFetchActor where S: RoutingSnapshot, { - /// + /// Creates a new `NodesFetchActor` instance. pub fn new( fetcher: Arc, period: Duration, retry_interval: Duration, fetch_sender: SenderWatch, - snapshot: GlobalShared, + snapshot: AtomicSwap, token: CancellationToken, ) -> Self { Self { @@ -99,26 +105,26 @@ where period, fetch_retry_interval: retry_interval, fetch_sender, - snapshot, + routing_snapshot: snapshot, token, } } - /// + /// Runs the actor. pub async fn run(self) { let mut interval = time::interval(self.period); loop { tokio::select! { _ = interval.tick() => { // Retry until success: - // - try to get a healthy node from the snapshot + // - try to get a healthy node from the routing snapshot // - if snapshot is empty, break the cycle and wait for the next fetch cycle - // - using the healthy node, try to fetch new nodes from topology + // - using the healthy node, try to fetch nodes from topology // - if failure, sleep and retry // - try send fetched nodes to the listener - // - failure should never happen + // - failure should never happen, but we trace it if it does loop { - let snapshot = self.snapshot.load(); + let snapshot = self.routing_snapshot.load(); if let Some(node) = snapshot.next() { match self.fetcher.fetch((&node).into()).await { Ok(nodes) => { diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs index cb4826a6..cec2580c 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs @@ -16,23 +16,28 @@ const WINDOW_SIZE: usize = 15; // Space complexity: O(N) type LatencyMovAvg = SumTreeSMA; +/// A node, which stores health check latencies in the form of moving average. #[derive(Clone, Debug)] struct WeightedNode { node: Node, + /// Moving mean of latencies measurements. latency_mov_avg: LatencyMovAvg, + /// Weight of the node (invers of the average latency), used for stochastic weighted random sampling. weight: f64, } -/// +/// Routing snapshot for latency-based routing. +/// In this routing strategy, nodes are randomly selected based on their averaged latency of the last WINDOW_SIZE health checks. +/// Nodes with smaller average latencies are preferred for routing. #[derive(Default, Debug, Clone)] pub struct LatencyRoutingSnapshot { weighted_nodes: Vec, existing_nodes: HashSet, } -/// +/// Implementation of the LatencyRoutingSnapshot. impl LatencyRoutingSnapshot { - /// + /// Creates a new LatencyRoutingSnapshot. pub fn new() -> Self { Self { weighted_nodes: vec![], @@ -41,7 +46,8 @@ impl LatencyRoutingSnapshot { } } -// select weight index based on the input number in range [0, 1] +/// Helper function to sample nodes based on their weights. +/// Here weight index is selected based on the input number in range [0, 1] #[inline(always)] fn weighted_sample(weights: &[f64], number: f64) -> Option { if !(0.0..=1.0).contains(&number) { @@ -80,7 +86,7 @@ impl RoutingSnapshot for LatencyRoutingSnapshot { fn sync_nodes(&mut self, nodes: &[Node]) -> anyhow::Result { let new_nodes = HashSet::from_iter(nodes.iter().cloned()); - // Find nodes removed from snapshot. + // Find nodes removed from topology. let nodes_removed: Vec<_> = self .existing_nodes .difference(&new_nodes) @@ -110,7 +116,7 @@ impl RoutingSnapshot for LatencyRoutingSnapshot { } // If latency is None (meaning Node is unhealthy), we assign some big value - let latency = health.latency.unwrap_or(MAX_LATENCY); + let latency = health.latency().unwrap_or(MAX_LATENCY); if let Some(idx) = self.weighted_nodes.iter().position(|x| &x.node == node) { // Node is already in the array (it is not the first update_node() call). @@ -166,9 +172,7 @@ mod tests { // Arrange let mut snapshot = LatencyRoutingSnapshot::new(); let node = Node::new("api1.com").unwrap(); - let health = HealthCheckStatus { - latency: Some(Duration::from_secs(1)), - }; + let health = HealthCheckStatus::new(Some(Duration::from_secs(1))); // Act let is_updated = snapshot .update_node(&node, health) @@ -185,9 +189,7 @@ mod tests { // Arrange let mut snapshot = LatencyRoutingSnapshot::new(); let node = Node::new("api1.com").unwrap(); - let health = HealthCheckStatus { - latency: Some(Duration::from_secs(1)), - }; + let health = HealthCheckStatus::new(Some(Duration::from_secs(1))); snapshot.existing_nodes.insert(node.clone()); // Check first update let is_updated = snapshot @@ -203,9 +205,7 @@ mod tests { assert_eq!(weighted_node.weight, 1.0); assert_eq!(snapshot.next().unwrap(), node); // Check second update - let health = HealthCheckStatus { - latency: Some(Duration::from_secs(2)), - }; + let health = HealthCheckStatus::new(Some(Duration::from_secs(2))); let is_updated = snapshot .update_node(&node, health) .expect("node update failed"); @@ -217,9 +217,7 @@ mod tests { ); assert_eq!(weighted_node.weight, 1.0 / 1.5); // Check third update - let health = HealthCheckStatus { - latency: Some(Duration::from_secs(3)), - }; + let health = HealthCheckStatus::new(Some(Duration::from_secs(3))); let is_updated = snapshot .update_node(&node, health) .expect("node update failed"); @@ -231,7 +229,7 @@ mod tests { ); assert_eq!(weighted_node.weight, 0.5); // Check forth update with none - let health = HealthCheckStatus { latency: None }; + let health = HealthCheckStatus::new(None); let is_updated = snapshot .update_node(&node, health) .expect("node update failed"); diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs index 3695f1d3..73df1537 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs @@ -1,6 +1,6 @@ -/// +/// Snapshot of the routing table. pub mod latency_based_routing; -/// +/// Node implementation. pub mod round_robin_routing; -/// +/// Routing snapshot implementation. pub mod routing_snapshot; diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs index 21216c5c..e468d262 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs @@ -10,7 +10,7 @@ use crate::agent::http_transport::dynamic_routing::{ health_check::HealthCheckStatus, node::Node, snapshot::routing_snapshot::RoutingSnapshot, }; -/// +/// Routing snapshot, which samples nodes in a round-robin fashion. #[derive(Default, Debug, Clone)] pub struct RoundRobinRoutingSnapshot { current_idx: Arc, @@ -19,7 +19,7 @@ pub struct RoundRobinRoutingSnapshot { } impl RoundRobinRoutingSnapshot { - /// + /// Creates a new instance of `RoundRobinRoutingSnapshot`. pub fn new() -> Self { Self { current_idx: Arc::new(AtomicUsize::new(0)), @@ -47,14 +47,14 @@ impl RoutingSnapshot for RoundRobinRoutingSnapshot { fn sync_nodes(&mut self, nodes: &[Node]) -> anyhow::Result { let new_nodes = HashSet::from_iter(nodes.iter().cloned()); - // Find nodes removed from snapshot. + // Find nodes removed from topology. let nodes_removed: Vec<_> = self .existing_nodes .difference(&new_nodes) .cloned() .collect(); let has_removed_nodes = !nodes_removed.is_empty(); - // Find nodes added to snapshot. + // Find nodes added to topology. let nodes_added: Vec<_> = new_nodes .difference(&self.existing_nodes) .cloned() @@ -74,7 +74,7 @@ impl RoutingSnapshot for RoundRobinRoutingSnapshot { if !self.existing_nodes.contains(node) { return Ok(false); } - if health.latency.is_some() { + if health.is_healthy() { Ok(self.healthy_nodes.insert(node.clone())) } else { Ok(self.healthy_nodes.remove(node)) @@ -113,10 +113,8 @@ mod tests { let mut snapshot = RoundRobinRoutingSnapshot::new(); // This node is not present in existing_nodes let node = Node::new("api1.com").unwrap(); - let healthy = HealthCheckStatus { - latency: Some(Duration::from_secs(1)), - }; - let unhealthy = HealthCheckStatus { latency: None }; + let healthy = HealthCheckStatus::new(Some(Duration::from_secs(1))); + let unhealthy = HealthCheckStatus::new(None); // Act 1 let is_updated = snapshot .update_node(&node, healthy) @@ -142,9 +140,7 @@ mod tests { let node = Node::new("api1.com").unwrap(); // node is present in existing_nodes, but not in healthy_nodes snapshot.existing_nodes.insert(node.clone()); - let health = HealthCheckStatus { - latency: Some(Duration::from_secs(1)), - }; + let health = HealthCheckStatus::new(Some(Duration::from_secs(1))); // Act let is_updated = snapshot .update_node(&node, health) @@ -162,7 +158,7 @@ mod tests { let node = Node::new("api1.com").unwrap(); snapshot.existing_nodes.insert(node.clone()); snapshot.healthy_nodes.insert(node.clone()); - let unhealthy = HealthCheckStatus { latency: None }; + let unhealthy = HealthCheckStatus::new(None); // Act let is_updated = snapshot .update_node(&node, unhealthy) diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs b/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs index a1d659c9..8b411ed2 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs @@ -11,7 +11,7 @@ use crate::agent::http_transport::{ health_check::{HealthCheck, HealthCheckStatus}, node::Node, nodes_fetch::Fetch, - type_aliases::GlobalShared, + type_aliases::AtomicSwap, }, route_provider::RouteProvider, }; @@ -54,8 +54,8 @@ where #[derive(Debug)] pub struct NodesFetcherMock { - // A mocked set of nodes existing in the topology. - pub nodes: GlobalShared>, + // A set of nodes, existing in the topology. + pub nodes: AtomicSwap>, } #[async_trait] @@ -86,7 +86,7 @@ impl NodesFetcherMock { #[derive(Debug)] pub struct NodeHealthCheckerMock { - pub healthy_nodes: GlobalShared>, + healthy_nodes: Arc>>, } impl Default for NodeHealthCheckerMock { @@ -103,7 +103,7 @@ impl HealthCheck for NodeHealthCheckerMock { true => Some(Duration::from_secs(1)), false => None, }; - Ok(HealthCheckStatus { latency }) + Ok(HealthCheckStatus::new(latency)) } } diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs b/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs index 33ef77ea..04f32b66 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs @@ -1,17 +1,18 @@ -use std::sync::Arc; - use arc_swap::ArcSwap; +use std::sync::Arc; use tokio::sync::{mpsc, watch}; -/// +/// A type alias for the sender end of a watch channel. pub type SenderWatch = watch::Sender>; -/// + +/// A type alias for the receiver end of a watch channel. pub type ReceiverWatch = watch::Receiver>; -/// +/// A type alias for the sender end of a multi-producer, single-consumer channel. pub type SenderMpsc = mpsc::Sender; -/// + +/// A type alias for the receiver end of a multi-producer, single-consumer channel. pub type ReceiverMpsc = mpsc::Receiver; -/// -pub type GlobalShared = Arc>; +/// A type alias for an atomic swap operation on a shared value. +pub type AtomicSwap = Arc>; From 8bfe51b26115a69a386a1cc406aed6d93bb5a216 Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Tue, 16 Jul 2024 10:11:28 +0200 Subject: [PATCH 06/24] chore: add DynamicRouteProviderBuilder --- .../dynamic_routing/dynamic_route_provider.rs | 219 ++++++++++-------- 1 file changed, 124 insertions(+), 95 deletions(-) diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs index eff84d3b..ceb87e08 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs @@ -10,6 +10,7 @@ use arc_swap::ArcSwap; use candid::Principal; use reqwest::Client; use tokio::{ + runtime::Handle, sync::{mpsc, watch}, time::timeout, }; @@ -71,24 +72,19 @@ pub struct DynamicRouteProvider { token: CancellationToken, } -impl RouteProvider for DynamicRouteProvider -where - S: RoutingSnapshot + 'static, -{ - fn route(&self) -> Result { - let snapshot = self.routing_snapshot.load(); - let node = snapshot.next().ok_or_else(|| { - AgentError::RouteProviderError("No healthy API nodes found.".to_string()) - })?; - Ok(node.to_routing_url()) - } +/// A builder for the `DynamicRouteProvider`. +pub struct DynamicRouteProviderBuilder { + fetcher: Arc, + fetch_period: Duration, + fetch_retry_interval: Duration, + checker: Arc, + check_period: Duration, + routing_snapshot: AtomicSwap, + seeds: Vec, } -impl DynamicRouteProvider -where - S: RoutingSnapshot + 'static, -{ - /// Creates a new instance of `DynamicRouteProvider`. +impl DynamicRouteProviderBuilder { + /// Creates a new instance of the builder. pub fn new(snapshot: S, seeds: Vec, http_client: Client) -> Self { let fetcher = Arc::new(NodesFetcher::new( http_client.clone(), @@ -103,35 +99,75 @@ where check_period: HEALTH_CHECK_PERIOD, seeds, routing_snapshot: Arc::new(ArcSwap::from_pointee(snapshot)), - tracker: TaskTracker::new(), - token: CancellationToken::new(), } } - /// Sets the fetcher for fetching the latest nodes topology. + /// Sets the fetcher of the nodes in the topology. pub fn with_fetcher(mut self, fetcher: Arc) -> Self { self.fetcher = fetcher; self } - /// Sets the periodicity of fetching the latest nodes topology. + /// Sets the fetching periodicity. pub fn with_fetch_period(mut self, period: Duration) -> Self { self.fetch_period = period; self } - /// Sets the interval for retrying fetching the nodes in case of error. + /// Sets the node health checker. pub fn with_checker(mut self, checker: Arc) -> Self { self.checker = checker; self } - /// Sets the periodicity of checking the health of the nodes. + /// Sets the periodicity of node health checking. pub fn with_check_period(mut self, period: Duration) -> Self { self.check_period = period; self } + /// Builds an instance of the `DynamicRouteProvider`. + pub async fn build(self) -> DynamicRouteProvider + where + S: RoutingSnapshot + 'static, + { + let route_provider = DynamicRouteProvider { + fetcher: self.fetcher, + fetch_period: self.fetch_period, + fetch_retry_interval: self.fetch_retry_interval, + checker: self.checker, + check_period: self.check_period, + routing_snapshot: self.routing_snapshot, + tracker: TaskTracker::new(), + seeds: self.seeds, + token: CancellationToken::new(), + }; + + if let Err(err) = route_provider.run().await { + error!("{DYNAMIC_ROUTE_PROVIDER}: started in unhealthy state: {err:?}"); + } + + route_provider + } +} + +impl RouteProvider for DynamicRouteProvider +where + S: RoutingSnapshot + 'static, +{ + fn route(&self) -> Result { + let snapshot = self.routing_snapshot.load(); + let node = snapshot.next().ok_or_else(|| { + AgentError::RouteProviderError("No healthy API nodes found.".to_string()) + })?; + Ok(node.to_routing_url()) + } +} + +impl DynamicRouteProvider +where + S: RoutingSnapshot + 'static, +{ /// Starts two background tasks: /// - Task1: NodesFetchActor /// - Periodically fetches existing API nodes (gets latest nodes topology) and sends discovered nodes to HealthManagerActor. @@ -202,16 +238,26 @@ where ); (found_healthy_seeds).then_some(()).ok_or(anyhow!( - "No healthy seeds found, they may become healthy later ..." + "No healthy seeds found within {TIMEOUT_AWAIT_HEALTHY_SEED:?}, they may become healthy later ..." )) } +} - /// Kill all running tasks. - pub async fn stop(&self) { +// Gracefully stop the inner spawned tasks running in the background. +impl Drop for DynamicRouteProvider { + fn drop(&mut self) { self.token.cancel(); self.tracker.close(); - self.tracker.wait().await; - warn!("{DYNAMIC_ROUTE_PROVIDER}: gracefully stopped"); + let tracker = self.tracker.clone(); + // If no runtime is available do nothing. + if let Ok(handle) = Handle::try_current() { + let _ = handle.spawn(async move { + tracker.wait().await; + warn!("{DYNAMIC_ROUTE_PROVIDER}: stopped gracefully"); + }); + } else { + error!("{DYNAMIC_ROUTE_PROVIDER}: no runtime available, cannot stop the spawned tasks"); + } } } @@ -228,7 +274,7 @@ mod tests { use crate::{ agent::http_transport::{ dynamic_routing::{ - dynamic_route_provider::{DynamicRouteProvider, IC0_SEED_DOMAIN}, + dynamic_route_provider::{DynamicRouteProviderBuilder, IC0_SEED_DOMAIN}, node::Node, snapshot::round_robin_routing::RoundRobinRoutingSnapshot, test_utils::{ @@ -266,14 +312,15 @@ mod tests { // Configure RouteProvider let snapshot = RoundRobinRoutingSnapshot::new(); let client = Client::builder().build().unwrap(); - let route_provider = Arc::new( - DynamicRouteProvider::new(snapshot, vec![node_1.clone()], client) + let route_provider = + DynamicRouteProviderBuilder::new(snapshot, vec![node_1.clone()], client) .with_fetcher(fetcher.clone()) .with_checker(checker.clone()) .with_fetch_period(fetch_interval) - .with_check_period(check_interval), - ); - route_provider.run().await.expect("no healthy seeds found"); + .with_check_period(check_interval) + .build() + .await; + let route_provider = Arc::new(route_provider); // This time span is required for the snapshot to be fully updated with the new nodes and their health info. let snapshot_update_duration = fetch_interval + 2 * check_interval; @@ -341,9 +388,6 @@ mod tests { tokio::time::sleep(snapshot_update_duration).await; let routed_domains = route_n_times(3, Arc::clone(&route_provider)); assert_routed_domains(routed_domains, vec![node_2.domain()], 3); - - // Teardown. - route_provider.stop().await; } #[tokio::test] @@ -364,19 +408,18 @@ mod tests { // Configure RouteProvider let snapshot = RoundRobinRoutingSnapshot::new(); let client = Client::builder().build().unwrap(); - let route_provider = Arc::new( - DynamicRouteProvider::new(snapshot, vec![node_1.clone(), node_2.clone()], client) - .with_fetcher(fetcher.clone()) - .with_checker(checker.clone()) - .with_fetch_period(fetch_interval) - .with_check_period(check_interval), - ); - assert!(route_provider - .run() - .await - .unwrap_err() - .to_string() - .contains("No healthy seeds found, they may become healthy later ...")); + let route_provider = DynamicRouteProviderBuilder::new( + snapshot, + vec![node_1.clone(), node_2.clone()], + client, + ) + .with_fetcher(fetcher) + .with_checker(checker.clone()) + .with_fetch_period(fetch_interval) + .with_check_period(check_interval) + .build() + .await; + let route_provider = Arc::new(route_provider); // Test 1: calls to route() return an error, as no healthy seeds exist. for _ in 0..4 { @@ -393,9 +436,6 @@ mod tests { tokio::time::sleep(3 * check_interval).await; let routed_domains = route_n_times(6, Arc::clone(&route_provider)); assert_routed_domains(routed_domains, vec![node_1.domain(), node_2.domain()], 3); - - // Teardown. - route_provider.stop().await; } #[tokio::test] @@ -415,14 +455,15 @@ mod tests { // Configure RouteProvider let snapshot = RoundRobinRoutingSnapshot::new(); let client = Client::builder().build().unwrap(); - let route_provider = Arc::new( - DynamicRouteProvider::new(snapshot, vec![node_1.clone()], client) - .with_fetcher(fetcher.clone()) + let route_provider = + DynamicRouteProviderBuilder::new(snapshot, vec![node_1.clone()], client) + .with_fetcher(fetcher) .with_checker(checker.clone()) .with_fetch_period(fetch_interval) - .with_check_period(check_interval), - ); - route_provider.run().await.expect("no healthy seeds found"); + .with_check_period(check_interval) + .build() + .await; + let route_provider = Arc::new(route_provider); // Test 1: multiple route() calls return a single domain=ic0.app, as the seed is healthy. tokio::time::sleep(2 * check_interval).await; @@ -439,9 +480,6 @@ mod tests { AgentError::RouteProviderError("No healthy API nodes found.".to_string()) ); } - - // Teardown. - route_provider.stop().await; } #[tokio::test] @@ -461,19 +499,14 @@ mod tests { // Configure RouteProvider let snapshot = RoundRobinRoutingSnapshot::new(); let client = Client::builder().build().unwrap(); - let route_provider = Arc::new( - DynamicRouteProvider::new(snapshot, vec![node_1.clone()], client) - .with_fetcher(fetcher.clone()) - .with_checker(checker.clone()) + let route_provider = + DynamicRouteProviderBuilder::new(snapshot, vec![node_1.clone()], client) + .with_fetcher(fetcher) + .with_checker(checker) .with_fetch_period(fetch_interval) - .with_check_period(check_interval), - ); - assert!(route_provider - .run() - .await - .unwrap_err() - .to_string() - .contains("No healthy seeds found, they may become healthy later ...")); + .with_check_period(check_interval) + .build() + .await; // Test: calls to route() return an error, as no healthy seeds exist. for _ in 0..4 { @@ -484,9 +517,6 @@ mod tests { AgentError::RouteProviderError("No healthy API nodes found.".to_string()) ); } - - // Teardown. - route_provider.stop().await; } #[tokio::test] @@ -507,14 +537,18 @@ mod tests { // Configure RouteProvider let snapshot = RoundRobinRoutingSnapshot::new(); let client = Client::builder().build().unwrap(); - let route_provider = Arc::new( - DynamicRouteProvider::new(snapshot, vec![node_1.clone(), node_2.clone()], client) - .with_fetcher(fetcher.clone()) - .with_checker(checker.clone()) - .with_fetch_period(fetch_interval) - .with_check_period(check_interval), - ); - route_provider.run().await.expect("no healthy seeds found"); + let route_provider = DynamicRouteProviderBuilder::new( + snapshot, + vec![node_1.clone(), node_2.clone()], + client, + ) + .with_fetcher(fetcher) + .with_checker(checker.clone()) + .with_fetch_period(fetch_interval) + .with_check_period(check_interval) + .build() + .await; + let route_provider = Arc::new(route_provider); // Test 1: calls to route() return only a healthy seed ic0.app. let routed_domains = route_n_times(3, Arc::clone(&route_provider)); @@ -525,9 +559,6 @@ mod tests { tokio::time::sleep(2 * check_interval).await; let routed_domains = route_n_times(6, Arc::clone(&route_provider)); assert_routed_domains(routed_domains, vec![node_1.domain(), node_2.domain()], 3); - - // Teardown. - route_provider.stop().await; } #[tokio::test] @@ -548,14 +579,15 @@ mod tests { // Configure RouteProvider let snapshot = RoundRobinRoutingSnapshot::new(); let client = Client::builder().build().unwrap(); - let route_provider = Arc::new( - DynamicRouteProvider::new(snapshot, vec![node_1.clone()], client) + let route_provider = + DynamicRouteProviderBuilder::new(snapshot, vec![node_1.clone()], client) .with_fetcher(fetcher.clone()) .with_checker(checker.clone()) .with_fetch_period(fetch_interval) - .with_check_period(check_interval), - ); - route_provider.run().await.expect("no healthy seeds found"); + .with_check_period(check_interval) + .build() + .await; + let route_provider = Arc::new(route_provider); // This time span is required for the snapshot to be fully updated with the new nodes topology and health info. let snapshot_update_duration = fetch_interval + 2 * check_interval; @@ -579,8 +611,5 @@ mod tests { vec![node_1.domain(), node_2.domain(), node_3.domain()], 2, ); - - // Teardown. - route_provider.stop().await; } } From ea4c8e5d9e908d45147689e90df2604d30742c61 Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Tue, 16 Jul 2024 10:15:50 +0200 Subject: [PATCH 07/24] chore: remove fetch_root_key() call --- .../src/agent/http_transport/dynamic_routing/nodes_fetch.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs index 9f81a401..d9516bb5 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs @@ -55,10 +55,6 @@ impl Fetch for NodesFetcher { .with_transport(transport) .build() .with_context(|| "Failed to build an agent: {err}")?; - agent - .fetch_root_key() - .await - .with_context(|| "Failed to fetch root key: {err}")?; let api_bns = agent .fetch_api_boundary_nodes_by_subnet_id(self.subnet_id) .await?; From f79efc3a2f9af7ca206876e6da4bd2218f14fd56 Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Tue, 16 Jul 2024 10:22:37 +0200 Subject: [PATCH 08/24] fix: clippy --- .../http_transport/dynamic_routing/dynamic_route_provider.rs | 2 +- .../dynamic_routing/snapshot/latency_based_routing.rs | 2 +- ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs index ceb87e08..2e374ecb 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs @@ -251,7 +251,7 @@ impl Drop for DynamicRouteProvider { let tracker = self.tracker.clone(); // If no runtime is available do nothing. if let Ok(handle) = Handle::try_current() { - let _ = handle.spawn(async move { + handle.spawn(async move { tracker.wait().await; warn!("{DYNAMIC_ROUTE_PROVIDER}: stopped gracefully"); }); diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs index cec2580c..c5c33cfd 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs @@ -99,7 +99,7 @@ impl RoutingSnapshot for LatencyRoutingSnapshot { .cloned() .collect(); let has_added_nodes = !nodes_added.is_empty(); - self.existing_nodes.extend(nodes_added.into_iter()); + self.existing_nodes.extend(nodes_added); // NOTE: newly added nodes will appear in the weighted_nodes later. // This happens after the first node health check round and a consequent update_node() invocation. for node in nodes_removed.into_iter() { diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs b/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs index 8b411ed2..0fe5f14e 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs @@ -116,6 +116,6 @@ impl NodeHealthCheckerMock { pub fn overwrite_healthy_nodes(&self, healthy_nodes: Vec) { self.healthy_nodes - .store(Arc::new(HashSet::from_iter(healthy_nodes.into_iter()))); + .store(Arc::new(HashSet::from_iter(healthy_nodes))); } } From a2d47993b52983e951a93a59b3b238d1518f427e Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Tue, 16 Jul 2024 13:29:26 +0200 Subject: [PATCH 09/24] chore: add mainnet test --- .../dynamic_routing/dynamic_route_provider.rs | 91 ++++++++++++++++++- .../dynamic_routing/test_utils.rs | 2 +- 2 files changed, 88 insertions(+), 5 deletions(-) diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs index 2e374ecb..669c87f5 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs @@ -263,10 +263,11 @@ impl Drop for DynamicRouteProvider { #[cfg(test)] mod tests { + use candid::Principal; use reqwest::Client; use std::{ sync::{Arc, Once}, - time::Duration, + time::{Duration, Instant}, }; use tracing::Level; use tracing_subscriber::FmtSubscriber; @@ -274,16 +275,22 @@ mod tests { use crate::{ agent::http_transport::{ dynamic_routing::{ - dynamic_route_provider::{DynamicRouteProviderBuilder, IC0_SEED_DOMAIN}, + dynamic_route_provider::{ + DynamicRouteProviderBuilder, IC0_SEED_DOMAIN, MAINNET_ROOT_SUBNET_ID, + }, node::Node, - snapshot::round_robin_routing::RoundRobinRoutingSnapshot, + snapshot::{ + latency_based_routing::LatencyRoutingSnapshot, + round_robin_routing::RoundRobinRoutingSnapshot, + }, test_utils::{ assert_routed_domains, route_n_times, NodeHealthCheckerMock, NodesFetcherMock, }, }, route_provider::RouteProvider, + ReqwestTransport, }, - AgentError, + Agent, AgentError, }; static TRACING_INIT: Once = Once::new(); @@ -294,6 +301,82 @@ mod tests { }); } + async fn assert_no_routing_via_domains( + route_provider: Arc, + excluded_domains: Vec<&str>, + timeout: Duration, + route_call_interval: Duration, + ) { + if excluded_domains.is_empty() { + panic!("List of excluded domains can't be empty"); + } + + let route_calls = 30; + let start = Instant::now(); + + while start.elapsed() < timeout { + let routed_domains = (0..route_calls) + .map(|_| { + route_provider.route().map(|url| { + let domain = url.domain().expect("no domain name in url"); + domain.to_string() + }) + }) + .collect::, _>>() + .unwrap_or_default(); + + // Exit when excluded domains are not used for routing any more. + if !routed_domains.is_empty() + && !routed_domains + .iter() + .any(|d| excluded_domains.contains(&d.as_str())) + { + return; + } + + tokio::time::sleep(route_call_interval).await; + } + panic!("Expected excluded domains {excluded_domains:?} are still observed in routing over the last {route_calls} calls"); + } + + #[tokio::test] + async fn test_mainnet() { + // Setup. + setup_tracing(); + let seed = Node::new(IC0_SEED_DOMAIN).unwrap(); + let client = Client::builder().build().unwrap(); + let route_provider = DynamicRouteProviderBuilder::new( + LatencyRoutingSnapshot::new(), + vec![seed], + client.clone(), + ) + .build() + .await; + let route_provider = Arc::new(route_provider) as Arc; + let transport = + ReqwestTransport::create_with_client_route(Arc::clone(&route_provider), client) + .expect("failed to create transport"); + let agent = Agent::builder() + .with_transport(transport) + .build() + .expect("failed to create an agent"); + let subnet_id = Principal::from_text(MAINNET_ROOT_SUBNET_ID).unwrap(); + // Assert that seed (ic0.app) is not used for routing. Henceforth, only discovered API nodes are used. + assert_no_routing_via_domains( + Arc::clone(&route_provider), + vec![IC0_SEED_DOMAIN], + Duration::from_secs(40), + Duration::from_secs(2), + ) + .await; + // Act: perform /read_state call via dynamically discovered API BNs. + let api_bns = agent + .fetch_api_boundary_nodes_by_subnet_id(subnet_id) + .await + .expect("failed to fetch api boundary nodes"); + assert!(!api_bns.is_empty()); + } + #[tokio::test] async fn test_routing_with_topology_and_node_health_updates() { // Setup. diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs b/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs index 0fe5f14e..3773c158 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs @@ -16,7 +16,7 @@ use crate::agent::http_transport::{ route_provider::RouteProvider, }; -pub fn route_n_times(n: usize, f: Arc) -> Vec { +pub fn route_n_times(n: usize, f: Arc) -> Vec { (0..n) .map(|_| f.route().unwrap().domain().unwrap().to_string()) .collect() From 627331ca0e774bde588ba1535ae727b28b2f0d11 Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Tue, 16 Jul 2024 17:34:21 +0200 Subject: [PATCH 10/24] chore: add with_discovery_transport() --- ic-agent/src/agent/builder.rs | 38 ++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/ic-agent/src/agent/builder.rs b/ic-agent/src/agent/builder.rs index 77c998d4..5ad2cc17 100644 --- a/ic-agent/src/agent/builder.rs +++ b/ic-agent/src/agent/builder.rs @@ -1,5 +1,18 @@ +use reqwest::Client; + use crate::{ - agent::{agent_config::AgentConfig, Agent, Transport}, + agent::{ + agent_config::AgentConfig, + http_transport::{ + dynamic_routing::{ + dynamic_route_provider::{DynamicRouteProviderBuilder, IC0_SEED_DOMAIN}, + node::Node, + snapshot::latency_based_routing::LatencyRoutingSnapshot, + }, + route_provider::RouteProvider, + }, + Agent, Transport, + }, AgentError, Identity, NonceFactory, NonceGenerator, }; use std::sync::Arc; @@ -16,6 +29,29 @@ impl AgentBuilder { Agent::new(self.config) } + /// Set the dynamic transport layer for the [`Agent`], performing continuos discovery of the API boundary nodes and routing traffic via them based on the latencies. + pub async fn with_discovery_transport(self, client: Client) -> Self { + use crate::agent::http_transport::ReqwestTransport; + + // TODO: This is a temporary solution to get the seed node. + let seed = Node::new(IC0_SEED_DOMAIN).unwrap(); + + let route_provider = DynamicRouteProviderBuilder::new( + LatencyRoutingSnapshot::new(), + vec![seed], + client.clone(), + ) + .build() + .await; + + let route_provider = Arc::new(route_provider) as Arc; + + let transport = ReqwestTransport::create_with_client_route(route_provider, client) + .expect("failed to create transport"); + + self.with_transport(transport) + } + /// Set the URL of the [Agent]. #[cfg(feature = "reqwest")] pub fn with_url>(self, url: S) -> Self { From 22fd2417da9ccedc152fe09a30957742fefbf988 Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Tue, 16 Jul 2024 17:50:37 +0200 Subject: [PATCH 11/24] chore: fix imports --- ic-agent/src/agent/builder.rs | 28 ++++++++++-------------- ic-agent/src/agent/http_transport/mod.rs | 4 +--- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/ic-agent/src/agent/builder.rs b/ic-agent/src/agent/builder.rs index 5ad2cc17..02a5d037 100644 --- a/ic-agent/src/agent/builder.rs +++ b/ic-agent/src/agent/builder.rs @@ -1,18 +1,5 @@ -use reqwest::Client; - use crate::{ - agent::{ - agent_config::AgentConfig, - http_transport::{ - dynamic_routing::{ - dynamic_route_provider::{DynamicRouteProviderBuilder, IC0_SEED_DOMAIN}, - node::Node, - snapshot::latency_based_routing::LatencyRoutingSnapshot, - }, - route_provider::RouteProvider, - }, - Agent, Transport, - }, + agent::{agent_config::AgentConfig, Agent, Transport}, AgentError, Identity, NonceFactory, NonceGenerator, }; use std::sync::Arc; @@ -29,9 +16,18 @@ impl AgentBuilder { Agent::new(self.config) } + #[cfg(all(feature = "reqwest", not(target_family = "wasm")))] /// Set the dynamic transport layer for the [`Agent`], performing continuos discovery of the API boundary nodes and routing traffic via them based on the latencies. - pub async fn with_discovery_transport(self, client: Client) -> Self { - use crate::agent::http_transport::ReqwestTransport; + pub async fn with_discovery_transport(self, client: reqwest::Client) -> Self { + use crate::agent::http_transport::{ + dynamic_routing::{ + dynamic_route_provider::{DynamicRouteProviderBuilder, IC0_SEED_DOMAIN}, + node::Node, + snapshot::latency_based_routing::LatencyRoutingSnapshot, + }, + route_provider::RouteProvider, + ReqwestTransport, + }; // TODO: This is a temporary solution to get the seed node. let seed = Node::new(IC0_SEED_DOMAIN).unwrap(); diff --git a/ic-agent/src/agent/http_transport/mod.rs b/ic-agent/src/agent/http_transport/mod.rs index 91959ac0..7ffdd622 100644 --- a/ic-agent/src/agent/http_transport/mod.rs +++ b/ic-agent/src/agent/http_transport/mod.rs @@ -30,8 +30,6 @@ const ICP0_SUB_DOMAIN: &str = ".icp0.io"; const ICP_API_SUB_DOMAIN: &str = ".icp-api.io"; #[allow(dead_code)] const LOCALHOST_SUB_DOMAIN: &str = ".localhost"; -/// -#[cfg(not(target_family = "wasm"))] -#[cfg(feature = "reqwest")] +#[cfg(all(feature = "reqwest", not(target_family = "wasm")))] pub mod dynamic_routing; pub mod route_provider; From e7b639574279bbd07ae6d5048e29fca2c5fd4bef Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Wed, 17 Jul 2024 16:28:47 +0200 Subject: [PATCH 12/24] chore: remove error on run() --- .../dynamic_routing/dynamic_route_provider.rs | 46 ++++++++----------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs index 669c87f5..5604cb98 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs @@ -5,7 +5,6 @@ use std::{ time::{Duration, Instant}, }; -use anyhow::anyhow; use arc_swap::ArcSwap; use candid::Principal; use reqwest::Client; @@ -143,9 +142,7 @@ impl DynamicRouteProviderBuilder { token: CancellationToken::new(), }; - if let Err(err) = route_provider.run().await { - error!("{DYNAMIC_ROUTE_PROVIDER}: started in unhealthy state: {err:?}"); - } + route_provider.run().await; route_provider } @@ -175,8 +172,8 @@ where /// - Listens to the fetched nodes messages from the NodesFetchActor. /// - Starts/stops health check tasks (HealthCheckActors) based on the newly added/removed nodes. /// - These spawned health check tasks periodically update the snapshot with the latest node health info. - pub async fn run(&self) -> anyhow::Result<()> { - info!("{DYNAMIC_ROUTE_PROVIDER}: start run() "); + pub async fn run(&self) { + info!("{DYNAMIC_ROUTE_PROVIDER}: started ..."); // Communication channel between NodesFetchActor and HealthManagerActor. let (fetch_sender, fetch_receiver) = watch::channel(None); @@ -196,7 +193,6 @@ where .spawn(async move { health_manager_actor.run().await }); // Dispatch all seed nodes for initial health checks - let start = Instant::now(); if let Err(err) = fetch_sender.send(Some(FetchedNodes { nodes: self.seeds.clone(), })) { @@ -204,23 +200,17 @@ where } // Try await for healthy seeds. - let found_healthy_seeds = - match timeout(TIMEOUT_AWAIT_HEALTHY_SEED, init_receiver.recv()).await { - Ok(_) => { - info!( - "{DYNAMIC_ROUTE_PROVIDER}: found healthy seeds within {:?}", - start.elapsed() - ); - true - } - Err(_) => { - warn!( - "{DYNAMIC_ROUTE_PROVIDER}: no healthy seeds found within {:?}", - start.elapsed() - ); - false - } - }; + let start = Instant::now(); + match timeout(TIMEOUT_AWAIT_HEALTHY_SEED, init_receiver.recv()).await { + Ok(_) => info!( + "{DYNAMIC_ROUTE_PROVIDER}: found healthy seeds within {:?}", + start.elapsed() + ), + Err(_) => warn!( + "{DYNAMIC_ROUTE_PROVIDER}: no healthy seeds found within {:?}", + start.elapsed() + ), + }; // We can close the channel now. init_receiver.close(); @@ -236,10 +226,6 @@ where info!( "{DYNAMIC_ROUTE_PROVIDER}: NodesFetchActor and HealthManagerActor started successfully" ); - - (found_healthy_seeds).then_some(()).ok_or(anyhow!( - "No healthy seeds found within {TIMEOUT_AWAIT_HEALTHY_SEED:?}, they may become healthy later ..." - )) } } @@ -696,3 +682,7 @@ mod tests { ); } } + +// - none of the seeds [] are healthy +// - none of the API node [] is healthy +// - return a vector of errors: HealthCheckErrors, FetchErrors, etc. From a493bebb969f4196acaa21612a69fdb40a18bb15 Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Wed, 17 Jul 2024 16:47:53 +0200 Subject: [PATCH 13/24] chore: remove errors in RoutingSnapshot --- .../dynamic_routing/health_check.rs | 7 +-- .../snapshot/latency_based_routing.rs | 46 ++++++++----------- .../snapshot/round_robin_routing.rs | 43 +++++++---------- .../snapshot/routing_snapshot.rs | 14 +++--- 4 files changed, 44 insertions(+), 66 deletions(-) diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs b/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs index befbd99e..1c0d0b39 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs @@ -245,10 +245,7 @@ where async fn handle_health_update(&mut self, msg: NodeHealthState) { let current_snapshot = self.routing_snapshot.load_full(); let mut new_snapshot = (*current_snapshot).clone(); - if let Err(err) = new_snapshot.update_node(&msg.node, msg.health.clone()) { - error!("{HEALTH_MANAGER_ACTOR}: failed to update snapshot: {err:?}"); - return; - } + new_snapshot.update_node(&msg.node, msg.health.clone()); self.routing_snapshot.store(Arc::new(new_snapshot)); if !self.is_initialized && msg.health.is_healthy() { self.is_initialized = true; @@ -269,7 +266,7 @@ where let current_snapshot = self.routing_snapshot.load_full(); let mut new_snapshot = (*current_snapshot).clone(); // If the snapshot has changed, store it and restart all node's health checks. - if let Ok(true) = new_snapshot.sync_nodes(&nodes) { + if new_snapshot.sync_nodes(&nodes) { self.routing_snapshot.store(Arc::new(new_snapshot)); self.stop_all_checks().await; self.start_checks(nodes.to_vec()); diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs index c5c33cfd..1ae10136 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs @@ -84,7 +84,7 @@ impl RoutingSnapshot for LatencyRoutingSnapshot { idx.map(|idx| self.weighted_nodes[idx].node.clone()) } - fn sync_nodes(&mut self, nodes: &[Node]) -> anyhow::Result { + fn sync_nodes(&mut self, nodes: &[Node]) -> bool { let new_nodes = HashSet::from_iter(nodes.iter().cloned()); // Find nodes removed from topology. let nodes_removed: Vec<_> = self @@ -107,12 +107,13 @@ impl RoutingSnapshot for LatencyRoutingSnapshot { let idx = self.weighted_nodes.iter().position(|x| x.node == node); idx.map(|idx| self.weighted_nodes.swap_remove(idx)); } - Ok(has_added_nodes || has_removed_nodes) + + has_added_nodes || has_removed_nodes } - fn update_node(&mut self, node: &Node, health: HealthCheckStatus) -> anyhow::Result { + fn update_node(&mut self, node: &Node, health: HealthCheckStatus) -> bool { if !self.existing_nodes.contains(node) { - return Ok(false); + return false; } // If latency is None (meaning Node is unhealthy), we assign some big value @@ -135,7 +136,8 @@ impl RoutingSnapshot for LatencyRoutingSnapshot { weight, }) } - Ok(true) + + true } } @@ -174,9 +176,7 @@ mod tests { let node = Node::new("api1.com").unwrap(); let health = HealthCheckStatus::new(Some(Duration::from_secs(1))); // Act - let is_updated = snapshot - .update_node(&node, health) - .expect("node update failed"); + let is_updated = snapshot.update_node(&node, health); // Assert assert!(!is_updated); assert!(snapshot.weighted_nodes.is_empty()); @@ -192,9 +192,7 @@ mod tests { let health = HealthCheckStatus::new(Some(Duration::from_secs(1))); snapshot.existing_nodes.insert(node.clone()); // Check first update - let is_updated = snapshot - .update_node(&node, health) - .expect("node update failed"); + let is_updated = snapshot.update_node(&node, health); assert!(is_updated); assert!(snapshot.has_nodes()); let weighted_node = snapshot.weighted_nodes.first().unwrap(); @@ -206,9 +204,7 @@ mod tests { assert_eq!(snapshot.next().unwrap(), node); // Check second update let health = HealthCheckStatus::new(Some(Duration::from_secs(2))); - let is_updated = snapshot - .update_node(&node, health) - .expect("node update failed"); + let is_updated = snapshot.update_node(&node, health); assert!(is_updated); let weighted_node = snapshot.weighted_nodes.first().unwrap(); assert_eq!( @@ -218,9 +214,7 @@ mod tests { assert_eq!(weighted_node.weight, 1.0 / 1.5); // Check third update let health = HealthCheckStatus::new(Some(Duration::from_secs(3))); - let is_updated = snapshot - .update_node(&node, health) - .expect("node update failed"); + let is_updated = snapshot.update_node(&node, health); assert!(is_updated); let weighted_node = snapshot.weighted_nodes.first().unwrap(); assert_eq!( @@ -230,9 +224,7 @@ mod tests { assert_eq!(weighted_node.weight, 0.5); // Check forth update with none let health = HealthCheckStatus::new(None); - let is_updated = snapshot - .update_node(&node, health) - .expect("node update failed"); + let is_updated = snapshot.update_node(&node, health); assert!(is_updated); let weighted_node = snapshot.weighted_nodes.first().unwrap(); let avg_latency = Duration::from_secs_f64((MAX_LATENCY.as_secs() as f64 + 6.0) / 4.0); @@ -249,7 +241,7 @@ mod tests { let mut snapshot = LatencyRoutingSnapshot::new(); let node_1 = Node::new("api1.com").unwrap(); // Sync with node_1 - let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]).unwrap(); + let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]); assert!(nodes_changed); assert!(snapshot.weighted_nodes.is_empty()); assert_eq!( @@ -263,7 +255,7 @@ mod tests { weight: 0.0, }); // Sync with node_1 again - let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]).unwrap(); + let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]); assert!(!nodes_changed); assert_eq!( snapshot.existing_nodes, @@ -272,7 +264,7 @@ mod tests { assert_eq!(snapshot.weighted_nodes[0].node, node_1); // Sync with node_2 let node_2 = Node::new("api2.com").unwrap(); - let nodes_changed = snapshot.sync_nodes(&[node_2.clone()]).unwrap(); + let nodes_changed = snapshot.sync_nodes(&[node_2.clone()]); assert!(nodes_changed); assert_eq!( snapshot.existing_nodes, @@ -288,9 +280,7 @@ mod tests { }); // Sync with [node_2, node_3] let node_3 = Node::new("api3.com").unwrap(); - let nodes_changed = snapshot - .sync_nodes(&[node_3.clone(), node_2.clone()]) - .unwrap(); + let nodes_changed = snapshot.sync_nodes(&[node_3.clone(), node_2.clone()]); assert!(nodes_changed); assert_eq!( snapshot.existing_nodes, @@ -304,13 +294,13 @@ mod tests { weight: 0.0, }); // Sync with [] - let nodes_changed = snapshot.sync_nodes(&[]).unwrap(); + let nodes_changed = snapshot.sync_nodes(&[]); assert!(nodes_changed); assert!(snapshot.existing_nodes.is_empty()); // Make sure all nodes were removed from the healthy_nodes assert!(snapshot.weighted_nodes.is_empty()); // Sync with [] again - let nodes_changed = snapshot.sync_nodes(&[]).unwrap(); + let nodes_changed = snapshot.sync_nodes(&[]); assert!(!nodes_changed); assert!(snapshot.existing_nodes.is_empty()); } diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs index e468d262..149e49d2 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs @@ -45,7 +45,7 @@ impl RoutingSnapshot for RoundRobinRoutingSnapshot { .cloned() } - fn sync_nodes(&mut self, nodes: &[Node]) -> anyhow::Result { + fn sync_nodes(&mut self, nodes: &[Node]) -> bool { let new_nodes = HashSet::from_iter(nodes.iter().cloned()); // Find nodes removed from topology. let nodes_removed: Vec<_> = self @@ -67,17 +67,18 @@ impl RoutingSnapshot for RoundRobinRoutingSnapshot { self.existing_nodes.remove(node); self.healthy_nodes.remove(node); }); - Ok(has_added_nodes || has_removed_nodes) + + has_added_nodes || has_removed_nodes } - fn update_node(&mut self, node: &Node, health: HealthCheckStatus) -> anyhow::Result { + fn update_node(&mut self, node: &Node, health: HealthCheckStatus) -> bool { if !self.existing_nodes.contains(node) { - return Ok(false); + return false; } if health.is_healthy() { - Ok(self.healthy_nodes.insert(node.clone())) + self.healthy_nodes.insert(node.clone()) } else { - Ok(self.healthy_nodes.remove(node)) + self.healthy_nodes.remove(node) } } } @@ -116,17 +117,13 @@ mod tests { let healthy = HealthCheckStatus::new(Some(Duration::from_secs(1))); let unhealthy = HealthCheckStatus::new(None); // Act 1 - let is_updated = snapshot - .update_node(&node, healthy) - .expect("node update failed"); + let is_updated = snapshot.update_node(&node, healthy); // Assert assert!(!is_updated); assert!(snapshot.existing_nodes.is_empty()); assert!(snapshot.next().is_none()); // Act 2 - let is_updated = snapshot - .update_node(&node, unhealthy) - .expect("node update failed"); + let is_updated = snapshot.update_node(&node, unhealthy); // Assert assert!(!is_updated); assert!(snapshot.existing_nodes.is_empty()); @@ -142,9 +139,7 @@ mod tests { snapshot.existing_nodes.insert(node.clone()); let health = HealthCheckStatus::new(Some(Duration::from_secs(1))); // Act - let is_updated = snapshot - .update_node(&node, health) - .expect("node update failed"); + let is_updated = snapshot.update_node(&node, health); assert!(is_updated); assert!(snapshot.has_nodes()); assert_eq!(snapshot.next().unwrap(), node); @@ -160,9 +155,7 @@ mod tests { snapshot.healthy_nodes.insert(node.clone()); let unhealthy = HealthCheckStatus::new(None); // Act - let is_updated = snapshot - .update_node(&node, unhealthy) - .expect("node update failed"); + let is_updated = snapshot.update_node(&node, unhealthy); assert!(is_updated); assert!(!snapshot.has_nodes()); assert!(snapshot.next().is_none()); @@ -174,7 +167,7 @@ mod tests { let mut snapshot = RoundRobinRoutingSnapshot::new(); let node_1 = Node::new("api1.com").unwrap(); // Sync with node_1 - let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]).unwrap(); + let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]); assert!(nodes_changed); assert!(snapshot.healthy_nodes.is_empty()); assert_eq!( @@ -184,7 +177,7 @@ mod tests { // Add node_1 to healthy_nodes manually snapshot.healthy_nodes.insert(node_1.clone()); // Sync with node_1 again - let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]).unwrap(); + let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]); assert!(!nodes_changed); assert_eq!( snapshot.existing_nodes, @@ -193,7 +186,7 @@ mod tests { assert_eq!(snapshot.healthy_nodes, HashSet::from_iter(vec![node_1])); // Sync with node_2 let node_2 = Node::new("api2.com").unwrap(); - let nodes_changed = snapshot.sync_nodes(&[node_2.clone()]).unwrap(); + let nodes_changed = snapshot.sync_nodes(&[node_2.clone()]); assert!(nodes_changed); assert_eq!( snapshot.existing_nodes, @@ -205,9 +198,7 @@ mod tests { snapshot.healthy_nodes.insert(node_2.clone()); // Sync with [node_2, node_3] let node_3 = Node::new("api3.com").unwrap(); - let nodes_changed = snapshot - .sync_nodes(&[node_3.clone(), node_2.clone()]) - .unwrap(); + let nodes_changed = snapshot.sync_nodes(&[node_3.clone(), node_2.clone()]); assert!(nodes_changed); assert_eq!( snapshot.existing_nodes, @@ -216,13 +207,13 @@ mod tests { assert_eq!(snapshot.healthy_nodes, HashSet::from_iter(vec![node_2])); snapshot.healthy_nodes.insert(node_3); // Sync with [] - let nodes_changed = snapshot.sync_nodes(&[]).unwrap(); + let nodes_changed = snapshot.sync_nodes(&[]); assert!(nodes_changed); assert!(snapshot.existing_nodes.is_empty()); // Make sure all nodes were removed from the healthy_nodes assert!(snapshot.healthy_nodes.is_empty()); // Sync with [] again - let nodes_changed = snapshot.sync_nodes(&[]).unwrap(); + let nodes_changed = snapshot.sync_nodes(&[]); assert!(!nodes_changed); assert!(snapshot.existing_nodes.is_empty()); } diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs index ef88b8df..155b8eac 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs @@ -2,14 +2,14 @@ use std::fmt::Debug; use crate::agent::http_transport::dynamic_routing::{health_check::HealthCheckStatus, node::Node}; -/// +/// A trait for interacting with the snapshot of nodes (routing table). pub trait RoutingSnapshot: Send + Sync + Clone + Debug { - /// + /// Returns `true` if the snapshot has nodes. fn has_nodes(&self) -> bool; - /// + /// Get the next node in the snapshot. fn next(&self) -> Option; - /// - fn sync_nodes(&mut self, nodes: &[Node]) -> anyhow::Result; - /// - fn update_node(&mut self, node: &Node, health: HealthCheckStatus) -> anyhow::Result; + /// Syncs the nodes in the snapshot with the provided list of nodes, returning `true` if the snapshot was updated. + fn sync_nodes(&mut self, nodes: &[Node]) -> bool; + /// Updates the health status of a specific node, returning `true` if the node was found and updated. + fn update_node(&mut self, node: &Node, health: HealthCheckStatus) -> bool; } From df7b557f4b9a70190e06e5fa131a46e278941c05 Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Wed, 17 Jul 2024 17:28:13 +0200 Subject: [PATCH 14/24] chore: remove anyhow --- ic-agent/Cargo.toml | 1 - .../dynamic_routing/dynamic_route_provider.rs | 15 +++++++++ .../dynamic_routing/health_check.rs | 16 +++++---- .../http_transport/dynamic_routing/node.rs | 11 ++++--- .../dynamic_routing/nodes_fetch.rs | 33 ++++++++++++++----- .../dynamic_routing/test_utils.rs | 5 +-- 6 files changed, 59 insertions(+), 22 deletions(-) diff --git a/ic-agent/Cargo.toml b/ic-agent/Cargo.toml index b100e009..beebef4e 100644 --- a/ic-agent/Cargo.toml +++ b/ic-agent/Cargo.toml @@ -79,7 +79,6 @@ tower = { version = "0.4.13", optional = true } async-trait = "0.1.80" tracing = "0.1.40" arc-swap = "1.7.1" -anyhow = "1.0.86" simple_moving_average = "1.0.2" tracing-subscriber = "0.3.18" tokio-util = { version = "0.7.11", features = ["full"] } diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs index 5604cb98..820ea9e5 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs @@ -8,6 +8,7 @@ use std::{ use arc_swap::ArcSwap; use candid::Principal; use reqwest::Client; +use thiserror::Error; use tokio::{ runtime::Handle, sync::{mpsc, watch}, @@ -71,6 +72,20 @@ pub struct DynamicRouteProvider { token: CancellationToken, } +/// An error that occurred when the DynamicRouteProvider service was running. +#[derive(Error, Debug)] +pub enum DynamicRouteProviderError { + /// An error when fetching topology of the API nodes. + #[error("An error when fetching API nodes: {0}")] + NodesFetchError(String), + /// An error when checking API node's health. + #[error("An error when checking API node's health: {0}")] + HealthCheckError(String), + /// An invalid domain name provided. + #[error("Provided domain name is invalid: {0}")] + InvalidDomainName(String), +} + /// A builder for the `DynamicRouteProvider`. pub struct DynamicRouteProviderBuilder { fetcher: Arc, diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs b/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs index 1c0d0b39..d45a480e 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs @@ -1,4 +1,3 @@ -use anyhow::bail; use async_trait::async_trait; use http::{Method, StatusCode}; use reqwest::{Client, Request}; @@ -13,6 +12,7 @@ use tracing::{debug, error, info, warn}; use url::Url; use crate::agent::http_transport::dynamic_routing::{ + dynamic_route_provider::DynamicRouteProviderError, messages::{FetchedNodes, NodeHealthState}, node::Node, snapshot::routing_snapshot::RoutingSnapshot, @@ -25,7 +25,7 @@ const CHANNEL_BUFFER: usize = 128; #[async_trait] pub trait HealthCheck: Send + Sync + Debug { /// Checks the health of the node. - async fn check(&self, node: &Node) -> anyhow::Result; + async fn check(&self, node: &Node) -> Result; } /// A struct representing the health check status of the node. @@ -72,15 +72,19 @@ const HEALTH_CHECKER: &str = "HealthChecker"; #[async_trait] impl HealthCheck for HealthChecker { - async fn check(&self, node: &Node) -> anyhow::Result { + async fn check(&self, node: &Node) -> Result { // API boundary node exposes /health endpoint and should respond with 204 (No Content) if it's healthy. - let url = Url::parse(&format!("https://{}/health", node.domain()))?; + let url = Url::parse(&format!("https://{}/health", node.domain())).unwrap(); let mut request = Request::new(Method::GET, url.clone()); *request.timeout_mut() = Some(self.timeout); let start = Instant::now(); - let response = self.http_client.execute(request).await?; + let response = self.http_client.execute(request).await.map_err(|err| { + DynamicRouteProviderError::HealthCheckError(format!( + "Failed to execute GET request to {url}: {err}" + )) + })?; let latency = start.elapsed(); if response.status() != StatusCode::NO_CONTENT { @@ -89,7 +93,7 @@ impl HealthCheck for HealthChecker { response.status() ); error!(err_msg); - bail!(err_msg); + return Err(DynamicRouteProviderError::HealthCheckError(err_msg)); } Ok(HealthCheckStatus::new(Some(latency))) diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/node.rs b/ic-agent/src/agent/http_transport/dynamic_routing/node.rs index 482dfe89..baf9d185 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/node.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/node.rs @@ -1,7 +1,8 @@ use url::Url; use crate::agent::ApiBoundaryNode; -use anyhow::anyhow; + +use super::dynamic_route_provider::DynamicRouteProviderError; /// Represents a node in the dynamic routing. #[derive(Debug, Clone, PartialEq, Eq, Hash)] @@ -11,9 +12,11 @@ pub struct Node { impl Node { /// Creates a new `Node` instance from the domain name. - pub fn new(domain: &str) -> anyhow::Result { + pub fn new(domain: &str) -> Result { if !is_valid_domain(domain) { - return Err(anyhow!("Invalid domain name {domain}")); + return Err(DynamicRouteProviderError::InvalidDomainName( + domain.to_string(), + )); } Ok(Self { domain: domain.to_string(), @@ -41,7 +44,7 @@ impl From<&Node> for Url { } impl TryFrom<&ApiBoundaryNode> for Node { - type Error = anyhow::Error; + type Error = DynamicRouteProviderError; fn try_from(value: &ApiBoundaryNode) -> Result { Node::new(&value.domain) diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs index d9516bb5..88fa372f 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs @@ -1,4 +1,3 @@ -use anyhow::Context; use async_trait::async_trait; use candid::Principal; use reqwest::Client; @@ -11,9 +10,12 @@ use url::Url; use crate::agent::{ http_transport::{ dynamic_routing::{ - health_check::HEALTH_MANAGER_ACTOR, messages::FetchedNodes, node::Node, - snapshot::routing_snapshot::RoutingSnapshot, type_aliases::AtomicSwap, - type_aliases::SenderWatch, + dynamic_route_provider::DynamicRouteProviderError, + health_check::HEALTH_MANAGER_ACTOR, + messages::FetchedNodes, + node::Node, + snapshot::routing_snapshot::RoutingSnapshot, + type_aliases::{AtomicSwap, SenderWatch}, }, reqwest_transport::ReqwestTransport, }, @@ -26,7 +28,7 @@ const NODES_FETCH_ACTOR: &str = "NodesFetchActor"; #[async_trait] pub trait Fetch: Sync + Send + Debug { /// Fetches the nodes from the topology. - async fn fetch(&self, url: Url) -> anyhow::Result>; + async fn fetch(&self, url: Url) -> Result, DynamicRouteProviderError>; } /// A struct representing the fetcher of the nodes from the topology. @@ -48,16 +50,29 @@ impl NodesFetcher { #[async_trait] impl Fetch for NodesFetcher { - async fn fetch(&self, url: Url) -> anyhow::Result> { + async fn fetch(&self, url: Url) -> Result, DynamicRouteProviderError> { let transport = ReqwestTransport::create_with_client(url, self.http_client.clone()) - .with_context(|| "Failed to build transport: {err}")?; + .map_err(|err| { + DynamicRouteProviderError::NodesFetchError(format!( + "Failed to build transport: {err}" + )) + })?; let agent = Agent::builder() .with_transport(transport) .build() - .with_context(|| "Failed to build an agent: {err}")?; + .map_err(|err| { + DynamicRouteProviderError::NodesFetchError(format!( + "Failed to build the agent: {err}" + )) + })?; let api_bns = agent .fetch_api_boundary_nodes_by_subnet_id(self.subnet_id) - .await?; + .await + .map_err(|err| { + DynamicRouteProviderError::NodesFetchError(format!( + "Failed to fetch API nodes: {err}" + )) + })?; // If some API BNs have invalid domain names, they are discarded. let nodes = api_bns .iter() diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs b/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs index 3773c158..92de4c78 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs @@ -8,6 +8,7 @@ use url::Url; use crate::agent::http_transport::{ dynamic_routing::{ + dynamic_route_provider::DynamicRouteProviderError, health_check::{HealthCheck, HealthCheckStatus}, node::Node, nodes_fetch::Fetch, @@ -60,7 +61,7 @@ pub struct NodesFetcherMock { #[async_trait] impl Fetch for NodesFetcherMock { - async fn fetch(&self, _url: Url) -> anyhow::Result> { + async fn fetch(&self, _url: Url) -> Result, DynamicRouteProviderError> { let nodes = (*self.nodes.load_full()).clone(); Ok(nodes) } @@ -97,7 +98,7 @@ impl Default for NodeHealthCheckerMock { #[async_trait] impl HealthCheck for NodeHealthCheckerMock { - async fn check(&self, node: &Node) -> anyhow::Result { + async fn check(&self, node: &Node) -> Result { let nodes = self.healthy_nodes.load_full(); let latency = match nodes.contains(node) { true => Some(Duration::from_secs(1)), From 38a8b45a37c8dd53c0fc3c102ac64742ae69550c Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Wed, 17 Jul 2024 19:43:09 +0200 Subject: [PATCH 15/24] chore: reduce public APIs --- .../http_transport/dynamic_routing/health_check.rs | 4 ++-- .../src/agent/http_transport/dynamic_routing/mod.rs | 6 +++--- .../agent/http_transport/dynamic_routing/node.rs | 9 +++++---- .../http_transport/dynamic_routing/nodes_fetch.rs | 2 +- .../http_transport/dynamic_routing/test_utils.rs | 13 ++++++++----- .../http_transport/dynamic_routing/type_aliases.rs | 10 +++++----- 6 files changed, 24 insertions(+), 20 deletions(-) diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs b/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs index d45a480e..491f010b 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs @@ -157,12 +157,12 @@ impl HealthCheckActor { } /// The name of the health manager actor. -pub const HEALTH_MANAGER_ACTOR: &str = "HealthManagerActor"; +pub(super) const HEALTH_MANAGER_ACTOR: &str = "HealthManagerActor"; /// A struct managing the health checks of the nodes. /// It receives the fetched nodes from the `NodesFetchActor` and starts the health checks for them. /// It also receives the health status of the nodes from the `HealthCheckActor/s` and updates the routing snapshot. -pub struct HealthManagerActor { +pub(super) struct HealthManagerActor { /// The health checker. checker: Arc, /// The period of the health check. diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs b/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs index 9b6b69de..07570f0f 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs @@ -3,7 +3,7 @@ pub mod dynamic_route_provider; /// Health check implementation. pub mod health_check; /// Messages used in dynamic routing. -pub mod messages; +pub(super) mod messages; /// Node implementation. pub mod node; /// Nodes fetch implementation. @@ -11,6 +11,6 @@ pub mod nodes_fetch; /// Routing snapshot implementation. pub mod snapshot; #[cfg(test)] -pub mod test_utils; +pub(super) mod test_utils; /// Type aliases used in dynamic routing. -pub mod type_aliases; +pub(super) mod type_aliases; diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/node.rs b/ic-agent/src/agent/http_transport/dynamic_routing/node.rs index baf9d185..dd87116c 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/node.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/node.rs @@ -1,8 +1,9 @@ use url::Url; -use crate::agent::ApiBoundaryNode; - -use super::dynamic_route_provider::DynamicRouteProviderError; +use crate::agent::{ + http_transport::dynamic_routing::dynamic_route_provider::DynamicRouteProviderError, + ApiBoundaryNode, +}; /// Represents a node in the dynamic routing. #[derive(Debug, Clone, PartialEq, Eq, Hash)] @@ -52,7 +53,7 @@ impl TryFrom<&ApiBoundaryNode> for Node { } /// Checks if the given domain is a valid URL. -pub fn is_valid_domain>(domain: S) -> bool { +fn is_valid_domain>(domain: S) -> bool { // Prepend scheme to make it a valid URL let url_string = format!("http://{}", domain.as_ref()); Url::parse(&url_string).is_ok() diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs index 88fa372f..8cbdb474 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs @@ -83,7 +83,7 @@ impl Fetch for NodesFetcher { } /// A struct representing the actor responsible for fetching existing nodes and communicating it with the listener. -pub struct NodesFetchActor { +pub(super) struct NodesFetchActor { /// The fetcher object responsible for fetching the nodes. fetcher: Arc, /// Time period between fetches. diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs b/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs index 92de4c78..60004d75 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs @@ -17,14 +17,17 @@ use crate::agent::http_transport::{ route_provider::RouteProvider, }; -pub fn route_n_times(n: usize, f: Arc) -> Vec { +pub(super) fn route_n_times(n: usize, f: Arc) -> Vec { (0..n) .map(|_| f.route().unwrap().domain().unwrap().to_string()) .collect() } -pub fn assert_routed_domains(actual: Vec, expected: Vec, expected_repetitions: usize) -where +pub(super) fn assert_routed_domains( + actual: Vec, + expected: Vec, + expected_repetitions: usize, +) where T: AsRef + Eq + Hash + Debug + Ord, { fn build_count_map(items: &[T]) -> HashMap<&T, usize> @@ -54,7 +57,7 @@ where } #[derive(Debug)] -pub struct NodesFetcherMock { +pub(super) struct NodesFetcherMock { // A set of nodes, existing in the topology. pub nodes: AtomicSwap>, } @@ -86,7 +89,7 @@ impl NodesFetcherMock { } #[derive(Debug)] -pub struct NodeHealthCheckerMock { +pub(super) struct NodeHealthCheckerMock { healthy_nodes: Arc>>, } diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs b/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs index 04f32b66..6be931fb 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs @@ -3,16 +3,16 @@ use std::sync::Arc; use tokio::sync::{mpsc, watch}; /// A type alias for the sender end of a watch channel. -pub type SenderWatch = watch::Sender>; +pub(super) type SenderWatch = watch::Sender>; /// A type alias for the receiver end of a watch channel. -pub type ReceiverWatch = watch::Receiver>; +pub(super) type ReceiverWatch = watch::Receiver>; /// A type alias for the sender end of a multi-producer, single-consumer channel. -pub type SenderMpsc = mpsc::Sender; +pub(super) type SenderMpsc = mpsc::Sender; /// A type alias for the receiver end of a multi-producer, single-consumer channel. -pub type ReceiverMpsc = mpsc::Receiver; +pub(super) type ReceiverMpsc = mpsc::Receiver; /// A type alias for an atomic swap operation on a shared value. -pub type AtomicSwap = Arc>; +pub(super) type AtomicSwap = Arc>; From 1c93d1c683b2207af9b62b4f517c036b563f3af0 Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Fri, 19 Jul 2024 12:29:36 +0200 Subject: [PATCH 16/24] chore: relax versions --- ic-agent/Cargo.toml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ic-agent/Cargo.toml b/ic-agent/Cargo.toml index beebef4e..fa0eb645 100644 --- a/ic-agent/Cargo.toml +++ b/ic-agent/Cargo.toml @@ -76,12 +76,12 @@ hyper-rustls = { version = "0.27", default-features = false, features = [ ], optional = true } tokio = { version = "1.24.2", features = ["full"] } tower = { version = "0.4.13", optional = true } -async-trait = "0.1.80" -tracing = "0.1.40" -arc-swap = "1.7.1" -simple_moving_average = "1.0.2" -tracing-subscriber = "0.3.18" -tokio-util = { version = "0.7.11", features = ["full"] } +async-trait = "^0.1.0" +tracing = "^0.1.0" +arc-swap = "^1.0.0" +simple_moving_average = "^1.0.0" +tracing-subscriber = "^0.2.0" +tokio-util = { version = "^0.7.0", features = ["full"] } rustls-webpki = "0.102" [target.'cfg(target_family = "wasm")'.dependencies] From 05a7baa2514c9d1e6adcc35a45a9236228ad606e Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Mon, 22 Jul 2024 16:14:44 +0200 Subject: [PATCH 17/24] chore: tmp use different name for bazel deps resolution --- ic-agent/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ic-agent/Cargo.toml b/ic-agent/Cargo.toml index fa0eb645..8cb649d1 100644 --- a/ic-agent/Cargo.toml +++ b/ic-agent/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "ic-agent" +name = "ic-agent-with-discovery" version.workspace = true authors.workspace = true edition.workspace = true From e0ba8597b50e295646187c5e0e09b82085b4674d Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Mon, 22 Jul 2024 16:48:03 +0200 Subject: [PATCH 18/24] Revert "chore: tmp use different name for bazel deps resolution" This reverts commit 05a7baa2514c9d1e6adcc35a45a9236228ad606e. --- ic-agent/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ic-agent/Cargo.toml b/ic-agent/Cargo.toml index 8cb649d1..fa0eb645 100644 --- a/ic-agent/Cargo.toml +++ b/ic-agent/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "ic-agent-with-discovery" +name = "ic-agent" version.workspace = true authors.workspace = true edition.workspace = true From fe39797feacadc9de64e6a8eee33a9355329c694 Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Tue, 23 Jul 2024 11:39:47 +0200 Subject: [PATCH 19/24] chore: tmp enable fetch root key --- ic-agent/Cargo.toml | 2 +- .../src/agent/http_transport/dynamic_routing/nodes_fetch.rs | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ic-agent/Cargo.toml b/ic-agent/Cargo.toml index fa0eb645..8cb649d1 100644 --- a/ic-agent/Cargo.toml +++ b/ic-agent/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "ic-agent" +name = "ic-agent-with-discovery" version.workspace = true authors.workspace = true edition.workspace = true diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs index 8cbdb474..1e97374c 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs @@ -65,6 +65,9 @@ impl Fetch for NodesFetcher { "Failed to build the agent: {err}" )) })?; + agent.fetch_root_key().await.map_err(|err| { + DynamicRouteProviderError::NodesFetchError(format!("Failed to fetch root key: {err}")) + })?; let api_bns = agent .fetch_api_boundary_nodes_by_subnet_id(self.subnet_id) .await From 684aa06e1386fcb5f157ad676c4481d8812a1dbf Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Tue, 23 Jul 2024 13:36:30 +0200 Subject: [PATCH 20/24] Revert "chore: tmp enable fetch root key" This reverts commit fe39797feacadc9de64e6a8eee33a9355329c694. --- ic-agent/Cargo.toml | 2 +- .../src/agent/http_transport/dynamic_routing/nodes_fetch.rs | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/ic-agent/Cargo.toml b/ic-agent/Cargo.toml index 8cb649d1..fa0eb645 100644 --- a/ic-agent/Cargo.toml +++ b/ic-agent/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "ic-agent-with-discovery" +name = "ic-agent" version.workspace = true authors.workspace = true edition.workspace = true diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs index 1e97374c..8cbdb474 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs @@ -65,9 +65,6 @@ impl Fetch for NodesFetcher { "Failed to build the agent: {err}" )) })?; - agent.fetch_root_key().await.map_err(|err| { - DynamicRouteProviderError::NodesFetchError(format!("Failed to fetch root key: {err}")) - })?; let api_bns = agent .fetch_api_boundary_nodes_by_subnet_id(self.subnet_id) .await From ad521f41db1b1e05ebce71d9df65db0a6885012b Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Tue, 23 Jul 2024 13:53:59 +0200 Subject: [PATCH 21/24] chore: use specific tokio/tokio-util features --- ic-agent/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ic-agent/Cargo.toml b/ic-agent/Cargo.toml index fa0eb645..8966c3d2 100644 --- a/ic-agent/Cargo.toml +++ b/ic-agent/Cargo.toml @@ -74,14 +74,14 @@ hyper-rustls = { version = "0.27", default-features = false, features = [ "http1", "http2", ], optional = true } -tokio = { version = "1.24.2", features = ["full"] } +tokio = { version = "1.24.2", features = ["macros", "time"] } tower = { version = "0.4.13", optional = true } async-trait = "^0.1.0" tracing = "^0.1.0" arc-swap = "^1.0.0" simple_moving_average = "^1.0.0" tracing-subscriber = "^0.2.0" -tokio-util = { version = "^0.7.0", features = ["full"] } +tokio-util = { version = "^0.7.0", features = ["rt"] } rustls-webpki = "0.102" [target.'cfg(target_family = "wasm")'.dependencies] From 7e9489a65aba3e6993d8b96f6ef09e34a50837c1 Mon Sep 17 00:00:00 2001 From: Adam Spofford <93943719+adamspofford-dfinity@users.noreply.github.com> Date: Thu, 25 Jul 2024 09:53:41 -0700 Subject: [PATCH 22/24] Release 0.37.1 (#576) --- CHANGELOG.md | 2 ++ Cargo.lock | 12 ++++++------ Cargo.toml | 8 ++++---- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f95901ec..09e8a970 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +## [0.37.1] - 2024-07-25 + * Bug fix: Add `api/v2` prefix to read_state requests for hyper transport ## [0.37.0] - 2024-07-23 diff --git a/Cargo.lock b/Cargo.lock index 7a31d329..c2e4fb46 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1083,7 +1083,7 @@ dependencies = [ [[package]] name = "ic-agent" -version = "0.37.0" +version = "0.37.1" dependencies = [ "async-lock", "backoff", @@ -1148,7 +1148,7 @@ dependencies = [ [[package]] name = "ic-identity-hsm" -version = "0.37.0" +version = "0.37.1" dependencies = [ "hex", "ic-agent", @@ -1160,7 +1160,7 @@ dependencies = [ [[package]] name = "ic-transport-types" -version = "0.37.0" +version = "0.37.1" dependencies = [ "candid", "hex", @@ -1176,7 +1176,7 @@ dependencies = [ [[package]] name = "ic-utils" -version = "0.37.0" +version = "0.37.1" dependencies = [ "async-trait", "candid", @@ -1239,7 +1239,7 @@ dependencies = [ [[package]] name = "icx" -version = "0.37.0" +version = "0.37.1" dependencies = [ "anyhow", "candid", @@ -1257,7 +1257,7 @@ dependencies = [ [[package]] name = "icx-cert" -version = "0.37.0" +version = "0.37.1" dependencies = [ "anyhow", "base64", diff --git a/Cargo.toml b/Cargo.toml index a435ed5e..535c20fa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,7 @@ members = [ ] [workspace.package] -version = "0.37.0" +version = "0.37.1" authors = ["DFINITY Stiftung "] edition = "2021" repository = "https://github.com/dfinity/agent-rs" @@ -22,9 +22,9 @@ rust-version = "1.75.0" license = "Apache-2.0" [workspace.dependencies] -ic-agent = { path = "ic-agent", version = "0.37.0", default-features = false } -ic-utils = { path = "ic-utils", version = "0.37.0" } -ic-transport-types = { path = "ic-transport-types", version = "0.37.0" } +ic-agent = { path = "ic-agent", version = "0.37.1", default-features = false } +ic-utils = { path = "ic-utils", version = "0.37.1" } +ic-transport-types = { path = "ic-transport-types", version = "0.37.1" } ic-certification = "2.2" candid = "0.10.1" From af770c1aca2b0b91bac0604c6aac2a8db6165216 Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Tue, 30 Jul 2024 09:37:35 +0200 Subject: [PATCH 23/24] chore: set root_key for NodesFetcher --- .../dynamic_routing/dynamic_route_provider.rs | 1 + .../agent/http_transport/dynamic_routing/nodes_fetch.rs | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs index 820ea9e5..cb657ae2 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs @@ -103,6 +103,7 @@ impl DynamicRouteProviderBuilder { let fetcher = Arc::new(NodesFetcher::new( http_client.clone(), Principal::from_text(MAINNET_ROOT_SUBNET_ID).unwrap(), + None, )); let checker = Arc::new(HealthChecker::new(http_client, HEALTH_CHECK_TIMEOUT)); Self { diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs index 8cbdb474..7e01d145 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs @@ -36,14 +36,18 @@ pub trait Fetch: Sync + Send + Debug { pub struct NodesFetcher { http_client: Client, subnet_id: Principal, + // By default, the nodes fetcher is configured to talk to the mainnet of Internet Computer, and verifies responses using a hard-coded public key. + // However, for testnets one can set up a custom public key. + root_key: Option>, } impl NodesFetcher { /// Creates a new `NodesFetcher` instance. - pub fn new(http_client: Client, subnet_id: Principal) -> Self { + pub fn new(http_client: Client, subnet_id: Principal, root_key: Option>) -> Self { Self { http_client, subnet_id, + root_key, } } } @@ -65,6 +69,9 @@ impl Fetch for NodesFetcher { "Failed to build the agent: {err}" )) })?; + if let Some(key) = self.root_key.clone() { + agent.set_root_key(key); + } let api_bns = agent .fetch_api_boundary_nodes_by_subnet_id(self.subnet_id) .await From 05f7520259aa2df841272bd9ea8ae177e3c191fb Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy Date: Tue, 30 Jul 2024 10:35:05 +0200 Subject: [PATCH 24/24] fix: remove suffix /api/v2 --- ic-agent/src/agent/http_transport/dynamic_routing/node.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/node.rs b/ic-agent/src/agent/http_transport/dynamic_routing/node.rs index dd87116c..37716da3 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/node.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/node.rs @@ -33,7 +33,7 @@ impl Node { impl Node { /// Converts the node to a routing URL. pub fn to_routing_url(&self) -> Url { - Url::parse(&format!("https://{}/api/v2/", self.domain)).expect("failed to parse URL") + Url::parse(&format!("https://{}", self.domain)).expect("failed to parse URL") } }