Skip to content

Commit 32bc62e

Browse files
docs: add
1 parent 973b377 commit 32bc62e

File tree

11 files changed

+161
-125
lines changed

11 files changed

+161
-125
lines changed

ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use crate::{
2525
node::Node,
2626
nodes_fetch::{Fetch, NodesFetchActor, NodesFetcher},
2727
snapshot::routing_snapshot::RoutingSnapshot,
28-
type_aliases::GlobalShared,
28+
type_aliases::AtomicSwap,
2929
},
3030
route_provider::RouteProvider,
3131
},
@@ -46,17 +46,28 @@ const HEALTH_CHECK_PERIOD: Duration = Duration::from_secs(1);
4646

4747
const DYNAMIC_ROUTE_PROVIDER: &str = "DynamicRouteProvider";
4848

49-
///
49+
/// A dynamic route provider.
50+
/// It spawns the discovery service (`NodesFetchActor`) for fetching the latest nodes topology.
51+
/// It also spawns the `HealthManagerActor`, which orchestrates the health check tasks for each node and updates the routing snapshot.
5052
#[derive(Debug)]
5153
pub struct DynamicRouteProvider<S> {
54+
/// Fetcher for fetching the latest nodes topology.
5255
fetcher: Arc<dyn Fetch>,
56+
/// Periodicity of fetching the latest nodes topology.
5357
fetch_period: Duration,
58+
/// Interval for retrying fetching the nodes in case of error.
5459
fetch_retry_interval: Duration,
60+
/// Health checker for checking the health of the nodes.
5561
checker: Arc<dyn HealthCheck>,
62+
/// Periodicity of checking the health of the nodes.
5663
check_period: Duration,
57-
snapshot: GlobalShared<S>,
64+
/// Snapshot of the routing nodes.
65+
routing_snapshot: AtomicSwap<S>,
66+
/// Task tracker for managing the spawned tasks.
5867
tracker: TaskTracker,
68+
/// Initial seed nodes, which are used for the initial fetching of the nodes.
5969
seeds: Vec<Node>,
70+
/// Cancellation token for stopping the spawned tasks.
6071
token: CancellationToken,
6172
}
6273

@@ -65,7 +76,7 @@ where
6576
S: RoutingSnapshot + 'static,
6677
{
6778
fn route(&self) -> Result<Url, AgentError> {
68-
let snapshot = self.snapshot.load();
79+
let snapshot = self.routing_snapshot.load();
6980
let node = snapshot.next().ok_or_else(|| {
7081
AgentError::RouteProviderError("No healthy API nodes found.".to_string())
7182
})?;
@@ -77,7 +88,7 @@ impl<S> DynamicRouteProvider<S>
7788
where
7889
S: RoutingSnapshot + 'static,
7990
{
80-
///
91+
/// Creates a new instance of `DynamicRouteProvider`.
8192
pub fn new(snapshot: S, seeds: Vec<Node>, http_client: Client) -> Self {
8293
let fetcher = Arc::new(NodesFetcher::new(
8394
http_client.clone(),
@@ -91,31 +102,31 @@ where
91102
checker,
92103
check_period: HEALTH_CHECK_PERIOD,
93104
seeds,
94-
snapshot: Arc::new(ArcSwap::from_pointee(snapshot)),
105+
routing_snapshot: Arc::new(ArcSwap::from_pointee(snapshot)),
95106
tracker: TaskTracker::new(),
96107
token: CancellationToken::new(),
97108
}
98109
}
99110

100-
///
111+
/// Sets the fetcher for fetching the latest nodes topology.
101112
pub fn with_fetcher(mut self, fetcher: Arc<dyn Fetch>) -> Self {
102113
self.fetcher = fetcher;
103114
self
104115
}
105116

106-
///
117+
/// Sets the periodicity of fetching the latest nodes topology.
107118
pub fn with_fetch_period(mut self, period: Duration) -> Self {
108119
self.fetch_period = period;
109120
self
110121
}
111122

112-
///
123+
/// Sets the health checker used for checking the health of the nodes.
113124
pub fn with_checker(mut self, checker: Arc<dyn HealthCheck>) -> Self {
114125
self.checker = checker;
115126
self
116127
}
117128

118-
///
129+
/// Sets the periodicity of checking the health of the nodes.
119130
pub fn with_check_period(mut self, period: Duration) -> Self {
120131
self.check_period = period;
121132
self
@@ -133,14 +144,14 @@ where
133144
// Communication channel between NodesFetchActor and HealthManagerActor.
134145
let (fetch_sender, fetch_receiver) = watch::channel(None);
135146

136-
// Communication channel with HealthManagerActor to receive info about healthy seeds.
147+
// Communication channel with HealthManagerActor to receive info about healthy seed nodes (used only once).
137148
let (init_sender, mut init_receiver) = mpsc::channel(1);
138149

139150
// Start the receiving part first.
140151
let health_manager_actor = HealthManagerActor::new(
141152
Arc::clone(&self.checker),
142153
self.check_period,
143-
Arc::clone(&self.snapshot),
154+
Arc::clone(&self.routing_snapshot),
144155
fetch_receiver,
145156
init_sender,
146157
self.token.clone(),
@@ -156,7 +167,7 @@ where
156167
error!("{DYNAMIC_ROUTE_PROVIDER}: failed to send results to HealthManager: {err:?}");
157168
}
158169

159-
// Try await healthy seeds.
170+
// Wait, up to TIMEOUT_AWAIT_HEALTHY_SEED, for a healthy seed to be reported.
160171
let found_healthy_seeds =
161172
match timeout(TIMEOUT_AWAIT_HEALTHY_SEED, init_receiver.recv()).await {
162173
Ok(_) => {
@@ -174,24 +185,25 @@ where
174185
false
175186
}
176187
};
188+
// We can close the channel now.
177189
init_receiver.close();
178190

179191
let fetch_actor = NodesFetchActor::new(
180192
Arc::clone(&self.fetcher),
181193
self.fetch_period,
182194
self.fetch_retry_interval,
183195
fetch_sender,
184-
Arc::clone(&self.snapshot),
196+
Arc::clone(&self.routing_snapshot),
185197
self.token.clone(),
186198
);
187199
self.tracker.spawn(async move { fetch_actor.run().await });
188200
info!(
189201
"{DYNAMIC_ROUTE_PROVIDER}: NodesFetchActor and HealthManagerActor started successfully"
190202
);
191203

192-
(found_healthy_seeds)
193-
.then_some(())
194-
.ok_or(anyhow!("No healthy seeds found"))
204+
(found_healthy_seeds).then_some(()).ok_or(anyhow!(
205+
"No healthy seeds found, they may become healthy later ..."
206+
))
195207
}
196208

197209
/// Kill all running tasks.
@@ -364,7 +376,7 @@ mod tests {
364376
.await
365377
.unwrap_err()
366378
.to_string()
367-
.contains("No healthy seeds found"));
379+
.contains("No healthy seeds found, they may become healthy later ..."));
368380

369381
// Test 1: calls to route() return an error, as no healthy seeds exist.
370382
for _ in 0..4 {
@@ -461,7 +473,7 @@ mod tests {
461473
.await
462474
.unwrap_err()
463475
.to_string()
464-
.contains("No healthy seeds found"));
476+
.contains("No healthy seeds found, they may become healthy later ..."));
465477

466478
// Test: calls to route() return an error, as no healthy seeds exist.
467479
for _ in 0..4 {

ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs

Lines changed: 49 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -16,48 +16,50 @@ use crate::agent::http_transport::dynamic_routing::{
1616
messages::{FetchedNodes, NodeHealthState},
1717
node::Node,
1818
snapshot::routing_snapshot::RoutingSnapshot,
19-
type_aliases::{GlobalShared, ReceiverMpsc, ReceiverWatch, SenderMpsc},
19+
type_aliases::{AtomicSwap, ReceiverMpsc, ReceiverWatch, SenderMpsc},
2020
};
2121

2222
const CHANNEL_BUFFER: usize = 128;
2323

24-
///
24+
/// A trait representing a health check of the node.
2525
#[async_trait]
2626
pub trait HealthCheck: Send + Sync + Debug {
27-
///
27+
/// Checks the health of the node.
2828
async fn check(&self, node: &Node) -> anyhow::Result<HealthCheckStatus>;
2929
}
3030

31-
///
31+
/// A struct representing the health check status of the node.
3232
#[derive(Clone, PartialEq, Debug, Default)]
3333
pub struct HealthCheckStatus {
34-
///
35-
pub latency: Option<Duration>,
34+
latency: Option<Duration>,
3635
}
3736

38-
///
3937
impl HealthCheckStatus {
40-
///
38+
/// Creates a new `HealthCheckStatus` instance.
4139
pub fn new(latency: Option<Duration>) -> Self {
4240
Self { latency }
4341
}
4442

45-
///
43+
/// Checks if the node is healthy.
4644
pub fn is_healthy(&self) -> bool {
4745
self.latency.is_some()
4846
}
47+
48+
/// Returns the latency of the health check, or `None` if the node is unhealthy.
49+
pub fn latency(&self) -> Option<Duration> {
50+
self.latency
51+
}
4952
}
5053

51-
///
54+
/// A struct implementing the `HealthCheck` for the nodes.
5255
#[derive(Debug)]
5356
pub struct HealthChecker {
5457
http_client: Client,
5558
timeout: Duration,
5659
}
5760

58-
///
5961
impl HealthChecker {
60-
///
62+
/// Creates a new `HealthChecker` instance.
6163
pub fn new(http_client: Client, timeout: Duration) -> Self {
6264
Self {
6365
http_client,
@@ -96,16 +98,22 @@ impl HealthCheck for HealthChecker {
9698

9799
const HEALTH_CHECK_ACTOR: &str = "HealthCheckActor";
98100

101+
/// A struct performing the health check of the node and sending the health status to the listener.
99102
struct HealthCheckActor {
103+
/// The health checker.
100104
checker: Arc<dyn HealthCheck>,
105+
/// The period of the health check.
101106
period: Duration,
107+
/// The node to check.
102108
node: Node,
109+
/// The sender channel (listener) to send the health status.
103110
sender_channel: SenderMpsc<NodeHealthState>,
111+
/// The cancellation token of the actor.
104112
token: CancellationToken,
105113
}
106114

107115
impl HealthCheckActor {
108-
pub fn new(
116+
fn new(
109117
checker: Arc<dyn HealthCheck>,
110118
period: Duration,
111119
node: Node,
@@ -121,7 +129,8 @@ impl HealthCheckActor {
121129
}
122130
}
123131

124-
pub async fn run(self) {
132+
/// Runs the actor.
133+
async fn run(self) {
125134
let mut interval = time::interval(self.period);
126135
loop {
127136
tokio::select! {
@@ -143,33 +152,46 @@ impl HealthCheckActor {
143152
}
144153
}
145154

146-
///
155+
/// The name of the health manager actor.
147156
pub const HEALTH_MANAGER_ACTOR: &str = "HealthManagerActor";
148157

149-
///
158+
/// A struct managing the health checks of the nodes.
159+
/// It receives the fetched nodes from the `NodesFetchActor` and starts the health checks for them.
160+
/// It also receives the health status of the nodes from the `HealthCheckActor/s` and updates the routing snapshot.
150161
pub struct HealthManagerActor<S> {
162+
/// The health checker.
151163
checker: Arc<dyn HealthCheck>,
164+
/// The period of the health check.
152165
period: Duration,
153-
nodes_snapshot: GlobalShared<S>,
166+
/// The routing snapshot, storing the nodes.
167+
routing_snapshot: AtomicSwap<S>,
168+
/// The receiver channel to listen to the fetched nodes messages.
154169
fetch_receiver: ReceiverWatch<FetchedNodes>,
170+
/// The sender channel to send the health status of the nodes back to HealthManagerActor.
155171
check_sender: SenderMpsc<NodeHealthState>,
172+
/// The receiver channel to receive the health status of the nodes from the `HealthCheckActor/s`.
156173
check_receiver: ReceiverMpsc<NodeHealthState>,
174+
/// The sender channel to send the initialization status to DynamicRouteProvider (used only once in the init phase).
157175
init_sender: SenderMpsc<bool>,
176+
/// The cancellation token of the actor.
158177
token: CancellationToken,
178+
/// The cancellation token for all the health checks.
159179
nodes_token: CancellationToken,
180+
/// The task tracker of the health checks, waiting for the tasks to exit (graceful termination).
160181
nodes_tracker: TaskTracker,
182+
/// The flag indicating if this actor is initialized with healthy nodes.
161183
is_initialized: bool,
162184
}
163185

164186
impl<S> HealthManagerActor<S>
165187
where
166188
S: RoutingSnapshot,
167189
{
168-
///
190+
/// Creates a new `HealthManagerActor` instance.
169191
pub fn new(
170192
checker: Arc<dyn HealthCheck>,
171193
period: Duration,
172-
nodes_snapshot: GlobalShared<S>,
194+
routing_snapshot: AtomicSwap<S>,
173195
fetch_receiver: ReceiverWatch<FetchedNodes>,
174196
init_sender: SenderMpsc<bool>,
175197
token: CancellationToken,
@@ -179,7 +201,7 @@ where
179201
Self {
180202
checker,
181203
period,
182-
nodes_snapshot,
204+
routing_snapshot,
183205
fetch_receiver,
184206
check_sender,
185207
check_receiver,
@@ -191,11 +213,11 @@ where
191213
}
192214
}
193215

194-
///
216+
/// Runs the actor.
195217
pub async fn run(mut self) {
196218
loop {
197219
tokio::select! {
198-
// Check if a new array of fetched nodes appeared in the channel from NodesFetchService.
220+
// Process a new array of fetched nodes from NodesFetchActor, if it appeared in the channel.
199221
result = self.fetch_receiver.changed() => {
200222
if let Err(err) = result {
201223
error!("{HEALTH_MANAGER_ACTOR}: nodes fetch sender has been dropped: {err:?}");
@@ -206,7 +228,7 @@ where
206228
let Some(FetchedNodes { nodes }) = self.fetch_receiver.borrow_and_update().clone() else { continue };
207229
self.handle_fetch_update(nodes).await;
208230
}
209-
// Receive health check messages from all running NodeHealthChecker/s.
231+
// Receive health check messages from all running HealthCheckActor/s.
210232
Some(msg) = self.check_receiver.recv() => {
211233
self.handle_health_update(msg).await;
212234
}
@@ -221,13 +243,13 @@ where
221243
}
222244

223245
async fn handle_health_update(&mut self, msg: NodeHealthState) {
224-
let current_snapshot = self.nodes_snapshot.load_full();
246+
let current_snapshot = self.routing_snapshot.load_full();
225247
let mut new_snapshot = (*current_snapshot).clone();
226248
if let Err(err) = new_snapshot.update_node(&msg.node, msg.health.clone()) {
227249
error!("{HEALTH_MANAGER_ACTOR}: failed to update snapshot: {err:?}");
228250
return;
229251
}
230-
self.nodes_snapshot.store(Arc::new(new_snapshot));
252+
self.routing_snapshot.store(Arc::new(new_snapshot));
231253
if !self.is_initialized && msg.health.is_healthy() {
232254
self.is_initialized = true;
233255
// If TIMEOUT_AWAIT_HEALTHY_SEED has been exceeded, the receiver was dropped and send would thus fail. We ignore the failure.
@@ -244,11 +266,11 @@ where
244266
return;
245267
}
246268
debug!("{HEALTH_MANAGER_ACTOR}: fetched nodes received {:?}", nodes);
247-
let current_snapshot = self.nodes_snapshot.load_full();
269+
let current_snapshot = self.routing_snapshot.load_full();
248270
let mut new_snapshot = (*current_snapshot).clone();
249271
// If the snapshot has changed, store it and restart the health checks for all nodes.
250272
if let Ok(true) = new_snapshot.sync_nodes(&nodes) {
251-
self.nodes_snapshot.store(Arc::new(new_snapshot));
273+
self.routing_snapshot.store(Arc::new(new_snapshot));
252274
self.stop_all_checks().await;
253275
self.start_checks(nodes.to_vec());
254276
}

0 commit comments

Comments
 (0)