Skip to content

Commit 72fa123

Browse files
committed
Push probe zones to sled-agent instead of pulling them from Nexus
- Add APIs in the sled-agent for creating / deleting probes, and have Nexus use them when managing probes from the external API, especially replacing the entire set of probes with a PUT.
- Rework the probe manager to accept the list of expected probes from Nexus, and drive the state toward that, rather than periodically polling Nexus.
- Add background task for periodically pushing probes to sleds, and omdb innards for reporting its state.
- Closes #9157
1 parent a78f456 commit 72fa123

File tree

28 files changed

+9461
-323
lines changed

28 files changed

+9461
-323
lines changed

dev-tools/omdb/src/bin/omdb/nexus.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ use nexus_types::internal_api::background::InstanceReincarnationStatus;
5858
use nexus_types::internal_api::background::InstanceUpdaterStatus;
5959
use nexus_types::internal_api::background::InventoryLoadStatus;
6060
use nexus_types::internal_api::background::LookupRegionPortStatus;
61+
use nexus_types::internal_api::background::ProbeDistributorStatus;
6162
use nexus_types::internal_api::background::ReadOnlyRegionReplacementStartStatus;
6263
use nexus_types::internal_api::background::RegionReplacementDriverStatus;
6364
use nexus_types::internal_api::background::RegionReplacementStatus;
@@ -1189,6 +1190,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
11891190
"phantom_disks" => {
11901191
print_task_phantom_disks(details);
11911192
}
1193+
"probe_distributor" => {
1194+
print_task_probe_distributor(details);
1195+
}
11921196
"read_only_region_replacement_start" => {
11931197
print_task_read_only_region_replacement_start(details);
11941198
}
@@ -2110,6 +2114,32 @@ fn print_task_phantom_disks(details: &serde_json::Value) {
21102114
};
21112115
}
21122116

2117+
fn print_task_probe_distributor(details: &serde_json::Value) {
2118+
match serde_json::from_value::<ProbeDistributorStatus>(details.clone()) {
2119+
Err(error) => eprintln!(
2120+
"warning: failed to interpret task details: {:?}: {:?}",
2121+
error, details
2122+
),
2123+
Ok(status) => {
2124+
let n_total_probes: usize = status.probes_by_sled.values().sum();
2125+
println!(" succesfully-pushed probes: {} total", n_total_probes);
2126+
for (sled_id, count) in status.probes_by_sled {
2127+
println!(" sled_id={} n_probes={}", sled_id, count);
2128+
}
2129+
println!(
2130+
" errors while pushing probes: {} total",
2131+
status.errors.len()
2132+
);
2133+
for err in status.errors {
2134+
println!(
2135+
" sled_id={} sled_ip={} error={}",
2136+
err.sled_id, err.sled_ip, err.error,
2137+
);
2138+
}
2139+
}
2140+
};
2141+
}
2142+
21132143
fn print_task_read_only_region_replacement_start(details: &serde_json::Value) {
21142144
match serde_json::from_value::<ReadOnlyRegionReplacementStartStatus>(
21152145
details.clone(),

illumos-utils/src/opte/port.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ impl Drop for PortInner {
5757
Err(e) => e,
5858
};
5959
eprintln!(
60-
"WARNING: Failed to delete the xde device. It must be deleted
60+
"WARNING: Failed to delete the xde device. It must be deleted \
6161
out of band, and it will not be possible to recreate the xde \
6262
device until then. Error: {:?}",
6363
err,

nexus-config/src/nexus_config.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,8 @@ pub struct BackgroundTaskConfig {
441441
pub webhook_deliverator: WebhookDeliveratorConfig,
442442
/// configuration for SP ereport ingester task
443443
pub sp_ereport_ingester: SpEreportIngesterConfig,
444+
/// configuration for networking probe distributor
445+
pub probe_distributor: ProbeDistributorConfig,
444446
}
445447

446448
#[serde_as]
@@ -870,6 +872,15 @@ impl Default for SpEreportIngesterConfig {
870872
}
871873
}
872874

875+
#[serde_as]
876+
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
877+
pub struct ProbeDistributorConfig {
878+
/// period (in seconds) for periodic activations of the background task that
879+
/// distributes networking probe zones to sled-agents.
880+
#[serde_as(as = "DurationSeconds<u64>")]
881+
pub period_secs: Duration,
882+
}
883+
873884
/// Configuration for a nexus server
874885
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
875886
pub struct PackageConfig {
@@ -1172,6 +1183,7 @@ mod test {
11721183
webhook_deliverator.first_retry_backoff_secs = 45
11731184
webhook_deliverator.second_retry_backoff_secs = 46
11741185
sp_ereport_ingester.period_secs = 47
1186+
probe_distributor.period_secs = 48
11751187
[default_region_allocation_strategy]
11761188
type = "random"
11771189
seed = 0
@@ -1416,6 +1428,9 @@ mod test {
14161428
period_secs: Duration::from_secs(47),
14171429
disable: false,
14181430
},
1431+
probe_distributor: ProbeDistributorConfig {
1432+
period_secs: Duration::from_secs(48),
1433+
},
14191434
},
14201435
default_region_allocation_strategy:
14211436
crate::nexus_config::RegionAllocationStrategy::Random {
@@ -1514,6 +1529,7 @@ mod test {
15141529
alert_dispatcher.period_secs = 42
15151530
webhook_deliverator.period_secs = 43
15161531
sp_ereport_ingester.period_secs = 44
1532+
probe_distributor.period_secs = 45
15171533
15181534
[default_region_allocation_strategy]
15191535
type = "random"

nexus/background-task-interface/src/init.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ pub struct BackgroundTasks {
5151
pub task_webhook_deliverator: Activator,
5252
pub task_sp_ereport_ingester: Activator,
5353
pub task_reconfigurator_config_loader: Activator,
54+
pub task_probe_distributor: Activator,
5455

5556
// Handles to activate background tasks that do not get used by Nexus
5657
// at-large. These background tasks are implementation details as far as

0 commit comments

Comments
 (0)