Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions dev-tools/omdb/src/bin/omdb/db/alert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,16 @@ struct AlertListArgs {
#[clap(long)]
dispatched_after: Option<DateTime<Utc>>,

/// Include only alerts requested by the fault management case(s) with the
/// specified UUIDs.
///
/// If multiple case IDs are provided, alerts requested by any of those
/// cases will be included in the output.
///
/// Note that not all alerts are requested by fault management cases.
#[clap(long = "case")]
cases: Vec<Uuid>,

/// If `true`, include only alerts that have been fully dispatched.
/// If `false`, include only alerts that have not been fully dispatched.
///
Expand Down Expand Up @@ -871,6 +881,7 @@ async fn cmd_db_alert_list(
payload,
before,
after,
cases,
dispatched_before,
dispatched_after,
dispatched,
Expand Down Expand Up @@ -924,6 +935,10 @@ async fn cmd_db_alert_list(
}
}

if !cases.is_empty() {
query = query.filter(alert_dsl::case_id.eq_any(cases.clone()));
}

let ctx = || "loading alerts";
let alerts = query.load_async(&*conn).await.with_context(ctx)?;

Expand All @@ -939,6 +954,8 @@ async fn cmd_db_alert_list(
#[tabled(display_with = "datetime_opt_rfc3339_concise")]
time_dispatched: Option<DateTime<Utc>>,
dispatched: i64,
#[tabled(display_with = "display_option_blank")]
fm_case_id: Option<Uuid>,
}

impl From<&'_ Alert> for AlertRow {
Expand All @@ -949,6 +966,7 @@ async fn cmd_db_alert_list(
time_created: alert.identity.time_created,
time_dispatched: alert.time_dispatched,
dispatched: alert.num_dispatched,
fm_case_id: alert.case_id.map(GenericUuid::into_untyped_uuid),
}
}
}
Expand Down Expand Up @@ -1012,11 +1030,13 @@ async fn cmd_db_alert_info(
class,
payload,
num_dispatched,
case_id,
} = alert;

const CLASS: &str = "class";
const TIME_DISPATCHED: &str = "fully dispatched at";
const NUM_DISPATCHED: &str = "deliveries dispatched";
const CASE_ID: &str = "requested by FM case";

const WIDTH: usize = const_max_len(&[
ID,
Expand All @@ -1025,6 +1045,7 @@ async fn cmd_db_alert_info(
TIME_DISPATCHED,
NUM_DISPATCHED,
CLASS,
CASE_ID,
]);

println!("\n{:=<80}", "== ALERT ");
Expand All @@ -1037,6 +1058,9 @@ async fn cmd_db_alert_info(
if let Some(t) = time_dispatched {
println!(" {TIME_DISPATCHED:>WIDTH$}: {t}")
}
if let Some(case_id) = case_id {
println!(" {CASE_ID:>WIDTH$}: {case_id:?}");
}

println!("\n{:=<80}", "== ALERT PAYLOAD ");
serde_json::to_writer_pretty(std::io::stdout(), &payload).with_context(
Expand Down
64 changes: 63 additions & 1 deletion dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ use nexus_types::internal_api::background::BlueprintRendezvousStats;
use nexus_types::internal_api::background::BlueprintRendezvousStatus;
use nexus_types::internal_api::background::DatasetsRendezvousStats;
use nexus_types::internal_api::background::EreporterStatus;
use nexus_types::internal_api::background::FmAlertStats;
use nexus_types::internal_api::background::FmRendezvousStatus;
use nexus_types::internal_api::background::InstanceReincarnationStatus;
use nexus_types::internal_api::background::InstanceUpdaterStatus;
use nexus_types::internal_api::background::InventoryLoadStatus;
Expand Down Expand Up @@ -1303,6 +1305,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
"fm_sitrep_gc" => {
print_task_fm_sitrep_gc(details);
}
"fm_rendezvous" => {
print_task_fm_rendezvous(details);
}
"trust_quorum_manager" => {
print_task_trust_quorum_manager(details);
}
Expand Down Expand Up @@ -3308,6 +3313,64 @@ fn print_task_fm_sitrep_gc(details: &serde_json::Value) {
);
}

/// Prints the status details reported by the `fm_rendezvous` background task.
///
/// `details` is deserialized as an [`FmRendezvousStatus`]. If that fails, a
/// warning containing the error and the raw JSON is written to stderr and
/// nothing else is printed.
fn print_task_fm_rendezvous(details: &serde_json::Value) {
    // Bail out early on malformed details so the rendering below only has to
    // deal with a successfully-parsed status.
    let status =
        match serde_json::from_value::<FmRendezvousStatus>(details.clone()) {
            Ok(status) => status,
            Err(error) => {
                eprintln!(
                    "warning: failed to interpret task details: {:?}: {:?}",
                    error, details
                );
                return;
            }
        };

    match status {
        FmRendezvousStatus::NoSitrep => {
            println!(" no FM situation report loaded");
        }
        FmRendezvousStatus::Executed { sitrep_id, alerts } => {
            println!(" current sitrep: {sitrep_id}");
            display_fm_alert_stats(&alerts);
        }
    }
}

/// Prints a summary of the alert statistics from an `fm_rendezvous`
/// activation.
///
/// The output lists the total number of alerts requested, how many were
/// requested in the current sitrep, how many were created in this activation,
/// how many are counted as already created, and how many errors occurred,
/// followed by one line per error message.
fn display_fm_alert_stats(stats: &FmAlertStats) {
    let FmAlertStats {
        total_alerts_requested,
        current_sitrep_alerts_requested,
        alerts_created,
        errors,
    } = stats;
    // Requested alerts that were neither created nor failed during this
    // activation are counted as already created. These counters come from
    // deserialized task status, so use saturating arithmetic: inconsistent
    // values must not panic (debug) or wrap around (release) in a debugging
    // tool.
    let already_created = total_alerts_requested
        .saturating_sub(*alerts_created)
        .saturating_sub(errors.len());
    const REQUESTED: &str = "alerts requested:";
    const REQUESTED_THIS_SITREP: &str = " requested in this sitrep:";
    const CREATED: &str = " created in this activation:";
    const ALREADY_CREATED: &str = " already created:";
    const ERRORS: &str = " errors:";
    // One column wider than the longest label, so that the numbers never
    // touch the label text.
    const WIDTH: usize = const_max_len(&[
        REQUESTED,
        REQUESTED_THIS_SITREP,
        CREATED,
        ALREADY_CREATED,
        ERRORS,
    ]) + 1;
    const NUM_WIDTH: usize = 4;
    println!(" {REQUESTED:<WIDTH$}{total_alerts_requested:>NUM_WIDTH$}");
    println!(
        " {REQUESTED_THIS_SITREP:<WIDTH$}{:>NUM_WIDTH$}",
        current_sitrep_alerts_requested
    );
    println!(" {CREATED:<WIDTH$}{alerts_created:>NUM_WIDTH$}");
    println!(" {ALREADY_CREATED:<WIDTH$}{already_created:>NUM_WIDTH$}");
    println!(
        "{} {ERRORS:<WIDTH$}{:>NUM_WIDTH$}",
        warn_if_nonzero(errors.len()),
        errors.len()
    );
    for error in errors {
        println!(" > {error}");
    }
}

fn print_task_trust_quorum_manager(details: &serde_json::Value) {
let status = match serde_json::from_value::<TrustQuorumManagerStatus>(
details.clone(),
Expand All @@ -3321,7 +3384,6 @@ fn print_task_trust_quorum_manager(details: &serde_json::Value) {
return;
}
};

match status {
TrustQuorumManagerStatus::PerRackStatus { statuses, errors } => {
if statuses.is_empty() && errors.is_empty() {
Expand Down
15 changes: 15 additions & 0 deletions dev-tools/omdb/tests/env.out
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ task: "external_endpoints"
on each one


task: "fm_rendezvous"
updates externally visible database tables to match the current fault
management sitrep


task: "fm_sitrep_gc"
garbage collects fault management situation reports

Expand Down Expand Up @@ -336,6 +341,11 @@ task: "external_endpoints"
on each one


task: "fm_rendezvous"
updates externally visible database tables to match the current fault
management sitrep


task: "fm_sitrep_gc"
garbage collects fault management situation reports

Expand Down Expand Up @@ -560,6 +570,11 @@ task: "external_endpoints"
on each one


task: "fm_rendezvous"
updates externally visible database tables to match the current fault
management sitrep


task: "fm_sitrep_gc"
garbage collects fault management situation reports

Expand Down
17 changes: 17 additions & 0 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,11 @@ task: "external_endpoints"
on each one


task: "fm_rendezvous"
updates externally visible database tables to match the current fault
management sitrep


task: "fm_sitrep_gc"
garbage collects fault management situation reports

Expand Down Expand Up @@ -642,6 +647,12 @@ task: "external_endpoints"

TLS certificates: 0

task: "fm_rendezvous"
configured period: every <REDACTED_DURATION>m
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
no FM situation report loaded

task: "fm_sitrep_gc"
configured period: every <REDACTED_DURATION>s
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
Expand Down Expand Up @@ -1222,6 +1233,12 @@ task: "external_endpoints"

TLS certificates: 0

task: "fm_rendezvous"
configured period: every <REDACTED_DURATION>m
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
no FM situation report loaded

task: "fm_sitrep_gc"
configured period: every <REDACTED_DURATION>s
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
Expand Down
11 changes: 11 additions & 0 deletions nexus-config/src/nexus_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -926,6 +926,11 @@ pub struct FmTasksConfig {
/// garbage collects unneeded fault management sitreps in the database.
#[serde_as(as = "DurationSeconds<u64>")]
pub sitrep_gc_period_secs: Duration,
/// period (in seconds) for periodic activations of the background task that
/// updates externally-visible database tables to match the current situation
/// report.
#[serde_as(as = "DurationSeconds<u64>")]
pub rendezvous_period_secs: Duration,
}

impl Default for FmTasksConfig {
Expand All @@ -936,6 +941,9 @@ impl Default for FmTasksConfig {
// time the current sitrep changes, and activating it more
// frequently won't make things more responsive.
sitrep_gc_period_secs: Duration::from_secs(600),
// This, too, is activated whenever a new sitrep is loaded, so the
// periodic activation need not be very frequent.
rendezvous_period_secs: Duration::from_secs(300),
}
}
}
Expand Down Expand Up @@ -1240,6 +1248,7 @@ mod test {
fm.sitrep_gc_period_secs = 49
probe_distributor.period_secs = 50
multicast_reconciler.period_secs = 60
fm.rendezvous_period_secs = 51
trust_quorum.period_secs = 60
[default_region_allocation_strategy]
type = "random"
Expand Down Expand Up @@ -1489,6 +1498,7 @@ mod test {
fm: FmTasksConfig {
sitrep_load_period_secs: Duration::from_secs(48),
sitrep_gc_period_secs: Duration::from_secs(49),
rendezvous_period_secs: Duration::from_secs(51),
},
probe_distributor: ProbeDistributorConfig {
period_secs: Duration::from_secs(50),
Expand Down Expand Up @@ -1603,6 +1613,7 @@ mod test {
fm.sitrep_load_period_secs = 45
fm.sitrep_gc_period_secs = 46
probe_distributor.period_secs = 47
fm.rendezvous_period_secs = 48
multicast_reconciler.period_secs = 60
trust_quorum.period_secs = 60

Expand Down
1 change: 1 addition & 0 deletions nexus/background-task-interface/src/init.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ pub struct BackgroundTasks {
pub task_webhook_deliverator: Activator,
pub task_sp_ereport_ingester: Activator,
pub task_reconfigurator_config_loader: Activator,
pub task_fm_rendezvous: Activator,
pub task_fm_sitrep_loader: Activator,
pub task_fm_sitrep_gc: Activator,
pub task_probe_distributor: Activator,
Expand Down
44 changes: 44 additions & 0 deletions nexus/db-model/src/alert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,14 @@
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use crate::AlertClass;
use crate::DbTypedUuid;
use chrono::{DateTime, Utc};
use db_macros::Asset;
use nexus_db_schema::schema::alert;
use nexus_types::fm::case;
use omicron_uuid_kinds::AlertUuid;
use omicron_uuid_kinds::CaseKind;
use omicron_uuid_kinds::CaseUuid;
use serde::{Deserialize, Serialize};

/// A webhook event.
Expand Down Expand Up @@ -40,10 +45,49 @@ pub struct Alert {
pub payload: serde_json::Value,

pub num_dispatched: i64,

/// The ID of the fault management case that created this alert, if any.
pub case_id: Option<DbTypedUuid<CaseKind>>,
}

impl Alert {
    /// UUID of the singleton event entry for alert receiver liveness probes.
    pub const PROBE_ALERT_ID: uuid::Uuid =
        uuid::Uuid::from_u128(0x001de000_7768_4000_8000_000000000001);

    /// Builds the model for a newly-created alert from the given ID, alert
    /// class, and JSON payload.
    ///
    /// The returned alert has not been dispatched (`time_dispatched` is
    /// `None` and `num_dispatched` is zero) and is not associated with a
    /// fault management case.
    pub fn new(
        id: impl Into<AlertUuid>,
        class: impl Into<AlertClass>,
        payload: impl Into<serde_json::Value>,
    ) -> Self {
        let identity = AlertIdentity::new(id.into());
        let class: AlertClass = class.into();
        let payload: serde_json::Value = payload.into();
        Self {
            identity,
            class,
            payload,
            time_dispatched: None,
            num_dispatched: 0,
            case_id: None,
        }
    }

    /// Builds the model for the alert described by the fault management
    /// alert request `req`, recording `case_id` as the case associated with
    /// it.
    ///
    /// The alert's ID, class, and payload are taken from the request; all
    /// other fields are initialized as in [`Alert::new`].
    pub fn for_fm_alert_request(
        req: &case::AlertRequest,
        case_id: CaseUuid,
    ) -> Self {
        // Destructure exhaustively so that adding a field to `AlertRequest`
        // forces this conversion to be revisited.
        let case::AlertRequest {
            id,
            class,
            payload,
            // The sitrep ID is deliberately ignored, as it is not part of
            // the alert model.
            requested_sitrep_id: _,
        } = req;

        let mut alert = Self::new(*id, *class, payload.clone());
        alert.case_id = Some(case_id.into());
        alert
    }
}
Loading
Loading