diff --git a/dev-tools/omdb/src/bin/omdb/db/alert.rs b/dev-tools/omdb/src/bin/omdb/db/alert.rs index 1e019a3e2d6..eeec094138b 100644 --- a/dev-tools/omdb/src/bin/omdb/db/alert.rs +++ b/dev-tools/omdb/src/bin/omdb/db/alert.rs @@ -85,6 +85,16 @@ struct AlertListArgs { #[clap(long)] dispatched_after: Option>, + /// Include only alerts requested by the fault management case(s) with the + /// specified UUIDs. + /// + /// If multiple case IDs are provided, alerts requested by any of those + /// cases will be included in the output. + /// + /// Note that not all alerts are requested by fault management cases. + #[clap(long = "case")] + cases: Vec, + /// If `true`, include only alerts that have been fully dispatched. /// If `false`, include only alerts that have not been fully dispatched. /// @@ -871,6 +881,7 @@ async fn cmd_db_alert_list( payload, before, after, + cases, dispatched_before, dispatched_after, dispatched, @@ -924,6 +935,10 @@ async fn cmd_db_alert_list( } } + if !cases.is_empty() { + query = query.filter(alert_dsl::case_id.eq_any(cases.clone())); + } + let ctx = || "loading alerts"; let alerts = query.load_async(&*conn).await.with_context(ctx)?; @@ -939,6 +954,8 @@ async fn cmd_db_alert_list( #[tabled(display_with = "datetime_opt_rfc3339_concise")] time_dispatched: Option>, dispatched: i64, + #[tabled(display_with = "display_option_blank")] + fm_case_id: Option, } impl From<&'_ Alert> for AlertRow { @@ -949,6 +966,7 @@ async fn cmd_db_alert_list( time_created: alert.identity.time_created, time_dispatched: alert.time_dispatched, dispatched: alert.num_dispatched, + fm_case_id: alert.case_id.map(GenericUuid::into_untyped_uuid), } } } @@ -1012,11 +1030,13 @@ async fn cmd_db_alert_info( class, payload, num_dispatched, + case_id, } = alert; const CLASS: &str = "class"; const TIME_DISPATCHED: &str = "fully dispatched at"; const NUM_DISPATCHED: &str = "deliveries dispatched"; + const CASE_ID: &str = "requested by FM case"; const WIDTH: usize = const_max_len(&[ ID, 
@@ -1025,6 +1045,7 @@ async fn cmd_db_alert_info( TIME_DISPATCHED, NUM_DISPATCHED, CLASS, + CASE_ID, ]); println!("\n{:=<80}", "== ALERT "); @@ -1037,6 +1058,9 @@ async fn cmd_db_alert_info( if let Some(t) = time_dispatched { println!(" {TIME_DISPATCHED:>WIDTH$}: {t}") } + if let Some(case_id) = case_id { + println!(" {CASE_ID:>WIDTH$}: {case_id:?}"); + } println!("\n{:=<80}", "== ALERT PAYLOAD "); serde_json::to_writer_pretty(std::io::stdout(), &payload).with_context( diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index e0be6d58ccd..e412c9ee81c 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -57,6 +57,8 @@ use nexus_types::internal_api::background::BlueprintRendezvousStats; use nexus_types::internal_api::background::BlueprintRendezvousStatus; use nexus_types::internal_api::background::DatasetsRendezvousStats; use nexus_types::internal_api::background::EreporterStatus; +use nexus_types::internal_api::background::FmAlertStats; +use nexus_types::internal_api::background::FmRendezvousStatus; use nexus_types::internal_api::background::InstanceReincarnationStatus; use nexus_types::internal_api::background::InstanceUpdaterStatus; use nexus_types::internal_api::background::InventoryLoadStatus; @@ -1303,6 +1305,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { "fm_sitrep_gc" => { print_task_fm_sitrep_gc(details); } + "fm_rendezvous" => { + print_task_fm_rendezvous(details); + } "trust_quorum_manager" => { print_task_trust_quorum_manager(details); } @@ -3308,6 +3313,64 @@ fn print_task_fm_sitrep_gc(details: &serde_json::Value) { ); } +fn print_task_fm_rendezvous(details: &serde_json::Value) { + match serde_json::from_value::(details.clone()) { + Err(error) => { + eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ); + return; + } + Ok(FmRendezvousStatus::NoSitrep) => { + println!(" no FM situation report loaded"); + 
} + Ok(FmRendezvousStatus::Executed { sitrep_id, alerts }) => { + println!(" current sitrep: {sitrep_id}"); + display_fm_alert_stats(&alerts); + } + } +} + +fn display_fm_alert_stats(stats: &FmAlertStats) { + let FmAlertStats { + total_alerts_requested, + current_sitrep_alerts_requested, + alerts_created, + errors, + } = stats; + let already_created = + total_alerts_requested - alerts_created - errors.len(); + pub const REQUESTED: &str = "alerts requested:"; + pub const REQUESTED_THIS_SITREP: &str = " requested in this sitrep:"; + pub const CREATED: &str = " created in this activation:"; + pub const ALREADY_CREATED: &str = " already created:"; + pub const ERRORS: &str = " errors:"; + pub const WIDTH: usize = const_max_len(&[ + REQUESTED, + REQUESTED_THIS_SITREP, + CREATED, + ALREADY_CREATED, + ERRORS, + ]) + 1; + pub const NUM_WIDTH: usize = 4; + println!(" {REQUESTED:NUM_WIDTH$}"); + println!( + " {REQUESTED_THIS_SITREP:NUM_WIDTH$}", + current_sitrep_alerts_requested + ); + println!(" {CREATED:NUM_WIDTH$}"); + println!(" {ALREADY_CREATED:NUM_WIDTH$}"); + println!( + "{} {ERRORS:NUM_WIDTH$}", + warn_if_nonzero(errors.len()), + errors.len() + ); + for error in errors { + println!(" > {error}"); + } +} + fn print_task_trust_quorum_manager(details: &serde_json::Value) { let status = match serde_json::from_value::( details.clone(), @@ -3321,7 +3384,6 @@ fn print_task_trust_quorum_manager(details: &serde_json::Value) { return; } }; - match status { TrustQuorumManagerStatus::PerRackStatus { statuses, errors } => { if statuses.is_empty() && errors.is_empty() { diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 499aa8d2d09..8c32e973e25 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -99,6 +99,11 @@ task: "external_endpoints" on each one +task: "fm_rendezvous" + updates externally visible database tables to match the current fault + management sitrep + + task: "fm_sitrep_gc" garbage collects fault management 
situation reports @@ -336,6 +341,11 @@ task: "external_endpoints" on each one +task: "fm_rendezvous" + updates externally visible database tables to match the current fault + management sitrep + + task: "fm_sitrep_gc" garbage collects fault management situation reports @@ -560,6 +570,11 @@ task: "external_endpoints" on each one +task: "fm_rendezvous" + updates externally visible database tables to match the current fault + management sitrep + + task: "fm_sitrep_gc" garbage collects fault management situation reports diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 109d74823a6..185e0f208ac 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -334,6 +334,11 @@ task: "external_endpoints" on each one +task: "fm_rendezvous" + updates externally visible database tables to match the current fault + management sitrep + + task: "fm_sitrep_gc" garbage collects fault management situation reports @@ -642,6 +647,12 @@ task: "external_endpoints" TLS certificates: 0 +task: "fm_rendezvous" + configured period: every m + last completed activation: , triggered by + started at (s ago) and ran for ms + no FM situation report loaded + task: "fm_sitrep_gc" configured period: every s last completed activation: , triggered by @@ -1222,6 +1233,12 @@ task: "external_endpoints" TLS certificates: 0 +task: "fm_rendezvous" + configured period: every m + last completed activation: , triggered by + started at (s ago) and ran for ms + no FM situation report loaded + task: "fm_sitrep_gc" configured period: every s last completed activation: , triggered by diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index d9defb6c1bc..b16cc954bee 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -926,6 +926,11 @@ pub struct FmTasksConfig { /// garbage collects unneeded fault management sitreps in the database. 
#[serde_as(as = "DurationSeconds")] pub sitrep_gc_period_secs: Duration, + /// period (in seconds) for periodic activations of the background task that + /// updates externally-visible database tables to match the current situation + /// report. + #[serde_as(as = "DurationSeconds")] + pub rendezvous_period_secs: Duration, } impl Default for FmTasksConfig { @@ -936,6 +941,9 @@ impl Default for FmTasksConfig { // time the current sitrep changes, and activating it more // frequently won't make things more responsive. sitrep_gc_period_secs: Duration::from_secs(600), + // This, too, is activated whenever a new sitrep is loaded, so we + // need not set the periodic activation interval too high. + rendezvous_period_secs: Duration::from_secs(300), } } } @@ -1240,6 +1248,7 @@ mod test { fm.sitrep_gc_period_secs = 49 probe_distributor.period_secs = 50 multicast_reconciler.period_secs = 60 + fm.rendezvous_period_secs = 51 trust_quorum.period_secs = 60 [default_region_allocation_strategy] type = "random" @@ -1489,6 +1498,7 @@ mod test { fm: FmTasksConfig { sitrep_load_period_secs: Duration::from_secs(48), sitrep_gc_period_secs: Duration::from_secs(49), + rendezvous_period_secs: Duration::from_secs(51), }, probe_distributor: ProbeDistributorConfig { period_secs: Duration::from_secs(50), @@ -1603,6 +1613,7 @@ mod test { fm.sitrep_load_period_secs = 45 fm.sitrep_gc_period_secs = 46 probe_distributor.period_secs = 47 + fm.rendezvous_period_secs = 48 multicast_reconciler.period_secs = 60 trust_quorum.period_secs = 60 diff --git a/nexus/background-task-interface/src/init.rs b/nexus/background-task-interface/src/init.rs index 7eb7e3ad11a..42ce76ea36a 100644 --- a/nexus/background-task-interface/src/init.rs +++ b/nexus/background-task-interface/src/init.rs @@ -51,6 +51,7 @@ pub struct BackgroundTasks { pub task_webhook_deliverator: Activator, pub task_sp_ereport_ingester: Activator, pub task_reconfigurator_config_loader: Activator, + pub task_fm_rendezvous: Activator, pub 
task_fm_sitrep_loader: Activator, pub task_fm_sitrep_gc: Activator, pub task_probe_distributor: Activator, diff --git a/nexus/db-model/src/alert.rs b/nexus/db-model/src/alert.rs index 9c4dc24aed3..85d06188932 100644 --- a/nexus/db-model/src/alert.rs +++ b/nexus/db-model/src/alert.rs @@ -3,9 +3,14 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use crate::AlertClass; +use crate::DbTypedUuid; use chrono::{DateTime, Utc}; use db_macros::Asset; use nexus_db_schema::schema::alert; +use nexus_types::fm::case; +use omicron_uuid_kinds::AlertUuid; +use omicron_uuid_kinds::CaseKind; +use omicron_uuid_kinds::CaseUuid; use serde::{Deserialize, Serialize}; /// A webhook event. @@ -40,10 +45,49 @@ pub struct Alert { pub payload: serde_json::Value, pub num_dispatched: i64, + + /// The ID of the fault management case that created this alert, if any. + pub case_id: Option>, } impl Alert { /// UUID of the singleton event entry for alert receiver liveness probes. pub const PROBE_ALERT_ID: uuid::Uuid = uuid::Uuid::from_u128(0x001de000_7768_4000_8000_000000000001); + + /// Returns an `Alert` model representing a newly-created alert, with the + /// provided ID, alert class, and JSON payload. + pub fn new( + id: impl Into, + class: impl Into, + payload: impl Into, + ) -> Self { + Self { + identity: AlertIdentity::new(id.into()), + time_dispatched: None, + class: class.into(), + payload: payload.into(), + num_dispatched: 0, + case_id: None, + } + } + + pub fn for_fm_alert_request( + req: &case::AlertRequest, + case_id: CaseUuid, + ) -> Self { + let &case::AlertRequest { + id, + class, + ref payload, + // Ignore the sitrep ID fields, as they are not included in the + // alert model. 
+ requested_sitrep_id: _, + } = req; + + Self { + case_id: Some(case_id.into()), + ..Self::new(id, class, payload.clone()) + } + } } diff --git a/nexus/db-model/src/alert_class.rs b/nexus/db-model/src/alert_class.rs index 5f0b2129707..416b5714137 100644 --- a/nexus/db-model/src/alert_class.rs +++ b/nexus/db-model/src/alert_class.rs @@ -3,8 +3,7 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use super::impl_enum_type; -use nexus_types::external_api::views; -use serde::de::{self, Deserialize, Deserializer}; +use serde::de::{Deserialize, Deserializer}; use serde::ser::{Serialize, Serializer}; use std::fmt; @@ -34,49 +33,18 @@ impl_enum_type!( impl AlertClass { pub fn as_str(&self) -> &'static str { - // TODO(eliza): it would be really nice if these strings were all - // declared a single time, rather than twice (in both `impl_enum_type!` - // and here)... - match self { - Self::Probe => "probe", - Self::TestFoo => "test.foo", - Self::TestFooBar => "test.foo.bar", - Self::TestFooBaz => "test.foo.baz", - Self::TestQuuxBar => "test.quux.bar", - Self::TestQuuxBarBaz => "test.quux.bar.baz", - } + nexus_types::alert::AlertClass::from(*self).as_str() } /// Returns `true` if this event class is only used for testing and should /// not be incldued in the public event class list API endpoint. pub fn is_test(&self) -> bool { - matches!( - self, - Self::TestFoo - | Self::TestFooBar - | Self::TestFooBaz - | Self::TestQuuxBar - | Self::TestQuuxBarBaz - ) + nexus_types::alert::AlertClass::from(*self).is_test() } /// Returns a human-readable description string describing this event class. pub fn description(&self) -> &'static str { - match self { - Self::Probe => { - "Synthetic events sent for webhook receiver liveness probes.\n\ - Receivers should return 2xx HTTP responses for these events, \ - but they should NOT be treated as notifications of an actual \ - event in the system." 
- } - Self::TestFoo - | Self::TestFooBar - | Self::TestFooBaz - | Self::TestQuuxBar - | Self::TestQuuxBarBaz => { - "This is a test of the emergency alert system" - } - } + nexus_types::alert::AlertClass::from(*self).description() } /// All webhook event classes. @@ -84,6 +52,33 @@ impl AlertClass { ::VARIANTS; } +impl From for AlertClass { + fn from(input: nexus_types::alert::AlertClass) -> Self { + use nexus_types::alert::AlertClass as In; + match input { + In::Probe => Self::Probe, + In::TestFoo => Self::TestFoo, + In::TestFooBar => Self::TestFooBar, + In::TestFooBaz => Self::TestFooBaz, + In::TestQuuxBar => Self::TestQuuxBar, + In::TestQuuxBarBaz => Self::TestQuuxBarBaz, + } + } +} + +impl From for nexus_types::alert::AlertClass { + fn from(input: AlertClass) -> Self { + match input { + AlertClass::Probe => Self::Probe, + AlertClass::TestFoo => Self::TestFoo, + AlertClass::TestFooBar => Self::TestFooBar, + AlertClass::TestFooBaz => Self::TestFooBaz, + AlertClass::TestQuuxBar => Self::TestQuuxBar, + AlertClass::TestQuuxBarBaz => Self::TestQuuxBarBaz, + } + } +} + impl fmt::Display for AlertClass { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}", self.as_str()) @@ -95,7 +90,7 @@ impl Serialize for AlertClass { where S: Serializer, { - serializer.serialize_str(self.as_str()) + nexus_types::alert::AlertClass::from(*self).serialize(serializer) } } @@ -104,83 +99,7 @@ impl<'de> Deserialize<'de> for AlertClass { where D: Deserializer<'de>, { - <&'de str>::deserialize(deserializer)? 
- .parse::() - .map_err(de::Error::custom) - } -} - -impl std::str::FromStr for AlertClass { - type Err = AlertClassParseError; - fn from_str(s: &str) -> Result { - for &class in Self::ALL_CLASSES { - if s == class.as_str() { - return Ok(class); - } - } - - Err(AlertClassParseError(())) - } -} - -impl From for views::AlertClass { - fn from(class: AlertClass) -> Self { - Self { - name: class.to_string(), - description: class.description().to_string(), - } - } -} - -#[derive(Debug, Eq, PartialEq)] -pub struct AlertClassParseError(()); - -impl fmt::Display for AlertClassParseError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "expected one of [")?; - let mut variants = AlertClass::ALL_CLASSES.iter(); - if let Some(v) = variants.next() { - write!(f, "{v}")?; - for v in variants { - write!(f, ", {v}")?; - } - } - f.write_str("]") - } -} - -impl std::error::Error for AlertClassParseError {} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_from_str_roundtrips() { - for &variant in AlertClass::ALL_CLASSES { - assert_eq!(Ok(dbg!(variant)), dbg!(variant.to_string().parse())); - } - } - - // This is mainly a regression test to ensure that, should anyone add new - // `test.` variants in future, the `AlertClass::is_test()` method - // returns `true` for them. 
- #[test] - fn test_is_test() { - let problematic_variants = AlertClass::ALL_CLASSES - .iter() - .copied() - .filter(|variant| { - variant.as_str().starts_with("test.") && !variant.is_test() - }) - .collect::>(); - assert_eq!( - problematic_variants, - Vec::::new(), - "you have added one or more new `test.*` webhook event class \ - variant(s), but you seem to have not updated the \ - `AlertClass::is_test()` method!\nthe problematic \ - variant(s) are: {problematic_variants:?}", - ); + nexus_types::alert::AlertClass::deserialize(deserializer) + .map(Self::from) } } diff --git a/nexus/db-model/src/alert_subscription.rs b/nexus/db-model/src/alert_subscription.rs index 1f1c559d0e4..578e7bacc21 100644 --- a/nexus/db-model/src/alert_subscription.rs +++ b/nexus/db-model/src/alert_subscription.rs @@ -3,11 +3,11 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use crate::AlertClass; -use crate::AlertClassParseError; use crate::SemverVersion; use crate::typed_uuid::DbTypedUuid; use chrono::{DateTime, Utc}; use nexus_db_schema::schema::{alert_glob, alert_subscription}; +use nexus_types::alert::AlertClassParseError; use nexus_types::external_api::shared; use omicron_common::api::external::Error; use omicron_uuid_kinds::{AlertReceiverKind, AlertReceiverUuid}; @@ -81,9 +81,12 @@ impl AlertSubscriptionKind { return Ok(Self::Glob(AlertGlob { regex, glob: value })); } - let class = value.parse().map_err(|e: AlertClassParseError| { - Error::invalid_value("alert_class", e.to_string()) - })?; + let class: AlertClass = value + .parse::() + .map_err(|e: AlertClassParseError| { + Error::invalid_value("alert_class", e.to_string()) + })? 
+ .into(); if class == AlertClass::Probe { return Err(Error::invalid_value( diff --git a/nexus/db-model/src/fm.rs b/nexus/db-model/src/fm.rs index e14149ae170..353a2174aac 100644 --- a/nexus/db-model/src/fm.rs +++ b/nexus/db-model/src/fm.rs @@ -19,6 +19,8 @@ use chrono::{DateTime, Utc}; use nexus_db_schema::schema::{fm_sitrep, fm_sitrep_history}; use omicron_uuid_kinds::{CollectionKind, OmicronZoneKind, SitrepKind}; +mod alert_request; +pub use alert_request::*; mod case; pub use case::*; mod diagnosis_engine; diff --git a/nexus/db-model/src/fm/alert_request.rs b/nexus/db-model/src/fm/alert_request.rs new file mode 100644 index 00000000000..61d97100cd5 --- /dev/null +++ b/nexus/db-model/src/fm/alert_request.rs @@ -0,0 +1,53 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Fault management alert requests. + +use crate::AlertClass; +use crate::DbTypedUuid; +use nexus_db_schema::schema::fm_alert_request; +use nexus_types::fm; +use omicron_uuid_kinds::{AlertKind, CaseKind, SitrepKind}; + +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = fm_alert_request)] +pub struct AlertRequest { + pub id: DbTypedUuid, + pub sitrep_id: DbTypedUuid, + pub requested_sitrep_id: DbTypedUuid, + pub case_id: DbTypedUuid, + #[diesel(column_name = "alert_class")] + pub class: AlertClass, + pub payload: serde_json::Value, +} + +impl AlertRequest { + pub fn from_sitrep( + sitrep_id: impl Into>, + case_id: impl Into>, + req: fm::case::AlertRequest, + ) -> Self { + let fm::case::AlertRequest { id, requested_sitrep_id, payload, class } = + req; + AlertRequest { + id: id.into(), + sitrep_id: sitrep_id.into(), + requested_sitrep_id: requested_sitrep_id.into(), + case_id: case_id.into(), + class: class.into(), + payload, + } + } +} + +impl From for fm::case::AlertRequest { + fn from(req: AlertRequest) 
-> Self { + fm::case::AlertRequest { + id: req.id.into(), + requested_sitrep_id: req.requested_sitrep_id.into(), + payload: req.payload, + class: req.class.into(), + } + } +} diff --git a/nexus/db-model/src/fm/case.rs b/nexus/db-model/src/fm/case.rs index 896d3db27fd..efe5d4e5487 100644 --- a/nexus/db-model/src/fm/case.rs +++ b/nexus/db-model/src/fm/case.rs @@ -4,13 +4,14 @@ //! Fault management cases. +use super::AlertRequest; use super::DiagnosisEngine; use crate::DbTypedUuid; use crate::ereport; use nexus_db_schema::schema::{fm_case, fm_ereport_in_case}; use nexus_types::fm; use omicron_uuid_kinds::{ - CaseEreportKind, CaseKind, EreporterRestartKind, SitrepKind, SitrepUuid, + CaseEreportKind, CaseKind, EreporterRestartKind, SitrepKind, }; /// Metadata describing a fault management case. @@ -45,6 +46,31 @@ pub struct CaseMetadata { pub comment: String, } +impl CaseMetadata { + pub fn from_sitrep( + sitrep_id: impl Into>, + case: &fm::Case, + ) -> Self { + let fm::Case { + id, + created_sitrep_id, + closed_sitrep_id, + de, + comment, + alerts_requested: _, + ereports: _, + } = case; + Self { + sitrep_id: sitrep_id.into(), + id: (*id).into(), + created_sitrep_id: (*created_sitrep_id).into(), + closed_sitrep_id: closed_sitrep_id.map(Into::into), + de: (*de).into(), + comment: comment.clone(), + } + } +} + /// An association between an ereport and a case. 
#[derive(Queryable, Insertable, Clone, Debug, Selectable)] #[diesel(table_name = fm_ereport_in_case)] @@ -83,53 +109,33 @@ pub struct CaseEreport { pub comment: String, } +impl CaseEreport { + pub fn from_sitrep( + sitrep_id: impl Into>, + case_id: impl Into>, + ereport: fm::case::CaseEreport, + ) -> Self { + let fm::case::CaseEreport { id, ereport, assigned_sitrep_id, comment } = + ereport; + let restart_id = ereport.id().restart_id.into(); + let ena = ereport.id().ena.into(); + Self { + id: id.into(), + case_id: case_id.into(), + restart_id, + ena, + comment, + sitrep_id: sitrep_id.into(), + assigned_sitrep_id: assigned_sitrep_id.into(), + } + } +} + /// The complete state of a case in a particular sitrep, consisting of the /// [`CaseMetadata`] record and any other records belonging to the case. #[derive(Clone, Debug)] pub struct Case { pub metadata: CaseMetadata, pub ereports: Vec, -} - -impl Case { - pub fn from_sitrep(sitrep_id: SitrepUuid, case: fm::Case) -> Self { - let sitrep_id = sitrep_id.into(); - let case_id = case.id.into(); - let ereports = case - .ereports - .into_iter() - .map( - |fm::case::CaseEreport { - id, - ereport, - assigned_sitrep_id, - comment, - }| { - let restart_id = ereport.id().restart_id.into(); - let ena = ereport.id().ena.into(); - CaseEreport { - id: id.into(), - case_id, - restart_id, - ena, - comment, - sitrep_id, - assigned_sitrep_id: assigned_sitrep_id.into(), - } - }, - ) - .collect(); - - Self { - metadata: CaseMetadata { - id: case_id, - sitrep_id, - de: case.de.into(), - created_sitrep_id: case.created_sitrep_id.into(), - closed_sitrep_id: case.closed_sitrep_id.map(Into::into), - comment: case.comment, - }, - ereports, - } - } + pub alerts_requested: Vec, } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 1209e1c98f2..54104a33246 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, 
sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(228, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(229, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(229, "fm-alert-request"), KnownVersion::new(228, "read-only-crucible-disks"), KnownVersion::new(227, "local-storage-unencrypted-dataset"), KnownVersion::new(226, "measurement-proper-inventory"), diff --git a/nexus/db-queries/src/db/datastore/alert.rs b/nexus/db-queries/src/db/datastore/alert.rs index d0b06c33943..0843367803a 100644 --- a/nexus/db-queries/src/db/datastore/alert.rs +++ b/nexus/db-queries/src/db/datastore/alert.rs @@ -7,14 +7,15 @@ use super::DataStore; use crate::context::OpContext; use crate::db::model::Alert; -use crate::db::model::AlertClass; -use crate::db::model::AlertIdentity; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; +use diesel::result::DatabaseErrorKind; +use diesel::result::Error as DieselError; use diesel::result::OptionalExtension; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_schema::schema::alert::dsl as alert_dsl; +use nexus_types::identity::Asset; use omicron_common::api::external::CreateResult; use omicron_common::api::external::Error; use omicron_common::api::external::UpdateResult; @@ -24,23 +25,32 @@ impl DataStore { pub async fn alert_create( &self, opctx: &OpContext, - id: AlertUuid, - class: AlertClass, - payload: serde_json::Value, + alert: Alert, ) -> CreateResult { let conn = self.pool_connection_authorized(&opctx).await?; - diesel::insert_into(alert_dsl::alert) - .values(Alert { 
- identity: AlertIdentity::new(id), - time_dispatched: None, - class, - payload, - num_dispatched: 0, - }) + let alert = diesel::insert_into(alert_dsl::alert) + .values(alert) .returning(Alert::as_returning()) .get_result_async(&*conn) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| match e { + DieselError::DatabaseError( + DatabaseErrorKind::UniqueViolation, + _, + ) => Error::conflict("alert already exists"), + e => public_error_from_diesel(e, ErrorHandler::Server), + })?; + + slog::debug!( + &opctx.log, + "published alert"; + "alert_id" => ?alert.id(), + "alert_class" => %alert.class, + "alert_case_id" => ?alert.case_id, + "time_created" => ?alert.identity.time_created, + ); + + Ok(alert) } pub async fn alert_select_next_for_dispatch( diff --git a/nexus/db-queries/src/db/datastore/alert_rx.rs b/nexus/db-queries/src/db/datastore/alert_rx.rs index 65e0207213e..36a02d5902f 100644 --- a/nexus/db-queries/src/db/datastore/alert_rx.rs +++ b/nexus/db-queries/src/db/datastore/alert_rx.rs @@ -1171,6 +1171,7 @@ mod test { use super::*; use crate::authz; use crate::db::explain::ExplainableAsync; + use crate::db::model::Alert; use crate::db::pub_test_utils::TestDatabase; use nexus_db_lookup::LookupPath; use omicron_common::api::external::IdentityMetadataCreateParams; @@ -1204,14 +1205,17 @@ mod test { .expect("cant create ye webhook receiver!!!!") } - async fn create_event( + async fn create_alert( datastore: &DataStore, opctx: &OpContext, alert_class: AlertClass, - ) -> (authz::Alert, crate::db::model::Alert) { + ) -> (authz::Alert, Alert) { let id = AlertUuid::new_v4(); datastore - .alert_create(opctx, id, alert_class, serde_json::json!({})) + .alert_create( + opctx, + Alert::new(id, alert_class, serde_json::json!({})), + ) .await .expect("cant create ye event"); LookupPath::new(opctx, datastore).alert_id(id).fetch().await.expect( @@ -1456,11 +1460,11 @@ mod test { .expect("cant get ye receiver"); let (authz_foo, _) = - 
create_event(datastore, opctx, AlertClass::TestFoo).await; + create_alert(datastore, opctx, AlertClass::TestFoo).await; let (authz_foo_bar, _) = - create_event(datastore, opctx, AlertClass::TestFooBar).await; + create_alert(datastore, opctx, AlertClass::TestFooBar).await; let (authz_quux_bar, _) = - create_event(datastore, opctx, AlertClass::TestQuuxBar).await; + create_alert(datastore, opctx, AlertClass::TestQuuxBar).await; let is_subscribed_foo = datastore .alert_rx_is_subscribed_to_alert(opctx, &authz_rx, &authz_foo) diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index c9e0f0c4684..b0eb90a67c9 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -32,6 +32,7 @@ use nexus_db_errors::TransactionError; use nexus_db_errors::public_error_from_diesel; use nexus_db_lookup::DbConnection; use nexus_db_schema::schema::ereport::dsl as ereport_dsl; +use nexus_db_schema::schema::fm_alert_request::dsl as alert_req_dsl; use nexus_db_schema::schema::fm_case::dsl as case_dsl; use nexus_db_schema::schema::fm_ereport_in_case::dsl as case_ereport_dsl; use nexus_db_schema::schema::fm_sitrep::dsl as sitrep_dsl; @@ -41,6 +42,8 @@ use nexus_types::fm::Sitrep; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; +use omicron_uuid_kinds::AlertKind; +use omicron_uuid_kinds::AlertUuid; use omicron_uuid_kinds::CaseEreportKind; use omicron_uuid_kinds::CaseKind; use omicron_uuid_kinds::CaseUuid; @@ -280,8 +283,63 @@ impl DataStore { map }; + + // Next, fetch all the alert requests belonging to cases in this sitrep. + // We do this in one big batched query that's paginated by case ID, + // rather than by querying alert requests per case, since this will + // generally result in fewer queries overall (most cases will have less + // than SQL_BATCH_SIZE alerts in them). 
+ let mut alert_requests = { + let mut by_case = HashMap::< + CaseUuid, + iddqd::IdOrdMap, + >::new(); + + let mut paginator: Paginator> = + Paginator::new(SQL_BATCH_SIZE, PaginationOrder::Descending); + while let Some(p) = paginator.next() { + let batch = paginated( + alert_req_dsl::fm_alert_request, + alert_req_dsl::id, + &p.current_pagparams(), + ) + .filter(alert_req_dsl::sitrep_id.eq(id.into_untyped_uuid())) + .select(model::fm::AlertRequest::as_select()) + .load_async(conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context( + "failed to load case alert requests assignments", + ) + })?; + + paginator = p.found_batch(&batch, &|alert_req| alert_req.id); + for alert_req in batch { + let case_id = alert_req.case_id.into(); + let id: AlertUuid = alert_req.id.into(); + by_case + .entry(case_id) + .or_default() + .insert_unique(alert_req.into()) + .map_err(|_| { + let internal_message = format!( + "encountered multiple alert requests for case \ + {case_id} with the same alert UUID {id}. \ + this should really not be possible, as the \ + alert UUID is a primary key!", + ); + Error::InternalError { internal_message } + })?; + } + } + + by_case + }; + // Next, load the case metadata entries and marry them to the sets of - // ereports assigned to those cases that we loaded in the previous step. + // ereports and alert requests for to those cases that we loaded in the + // previous steps. let cases = { let mut cases = iddqd::IdOrdMap::new(); let mut paginator = @@ -317,6 +375,8 @@ impl DataStore { // case has no ereports assigned to it, so insert an empty // map here. .unwrap_or_default(); + let alerts_requested = + alert_requests.remove(&id).unwrap_or_default(); fm::Case { id, created_sitrep_id: created_sitrep_id.into(), @@ -324,6 +384,7 @@ impl DataStore { de: de.into(), comment, ereports, + alerts_requested, } })); } @@ -436,29 +497,52 @@ impl DataStore { })?; // Create case records. 
+ // + // We do this by collecting all the records for cases into big `Vec`s + // and inserting each category of case records in one big INSERT query, + // rather than doing smaller ones for each case in the sitrep. This uses + // more memory in Nexus but reduces the number of small db queries we + // perform. let mut cases = Vec::with_capacity(sitrep.cases.len()); + let mut alerts_requested = Vec::new(); + let mut case_ereports = Vec::new(); for case in sitrep.cases { - // TODO(eliza): some of this could be done in parallel using a - // `ParallelTaskSet`, if the time it takes to insert a sitrep were - // to become important? - let model::fm::Case { metadata, ereports } = - model::fm::Case::from_sitrep(sitrep_id, case); - - if !ereports.is_empty() { - diesel::insert_into(case_ereport_dsl::fm_ereport_in_case) - .values(ereports) - .execute_async(&*conn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - .internal_context(format!( - "failed to insert ereport records for case {}", - metadata.id - )) - })?; - } + let case_id = case.id; + cases.push(model::fm::CaseMetadata::from_sitrep(sitrep_id, &case)); + case_ereports.extend(case.ereports.into_iter().map(|ereport| { + model::fm::CaseEreport::from_sitrep(sitrep_id, case_id, ereport) + })); + alerts_requested.extend(case.alerts_requested.into_iter().map( + |req| { + model::fm::AlertRequest::from_sitrep( + sitrep_id, case_id, req, + ) + }, + )); + } + + if !case_ereports.is_empty() { + diesel::insert_into(case_ereport_dsl::fm_ereport_in_case) + .values(case_ereports) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context( + "failed to insert case ereport assignments", + ) + })?; + } - cases.push(metadata); + if !alerts_requested.is_empty() { + diesel::insert_into(alert_req_dsl::fm_alert_request) + .values(alerts_requested) + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) 
+ .internal_context("failed to insert alert requests") + })?; } if !cases.is_empty() { @@ -815,6 +899,7 @@ impl DataStore { struct SitrepDeleteResult { sitreps_deleted: usize, case_ereports_deleted: usize, + alert_requests_deleted: usize, cases_deleted: usize, } @@ -822,6 +907,7 @@ impl DataStore { let SitrepDeleteResult { sitreps_deleted, case_ereports_deleted, + alert_requests_deleted, cases_deleted, } = self // Sitrep deletion is transactional to prevent a sitrep from being @@ -853,6 +939,13 @@ impl DataStore { .execute_async(&conn) .await?; + // Delete case alert requests. + let alert_requests_deleted = diesel::delete( + alert_req_dsl::fm_alert_request.filter(alert_req_dsl::sitrep_id.eq_any(ids.clone())) + ) + .execute_async(&conn) + .await?; + // Delete case metadata records. let cases_deleted = diesel::delete( case_dsl::fm_case @@ -872,6 +965,7 @@ impl DataStore { Ok(SitrepDeleteResult { sitreps_deleted, cases_deleted, + alert_requests_deleted, case_ereports_deleted, }) } @@ -884,11 +978,12 @@ impl DataStore { slog::info!( &opctx.log, - "deleted {sitreps_deleted} of {} sitreps sitreps", ids.len(); + "deleted {sitreps_deleted} of {} sitreps", ids.len(); "ids" => ?ids, "sitreps_deleted" => sitreps_deleted, "cases_deleted" => cases_deleted, "case_ereports_deleted" => case_ereports_deleted, + "alert_requests_deleted" => alert_requests_deleted, ); Ok(sitreps_deleted) @@ -947,6 +1042,7 @@ mod tests { use chrono::Utc; use diesel::pg::Pg; use ereport_types; + use nexus_types::alert::AlertClass; use nexus_types::fm; use nexus_types::fm::ereport::{EreportData, Reporter}; use omicron_test_utils::dev; @@ -1455,26 +1551,49 @@ mod tests { comment, de, ereports, - } = dbg!(case); - let Some(expected) = this.cases.get(&case.id) else { - panic!("expected case {id} to exist in the original sitrep") + alerts_requested, + } = case; + let case_id = id; + let Some(expected) = this.cases.get(&case_id) else { + panic!( + "assertion failed: left == right\n \ + right sitrep contained 
case {case_id}\n \ + left sitrep contains only cases {:?}\n", + this.cases.iter().map(|case| case.id).collect::>(), + ) }; // N.B.: we must assert each bit of the case manually, as ereports // contain `time_collected` timestamps which will lose a bit of // precision when roundtripped through the database. // :( - assert_eq!(id, &expected.id); - assert_eq!(created_sitrep_id, &expected.created_sitrep_id); - assert_eq!(closed_sitrep_id, &expected.closed_sitrep_id); - assert_eq!(comment, &expected.comment); - assert_eq!(de, &expected.de); + assert_eq!(&expected.id, id, "while checking case {case_id}"); + assert_eq!( + &expected.created_sitrep_id, created_sitrep_id, + "while checking case {case_id}" + ); + assert_eq!( + &expected.closed_sitrep_id, closed_sitrep_id, + "while checking case {case_id}" + ); + assert_eq!( + &expected.comment, comment, + "while checking case {case_id}" + ); + assert_eq!(&expected.de, de, "while checking case {case_id}"); // Now, check that all the ereports are present in both cases. assert_eq!(ereports.len(), expected.ereports.len()); for expected in &expected.ereports { - let Some(ereport) = ereports.get(&expected.ereport.id()) else { + let ereport_id = expected.ereport.id(); + let Some(ereport) = ereports.get(&ereport_id) else { panic!( - "expected ereport {id} to exist in the original case" + "assertion failed: left == right (while checking case {case_id})\n \ + case in right sitrep did not contain ereport {ereport_id}\n \ + it contains only these ereports: {:?}\n", + ereports + .iter() + .map(|e| e.ereport.id().to_string()) + .collect::>(), ) }; let fm::case::CaseEreport { @@ -1482,15 +1601,35 @@ mod tests { ereport, assigned_sitrep_id, comment, - } = dbg!(ereport); - assert_eq!(id, &expected.id); + } = ereport; + assert_eq!( + &expected.id, id, + "ereport assignment IDs should be equal \ + (while checking ereport {ereport_id} in case {case_id})", + ); // This is where we go out of our way to avoid the timestamp, // btw. 
- assert_eq!(ereport.id(), expected.ereport.id()); - assert_eq!(assigned_sitrep_id, &expected.assigned_sitrep_id); - assert_eq!(comment, &expected.comment); + assert_eq!( + expected.ereport.id(), + ereport.id(), + "while checking ereport {ereport_id} in case {case_id}", + ); + assert_eq!( + &expected.assigned_sitrep_id, assigned_sitrep_id, + "while checking ereport {ereport_id} in case {case_id}", + ); + assert_eq!( + &expected.comment, comment, + "while checking ereport {ereport_id} in case {case_id}", + ); } - eprintln!(); + + // Since these don't have any timestamps in them, we can just assert + // the whole map is the same. + assert_eq!( + alerts_requested, &expected.alerts_requested, + "while checking case {case_id}" + ); } } @@ -1606,12 +1745,31 @@ mod tests { }) .unwrap(); + let mut alerts_requested = iddqd::IdOrdMap::new(); + alerts_requested + .insert_unique(fm::case::AlertRequest { + id: AlertUuid::new_v4(), + class: AlertClass::TestFoo, + payload: serde_json::json!({}), + requested_sitrep_id: sitrep_id, + }) + .unwrap(); + alerts_requested + .insert_unique(fm::case::AlertRequest { + id: AlertUuid::new_v4(), + class: AlertClass::TestFooBar, + payload: serde_json::json!({}), + requested_sitrep_id: sitrep_id, + }) + .unwrap(); + fm::Case { id: omicron_uuid_kinds::CaseUuid::new_v4(), created_sitrep_id: sitrep_id, closed_sitrep_id: None, de: fm::DiagnosisEngineKind::PowerShelf, ereports, + alerts_requested, comment: "my cool case".to_string(), } }; @@ -1626,12 +1784,24 @@ mod tests { comment: "this has something to do with case 2".to_string(), }) .unwrap(); + + let mut alerts_requested = iddqd::IdOrdMap::new(); + alerts_requested + .insert_unique(fm::case::AlertRequest { + id: AlertUuid::new_v4(), + class: AlertClass::TestQuuxBar, + payload: serde_json::json!({}), + requested_sitrep_id: sitrep_id, + }) + .unwrap(); + fm::Case { id: omicron_uuid_kinds::CaseUuid::new_v4(), created_sitrep_id: sitrep_id, closed_sitrep_id: None, de: 
fm::DiagnosisEngineKind::PowerShelf, ereports, + alerts_requested: Default::default(), comment: "break in case of emergency".to_string(), } }; @@ -1717,11 +1887,7 @@ mod tests { .expect("failed to insert second sitrep"); // Verify the sitrep, cases, and ereport assignments exist - let conn = db - .datastore() - .pool_connection_authorized(&opctx) - .await - .expect("failed to get connection"); + let conn = datastore.pool_connection_for_tests().await.unwrap(); let sitreps_before: i64 = sitrep_dsl::fm_sitrep .filter(sitrep_dsl::id.eq(sitrep_id.into_untyped_uuid())) @@ -1752,6 +1918,17 @@ mod tests { "two case ereport assignments should exist before deletion" ); + let alert_requests_before: i64 = alert_req_dsl::fm_alert_request + .filter(alert_req_dsl::sitrep_id.eq(sitrep_id.into_untyped_uuid())) + .count() + .get_result_async::(&*conn) + .await + .expect("failed to count alert requests before deletion"); + assert_eq!( + alert_requests_before, 2, + "two alert requests should exist before deletion" + ); + // Now delete the sitrep let deleted_count = datastore .fm_sitrep_delete_all(&opctx, vec![sitrep_id]) @@ -1790,6 +1967,17 @@ mod tests { "case ereport assignments should not exist after deletion" ); + let alert_requests_after: i64 = alert_req_dsl::fm_alert_request + .filter(alert_req_dsl::sitrep_id.eq(sitrep_id.into_untyped_uuid())) + .count() + .get_result_async::(&*conn) + .await + .expect("failed to count alert requests before deletion"); + assert_eq!( + alert_requests_after, 0, + "alert requests should not exist after deletion" + ); + // Clean up db.terminate().await; logctx.cleanup_successful(); diff --git a/nexus/db-queries/src/db/datastore/webhook_delivery.rs b/nexus/db-queries/src/db/datastore/webhook_delivery.rs index 7fec81761d4..2c1d0a80dea 100644 --- a/nexus/db-queries/src/db/datastore/webhook_delivery.rs +++ b/nexus/db-queries/src/db/datastore/webhook_delivery.rs @@ -448,6 +448,7 @@ impl DataStore { mod test { use super::*; use 
crate::db::explain::ExplainableAsync; + use crate::db::model; use crate::db::pagination::Paginator; use crate::db::pub_test_utils::TestDatabase; use crate::db::raw_query_builder::expectorate_query_contents; @@ -485,17 +486,17 @@ mod test { .unwrap(); let rx_id = rx.rx.identity.id.into(); let alert_id = AlertUuid::new_v4(); + let alert = model::Alert::new( + alert_id, + model::AlertClass::TestFoo, + serde_json::json!({ + "answer": 42, + }), + ); datastore - .alert_create( - &opctx, - alert_id, - AlertClass::TestFoo, - serde_json::json!({ - "answer": 42, - }), - ) + .alert_create(&opctx, alert) .await - .expect("can't create ye event"); + .expect("can't create ye alert"); let dispatch1 = WebhookDelivery::new( &alert_id, diff --git a/nexus/db-queries/tests/output/webhook_rx_list_resendable_events.sql b/nexus/db-queries/tests/output/webhook_rx_list_resendable_events.sql index 4c0795a750b..2201d6b4ea8 100644 --- a/nexus/db-queries/tests/output/webhook_rx_list_resendable_events.sql +++ b/nexus/db-queries/tests/output/webhook_rx_list_resendable_events.sql @@ -6,7 +6,8 @@ SELECT alert.time_dispatched, alert.alert_class, alert.payload, - alert.num_dispatched + alert.num_dispatched, + alert.case_id FROM alert INNER JOIN webhook_delivery AS delivery ON delivery.alert_id = alert.id WHERE diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 903c6961aa6..6d3cd9474a0 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2803,6 +2803,7 @@ table! { payload -> Jsonb, time_dispatched -> Nullable, num_dispatched -> Int8, + case_id -> Nullable, } } @@ -3166,6 +3167,17 @@ table! { allow_tables_to_appear_in_same_query!(fm_ereport_in_case, ereport); allow_tables_to_appear_in_same_query!(fm_sitrep, fm_case); +table! { + fm_alert_request (sitrep_id, id) { + id -> Uuid, + sitrep_id -> Uuid, + requested_sitrep_id -> Uuid, + case_id -> Uuid, + alert_class -> crate::enums::AlertClassEnum, + payload -> Jsonb, + } +} + table! 
{ trust_quorum_configuration (rack_id, epoch) { rack_id -> Uuid, diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index dfbe17968f5..c63a25ac884 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -179,6 +179,10 @@ fm.sitrep_load_period_secs = 15 # is activated every time the current sitrep changes. Periodic activations are # only necessary to ensure that it always happens eventually. fm.sitrep_gc_period_secs = 600 +# Updating database state to match the current sitrep is triggered whenever a +# new sitrep is loaded, so we need not set the periodic activation interval +# too high. +fm.rendezvous_period_secs = 300 probe_distributor.period_secs = 60 multicast_reconciler.period_secs = 60 # TTL for sled-to-backplane-port mapping cache diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index bab6eeb7f5f..4efa0d134c4 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -163,6 +163,10 @@ fm.sitrep_load_period_secs = 15 # is activated every time the current sitrep changes. Periodic activations are # only necessary to ensure that it always happens eventually. fm.sitrep_gc_period_secs = 600 +# Updating database state to match the current sitrep is triggered whenever a +# new sitrep is loaded, so we need not set the periodic activation interval +# too high. 
+fm.rendezvous_period_secs = 300 probe_distributor.period_secs = 60 multicast_reconciler.period_secs = 60 # TTL for sled-to-backplane-port mapping cache diff --git a/nexus/src/app/alert.rs b/nexus/src/app/alert.rs index 91f21571723..31bb900e468 100644 --- a/nexus/src/app/alert.rs +++ b/nexus/src/app/alert.rs @@ -188,18 +188,13 @@ impl Nexus { &self, opctx: &OpContext, id: AlertUuid, - class: AlertClass, + class: impl Into, event: serde_json::Value, ) -> Result { - let alert = - self.datastore().alert_create(opctx, id, class, event).await?; - slog::debug!( - &opctx.log, - "published alert"; - "alert_id" => ?id, - "alert_class" => %alert.class, - "time_created" => ?alert.identity.time_created, - ); + let alert = self + .datastore() + .alert_create(opctx, Alert::new(id, class, event)) + .await?; // Once the alert has been inserted, activate the dispatcher task to // ensure its propagated to receivers. @@ -322,7 +317,7 @@ impl Nexus { return None; } } - Some(class.into()) + Some(nexus_types::alert::AlertClass::from(class).into()) }) .take(pagparams.limit.get() as usize) .collect::>(); diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 16452d155bd..f0deb8095e8 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -104,6 +104,7 @@ use super::tasks::dns_propagation; use super::tasks::dns_servers; use super::tasks::ereport_ingester; use super::tasks::external_endpoints; +use super::tasks::fm_rendezvous::FmRendezvous; use super::tasks::fm_sitrep_gc; use super::tasks::fm_sitrep_load; use super::tasks::instance_reincarnation; @@ -261,6 +262,7 @@ impl BackgroundTasksInitializer { task_reconfigurator_config_loader: Activator::new(), task_fm_sitrep_loader: Activator::new(), task_fm_sitrep_gc: Activator::new(), + task_fm_rendezvous: Activator::new(), task_probe_distributor: Activator::new(), task_multicast_reconciler: Activator::new(), task_trust_quorum_manager: Activator::new(), @@ -350,6 +352,7 @@ impl 
BackgroundTasksInitializer { task_reconfigurator_config_loader, task_fm_sitrep_loader, task_fm_sitrep_gc, + task_fm_rendezvous, task_probe_distributor, task_multicast_reconciler, task_trust_quorum_manager, @@ -1117,6 +1120,18 @@ impl BackgroundTasksInitializer { activator: task_fm_sitrep_loader, }); + driver.register(TaskDefinition { + name: "fm_rendezvous", + description: + "updates externally visible database tables to match the \ + current fault management sitrep", + period: config.fm.rendezvous_period_secs, + task_impl: Box::new(FmRendezvous::new(datastore.clone(), sitrep_watcher.clone(), task_alert_dispatcher.clone())), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![Box::new(sitrep_watcher.clone())], + activator: task_fm_rendezvous, + }); + driver.register(TaskDefinition { name: "fm_sitrep_gc", description: "garbage collects fault management situation reports", diff --git a/nexus/src/app/background/tasks/alert_dispatcher.rs b/nexus/src/app/background/tasks/alert_dispatcher.rs index 943df03e0b7..a3d63a83a83 100644 --- a/nexus/src/app/background/tasks/alert_dispatcher.rs +++ b/nexus/src/app/background/tasks/alert_dispatcher.rs @@ -452,13 +452,13 @@ mod test { // activates the dispatcher task, and for this test, we would like to be // responsible for activating it. 
let alert_id = AlertUuid::new_v4(); + let alert = db::model::Alert::new( + alert_id, + db::model::AlertClass::TestQuuxBar, + serde_json::json!({"msg": "help im trapped in a webhook event factory"}), + ); datastore - .alert_create( - &opctx, - alert_id, - db::model::AlertClass::TestQuuxBar, - serde_json::json!({"msg": "help im trapped in a webhook event factory"}), - ) + .alert_create(&opctx, alert) .await .expect("creating the event should work"); diff --git a/nexus/src/app/background/tasks/fm_rendezvous.rs b/nexus/src/app/background/tasks/fm_rendezvous.rs new file mode 100644 index 00000000000..4d15465b84c --- /dev/null +++ b/nexus/src/app/background/tasks/fm_rendezvous.rs @@ -0,0 +1,404 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task for executing requested state changes in the fault +//! management sitrep.
+ +use crate::app::background::BackgroundTask; +use crate::app::background::tasks::fm_sitrep_load::CurrentSitrep; +use futures::future::BoxFuture; +use nexus_background_task_interface::Activator; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db; +use nexus_db_queries::db::DataStore; +use nexus_types::fm::Sitrep; +use nexus_types::fm::SitrepVersion; +use nexus_types::fm::case::AlertRequest; +use nexus_types::internal_api::background::FmAlertStats as AlertStats; +use nexus_types::internal_api::background::FmRendezvousStatus as Status; +use omicron_common::api::external::Error; +use serde_json::json; +use slog_error_chain::InlineErrorChain; +use std::sync::Arc; +use tokio::sync::watch; + +pub struct FmRendezvous { + datastore: Arc, + sitrep_watcher: watch::Receiver, + alert_dispatcher: Activator, +} + +impl BackgroundTask for FmRendezvous { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + Box::pin(async { + let status = self.actually_activate(opctx).await; + match serde_json::to_value(status) { + Ok(val) => val, + Err(err) => { + let err = format!( + "could not serialize task status: {}", + InlineErrorChain::new(&err) + ); + json!({ "error": err }) + } + } + }) + } +} + +impl FmRendezvous { + pub fn new( + datastore: Arc, + rx: watch::Receiver, + alert_dispatcher: Activator, + ) -> Self { + Self { datastore, sitrep_watcher: rx, alert_dispatcher } + } + + async fn actually_activate(&mut self, opctx: &OpContext) -> Status { + let Some(sitrep) = self.sitrep_watcher.borrow_and_update().clone() + else { + return Status::NoSitrep; + }; + + // TODO(eliza): as we start doing other things (i.e. requesting support + // bundles, updating problems), consider spawning these in their own tasks... 
+ let alerts = self.create_requested_alerts(&sitrep, opctx).await; + + Status::Executed { sitrep_id: sitrep.1.id(), alerts } + } + + async fn create_requested_alerts( + &self, + sitrep: &Arc<(SitrepVersion, Sitrep)>, + opctx: &OpContext, + ) -> AlertStats { + let (_, ref sitrep) = **sitrep; + let mut status = AlertStats::default(); + + // XXX(eliza): is it better to allocate all of these into a big array + // and do a single `INSERT INTO` query, or iterate over them one by one + // (not allocating) but insert one at a time? + for (case_id, req) in sitrep.alerts_requested() { + let &AlertRequest { id, class, requested_sitrep_id, .. } = req; + status.total_alerts_requested += 1; + if requested_sitrep_id == sitrep.id() { + status.current_sitrep_alerts_requested += 1; + } + match self + .datastore + .alert_create( + &opctx, + db::model::Alert::for_fm_alert_request(req, case_id), + ) + .await + { + // Alert already exists, that's fine. + Err(Error::Conflict { .. }) => {} + Err(e) => { + slog::warn!( + opctx.log, + "failed to create requested alert"; + "case_id" => %case_id, + "alert_id" => %id, + "alert_class" => %class, + "error" => %e, + ); + status + .errors + .push(format!("alert {id} (class: {class}): {e}")); + } + Ok(_) => status.alerts_created += 1, + } + } + + let n_errors = status.errors.len(); + if n_errors > 0 { + slog::warn!( + opctx.log, + "created {} alerts requested by the current sitrep, but \ + {n_errors} alerts could not be created!", + status.alerts_created; + "sitrep_id" => %sitrep.id(), + "total_alerts_requested" => status.total_alerts_requested, + "alerts_created" => status.alerts_created, + "errors" => n_errors, + ); + } else if status.alerts_created > 0 { + slog::info!( + opctx.log, + "created {} alerts requested by the current sitrep", + status.alerts_created; + "sitrep_id" => %sitrep.id(), + "total_alerts_requested" => status.total_alerts_requested, + "alerts_created" => status.alerts_created, + ); + } else if status.total_alerts_requested > 0 
{ + slog::debug!( + opctx.log, + "all alerts requested by the current sitrep already exist"; + "sitrep_id" => %sitrep.id(), + "total_alerts_requested" => status.total_alerts_requested, + "alerts_created" => status.alerts_created, + ); + } else { + slog::debug!( + opctx.log, + "current sitrep requests no alerts"; + "sitrep_id" => %sitrep.id(), + "total_alerts_requested" => status.total_alerts_requested, + "alerts_created" => status.alerts_created, + ); + } + + // We created some alerts, so let the alert dispatcher know. + if status.alerts_created > 0 { + self.alert_dispatcher.activate(); + } + + status + } +} + +#[cfg(test)] +mod tests { + use super::*; + use async_bb8_diesel::AsyncRunQueryDsl; + use chrono::Utc; + use diesel::prelude::*; + use nexus_db_queries::db; + use nexus_db_queries::db::pub_test_utils::TestDatabase; + use nexus_db_schema::schema::alert::dsl as alert_dsl; + use nexus_types::alert::AlertClass; + use nexus_types::fm; + use omicron_test_utils::dev; + use omicron_uuid_kinds::AlertUuid; + use omicron_uuid_kinds::CaseUuid; + use omicron_uuid_kinds::CollectionUuid; + use omicron_uuid_kinds::GenericUuid; + use omicron_uuid_kinds::OmicronZoneUuid; + use omicron_uuid_kinds::SitrepUuid; + + async fn fetch_alert( + datastore: &DataStore, + alert: AlertUuid, + ) -> Result { + let conn = datastore.pool_connection_for_tests().await.unwrap(); + let result = alert_dsl::alert + .filter(alert_dsl::id.eq(alert.into_untyped_uuid())) + .select(db::model::Alert::as_select()) + .first_async(&*conn) + .await; + dbg!(result) + } + + #[tokio::test] + async fn test_alert_requests() { + let logctx = dev::test_setup_log("test_alert_requests"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let (sitrep_tx, sitrep_rx) = watch::channel(None); + + let alert_dispatcher_activator = Activator::new(); + // We are going to check that the alert dispatcher task is activated, so + // we must actually wire it 
up first. + alert_dispatcher_activator.mark_wired_up().unwrap(); + + let mut task = FmRendezvous::new( + datastore.clone(), + sitrep_rx, + alert_dispatcher_activator.clone(), + ); + + // Initial activation should do nothing. + let status = dbg!(task.actually_activate(opctx).await); + assert_eq!(status, Status::NoSitrep); + + // Now, create a new sitrep with alert requests. + let sitrep1_id = SitrepUuid::new_v4(); + let alert1_id = AlertUuid::new_v4(); + let case1_id = CaseUuid::new_v4(); + let mut case1 = fm::Case { + id: case1_id, + created_sitrep_id: sitrep1_id, + closed_sitrep_id: None, + de: fm::DiagnosisEngineKind::PowerShelf, + alerts_requested: iddqd::IdOrdMap::new(), + ereports: iddqd::IdOrdMap::new(), + comment: "my great case".to_string(), + }; + case1 + .alerts_requested + .insert_unique(fm::case::AlertRequest { + id: alert1_id, + class: AlertClass::TestFoo, + requested_sitrep_id: sitrep1_id, + payload: serde_json::json!({}), + }) + .unwrap(); + let sitrep1 = { + let mut cases = iddqd::IdOrdMap::new(); + cases.insert_unique(case1.clone()).unwrap(); + fm::Sitrep { + metadata: fm::SitrepMetadata { + id: sitrep1_id, + inv_collection_id: CollectionUuid::new_v4(), + parent_sitrep_id: None, + creator_id: OmicronZoneUuid::new_v4(), + comment: "test sitrep 1".to_string(), + time_created: Utc::now(), + }, + cases, + } + }; + + sitrep_tx + .send(Some(Arc::new(( + fm::SitrepVersion { + id: sitrep1_id, + version: 1, + time_made_current: Utc::now(), + }, + sitrep1, + )))) + .unwrap(); + + let status = dbg!(task.actually_activate(opctx).await); + let Status::Executed { sitrep_id, alerts } = status else { + panic!("rendezvous should have executed, as there is a sitrep"); + }; + assert_eq!(sitrep_id, sitrep1_id); + assert_eq!( + alerts, + AlertStats { + total_alerts_requested: 1, + current_sitrep_alerts_requested: 1, + alerts_created: 1, + errors: Vec::new() + } + ); + let db_alert1 = fetch_alert(&datastore, alert1_id) + .await + .expect("alert1 must have been 
created"); + assert_eq!(db_alert1.class, db::model::AlertClass::TestFoo); + assert_eq!( + db_alert1.case_id.map(|id| id.into()), + Some(case1_id), + "alert1 should have case_id set to case1" + ); + + // Now, create a second sitrep sitrep with more alert requests. + let sitrep2_id = SitrepUuid::new_v4(); + let alert2_id = AlertUuid::new_v4(); + let alert3_id = AlertUuid::new_v4(); + // Make a new case with its own alert request. + let case2_id = CaseUuid::new_v4(); + let mut case2 = fm::Case { + id: case2_id, + created_sitrep_id: sitrep1_id, + closed_sitrep_id: None, + de: fm::DiagnosisEngineKind::PowerShelf, + alerts_requested: iddqd::IdOrdMap::new(), + ereports: iddqd::IdOrdMap::new(), + comment: "my other great case".to_string(), + }; + case2 + .alerts_requested + .insert_unique(fm::case::AlertRequest { + id: alert2_id, + class: AlertClass::TestFooBar, + requested_sitrep_id: sitrep2_id, + payload: serde_json::json!({}), + }) + .unwrap(); + // Also, add a second alert request to the existing case. 
+ case1 + .alerts_requested + .insert_unique(fm::case::AlertRequest { + id: alert3_id, + class: AlertClass::TestFooBaz, + requested_sitrep_id: sitrep2_id, + payload: serde_json::json!({}), + }) + .unwrap(); + let sitrep2 = { + let mut cases = iddqd::IdOrdMap::new(); + cases.insert_unique(case1.clone()).unwrap(); + cases.insert_unique(case2.clone()).unwrap(); + fm::Sitrep { + metadata: fm::SitrepMetadata { + id: sitrep2_id, + inv_collection_id: CollectionUuid::new_v4(), + parent_sitrep_id: Some(sitrep1_id), + creator_id: OmicronZoneUuid::new_v4(), + comment: "test sitrep 2".to_string(), + time_created: Utc::now(), + }, + cases, + } + }; + + sitrep_tx + .send(Some(Arc::new(( + fm::SitrepVersion { + id: sitrep2_id, + version: 2, + time_made_current: Utc::now(), + }, + sitrep2, + )))) + .unwrap(); + + let status = dbg!(task.actually_activate(opctx).await); + let Status::Executed { sitrep_id, alerts } = status else { + panic!("rendezvous should have executed, as there is a sitrep"); + }; + assert_eq!(sitrep_id, sitrep2_id); + assert_eq!( + alerts, + AlertStats { + total_alerts_requested: 3, + current_sitrep_alerts_requested: 2, + alerts_created: 2, + errors: Vec::new() + } + ); + + let db_alert1 = fetch_alert(&datastore, alert1_id) + .await + .expect("alert1 must have been created"); + assert_eq!(db_alert1.class, db::model::AlertClass::TestFoo); + assert_eq!( + db_alert1.case_id.map(|id| id.into()), + Some(case1_id), + "alert1 should have case_id set to case1" + ); + let db_alert2 = fetch_alert(&datastore, alert2_id) + .await + .expect("alert2 must have been created"); + assert_eq!(db_alert2.class, db::model::AlertClass::TestFooBar); + assert_eq!( + db_alert2.case_id.map(|id| id.into()), + Some(case2_id), + "alert2 should have case_id set to case2" + ); + let db_alert3 = fetch_alert(&datastore, alert3_id) + .await + .expect("alert3 must have been created"); + assert_eq!(db_alert3.class, db::model::AlertClass::TestFooBaz); + assert_eq!( + db_alert3.case_id.map(|id| 
id.into()), + Some(case1_id), + "alert3 should have case_id set to case1" + ); + + // Cleanup + db.terminate().await; + logctx.cleanup_successful(); + } +} diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index 8eef0f2aeab..516b8e0ca57 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -18,6 +18,7 @@ pub mod dns_propagation; pub mod dns_servers; pub mod ereport_ingester; pub mod external_endpoints; +pub mod fm_rendezvous; pub mod fm_sitrep_gc; pub mod fm_sitrep_load; pub mod instance_reincarnation; diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 93e71a8cc45..fd80de6ad54 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -200,6 +200,10 @@ fm.sitrep_load_period_secs = 15 # is activated every time the current sitrep changes. Periodic activations are # only necessary to ensure that it always happens eventually. fm.sitrep_gc_period_secs = 600 +# Updating database state to match the current sitrep is triggered whenever a +# new sitrep is loaded, so we need not set the periodic activation interval +# too high. +fm.rendezvous_period_secs = 300 probe_distributor.period_secs = 60 multicast_reconciler.period_secs = 60 # Use shorter TTLs for tests to ensure cache invalidation logic is exercised diff --git a/nexus/types/src/alert.rs b/nexus/types/src/alert.rs new file mode 100644 index 00000000000..fdc4a64d3c2 --- /dev/null +++ b/nexus/types/src/alert.rs @@ -0,0 +1,166 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Internal alert types. + +use std::fmt; + +/// Alert classes. +/// +/// This is an internal, structured representation of the list of all alert +/// classes as an enum. 
+/// +/// Note that this type is distinct from the +/// [`shared::AlertSubscription`](crate::external_api::shared::AlertSubscription) +/// type, which represents an alert *subscription* rather than a single alert +/// class. A subscription may be to a single alert class, *or* to a glob pattern +/// that matches multiple alert classes. The +/// [`external_api::views::AlertClass`](crate::external_api::views::AlertClass) +/// type represents the response message to the alert class view API and +/// contains a human-readable description of that alert class. +#[derive( + Copy, + Clone, + Debug, + PartialEq, + Eq, + strum::EnumString, + strum::Display, + strum::IntoStaticStr, + strum::VariantArray, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum( + parse_err_fn = AlertClassParseError::from_input, + parse_err_ty = AlertClassParseError, +)] +pub enum AlertClass { + // TODO(eliza): it would be really nice if these strings were all + // declared a single time, rather than twice (in both `impl_enum_type!` + // macro in the db model and here)... + #[strum(serialize = "probe")] + Probe, + #[strum(serialize = "test.foo")] + TestFoo, + #[strum(serialize = "test.foo.bar")] + TestFooBar, + #[strum(serialize = "test.foo.baz")] + TestFooBaz, + #[strum(serialize = "test.quux.bar")] + TestQuuxBar, + #[strum(serialize = "test.quux.bar.baz")] + TestQuuxBarBaz, +} + +impl AlertClass { + pub fn as_str(&self) -> &'static str { + // This is just a wrapper for the `strum::IntoStaticStr` derive. + self.into() + } + + /// Returns `true` if this event class is only used for testing and should + /// not be included in the public event class list API endpoint. + pub fn is_test(&self) -> bool { + matches!( + self, + Self::TestFoo + | Self::TestFooBar + | Self::TestFooBaz + | Self::TestQuuxBar + | Self::TestQuuxBarBaz + ) + } + + /// Returns a human-readable description string describing this event class.
+ pub fn description(&self) -> &'static str { + match self { + Self::Probe => { + "Synthetic events sent for webhook receiver liveness probes.\n\ + Receivers should return 2xx HTTP responses for these events, \ + but they should NOT be treated as notifications of an actual \ + event in the system." + } + Self::TestFoo + | Self::TestFooBar + | Self::TestFooBaz + | Self::TestQuuxBar + | Self::TestQuuxBarBaz => { + "This is a test of the emergency alert system" + } + } + } + + pub const ALL_CLASSES: &[Self] = ::VARIANTS; +} + +impl From for crate::external_api::views::AlertClass { + fn from(class: AlertClass) -> Self { + Self { + name: class.to_string(), + description: class.description().to_string(), + } + } +} + +#[derive(Debug, Eq, PartialEq)] +pub struct AlertClassParseError(()); + +impl fmt::Display for AlertClassParseError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "expected one of [")?; + let mut variants = AlertClass::ALL_CLASSES.iter(); + if let Some(v) = variants.next() { + write!(f, "{v}")?; + for v in variants { + write!(f, ", {v}")?; + } + } + f.write_str("]") + } +} + +impl AlertClassParseError { + // Strum's derive requires that this function take a &str, but we ignore + // it. + fn from_input(_: &str) -> Self { + Self(()) + } +} + +impl std::error::Error for AlertClassParseError {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_from_str_roundtrips() { + for &variant in AlertClass::ALL_CLASSES { + assert_eq!(Ok(dbg!(variant)), dbg!(variant.to_string().parse())); + } + } + + // This is mainly a regression test to ensure that, should anyone add new + // `test.` variants in future, the `AlertClass::is_test()` method + // returns `true` for them. 
+ #[test] + fn test_is_test() { + let problematic_variants = AlertClass::ALL_CLASSES + .iter() + .copied() + .filter(|variant| { + variant.as_str().starts_with("test.") && !variant.is_test() + }) + .collect::>(); + assert_eq!( + problematic_variants, + Vec::::new(), + "you have added one or more new `test.*` alert class \ + variant(s), but you seem to have not updated the \ + `AlertClass::is_test()` method!\nthe problematic \ + variant(s) are: {problematic_variants:?}", + ); + } +} diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index 2dd6387c6d4..dbfd7bd10a0 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -9,13 +9,15 @@ pub mod ereport; pub use ereport::{Ereport, EreportId}; - pub mod case; pub use case::Case; +use case::AlertRequest; use chrono::{DateTime, Utc}; use iddqd::IdOrdMap; -use omicron_uuid_kinds::{CollectionUuid, OmicronZoneUuid, SitrepUuid}; +use omicron_uuid_kinds::{ + CaseUuid, CollectionUuid, OmicronZoneUuid, SitrepUuid, +}; use serde::{Deserialize, Serialize}; /// A fault management situation report, or _sitrep_. @@ -69,6 +71,16 @@ impl Sitrep { pub fn open_cases(&self) -> impl Iterator + '_ { self.cases.iter().filter(|c| c.is_open()) } + + /// Iterate over all alerts requested by cases in this sitrep. + pub fn alerts_requested( + &self, + ) -> impl Iterator + '_ { + self.cases.iter().flat_map(|case| { + let case_id = case.id; + case.alerts_requested.iter().map(move |alert| (case_id, alert)) + }) + } } /// Metadata describing a sitrep. diff --git a/nexus/types/src/fm/case.rs b/nexus/types/src/fm/case.rs index bde7ac3f1d9..afcefd3dd93 100644 --- a/nexus/types/src/fm/case.rs +++ b/nexus/types/src/fm/case.rs @@ -2,10 +2,11 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+use crate::alert::AlertClass;
 use crate::fm::DiagnosisEngineKind;
 use crate::fm::Ereport;
 use iddqd::{IdOrdItem, IdOrdMap};
-use omicron_uuid_kinds::{CaseEreportUuid, CaseUuid, SitrepUuid};
+use omicron_uuid_kinds::{AlertUuid, CaseEreportUuid, CaseUuid, SitrepUuid};
 use serde::{Deserialize, Serialize};
 use std::fmt;
 use std::sync::Arc;
@@ -19,6 +20,7 @@ pub struct Case {
     pub de: DiagnosisEngineKind,
 
     pub ereports: IdOrdMap<CaseEreport>,
+    pub alerts_requested: IdOrdMap<AlertRequest>,
     pub comment: String,
 }
@@ -69,6 +71,23 @@ impl IdOrdItem for CaseEreport {
     iddqd::id_upcast!();
 }
 
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
+pub struct AlertRequest {
+    pub id: AlertUuid,
+    pub class: AlertClass,
+    pub payload: serde_json::Value,
+    pub requested_sitrep_id: SitrepUuid,
+}
+
+impl iddqd::IdOrdItem for AlertRequest {
+    type Key<'a> = &'a AlertUuid;
+    fn key(&self) -> Self::Key<'_> {
+        &self.id
+    }
+
+    iddqd::id_upcast!();
+}
+
 struct DisplayCase<'a> {
     case: &'a Case,
     indent: usize,
@@ -100,6 +119,7 @@ impl fmt::Display for DisplayCase<'_> {
             de,
             ereports,
             comment,
+            alerts_requested,
         },
         indent,
         sitrep_id,
@@ -190,6 +210,30 @@ impl fmt::Display for DisplayCase<'_> {
         }
     }
 
+        if !alerts_requested.is_empty() {
+            writeln!(f, "\n{:>indent$}alerts requested:", "")?;
+            writeln!(f, "{:>indent$}-----------------", "")?;
+
+            let indent = indent + 2;
+            for AlertRequest { id, class, payload: _, requested_sitrep_id } in
+                alerts_requested.iter()
+            {
+                const CLASS: &str = "class:";
+                const REQUESTED_IN: &str = "requested in:";
+
+                const WIDTH: usize = const_max_len(&[CLASS, REQUESTED_IN]);
+
+                writeln!(f, "{BULLET:>indent$}alert {id}",)?;
+                writeln!(f, "{:>indent$}{CLASS:indent$}{REQUESTED_IN:, }
-/// The status of a `fm_sitrep_execution` background task activation.
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
-pub enum SitrepExecutionStatus {
+pub enum FmRendezvousStatus {
     NoSitrep,
-    Executed { sitrep_id: SitrepUuid, alerts: SitrepAlertRequestStatus },
+    Executed { sitrep_id: SitrepUuid, alerts: FmAlertStats },
 }
 
 #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
-pub struct SitrepAlertRequestStatus {
+pub struct FmAlertStats {
     /// The total number of alerts requested by the current sitrep.
     pub total_alerts_requested: usize,
     /// The total number of alerts which were *first* requested in the current sitrep.
diff --git a/nexus/types/src/lib.rs b/nexus/types/src/lib.rs
index f96185a2b1d..20747abef52 100644
--- a/nexus/types/src/lib.rs
+++ b/nexus/types/src/lib.rs
@@ -29,6 +29,7 @@
 //! rules, so our model layer knows about our views. That seems to be a
 //! relatively minor offense, so it's the way we leave things for now.
 
+pub mod alert;
 pub mod authn;
 pub mod deployment;
 pub mod external_api;
diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql
index 0fc4520b3ae..63be483471c 100644
--- a/schema/crdb/dbinit.sql
+++ b/schema/crdb/dbinit.sql
@@ -6791,6 +6791,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.alert (
     -- The number of receivers that this alart was dispatched to.
     num_dispatched INT8 NOT NULL,
 
+    -- The ID of the fault management case that created this alert, if any.
+    case_id UUID,
+
     CONSTRAINT time_dispatched_set_if_dispatched CHECK (
         (num_dispatched = 0) OR (time_dispatched IS NOT NULL)
     ),
@@ -7334,6 +7337,45 @@ CREATE INDEX IF NOT EXISTS lookup_ereports_assigned_to_fm_case
 ON omicron.public.fm_ereport_in_case (sitrep_id, case_id);
 
+-- Alerts requested by a fault management sitrep.
+--
+-- These represent the list of alerts which the fault management system would
+-- like to have created as of a given sitrep. If (and only if) the sitrep is
+-- made current, then the `fm_rendezvous` background task will create
+-- corresponding entries in the `omicron.public.alert` table.
The primary
+-- key for these alerts (the alert UUID) is provided by the alert request,
+-- preventing the same alert from being created multiple times.
+--
+-- Records in this table are inserted when a sitrep is created, and are deleted
+-- when the sitrep corresponding to the alert request record's `sitrep_id` is
+-- deleted. However, alert requests are carried forward into subsequent sitreps
+-- for as long as the fault management case containing them exists, so that the
+-- fault management system may track which events within a case it has already
+-- requested alerts for.
+CREATE TABLE IF NOT EXISTS omicron.public.fm_alert_request (
+    -- Requested alert UUID
+    id UUID NOT NULL,
+    -- UUID of the current sitrep that this alert request record is part of.
+    --
+    -- Note that this is *not* the sitrep in which the alert was requested.
+    sitrep_id UUID NOT NULL,
+    -- UUID of the original sitrep in which the alert was first requested.
+    requested_sitrep_id UUID NOT NULL,
+    -- UUID of the case to which this alert request belongs.
+    case_id UUID NOT NULL,
+
+    -- The class of alert that was requested
+    alert_class omicron.public.alert_class NOT NULL,
+    -- Actual alert data. The structure of this depends on the alert class.
+    payload JSONB NOT NULL,
+
+    PRIMARY KEY (sitrep_id, id)
+);
+
+CREATE INDEX IF NOT EXISTS
+    lookup_fm_alert_requests_for_case
+ON omicron.public.fm_alert_request (sitrep_id, case_id);
+
 /*
  * List of datasets available to be sliced up and passed to VMMs for encrypted
  * instance local storage.
@@ -8173,7 +8215,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '228.0.0', NULL) + (TRUE, NOW(), NOW(), '229.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/fm-alert-request/up1.sql b/schema/crdb/fm-alert-request/up1.sql new file mode 100644 index 00000000000..81205b01c86 --- /dev/null +++ b/schema/crdb/fm-alert-request/up1.sql @@ -0,0 +1,17 @@ +CREATE TABLE IF NOT EXISTS omicron.public.fm_alert_request ( + -- Requested alert UUID + id UUID NOT NULL, + -- UUID of the sitrep in which the alert is requested. + sitrep_id UUID NOT NULL, + -- UUID of the sitrep in which the alert request was created. + requested_sitrep_id UUID NOT NULL, + -- UUID of the case to which this alert request belongs. + case_id UUID NOT NULL, + + -- The class of alert that was requested + alert_class omicron.public.alert_class NOT NULL, + -- Actual alert data. The structure of this depends on the alert class. + payload JSONB NOT NULL, + + PRIMARY KEY (sitrep_id, id) +); diff --git a/schema/crdb/fm-alert-request/up2.sql b/schema/crdb/fm-alert-request/up2.sql new file mode 100644 index 00000000000..49262355d51 --- /dev/null +++ b/schema/crdb/fm-alert-request/up2.sql @@ -0,0 +1,3 @@ +CREATE INDEX IF NOT EXISTS + lookup_fm_alert_requests_for_case +ON omicron.public.fm_alert_request (sitrep_id, case_id); diff --git a/schema/crdb/fm-alert-request/up3.sql b/schema/crdb/fm-alert-request/up3.sql new file mode 100644 index 00000000000..d271eba3fd6 --- /dev/null +++ b/schema/crdb/fm-alert-request/up3.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.alert + ADD COLUMN IF NOT EXISTS case_id UUID; diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 96b0281db1d..dfef3031a92 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -107,6 +107,10 @@ fm.sitrep_load_period_secs = 15 # is activated every time the current sitrep changes. 
Periodic activations are
# only necessary to ensure that it always happens eventually.
fm.sitrep_gc_period_secs = 600
+# Updating database state to match the current sitrep is triggered whenever a
+# new sitrep is loaded, so we need not set the periodic activation interval
+# too low.
+fm.rendezvous_period_secs = 300
 probe_distributor.period_secs = 60
 multicast_reconciler.period_secs = 60
 trust_quorum.period_secs = 60
diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml
index 07d847b8ae0..45529059c40 100644
--- a/smf/nexus/single-sled/config-partial.toml
+++ b/smf/nexus/single-sled/config-partial.toml
@@ -107,6 +107,10 @@ fm.sitrep_load_period_secs = 15
 # is activated every time the current sitrep changes. Periodic activations are
 # only necessary to ensure that it always happens eventually.
 fm.sitrep_gc_period_secs = 600
+# Updating database state to match the current sitrep is triggered whenever a
+# new sitrep is loaded, so we need not set the periodic activation interval
+# too low.
+fm.rendezvous_period_secs = 300
 probe_distributor.period_secs = 60
 trust_quorum.period_secs = 60
 multicast_reconciler.period_secs = 60