From 1ede6a9b28638705ed3cc2ec61ad0e33b06d4e4b Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 11 Dec 2025 14:40:43 -0500 Subject: [PATCH 01/27] add expunged_and_unreferenced --- nexus/types/src/deployment/planning_input.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index a2af81e5972..3238b9b17e5 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -145,6 +145,9 @@ pub struct PlanningInput { /// Generation number in the blueprint, which triggers active Nexuses to /// quiesce. not_yet_nexus_zones: BTreeSet, + + /// TODO-john + expunged_and_unreferenced: BTreeSet, } impl PlanningInput { @@ -358,6 +361,7 @@ impl PlanningInput { .ignore_impossible_mgs_updates_since, active_nexus_zones: self.active_nexus_zones, not_yet_nexus_zones: self.not_yet_nexus_zones, + expunged_and_unreferenced: self.expunged_and_unreferenced, } } } @@ -1598,6 +1602,7 @@ pub struct PlanningInputBuilder { ignore_impossible_mgs_updates_since: DateTime, active_nexus_zones: BTreeSet, not_yet_nexus_zones: BTreeSet, + expunged_and_unreferenced: BTreeSet, } impl PlanningInputBuilder { @@ -1620,6 +1625,7 @@ impl PlanningInputBuilder { - MGS_UPDATE_SETTLE_TIMEOUT, active_nexus_zones: BTreeSet::new(), not_yet_nexus_zones: BTreeSet::new(), + expunged_and_unreferenced: BTreeSet::new(), } } @@ -1742,6 +1748,7 @@ impl PlanningInputBuilder { .ignore_impossible_mgs_updates_since, active_nexus_zones: self.active_nexus_zones, not_yet_nexus_zones: self.not_yet_nexus_zones, + expunged_and_unreferenced: self.expunged_and_unreferenced, } } } From 73da04c60ce47e4d697cff3696774314e2402bb6 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 13 Jan 2026 14:09:09 -0500 Subject: [PATCH 02/27] input/builder helper methods --- nexus/types/src/deployment.rs | 8 ++++ nexus/types/src/deployment/planning_input.rs | 47 +++++++++++++++++--- 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 99f739e2da8..3b023ff8915 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -858,6 +858,14 @@ pub enum BlueprintExpungedZoneAccessReason { /// (Moving them to "ready for cleanup" is a _prerequisite_ for pruning.) PlannerCheckReadyForCleanup, + /// When constructing a [`PlanningInput`], its builder has a guard that any + /// "expunged and unreferenced" zone ID actually is expunged. + /// + /// This guard is implemented by asking the parent blueprint for its list of + /// expunged zones. The planner does not need to account for this when + /// pruning zones. + PlanningInputExpungedZoneGuard, + // -------------------------------------------------------------------- // Catch-all variants for non-production callers. The planner does not need // to account for these when pruning. 
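As a rough sketch of how the new `PlanningInputExpungedZoneGuard` variant is meant to be consumed (the shape mirrors the builder change in the next diff; the function name and standalone form here are illustrative only, and the `ZoneRunningStatus::Shutdown` filter is what a later patch in this series tightens the guard to):

use nexus_types::deployment::Blueprint;
use nexus_types::deployment::BlueprintExpungedZoneAccessReason;
use nexus_types::deployment::ZoneRunningStatus;
use omicron_uuid_kinds::OmicronZoneUuid;

fn zone_is_expunged_in_parent(
    parent_blueprint: &Blueprint,
    zone_id: OmicronZoneUuid,
) -> bool {
    // Walk the parent blueprint's expunged zones, citing the new access
    // reason, and check whether the candidate zone ID is among them.
    parent_blueprint
        .expunged_zones(
            ZoneRunningStatus::Shutdown,
            BlueprintExpungedZoneAccessReason::PlanningInputExpungedZoneGuard,
        )
        .any(|(_sled_id, zone)| zone.id == zone_id)
}
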
diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index 3238b9b17e5..fe1d17dc535 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -7,10 +7,12 @@ use super::AddNetworkResourceError; use super::Blueprint; +use super::BlueprintExpungedZoneAccessReason; use super::BlueprintZoneImageSource; use super::OmicronZoneExternalIp; use super::OmicronZoneNetworkResources; use super::OmicronZoneNic; +use super::ZoneRunningStatus; use crate::deployment::PlannerConfig; use crate::external_api::views::PhysicalDiskPolicy; use crate::external_api::views::PhysicalDiskState; @@ -147,7 +149,8 @@ pub struct PlanningInput { not_yet_nexus_zones: BTreeSet, /// TODO-john - expunged_and_unreferenced: BTreeSet, + /// add blippy: any zone id here must be expunged in `parent_blueprint` + expunged_and_unreferenced_zones: BTreeSet, } impl PlanningInput { @@ -344,6 +347,12 @@ impl PlanningInput { self.ignore_impossible_mgs_updates_since } + pub fn expunged_and_unreferenced_zones( + &self, + ) -> &BTreeSet { + &self.expunged_and_unreferenced_zones + } + /// Convert this `PlanningInput` back into a [`PlanningInputBuilder`] /// /// This is primarily useful for tests that want to mutate an existing @@ -361,7 +370,8 @@ impl PlanningInput { .ignore_impossible_mgs_updates_since, active_nexus_zones: self.active_nexus_zones, not_yet_nexus_zones: self.not_yet_nexus_zones, - expunged_and_unreferenced: self.expunged_and_unreferenced, + expunged_and_unreferenced_zones: self + .expunged_and_unreferenced_zones, } } } @@ -1587,6 +1597,8 @@ pub enum PlanningInputBuildError { }, #[error("sled not found: {0}")] SledNotFound(SledUuid), + #[error("attempted to mark non-expunged zone as expunged: {0}")] + ZoneNotExpunged(OmicronZoneUuid), } /// Constructor for [`PlanningInput`]. @@ -1602,7 +1614,7 @@ pub struct PlanningInputBuilder { ignore_impossible_mgs_updates_since: DateTime, active_nexus_zones: BTreeSet, not_yet_nexus_zones: BTreeSet, - expunged_and_unreferenced: BTreeSet, + expunged_and_unreferenced_zones: BTreeSet, } impl PlanningInputBuilder { @@ -1625,7 +1637,7 @@ impl PlanningInputBuilder { - MGS_UPDATE_SETTLE_TIMEOUT, active_nexus_zones: BTreeSet::new(), not_yet_nexus_zones: BTreeSet::new(), - expunged_and_unreferenced: BTreeSet::new(), + expunged_and_unreferenced_zones: BTreeSet::new(), } } @@ -1735,6 +1747,30 @@ impl PlanningInputBuilder { self.not_yet_nexus_zones = not_yet_nexus_zones; } + pub fn insert_expunged_and_unreferenced_zone( + &mut self, + zone_id: OmicronZoneUuid, + ) -> Result<(), PlanningInputBuildError> { + use BlueprintExpungedZoneAccessReason::PlanningInputExpungedZoneGuard; + + // We have no way to confirm that this zone is "unreferenced" - that's a + // property of the system at large, mostly CRDB state - but we can + // confirm that it's expunged by looking at the parent blueprint. 
+ if !self + .parent_blueprint + .expunged_zones( + ZoneRunningStatus::Any, + PlanningInputExpungedZoneGuard, + ) + .any(|(_sled_id, zone)| zone.id == zone_id) + { + return Err(PlanningInputBuildError::ZoneNotExpunged(zone_id)); + } + + self.expunged_and_unreferenced_zones.insert(zone_id); + Ok(()) + } + pub fn build(self) -> PlanningInput { PlanningInput { parent_blueprint: self.parent_blueprint, @@ -1748,7 +1784,8 @@ impl PlanningInputBuilder { .ignore_impossible_mgs_updates_since, active_nexus_zones: self.active_nexus_zones, not_yet_nexus_zones: self.not_yet_nexus_zones, - expunged_and_unreferenced: self.expunged_and_unreferenced, + expunged_and_unreferenced_zones: self + .expunged_and_unreferenced_zones, } } } From 4a4f0a04b8fdb626e0c481c517fac22fa09c6b7c Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 14 Jan 2026 16:18:20 -0500 Subject: [PATCH 03/27] initial collecting "expunged and unreferenced" --- .../src/expunged_and_unreferenced.rs | 259 ++++++++++++++++++ nexus/reconfigurator/preparation/src/lib.rs | 25 ++ nexus/types/src/deployment.rs | 8 + 3 files changed, 292 insertions(+) create mode 100644 nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs new file mode 100644 index 00000000000..fb73715fb02 --- /dev/null +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -0,0 +1,259 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Helpers for identifying when expunged zones are no longer referenced in the +//! database. 
+ +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::deployment::Blueprint; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; +use nexus_types::deployment::BlueprintZoneType; +use nexus_types::deployment::ZoneRunningStatus; +use nexus_types::deployment::blueprint_zone_type; +use omicron_common::api::external::Error; +use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::OmicronZoneUuid; +use std::cell::OnceCell; +use std::collections::BTreeSet; +use std::net::IpAddr; + +pub(super) async fn find_expunged_and_unreferenced_zones( + opctx: &OpContext, + datastore: &DataStore, + parent_blueprint: &Blueprint, + external_ip_rows: &[nexus_db_model::ExternalIp], + service_nic_rows: &[nexus_db_model::ServiceNetworkInterface], +) -> Result, Error> { + let mut expunged_and_unreferenced = BTreeSet::new(); + + static_check_all_reasons_handled( + BlueprintExpungedZoneAccessReason::PlanningInputDetermineUnreferenced, + ); + + let bp_refs = BlueprintReferencesCache::new(parent_blueprint); + + for (_, zone) in parent_blueprint.expunged_zones( + ZoneRunningStatus::Shutdown, + BlueprintExpungedZoneAccessReason::PlanningInputDetermineUnreferenced, + ) { + let is_referenced = match &zone.zone_type { + BlueprintZoneType::BoundaryNtp(boundary_ntp) => { + is_boundary_ntp_referenced(boundary_ntp, &bp_refs) + || is_external_ip_referenced(zone.id, external_ip_rows) + || is_service_nic_referenced(zone.id, service_nic_rows) + } + BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) => { + is_multinode_clickhouse_referenced(zone.id, parent_blueprint) + } + BlueprintZoneType::ExternalDns(external_dns) => { + is_external_dns_referenced(external_dns, &bp_refs) + || is_external_ip_referenced(zone.id, external_ip_rows) + || is_service_nic_referenced(zone.id, service_nic_rows) + } + BlueprintZoneType::Nexus(nexus) => { + todo!("john-{nexus:?}") + } + BlueprintZoneType::Oximeter(_) => { + is_oximeter_referenced(opctx, datastore, zone.id).await? + } + BlueprintZoneType::CockroachDb(_) => { + // BlueprintExpungedZoneAccessReason::CockroachDecommission + // means we consider cockroach zones referenced until the + // cockroach cluster has decommissioned the node that was + // present in that zone; however, we don't currently + // decommission cockroach nodes (tracked by + // ). We + // therefore always consider cockroach nodes "still referenced". + true + } + + // These zone types currently have no associated + // `BlueprintExpungedZoneAccessReason`; there is no cleanup action + // required for them, so they're considered "unreferenced" and may + // be pruned as soon as they've been expunged. 
+ BlueprintZoneType::Clickhouse(_) + | BlueprintZoneType::Crucible(_) + | BlueprintZoneType::CruciblePantry(_) + | BlueprintZoneType::InternalDns(_) + | BlueprintZoneType::InternalNtp(_) => false, + }; + + if !is_referenced { + expunged_and_unreferenced.insert(zone.id); + } + } + + Ok(expunged_and_unreferenced) +} + +fn is_external_ip_referenced( + zone_id: OmicronZoneUuid, + external_ip_rows: &[nexus_db_model::ExternalIp], +) -> bool { + let zone_id = zone_id.into_untyped_uuid(); + external_ip_rows.iter().any(|row| row.parent_id == Some(zone_id)) +} + +fn is_service_nic_referenced( + zone_id: OmicronZoneUuid, + service_nic_rows: &[nexus_db_model::ServiceNetworkInterface], +) -> bool { + let zone_id = zone_id.into_untyped_uuid(); + service_nic_rows.iter().any(|row| row.service_id == zone_id) +} + +fn is_boundary_ntp_referenced( + boundary_ntp: &blueprint_zone_type::BoundaryNtp, + bp_refs: &BlueprintReferencesCache<'_>, +) -> bool { + // Check BlueprintExpungedZoneAccessReason::BoundaryNtpUpstreamConfig; if + // this zone's upstream config is not covered by an in-service zone, then + // it's still "referenced" (in that the planner needs to refer to it to set + // up a new boundary NTP zone). + let expunged_config = BoundaryNtpUpstreamConfig::new(boundary_ntp); + bp_refs + .in_service_boundary_ntp_upstream_configs() + .contains(&expunged_config) +} + +fn is_multinode_clickhouse_referenced( + zone_id: OmicronZoneUuid, + parent_blueprint: &Blueprint, +) -> bool { + // Check BlueprintExpungedZoneAccessReason::ClickhouseKeeperServerConfigIps; + // if this zone is still present in the clickhouse cluster config, it's + // still referenced. + let Some(clickhouse_config) = &parent_blueprint.clickhouse_cluster_config + else { + return false; + }; + + clickhouse_config.keepers.contains_key(&zone_id) + || clickhouse_config.servers.contains_key(&zone_id) +} + +fn is_external_dns_referenced( + external_dns: &blueprint_zone_type::ExternalDns, + bp_refs: &BlueprintReferencesCache<'_>, +) -> bool { + // Check BlueprintExpungedZoneAccessReason::ExternalDnsExternalIps; we + // consider an external DNS zone "still referenced" if its IP is _not_ + // assigned to an in-service external DNS zone. (If the IP _is_ assigned to + // an in-service external DNS zone, the expunged zone is no longer + // referenced and can be safely pruned.) + let expunged_zone_ip = external_dns.dns_address.addr.ip(); + !bp_refs.in_service_external_dns_ips().contains(&expunged_zone_ip) +} + +async fn is_oximeter_referenced( + opctx: &OpContext, + datastore: &DataStore, + zone_id: OmicronZoneUuid, +) -> Result { + // Check + // BlueprintExpungedZoneAccessReason::OximeterExpungeAndReassignProducers: + // this zone ID should not refer to an in-service Oximeter collector, and it + // should have no producers assigned to it. + match datastore.oximeter_lookup(opctx, zone_id.as_untyped_uuid()).await { + Ok(_info) => { + // If the lookup succeeded, we haven't yet performed the necessary + // cleanup to mark this oximeter as expunged. 
+ return Ok(true); + } + Err(err) => { + todo!("john") + } + } +} + +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] +struct BoundaryNtpUpstreamConfig<'a> { + ntp_servers: &'a [String], + dns_servers: &'a [IpAddr], +} + +impl<'a> BoundaryNtpUpstreamConfig<'a> { + fn new(config: &'a blueprint_zone_type::BoundaryNtp) -> Self { + Self { + ntp_servers: &config.ntp_servers, + dns_servers: &config.dns_servers, + } + } +} + +struct BlueprintReferencesCache<'a> { + parent_blueprint: &'a Blueprint, + in_service_boundary_ntp_upstream_configs: + OnceCell>>, + in_service_external_dns_ips: OnceCell>, +} + +impl<'a> BlueprintReferencesCache<'a> { + fn new(parent_blueprint: &'a Blueprint) -> Self { + Self { + parent_blueprint, + in_service_boundary_ntp_upstream_configs: OnceCell::new(), + in_service_external_dns_ips: OnceCell::new(), + } + } + + fn in_service_boundary_ntp_upstream_configs( + &self, + ) -> &BTreeSet> { + OnceCell::get_or_init( + &self.in_service_boundary_ntp_upstream_configs, + || { + self.parent_blueprint + .in_service_zones() + .filter_map(|(_, zone)| match &zone.zone_type { + BlueprintZoneType::BoundaryNtp(config) => { + Some(BoundaryNtpUpstreamConfig::new(config)) + } + _ => None, + }) + .collect() + }, + ) + } + + fn in_service_external_dns_ips(&self) -> &BTreeSet { + OnceCell::get_or_init(&self.in_service_external_dns_ips, || { + self.parent_blueprint + .in_service_zones() + .filter_map(|(_, zone)| match &zone.zone_type { + BlueprintZoneType::ExternalDns(config) => { + Some(config.dns_address.addr.ip()) + } + _ => None, + }) + .collect() + }) + } +} + +// TODO-john +fn static_check_all_reasons_handled(reason: BlueprintExpungedZoneAccessReason) { + match reason { + BlueprintExpungedZoneAccessReason::BoundaryNtpUpstreamConfig => {}, + BlueprintExpungedZoneAccessReason::ClickhouseKeeperServerConfigIps => {}, + BlueprintExpungedZoneAccessReason::CockroachDecommission => {}, + BlueprintExpungedZoneAccessReason::DeallocateExternalNetworkingResources => {}, + BlueprintExpungedZoneAccessReason::ExternalDnsExternalIps => {}, + BlueprintExpungedZoneAccessReason::NexusDeleteMetadataRecord => {}, + BlueprintExpungedZoneAccessReason::NexusExternalConfig => {}, + BlueprintExpungedZoneAccessReason::NexusSelfIsQuiescing => {}, + BlueprintExpungedZoneAccessReason::NexusSagaReassignment => {}, + BlueprintExpungedZoneAccessReason::NexusSupportBundleReassign => {}, + BlueprintExpungedZoneAccessReason::OximeterExpungeAndReassignProducers => {}, + BlueprintExpungedZoneAccessReason::PlannerCheckReadyForCleanup => {}, + BlueprintExpungedZoneAccessReason::PlanningInputDetermineUnreferenced => {}, + BlueprintExpungedZoneAccessReason::PlanningInputExpungedZoneGuard => {}, + BlueprintExpungedZoneAccessReason::Blippy => {}, + BlueprintExpungedZoneAccessReason::Omdb => {}, + BlueprintExpungedZoneAccessReason::ReconfiguratorCli => {}, + BlueprintExpungedZoneAccessReason::Test => {}, + } +} diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index 05fd714d993..0c8f94bbe7c 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -63,6 +63,8 @@ use std::collections::BTreeSet; use std::net::IpAddr; use std::sync::Arc; +mod expunged_and_unreferenced; + /// Given various pieces of database state that go into the blueprint planning /// process, produce a `PlanningInput` object encapsulating what the planner /// needs to generate a blueprint @@ -92,6 +94,7 @@ pub struct PlanningInputFromDb<'a> { pub planner_config: 
PlannerConfig, pub active_nexus_zones: BTreeSet, pub not_yet_nexus_zones: BTreeSet, + pub expunged_and_unreferenced_zones: BTreeSet, pub log: &'a Logger, } @@ -269,6 +272,15 @@ impl PlanningInputFromDb<'_> { active_nexus_zones.into_iter().map(|n| n.nexus_id()).collect(); let not_yet_nexus_zones = not_yet_nexus_zones.into_iter().map(|n| n.nexus_id()).collect(); + let expunged_and_unreferenced_zones = + expunged_and_unreferenced::find_expunged_and_unreferenced_zones( + opctx, + datastore, + &parent_blueprint, + &external_ip_rows, + &service_nic_rows, + ) + .await?; let planning_input = PlanningInputFromDb { parent_blueprint, @@ -297,6 +309,7 @@ impl PlanningInputFromDb<'_> { planner_config, active_nexus_zones, not_yet_nexus_zones, + expunged_and_unreferenced_zones, } .build() .internal_context("assembling planning_input")?; @@ -456,6 +469,18 @@ impl PlanningInputFromDb<'_> { })?; } + for &zone_id in &self.expunged_and_unreferenced_zones { + builder.insert_expunged_and_unreferenced_zone(zone_id).map_err( + |e| { + Error::internal_error(&format!( + "unexpectedly failed to add expunged and unreferenced \ + zone ID {zone_id} to planning input: {}", + InlineErrorChain::new(&e), + )) + }, + )?; + } + Ok(builder.build()) } } diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 3b023ff8915..0e39710ff95 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -858,6 +858,14 @@ pub enum BlueprintExpungedZoneAccessReason { /// (Moving them to "ready for cleanup" is a _prerequisite_ for pruning.) PlannerCheckReadyForCleanup, + /// When constructing a [`PlanningInput`], it needs to loop over the + /// blueprint's expunged zones in order to know which zones it needs to + /// check for references, so that it can assemble the "expunged and + /// unreferenced" zone IDs. + /// + /// The planner does not need to account for this when pruning zones. + PlanningInputDetermineUnreferenced, + /// When constructing a [`PlanningInput`], its builder has a guard that any /// "expunged and unreferenced" zone ID actually is expunged. /// From 97d3b233973f12027bd624b17e9820dd4f95cc98 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 16 Jan 2026 13:32:34 -0500 Subject: [PATCH 04/27] flesh out oximeter reference check --- .../src/expunged_and_unreferenced.rs | 46 ++++++++++++++++--- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index fb73715fb02..81064c809cb 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -4,6 +4,8 @@ //! Helpers for identifying when expunged zones are no longer referenced in the //! database. +//! +//! 
TODO-john more explanation use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; @@ -12,12 +14,15 @@ use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::ZoneRunningStatus; use nexus_types::deployment::blueprint_zone_type; +use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; +use omicron_common::api::external::PaginationOrder; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; use std::cell::OnceCell; use std::collections::BTreeSet; use std::net::IpAddr; +use std::num::NonZeroU32; pub(super) async fn find_expunged_and_unreferenced_zones( opctx: &OpContext, @@ -53,8 +58,10 @@ pub(super) async fn find_expunged_and_unreferenced_zones( || is_external_ip_referenced(zone.id, external_ip_rows) || is_service_nic_referenced(zone.id, service_nic_rows) } - BlueprintZoneType::Nexus(nexus) => { - todo!("john-{nexus:?}") + BlueprintZoneType::Nexus(_) => { + is_nexus_referenced(opctx, datastore, zone.id).await? + || is_external_ip_referenced(zone.id, external_ip_rows) + || is_service_nic_referenced(zone.id, service_nic_rows) } BlueprintZoneType::Oximeter(_) => { is_oximeter_referenced(opctx, datastore, zone.id).await? @@ -148,6 +155,14 @@ fn is_external_dns_referenced( !bp_refs.in_service_external_dns_ips().contains(&expunged_zone_ip) } +async fn is_nexus_referenced( + opctx: &OpContext, + datastore: &DataStore, + zone_id: OmicronZoneUuid, +) -> Result { + unimplemented!() +} + async fn is_oximeter_referenced( opctx: &OpContext, datastore: &DataStore, @@ -157,16 +172,35 @@ async fn is_oximeter_referenced( // BlueprintExpungedZoneAccessReason::OximeterExpungeAndReassignProducers: // this zone ID should not refer to an in-service Oximeter collector, and it // should have no producers assigned to it. - match datastore.oximeter_lookup(opctx, zone_id.as_untyped_uuid()).await { - Ok(_info) => { + match datastore.oximeter_lookup(opctx, zone_id.as_untyped_uuid()).await? { + Some(_info) => { // If the lookup succeeded, we haven't yet performed the necessary // cleanup to mark this oximeter as expunged. return Ok(true); } - Err(err) => { - todo!("john") + None => { + // Oximeter has been expunged (or was never inserted in the first + // place); fall through to check whether there are any producers + // assigned to it. } } + + // Ask for a page with a single item; all we care about is whether _any_ + // producers are assigned to this oximeter. + let assigned_producers = datastore + .producers_list_by_oximeter_id( + opctx, + zone_id.into_untyped_uuid(), + &DataPageParams { + marker: None, + direction: PaginationOrder::Ascending, + limit: NonZeroU32::new(1).expect("1 is not 0"), + }, + ) + .await?; + + // This oximeter is referenced if our set of assigned producers is nonempty. 
+ Ok(!assigned_producers.is_empty()) } #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] From c2eb0e33c03f536de2ced66c74823601fc41fbee Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 27 Jan 2026 11:35:06 -0500 Subject: [PATCH 05/27] flesh out is_nexus_referenced() --- nexus/db-queries/src/db/datastore/saga.rs | 48 ++++++--- .../src/expunged_and_unreferenced.rs | 101 ++++++++++++++++-- nexus/types/src/deployment.rs | 6 +- 3 files changed, 130 insertions(+), 25 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/saga.rs b/nexus/db-queries/src/db/datastore/saga.rs index ccc6a92b27b..bc279b16dd2 100644 --- a/nexus/db-queries/src/db/datastore/saga.rs +++ b/nexus/db-queries/src/db/datastore/saga.rs @@ -19,9 +19,11 @@ use nexus_auth::context::OpContext; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_model::SagaState; +use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; +use uuid::Uuid; use std::ops::Add; impl DataStore { @@ -130,6 +132,27 @@ impl DataStore { } } + /// Returns a single page of unfinished sagas assigned to SEC `sec_id`. + pub async fn saga_list_recovery_candidates( + &self, + opctx: &OpContext, + sec_id: db::saga_types::SecId, + pagparams: &DataPageParams<'_, Uuid>, + ) -> Result, Error> { + use nexus_db_schema::schema::saga::dsl; + let conn = self.pool_connection_authorized(opctx).await?; + paginated(dsl::saga, dsl::id, pagparams) + .filter( + dsl::saga_state.eq_any(SagaState::RECOVERY_CANDIDATE_STATES), + ) + .filter(dsl::current_sec.eq(sec_id)) + .select(db::saga_types::Saga::as_select()) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// many queries as needed (in batches) to get them all /// Returns a list of unfinished sagas assigned to SEC `sec_id`, making as /// many queries as needed (in batches) to get them all pub async fn saga_list_recovery_candidates_batched( @@ -142,25 +165,16 @@ impl DataStore { SQL_BATCH_SIZE, dropshot::PaginationOrder::Ascending, ); - let conn = self.pool_connection_authorized(opctx).await?; while let Some(p) = paginator.next() { - use nexus_db_schema::schema::saga::dsl; - - let mut batch = - paginated(dsl::saga, dsl::id, &p.current_pagparams()) - .filter( - dsl::saga_state - .eq_any(SagaState::RECOVERY_CANDIDATE_STATES), - ) - .filter(dsl::current_sec.eq(sec_id)) - .select(db::saga_types::Saga::as_select()) - .load_async(&*conn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - })?; + let mut batch = self + .saga_list_recovery_candidates( + opctx, + sec_id, + &p.current_pagparams(), + ) + .await?; - paginator = p.found_batch(&batch, &|row| row.id); + paginator = p.found_batch(&batch, &|row| row.id.0.0); sagas.append(&mut batch); } Ok(sagas) diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index 81064c809cb..c1d81865b80 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -7,6 +7,7 @@ //! //! 
TODO-john more explanation +use nexus_db_model::SupportBundleState; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::deployment::Blueprint; @@ -74,6 +75,16 @@ pub(super) async fn find_expunged_and_unreferenced_zones( // decommission cockroach nodes (tracked by // ). We // therefore always consider cockroach nodes "still referenced". + // + // This shouldn't be a huge deal in practice; Cockroach zones + // are updated in place, not by an expunge/add pair, so a + // typical update does not produce an expunged Cockroach zone + // that needs pruning. Only expunging a disk or sled can produce + // an expunged Cockroach node, and we expect the number of those + // to remain relatively small for any given deployment. + // Hopefully we can revisit decommissioning Cockroach nodes long + // before we need to worry about the amount of garbage leftover + // from expunged disks/sleds. true } @@ -135,6 +146,8 @@ fn is_multinode_clickhouse_referenced( // still referenced. let Some(clickhouse_config) = &parent_blueprint.clickhouse_cluster_config else { + // If there is no clickhouse cluster config at all, the zone isn't + // referenced in it! return false; }; @@ -160,7 +173,79 @@ async fn is_nexus_referenced( datastore: &DataStore, zone_id: OmicronZoneUuid, ) -> Result { - unimplemented!() + // Check BlueprintExpungedZoneAccessReason::NexusDeleteMetadataRecord: is + // this Nexus zone still present in the `db_metadata_nexus` table? + if !datastore + .database_nexus_access_all(opctx, &BTreeSet::from([zone_id])) + .await? + .is_empty() + { + return Ok(true); + } + + // Check BlueprintExpungedZoneAccessReason::NexusSagaReassignment: does + // this Nexus zone still have unfinished sagas assigned? + if !datastore + .saga_list_recovery_candidates( + opctx, + zone_id.into(), + &single_item_pagparams(), + ) + .await? + .is_empty() + { + return Ok(true); + } + + // This is a no-op match that exists solely to ensure we update our logic if + // the possible support bundle states change. We need to query for any + // support bundle assigned to the `zone_id` Nexus in any state that might + // require cleanup work; currently, that means "any state other than + // `Failed`". + // + // If updating this match, you must also ensure you update the query below! + match SupportBundleState::Active { + SupportBundleState::Collecting + | SupportBundleState::Active + | SupportBundleState::Destroying + | SupportBundleState::Failing => { + // We need to query for these states. + } + SupportBundleState::Failed => { + // The sole state we don't care about. + } + } + + // Check BlueprintExpungedZoneAccessReason::NexusSupportBundleReassign: does + // this Nexus zone still have support bundles assigned to it in any state + // that requires cleanup work? This requires explicitly listing the states + // we care about; the no-op match statement above will hopefully keep this + // in sync with any changes to the enum. + if !datastore + .support_bundle_list_assigned_to_nexus( + opctx, + &single_item_pagparams(), + zone_id, + vec![ + SupportBundleState::Collecting, + SupportBundleState::Active, + SupportBundleState::Destroying, + SupportBundleState::Failing, + ], + ) + .await? 
+ .is_empty() + { + return Ok(true); + } + + // These Nexus-related zone access reasons are documented as "planner does + // not need to account for this", so we don't check anything: + // + // * BlueprintExpungedZoneAccessReason::NexusExternalConfig + // * BlueprintExpungedZoneAccessReason::NexusSelfIsQuiescing + + Ok(false) } async fn is_oximeter_referenced( @@ -191,11 +276,7 @@ async fn is_oximeter_referenced( .producers_list_by_oximeter_id( opctx, zone_id.into_untyped_uuid(), - &DataPageParams { - marker: None, - direction: PaginationOrder::Ascending, - limit: NonZeroU32::new(1).expect("1 is not 0"), - }, + &single_item_pagparams(), ) .await?; @@ -203,6 +284,14 @@ async fn is_oximeter_referenced( Ok(!assigned_producers.is_empty()) } +fn single_item_pagparams() -> DataPageParams<'static, T> { + DataPageParams { + marker: None, + direction: PaginationOrder::Ascending, + limit: NonZeroU32::new(1).expect("1 is not 0"), + } +} + #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] struct BoundaryNtpUpstreamConfig<'a> { ntp_servers: &'a [String], diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 0e39710ff95..1d9843b714b 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -818,8 +818,10 @@ pub enum BlueprintExpungedZoneAccessReason { /// Carrying forward the external Nexus configuration provided by the /// operator during rack setup; see [`Blueprint::operator_nexus_config()`]. /// - /// The planner must not prune a Nexus zone if it's the last zone - /// remaining with the set of configuration. + /// The planner does not need to account for this when pruning Nexus zones. + /// (The planner runs _inside Nexus_, which guarantees a Nexus exists that + /// is not ready for cleanup, which guarantees there's still a Nexus present + /// in the blueprint with the external Nexus configuration.) NexusExternalConfig, /// Nexus needs to whether it itself should be quiescing. 
If the From 5cad9b21dee334219ea9443f6370e987274040c0 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 27 Jan 2026 12:47:46 -0500 Subject: [PATCH 06/27] docs on static_check_all_reasons_handled() --- .../src/expunged_and_unreferenced.rs | 110 ++++++++++++------ 1 file changed, 76 insertions(+), 34 deletions(-) diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index c1d81865b80..d9435470a1a 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -47,8 +47,11 @@ pub(super) async fn find_expunged_and_unreferenced_zones( let is_referenced = match &zone.zone_type { BlueprintZoneType::BoundaryNtp(boundary_ntp) => { is_boundary_ntp_referenced(boundary_ntp, &bp_refs) - || is_external_ip_referenced(zone.id, external_ip_rows) - || is_service_nic_referenced(zone.id, service_nic_rows) + || is_external_networking_referenced( + zone.id, + external_ip_rows, + service_nic_rows, + ) } BlueprintZoneType::ClickhouseKeeper(_) | BlueprintZoneType::ClickhouseServer(_) => { @@ -56,13 +59,19 @@ pub(super) async fn find_expunged_and_unreferenced_zones( } BlueprintZoneType::ExternalDns(external_dns) => { is_external_dns_referenced(external_dns, &bp_refs) - || is_external_ip_referenced(zone.id, external_ip_rows) - || is_service_nic_referenced(zone.id, service_nic_rows) + || is_external_networking_referenced( + zone.id, + external_ip_rows, + service_nic_rows, + ) } BlueprintZoneType::Nexus(_) => { is_nexus_referenced(opctx, datastore, zone.id).await? - || is_external_ip_referenced(zone.id, external_ip_rows) - || is_service_nic_referenced(zone.id, service_nic_rows) + || is_external_networking_referenced( + zone.id, + external_ip_rows, + service_nic_rows, + ) } BlueprintZoneType::Oximeter(_) => { is_oximeter_referenced(opctx, datastore, zone.id).await? @@ -107,20 +116,18 @@ pub(super) async fn find_expunged_and_unreferenced_zones( Ok(expunged_and_unreferenced) } -fn is_external_ip_referenced( +fn is_external_networking_referenced( zone_id: OmicronZoneUuid, external_ip_rows: &[nexus_db_model::ExternalIp], -) -> bool { - let zone_id = zone_id.into_untyped_uuid(); - external_ip_rows.iter().any(|row| row.parent_id == Some(zone_id)) -} - -fn is_service_nic_referenced( - zone_id: OmicronZoneUuid, service_nic_rows: &[nexus_db_model::ServiceNetworkInterface], ) -> bool { + // Check + // BlueprintExpungedZoneAccessReason::DeallocateExternalNetworkingResources; + // if this zone's external IP or NIC are still present in the DB, then it's + // the zone is still referenced. let zone_id = zone_id.into_untyped_uuid(); - service_nic_rows.iter().any(|row| row.service_id == zone_id) + external_ip_rows.iter().any(|row| row.parent_id == Some(zone_id)) + || service_nic_rows.iter().any(|row| row.service_id == zone_id) } fn is_boundary_ntp_referenced( @@ -357,26 +364,61 @@ impl<'a> BlueprintReferencesCache<'a> { } } -// TODO-john +// This is a no-op function that exists solely to ensure this file is considered +// when there are any changes to `BlueprintExpungedZoneAccessReason` variants. +// +// If you're adding a new variant and that variant affects whether it's safe to +// prune an expunged zone from the blueprint, you _must_ also update the code +// above to check for whatever your new variant is. If your new variant does not +// affect pruning, put it in one of the appropriate sections below (or a new one +// with a relevant comment). 
fn static_check_all_reasons_handled(reason: BlueprintExpungedZoneAccessReason) { + // Help rustfmt out (with the full enum name it gives up on formatting). + use BlueprintExpungedZoneAccessReason as Reason; + match reason { - BlueprintExpungedZoneAccessReason::BoundaryNtpUpstreamConfig => {}, - BlueprintExpungedZoneAccessReason::ClickhouseKeeperServerConfigIps => {}, - BlueprintExpungedZoneAccessReason::CockroachDecommission => {}, - BlueprintExpungedZoneAccessReason::DeallocateExternalNetworkingResources => {}, - BlueprintExpungedZoneAccessReason::ExternalDnsExternalIps => {}, - BlueprintExpungedZoneAccessReason::NexusDeleteMetadataRecord => {}, - BlueprintExpungedZoneAccessReason::NexusExternalConfig => {}, - BlueprintExpungedZoneAccessReason::NexusSelfIsQuiescing => {}, - BlueprintExpungedZoneAccessReason::NexusSagaReassignment => {}, - BlueprintExpungedZoneAccessReason::NexusSupportBundleReassign => {}, - BlueprintExpungedZoneAccessReason::OximeterExpungeAndReassignProducers => {}, - BlueprintExpungedZoneAccessReason::PlannerCheckReadyForCleanup => {}, - BlueprintExpungedZoneAccessReason::PlanningInputDetermineUnreferenced => {}, - BlueprintExpungedZoneAccessReason::PlanningInputExpungedZoneGuard => {}, - BlueprintExpungedZoneAccessReason::Blippy => {}, - BlueprintExpungedZoneAccessReason::Omdb => {}, - BlueprintExpungedZoneAccessReason::ReconfiguratorCli => {}, - BlueprintExpungedZoneAccessReason::Test => {}, + // Checked by is_boundary_ntp_referenced() + Reason::BoundaryNtpUpstreamConfig => {} + + // Checked by is_multinode_clickhouse_referenced() + Reason::ClickhouseKeeperServerConfigIps => {} + + // NOT CHECKED: find_expunged_and_unreferenced_zones() will never + // consider a cockroach node "unreferenced", because we have currently + // disabled decommissioning (see + // https://github.com/oxidecomputer/omicron/issues/8447). 
+ Reason::CockroachDecommission => {} + + // Checked by is_external_networking_referenced(), which is called for + // each zone type with external networking (boundary NTP, external DNS, + // Nexus) + Reason::DeallocateExternalNetworkingResources => {} + + // Checked by is_external_dns_referenced() + Reason::ExternalDnsExternalIps => {} + + // Each of these are checked by is_nexus_referenced() + Reason::NexusDeleteMetadataRecord + | Reason::NexusSagaReassignment + | Reason::NexusSupportBundleReassign => {} + + // Checked by is_oximeter_referenced() + Reason::OximeterExpungeAndReassignProducers => {} + + // Nexus-related reasons that don't need to be checked (see + // `BlueprintExpungedZoneAccessReason` for specifics) + Reason::NexusExternalConfig | Reason::NexusSelfIsQuiescing => {} + + // Planner-related reasons that don't need to be checked (see + // `BlueprintExpungedZoneAccessReason` for specifics) + Reason::PlannerCheckReadyForCleanup + | Reason::PlanningInputDetermineUnreferenced + | Reason::PlanningInputExpungedZoneGuard => {} + + // Test / development reasons that don't need to be checked + Reason::Blippy + | Reason::Omdb + | Reason::ReconfiguratorCli + | Reason::Test => {} } } From 1315325f411265e3e245cc7dbc218ec67ab2d5c0 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 27 Jan 2026 15:18:31 -0500 Subject: [PATCH 07/27] cleanup and comments --- .../src/expunged_and_unreferenced.rs | 25 ++++++++++++++++++- nexus/types/src/deployment/planning_input.rs | 23 ++++++++++++++--- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index d9435470a1a..40d896c0aa5 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -5,7 +5,30 @@ //! Helpers for identifying when expunged zones are no longer referenced in the //! database. //! -//! TODO-john more explanation +//! When a zone is expunged, the expunged zone is still present in the +//! blueprint. Two high-level conditions must be satisfied before it's safe to +//! prune an expunged zone from the blueprint (i.e., delete it entirely): +//! +//! 1. The sled-agent responsible for running the zone must confirm the zone is +//! no longer running and that it's ledgered an `OmicronSledConfig` with a +//! generation past the point in which the zone was expunged. This guarantees +//! the zone will never run again. This is tracked in the blueprint as the +//! `ready_for_cleanup` field inside the expunged zone disposition, and can +//! be queried by asking for expunged zones with the +//! `ZoneRunningStatus::Shutdown` state. +//! 2. Any cleanup work that operates on expunged zones must be complete. This +//! is zone-type-specific. Some zone types have no cleanup work at all and +//! can be pruned as soon as the first condition is satisfied. Others have +//! multiple, disparate cleanup work, all of which must be completed. +//! +//! This module is primarily considered with checking the second condition. The +//! [`BlueprintExpungedZoneAccessReason`] enum tracks a variant for every reason +//! a caller wants to access the expunged zones of a blueprint, including all +//! known cleanup actions. For each zone type, if the zone-type-specific cleanup +//! work is complete, we included the zone ID in the "expunged and unreferenced" +//! zone set in the `PlanningInput`. The planner can cheaply act on this set: +//! 
for every zone ID present, it can safely prune it from the blueprint (i.e., +//! do not include it in the child blueprint it emits). use nexus_db_model::SupportBundleState; use nexus_db_queries::context::OpContext; diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index fe1d17dc535..3da996d12b1 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -148,8 +148,14 @@ pub struct PlanningInput { /// quiesce. not_yet_nexus_zones: BTreeSet, - /// TODO-john - /// add blippy: any zone id here must be expunged in `parent_blueprint` + /// IDs of zones that are: + /// + /// * Expunged + /// * Confirmed shutdown and will never restart + /// * Not referenced elsewhere in Cockroach + /// + /// and can therefore be pruned from the blueprint, because any potential + /// cleanup required from their expungement has completed. expunged_and_unreferenced_zones: BTreeSet, } @@ -1747,6 +1753,14 @@ impl PlanningInputBuilder { self.not_yet_nexus_zones = not_yet_nexus_zones; } + /// Insert a zone that is expunged and unreferenced. + /// + /// # Errors + /// + /// Fails if this zone: + /// + /// * Does not exist in the parent blueprint + /// * Exists in the parent blueprint but is not expunged and shutdown pub fn insert_expunged_and_unreferenced_zone( &mut self, zone_id: OmicronZoneUuid, @@ -1755,11 +1769,12 @@ impl PlanningInputBuilder { // We have no way to confirm that this zone is "unreferenced" - that's a // property of the system at large, mostly CRDB state - but we can - // confirm that it's expunged by looking at the parent blueprint. + // confirm that it's expunged and ready for cleanup by looking at the + // parent blueprint. if !self .parent_blueprint .expunged_zones( - ZoneRunningStatus::Any, + ZoneRunningStatus::Shutdown, PlanningInputExpungedZoneGuard, ) .any(|(_sled_id, zone)| zone.id == zone_id) From c80d7e6661ec9db6fec0f997be36cb8f5cc98853 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 27 Jan 2026 15:19:52 -0500 Subject: [PATCH 08/27] cargo fmt --- nexus/db-queries/src/db/datastore/saga.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/saga.rs b/nexus/db-queries/src/db/datastore/saga.rs index bc279b16dd2..2809dd6b902 100644 --- a/nexus/db-queries/src/db/datastore/saga.rs +++ b/nexus/db-queries/src/db/datastore/saga.rs @@ -23,8 +23,8 @@ use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; -use uuid::Uuid; use std::ops::Add; +use uuid::Uuid; impl DataStore { pub async fn saga_create( From 70a506fd9c0b4e616286d7865529357b0fdb45e9 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 30 Jan 2026 10:34:33 -0500 Subject: [PATCH 09/27] comments --- nexus/db-queries/src/db/datastore/saga.rs | 1 - .../src/expunged_and_unreferenced.rs | 48 ++++++++++++------- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/saga.rs b/nexus/db-queries/src/db/datastore/saga.rs index 2809dd6b902..86145ec1c47 100644 --- a/nexus/db-queries/src/db/datastore/saga.rs +++ b/nexus/db-queries/src/db/datastore/saga.rs @@ -152,7 +152,6 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } - /// many queries as needed (in batches) to get them all /// Returns a list of unfinished sagas assigned to SEC `sec_id`, making as /// many queries as needed (in 
batches) to get them all pub async fn saga_list_recovery_candidates_batched( diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index 40d896c0aa5..c5aa064cbf3 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -5,30 +5,33 @@ //! Helpers for identifying when expunged zones are no longer referenced in the //! database. //! -//! When a zone is expunged, the expunged zone is still present in the +//! When a zone is first expunged, the expunged zone initially remains in the //! blueprint. Two high-level conditions must be satisfied before it's safe to //! prune an expunged zone from the blueprint (i.e., delete it entirely): //! -//! 1. The sled-agent responsible for running the zone must confirm the zone is -//! no longer running and that it's ledgered an `OmicronSledConfig` with a -//! generation past the point in which the zone was expunged. This guarantees -//! the zone will never run again. This is tracked in the blueprint as the -//! `ready_for_cleanup` field inside the expunged zone disposition, and can -//! be queried by asking for expunged zones with the -//! `ZoneRunningStatus::Shutdown` state. +//! 1. We must know the zone is not running and will never run again. The +//! typical case of this confirmation is that the sled-agent responsible for +//! running the zone has confirmed the zone is no longer running and that +//! it's ledgered an `OmicronSledConfig` with a generation past the point in +//! which the zone was expunged. The uncommon case of this confirmation is +//! that the sled where this zone ran has been expunged. //! 2. Any cleanup work that operates on expunged zones must be complete. This //! is zone-type-specific. Some zone types have no cleanup work at all and //! can be pruned as soon as the first condition is satisfied. Others have -//! multiple, disparate cleanup work, all of which must be completed. +//! multiple, disparate cleanup steps, all of which must be completed. //! -//! This module is primarily considered with checking the second condition. The -//! [`BlueprintExpungedZoneAccessReason`] enum tracks a variant for every reason -//! a caller wants to access the expunged zones of a blueprint, including all -//! known cleanup actions. For each zone type, if the zone-type-specific cleanup -//! work is complete, we included the zone ID in the "expunged and unreferenced" -//! zone set in the `PlanningInput`. The planner can cheaply act on this set: -//! for every zone ID present, it can safely prune it from the blueprint (i.e., -//! do not include it in the child blueprint it emits). +//! The first condition is tracked in the blueprint as the `ready_for_cleanup` +//! field inside the expunged zone disposition. We query for it below by asking +//! for expunged zones with the `ZoneRunningStatus::Shutdown` state. +//! +//! This bulk of this module is concerned with checking the second condition. +//! The [`BlueprintExpungedZoneAccessReason`] enum tracks a variant for every +//! reason a caller wants to access the expunged zones of a blueprint, including +//! all known cleanup actions. For each zone type, if the zone-type-specific +//! cleanup work is complete, we included the zone ID in the "expunged and +//! unreferenced" zone set in the `PlanningInput`. The planner can cheaply act +//! on this set: for every zone ID present, it can safely prune it from the +//! 
blueprint (i.e., do not include it in the child blueprint it emits). use nexus_db_model::SupportBundleState; use nexus_db_queries::context::OpContext; @@ -48,6 +51,17 @@ use std::collections::BTreeSet; use std::net::IpAddr; use std::num::NonZeroU32; +/// Find all Omicron zones within `parent_blueprint` that can be safely pruned +/// by a future run of the planner. +/// +/// A zone ID contained in the returned set satisfies both conditions for +/// pruning: +/// +/// 1. We know the zone is not running and will not run again. +/// 2. Any cleanup work required after the zone has been expunged has been +/// completed. +/// +/// See this module's documentation for more details. pub(super) async fn find_expunged_and_unreferenced_zones( opctx: &OpContext, datastore: &DataStore, From b5bf8952278e1af31f67bc944184d1fc7aefef73 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 30 Jan 2026 11:32:35 -0500 Subject: [PATCH 10/27] WIP: initial runtime reason coverage checking --- Cargo.lock | 1 + nexus/reconfigurator/preparation/Cargo.toml | 1 + .../src/expunged_and_unreferenced.rs | 180 ++++++++++++------ nexus/types/src/deployment.rs | 2 +- 4 files changed, 124 insertions(+), 60 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ea320d4f6b7..79c53c970fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7416,6 +7416,7 @@ dependencies = [ "sled-hardware-types", "slog", "slog-error-chain", + "strum 0.27.2", ] [[package]] diff --git a/nexus/reconfigurator/preparation/Cargo.toml b/nexus/reconfigurator/preparation/Cargo.toml index 87ee75f2f2c..de87c03947f 100644 --- a/nexus/reconfigurator/preparation/Cargo.toml +++ b/nexus/reconfigurator/preparation/Cargo.toml @@ -23,5 +23,6 @@ sled-agent-types.workspace = true sled-hardware-types.workspace = true slog.workspace = true slog-error-chain.workspace = true +strum.workspace = true omicron-workspace-hack.workspace = true diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index c5aa064cbf3..989b62e0568 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -50,6 +50,7 @@ use std::cell::OnceCell; use std::collections::BTreeSet; use std::net::IpAddr; use std::num::NonZeroU32; +use strum::IntoEnumIterator; /// Find all Omicron zones within `parent_blueprint` that can be safely pruned /// by a future run of the planner. @@ -71,9 +72,16 @@ pub(super) async fn find_expunged_and_unreferenced_zones( ) -> Result, Error> { let mut expunged_and_unreferenced = BTreeSet::new(); - static_check_all_reasons_handled( - BlueprintExpungedZoneAccessReason::PlanningInputDetermineUnreferenced, - ); + // It's critically important we confirm all known reasons for accessing + // expunged zones, as most of them are related to cleanup that has to happen + // (and that we have to confirm is complete!). + // `BlueprintExpungedZoneAccessReasonChecker` has both static and runtime + // components; for the runtime check, we initialize it here, tell it of + // particular reasons we check as we do so below, then call + // `assert_all_reasons_checked()` at the end, which will panic if we've + // missed any. Our checking is unconditional, so this should trip in tests + // if it's going to trip, and never in production. 
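Condensed, the flow that comment describes looks roughly like this (the checker type and both methods are defined further down in this patch; the per-check `add_reason_checked` call sites are not yet wired up at this point in the series, so the example reason is illustrative):

// 1. Initialize once at the top of find_expunged_and_unreferenced_zones()
//    (the next line of this hunk does exactly this).
// 2. Record each reason as the corresponding check runs, e.g.:
reasons_checked.add_reason_checked(
    BlueprintExpungedZoneAccessReason::BoundaryNtpUpstreamConfig,
);
// 3. After the loop over expunged zones, fail loudly if any reason known to
//    the enum was never recorded:
reasons_checked.assert_all_reasons_checked();
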
+ let mut reasons_checked = BlueprintExpungedZoneAccessReasonChecker::new(); let bp_refs = BlueprintReferencesCache::new(parent_blueprint); @@ -150,6 +158,8 @@ pub(super) async fn find_expunged_and_unreferenced_zones( } } + reasons_checked.assert_all_reasons_checked(); + Ok(expunged_and_unreferenced) } @@ -401,61 +411,113 @@ impl<'a> BlueprintReferencesCache<'a> { } } -// This is a no-op function that exists solely to ensure this file is considered -// when there are any changes to `BlueprintExpungedZoneAccessReason` variants. -// -// If you're adding a new variant and that variant affects whether it's safe to -// prune an expunged zone from the blueprint, you _must_ also update the code -// above to check for whatever your new variant is. If your new variant does not -// affect pruning, put it in one of the appropriate sections below (or a new one -// with a relevant comment). -fn static_check_all_reasons_handled(reason: BlueprintExpungedZoneAccessReason) { - // Help rustfmt out (with the full enum name it gives up on formatting). - use BlueprintExpungedZoneAccessReason as Reason; - - match reason { - // Checked by is_boundary_ntp_referenced() - Reason::BoundaryNtpUpstreamConfig => {} - - // Checked by is_multinode_clickhouse_referenced() - Reason::ClickhouseKeeperServerConfigIps => {} - - // NOT CHECKED: find_expunged_and_unreferenced_zones() will never - // consider a cockroach node "unreferenced", because we have currently - // disabled decommissioning (see - // https://github.com/oxidecomputer/omicron/issues/8447). - Reason::CockroachDecommission => {} - - // Checked by is_external_networking_referenced(), which is called for - // each zone type with external networking (boundary NTP, external DNS, - // Nexus) - Reason::DeallocateExternalNetworkingResources => {} - - // Checked by is_external_dns_referenced() - Reason::ExternalDnsExternalIps => {} - - // Each of these are checked by is_nexus_referenced() - Reason::NexusDeleteMetadataRecord - | Reason::NexusSagaReassignment - | Reason::NexusSupportBundleReassign => {} - - // Checked by is_oximeter_referenced() - Reason::OximeterExpungeAndReassignProducers => {} - - // Nexus-related reasons that don't need to be checked (see - // `BlueprintExpungedZoneAccessReason` for specifics) - Reason::NexusExternalConfig | Reason::NexusSelfIsQuiescing => {} - - // Planner-related reasons that don't need to be checked (see - // `BlueprintExpungedZoneAccessReason` for specifics) - Reason::PlannerCheckReadyForCleanup - | Reason::PlanningInputDetermineUnreferenced - | Reason::PlanningInputExpungedZoneGuard => {} - - // Test / development reasons that don't need to be checked - Reason::Blippy - | Reason::Omdb - | Reason::ReconfiguratorCli - | Reason::Test => {} +/// Helper type to ensure we've covered every +/// [`BlueprintExpungedZoneAccessReason`] in our checks above. +/// +/// This type has both compile-time (the `match` in `new()`) and runtime (the +/// `assert_all_reasons_checked()` function) guards that confirm we cover any +/// new variants added to [`BlueprintExpungedZoneAccessReason`] in the future. +struct BlueprintExpungedZoneAccessReasonChecker { + reasons_checked: BTreeSet, +} + +impl BlueprintExpungedZoneAccessReasonChecker { + fn new() -> Self { + // Help rustfmt out (with the full enum name it gives up on formatting). + use BlueprintExpungedZoneAccessReason as Reason; + + let mut reasons_checked = BTreeSet::new(); + + for reason in Reason::iter() { + // This match exists for two purposes: + // + // 1. 
Force a compilation error if a new variant is added, leading + // you to this module and this comment. + // 2. Seed `reasons_checked` with reasons we _don't_ explicitly + // check, because they're documented as "they don't actually need + // to be checked". + // + // If you're in case 1 and you've added a new variant to + // `BlueprintExpungedZoneAccessReason`, you must update the code in + // this module. Either update the match below to add the reason to + // the group of "doesn't need to be checked" cases, or update the + // checks above for zone-specific reasons and add an appropriate + // call to `add_reason_checked()`. + match reason { + // Checked by is_boundary_ntp_referenced() + Reason::BoundaryNtpUpstreamConfig => {} + + // Checked by is_multinode_clickhouse_referenced() + Reason::ClickhouseKeeperServerConfigIps => {} + + // TODO-john FIXME + // NOT CHECKED: find_expunged_and_unreferenced_zones() will + // never consider a cockroach node "unreferenced", because we + // have currently disabled decommissioning (see + // https://github.com/oxidecomputer/omicron/issues/8447). + Reason::CockroachDecommission => {} + + // TODO-john FIXME + // Checked by is_external_networking_referenced(), which is + // called for each zone type with external networking (boundary + // NTP, external DNS, Nexus) + Reason::DeallocateExternalNetworkingResources => {} + + // Checked by is_external_dns_referenced() + Reason::ExternalDnsExternalIps => {} + + // Each of these are checked by is_nexus_referenced() + Reason::NexusDeleteMetadataRecord + | Reason::NexusSagaReassignment + | Reason::NexusSupportBundleReassign => {} + + // Checked by is_oximeter_referenced() + Reason::OximeterExpungeAndReassignProducers => {} + + // Nexus-related reasons that don't need to be checked (see + // `BlueprintExpungedZoneAccessReason` for specifics) + Reason::NexusExternalConfig + | Reason::NexusSelfIsQuiescing + + // Planner-related reasons that don't need to be checked (see + // `BlueprintExpungedZoneAccessReason` for specifics) + |Reason::PlannerCheckReadyForCleanup + | Reason::PlanningInputDetermineUnreferenced + | Reason::PlanningInputExpungedZoneGuard + + // Test / development reasons that don't need to be checked + | Reason::Blippy + | Reason::Omdb + | Reason::ReconfiguratorCli + | Reason::Test => { + reasons_checked.insert(reason); + } + } + } + + Self { reasons_checked } + } + + fn add_reason_checked( + &mut self, + reason: BlueprintExpungedZoneAccessReason, + ) { + self.reasons_checked.insert(reason); + } + + fn assert_all_reasons_checked(self) { + let mut unchecked = BTreeSet::new(); + + for reason in BlueprintExpungedZoneAccessReason::iter() { + if !self.reasons_checked.contains(&reason) { + unchecked.insert(reason); + } + } + + assert!( + unchecked.is_empty(), + "Correctness error: Planning input construction failed to \ + consider some `BlueprintExpungedZoneAccessReason`s: {unchecked:?}" + ); } } diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 1d9843b714b..5fdf735095d 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -763,7 +763,7 @@ pub enum ZoneRunningStatus { /// 1. Add a new variant to this enum. /// 2. Update the planner to account for it, to prevent the planner from pruning /// the zone before whatever your use of it is completed. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, EnumIter)] pub enum BlueprintExpungedZoneAccessReason { // -------------------------------------------------------------------- // Zone-kind-specific variants. Keep this sorted alphabetically, prefix From fe3d0bfe1bdac4c8005d99a21a94a647cb17d5f7 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 30 Jan 2026 13:56:29 -0500 Subject: [PATCH 11/27] wip --- Cargo.lock | 4 ++ nexus/reconfigurator/preparation/Cargo.toml | 6 +++ .../src/expunged_and_unreferenced.rs | 46 +++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 79c53c970fa..5745c1dba82 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7406,9 +7406,12 @@ dependencies = [ "futures", "nexus-db-model", "nexus-db-queries", + "nexus-reconfigurator-planning", + "nexus-test-utils", "nexus-types", "omicron-common", "omicron-rpaths", + "omicron-test-utils", "omicron-uuid-kinds", "omicron-workspace-hack", "pq-sys", @@ -7417,6 +7420,7 @@ dependencies = [ "slog", "slog-error-chain", "strum 0.27.2", + "tokio", ] [[package]] diff --git a/nexus/reconfigurator/preparation/Cargo.toml b/nexus/reconfigurator/preparation/Cargo.toml index de87c03947f..f8c4a8cadc2 100644 --- a/nexus/reconfigurator/preparation/Cargo.toml +++ b/nexus/reconfigurator/preparation/Cargo.toml @@ -26,3 +26,9 @@ slog-error-chain.workspace = true strum.workspace = true omicron-workspace-hack.workspace = true + +[dev-dependencies] +nexus-reconfigurator-planning.workspace = true +nexus-test-utils.workspace = true +omicron-test-utils.workspace = true +tokio.workspace = true diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index 989b62e0568..9f087ec6a1e 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -521,3 +521,49 @@ impl BlueprintExpungedZoneAccessReasonChecker { ); } } + +#[cfg(test)] +mod tests { + use super::*; + use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; + use nexus_reconfigurator_planning::planner::PlannerRng; + use nexus_test_utils::db::TestDatabase; + use omicron_test_utils::dev; + use omicron_test_utils::dev::LogContext; + use std::sync::Arc; + use std::sync::LazyLock; + + static EMPTY_BLUEPRINT: LazyLock = LazyLock::new(|| { + BlueprintBuilder::build_empty_seeded( + "test", + PlannerRng::from_seed("empty blueprint for tests"), + ) + }); + + // Helper to reduce boilerplate in the tests below. This sets up a blueprint + // builder with an empty parent blueprint, a single sled, and no zones. 
+ struct Harness { + logctx: LogContext, + bp: BlueprintBuilder<'static>, + } + + #[tokio::test] + async fn john_fixme() { + const TEST_NAME: &str = "john_fixme"; + + let logctx = dev::test_setup_log(TEST_NAME); + let log = &logctx.log; + let db = TestDatabase::new_with_datastore(log).await; + + let builder = BlueprintBuilder::new_based_on( + log, + Arc::new(BlueprintBuilder::build_empty("test")), + "test", + PlannerRng::from_seed(TEST_NAME), + ) + .expect("created builder"); + + db.terminate().await; + logctx.cleanup_successful(); + } +} From b0ca4f47c7569ee8c66c8576521d54af829335ee Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 30 Jan 2026 15:47:44 -0500 Subject: [PATCH 12/27] new caching helpers --- .../src/expunged_and_unreferenced.rs | 44 +++++++++++++++++++ nexus/reconfigurator/preparation/src/lib.rs | 20 +++++---- 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index 9f087ec6a1e..98101b9689e 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -411,6 +411,50 @@ impl<'a> BlueprintReferencesCache<'a> { } } +struct ZonesWithExternalIpRows<'a> { + external_ip_rows: &'a [nexus_db_model::ExternalIp], + zone_ids: OnceCell>, +} + +impl<'a> ZonesWithExternalIpRows<'a> { + fn new(external_ip_rows: &'a [nexus_db_model::ExternalIp]) -> Self { + Self { external_ip_rows, zone_ids: OnceCell::new() } + } + + fn contains_zone_id(&self, zone_id: &OmicronZoneUuid) -> bool { + let zone_ids = self.zone_ids.get_or_init(|| { + self.external_ip_rows + .iter() + .filter_map(|row| { + row.parent_id.map(OmicronZoneUuid::from_untyped_uuid) + }) + .collect() + }); + zone_ids.contains(zone_id) + } +} + +struct ZonesWithServiceNicRows<'a> { + service_nic_rows: &'a [nexus_db_model::ServiceNetworkInterface], + zone_ids: OnceCell>, +} + +impl<'a> ZonesWithServiceNicRows<'a> { + fn new(service_nic_rows: &'a [nexus_db_model::ServiceNetworkInterface]) -> Self { + Self { service_nic_rows, zone_ids: OnceCell::new() } + } + + fn contains_zone_id(&self, zone_id: &OmicronZoneUuid) -> bool { + let zone_ids = self.zone_ids.get_or_init(|| { + self.service_nic_rows + .iter() + .map(|row| OmicronZoneUuid::from_untyped_uuid(row.service_id)) + .collect() + }); + zone_ids.contains(zone_id) + } +} + /// Helper type to ensure we've covered every /// [`BlueprintExpungedZoneAccessReason`] in our checks above. 
/// diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index 0c8f94bbe7c..bd19b787490 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -65,6 +65,8 @@ use std::sync::Arc; mod expunged_and_unreferenced; +use expunged_and_unreferenced::PruneableZones; + /// Given various pieces of database state that go into the blueprint planning /// process, produce a `PlanningInput` object encapsulating what the planner /// needs to generate a blueprint @@ -272,15 +274,15 @@ impl PlanningInputFromDb<'_> { active_nexus_zones.into_iter().map(|n| n.nexus_id()).collect(); let not_yet_nexus_zones = not_yet_nexus_zones.into_iter().map(|n| n.nexus_id()).collect(); - let expunged_and_unreferenced_zones = - expunged_and_unreferenced::find_expunged_and_unreferenced_zones( - opctx, - datastore, - &parent_blueprint, - &external_ip_rows, - &service_nic_rows, - ) - .await?; + let expunged_and_unreferenced_zones = PruneableZones::new( + opctx, + datastore, + &parent_blueprint, + &external_ip_rows, + &service_nic_rows, + ) + .await? + .into_pruneable_zones(); let planning_input = PlanningInputFromDb { parent_blueprint, From f0b31baaf938a2a407bb8fbb318651994f79dceb Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 30 Jan 2026 15:48:05 -0500 Subject: [PATCH 13/27] reorg into PruneableZones --- .../src/expunged_and_unreferenced.rs | 215 +++++++++--------- 1 file changed, 112 insertions(+), 103 deletions(-) diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index 98101b9689e..44e8db68b35 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -52,115 +52,124 @@ use std::net::IpAddr; use std::num::NonZeroU32; use strum::IntoEnumIterator; -/// Find all Omicron zones within `parent_blueprint` that can be safely pruned -/// by a future run of the planner. -/// -/// A zone ID contained in the returned set satisfies both conditions for -/// pruning: -/// -/// 1. We know the zone is not running and will not run again. -/// 2. Any cleanup work required after the zone has been expunged has been -/// completed. -/// -/// See this module's documentation for more details. -pub(super) async fn find_expunged_and_unreferenced_zones( - opctx: &OpContext, - datastore: &DataStore, - parent_blueprint: &Blueprint, - external_ip_rows: &[nexus_db_model::ExternalIp], - service_nic_rows: &[nexus_db_model::ServiceNetworkInterface], -) -> Result, Error> { - let mut expunged_and_unreferenced = BTreeSet::new(); - - // It's critically important we confirm all known reasons for accessing - // expunged zones, as most of them are related to cleanup that has to happen - // (and that we have to confirm is complete!). - // `BlueprintExpungedZoneAccessReasonChecker` has both static and runtime - // components; for the runtime check, we initialize it here, tell it of - // particular reasons we check as we do so below, then call - // `assert_all_reasons_checked()` at the end, which will panic if we've - // missed any. Our checking is unconditional, so this should trip in tests - // if it's going to trip, and never in production. 
- let mut reasons_checked = BlueprintExpungedZoneAccessReasonChecker::new(); - - let bp_refs = BlueprintReferencesCache::new(parent_blueprint); - - for (_, zone) in parent_blueprint.expunged_zones( - ZoneRunningStatus::Shutdown, - BlueprintExpungedZoneAccessReason::PlanningInputDetermineUnreferenced, - ) { - let is_referenced = match &zone.zone_type { - BlueprintZoneType::BoundaryNtp(boundary_ntp) => { - is_boundary_ntp_referenced(boundary_ntp, &bp_refs) - || is_external_networking_referenced( - zone.id, - external_ip_rows, - service_nic_rows, - ) - } - BlueprintZoneType::ClickhouseKeeper(_) - | BlueprintZoneType::ClickhouseServer(_) => { - is_multinode_clickhouse_referenced(zone.id, parent_blueprint) - } - BlueprintZoneType::ExternalDns(external_dns) => { - is_external_dns_referenced(external_dns, &bp_refs) - || is_external_networking_referenced( - zone.id, - external_ip_rows, - service_nic_rows, - ) - } - BlueprintZoneType::Nexus(_) => { - is_nexus_referenced(opctx, datastore, zone.id).await? - || is_external_networking_referenced( +pub(super) struct PruneableZones { + pruneable_zones: BTreeSet, + reason_checker: BlueprintExpungedZoneAccessReasonChecker, +} + +impl PruneableZones { + /// Find all Omicron zones within `parent_blueprint` that can be safely + /// pruned by a future run of the planner. + /// + /// A zone ID contained in the returned set satisfies both conditions for + /// pruning: + /// + /// 1. We know the zone is not running and will not run again. + /// 2. Any cleanup work required after the zone has been expunged has been + /// completed. + /// + /// See this module's documentation for more details. + pub async fn new( + opctx: &OpContext, + datastore: &DataStore, + parent_blueprint: &Blueprint, + external_ip_rows: &[nexus_db_model::ExternalIp], + service_nic_rows: &[nexus_db_model::ServiceNetworkInterface], + ) -> Result { + // Help rustfmt out (with the full enum name it gives up on formatting). + use BlueprintExpungedZoneAccessReason as Reason; + + let mut pruneable_zones = BTreeSet::new(); + let mut reason_checker = + BlueprintExpungedZoneAccessReasonChecker::new(); + + let bp_refs = BlueprintReferencesCache::new(parent_blueprint); + + for (_, zone) in parent_blueprint.expunged_zones( + ZoneRunningStatus::Shutdown, + Reason::PlanningInputDetermineUnreferenced, + ) { + let is_referenced = match &zone.zone_type { + BlueprintZoneType::BoundaryNtp(boundary_ntp) => { + is_boundary_ntp_referenced(boundary_ntp, &bp_refs) + || is_external_networking_referenced( + zone.id, + external_ip_rows, + service_nic_rows, + ) + } + BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) => { + is_multinode_clickhouse_referenced( zone.id, - external_ip_rows, - service_nic_rows, + parent_blueprint, ) - } - BlueprintZoneType::Oximeter(_) => { - is_oximeter_referenced(opctx, datastore, zone.id).await? - } - BlueprintZoneType::CockroachDb(_) => { - // BlueprintExpungedZoneAccessReason::CockroachDecommission - // means we consider cockroach zones referenced until the - // cockroach cluster has decommissioned the node that was - // present in that zone; however, we don't currently - // decommission cockroach nodes (tracked by - // ). We - // therefore always consider cockroach nodes "still referenced". - // - // This shouldn't be a huge deal in practice; Cockroach zones - // are updated in place, not by an expunge/add pair, so a - // typical update does not produce an expunged Cockroach zone - // that needs pruning. 
Only expunging a disk or sled can produce - // an expunged Cockroach node, and we expect the number of those - // to remain relatively small for any given deployment. - // Hopefully we can revisit decommissioning Cockroach nodes long - // before we need to worry about the amount of garbage leftover - // from expunged disks/sleds. - true - } + } + BlueprintZoneType::ExternalDns(external_dns) => { + is_external_dns_referenced(external_dns, &bp_refs) + || is_external_networking_referenced( + zone.id, + external_ip_rows, + service_nic_rows, + ) + } + BlueprintZoneType::Nexus(_) => { + is_nexus_referenced(opctx, datastore, zone.id).await? + || is_external_networking_referenced( + zone.id, + external_ip_rows, + service_nic_rows, + ) + } + BlueprintZoneType::Oximeter(_) => { + is_oximeter_referenced(opctx, datastore, zone.id).await? + } + BlueprintZoneType::CockroachDb(_) => { + // BlueprintExpungedZoneAccessReason::CockroachDecommission + // means we consider cockroach zones referenced until the + // cockroach cluster has decommissioned the node that was + // present in that zone; however, we don't currently + // decommission cockroach nodes (tracked by + // ). + // We therefore always consider cockroach nodes "still + // referenced". + // + // This shouldn't be a huge deal in practice; Cockroach + // zones are updated in place, not by an expunge/add pair, + // so a typical update does not produce an expunged + // Cockroach zone that needs pruning. Only expunging a disk + // or sled can produce an expunged Cockroach node, and we + // expect the number of those to remain relatively small for + // any given deployment. Hopefully we can revisit + // decommissioning Cockroach nodes long before we need to + // worry about the amount of garbage leftover from expunged + // disks/sleds. + true + } - // These zone types currently have no associated - // `BlueprintExpungedZoneAccessReason`; there is no cleanup action - // required for them, so they're considered "unreferenced" and may - // be pruned as soon as they've been expunged. - BlueprintZoneType::Clickhouse(_) - | BlueprintZoneType::Crucible(_) - | BlueprintZoneType::CruciblePantry(_) - | BlueprintZoneType::InternalDns(_) - | BlueprintZoneType::InternalNtp(_) => false, - }; - - if !is_referenced { - expunged_and_unreferenced.insert(zone.id); + // These zone types currently have no associated + // `BlueprintExpungedZoneAccessReason`; there is no cleanup + // action required for them, so they're considered + // "unreferenced" and may be pruned as soon as they've been + // expunged. 
+ BlueprintZoneType::Clickhouse(_) + | BlueprintZoneType::Crucible(_) + | BlueprintZoneType::CruciblePantry(_) + | BlueprintZoneType::InternalDns(_) + | BlueprintZoneType::InternalNtp(_) => false, + }; + + if !is_referenced { + pruneable_zones.insert(zone.id); + } } - } - reasons_checked.assert_all_reasons_checked(); + Ok(Self { pruneable_zones, reason_checker }) + } - Ok(expunged_and_unreferenced) + pub fn into_pruneable_zones(self) -> BTreeSet { + self.pruneable_zones + } } fn is_external_networking_referenced( From 6c51581f85882f96d6b41e91388752c0dc88a779 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 30 Jan 2026 15:48:15 -0500 Subject: [PATCH 14/27] initial test (failing) --- .../src/expunged_and_unreferenced.rs | 41 ++++++------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index 44e8db68b35..f44baa4c086 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -578,43 +578,28 @@ impl BlueprintExpungedZoneAccessReasonChecker { #[cfg(test)] mod tests { use super::*; - use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; - use nexus_reconfigurator_planning::planner::PlannerRng; + use nexus_reconfigurator_planning::example::ExampleSystemBuilder; use nexus_test_utils::db::TestDatabase; use omicron_test_utils::dev; - use omicron_test_utils::dev::LogContext; - use std::sync::Arc; - use std::sync::LazyLock; - - static EMPTY_BLUEPRINT: LazyLock = LazyLock::new(|| { - BlueprintBuilder::build_empty_seeded( - "test", - PlannerRng::from_seed("empty blueprint for tests"), - ) - }); - - // Helper to reduce boilerplate in the tests below. This sets up a blueprint - // builder with an empty parent blueprint, a single sled, and no zones. 
- struct Harness { - logctx: LogContext, - bp: BlueprintBuilder<'static>, - } #[tokio::test] - async fn john_fixme() { - const TEST_NAME: &str = "john_fixme"; + async fn test_pruneable_zones_reason_checker() { + const TEST_NAME: &str = "test_pruneable_zones_reason_checker"; let logctx = dev::test_setup_log(TEST_NAME); let log = &logctx.log; let db = TestDatabase::new_with_datastore(log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); - let builder = BlueprintBuilder::new_based_on( - log, - Arc::new(BlueprintBuilder::build_empty("test")), - "test", - PlannerRng::from_seed(TEST_NAME), - ) - .expect("created builder"); + let (example, blueprint) = + ExampleSystemBuilder::new(log, TEST_NAME).build(); + + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + + pruneable_zones.reason_checker.assert_all_reasons_checked(); db.terminate().await; logctx.cleanup_successful(); From 1e3dd23a88aba575428ead4d19c3c83f12a89a38 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 30 Jan 2026 16:09:09 -0500 Subject: [PATCH 15/27] cleanup and clarify lazy sets --- .../src/expunged_and_unreferenced.rs | 144 ++++++++++-------- 1 file changed, 84 insertions(+), 60 deletions(-) diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index f44baa4c086..effa3a463e7 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -83,20 +83,42 @@ impl PruneableZones { let mut reason_checker = BlueprintExpungedZoneAccessReasonChecker::new(); - let bp_refs = BlueprintReferencesCache::new(parent_blueprint); + // Create several `BTreeSet<_>`s of information we need to check in the + // loop below. Each of these is lazily-initialized; we'll only actually + // do the work of constructing the set if we need it during iteration. + let in_service_boundary_ntp_configs = + InServiceBoundaryNtpUpstreamConfigs::new(parent_blueprint); + let in_service_external_dns_ips = + InServiceExternalDnsIps::new(parent_blueprint); + let zones_with_external_ip_rows = + ZonesWithExternalIpRows::new(external_ip_rows); + let zones_with_service_nice_rows = + ZonesWithServiceNicRows::new(service_nic_rows); for (_, zone) in parent_blueprint.expunged_zones( ZoneRunningStatus::Shutdown, Reason::PlanningInputDetermineUnreferenced, ) { + // Check + // BlueprintExpungedZoneAccessReason::DeallocateExternalNetworkingResources; + // this reason applies to multiple zone types, so we check it for + // them all. (Technically we only need to check it for zones that + // _can_ have external networking, but it's basically free to just + // check it for all.) 
+ reason_checker.add_reason_checked( + Reason::DeallocateExternalNetworkingResources, + ); + let has_external_ip_row = + zones_with_external_ip_rows.contains(&zone.id); + let has_service_nic_row = + zones_with_service_nice_rows.contains(&zone.id); + let is_referenced = match &zone.zone_type { BlueprintZoneType::BoundaryNtp(boundary_ntp) => { - is_boundary_ntp_referenced(boundary_ntp, &bp_refs) - || is_external_networking_referenced( - zone.id, - external_ip_rows, - service_nic_rows, - ) + is_boundary_ntp_referenced( + boundary_ntp, + &in_service_boundary_ntp_configs, + ) } BlueprintZoneType::ClickhouseKeeper(_) | BlueprintZoneType::ClickhouseServer(_) => { @@ -106,20 +128,13 @@ impl PruneableZones { ) } BlueprintZoneType::ExternalDns(external_dns) => { - is_external_dns_referenced(external_dns, &bp_refs) - || is_external_networking_referenced( - zone.id, - external_ip_rows, - service_nic_rows, - ) + is_external_dns_referenced( + external_dns, + &in_service_external_dns_ips, + ) } BlueprintZoneType::Nexus(_) => { is_nexus_referenced(opctx, datastore, zone.id).await? - || is_external_networking_referenced( - zone.id, - external_ip_rows, - service_nic_rows, - ) } BlueprintZoneType::Oximeter(_) => { is_oximeter_referenced(opctx, datastore, zone.id).await? @@ -159,7 +174,7 @@ impl PruneableZones { | BlueprintZoneType::InternalNtp(_) => false, }; - if !is_referenced { + if !is_referenced && !has_external_ip_row && !has_service_nic_row { pruneable_zones.insert(zone.id); } } @@ -188,16 +203,14 @@ fn is_external_networking_referenced( fn is_boundary_ntp_referenced( boundary_ntp: &blueprint_zone_type::BoundaryNtp, - bp_refs: &BlueprintReferencesCache<'_>, + boundary_ntp_configs: &InServiceBoundaryNtpUpstreamConfigs<'_>, ) -> bool { // Check BlueprintExpungedZoneAccessReason::BoundaryNtpUpstreamConfig; if // this zone's upstream config is not covered by an in-service zone, then // it's still "referenced" (in that the planner needs to refer to it to set // up a new boundary NTP zone). let expunged_config = BoundaryNtpUpstreamConfig::new(boundary_ntp); - bp_refs - .in_service_boundary_ntp_upstream_configs() - .contains(&expunged_config) + boundary_ntp_configs.contains(&expunged_config) } fn is_multinode_clickhouse_referenced( @@ -220,7 +233,7 @@ fn is_multinode_clickhouse_referenced( fn is_external_dns_referenced( external_dns: &blueprint_zone_type::ExternalDns, - bp_refs: &BlueprintReferencesCache<'_>, + external_dns_ips: &InServiceExternalDnsIps<'_>, ) -> bool { // Check BlueprintExpungedZoneAccessReason::ExternalDnsExternalIps; we // consider an external DNS zone "still referenced" if its IP is _not_ @@ -228,7 +241,7 @@ fn is_external_dns_referenced( // an in-service external DNS zone, the expunged zone is no longer // referenced and can be safely pruned.) let expunged_zone_ip = external_dns.dns_address.addr.ip(); - !bp_refs.in_service_external_dns_ips().contains(&expunged_zone_ip) + !external_dns_ips.contains(&expunged_zone_ip) } async fn is_nexus_referenced( @@ -370,43 +383,48 @@ impl<'a> BoundaryNtpUpstreamConfig<'a> { } } -struct BlueprintReferencesCache<'a> { +/// Lazily-initialized set of boundary NTP upstream configs from in-service +/// zones in a blueprint. 
+struct InServiceBoundaryNtpUpstreamConfigs<'a> { parent_blueprint: &'a Blueprint, - in_service_boundary_ntp_upstream_configs: - OnceCell>>, - in_service_external_dns_ips: OnceCell>, + configs: OnceCell>>, } -impl<'a> BlueprintReferencesCache<'a> { +impl<'a> InServiceBoundaryNtpUpstreamConfigs<'a> { fn new(parent_blueprint: &'a Blueprint) -> Self { - Self { - parent_blueprint, - in_service_boundary_ntp_upstream_configs: OnceCell::new(), - in_service_external_dns_ips: OnceCell::new(), - } + Self { parent_blueprint, configs: OnceCell::new() } } - fn in_service_boundary_ntp_upstream_configs( - &self, - ) -> &BTreeSet> { - OnceCell::get_or_init( - &self.in_service_boundary_ntp_upstream_configs, - || { - self.parent_blueprint - .in_service_zones() - .filter_map(|(_, zone)| match &zone.zone_type { - BlueprintZoneType::BoundaryNtp(config) => { - Some(BoundaryNtpUpstreamConfig::new(config)) - } - _ => None, - }) - .collect() - }, - ) + fn contains(&self, config: &BoundaryNtpUpstreamConfig) -> bool { + let configs = self.configs.get_or_init(|| { + self.parent_blueprint + .in_service_zones() + .filter_map(|(_, zone)| match &zone.zone_type { + BlueprintZoneType::BoundaryNtp(config) => { + Some(BoundaryNtpUpstreamConfig::new(config)) + } + _ => None, + }) + .collect() + }); + configs.contains(config) } +} - fn in_service_external_dns_ips(&self) -> &BTreeSet { - OnceCell::get_or_init(&self.in_service_external_dns_ips, || { +/// Lazily-initialized set of external DNS IPs from in-service zones in a +/// blueprint. +struct InServiceExternalDnsIps<'a> { + parent_blueprint: &'a Blueprint, + ips: OnceCell>, +} + +impl<'a> InServiceExternalDnsIps<'a> { + fn new(parent_blueprint: &'a Blueprint) -> Self { + Self { parent_blueprint, ips: OnceCell::new() } + } + + fn contains(&self, ip: &IpAddr) -> bool { + let ips = self.ips.get_or_init(|| { self.parent_blueprint .in_service_zones() .filter_map(|(_, zone)| match &zone.zone_type { @@ -416,10 +434,13 @@ impl<'a> BlueprintReferencesCache<'a> { _ => None, }) .collect() - }) + }); + ips.contains(ip) } } +/// Lazily-initialized set of zone IDs that have external IP rows in the +/// database. struct ZonesWithExternalIpRows<'a> { external_ip_rows: &'a [nexus_db_model::ExternalIp], zone_ids: OnceCell>, @@ -430,7 +451,7 @@ impl<'a> ZonesWithExternalIpRows<'a> { Self { external_ip_rows, zone_ids: OnceCell::new() } } - fn contains_zone_id(&self, zone_id: &OmicronZoneUuid) -> bool { + fn contains(&self, zone_id: &OmicronZoneUuid) -> bool { let zone_ids = self.zone_ids.get_or_init(|| { self.external_ip_rows .iter() @@ -443,17 +464,21 @@ impl<'a> ZonesWithExternalIpRows<'a> { } } +/// Lazily-initialized set of zone IDs that have service NIC rows in the +/// database. 
struct ZonesWithServiceNicRows<'a> { service_nic_rows: &'a [nexus_db_model::ServiceNetworkInterface], zone_ids: OnceCell>, } impl<'a> ZonesWithServiceNicRows<'a> { - fn new(service_nic_rows: &'a [nexus_db_model::ServiceNetworkInterface]) -> Self { + fn new( + service_nic_rows: &'a [nexus_db_model::ServiceNetworkInterface], + ) -> Self { Self { service_nic_rows, zone_ids: OnceCell::new() } } - fn contains_zone_id(&self, zone_id: &OmicronZoneUuid) -> bool { + fn contains(&self, zone_id: &OmicronZoneUuid) -> bool { let zone_ids = self.zone_ids.get_or_init(|| { self.service_nic_rows .iter() @@ -591,8 +616,7 @@ mod tests { let db = TestDatabase::new_with_datastore(log).await; let (opctx, datastore) = (db.opctx(), db.datastore()); - let (example, blueprint) = - ExampleSystemBuilder::new(log, TEST_NAME).build(); + let (_, blueprint) = ExampleSystemBuilder::new(log, TEST_NAME).build(); let pruneable_zones = PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) From ad69b2ca5c743085481bcd617c4684d1dde08e55 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 30 Jan 2026 16:23:38 -0500 Subject: [PATCH 16/27] s/referenced/pruneable/ (and invert function return values) --- .../src/expunged_and_unreferenced.rs | 303 ++++++++++-------- 1 file changed, 167 insertions(+), 136 deletions(-) diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index effa3a463e7..c669dc41558 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -103,8 +103,9 @@ impl PruneableZones { // BlueprintExpungedZoneAccessReason::DeallocateExternalNetworkingResources; // this reason applies to multiple zone types, so we check it for // them all. (Technically we only need to check it for zones that - // _can_ have external networking, but it's basically free to just - // check it for all.) + // _can_ have external networking, but it's fine to check ones that + // don't, and now we don't have to keep a list here of which zone + // types could have external networking rows present.) reason_checker.add_reason_checked( Reason::DeallocateExternalNetworkingResources, ); @@ -113,68 +114,65 @@ impl PruneableZones { let has_service_nic_row = zones_with_service_nice_rows.contains(&zone.id); - let is_referenced = match &zone.zone_type { - BlueprintZoneType::BoundaryNtp(boundary_ntp) => { - is_boundary_ntp_referenced( - boundary_ntp, - &in_service_boundary_ntp_configs, - ) - } - BlueprintZoneType::ClickhouseKeeper(_) - | BlueprintZoneType::ClickhouseServer(_) => { - is_multinode_clickhouse_referenced( - zone.id, - parent_blueprint, - ) - } - BlueprintZoneType::ExternalDns(external_dns) => { - is_external_dns_referenced( - external_dns, - &in_service_external_dns_ips, - ) - } - BlueprintZoneType::Nexus(_) => { - is_nexus_referenced(opctx, datastore, zone.id).await? - } - BlueprintZoneType::Oximeter(_) => { - is_oximeter_referenced(opctx, datastore, zone.id).await? - } - BlueprintZoneType::CockroachDb(_) => { - // BlueprintExpungedZoneAccessReason::CockroachDecommission - // means we consider cockroach zones referenced until the - // cockroach cluster has decommissioned the node that was - // present in that zone; however, we don't currently - // decommission cockroach nodes (tracked by - // ). - // We therefore always consider cockroach nodes "still - // referenced". 
- // - // This shouldn't be a huge deal in practice; Cockroach - // zones are updated in place, not by an expunge/add pair, - // so a typical update does not produce an expunged - // Cockroach zone that needs pruning. Only expunging a disk - // or sled can produce an expunged Cockroach node, and we - // expect the number of those to remain relatively small for - // any given deployment. Hopefully we can revisit - // decommissioning Cockroach nodes long before we need to - // worry about the amount of garbage leftover from expunged - // disks/sleds. - true - } + let is_pruneable = !has_external_ip_row + && !has_service_nic_row + && match &zone.zone_type { + BlueprintZoneType::BoundaryNtp(boundary_ntp) => { + is_boundary_ntp_pruneable( + boundary_ntp, + &in_service_boundary_ntp_configs, + &mut reason_checker, + ) + } + BlueprintZoneType::ClickhouseKeeper(_) + | BlueprintZoneType::ClickhouseServer(_) => { + is_multinode_clickhouse_pruneable( + zone.id, + parent_blueprint, + &mut reason_checker, + ) + } + BlueprintZoneType::ExternalDns(external_dns) => { + is_external_dns_pruneable( + external_dns, + &in_service_external_dns_ips, + &mut reason_checker, + ) + } + BlueprintZoneType::Nexus(_) => { + is_nexus_pruneable( + opctx, + datastore, + zone.id, + &mut reason_checker, + ) + .await? + } + BlueprintZoneType::Oximeter(_) => { + is_oximeter_pruneable( + opctx, + datastore, + zone.id, + &mut reason_checker, + ) + .await? + } + BlueprintZoneType::CockroachDb(_) => { + is_cockroach_pruneable(zone.id, &mut reason_checker) + } - // These zone types currently have no associated - // `BlueprintExpungedZoneAccessReason`; there is no cleanup - // action required for them, so they're considered - // "unreferenced" and may be pruned as soon as they've been - // expunged. - BlueprintZoneType::Clickhouse(_) - | BlueprintZoneType::Crucible(_) - | BlueprintZoneType::CruciblePantry(_) - | BlueprintZoneType::InternalDns(_) - | BlueprintZoneType::InternalNtp(_) => false, - }; - - if !is_referenced && !has_external_ip_row && !has_service_nic_row { + // These zone types currently have no associated + // `BlueprintExpungedZoneAccessReason`; there is no cleanup + // action required for them, so they may be pruned as soon + // as they've been expunged. + BlueprintZoneType::Clickhouse(_) + | BlueprintZoneType::Crucible(_) + | BlueprintZoneType::CruciblePantry(_) + | BlueprintZoneType::InternalDns(_) + | BlueprintZoneType::InternalNtp(_) => true, + }; + + if is_pruneable { pruneable_zones.insert(zone.id); } } @@ -187,80 +185,113 @@ impl PruneableZones { } } -fn is_external_networking_referenced( - zone_id: OmicronZoneUuid, - external_ip_rows: &[nexus_db_model::ExternalIp], - service_nic_rows: &[nexus_db_model::ServiceNetworkInterface], -) -> bool { - // Check - // BlueprintExpungedZoneAccessReason::DeallocateExternalNetworkingResources; - // if this zone's external IP or NIC are still present in the DB, then it's - // the zone is still referenced. 
- let zone_id = zone_id.into_untyped_uuid(); - external_ip_rows.iter().any(|row| row.parent_id == Some(zone_id)) - || service_nic_rows.iter().any(|row| row.service_id == zone_id) -} - -fn is_boundary_ntp_referenced( +fn is_boundary_ntp_pruneable( boundary_ntp: &blueprint_zone_type::BoundaryNtp, boundary_ntp_configs: &InServiceBoundaryNtpUpstreamConfigs<'_>, + reason_checker: &mut BlueprintExpungedZoneAccessReasonChecker, ) -> bool { - // Check BlueprintExpungedZoneAccessReason::BoundaryNtpUpstreamConfig; if - // this zone's upstream config is not covered by an in-service zone, then - // it's still "referenced" (in that the planner needs to refer to it to set - // up a new boundary NTP zone). + // If this zone's upstream config is also the config of an in-service zone, + // then it's pruneable. Note the reason we're checking here. + reason_checker.add_reason_checked( + BlueprintExpungedZoneAccessReason::BoundaryNtpUpstreamConfig, + ); let expunged_config = BoundaryNtpUpstreamConfig::new(boundary_ntp); boundary_ntp_configs.contains(&expunged_config) } -fn is_multinode_clickhouse_referenced( +fn is_cockroach_pruneable( + _zone_id: OmicronZoneUuid, + reason_checker: &mut BlueprintExpungedZoneAccessReasonChecker, +) -> bool { + // BlueprintExpungedZoneAccessReason::CockroachDecommission means we + // consider cockroach zones nonpruneable until the cockroach cluster has + // decommissioned the node that was present in that zone; however, we don't + // currently decommission cockroach nodes (tracked by + // ). We therefore + // never consider cockroach nodes pruneable + // + // This shouldn't be a huge deal in practice; Cockroach zones are updated in + // place, not by an expunge/add pair, so a typical update does not produce + // an expunged Cockroach zone that needs pruning. Only expunging a disk or + // sled can produce an expunged Cockroach node, and we expect the number of + // those to remain relatively small for any given deployment. Hopefully we + // can revisit decommissioning Cockroach nodes long before we need to worry + // about the amount of garbage leftover from expunged disks/sleds. + // + // Even though we do no work here, we claim we've checked this + // BlueprintExpungedZoneAccessReason. This allows our test to pass that + // we've _considered_ all relevant reasons; in this case, this reason means + // the zone is _never_ pruneable. + reason_checker.add_reason_checked( + BlueprintExpungedZoneAccessReason::CockroachDecommission, + ); + false +} + +fn is_multinode_clickhouse_pruneable( zone_id: OmicronZoneUuid, parent_blueprint: &Blueprint, + reason_checker: &mut BlueprintExpungedZoneAccessReasonChecker, ) -> bool { - // Check BlueprintExpungedZoneAccessReason::ClickhouseKeeperServerConfigIps; - // if this zone is still present in the clickhouse cluster config, it's - // still referenced. + // If this zone is still present in the clickhouse cluster config, it's + // not pruneable. If there is no config at all or there is but it doesn't + // contain this zone, it is prunable. + // + // Note the reason we've checked here. + reason_checker.add_reason_checked( + BlueprintExpungedZoneAccessReason::ClickhouseKeeperServerConfigIps, + ); let Some(clickhouse_config) = &parent_blueprint.clickhouse_cluster_config else { - // If there is no clickhouse cluster config at all, the zone isn't - // referenced in it! 
- return false; + return true; }; - clickhouse_config.keepers.contains_key(&zone_id) - || clickhouse_config.servers.contains_key(&zone_id) + !clickhouse_config.keepers.contains_key(&zone_id) + && !clickhouse_config.servers.contains_key(&zone_id) } -fn is_external_dns_referenced( +fn is_external_dns_pruneable( external_dns: &blueprint_zone_type::ExternalDns, external_dns_ips: &InServiceExternalDnsIps<'_>, + reason_checker: &mut BlueprintExpungedZoneAccessReasonChecker, ) -> bool { - // Check BlueprintExpungedZoneAccessReason::ExternalDnsExternalIps; we - // consider an external DNS zone "still referenced" if its IP is _not_ - // assigned to an in-service external DNS zone. (If the IP _is_ assigned to - // an in-service external DNS zone, the expunged zone is no longer - // referenced and can be safely pruned.) + // We consider an external DNS zone pruneable if its IP has been reassigned + // to an in-service external DNS zone. (If the IP has not yet been + // reassigned, we have to keep it around so we don't forget it!). + // + // Note the reason we've checked here. + reason_checker.add_reason_checked( + BlueprintExpungedZoneAccessReason::ExternalDnsExternalIps, + ); let expunged_zone_ip = external_dns.dns_address.addr.ip(); - !external_dns_ips.contains(&expunged_zone_ip) + external_dns_ips.contains(&expunged_zone_ip) } -async fn is_nexus_referenced( +async fn is_nexus_pruneable( opctx: &OpContext, datastore: &DataStore, zone_id: OmicronZoneUuid, + reason_checker: &mut BlueprintExpungedZoneAccessReasonChecker, ) -> Result { - // Check BlueprintExpungedZoneAccessReason::NexusDeleteMetadataRecord: is - // this Nexus zone still present in the `db_metadata_nexus` table? + // Nexus zones have multiple reasons they could be non-pruneable; check each + // of them below and note them as we do. + + // Is this Nexus zone still present in the `db_metadata_nexus` table? + reason_checker.add_reason_checked( + BlueprintExpungedZoneAccessReason::NexusDeleteMetadataRecord, + ); if !datastore .database_nexus_access_all(opctx, &BTreeSet::from([zone_id])) .await? .is_empty() { - return Ok(true); + return Ok(false); } - // Check BlueprintExpungedZoneAccessReason::NexusSagaReassignment: does - // this Nexus zone still have unfinished sagas assigned? + // Does this Nexus zone still have unfinished sagas assigned? + reason_checker.add_reason_checked( + BlueprintExpungedZoneAccessReason::NexusSagaReassignment, + ); if !datastore .saga_list_recovery_candidates( opctx, @@ -270,7 +301,7 @@ async fn is_nexus_referenced( .await? .is_empty() { - return Ok(true); + return Ok(false); } // This is a no-op match that exists solely to ensure we update our logic if @@ -292,11 +323,13 @@ async fn is_nexus_referenced( } } - // Check BlueprintExpungedZoneAccessReason::NexusSupportBundleReassign: does - // this Nexus zone still have support bundles assigned to it in any state - // that requires cleanup work? This requires explicitly listing the states - // we care about; the no-op match statement above will hopefully keep this - // in sync with any changes to the enum. + // Does this Nexus zone still have support bundles assigned to it in any + // state that requires cleanup work? This requires explicitly listing the + // states we care about; the no-op match statement above will hopefully keep + // this in sync with any changes to the enum. 
+ reason_checker.add_reason_checked( + BlueprintExpungedZoneAccessReason::NexusSupportBundleReassign, + ); if !datastore .support_bundle_list_assigned_to_nexus( opctx, @@ -312,32 +345,30 @@ async fn is_nexus_referenced( .await? .is_empty() { - return Ok(true); + return Ok(false); } - // These Nexus-related zone access reasons are documented as "planner does - // not need to account for this", so we don't check anything: - // - // * BlueprintExpungedZoneAccessReason::NexusExternalConfig - // * BlueprintExpungedZoneAccessReason::NexusSelfIsQuiescing - - Ok(false) + Ok(true) } -async fn is_oximeter_referenced( +async fn is_oximeter_pruneable( opctx: &OpContext, datastore: &DataStore, zone_id: OmicronZoneUuid, + reason_checker: &mut BlueprintExpungedZoneAccessReasonChecker, ) -> Result { - // Check - // BlueprintExpungedZoneAccessReason::OximeterExpungeAndReassignProducers: - // this zone ID should not refer to an in-service Oximeter collector, and it + // This zone ID should not refer to an in-service Oximeter collector, and it // should have no producers assigned to it. + // + // Note the reason we've checked here. + reason_checker.add_reason_checked( + BlueprintExpungedZoneAccessReason::OximeterExpungeAndReassignProducers, + ); match datastore.oximeter_lookup(opctx, zone_id.as_untyped_uuid()).await? { Some(_info) => { // If the lookup succeeded, we haven't yet performed the necessary // cleanup to mark this oximeter as expunged. - return Ok(true); + return Ok(false); } None => { // Oximeter has been expunged (or was never inserted in the first @@ -356,8 +387,8 @@ async fn is_oximeter_referenced( ) .await?; - // This oximeter is referenced if our set of assigned producers is nonempty. - Ok(!assigned_producers.is_empty()) + // This oximeter is pruneable if our set of assigned producers is empty. + Ok(assigned_producers.is_empty()) } fn single_item_pagparams() -> DataPageParams<'static, T> { @@ -522,10 +553,10 @@ impl BlueprintExpungedZoneAccessReasonChecker { // checks above for zone-specific reasons and add an appropriate // call to `add_reason_checked()`. match reason { - // Checked by is_boundary_ntp_referenced() + // Checked by is_boundary_ntp_pruneable() Reason::BoundaryNtpUpstreamConfig => {} - // Checked by is_multinode_clickhouse_referenced() + // Checked by is_multinode_clickhouse_pruneable() Reason::ClickhouseKeeperServerConfigIps => {} // TODO-john FIXME @@ -535,31 +566,30 @@ impl BlueprintExpungedZoneAccessReasonChecker { // https://github.com/oxidecomputer/omicron/issues/8447). Reason::CockroachDecommission => {} - // TODO-john FIXME - // Checked by is_external_networking_referenced(), which is - // called for each zone type with external networking (boundary - // NTP, external DNS, Nexus) + // Checked directly by the main loop in `PruneableZones::new()`. 
Reason::DeallocateExternalNetworkingResources => {} - // Checked by is_external_dns_referenced() + // Checked by is_external_dns_pruneable() Reason::ExternalDnsExternalIps => {} - // Each of these are checked by is_nexus_referenced() + // Each of these are checked by is_nexus_pruneable() Reason::NexusDeleteMetadataRecord | Reason::NexusSagaReassignment | Reason::NexusSupportBundleReassign => {} - // Checked by is_oximeter_referenced() + // Checked by is_oximeter_pruneable() Reason::OximeterExpungeAndReassignProducers => {} + // --------------------------------------------------------- + // Nexus-related reasons that don't need to be checked (see // `BlueprintExpungedZoneAccessReason` for specifics) Reason::NexusExternalConfig - | Reason::NexusSelfIsQuiescing + | Reason::NexusSelfIsQuiescing // Planner-related reasons that don't need to be checked (see // `BlueprintExpungedZoneAccessReason` for specifics) - |Reason::PlannerCheckReadyForCleanup + | Reason::PlannerCheckReadyForCleanup | Reason::PlanningInputDetermineUnreferenced | Reason::PlanningInputExpungedZoneGuard @@ -583,6 +613,7 @@ impl BlueprintExpungedZoneAccessReasonChecker { self.reasons_checked.insert(reason); } + #[cfg(test)] fn assert_all_reasons_checked(self) { let mut unchecked = BTreeSet::new(); From 17772e6e326b0e966c728682627d441e2ee1475a Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 30 Jan 2026 16:51:44 -0500 Subject: [PATCH 17/27] add boundary NTP support to ExampleSystemBuilder --- nexus/reconfigurator/planning/src/example.rs | 100 +++++++++++++++---- 1 file changed, 83 insertions(+), 17 deletions(-) diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs index f47bc679cb5..a655fdfe57f 100644 --- a/nexus/reconfigurator/planning/src/example.rs +++ b/nexus/reconfigurator/planning/src/example.rs @@ -212,6 +212,7 @@ pub struct ExampleSystemBuilder { internal_dns_count: ZoneCount, external_dns_count: ZoneCount, crucible_pantry_count: ZoneCount, + boundary_ntp_count: ZoneCount, create_zones: bool, create_disks_in_blueprint: bool, target_release: TargetReleaseDescription, @@ -248,6 +249,9 @@ impl ExampleSystemBuilder { internal_dns_count: ZoneCount(INTERNAL_DNS_REDUNDANCY), external_dns_count: ZoneCount(Self::DEFAULT_EXTERNAL_DNS_COUNT), crucible_pantry_count: ZoneCount(CRUCIBLE_PANTRY_REDUNDANCY), + // By default we only set up internal NTP; callers that specifically + // want boundary NTP can ask for them. + boundary_ntp_count: ZoneCount(0), create_zones: true, create_disks_in_blueprint: true, target_release: TargetReleaseDescription::Initial, @@ -327,6 +331,28 @@ impl ExampleSystemBuilder { Ok(self) } + /// Set the number of boundary NTP instances in the example system. + /// + /// The default value is 0. A value anywhere between 0 and 30, inclusive, is + /// permitted. (The limit of 30 is primarily to simplify the + /// implementation.) + /// + /// Each NTP server is assigned an external SNAT address in the 198.51.100.x + /// range. + pub fn set_boundary_ntp_count( + mut self, + boundary_ntp_count: usize, + ) -> anyhow::Result { + if boundary_ntp_count > 30 { + anyhow::bail!( + "boundary_ntp_count {} is greater than 30", + boundary_ntp_count, + ); + } + self.boundary_ntp_count = ZoneCount(boundary_ntp_count); + Ok(self) + } + /// Set the number of Crucible pantry instances in the example system. /// /// If [`Self::create_zones`] is set to `false`, this is ignored. 
@@ -428,6 +454,10 @@ impl ExampleSystemBuilder { self.external_dns_count.0 } + pub fn boundary_ntp_zones(&self) -> usize { + self.boundary_ntp_count.0 + } + /// Create a new example system with the given modifications. /// /// Return the system, and the initial blueprint that matches it. @@ -443,6 +473,7 @@ impl ExampleSystemBuilder { "internal_dns_count" => self.internal_dns_count.0, "external_dns_count" => self.external_dns_count.0, "crucible_pantry_count" => self.crucible_pantry_count.0, + "boundary_ntp_count" => self.boundary_ntp_count.0, "create_zones" => self.create_zones, "create_disks_in_blueprint" => self.create_disks_in_blueprint, ); @@ -517,19 +548,34 @@ impl ExampleSystemBuilder { .unwrap(), ) .unwrap(); - for i in 0..self.external_dns_count.0 { - let lo = (i + 1) - .try_into() - .expect("external_dns_count is always <= 30"); + for i in 1..=self.external_dns_count.0 { + let ip = format!("198.51.100.{i}"); builder - .add_external_dns_ip(IpAddr::V4(Ipv4Addr::new( - 198, 51, 100, lo, - ))) + .add_external_dns_ip(ip.parse().unwrap()) .expect("test IPs are valid service IPs"); } system.set_external_ip_policy(builder.build()); } + // Also add a 30-ip range for boundary NTP. It's very likely we'll + // actually allocate out of the leftover external DNS range, but if + // someone actually asked for 30 external DNS zones we'll shift up into + // this range. + if self.boundary_ntp_count.0 > 0 { + let mut builder = + system.external_ip_policy().clone().into_builder(); + builder + .push_service_pool_ipv4_range( + Ipv4Range::new( + "198.51.100.31".parse::().unwrap(), + "198.51.100.60".parse::().unwrap(), + ) + .unwrap(), + ) + .unwrap(); + system.set_external_ip_policy(builder.build()); + } + let mut input_builder = system .to_planning_input_builder(Arc::new( // Start with an empty blueprint. @@ -565,8 +611,8 @@ impl ExampleSystemBuilder { // * Create disks and non-discretionary zones on all sleds. // * Only create discretionary zones on discretionary sleds. let mut discretionary_ix = 0; - for (sled_id, sled_details) in - base_input.all_sleds(SledFilter::Commissioned) + for (sled_idx, (sled_id, sled_details)) in + base_input.all_sleds(SledFilter::Commissioned).enumerate() { if self.create_disks_in_blueprint { let _ = builder @@ -574,14 +620,34 @@ impl ExampleSystemBuilder { .unwrap(); } if self.create_zones { - let _ = builder - .sled_ensure_zone_ntp( - sled_id, - self.target_release - .zone_image_source(ZoneKind::BoundaryNtp) - .expect("obtained BoundaryNtp image source"), - ) - .unwrap(); + // Add boundary NTP to the first `self.boundary_ntp_count` sleds + // and internal NTP to the rest. + if sled_idx < self.boundary_ntp_count.0 { + let external_ip = external_networking_alloc + .for_new_boundary_ntp() + .expect("should have an external IP for boundary NTP"); + builder + .sled_add_zone_boundary_ntp_with_config( + sled_id, + vec!["ntp.oxide.computer".to_string()], + vec!["1.1.1.1".parse().unwrap()], + None, + self.target_release + .zone_image_source(ZoneKind::BoundaryNtp) + .expect("obtained BoundaryNtp image source"), + external_ip, + ) + .unwrap(); + } else { + let _ = builder + .sled_ensure_zone_ntp( + sled_id, + self.target_release + .zone_image_source(ZoneKind::BoundaryNtp) + .expect("obtained BoundaryNtp image source"), + ) + .unwrap(); + } // Create discretionary zones if allowed. 
if sled_details.policy.matches(SledFilter::Discretionary) { From 8d0a365943bef79a11f788c53e18cf68f8644654 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 30 Jan 2026 16:59:20 -0500 Subject: [PATCH 18/27] flesh out test_pruneable_zones_reason_checker() --- .../src/expunged_and_unreferenced.rs | 134 +++++++++++++++--- 1 file changed, 115 insertions(+), 19 deletions(-) diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index c669dc41558..9a665268f3f 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -54,6 +54,7 @@ use strum::IntoEnumIterator; pub(super) struct PruneableZones { pruneable_zones: BTreeSet, + #[allow(unused)] // only read by tests reason_checker: BlueprintExpungedZoneAccessReasonChecker, } @@ -612,30 +613,19 @@ impl BlueprintExpungedZoneAccessReasonChecker { ) { self.reasons_checked.insert(reason); } - - #[cfg(test)] - fn assert_all_reasons_checked(self) { - let mut unchecked = BTreeSet::new(); - - for reason in BlueprintExpungedZoneAccessReason::iter() { - if !self.reasons_checked.contains(&reason) { - unchecked.insert(reason); - } - } - - assert!( - unchecked.is_empty(), - "Correctness error: Planning input construction failed to \ - consider some `BlueprintExpungedZoneAccessReason`s: {unchecked:?}" - ); - } } #[cfg(test)] mod tests { use super::*; + use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; use nexus_reconfigurator_planning::example::ExampleSystemBuilder; + use nexus_reconfigurator_planning::planner::PlannerRng; use nexus_test_utils::db::TestDatabase; + use nexus_types::deployment::BlueprintSource; + use nexus_types::deployment::BlueprintZoneImageSource; + use nexus_types::deployment::BlueprintZoneType; + use nexus_types::deployment::SledFilter; use omicron_test_utils::dev; #[tokio::test] @@ -647,14 +637,120 @@ mod tests { let db = TestDatabase::new_with_datastore(log).await; let (opctx, datastore) = (db.opctx(), db.datastore()); - let (_, blueprint) = ExampleSystemBuilder::new(log, TEST_NAME).build(); + // Start with the base example system and build from there. 
+ let (example, initial_blueprint) = + ExampleSystemBuilder::new(log, TEST_NAME) + .external_dns_count(1) + .expect("1 is a valid count of external DNS zones") + .set_boundary_ntp_count(1) + .expect("1 is a valid count of boundary NTP zones") + .build(); + let mut builder = BlueprintBuilder::new_based_on( + log, + &initial_blueprint, + TEST_NAME, + PlannerRng::from_seed(("builder", TEST_NAME)), + ) + .expect("failed to create builder"); + + // Pick the first sled to add missing zone types + let sled_id = example + .input + .all_sled_ids(SledFilter::Commissioned) + .next() + .expect("at least one sled"); + + // Add zones for types that aren't in the default example system + let image_source = BlueprintZoneImageSource::InstallDataset; + + builder + .sled_add_zone_oximeter(sled_id, image_source.clone()) + .expect("added oximeter zone"); + builder + .sled_add_zone_cockroachdb(sled_id, image_source.clone()) + .expect("added cockroach zone"); + builder + .sled_add_zone_clickhouse_keeper(sled_id, image_source.clone()) + .expect("added clickhouse keeper zone"); + builder + .sled_add_zone_clickhouse_server(sled_id, image_source.clone()) + .expect("added clickhouse server zone"); + + // Collect one zone ID of each type that has an associated + // `BlueprintExpungedZoneAccessReason` we ought to check. + let mut boundary_ntp_zone_id = None; + let mut clickhouse_keeper_zone_id = None; + let mut external_dns_zone_id = None; + let mut nexus_zone_id = None; + let mut oximeter_zone_id = None; + let mut cockroach_zone_id = None; + + for (sled_id, zone) in builder.current_in_service_zones() { + match &zone.zone_type { + BlueprintZoneType::BoundaryNtp(_) => { + boundary_ntp_zone_id = Some((sled_id, zone.id)); + } + BlueprintZoneType::ClickhouseKeeper(_) => { + clickhouse_keeper_zone_id = Some((sled_id, zone.id)); + } + BlueprintZoneType::ExternalDns(_) => { + external_dns_zone_id = Some((sled_id, zone.id)); + } + BlueprintZoneType::Nexus(_) => { + nexus_zone_id = Some((sled_id, zone.id)); + } + BlueprintZoneType::Oximeter(_) => { + oximeter_zone_id = Some((sled_id, zone.id)); + } + BlueprintZoneType::CockroachDb(_) => { + cockroach_zone_id = Some((sled_id, zone.id)); + } + _ => {} + } + } + + // Expunge and mark each zone as ready for cleanup + for (sled_id, zone_id) in [ + boundary_ntp_zone_id.expect("found boundary ntp zone"), + clickhouse_keeper_zone_id.expect("found clickhouse keeper zone"), + external_dns_zone_id.expect("found external dns zone"), + nexus_zone_id.expect("found nexus zone"), + oximeter_zone_id.expect("found oximeter zone"), + cockroach_zone_id.expect("found cockroach zone"), + ] { + builder.sled_expunge_zone(sled_id, zone_id).expect("expunged zone"); + builder + .sled_mark_expunged_zone_ready_for_cleanup(sled_id, zone_id) + .expect("marked zone ready for cleanup"); + } + + let blueprint = builder.build(BlueprintSource::Test); + // Check for pruneable zones; this test isn't primarily concerned with + // which ones actually are pruneable - it'll be a mix - but whether + // attempting to find PruneableZones did in fact check all possible + // `BlueprintExpungedZoneAccessReason`s. After this function returns, we + // interrogate the internal `reason_checker` and assert that it contains + // all known reasons. 
let pruneable_zones = PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) .await .expect("failed to find pruneable zones"); - pruneable_zones.reason_checker.assert_all_reasons_checked(); + let mut unchecked = BTreeSet::new(); + + for reason in BlueprintExpungedZoneAccessReason::iter() { + if !pruneable_zones.reason_checker.reasons_checked.contains(&reason) + { + unchecked.insert(reason); + } + } + + assert!( + unchecked.is_empty(), + "PruneableZones failed to consider some \ + `BlueprintExpungedZoneAccessReason`s: {unchecked:?}" + ); db.terminate().await; logctx.cleanup_successful(); From b2dc8f22839df5188883a91c70c317d09f8f832b Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 30 Jan 2026 17:18:41 -0500 Subject: [PATCH 19/27] add test_oximeter_pruneable_reasons() --- .../src/expunged_and_unreferenced.rs | 180 ++++++++++++++++++ 1 file changed, 180 insertions(+) diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index 9a665268f3f..9bad58aed8b 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -618,6 +618,7 @@ impl BlueprintExpungedZoneAccessReasonChecker { #[cfg(test)] mod tests { use super::*; + use nexus_db_queries::db::datastore::CollectorReassignment; use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; use nexus_reconfigurator_planning::example::ExampleSystemBuilder; use nexus_reconfigurator_planning::planner::PlannerRng; @@ -626,7 +627,11 @@ mod tests { use nexus_types::deployment::BlueprintZoneImageSource; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::SledFilter; + use nexus_types::internal_api::params::OximeterInfo; + use omicron_common::api::internal::nexus::ProducerEndpoint; + use omicron_common::api::internal::nexus::ProducerKind; use omicron_test_utils::dev; + use std::time::Duration; #[tokio::test] async fn test_pruneable_zones_reason_checker() { @@ -755,4 +760,179 @@ mod tests { db.terminate().await; logctx.cleanup_successful(); } + + #[tokio::test] + async fn test_oximeter_pruneable_reasons() { + const TEST_NAME: &str = "test_oximeter_pruneable_reasons"; + + let logctx = dev::test_setup_log(TEST_NAME); + let log = &logctx.log; + let db = TestDatabase::new_with_datastore(log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + // Start with the base example system and build from there. + let (example, initial_blueprint) = + ExampleSystemBuilder::new(log, TEST_NAME).build(); + let mut builder = BlueprintBuilder::new_based_on( + log, + &initial_blueprint, + TEST_NAME, + PlannerRng::from_seed(("builder", TEST_NAME)), + ) + .expect("failed to create builder"); + + // Pick the first sled to add an Oximeter zone + let sled_id = example + .input + .all_sled_ids(SledFilter::Commissioned) + .next() + .expect("at least one sled"); + + // Add an Oximeter zone, then expunge it. 
+ let image_source = BlueprintZoneImageSource::InstallDataset; + builder + .sled_add_zone_oximeter(sled_id, image_source) + .expect("added oximeter zone"); + let oximeter_zone_id = builder + .current_in_service_zones() + .find_map(|(_, zone)| { + matches!(zone.zone_type, BlueprintZoneType::Oximeter(_)) + .then_some(zone.id) + }) + .expect("found oximeter zone"); + builder + .sled_expunge_zone(sled_id, oximeter_zone_id) + .expect("expunged zone"); + builder + .sled_mark_expunged_zone_ready_for_cleanup( + sled_id, + oximeter_zone_id, + ) + .expect("marked zone ready for cleanup"); + + let blueprint = builder.build(BlueprintSource::Test); + + // Check that the zone is pruneable (no oximeter record, no producers) + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + pruneable_zones.pruneable_zones.contains(&oximeter_zone_id), + "oximeter zone should be pruneable when there's no \ + oximeter record and no producers" + ); + + // Now insert an oximeter collector record for this zone + // This simulates the case where cleanup hasn't yet happened + datastore + .oximeter_create( + opctx, + &nexus_db_model::OximeterInfo::new(&OximeterInfo { + collector_id: oximeter_zone_id.into_untyped_uuid(), + address: "[::1]:12223".parse().unwrap(), + }), + ) + .await + .expect("failed to insert oximeter record"); + + // Check that the zone is no longer pruneable + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + !pruneable_zones.pruneable_zones.contains(&oximeter_zone_id), + "oximeter zone should not be pruneable when there's an \ + oximeter record" + ); + + // Now add producers assigned to this oximeter + use omicron_uuid_kinds::GenericUuid; + let producer1 = ProducerEndpoint { + id: OmicronZoneUuid::new_v4().into_untyped_uuid(), + kind: ProducerKind::Service, + address: "[::1]:12345".parse().unwrap(), + interval: Duration::from_secs(30), + }; + let producer2 = ProducerEndpoint { + id: OmicronZoneUuid::new_v4().into_untyped_uuid(), + kind: ProducerKind::Service, + address: "[::1]:12346".parse().unwrap(), + interval: Duration::from_secs(30), + }; + + datastore + .producer_endpoint_upsert_and_assign(opctx, &producer1) + .await + .expect("failed to insert producer 1"); + datastore + .producer_endpoint_upsert_and_assign(opctx, &producer2) + .await + .expect("failed to insert producer 2"); + + // Mark the oximeter as expunged in the datastore + datastore + .oximeter_expunge(opctx, oximeter_zone_id.into_untyped_uuid()) + .await + .expect("failed to expunge oximeter"); + + // Check that the zone is still not pruneable (producers are assigned) + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + !pruneable_zones.pruneable_zones.contains(&oximeter_zone_id), + "oximeter zone should not be pruneable when it has \ + producers assigned, even if expunged" + ); + + // Create a second dummy oximeter to reassign producers to + let dummy_oximeter_id = OmicronZoneUuid::new_v4().into_untyped_uuid(); + datastore + .oximeter_create( + opctx, + &nexus_db_model::OximeterInfo::new(&OximeterInfo { + collector_id: dummy_oximeter_id, + address: "[::1]:12224".parse().unwrap(), + }), + ) + .await + .expect("failed to insert dummy oximeter"); + + // Reassign the producers from the expunged oximeter to the new one + let reassignment = datastore + 
.oximeter_reassign_all_producers( + opctx, + oximeter_zone_id.into_untyped_uuid(), + ) + .await + .expect("failed to reassign producers"); + match reassignment { + CollectorReassignment::Complete(n) => { + assert_eq!(n, 2, "expected 2 producers to be reassigned"); + } + CollectorReassignment::NoCollectorsAvailable => { + panic!( + "expected producers to be reassigned, but no collectors \ + available" + ); + } + } + + // Check that the zone is now pruneable again (no producers assigned) + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + pruneable_zones.pruneable_zones.contains(&oximeter_zone_id), + "oximeter zone should be pruneable when expunged and \ + all producers have been reassigned" + ); + + db.terminate().await; + logctx.cleanup_successful(); + } } From 864dade680953a9a5b5154eb4e514ac663d6ddce Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Sat, 31 Jan 2026 10:02:08 -0500 Subject: [PATCH 20/27] add test_boundary_ntp_pruneable_reasons() --- Cargo.lock | 2 + nexus/reconfigurator/preparation/Cargo.toml | 2 + .../src/expunged_and_unreferenced.rs | 232 +++++++++++++++++- 3 files changed, 232 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5745c1dba82..e245ede0b62 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7403,6 +7403,7 @@ name = "nexus-reconfigurator-preparation" version = "0.1.0" dependencies = [ "anyhow", + "chrono", "futures", "nexus-db-model", "nexus-db-queries", @@ -7421,6 +7422,7 @@ dependencies = [ "slog-error-chain", "strum 0.27.2", "tokio", + "uuid", ] [[package]] diff --git a/nexus/reconfigurator/preparation/Cargo.toml b/nexus/reconfigurator/preparation/Cargo.toml index f8c4a8cadc2..d08765639e2 100644 --- a/nexus/reconfigurator/preparation/Cargo.toml +++ b/nexus/reconfigurator/preparation/Cargo.toml @@ -28,7 +28,9 @@ strum.workspace = true omicron-workspace-hack.workspace = true [dev-dependencies] +chrono.workspace = true nexus-reconfigurator-planning.workspace = true nexus-test-utils.workspace = true omicron-test-utils.workspace = true tokio.workspace = true +uuid.workspace = true diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index 9bad58aed8b..324d64e204f 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -618,8 +618,20 @@ impl BlueprintExpungedZoneAccessReasonChecker { #[cfg(test)] mod tests { use super::*; + use chrono::Utc; + use nexus_db_model::ExternalIp; + use nexus_db_model::IpAttachState; + use nexus_db_model::IpKind; + use nexus_db_model::Ipv4Addr as DbIpv4Addr; + use nexus_db_model::MacAddr; + use nexus_db_model::Name; + use nexus_db_model::ServiceNetworkInterface; + use nexus_db_model::ServiceNetworkInterfaceIdentity; + use nexus_db_model::SqlU8; + use nexus_db_model::SqlU16; use nexus_db_queries::db::datastore::CollectorReassignment; use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; + use nexus_reconfigurator_planning::blueprint_editor::ExternalSnatNetworkingChoice; use nexus_reconfigurator_planning::example::ExampleSystemBuilder; use nexus_reconfigurator_planning::planner::PlannerRng; use nexus_test_utils::db::TestDatabase; @@ -628,10 +640,68 @@ mod tests { use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::SledFilter; use nexus_types::internal_api::params::OximeterInfo; + use 
omicron_common::api::external; + use omicron_common::api::external::MacAddr as ExternalMacAddr; use omicron_common::api::internal::nexus::ProducerEndpoint; use omicron_common::api::internal::nexus::ProducerKind; + use omicron_common::api::internal::shared::PrivateIpConfig; use omicron_test_utils::dev; + use omicron_uuid_kinds::GenericUuid; + use std::net::Ipv4Addr; use std::time::Duration; + use uuid::Uuid; + + /// Helper to construct a test ExternalIp associated with a specific zone + fn make_external_ip_for_zone( + zone_id: OmicronZoneUuid, + ) -> nexus_db_model::ExternalIp { + ExternalIp { + id: OmicronZoneUuid::new_v4().into_untyped_uuid(), + name: None, + description: None, + time_created: Utc::now(), + time_modified: Utc::now(), + time_deleted: None, + ip_pool_id: Uuid::new_v4(), + ip_pool_range_id: Uuid::new_v4(), + is_service: true, + is_probe: false, + parent_id: Some(zone_id.into_untyped_uuid()), + kind: IpKind::SNat, + ip: "192.168.1.1/32".parse().unwrap(), + first_port: SqlU16::new(0), + last_port: SqlU16::new(65535), + project_id: None, + state: IpAttachState::Attached, + } + } + + /// Helper to construct a test ServiceNetworkInterface associated with a + /// specific zone + fn make_service_nic_for_zone( + zone_id: OmicronZoneUuid, + ) -> nexus_db_model::ServiceNetworkInterface { + ServiceNetworkInterface { + identity: ServiceNetworkInterfaceIdentity { + id: Uuid::new_v4(), + name: Name( + external::Name::try_from("test-nic".to_string()).unwrap(), + ), + description: "test NIC".to_string(), + time_created: Utc::now(), + time_modified: Utc::now(), + time_deleted: None, + }, + service_id: zone_id.into_untyped_uuid(), + vpc_id: Uuid::new_v4(), + subnet_id: Uuid::new_v4(), + mac: MacAddr(external::MacAddr([0, 0, 0, 0, 0, 0].into())), + ipv4: Some(DbIpv4Addr::from("192.168.1.2".parse().unwrap())), + ipv6: None, + slot: SqlU8::new(0), + primary: true, + } + } #[tokio::test] async fn test_pruneable_zones_reason_checker() { @@ -850,13 +920,13 @@ mod tests { // Now add producers assigned to this oximeter use omicron_uuid_kinds::GenericUuid; let producer1 = ProducerEndpoint { - id: OmicronZoneUuid::new_v4().into_untyped_uuid(), + id: Uuid::new_v4(), kind: ProducerKind::Service, address: "[::1]:12345".parse().unwrap(), interval: Duration::from_secs(30), }; let producer2 = ProducerEndpoint { - id: OmicronZoneUuid::new_v4().into_untyped_uuid(), + id: Uuid::new_v4(), kind: ProducerKind::Service, address: "[::1]:12346".parse().unwrap(), interval: Duration::from_secs(30), @@ -889,12 +959,11 @@ mod tests { ); // Create a second dummy oximeter to reassign producers to - let dummy_oximeter_id = OmicronZoneUuid::new_v4().into_untyped_uuid(); datastore .oximeter_create( opctx, &nexus_db_model::OximeterInfo::new(&OximeterInfo { - collector_id: dummy_oximeter_id, + collector_id: Uuid::new_v4(), address: "[::1]:12224".parse().unwrap(), }), ) @@ -935,4 +1004,159 @@ mod tests { db.terminate().await; logctx.cleanup_successful(); } + + #[tokio::test] + async fn test_boundary_ntp_pruneable_reasons() { + const TEST_NAME: &str = "test_boundary_ntp_pruneable_reasons"; + + let logctx = dev::test_setup_log(TEST_NAME); + let log = &logctx.log; + let db = TestDatabase::new_with_datastore(log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + // Start with the base example system with one boundary NTP zone + let (example, initial_blueprint) = + ExampleSystemBuilder::new(log, TEST_NAME) + .set_boundary_ntp_count(1) + .expect("1 is a valid count of boundary NTP zones") + .build(); + let mut 
builder = BlueprintBuilder::new_based_on( + log, + &initial_blueprint, + TEST_NAME, + PlannerRng::from_seed(("builder", TEST_NAME)), + ) + .expect("failed to create builder"); + + // Find the boundary NTP zone and expunge it. + let (sled_id, boundary_ntp_zone_id, external_ip) = builder + .current_in_service_zones() + .find_map(|(sled_id, zone)| { + if let BlueprintZoneType::BoundaryNtp(config) = &zone.zone_type + { + Some((sled_id, zone.id, config.external_ip)) + } else { + None + } + }) + .expect("found boundary ntp zone"); + builder + .sled_expunge_zone(sled_id, boundary_ntp_zone_id) + .expect("expunged zone"); + builder + .sled_mark_expunged_zone_ready_for_cleanup( + sled_id, + boundary_ntp_zone_id, + ) + .expect("marked zone ready for cleanup"); + + let blueprint = builder.build(BlueprintSource::Test); + + // Confirm that it's NOT pruneable when there's no other boundary NTP + // zone with the same upstream config. + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + !pruneable_zones.pruneable_zones.contains(&boundary_ntp_zone_id), + "boundary NTP zone should not be pruneable when there's no \ + other in-service zone with the same upstream config" + ); + + // Now add another boundary NTP zone with the same upstream config + let mut builder = BlueprintBuilder::new_based_on( + log, + &blueprint, + TEST_NAME, + PlannerRng::from_seed(("builder2", TEST_NAME)), + ) + .expect("failed to create builder"); + + // Find a different sled to add the second boundary NTP zone + let other_sled_id = example + .input + .all_sled_ids(SledFilter::Commissioned) + .find(|id| *id != sled_id) + .expect("at least two sleds"); + + builder + .sled_promote_internal_ntp_to_boundary_ntp( + other_sled_id, + BlueprintZoneImageSource::InstallDataset, + { + // Construct a ExternalSnatNetworkingChoice; steal the + // existing zone's snat_cfg + let nic_ip_config = PrivateIpConfig::new_ipv4( + Ipv4Addr::new(10, 0, 0, 1), + "10.0.0.0/24".parse().unwrap(), + ) + .expect("valid private IP config"); + ExternalSnatNetworkingChoice { + snat_cfg: external_ip.snat_cfg, + nic_ip_config, + nic_mac: ExternalMacAddr( + omicron_common::api::external::MacAddr( + [0, 0, 0, 0, 0, 1].into(), + ) + .0, + ), + } + }, + ) + .expect("promoted boundary NTP zone"); + + let blueprint = builder.build(BlueprintSource::Test); + + // Check that the zone IS now pruneable (another zone has the same + // config) + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + pruneable_zones.pruneable_zones.contains(&boundary_ntp_zone_id), + "boundary NTP zone should be pruneable when there's another \ + in-service zone with the same upstream config" + ); + + // Check that it's NOT pruneable when there's an associated external IP + // db row + let external_ip = make_external_ip_for_zone(boundary_ntp_zone_id); + let pruneable_zones = PruneableZones::new( + opctx, + datastore, + &blueprint, + &[external_ip], + &[], + ) + .await + .expect("failed to find pruneable zones"); + assert!( + !pruneable_zones.pruneable_zones.contains(&boundary_ntp_zone_id), + "boundary NTP zone should not be pruneable when there's an \ + associated external IP row" + ); + + // Check that it's NOT pruneable when there's an associated service NIC + // db row + let service_nic = make_service_nic_for_zone(boundary_ntp_zone_id); + let pruneable_zones = PruneableZones::new( + opctx, + datastore, + 
&blueprint, + &[], + &[service_nic], + ) + .await + .expect("failed to find pruneable zones"); + assert!( + !pruneable_zones.pruneable_zones.contains(&boundary_ntp_zone_id), + "boundary NTP zone should not be pruneable when there's an \ + associated service NIC row" + ); + + db.terminate().await; + logctx.cleanup_successful(); + } } From c51cd3f0e7d9a2826044e8149a633cf1240b6f19 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Sat, 31 Jan 2026 11:04:45 -0500 Subject: [PATCH 21/27] initial remaining tests --- Cargo.lock | 1 + nexus/reconfigurator/preparation/Cargo.toml | 1 + .../src/expunged_and_unreferenced.rs | 476 +++++++++++++++++- 3 files changed, 477 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index e245ede0b62..457747a947f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7404,6 +7404,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", + "clickhouse-admin-types", "futures", "nexus-db-model", "nexus-db-queries", diff --git a/nexus/reconfigurator/preparation/Cargo.toml b/nexus/reconfigurator/preparation/Cargo.toml index d08765639e2..291f78bc5da 100644 --- a/nexus/reconfigurator/preparation/Cargo.toml +++ b/nexus/reconfigurator/preparation/Cargo.toml @@ -29,6 +29,7 @@ omicron-workspace-hack.workspace = true [dev-dependencies] chrono.workspace = true +clickhouse-admin-types.workspace = true nexus-reconfigurator-planning.workspace = true nexus-test-utils.workspace = true omicron-test-utils.workspace = true diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index 324d64e204f..edc3d4821b0 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -619,6 +619,7 @@ impl BlueprintExpungedZoneAccessReasonChecker { mod tests { use super::*; use chrono::Utc; + use clickhouse_admin_types::keeper::KeeperId; use nexus_db_model::ExternalIp; use nexus_db_model::IpAttachState; use nexus_db_model::IpKind; @@ -631,6 +632,7 @@ mod tests { use nexus_db_model::SqlU16; use nexus_db_queries::db::datastore::CollectorReassignment; use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; + use nexus_reconfigurator_planning::blueprint_editor::ExternalNetworkingChoice; use nexus_reconfigurator_planning::blueprint_editor::ExternalSnatNetworkingChoice; use nexus_reconfigurator_planning::example::ExampleSystemBuilder; use nexus_reconfigurator_planning::planner::PlannerRng; @@ -638,6 +640,7 @@ mod tests { use nexus_types::deployment::BlueprintSource; use nexus_types::deployment::BlueprintZoneImageSource; use nexus_types::deployment::BlueprintZoneType; + use nexus_types::deployment::ClickhouseClusterConfig; use nexus_types::deployment::SledFilter; use nexus_types::internal_api::params::OximeterInfo; use omicron_common::api::external; @@ -696,7 +699,9 @@ mod tests { vpc_id: Uuid::new_v4(), subnet_id: Uuid::new_v4(), mac: MacAddr(external::MacAddr([0, 0, 0, 0, 0, 0].into())), - ipv4: Some(DbIpv4Addr::from("192.168.1.2".parse().unwrap())), + ipv4: Some(DbIpv4Addr::from( + "192.168.1.2".parse::().unwrap(), + )), ipv6: None, slot: SqlU8::new(0), primary: true, @@ -1159,4 +1164,473 @@ mod tests { db.terminate().await; logctx.cleanup_successful(); } + + #[tokio::test] + async fn test_nexus_pruneable_reasons() { + const TEST_NAME: &str = "test_nexus_pruneable_reasons"; + + let logctx = dev::test_setup_log(TEST_NAME); + let log = &logctx.log; + let db = TestDatabase::new_with_datastore(log).await; + 
let (opctx, datastore) = (db.opctx(), db.datastore()); + + // Start with the base example system (which includes Nexus zones) + let (_example, initial_blueprint) = + ExampleSystemBuilder::new(log, TEST_NAME).build(); + let mut builder = BlueprintBuilder::new_based_on( + log, + &initial_blueprint, + TEST_NAME, + PlannerRng::from_seed(("builder", TEST_NAME)), + ) + .expect("failed to create builder"); + + // Find a Nexus zone and expunge it + let (sled_id, nexus_zone_id) = builder + .current_in_service_zones() + .find_map(|(sled_id, zone)| { + matches!(zone.zone_type, BlueprintZoneType::Nexus(_)) + .then_some((sled_id, zone.id)) + }) + .expect("found nexus zone"); + builder + .sled_expunge_zone(sled_id, nexus_zone_id) + .expect("expunged zone"); + builder + .sled_mark_expunged_zone_ready_for_cleanup(sled_id, nexus_zone_id) + .expect("marked zone ready for cleanup"); + + let blueprint = builder.build(BlueprintSource::Test); + + // Check that the zone IS pruneable when no database reasons exist + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + pruneable_zones.pruneable_zones.contains(&nexus_zone_id), + "nexus zone should be pruneable when no database reasons exist" + ); + + // Check that it's NOT pruneable when there's an associated external IP + // db row + let external_ip = make_external_ip_for_zone(nexus_zone_id); + let pruneable_zones = PruneableZones::new( + opctx, + datastore, + &blueprint, + &[external_ip], + &[], + ) + .await + .expect("failed to find pruneable zones"); + assert!( + !pruneable_zones.pruneable_zones.contains(&nexus_zone_id), + "nexus zone should not be pruneable when there's an \ + associated external IP row" + ); + + // Check that it's NOT pruneable when there's an associated service NIC + // db row + let service_nic = make_service_nic_for_zone(nexus_zone_id); + let pruneable_zones = PruneableZones::new( + opctx, + datastore, + &blueprint, + &[], + &[service_nic], + ) + .await + .expect("failed to find pruneable zones"); + assert!( + !pruneable_zones.pruneable_zones.contains(&nexus_zone_id), + "nexus zone should not be pruneable when there's an \ + associated service NIC row" + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_external_dns_pruneable_reasons() { + const TEST_NAME: &str = "test_external_dns_pruneable_reasons"; + + let logctx = dev::test_setup_log(TEST_NAME); + let log = &logctx.log; + let db = TestDatabase::new_with_datastore(log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + // Start with the base example system with one external DNS zone + let (example, initial_blueprint) = + ExampleSystemBuilder::new(log, TEST_NAME) + .external_dns_count(1) + .expect("1 is a valid count of external DNS zones") + .build(); + let mut builder = BlueprintBuilder::new_based_on( + log, + &initial_blueprint, + TEST_NAME, + PlannerRng::from_seed(("builder", TEST_NAME)), + ) + .expect("failed to create builder"); + + // Find the external DNS zone and expunge it + let (sled_id, external_dns_zone_id, dns_address) = builder + .current_in_service_zones() + .find_map(|(sled_id, zone)| { + if let BlueprintZoneType::ExternalDns(config) = &zone.zone_type + { + Some((sled_id, zone.id, config.dns_address)) + } else { + None + } + }) + .expect("found external dns zone"); + builder + .sled_expunge_zone(sled_id, external_dns_zone_id) + .expect("expunged zone"); + builder + .sled_mark_expunged_zone_ready_for_cleanup( + 
sled_id, + external_dns_zone_id, + ) + .expect("marked zone ready for cleanup"); + + let blueprint = builder.build(BlueprintSource::Test); + + // Confirm that it's NOT pruneable when there's no other external DNS + // zone with the same IP (meaning the IP hasn't been reassigned yet). + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + !pruneable_zones.pruneable_zones.contains(&external_dns_zone_id), + "external DNS zone should not be pruneable when its IP \ + hasn't been reassigned to another in-service zone" + ); + + // Now add another external DNS zone with the same IP address + let mut builder = BlueprintBuilder::new_based_on( + log, + &blueprint, + TEST_NAME, + PlannerRng::from_seed(("builder2", TEST_NAME)), + ) + .expect("failed to create builder"); + + // Find a different sled to add the second external DNS zone + let other_sled_id = example + .input + .all_sled_ids(SledFilter::Commissioned) + .find(|id| *id != sled_id) + .expect("at least two sleds"); + + // Construct an ExternalNetworkingChoice with the same external IP + let external_ip_choice = { + let nic_ip_config = PrivateIpConfig::new_ipv4( + Ipv4Addr::new(10, 0, 0, 2), + "10.0.0.0/24".parse().unwrap(), + ) + .expect("valid private IP config"); + ExternalNetworkingChoice { + external_ip: dns_address.addr.ip(), + nic_ip_config, + nic_mac: ExternalMacAddr( + omicron_common::api::external::MacAddr( + [0, 0, 0, 0, 0, 2].into(), + ) + .0, + ), + } + }; + + builder + .sled_add_zone_external_dns( + other_sled_id, + BlueprintZoneImageSource::InstallDataset, + external_ip_choice, + ) + .expect("added external DNS zone"); + + let blueprint = builder.build(BlueprintSource::Test); + + // Check that the zone IS now pruneable (the IP has been reassigned to + // another in-service zone) + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + pruneable_zones.pruneable_zones.contains(&external_dns_zone_id), + "external DNS zone should be pruneable when its IP has been \ + reassigned to another in-service zone" + ); + + // Check that it's NOT pruneable when there's an associated external IP + // db row + let external_ip = make_external_ip_for_zone(external_dns_zone_id); + let pruneable_zones = PruneableZones::new( + opctx, + datastore, + &blueprint, + &[external_ip], + &[], + ) + .await + .expect("failed to find pruneable zones"); + assert!( + !pruneable_zones.pruneable_zones.contains(&external_dns_zone_id), + "external DNS zone should not be pruneable when there's an \ + associated external IP row" + ); + + // Check that it's NOT pruneable when there's an associated service NIC + // db row + let service_nic = make_service_nic_for_zone(external_dns_zone_id); + let pruneable_zones = PruneableZones::new( + opctx, + datastore, + &blueprint, + &[], + &[service_nic], + ) + .await + .expect("failed to find pruneable zones"); + assert!( + !pruneable_zones.pruneable_zones.contains(&external_dns_zone_id), + "external DNS zone should not be pruneable when there's an \ + associated service NIC row" + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_clickhouse_keeper_pruneable_reasons() { + const TEST_NAME: &str = "test_clickhouse_keeper_pruneable_reasons"; + + let logctx = dev::test_setup_log(TEST_NAME); + let log = &logctx.log; + let db = TestDatabase::new_with_datastore(log).await; + let (opctx, datastore) = 
(db.opctx(), db.datastore()); + + // Start with the base example system and add a clickhouse keeper zone + let (_example, initial_blueprint) = + ExampleSystemBuilder::new(log, TEST_NAME).build(); + let mut builder = BlueprintBuilder::new_based_on( + log, + &initial_blueprint, + TEST_NAME, + PlannerRng::from_seed(("builder", TEST_NAME)), + ) + .expect("failed to create builder"); + + // Find a sled and add a clickhouse keeper zone + let sled_id = builder + .current_in_service_zones() + .map(|(sled_id, _)| sled_id) + .next() + .expect("at least one sled"); + + builder + .sled_add_zone_clickhouse_keeper( + sled_id, + BlueprintZoneImageSource::InstallDataset, + ) + .expect("added clickhouse keeper zone"); + + // Find the keeper zone we just added + let keeper_zone_id = builder + .current_in_service_zones() + .find_map(|(_, zone)| { + matches!(zone.zone_type, BlueprintZoneType::ClickhouseKeeper(_)) + .then_some(zone.id) + }) + .expect("found clickhouse keeper zone"); + + // Manually add the keeper to the clickhouse cluster config + // (normally the planner does this, but we need it for testing) + let config = + builder.clickhouse_cluster_config().cloned().unwrap_or_else(|| { + ClickhouseClusterConfig::new( + "test_cluster".to_string(), + "test_secret".to_string(), + ) + }); + let mut new_config = config.clone(); + new_config.keepers.insert(keeper_zone_id, KeeperId(1)); + builder.set_clickhouse_cluster_config(Some(new_config)); + + // Expunge the keeper zone + builder + .sled_expunge_zone(sled_id, keeper_zone_id) + .expect("expunged zone"); + builder + .sled_mark_expunged_zone_ready_for_cleanup(sled_id, keeper_zone_id) + .expect("marked zone ready for cleanup"); + + let blueprint = builder.build(BlueprintSource::Test); + + // Confirm that it's NOT pruneable when it's still in the clickhouse + // cluster config + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + !pruneable_zones.pruneable_zones.contains(&keeper_zone_id), + "clickhouse keeper zone should not be pruneable when it's \ + still in the clickhouse cluster config" + ); + + // Now remove it from the clickhouse cluster config + let mut builder = BlueprintBuilder::new_based_on( + log, + &blueprint, + TEST_NAME, + PlannerRng::from_seed(("builder2", TEST_NAME)), + ) + .expect("failed to create builder"); + + // Get the current config, remove the keeper, and set it back + let mut config = builder + .clickhouse_cluster_config() + .expect("clickhouse cluster config exists") + .clone(); + config.keepers.remove(&keeper_zone_id); + builder.set_clickhouse_cluster_config(Some(config)); + + let blueprint = builder.build(BlueprintSource::Test); + + // Check that the zone IS now pruneable (removed from cluster config) + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + pruneable_zones.pruneable_zones.contains(&keeper_zone_id), + "clickhouse keeper zone should be pruneable when it's been \ + removed from the clickhouse cluster config" + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_clickhouse_server_pruneable_reasons() { + const TEST_NAME: &str = "test_clickhouse_server_pruneable_reasons"; + + let logctx = dev::test_setup_log(TEST_NAME); + let log = &logctx.log; + let db = TestDatabase::new_with_datastore(log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + // Start with the base 
example system and add a clickhouse server zone + let (_example, initial_blueprint) = + ExampleSystemBuilder::new(log, TEST_NAME).build(); + let mut builder = BlueprintBuilder::new_based_on( + log, + &initial_blueprint, + TEST_NAME, + PlannerRng::from_seed(("builder", TEST_NAME)), + ) + .expect("failed to create builder"); + + // Find a sled and add a clickhouse server zone + let sled_id = builder + .current_in_service_zones() + .map(|(sled_id, _)| sled_id) + .next() + .expect("at least one sled"); + + builder + .sled_add_zone_clickhouse_server( + sled_id, + BlueprintZoneImageSource::InstallDataset, + ) + .expect("added clickhouse server zone"); + + // Find the server zone we just added + let server_zone_id = builder + .current_in_service_zones() + .find_map(|(_, zone)| { + matches!(zone.zone_type, BlueprintZoneType::ClickhouseServer(_)) + .then_some(zone.id) + }) + .expect("found clickhouse server zone"); + + // Manually add the server to the clickhouse cluster config + // (normally the planner does this, but we need it for testing) + use clickhouse_admin_types::server::ServerId; + use nexus_types::deployment::ClickhouseClusterConfig; + let config = + builder.clickhouse_cluster_config().cloned().unwrap_or_else(|| { + ClickhouseClusterConfig::new( + "test_cluster".to_string(), + "test_secret".to_string(), + ) + }); + let mut new_config = config.clone(); + new_config.servers.insert(server_zone_id, ServerId(1)); + builder.set_clickhouse_cluster_config(Some(new_config)); + + // Expunge the server zone + builder + .sled_expunge_zone(sled_id, server_zone_id) + .expect("expunged zone"); + builder + .sled_mark_expunged_zone_ready_for_cleanup(sled_id, server_zone_id) + .expect("marked zone ready for cleanup"); + + let blueprint = builder.build(BlueprintSource::Test); + + // Confirm that it's NOT pruneable when it's still in the clickhouse + // cluster config + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + !pruneable_zones.pruneable_zones.contains(&server_zone_id), + "clickhouse server zone should not be pruneable when it's \ + still in the clickhouse cluster config" + ); + + // Now remove it from the clickhouse cluster config + let mut builder = BlueprintBuilder::new_based_on( + log, + &blueprint, + TEST_NAME, + PlannerRng::from_seed(("builder2", TEST_NAME)), + ) + .expect("failed to create builder"); + + // Get the current config, remove the server, and set it back + let mut config = builder + .clickhouse_cluster_config() + .expect("clickhouse cluster config exists") + .clone(); + config.servers.remove(&server_zone_id); + builder.set_clickhouse_cluster_config(Some(config)); + + let blueprint = builder.build(BlueprintSource::Test); + + // Check that the zone IS now pruneable (removed from cluster config) + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + pruneable_zones.pruneable_zones.contains(&server_zone_id), + "clickhouse server zone should be pruneable when it's been \ + removed from the clickhouse cluster config" + ); + + db.terminate().await; + logctx.cleanup_successful(); + } } From 51298b462100a960c7099895b7731ce3f900a62c Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 2 Feb 2026 10:27:29 -0500 Subject: [PATCH 22/27] flesh out Nexus test --- Cargo.lock | 3 + nexus/reconfigurator/preparation/Cargo.toml | 3 + .../src/expunged_and_unreferenced.rs | 149 +++++++++++++++++- 3 files 
changed, 150 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 457747a947f..525e94fdb7f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7406,6 +7406,7 @@ dependencies = [ "chrono", "clickhouse-admin-types", "futures", + "nexus-auth", "nexus-db-model", "nexus-db-queries", "nexus-reconfigurator-planning", @@ -7417,10 +7418,12 @@ dependencies = [ "omicron-uuid-kinds", "omicron-workspace-hack", "pq-sys", + "serde_json", "sled-agent-types", "sled-hardware-types", "slog", "slog-error-chain", + "steno", "strum 0.27.2", "tokio", "uuid", diff --git a/nexus/reconfigurator/preparation/Cargo.toml b/nexus/reconfigurator/preparation/Cargo.toml index 291f78bc5da..fda858ddb6a 100644 --- a/nexus/reconfigurator/preparation/Cargo.toml +++ b/nexus/reconfigurator/preparation/Cargo.toml @@ -30,8 +30,11 @@ omicron-workspace-hack.workspace = true [dev-dependencies] chrono.workspace = true clickhouse-admin-types.workspace = true +nexus-auth.workspace = true nexus-reconfigurator-planning.workspace = true nexus-test-utils.workspace = true omicron-test-utils.workspace = true +serde_json.workspace = true +steno.workspace = true tokio.workspace = true uuid.workspace = true diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index edc3d4821b0..68a0dd117cd 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -620,16 +620,22 @@ mod tests { use super::*; use chrono::Utc; use clickhouse_admin_types::keeper::KeeperId; + use clickhouse_admin_types::server::ServerId; + use nexus_auth::authz; use nexus_db_model::ExternalIp; use nexus_db_model::IpAttachState; use nexus_db_model::IpKind; use nexus_db_model::Ipv4Addr as DbIpv4Addr; use nexus_db_model::MacAddr; use nexus_db_model::Name; + use nexus_db_model::RendezvousDebugDataset; use nexus_db_model::ServiceNetworkInterface; use nexus_db_model::ServiceNetworkInterfaceIdentity; use nexus_db_model::SqlU8; use nexus_db_model::SqlU16; + use nexus_db_model::SupportBundleState; + use nexus_db_model::saga_types::Saga; + use nexus_db_model::saga_types::SagaState; use nexus_db_queries::db::datastore::CollectorReassignment; use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; use nexus_reconfigurator_planning::blueprint_editor::ExternalNetworkingChoice; @@ -644,12 +650,14 @@ mod tests { use nexus_types::deployment::SledFilter; use nexus_types::internal_api::params::OximeterInfo; use omicron_common::api::external; + use omicron_common::api::external::LookupType; use omicron_common::api::external::MacAddr as ExternalMacAddr; use omicron_common::api::internal::nexus::ProducerEndpoint; use omicron_common::api::internal::nexus::ProducerKind; use omicron_common::api::internal::shared::PrivateIpConfig; use omicron_test_utils::dev; - use omicron_uuid_kinds::GenericUuid; + use omicron_uuid_kinds::DatasetUuid; + use omicron_uuid_kinds::ZpoolUuid; use std::net::Ipv4Addr; use std::time::Duration; use uuid::Uuid; @@ -923,7 +931,6 @@ mod tests { ); // Now add producers assigned to this oximeter - use omicron_uuid_kinds::GenericUuid; let producer1 = ProducerEndpoint { id: Uuid::new_v4(), kind: ProducerKind::Service, @@ -1177,6 +1184,7 @@ mod tests { // Start with the base example system (which includes Nexus zones) let (_example, initial_blueprint) = ExampleSystemBuilder::new(log, TEST_NAME).build(); + let mut builder = BlueprintBuilder::new_based_on( log, 
&initial_blueprint, @@ -1202,7 +1210,8 @@ mod tests { let blueprint = builder.build(BlueprintSource::Test); - // Check that the zone IS pruneable when no database reasons exist + // Check that the zone IS pruneable (we haven't added anything to the db + // that would make it non-pruneable). let pruneable_zones = PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) .await @@ -1212,6 +1221,138 @@ mod tests { "nexus zone should be pruneable when no database reasons exist" ); + // Check BlueprintExpungedZoneAccessReason::NexusDeleteMetadataRecord: + // Insert a database metadata record for this Nexus and verify the zone + // is non-pruneable. + let conn = datastore.pool_connection_for_tests().await.unwrap(); + datastore + .initialize_nexus_access_from_blueprint_on_connection( + &conn, + vec![nexus_zone_id], + ) + .await + .expect("failed to create database nexus access"); + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + !pruneable_zones.pruneable_zones.contains(&nexus_zone_id), + "nexus zone should not be pruneable when there's a \ + database metadata record" + ); + + // Remove the metadata record and verify the zone is pruneable again + datastore + .database_nexus_access_delete(opctx, nexus_zone_id) + .await + .expect("failed to delete nexus access"); + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + pruneable_zones.pruneable_zones.contains(&nexus_zone_id), + "nexus zone should be pruneable after metadata record removed" + ); + + // Check BlueprintExpungedZoneAccessReason::NexusSagaReassignment + // Create a saga assigned to this Nexus and verify this makes the zone + // non-pruneable. + let saga_id = steno::SagaId(Uuid::new_v4()); + let saga_params = steno::SagaCreateParams { + id: saga_id, + name: steno::SagaName::new("test_saga"), + dag: serde_json::json!({}), + state: steno::SagaCachedState::Running, + }; + let saga = + Saga::new(nexus_zone_id.into_untyped_uuid().into(), saga_params); + datastore.saga_create(&saga).await.expect("failed to create saga"); + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + !pruneable_zones.pruneable_zones.contains(&nexus_zone_id), + "nexus zone should not be pruneable when there's an \ + unfinished saga assigned" + ); + + // Mark the saga as abandoned and verify the zone is pruneable again. + datastore + .saga_update_state( + saga_id, + SagaState::Abandoned, + nexus_zone_id.into_untyped_uuid().into(), + ) + .await + .expect("failed to mark saga abandoned"); + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + pruneable_zones.pruneable_zones.contains(&nexus_zone_id), + "nexus zone should be pruneable after saga is abandoned" + ); + + // Check BlueprintExpungedZoneAccessReason::NexusSupportBundleReassign + // Create a debug dataset so support_bundle_create can succeed, then + // create a support bundle, and verify that makes the zone + // non-pruneable. 
+ let dataset = RendezvousDebugDataset::new( + DatasetUuid::new_v4(), + ZpoolUuid::new_v4(), + blueprint.id, + ); + datastore + .debug_dataset_insert_if_not_exists(opctx, dataset) + .await + .expect("failed to create debug dataset"); + let bundle = datastore + .support_bundle_create( + opctx, + "test support bundle", + nexus_zone_id, + None, + ) + .await + .expect("failed to create support bundle"); + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + !pruneable_zones.pruneable_zones.contains(&nexus_zone_id), + "nexus zone should not be pruneable when there's an \ + assigned support bundle" + ); + + // Mark the support bundle as failed and verify the zone is pruneable + // again. + let authz_bundle = authz::SupportBundle::new( + authz::FLEET, + bundle.id(), + LookupType::by_id(bundle.id()), + ); + datastore + .support_bundle_update( + opctx, + &authz_bundle, + SupportBundleState::Failed, + ) + .await + .expect("failed to mark support bundle as failed"); + let pruneable_zones = + PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + .await + .expect("failed to find pruneable zones"); + assert!( + pruneable_zones.pruneable_zones.contains(&nexus_zone_id), + "nexus zone should be pruneable after support bundle is failed" + ); + // Check that it's NOT pruneable when there's an associated external IP // db row let external_ip = make_external_ip_for_zone(nexus_zone_id); @@ -1565,8 +1706,6 @@ mod tests { // Manually add the server to the clickhouse cluster config // (normally the planner does this, but we need it for testing) - use clickhouse_admin_types::server::ServerId; - use nexus_types::deployment::ClickhouseClusterConfig; let config = builder.clickhouse_cluster_config().cloned().unwrap_or_else(|| { ClickhouseClusterConfig::new( From 06febb9fe232c46e460813beac18859d666f7f51 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 2 Feb 2026 10:35:06 -0500 Subject: [PATCH 23/27] minor cleanup (comments, enum variant name) --- .../src/expunged_and_unreferenced.rs | 36 +++++++++---------- nexus/types/src/deployment.rs | 7 ++-- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs index 68a0dd117cd..aa1007dc568 100644 --- a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs +++ b/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs @@ -2,19 +2,20 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Helpers for identifying when expunged zones are no longer referenced in the -//! database. +//! Helpers for identifying when expunged zones can be safely pruned from the +//! blueprint (i.e., permanently dropped from being carried forward to +//! subsequent blueprints). //! //! When a zone is first expunged, the expunged zone initially remains in the //! blueprint. Two high-level conditions must be satisfied before it's safe to -//! prune an expunged zone from the blueprint (i.e., delete it entirely): +//! prune an expunged zone from the blueprint: //! //! 1. We must know the zone is not running and will never run again. The //! typical case of this confirmation is that the sled-agent responsible for //! running the zone has confirmed the zone is no longer running and that //! 
it's ledgered an `OmicronSledConfig` with a generation past the point in -//! which the zone was expunged. The uncommon case of this confirmation is -//! that the sled where this zone ran has been expunged. +//! which the zone was expunged. The atypical (but still valid) case of this +//! confirmation is that the sled where this zone ran has been expunged. //! 2. Any cleanup work that operates on expunged zones must be complete. This //! is zone-type-specific. Some zone types have no cleanup work at all and //! can be pruned as soon as the first condition is satisfied. Others have @@ -28,10 +29,9 @@ //! The [`BlueprintExpungedZoneAccessReason`] enum tracks a variant for every //! reason a caller wants to access the expunged zones of a blueprint, including //! all known cleanup actions. For each zone type, if the zone-type-specific -//! cleanup work is complete, we included the zone ID in the "expunged and -//! unreferenced" zone set in the `PlanningInput`. The planner can cheaply act -//! on this set: for every zone ID present, it can safely prune it from the -//! blueprint (i.e., do not include it in the child blueprint it emits). +//! cleanup work is complete, we included the zone ID in the `pruneable_zones` +//! zone set in the `PlanningInput`. The planner can cheaply act on this set: +//! for every zone ID present, it can safely prune it from the blueprint. use nexus_db_model::SupportBundleState; use nexus_db_queries::context::OpContext; @@ -98,15 +98,15 @@ impl PruneableZones { for (_, zone) in parent_blueprint.expunged_zones( ZoneRunningStatus::Shutdown, - Reason::PlanningInputDetermineUnreferenced, + Reason::PlanningInputFindPruneable, ) { // Check // BlueprintExpungedZoneAccessReason::DeallocateExternalNetworkingResources; // this reason applies to multiple zone types, so we check it for // them all. (Technically we only need to check it for zones that // _can_ have external networking, but it's fine to check ones that - // don't, and now we don't have to keep a list here of which zone - // types could have external networking rows present.) + // don't, and that way we don't have to keep a list here of which + // zone types could have external networking rows present.) reason_checker.add_reason_checked( Reason::DeallocateExternalNetworkingResources, ); @@ -392,6 +392,10 @@ async fn is_oximeter_pruneable( Ok(assigned_producers.is_empty()) } +// Helper function to construct a `DataPageParams` that only asks for a single +// item. In several checks above, we only care "does any item exist" - we can +// get this via any paginated datastore function and limit the work to "just one +// please". fn single_item_pagparams() -> DataPageParams<'static, T> { DataPageParams { marker: None, @@ -560,11 +564,7 @@ impl BlueprintExpungedZoneAccessReasonChecker { // Checked by is_multinode_clickhouse_pruneable() Reason::ClickhouseKeeperServerConfigIps => {} - // TODO-john FIXME - // NOT CHECKED: find_expunged_and_unreferenced_zones() will - // never consider a cockroach node "unreferenced", because we - // have currently disabled decommissioning (see - // https://github.com/oxidecomputer/omicron/issues/8447). + // Checked by is_cockroach_pruneable() Reason::CockroachDecommission => {} // Checked directly by the main loop in `PruneableZones::new()`. 
@@ -591,7 +591,7 @@ impl BlueprintExpungedZoneAccessReasonChecker { // Planner-related reasons that don't need to be checked (see // `BlueprintExpungedZoneAccessReason` for specifics) | Reason::PlannerCheckReadyForCleanup - | Reason::PlanningInputDetermineUnreferenced + | Reason::PlanningInputFindPruneable | Reason::PlanningInputExpungedZoneGuard // Test / development reasons that don't need to be checked diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 5fdf735095d..b13c1d936c1 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -861,12 +861,11 @@ pub enum BlueprintExpungedZoneAccessReason { PlannerCheckReadyForCleanup, /// When constructing a [`PlanningInput`], it needs to loop over the - /// blueprint's expunged zones in order to know which zones it needs to - /// check for references, so that it can assemble the "expunged and - /// unreferenced" zone IDs. + /// blueprint's expunged zones in order to know which zones may be + /// pruneable. /// /// The planner does not need to account for this when pruning zones. - PlanningInputDetermineUnreferenced, + PlanningInputFindPruneable, /// When constructing a [`PlanningInput`], its builder has a guard that any /// "expunged and unreferenced" zone ID actually is expunged. From ff208bc16c024de551337e376660b31bd9440d38 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 2 Feb 2026 10:38:09 -0500 Subject: [PATCH 24/27] module rename --- nexus/reconfigurator/preparation/src/lib.rs | 4 ++-- .../src/{expunged_and_unreferenced.rs => pruneable_zones.rs} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename nexus/reconfigurator/preparation/src/{expunged_and_unreferenced.rs => pruneable_zones.rs} (100%) diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index bd19b787490..7dacbecb8d5 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -63,9 +63,9 @@ use std::collections::BTreeSet; use std::net::IpAddr; use std::sync::Arc; -mod expunged_and_unreferenced; +mod pruneable_zones; -use expunged_and_unreferenced::PruneableZones; +use pruneable_zones::PruneableZones; /// Given various pieces of database state that go into the blueprint planning /// process, produce a `PlanningInput` object encapsulating what the planner diff --git a/nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs b/nexus/reconfigurator/preparation/src/pruneable_zones.rs similarity index 100% rename from nexus/reconfigurator/preparation/src/expunged_and_unreferenced.rs rename to nexus/reconfigurator/preparation/src/pruneable_zones.rs From dcdc77236adbfab190ba205bef82eac8fb935565 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 2 Feb 2026 10:43:12 -0500 Subject: [PATCH 25/27] expunged_and_unreferenced -> pruneable --- nexus/reconfigurator/preparation/src/lib.rs | 24 ++++++------ nexus/types/src/deployment.rs | 2 +- nexus/types/src/deployment/planning_input.rs | 41 +++++++++----------- 3 files changed, 30 insertions(+), 37 deletions(-) diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index 7dacbecb8d5..69eea0cd35c 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -96,7 +96,7 @@ pub struct PlanningInputFromDb<'a> { pub planner_config: PlannerConfig, pub active_nexus_zones: BTreeSet, pub not_yet_nexus_zones: BTreeSet, - pub expunged_and_unreferenced_zones: BTreeSet, + pub 
pruneable_zones: BTreeSet, + pub log: &'a Logger, } @@ -274,7 +274,7 @@ impl PlanningInputFromDb<'_> { active_nexus_zones.into_iter().map(|n| n.nexus_id()).collect(); let not_yet_nexus_zones = not_yet_nexus_zones.into_iter().map(|n| n.nexus_id()).collect(); - let expunged_and_unreferenced_zones = PruneableZones::new( + let pruneable_zones = PruneableZones::new( opctx, datastore, &parent_blueprint, @@ -311,7 +311,7 @@ impl PlanningInputFromDb<'_> { planner_config, active_nexus_zones, not_yet_nexus_zones, - expunged_and_unreferenced_zones, + pruneable_zones, } .build() .internal_context("assembling planning_input")?; @@ -471,16 +471,14 @@ impl PlanningInputFromDb<'_> { })?; } - for &zone_id in &self.expunged_and_unreferenced_zones { - builder.insert_expunged_and_unreferenced_zone(zone_id).map_err( - |e| { - Error::internal_error(&format!( - "unexpectedly failed to add expunged and unreferenced \ - zone ID {zone_id} to planning input: {}", - InlineErrorChain::new(&e), - )) - }, - )?; + for &zone_id in &self.pruneable_zones { + builder.insert_pruneable_zone(zone_id).map_err(|e| { + Error::internal_error(&format!( + "unexpectedly failed to add pruneable zone ID {zone_id} \ + to planning input: {}", + InlineErrorChain::new(&e), + )) + })?; } Ok(builder.build()) diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index b13c1d936c1..154d77aa28c 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -868,7 +868,7 @@ pub enum BlueprintExpungedZoneAccessReason { PlanningInputFindPruneable, /// When constructing a [`PlanningInput`], its builder has a guard that any - /// "expunged and unreferenced" zone ID actually is expunged. + /// supposedly-pruneable zone ID actually is expunged. /// /// This guard is implemented by asking the parent blueprint for its list of /// expunged zones. The planner does not need to account for this when diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index 3da996d12b1..ff66bc767fe 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -152,11 +152,12 @@ pub struct PlanningInput { /// /// * Expunged /// * Confirmed shutdown and will never restart - /// * Not referenced elsewhere in Cockroach + /// * No longer needed by the rest of the system (any zone-type-specific + /// cleanup work is complete, any information they contain is either + /// useless or captured elsewhere, etc.) /// - /// and can therefore be pruned from the blueprint, because any potential - /// cleanup required from their expungement has completed. - expunged_and_unreferenced_zones: BTreeSet, + /// and can therefore be pruned from the blueprint.
+ pruneable_zones: BTreeSet, } impl PlanningInput { @@ -353,10 +354,8 @@ impl PlanningInput { self.ignore_impossible_mgs_updates_since } - pub fn expunged_and_unreferenced_zones( - &self, - ) -> &BTreeSet { - &self.expunged_and_unreferenced_zones + pub fn pruneable_zones(&self) -> &BTreeSet { + &self.pruneable_zones } /// Convert this `PlanningInput` back into a [`PlanningInputBuilder`] @@ -376,8 +375,7 @@ impl PlanningInput { .ignore_impossible_mgs_updates_since, active_nexus_zones: self.active_nexus_zones, not_yet_nexus_zones: self.not_yet_nexus_zones, - expunged_and_unreferenced_zones: self - .expunged_and_unreferenced_zones, + pruneable_zones: self.pruneable_zones, } } } @@ -1620,7 +1618,7 @@ pub struct PlanningInputBuilder { ignore_impossible_mgs_updates_since: DateTime, active_nexus_zones: BTreeSet, not_yet_nexus_zones: BTreeSet, - expunged_and_unreferenced_zones: BTreeSet, + pruneable_zones: BTreeSet, } impl PlanningInputBuilder { @@ -1643,7 +1641,7 @@ impl PlanningInputBuilder { - MGS_UPDATE_SETTLE_TIMEOUT, active_nexus_zones: BTreeSet::new(), not_yet_nexus_zones: BTreeSet::new(), - expunged_and_unreferenced_zones: BTreeSet::new(), + pruneable_zones: BTreeSet::new(), } } @@ -1753,24 +1751,22 @@ impl PlanningInputBuilder { self.not_yet_nexus_zones = not_yet_nexus_zones; } - /// Insert a zone that is expunged and unreferenced. + /// Insert a zone that is pruneable. /// /// # Errors /// /// Fails if this zone: /// - /// * Does not exist in the parent blueprint - /// * Exists in the parent blueprint but is not expunged and shutdown - pub fn insert_expunged_and_unreferenced_zone( + /// * Does not exist in the parent blueprint. + /// * Exists in the parent blueprint but is not expunged and shutdown (one + /// of the two conditions for "pruneable"; we don't have enough + /// information to check the other one here). + pub fn insert_pruneable_zone( &mut self, zone_id: OmicronZoneUuid, ) -> Result<(), PlanningInputBuildError> { use BlueprintExpungedZoneAccessReason::PlanningInputExpungedZoneGuard; - // We have no way to confirm that this zone is "unreferenced" - that's a - // property of the system at large, mostly CRDB state - but we can - // confirm that it's expunged and ready for cleanup by looking at the - // parent blueprint. 
if !self .parent_blueprint .expunged_zones( @@ -1782,7 +1778,7 @@ impl PlanningInputBuilder { return Err(PlanningInputBuildError::ZoneNotExpunged(zone_id)); } - self.expunged_and_unreferenced_zones.insert(zone_id); + self.pruneable_zones.insert(zone_id); Ok(()) } @@ -1799,8 +1795,7 @@ impl PlanningInputBuilder { .ignore_impossible_mgs_updates_since, active_nexus_zones: self.active_nexus_zones, not_yet_nexus_zones: self.not_yet_nexus_zones, - expunged_and_unreferenced_zones: self - .expunged_and_unreferenced_zones, + pruneable_zones: self.pruneable_zones, } } } From 9534f8a0f8dfed3a494cd1e17e1c4ff1f8632fcf Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 4 Feb 2026 14:07:08 -0500 Subject: [PATCH 26/27] PR feedback --- nexus/reconfigurator/preparation/src/lib.rs | 2 +- .../preparation/src/pruneable_zones.rs | 76 +++++++++++-------- nexus/types/src/deployment.rs | 5 +- nexus/types/src/deployment/planning_input.rs | 5 +- 4 files changed, 51 insertions(+), 37 deletions(-) diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index 69eea0cd35c..77086a77486 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -274,7 +274,7 @@ impl PlanningInputFromDb<'_> { active_nexus_zones.into_iter().map(|n| n.nexus_id()).collect(); let not_yet_nexus_zones = not_yet_nexus_zones.into_iter().map(|n| n.nexus_id()).collect(); - let pruneable_zones = PruneableZones::new( + let pruneable_zones = PruneableZones::assemble( opctx, datastore, &parent_blueprint, diff --git a/nexus/reconfigurator/preparation/src/pruneable_zones.rs b/nexus/reconfigurator/preparation/src/pruneable_zones.rs index aa1007dc568..55800fc4d58 100644 --- a/nexus/reconfigurator/preparation/src/pruneable_zones.rs +++ b/nexus/reconfigurator/preparation/src/pruneable_zones.rs @@ -70,7 +70,7 @@ impl PruneableZones { /// completed. /// /// See this module's documentation for more details. - pub async fn new( + pub async fn assemble( opctx: &OpContext, datastore: &DataStore, parent_blueprint: &Blueprint, @@ -191,8 +191,11 @@ fn is_boundary_ntp_pruneable( boundary_ntp_configs: &InServiceBoundaryNtpUpstreamConfigs<'_>, reason_checker: &mut BlueprintExpungedZoneAccessReasonChecker, ) -> bool { - // If this zone's upstream config is also the config of an in-service zone, - // then it's pruneable. Note the reason we're checking here. + // The only reason we might need to hang onto the blueprint information for + // a boundary NTP zone is if it's the only zone that contains the boundary + // NTP configuration. As long as the set of boundary NTP configs from + // in-service zones includes the same configuration, then this zone is + // pruneable. Note the reason we're checking here. reason_checker.add_reason_checked( BlueprintExpungedZoneAccessReason::BoundaryNtpUpstreamConfig, ); @@ -209,7 +212,7 @@ fn is_cockroach_pruneable( // decommissioned the node that was present in that zone; however, we don't // currently decommission cockroach nodes (tracked by // ). We therefore - // never consider cockroach nodes pruneable + // never consider cockroach nodes pruneable. // // This shouldn't be a huge deal in practice; Cockroach zones are updated in // place, not by an expunge/add pair, so a typical update does not produce @@ -236,7 +239,7 @@ fn is_multinode_clickhouse_pruneable( ) -> bool { // If this zone is still present in the clickhouse cluster config, it's // not pruneable. 
If there is no config at all or there is but it doesn't - // contain this zone, it is prunable. + // contain this zone, it is pruneable. // // Note the reason we've checked here. reason_checker.add_reason_checked( @@ -528,6 +531,11 @@ impl<'a> ZonesWithServiceNicRows<'a> { /// Helper type to ensure we've covered every /// [`BlueprintExpungedZoneAccessReason`] in our checks above. /// +/// It's critical that the system never prune a zone whose information might +/// be needed in the future. So we're pretty careful here. We require all +/// uses of expunged zones to provide a `reason` and we check that there's +/// associated code for each of these reasons using this struct. +/// /// This type has both compile-time (the `match` in `new()`) and runtime (the /// `assert_all_reasons_checked()` function) guards that confirm we cover any /// new variants added to [`BlueprintExpungedZoneAccessReason`] in the future. @@ -567,7 +575,8 @@ impl BlueprintExpungedZoneAccessReasonChecker { // Checked by is_cockroach_pruneable() Reason::CockroachDecommission => {} - // Checked directly by the main loop in `PruneableZones::new()`. + // Checked directly by the main loop in + // `PruneableZones::assemble()`. Reason::DeallocateExternalNetworkingResources => {} // Checked by is_external_dns_pruneable() @@ -716,6 +725,9 @@ mod tests { } } + // Checks that when given an example system with all components on it, the + // implementation above handles all the possible reasons that an expunged + // zone might be used. #[tokio::test] async fn test_pruneable_zones_reason_checker() { const TEST_NAME: &str = "test_pruneable_zones_reason_checker"; @@ -821,7 +833,7 @@ mod tests { // interrogate the internal `reason_checker` and assert that it contains // all known reasons. let pruneable_zones = - PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[]) .await .expect("failed to find pruneable zones"); @@ -897,7 +909,7 @@ mod tests { // Check that the zone is pruneable (no oximeter record, no producers) let pruneable_zones = - PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[]) .await .expect("failed to find pruneable zones"); assert!( @@ -921,7 +933,7 @@ mod tests { // Check that the zone is no longer pruneable let pruneable_zones = - PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[]) .await .expect("failed to find pruneable zones"); assert!( @@ -961,7 +973,7 @@ mod tests { // Check that the zone is still not pruneable (producers are assigned) let pruneable_zones = - PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[]) .await .expect("failed to find pruneable zones"); assert!( @@ -1004,7 +1016,7 @@ mod tests { // Check that the zone is now pruneable again (no producers assigned) let pruneable_zones = - PruneableZones::new(opctx, datastore, &blueprint, &[], &[]) + PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[]) .await .expect("failed to find pruneable zones"); assert!( @@ -1067,7 +1079,7 @@ mod tests { // Confirm that it's NOT pruneable when there's no other boundary NTP // zone with the same upstream config. 
         let pruneable_zones =
-            PruneableZones::new(opctx, datastore, &blueprint, &[], &[])
+            PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[])
                 .await
                 .expect("failed to find pruneable zones");
         assert!(
@@ -1123,7 +1135,7 @@ mod tests {
         // Check that the zone IS now pruneable (another zone has the same
         // config)
         let pruneable_zones =
-            PruneableZones::new(opctx, datastore, &blueprint, &[], &[])
+            PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[])
                 .await
                 .expect("failed to find pruneable zones");
         assert!(
@@ -1135,7 +1147,7 @@ mod tests {
         // Check that it's NOT pruneable when there's an associated external IP
         // db row
         let external_ip = make_external_ip_for_zone(boundary_ntp_zone_id);
-        let pruneable_zones = PruneableZones::new(
+        let pruneable_zones = PruneableZones::assemble(
             opctx,
             datastore,
             &blueprint,
@@ -1153,7 +1165,7 @@ mod tests {
         // Check that it's NOT pruneable when there's an associated service NIC
         // db row
         let service_nic = make_service_nic_for_zone(boundary_ntp_zone_id);
-        let pruneable_zones = PruneableZones::new(
+        let pruneable_zones = PruneableZones::assemble(
             opctx,
             datastore,
             &blueprint,
@@ -1213,7 +1225,7 @@ mod tests {
         // Check that the zone IS pruneable (we haven't added anything to the db
         // that would make it non-pruneable).
         let pruneable_zones =
-            PruneableZones::new(opctx, datastore, &blueprint, &[], &[])
+            PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[])
                 .await
                 .expect("failed to find pruneable zones");
         assert!(
@@ -1233,7 +1245,7 @@ mod tests {
             .await
             .expect("failed to create database nexus access");
         let pruneable_zones =
-            PruneableZones::new(opctx, datastore, &blueprint, &[], &[])
+            PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[])
                 .await
                 .expect("failed to find pruneable zones");
         assert!(
@@ -1248,7 +1260,7 @@ mod tests {
             .await
             .expect("failed to delete nexus access");
         let pruneable_zones =
-            PruneableZones::new(opctx, datastore, &blueprint, &[], &[])
+            PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[])
                 .await
                 .expect("failed to find pruneable zones");
         assert!(
@@ -1270,7 +1282,7 @@ mod tests {
             Saga::new(nexus_zone_id.into_untyped_uuid().into(), saga_params);
         datastore.saga_create(&saga).await.expect("failed to create saga");
         let pruneable_zones =
-            PruneableZones::new(opctx, datastore, &blueprint, &[], &[])
+            PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[])
                 .await
                 .expect("failed to find pruneable zones");
         assert!(
@@ -1289,7 +1301,7 @@ mod tests {
             .await
             .expect("failed to mark saga abandoned");
         let pruneable_zones =
-            PruneableZones::new(opctx, datastore, &blueprint, &[], &[])
+            PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[])
                 .await
                 .expect("failed to find pruneable zones");
         assert!(
@@ -1320,7 +1332,7 @@ mod tests {
             .await
             .expect("failed to create support bundle");
         let pruneable_zones =
-            PruneableZones::new(opctx, datastore, &blueprint, &[], &[])
+            PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[])
                 .await
                 .expect("failed to find pruneable zones");
         assert!(
@@ -1345,7 +1357,7 @@ mod tests {
             .await
             .expect("failed to mark support bundle as failed");
         let pruneable_zones =
-            PruneableZones::new(opctx, datastore, &blueprint, &[], &[])
+            PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[])
                 .await
                 .expect("failed to find pruneable zones");
         assert!(
@@ -1356,7 +1368,7 @@ mod tests {
         // Check that it's NOT pruneable when there's an associated external IP
         // db row
         let external_ip = make_external_ip_for_zone(nexus_zone_id);
-        let pruneable_zones = PruneableZones::new(
+        let pruneable_zones = PruneableZones::assemble(
             opctx,
             datastore,
             &blueprint,
@@ -1374,7 +1386,7 @@ mod tests {
         // Check that it's NOT pruneable when there's an associated service NIC
         // db row
         let service_nic = make_service_nic_for_zone(nexus_zone_id);
-        let pruneable_zones = PruneableZones::new(
+        let pruneable_zones = PruneableZones::assemble(
             opctx,
             datastore,
             &blueprint,
@@ -1443,7 +1455,7 @@ mod tests {
         // Confirm that it's NOT pruneable when there's no other external DNS
         // zone with the same IP (meaning the IP hasn't been reassigned yet).
         let pruneable_zones =
-            PruneableZones::new(opctx, datastore, &blueprint, &[], &[])
+            PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[])
                 .await
                 .expect("failed to find pruneable zones");
         assert!(
@@ -1500,7 +1512,7 @@ mod tests {
         // Check that the zone IS now pruneable (the IP has been reassigned to
         // another in-service zone)
         let pruneable_zones =
-            PruneableZones::new(opctx, datastore, &blueprint, &[], &[])
+            PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[])
                 .await
                 .expect("failed to find pruneable zones");
         assert!(
@@ -1512,7 +1524,7 @@ mod tests {
         // Check that it's NOT pruneable when there's an associated external IP
         // db row
         let external_ip = make_external_ip_for_zone(external_dns_zone_id);
-        let pruneable_zones = PruneableZones::new(
+        let pruneable_zones = PruneableZones::assemble(
             opctx,
             datastore,
             &blueprint,
@@ -1530,7 +1542,7 @@ mod tests {
         // Check that it's NOT pruneable when there's an associated service NIC
         // db row
         let service_nic = make_service_nic_for_zone(external_dns_zone_id);
-        let pruneable_zones = PruneableZones::new(
+        let pruneable_zones = PruneableZones::assemble(
             opctx,
             datastore,
             &blueprint,
@@ -1618,7 +1630,7 @@ mod tests {
         // Confirm that it's NOT pruneable when it's still in the clickhouse
         // cluster config
         let pruneable_zones =
-            PruneableZones::new(opctx, datastore, &blueprint, &[], &[])
+            PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[])
                 .await
                 .expect("failed to find pruneable zones");
         assert!(
@@ -1648,7 +1660,7 @@ mod tests {
         // Check that the zone IS now pruneable (removed from cluster config)
         let pruneable_zones =
-            PruneableZones::new(opctx, datastore, &blueprint, &[], &[])
+            PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[])
                 .await
                 .expect("failed to find pruneable zones");
         assert!(
@@ -1730,7 +1742,7 @@ mod tests {
         // Confirm that it's NOT pruneable when it's still in the clickhouse
         // cluster config
         let pruneable_zones =
-            PruneableZones::new(opctx, datastore, &blueprint, &[], &[])
+            PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[])
                 .await
                 .expect("failed to find pruneable zones");
         assert!(
@@ -1760,7 +1772,7 @@ mod tests {
         // Check that the zone IS now pruneable (removed from cluster config)
         let pruneable_zones =
-            PruneableZones::new(opctx, datastore, &blueprint, &[], &[])
+            PruneableZones::assemble(opctx, datastore, &blueprint, &[], &[])
                 .await
                 .expect("failed to find pruneable zones");
         assert!(
diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs
index 154d77aa28c..766db7fb8cb 100644
--- a/nexus/types/src/deployment.rs
+++ b/nexus/types/src/deployment.rs
@@ -819,9 +819,8 @@ pub enum BlueprintExpungedZoneAccessReason {
     /// operator during rack setup; see [`Blueprint::operator_nexus_config()`].
     ///
     /// The planner does not need to account for this when pruning Nexus zones.
-    /// (The planner runs _inside Nexus_, which guarantees a Nexus exists that
-    /// is not ready for cleanup, which guarantees there's still a Nexus present
-    /// in the blueprint with the external Nexus configuration.)
+    /// The system will never _not_ have a running Nexus zone, barring some
+    /// other catastrophic bug or support error.
     NexusExternalConfig,

     /// Nexus needs to know whether it itself should be quiescing. If the
diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs
index ff66bc767fe..d1b20d0c9ec 100644
--- a/nexus/types/src/deployment/planning_input.rs
+++ b/nexus/types/src/deployment/planning_input.rs
@@ -1601,7 +1601,10 @@ pub enum PlanningInputBuildError {
     },
     #[error("sled not found: {0}")]
     SledNotFound(SledUuid),
-    #[error("attempted to mark non-expunged zone as expunged: {0}")]
+    #[error(
+        "attempted to mark zone as pruneable that is not expunged \
+         or may still be running: {0}"
+    )]
     ZoneNotExpunged(OmicronZoneUuid),
 }

From 40916cd6cda5c9b3064fc1f376b8a3050a70f154 Mon Sep 17 00:00:00 2001
From: John Gallagher
Date: Wed, 4 Feb 2026 14:17:07 -0500
Subject: [PATCH 27/27] more robust construction of support_bundle_states_needing_cleanup

---
 nexus/db-model/src/support_bundle.rs   |  2 +-
 .../preparation/src/pruneable_zones.rs | 56 +++++++++----------
 2 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs
index b9dc9f55b17..413339b83ef 100644
--- a/nexus/db-model/src/support_bundle.rs
+++ b/nexus/db-model/src/support_bundle.rs
@@ -22,7 +22,7 @@ use serde::{Deserialize, Serialize};

 impl_enum_type!(
     SupportBundleStateEnum:
-    #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)]
+    #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq, strum::EnumIter)]
     pub enum SupportBundleState;

     // Enum values
diff --git a/nexus/reconfigurator/preparation/src/pruneable_zones.rs b/nexus/reconfigurator/preparation/src/pruneable_zones.rs
index 55800fc4d58..bfea416e37b 100644
--- a/nexus/reconfigurator/preparation/src/pruneable_zones.rs
+++ b/nexus/reconfigurator/preparation/src/pruneable_zones.rs
@@ -308,43 +308,43 @@ async fn is_nexus_pruneable(
         return Ok(false);
     }

-    // This is a no-op match that exists solely to ensure we update our logic if
-    // the possible support bundle states change. We need to query for any
-    // support bundle assigned to the `zone_id` Nexus in any state that might
-    // require cleanup work; currently, that means "any state other than
-    // `Failed`".
-    //
-    // If updating this match, you must also ensure you update the query below!
-    match SupportBundleState::Active {
-        SupportBundleState::Collecting
-        | SupportBundleState::Active
-        | SupportBundleState::Destroying
-        | SupportBundleState::Failing => {
-            // We need to query for these states.
-        }
-        SupportBundleState::Failed => {
-            // The sole state we don't care about.
-        }
-    }
-
     // Does this Nexus zone still have support bundles assigned to it in any
-    // state that requires cleanup work? This requires explicitly listing the
-    // states we care about; the no-op match statement above will hopefully keep
-    // this in sync with any changes to the enum.
+    // state that requires cleanup work? This requires building up a list of the
+    // support bundle states we care about and then querying for any bundle in
+    // one of those states assigned to the `zone_id` Nexus.
+    //
+    // Currently, any state other than `Failed` means cleanup work is still
+    // required. We construct the vec of all non-`Failed` states by iterating
+    // over all possible variants to ensure that, if any changes are made to
+    // the enum in the future, we're forced to manually update the
+    // construction of this set to account for those changes.
     reason_checker.add_reason_checked(
         BlueprintExpungedZoneAccessReason::NexusSupportBundleReassign,
     );
+    let support_bundle_states_needing_cleanup = {
+        let mut states = Vec::new();
+        for state in SupportBundleState::iter() {
+            match state {
+                SupportBundleState::Collecting
+                | SupportBundleState::Active
+                | SupportBundleState::Destroying
+                | SupportBundleState::Failing => {
+                    // We need to query for bundles in these states.
+                    states.push(state);
+                }
+                SupportBundleState::Failed => {
+                    // The sole state we don't care about.
+                }
+            }
+        }
+        states
+    };
     if !datastore
         .support_bundle_list_assigned_to_nexus(
             opctx,
             &single_item_pagparams(),
             zone_id,
-            vec![
-                SupportBundleState::Collecting,
-                SupportBundleState::Active,
-                SupportBundleState::Destroying,
-                SupportBundleState::Failing,
-            ],
+            support_bundle_states_needing_cleanup,
         )
         .await?
         .is_empty()