diff --git a/Cargo.lock b/Cargo.lock index bf4a31b1e50..2893090409e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7037,6 +7037,7 @@ dependencies = [ "serde", "serde_json", "sled-agent-types-versions", + "sled-hardware-types", "slog", "uuid", ] diff --git a/clients/nexus-lockstep-client/Cargo.toml b/clients/nexus-lockstep-client/Cargo.toml index ccc1d05940d..d1fe80debf7 100644 --- a/clients/nexus-lockstep-client/Cargo.toml +++ b/clients/nexus-lockstep-client/Cargo.toml @@ -24,5 +24,6 @@ reqwest.workspace = true schemars.workspace = true serde.workspace = true serde_json.workspace = true +sled-hardware-types.workspace = true slog.workspace = true uuid.workspace = true diff --git a/clients/nexus-lockstep-client/src/lib.rs b/clients/nexus-lockstep-client/src/lib.rs index d6a3b52a7f0..e248a318512 100644 --- a/clients/nexus-lockstep-client/src/lib.rs +++ b/clients/nexus-lockstep-client/src/lib.rs @@ -33,6 +33,7 @@ progenitor::generate_api!( "oxnet" = "0.1.0", }, replace = { + BaseboardId = sled_hardware_types::BaseboardId, // It's kind of unfortunate to pull in such a complex and unstable type // as "blueprint" this way, but we have really useful functionality // (e.g., diff'ing) that's implemented on our local type. diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index b51556e4b34..f3af98fcbe7 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -57,6 +57,7 @@ use nexus_types::external_api::shared; use nexus_types::external_api::shared::IpRange; use nexus_types::external_api::shared::SiloRole; use nexus_types::identity::Resource; +use nexus_types::internal_api::params::InitialTrustQuorumConfig; use nexus_types::inventory::NetworkInterface; use omicron_common::api::external::AllowedSourceIps; use omicron_common::api::external::DataPageParams; @@ -70,6 +71,7 @@ use omicron_common::api::external::UserId; use omicron_common::api::internal::shared::PrivateIpConfig; use omicron_common::bail_unless; use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::RackUuid; use omicron_uuid_kinds::SiloUserUuid; use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::ZpoolUuid; @@ -96,6 +98,7 @@ pub struct RackInit { pub recovery_user_password_hash: omicron_passwords::PasswordHashString, pub dns_update: DnsVersionUpdateBuilder, pub allowed_source_ips: AllowedSourceIps, + pub initial_trust_quorum_configuration: Option, } /// Possible errors while trying to initialize rack @@ -116,6 +119,7 @@ enum RackInitError { Database(DieselError), // Error adding initial allowed source IP list AllowedSourceIpError(Error), + TrustQuorum(Error), } impl From for RackInitError { @@ -177,6 +181,9 @@ impl From for Error { err )), RackInitError::AllowedSourceIpError(err) => err, + RackInitError::TrustQuorum(err) => err.internal_context( + "failed to insert initial trust quorum configuration", + ), } } } @@ -976,6 +983,20 @@ impl DataStore { DieselError::RollbackTransaction })?; + // Insert the initial trust quorum configuration + if let Some(tq_config) = rack_init.initial_trust_quorum_configuration { + Self::tq_insert_rss_config_after_handoff( + opctx, + &conn, + RackUuid::from_untyped_uuid(rack_id), + tq_config.members, + tq_config.coordinator + ).await.map_err(|e| { + err.set(RackInitError::TrustQuorum(e)).unwrap(); + DieselError::RollbackTransaction + })?; + } + let rack = diesel::update(rack_dsl::rack) .filter(rack_dsl::id.eq(rack_id)) .set(( @@ -1167,6 +1188,7 @@ mod test { "test suite".to_string(), ), allowed_source_ips: AllowedSourceIps::Any, + initial_trust_quorum_configuration: None } } } diff --git a/nexus/db-queries/src/db/datastore/trust_quorum.rs b/nexus/db-queries/src/db/datastore/trust_quorum.rs index 56325789f43..827100698af 100644 --- a/nexus/db-queries/src/db/datastore/trust_quorum.rs +++ b/nexus/db-queries/src/db/datastore/trust_quorum.rs @@ -129,7 +129,7 @@ impl DataStore { /// /// For reconfiguration and lrtq upgrade we always call /// `tq_insert_latest_config`. - pub async fn insert_rss_config_after_handoff( + pub async fn tq_insert_rss_config_after_handoff( opctx: &OpContext, conn: &async_bb8_diesel::Connection, rack_id: RackUuid, @@ -1507,7 +1507,7 @@ mod tests { let coordinator = members.first().unwrap().clone(); // Insert an initial config - DataStore::insert_rss_config_after_handoff( + DataStore::tq_insert_rss_config_after_handoff( opctx, &conn, rack_id, @@ -1590,7 +1590,7 @@ mod tests { let coordinator = members.first().unwrap().clone(); // Insert an initial config - DataStore::insert_rss_config_after_handoff( + DataStore::tq_insert_rss_config_after_handoff( opctx, &conn, rack_id, @@ -1843,7 +1843,7 @@ mod tests { let coordinator = members.first().unwrap().clone(); // Insert an initial config - DataStore::insert_rss_config_after_handoff( + DataStore::tq_insert_rss_config_after_handoff( opctx, &conn, rack_id, @@ -2015,7 +2015,7 @@ mod tests { rack2_members = members.clone(); } let coordinator = members.first().unwrap().clone(); - DataStore::insert_rss_config_after_handoff( + DataStore::tq_insert_rss_config_after_handoff( opctx, &conn, rack_id, diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 5d4f877392a..e555c36298f 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -733,6 +733,8 @@ impl super::Nexus { .into(), dns_update, allowed_source_ips: request.allowed_source_ips, + initial_trust_quorum_configuration: request + .initial_trust_quorum_configuration, }, ) .await?; diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index f3e65e5caee..68cc826872e 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -414,6 +414,7 @@ impl nexus_test_interface::NexusServer for Server { bfd: Vec::new(), }, allowed_source_ips: AllowedSourceIps::Any, + initial_trust_quorum_configuration: None, }, false, // blueprint_execution_enabled ) diff --git a/nexus/types/src/internal_api/params.rs b/nexus/types/src/internal_api/params.rs index f21f46027de..596c107ef62 100644 --- a/nexus/types/src/internal_api/params.rs +++ b/nexus/types/src/internal_api/params.rs @@ -25,6 +25,8 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use sled_agent_types_versions::latest::inventory::{SledCpuFamily, SledRole}; use sled_agent_types_versions::latest::rack_init::RecoverySiloConfig; +use sled_hardware_types::BaseboardId; +use std::collections::BTreeSet; use std::fmt; use std::net::IpAddr; use std::net::SocketAddr; @@ -198,6 +200,13 @@ pub struct RackInitializationRequest { pub rack_network_config: RackNetworkConfig, /// IPs or subnets allowed to make requests to user-facing services pub allowed_source_ips: AllowedSourceIps, + /// Data used to write the initial trust quorum configuration to CRDB + /// + /// This is optional for two reasons: + /// * For clusters fewer than 3 nodes, we don't support trust quorum. + /// * Trust quorum is not fully complete yet, and we only want this to be + /// used in production once it is complete. + pub initial_trust_quorum_configuration: Option, } pub type DnsConfigParams = internal_dns_types::config::DnsConfigParams; @@ -222,3 +231,10 @@ pub struct InstanceMigrateRequest { #[schemars(with = "Uuid")] pub dst_sled_id: SledUuid, } + +/// The configuration generated by RSS and used to initialize trust quorum +#[derive(Debug, Clone, Deserialize, JsonSchema)] +pub struct InitialTrustQuorumConfig { + pub members: BTreeSet, + pub coordinator: BaseboardId, +} diff --git a/openapi/nexus-lockstep.json b/openapi/nexus-lockstep.json index f2a4304865f..51dc5409431 100644 --- a/openapi/nexus-lockstep.json +++ b/openapi/nexus-lockstep.json @@ -4985,6 +4985,26 @@ "time_started" ] }, + "InitialTrustQuorumConfig": { + "description": "The configuration generated by RSS and used to initialize trust quorum", + "type": "object", + "properties": { + "coordinator": { + "$ref": "#/components/schemas/BaseboardId" + }, + "members": { + "type": "array", + "items": { + "$ref": "#/components/schemas/BaseboardId" + }, + "uniqueItems": true + } + }, + "required": [ + "coordinator", + "members" + ] + }, "Instance": { "description": "View of an Instance", "type": "object", @@ -7808,6 +7828,15 @@ } ] }, + "initial_trust_quorum_configuration": { + "nullable": true, + "description": "Data used to write the initial trust quorum configuration to CRDB\n\nThis is optional for two reasons: * For clusters fewer than 3 nodes, we don't support trust quorum. * Trust quorum is not fully complete yet, and we only want this to be used in production once it is complete.", + "allOf": [ + { + "$ref": "#/components/schemas/InitialTrustQuorumConfig" + } + ] + }, "internal_dns_zone_config": { "description": "initial internal DNS config", "allOf": [ diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 2619dc4d4a1..5da593f0bdb 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -81,6 +81,7 @@ use dns_service_client::DnsError; use internal_dns_resolver::Resolver as DnsResolver; use internal_dns_types::names::ServiceName; use itertools::Itertools; +use nexus_lockstep_client::types::InitialTrustQuorumConfig; use nexus_lockstep_client::{ Client as NexusClient, Error as NexusError, types as NexusTypes, }; @@ -771,6 +772,7 @@ impl ServiceInner { service_plan: &ServicePlan, port_discovery_mode: ExternalPortDiscovery, nexus_lockstep_address: SocketAddrV6, + initial_trust_quorum_configuration: Option, ) -> Result<(), SetupServiceError> { info!(self.log, "Handing off control to Nexus"); @@ -1050,6 +1052,7 @@ impl ServiceInner { rack_network_config, external_port_count: port_discovery_mode.into(), allowed_source_ips, + initial_trust_quorum_configuration, }; let notify_nexus = || async { @@ -1282,7 +1285,10 @@ impl ServiceInner { rss_step.update(RssStep::InitTrustQuorum); // Initialize the trust quorum if there are peers configured. - if let Some(peers) = &config.trust_quorum_peers { + + let initial_trust_quorum_configuration = if let Some(peers) = + &config.trust_quorum_peers + { let initial_membership: BTreeSet<_> = peers.iter().cloned().collect(); bootstore @@ -1297,10 +1303,24 @@ impl ServiceInner { .collect(); let rack_id = RackUuid::from_untyped_uuid(sled_plan.rack_id); - init_trust_quorum(&self.log, trust_quorum, tq_members, rack_id) - .await?; + init_trust_quorum( + &self.log, + trust_quorum.clone(), + tq_members.clone(), + rack_id, + ) + .await?; + + Some(InitialTrustQuorumConfig { + members: tq_members.into_iter().collect(), + coordinator: trust_quorum.baseboard_id().clone(), + }) + } else { + None } - } + } else { + None + }; // Save the relevant network config in the bootstore. We want this to // happen before we `initialize_sleds` so each scrimlet (including us) @@ -1479,6 +1499,7 @@ impl ServiceInner { &service_plan, ExternalPortDiscovery::Auto(switch_mgmt_addrs), nexus_lockstep_address, + initial_trust_quorum_configuration, ) .await?; diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs index 260e4ef4c06..5837b06f525 100644 --- a/sled-agent/src/sim/server.rs +++ b/sled-agent/src/sim/server.rs @@ -644,6 +644,7 @@ pub async fn run_standalone_server( bfd: Vec::new(), }, allowed_source_ips: AllowedSourceIps::Any, + initial_trust_quorum_configuration: None, }; let mut nexus_lockstep_address = config.nexus_address; diff --git a/sp-sim/src/gimlet.rs b/sp-sim/src/gimlet.rs index 1829e4a7c46..425dc6e2df4 100644 --- a/sp-sim/src/gimlet.rs +++ b/sp-sim/src/gimlet.rs @@ -6,7 +6,7 @@ use crate::HostFlashHashPolicy; use crate::Responsiveness; use crate::SimulatedSp; use crate::config::GimletConfig; -use crate::config::SpComponentConfig; +use crate::config::SpCommonConfig; use crate::ereport; use crate::ereport::EreportState; use crate::helpers::rot_state_v2; @@ -414,15 +414,13 @@ impl Gimlet { servers, ereport_servers, ereport_state, - gimlet.common.components.clone(), + gimlet.common.clone(), attached_mgs, - gimlet.common.serial_number.clone(), incoming_console_tx, power_state, commands_rx, Arc::clone(&last_request_handled), log, - gimlet.common.old_rot_state, update_state, Arc::clone(&power_state_changes), ); @@ -665,27 +663,23 @@ impl UdpTask { servers: [UdpServer; 2], ereport_servers: Option<[UdpServer; 2]>, ereport_state: EreportState, - components: Vec, + common: SpCommonConfig, attached_mgs: AttachedMgsSerialConsole, - serial_number: String, incoming_serial_console: HashMap>>, power_state: watch::Sender, commands: mpsc::UnboundedReceiver, last_request_handled: Arc>>, log: Logger, - old_rot_state: bool, update_state: SimSpUpdate, power_state_changes: Arc, ) -> (Self, Arc>, watch::Receiver) { let [udp0, udp1] = servers; let handler = Arc::new(TokioMutex::new(Handler::new( - serial_number, - components, + common, attached_mgs, incoming_serial_console, power_state, log.clone(), - old_rot_state, update_state, power_state_changes, ))); @@ -817,9 +811,7 @@ impl UdpTask { struct Handler { log: Logger, - serial_number: String, - - components: Vec, + common: SpCommonConfig, // `SpHandler` wants `&'static str` references when describing components; // this is fine on the real SP where the strings are baked in at build time, // but awkward here where we read them in at runtime. We'll leak the strings @@ -845,23 +837,21 @@ struct Handler { // this, our caller will pass us a function to call if they should ignore // whatever result we return and fail to respond at all. should_fail_to_respond_signal: Option>, - old_rot_state: bool, sp_dumps: HashMap<[u8; 16], u32>, } impl Handler { #[allow(clippy::too_many_arguments)] fn new( - serial_number: String, - components: Vec, + common: SpCommonConfig, attached_mgs: AttachedMgsSerialConsole, incoming_serial_console: HashMap>>, power_state: watch::Sender, log: Logger, - old_rot_state: bool, update_state: SimSpUpdate, power_state_changes: Arc, ) -> Self { + let components = common.components.clone(); let mut leaked_component_device_strings = Vec::with_capacity(components.len()); let mut leaked_component_description_strings = @@ -880,11 +870,10 @@ impl Handler { Self { log, - components, + common, sensors, leaked_component_device_strings, leaked_component_description_strings, - serial_number, attached_mgs, incoming_serial_console, startup_options: StartupOptions::empty(), @@ -893,7 +882,6 @@ impl Handler { power_state, last_request_handled: None, should_fail_to_respond_signal: None, - old_rot_state, sp_dumps, power_state_changes, } @@ -902,12 +890,12 @@ impl Handler { fn sp_state_impl(&self) -> SpStateV2 { // Make the Baseboard a PC so that our testbeds work as expected. let mut model = [0; 32]; - model[..FAKE_GIMLET_MODEL.len()] - .copy_from_slice(FAKE_GIMLET_MODEL.as_bytes()); + model[..self.common.part_number.len()] + .copy_from_slice(self.common.part_number.as_bytes()); SpStateV2 { hubris_archive_id: [0; 8], - serial_number: serial_number_padded(&self.serial_number), + serial_number: serial_number_padded(&self.common.serial_number), model, revision: 0, base_mac_address: [0; 6], @@ -1372,7 +1360,7 @@ impl SpHandler for Handler { } fn num_devices(&mut self) -> u32 { - self.components.len().try_into().unwrap() + self.common.components.len().try_into().unwrap() } fn device_description( @@ -1380,7 +1368,7 @@ impl SpHandler for Handler { index: BoundsChecked, ) -> DeviceDescription<'static> { let index = index.0 as usize; - let c = &self.components[index]; + let c = &self.common.components[index]; DeviceDescription { component: SpComponent::try_from(c.id.as_str()).unwrap(), device: self.leaked_component_device_strings[index], @@ -1638,7 +1626,7 @@ impl SpHandler for Handler { &mut self, version: u8, ) -> Result { - if self.old_rot_state { + if self.common.old_rot_state { Err(SpError::RequestUnsupportedForSp) } else { match version {