From eddd29c14d388e90ff425978dd9337a29a0ebfe4 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 29 Nov 2023 18:37:30 +0000 Subject: [PATCH 01/56] Brutally hacked together, saga-less live FIP mgmt --- illumos-utils/src/opte/port_manager.rs | 114 ++++++++++++++ .../src/db/datastore/external_ip.rs | 29 +++- nexus/src/app/instance.rs | 100 ++++++++++++ nexus/src/external_api/http_entrypoints.rs | 66 ++++++++ nexus/tests/output/nexus_tags.txt | 2 + nexus/types/src/external_api/params.rs | 7 + nexus/types/src/external_api/views.rs | 6 + openapi/nexus.json | 148 ++++++++++++++++++ openapi/sled-agent.json | 115 ++++++++++++++ sled-agent/src/http_entrypoints.rs | 38 ++++- sled-agent/src/instance.rs | 97 +++++++++++- sled-agent/src/instance_manager.rs | 37 +++++ sled-agent/src/params.rs | 8 + sled-agent/src/sled_agent.rs | 28 +++- 14 files changed, 784 insertions(+), 11 deletions(-) diff --git a/illumos-utils/src/opte/port_manager.rs b/illumos-utils/src/opte/port_manager.rs index 3558ef1c78..f2634026a6 100644 --- a/illumos-utils/src/opte/port_manager.rs +++ b/illumos-utils/src/opte/port_manager.rs @@ -29,6 +29,7 @@ use oxide_vpc::api::MacAddr; use oxide_vpc::api::RouterTarget; use oxide_vpc::api::SNat4Cfg; use oxide_vpc::api::SNat6Cfg; +use oxide_vpc::api::SetExternalIpsReq; use oxide_vpc::api::VpcCfg; use slog::debug; use slog::error; @@ -401,6 +402,119 @@ impl PortManager { Ok((port, ticket)) } + /// Ensure external IPs for an OPTE port are up to date. + #[cfg_attr(not(target_os = "illumos"), allow(unused_variables))] + pub fn external_ips_ensure( + &self, + nic_id: Uuid, + nic_kind: NetworkInterfaceKind, + source_nat: Option, + ephemeral_ip: Option, + floating_ips: &[IpAddr], + ) -> Result<(), Error> { + // TODO: new errors + let ports = self.inner.ports.lock().unwrap(); + let port = ports + .get(&(nic_id, nic_kind)) + .ok_or_else(|| Error::ReleaseMissingPort(nic_id, nic_kind))?; + + // Describe the external IP addresses for this port. + macro_rules! 
ip_cfg { + ($ip:expr, $log_prefix:literal, $ip_t:path, $cidr_t:path, + $ipcfg_e:path, $ipcfg_t:ident, $snat_t:ident) => {{ + let snat = match source_nat { + Some(snat) => { + let $ip_t(snat_ip) = snat.ip else { + error!( + self.inner.log, + concat!($log_prefix, " SNAT config"); + "snat_ip" => ?snat.ip, + ); + return Err(Error::InvalidPortIpConfig); + }; + let ports = snat.first_port..=snat.last_port; + Some($snat_t { external_ip: snat_ip.into(), ports }) + } + None => None, + }; + let ephemeral_ip = match ephemeral_ip { + Some($ip_t(ip)) => Some(ip.into()), + Some(_) => { + error!( + self.inner.log, + concat!($log_prefix, " ephemeral IP"); + "ephemeral_ip" => ?ephemeral_ip, + ); + return Err(Error::InvalidPortIpConfig); + } + None => None, + }; + let floating_ips: Vec<_> = floating_ips + .iter() + .copied() + .map(|ip| match ip { + $ip_t(ip) => Ok(ip.into()), + _ => { + error!( + self.inner.log, + concat!($log_prefix, " ephemeral IP"); + "ephemeral_ip" => ?ephemeral_ip, + ); + Err(Error::InvalidPortIpConfig) + } + }) + .collect::, _>>()?; + + ExternalIpCfg { + ephemeral_ip, + snat, + floating_ips, + } + }} + } + + let mut v4_cfg = None; + let mut v6_cfg = None; + match port.gateway().ip { + IpAddr::V4(_) => { + v4_cfg = Some(ip_cfg!( + ip, + "Expected IPv4", + IpAddr::V4, + IpCidr::Ip4, + IpCfg::Ipv4, + Ipv4Cfg, + SNat4Cfg + )) + } + IpAddr::V6(_) => { + v6_cfg = Some(ip_cfg!( + ip, + "Expected IPv6", + IpAddr::V6, + IpCidr::Ip6, + IpCfg::Ipv6, + Ipv6Cfg, + SNat6Cfg + )) + } + } + + let req = SetExternalIpsReq { + port_name: port.name().into(), + external_ips_v4: v4_cfg, + external_ips_v6: v6_cfg, + }; + + #[cfg(target_os = "illumos")] + let hdl = opte_ioctl::OpteHdl::open(opte_ioctl::OpteHdl::XDE_CTL)?; + + #[cfg(target_os = "illumos")] + hdl.set_external_ips(&req)?; + + Ok(()) + } + #[cfg(target_os = "illumos")] pub fn firewall_rules_ensure( &self, diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs 
index ddf396f871..4e74cea150 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -467,7 +467,7 @@ impl DataStore { authz_fip: &authz::FloatingIp, db_fip: &FloatingIp, instance_id: Uuid, - ) -> UpdateResult { + ) -> UpdateResult<(FloatingIp, Option)> { use db::schema::external_ip::dsl; // Verify this FIP is not attached to any instances/services. @@ -485,7 +485,9 @@ impl DataStore { opctx.authorize(authz::Action::Modify, authz_fip).await?; opctx.authorize(authz::Action::Modify, &authz_instance).await?; - diesel::update(dsl::external_ip) + let i = self.instance_fetch_with_vmm(opctx, &authz_instance).await?; + + let out = diesel::update(dsl::external_ip) .filter(dsl::id.eq(db_fip.id())) .filter(dsl::kind.eq(IpKind::Floating)) .filter(dsl::time_deleted.is_null()) @@ -504,7 +506,9 @@ impl DataStore { ) }) .and_then(|r| FloatingIp::try_from(r)) - .map_err(|e| Error::internal_error(&format!("{e}"))) + .map_err(|e| Error::internal_error(&format!("{e}")))?; + + Ok((out, i.sled_id())) } /// Detaches a Floating IP address from an instance. 
@@ -513,7 +517,8 @@ impl DataStore { opctx: &OpContext, authz_fip: &authz::FloatingIp, db_fip: &FloatingIp, - ) -> UpdateResult { + target_instance_id: Option, + ) -> UpdateResult<(FloatingIp, Option)> { use db::schema::external_ip::dsl; let Some(instance_id) = db_fip.parent_id else { @@ -522,6 +527,14 @@ impl DataStore { )); }; + if let Some(target_instance_id) = target_instance_id { + if target_instance_id != instance_id { + return Err(Error::invalid_request( + "Floating IP is not attached to the target instance", + )); + } + } + let (.., authz_instance, _db_instance) = LookupPath::new(&opctx, self) .instance_id(instance_id) .fetch_for(authz::Action::Modify) @@ -530,7 +543,9 @@ impl DataStore { opctx.authorize(authz::Action::Modify, authz_fip).await?; opctx.authorize(authz::Action::Modify, &authz_instance).await?; - diesel::update(dsl::external_ip) + let i = self.instance_fetch_with_vmm(opctx, &authz_instance).await?; + + let out = diesel::update(dsl::external_ip) .filter(dsl::id.eq(db_fip.id())) .filter(dsl::kind.eq(IpKind::Floating)) .filter(dsl::time_deleted.is_null()) @@ -549,6 +564,8 @@ impl DataStore { ) }) .and_then(|r| FloatingIp::try_from(r)) - .map_err(|e| Error::internal_error(&format!("{e}"))) + .map_err(|e| Error::internal_error(&format!("{e}")))?; + + Ok((out, i.sled_id())) } } diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 93386a66d0..53173e0f7f 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -26,6 +26,7 @@ use nexus_db_queries::db::datastore::InstanceAndActiveVmm; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup; use nexus_db_queries::db::lookup::LookupPath; +use nexus_types::external_api::views; use omicron_common::address::PROPOLIS_PORT; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::ByteCount; @@ -1872,6 +1873,105 @@ impl super::Nexus { Ok(()) } + + /// Detach a disk from an instance. 
+ pub(crate) async fn instance_attach_external_ip( + &self, + opctx: &OpContext, + instance_lookup: &lookup::Instance<'_>, + ext_ip: ¶ms::ExternalIpCreate, + ) -> UpdateResult { + let (.., authz_project, authz_instance) = + instance_lookup.lookup_for(authz::Action::Modify).await?; + + let (authz_fip, db_fip) = match ext_ip { + params::ExternalIpCreate::Ephemeral { pool_name } => Err(Error::internal_error("ephemeral IP attach/detach not yet supported"))?, + params::ExternalIpCreate::Floating { floating_ip_name } => { + let floating_ip_name = db::model::Name(floating_ip_name.clone()); + let (.., authz_fip, db_fip) = LookupPath::new(&opctx, &self.datastore()) + .project_id(authz_project.id()) + .floating_ip_name(&floating_ip_name) + .fetch_for(authz::Action::Modify) + .await?; + (authz_fip, db_fip) + }, + }; + + let (eip, sled_uuid) = self + .datastore() + .floating_ip_attach(opctx, &authz_fip, &db_fip, authz_instance.id()) + .await?; + + if let Some(uuid) = sled_uuid { + self.sled_client(&uuid) + .await? + .instance_put_external_ip(&authz_instance.id(), &sled_agent_client::types::InstanceExternalIpBody::Floating(db_fip.ip.ip())) + .await?; + + let (.., sled) = self.sled_lookup(opctx, &uuid)?.fetch().await?; + + let boundary_switches = self.boundary_switches(opctx) + .await?; + + for switch in boundary_switches { + let dpd_client = + self.dpd_clients.get(&switch).ok_or_else(|| { + Error::internal_error(&format!( + "unable to find client for switch {switch}" + )) + })?; + + self.instance_ensure_dpd_config( + &opctx, + authz_instance.id(), + &sled.address(), + None, + dpd_client, + ) + .await?; + } + } + + Ok(views::ExternalIp::from(views::FloatingIp::from(eip))) + } + + /// Detach a disk from an instance. 
+ pub(crate) async fn instance_detach_external_ip( + &self, + opctx: &OpContext, + instance_lookup: &lookup::Instance<'_>, + ext_ip: ¶ms::ExternalIpDelete, + ) -> UpdateResult { + let (.., authz_project, authz_instance) = + instance_lookup.lookup_for(authz::Action::Modify).await?; + + let (authz_fip, db_fip) = match ext_ip { + params::ExternalIpDelete::Ephemeral => Err(Error::internal_error("ephemeral IP attach/detach not yet supported"))?, + params::ExternalIpDelete::Floating { floating_ip_name } => { + let floating_ip_name = db::model::Name(floating_ip_name.clone()); + let (.., authz_fip, db_fip) = LookupPath::new(&opctx, &self.datastore()) + .project_id(authz_project.id()) + .floating_ip_name(&floating_ip_name) + .fetch_for(authz::Action::Modify) + .await?; + (authz_fip, db_fip) + }, + }; + + let (eip, sled_uuid) = self + .datastore() + .floating_ip_detach(opctx, &authz_fip, &db_fip, Some(authz_instance.id())) + .await?; + + if let Some(uuid) = sled_uuid { + self.sled_client(&uuid) + .await? 
+ .instance_delete_external_ip(&authz_instance.id(), &sled_agent_client::types::InstanceExternalIpBody::Floating(db_fip.ip.ip())) + .await?; + } + + Ok(views::ExternalIp::from(views::FloatingIp::from(eip))) + } } #[cfg(test)] diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 6720f95c39..1d52277a96 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -202,6 +202,8 @@ pub(crate) fn external_api() -> NexusApiDescription { api.register(instance_network_interface_delete)?; api.register(instance_external_ip_list)?; + api.register(instance_external_ip_attach)?; + api.register(instance_external_ip_detach)?; api.register(vpc_router_list)?; api.register(vpc_router_view)?; @@ -2478,6 +2480,8 @@ async fn instance_disk_detach( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } + + // Certificates /// List certificates for external endpoints @@ -3643,6 +3647,68 @@ async fn instance_external_ip_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } +/// Attach an external IP to an instance +#[endpoint { + method = POST, + path = "/v1/instances/{instance}/external-ips/attach", + tags = ["instances"], +}] +async fn instance_external_ip_attach( + rqctx: RequestContext>, + path_params: Path, + query_params: Query, + ip_to_detach: TypedBody, +) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + let query = query_params.into_inner(); + let instance_selector = params::InstanceSelector { + project: query.project, + instance: path.instance, + }; + let instance_lookup = + nexus.instance_lookup(&opctx, instance_selector)?; + let disk = + nexus.instance_attach_external_ip(&opctx, &instance_lookup, &ip_to_detach.into_inner()).await?; + 
Ok(HttpResponseAccepted(disk.into())) + }; + apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + +/// Detach an external IP from an instance +#[endpoint { + method = POST, + path = "/v1/instances/{instance}/external-ips/detach", + tags = ["instances"], +}] +async fn instance_external_ip_detach( + rqctx: RequestContext>, + path_params: Path, + query_params: Query, + ip_to_detach: TypedBody, +) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + let query = query_params.into_inner(); + let instance_selector = params::InstanceSelector { + project: query.project, + instance: path.instance, + }; + let instance_lookup = + nexus.instance_lookup(&opctx, instance_selector)?; + let disk = + nexus.instance_detach_external_ip(&opctx, &instance_lookup, &ip_to_detach.into_inner()).await?; + Ok(HttpResponseAccepted(disk.into())) + }; + apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + // Snapshots /// List snapshots diff --git a/nexus/tests/output/nexus_tags.txt b/nexus/tests/output/nexus_tags.txt index 3f77f4cb26..244d163218 100644 --- a/nexus/tests/output/nexus_tags.txt +++ b/nexus/tests/output/nexus_tags.txt @@ -40,6 +40,8 @@ instance_delete DELETE /v1/instances/{instance} instance_disk_attach POST /v1/instances/{instance}/disks/attach instance_disk_detach POST /v1/instances/{instance}/disks/detach instance_disk_list GET /v1/instances/{instance}/disks +instance_external_ip_attach POST /v1/instances/{instance}/external-ips/attach +instance_external_ip_detach POST /v1/instances/{instance}/external-ips/detach instance_external_ip_list GET /v1/instances/{instance}/external-ips instance_list GET /v1/instances instance_migrate POST /v1/instances/{instance}/migrate diff --git a/nexus/types/src/external_api/params.rs 
b/nexus/types/src/external_api/params.rs index f27a6619e2..446cac0fae 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -922,6 +922,13 @@ pub enum ExternalIpCreate { Floating { floating_ip_name: Name }, } +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum ExternalIpDelete { + Ephemeral, + Floating { floating_ip_name: Name }, +} + /// Create-time parameters for an `Instance` #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] pub struct InstanceCreate { diff --git a/nexus/types/src/external_api/views.rs b/nexus/types/src/external_api/views.rs index ecd459594a..ccf04ee9b1 100644 --- a/nexus/types/src/external_api/views.rs +++ b/nexus/types/src/external_api/views.rs @@ -286,6 +286,12 @@ pub struct FloatingIp { pub instance_id: Option, } +impl From for ExternalIp { + fn from(value: FloatingIp) -> Self { + ExternalIp { ip: value.ip, kind: IpKind::Floating } + } +} + // RACKS /// View of an Rack diff --git a/openapi/nexus.json b/openapi/nexus.json index 2ddd5f0e94..87ddd22d9e 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -1826,6 +1826,118 @@ } } }, + "/v1/instances/{instance}/external-ips/attach": { + "post": { + "tags": [ + "instances" + ], + "summary": "Attach an external IP to an instance", + "operationId": "instance_external_ip_attach", + "parameters": [ + { + "in": "path", + "name": "instance", + "description": "Name or ID of the instance", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + }, + { + "in": "query", + "name": "project", + "description": "Name or ID of the project", + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ExternalIpCreate" + } + } + }, + "required": true + }, + "responses": { + "202": { + "description": "successfully enqueued operation", + "content": 
{ + "application/json": { + "schema": { + "$ref": "#/components/schemas/ExternalIp" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/v1/instances/{instance}/external-ips/detach": { + "post": { + "tags": [ + "instances" + ], + "summary": "Detach an external IP from an instance", + "operationId": "instance_external_ip_detach", + "parameters": [ + { + "in": "path", + "name": "instance", + "description": "Name or ID of the instance", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + }, + { + "in": "query", + "name": "project", + "description": "Name or ID of the project", + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ExternalIpDelete" + } + } + }, + "required": true + }, + "responses": { + "202": { + "description": "successfully enqueued operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ExternalIp" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/v1/instances/{instance}/migrate": { "post": { "tags": [ @@ -10685,6 +10797,42 @@ } ] }, + "ExternalIpDelete": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "ephemeral" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "floating_ip_name": { + "$ref": "#/components/schemas/Name" + }, + "type": { + "type": "string", + "enum": [ + "floating" + ] + } + }, + "required": [ + "floating_ip_name", + "type" + ] + } + ] + }, "ExternalIpResultsPage": { "description": "A single page of results", "type": "object", diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index d71f8de644..28dcb3da9d 100644 --- a/openapi/sled-agent.json +++ 
b/openapi/sled-agent.json @@ -327,6 +327,78 @@ } } }, + "/instances/{instance_id}/external-ip": { + "put": { + "operationId": "instance_put_external_ip", + "parameters": [ + { + "in": "path", + "name": "instance_id", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InstanceExternalIpBody" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "operationId": "instance_delete_external_ip", + "parameters": [ + { + "in": "path", + "name": "instance_id", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InstanceExternalIpBody" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/instances/{instance_id}/migration-ids": { "put": { "operationId": "instance_put_migration_ids", @@ -4517,6 +4589,49 @@ "vmm_runtime" ] }, + "InstanceExternalIpBody": { + "description": "Used to dynamically update external IPs attached to an instance.", + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "ephemeral" + ] + }, + "value": { + "type": "string", + "format": "ip" + } + }, + "required": [ + "type", + "value" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "floating" + ] + }, + "value": { + "type": "string", + "format": "ip" + } + }, + "required": [ + "type", + "value" + ] + } + ] + }, "InstanceHardware": { "description": "Describes the instance 
hardware.", "type": "object", diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 8c8a5f2a03..8fb3fb0d09 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -9,8 +9,8 @@ use crate::bootstrap::early_networking::EarlyNetworkConfig; use crate::bootstrap::params::AddSledRequest; use crate::params::{ CleanupContextUpdate, DiskEnsureBody, InstanceEnsureBody, - InstancePutMigrationIdsBody, InstancePutStateBody, - InstancePutStateResponse, InstanceUnregisterResponse, OmicronZonesConfig, + InstanceExternalIpBody, InstancePutMigrationIdsBody, InstancePutStateBody, + InstancePutStateResponse, InstanceUnregisterResponse, ServiceEnsureBody, SledRole, TimeSync, VpcFirewallRulesEnsureBody, ZoneBundleId, ZoneBundleMetadata, Zpool, }; @@ -53,6 +53,8 @@ pub fn api() -> SledApiDescription { api.register(instance_issue_disk_snapshot_request)?; api.register(instance_put_migration_ids)?; api.register(instance_put_state)?; + api.register(instance_put_external_ip)?; + api.register(instance_delete_external_ip)?; api.register(instance_register)?; api.register(instance_unregister)?; api.register(omicron_zones_get)?; @@ -466,6 +468,38 @@ async fn instance_put_migration_ids( )) } +#[endpoint { + method = PUT, + path = "/instances/{instance_id}/external-ip", +}] +async fn instance_put_external_ip( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, +) -> Result { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + let body_args = body.into_inner(); + sa.instance_put_external_ip(instance_id, &body_args).await?; + Ok(HttpResponseUpdatedNoContent()) +} + +#[endpoint { + method = DELETE, + path = "/instances/{instance_id}/external-ip", +}] +async fn instance_delete_external_ip( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, +) -> Result { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + let body_args = 
body.into_inner(); + sa.instance_delete_external_ip(instance_id, &body_args).await?; + Ok(HttpResponseUpdatedNoContent()) +} + /// Path parameters for Disk requests (sled agent API) #[derive(Deserialize, JsonSchema)] struct DiskPathParam { diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 057402c57a..797b01334c 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -10,8 +10,8 @@ use crate::common::instance::{ }; use crate::instance_manager::{InstanceManagerServices, InstanceTicket}; use crate::nexus::NexusClientWithResolver; -use crate::params::ZoneBundleCause; use crate::params::ZoneBundleMetadata; +use crate::params::{InstanceExternalIpBody, ZoneBundleCause}; use crate::params::{ InstanceHardware, InstanceMigrationSourceParams, InstanceMigrationTargetParams, InstanceStateRequested, VpcFirewallRule, @@ -1094,4 +1094,99 @@ impl Instance { Err(Error::InstanceNotRunning(inner.properties.id)) } } + + pub async fn add_external_ip( + &self, + ip: &InstanceExternalIpBody, + ) -> Result<(), Error> { + let mut inner = self.inner.lock().await; + + // TODO: not enforcing v4 + v6 very well here. + // TODO: reset state on fail. + // TODO: error handling is garbage + match ip { + InstanceExternalIpBody::Ephemeral(_) + if inner.ephemeral_ip.is_some() => + { + return Err(Error::Timeout( + "Ephemeral IP already attached".into(), + )); + } + InstanceExternalIpBody::Ephemeral(ip) => { + inner.ephemeral_ip = Some(*ip); + } + InstanceExternalIpBody::Floating(ip) + if inner.floating_ips.contains(ip) => + { + return Err(Error::Timeout( + "Floating IP currently attached to self".into(), + )); + } + InstanceExternalIpBody::Floating(ip) => { + inner.floating_ips.push(*ip); + } + } + + // TODO: actually care about multiple NICs in a sane way. 
+ let nic_id = inner.requested_nics[0].id; + let nic_kind = inner.requested_nics[0].kind; + + inner.port_manager.external_ips_ensure( + nic_id, + nic_kind, + Some(inner.source_nat), + inner.ephemeral_ip, + &inner.floating_ips, + )?; + + Ok(()) + } + + pub async fn delete_external_ip( + &self, + ip: &InstanceExternalIpBody, + ) -> Result<(), Error> { + let mut inner = self.inner.lock().await; + + // TODO: not enforcing v4 + v6 very well here. + // TODO: error handling is garbage + // TODO: reset state on fail. + match ip { + InstanceExternalIpBody::Ephemeral(ip) + if inner.ephemeral_ip != Some(*ip) => + { + return Err(Error::Timeout( + "Couldn't detach intended Ephemeral IP: mismatch".into(), + )); + } + InstanceExternalIpBody::Ephemeral(_) => { + inner.ephemeral_ip = None; + } + InstanceExternalIpBody::Floating(ip) => { + let floating_index = + inner.floating_ips.iter().position(|v| v == ip); + if let Some(pos) = floating_index { + inner.floating_ips.swap_remove(pos); + } else { + return Err(Error::Timeout( + "Target Floating IP not attached to self".into(), + )); + } + } + } + + // TODO: actually care about multiple NICs in a sane way. 
+ let nic_id = inner.requested_nics[0].id; + let nic_kind = inner.requested_nics[0].kind; + + inner.port_manager.external_ips_ensure( + nic_id, + nic_kind, + Some(inner.source_nat), + inner.ephemeral_ip, + &inner.floating_ips, + )?; + + Ok(()) + } } diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index c1b7e402a4..eddf98c06c 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -7,6 +7,7 @@ use crate::instance::propolis_zone_name; use crate::instance::Instance; use crate::nexus::NexusClientWithResolver; +use crate::params::InstanceExternalIpBody; use crate::params::ZoneBundleMetadata; use crate::params::{ InstanceHardware, InstanceMigrationSourceParams, InstancePutStateResponse, @@ -434,6 +435,42 @@ impl InstanceManager { }; instance.request_zone_bundle().await } + + pub async fn add_external_ip( + &self, + instance_id: Uuid, + ip: &InstanceExternalIpBody, + ) -> Result<(), Error> { + let instance = { + let instances = self.inner.instances.lock().unwrap(); + instances.get(&instance_id).map(|(id, v)| v.clone()) + }; + + let Some(instance) = instance else { + return Err(Error::NoSuchInstance(instance_id)); + }; + + instance.add_external_ip(ip).await?; + Ok(()) + } + + pub async fn delete_external_ip( + &self, + instance_id: Uuid, + ip: &InstanceExternalIpBody, + ) -> Result<(), Error> { + let instance = { + let instances = self.inner.instances.lock().unwrap(); + instances.get(&instance_id).map(|(id, v)| v.clone()) + }; + + let Some(instance) = instance else { + return Err(Error::NoSuchInstance(instance_id)); + }; + + instance.delete_external_ip(ip).await?; + Ok(()) + } } /// Represents membership of an instance in the [`InstanceManager`]. 
diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index a7d91e2b93..e5e1b82977 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -825,3 +825,11 @@ pub struct CleanupContextUpdate { /// The new limit on the underlying dataset quota allowed for bundles. pub storage_limit: Option, } + +/// Used to dynamically update external IPs attached to an instance. +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +#[serde(rename_all = "snake_case", tag = "type", content = "value")] +pub enum InstanceExternalIpBody { + Ephemeral(IpAddr), + Floating(IpAddr), +} diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 5f278b7f38..2fe044530e 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -16,9 +16,9 @@ use crate::long_running_tasks::LongRunningTaskHandles; use crate::metrics::MetricsManager; use crate::nexus::{ConvertInto, NexusClientWithResolver, NexusRequestQueue}; use crate::params::{ - DiskStateRequested, InstanceHardware, InstanceMigrationSourceParams, + DiskStateRequested, InstanceExternalIpBody, InstanceHardware, InstanceMigrationSourceParams, InstancePutStateResponse, InstanceStateRequested, - InstanceUnregisterResponse, OmicronZonesConfig, SledRole, TimeSync, + InstanceUnregisterResponse, ServiceEnsureBody, SledRole, TimeSync, VpcFirewallRule, ZoneBundleMetadata, Zpool, }; use crate::services::{self, ServiceManager}; @@ -950,6 +950,30 @@ impl SledAgent { .map_err(|e| Error::Instance(e)) } + pub async fn instance_put_external_ip( + &self, + instance_id: Uuid, + external_ip: &InstanceExternalIpBody, + ) -> Result<(), Error> { + self.inner + .instances + .add_external_ip(instance_id, external_ip) + .await + .map_err(|e| Error::Instance(e)) + } + + pub async fn instance_delete_external_ip( + &self, + instance_id: Uuid, + external_ip: &InstanceExternalIpBody, + ) -> Result<(), Error> { + self.inner + .instances + .delete_external_ip(instance_id, external_ip) + .await + 
.map_err(|e| Error::Instance(e)) + } + /// Idempotently ensures that the given virtual disk is attached (or not) as /// specified. /// From 34c03d74ebfd61a8e436ead235d3cb0384e61206 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 13 Dec 2023 22:36:01 +0000 Subject: [PATCH 02/56] The sagaization begins --- .../src/db/datastore/external_ip.rs | 6 +- nexus/src/app/instance.rs | 101 ++-- nexus/src/app/instance_network.rs | 30 +- nexus/src/app/sagas/instance_create.rs | 9 +- nexus/src/app/sagas/instance_ip_attach.rs | 430 ++++++++++++++++++ nexus/src/app/sagas/instance_ip_detach.rs | 143 ++++++ nexus/src/app/sagas/mod.rs | 8 + nexus/src/external_api/http_entrypoints.rs | 20 +- sled-agent/src/sled_agent.rs | 8 +- 9 files changed, 662 insertions(+), 93 deletions(-) create mode 100644 nexus/src/app/sagas/instance_ip_attach.rs create mode 100644 nexus/src/app/sagas/instance_ip_detach.rs diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 4e74cea150..511f867235 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -467,7 +467,7 @@ impl DataStore { authz_fip: &authz::FloatingIp, db_fip: &FloatingIp, instance_id: Uuid, - ) -> UpdateResult<(FloatingIp, Option)> { + ) -> UpdateResult { use db::schema::external_ip::dsl; // Verify this FIP is not attached to any instances/services. @@ -485,8 +485,6 @@ impl DataStore { opctx.authorize(authz::Action::Modify, authz_fip).await?; opctx.authorize(authz::Action::Modify, &authz_instance).await?; - let i = self.instance_fetch_with_vmm(opctx, &authz_instance).await?; - let out = diesel::update(dsl::external_ip) .filter(dsl::id.eq(db_fip.id())) .filter(dsl::kind.eq(IpKind::Floating)) @@ -508,7 +506,7 @@ impl DataStore { .and_then(|r| FloatingIp::try_from(r)) .map_err(|e| Error::internal_error(&format!("{e}")))?; - Ok((out, i.sled_id())) + Ok(out) } /// Detaches a Floating IP address from an instance. 
diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 53173e0f7f..ea64fd380e 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1874,68 +1874,37 @@ impl super::Nexus { Ok(()) } - /// Detach a disk from an instance. + /// Attach a disk to an instance. pub(crate) async fn instance_attach_external_ip( - &self, + self: Arc, opctx: &OpContext, instance_lookup: &lookup::Instance<'_>, ext_ip: ¶ms::ExternalIpCreate, ) -> UpdateResult { - let (.., authz_project, authz_instance) = - instance_lookup.lookup_for(authz::Action::Modify).await?; + let (.., authz_project, authz_instance, instance) = + instance_lookup.fetch_for(authz::Action::Modify).await?; - let (authz_fip, db_fip) = match ext_ip { - params::ExternalIpCreate::Ephemeral { pool_name } => Err(Error::internal_error("ephemeral IP attach/detach not yet supported"))?, - params::ExternalIpCreate::Floating { floating_ip_name } => { - let floating_ip_name = db::model::Name(floating_ip_name.clone()); - let (.., authz_fip, db_fip) = LookupPath::new(&opctx, &self.datastore()) - .project_id(authz_project.id()) - .floating_ip_name(&floating_ip_name) - .fetch_for(authz::Action::Modify) - .await?; - (authz_fip, db_fip) - }, + let saga_params = sagas::instance_ip_attach::Params { + create_params: ext_ip.clone(), + authz_instance, + instance, + ephemeral_ip_id: Uuid::new_v4(), + serialized_authn: authn::saga::Serialized::for_opctx(opctx), }; - let (eip, sled_uuid) = self - .datastore() - .floating_ip_attach(opctx, &authz_fip, &db_fip, authz_instance.id()) + let saga_results = self + .execute_saga::( + saga_params, + ) .await?; - if let Some(uuid) = sled_uuid { - self.sled_client(&uuid) - .await? 
- .instance_put_external_ip(&authz_instance.id(), &sled_agent_client::types::InstanceExternalIpBody::Floating(db_fip.ip.ip())) - .await?; - - let (.., sled) = self.sled_lookup(opctx, &uuid)?.fetch().await?; - - let boundary_switches = self.boundary_switches(opctx) - .await?; - - for switch in boundary_switches { - let dpd_client = - self.dpd_clients.get(&switch).ok_or_else(|| { - Error::internal_error(&format!( - "unable to find client for switch {switch}" - )) - })?; - - self.instance_ensure_dpd_config( - &opctx, - authz_instance.id(), - &sled.address(), - None, - dpd_client, - ) - .await?; - } - } + todo!() - Ok(views::ExternalIp::from(views::FloatingIp::from(eip))) + // XXX: add a From for views::External + // Ok(views::ExternalIp::from(views::FloatingIp::from(eip))) } - /// Detach a disk from an instance. + /// Detach an external IP from an instance. pub(crate) async fn instance_detach_external_ip( &self, opctx: &OpContext, @@ -1946,27 +1915,41 @@ impl super::Nexus { instance_lookup.lookup_for(authz::Action::Modify).await?; let (authz_fip, db_fip) = match ext_ip { - params::ExternalIpDelete::Ephemeral => Err(Error::internal_error("ephemeral IP attach/detach not yet supported"))?, + params::ExternalIpDelete::Ephemeral => Err(Error::internal_error( + "ephemeral IP attach/detach not yet supported", + ))?, params::ExternalIpDelete::Floating { floating_ip_name } => { - let floating_ip_name = db::model::Name(floating_ip_name.clone()); - let (.., authz_fip, db_fip) = LookupPath::new(&opctx, &self.datastore()) - .project_id(authz_project.id()) - .floating_ip_name(&floating_ip_name) - .fetch_for(authz::Action::Modify) - .await?; + let floating_ip_name = + db::model::Name(floating_ip_name.clone()); + let (.., authz_fip, db_fip) = + LookupPath::new(&opctx, &self.datastore()) + .project_id(authz_project.id()) + .floating_ip_name(&floating_ip_name) + .fetch_for(authz::Action::Modify) + .await?; (authz_fip, db_fip) - }, + } }; let (eip, sled_uuid) = self .datastore() - 
.floating_ip_detach(opctx, &authz_fip, &db_fip, Some(authz_instance.id())) + .floating_ip_detach( + opctx, + &authz_fip, + &db_fip, + Some(authz_instance.id()), + ) .await?; if let Some(uuid) = sled_uuid { self.sled_client(&uuid) .await? - .instance_delete_external_ip(&authz_instance.id(), &sled_agent_client::types::InstanceExternalIpBody::Floating(db_fip.ip.ip())) + .instance_delete_external_ip( + &authz_instance.id(), + &sled_agent_client::types::InstanceExternalIpBody::Floating( + db_fip.ip.ip(), + ), + ) .await?; } diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index 3db749f43b..65430aec12 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -296,7 +296,7 @@ impl super::Nexus { opctx: &OpContext, instance_id: Uuid, sled_ip_address: &std::net::SocketAddrV6, - ip_index_filter: Option, + ip_filter: Option, dpd_client: &Arc, ) -> Result<(), Error> { let log = &self.log; @@ -344,33 +344,25 @@ impl super::Nexus { .instance_lookup_external_ips(&opctx, instance_id) .await?; - if let Some(wanted_index) = ip_index_filter { - if let None = ips.get(wanted_index) { + let ips_of_interest = if let Some(wanted_id) = ip_filter { + if let Some(ip) = ips.iter().find(|v| v.id == wanted_id) { + std::slice::from_ref(ip) + } else { return Err(Error::internal_error(&format!( - "failed to find external ip address at index: {}", - wanted_index + "failed to find external ip address with id: {wanted_id}", ))); } - } + } else { + &ips[..] 
+ }; let sled_address = Ipv6Net(Ipv6Network::new(*sled_ip_address.ip(), 128).unwrap()); - for target_ip in ips - .iter() - .enumerate() - .filter(|(index, _)| { - if let Some(wanted_index) = ip_index_filter { - *index == wanted_index - } else { - true - } - }) - .map(|(_, ip)| ip) - { + for external_ip in ips_of_interest { // For each external ip, add a nat entry to the database self.ensure_nat_entry( - target_ip, + external_ip, sled_address, &network_interface, mac_address, diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index fd86e2052a..89948d4db5 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -680,7 +680,14 @@ async fn sic_allocate_instance_external_ip_undo( .fetch_for(authz::Action::Modify) .await?; - datastore.floating_ip_detach(&opctx, &authz_fip, &db_fip).await?; + datastore + .floating_ip_detach( + &opctx, + &authz_fip, + &db_fip, + Some(repeat_saga_params.instance_id), + ) + .await?; } } Ok(()) diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs new file mode 100644 index 0000000000..28fe02acce --- /dev/null +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -0,0 +1,430 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use super::{ + common_storage::{ + call_pantry_attach_for_disk, call_pantry_detach_for_disk, + delete_crucible_regions, ensure_all_datasets_and_regions, + get_pantry_address, + }, + ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, + ACTION_GENERATE_ID, +}; +use crate::app::sagas::declare_saga_actions; +use crate::app::{authn, authz, db}; +use crate::external_api::params; +use futures::TryFutureExt; +use nexus_db_queries::db::identity::{Asset, Resource}; +use nexus_db_queries::db::lookup::LookupPath; +use omicron_common::api::external::DiskState; +use omicron_common::api::external::Error; +use rand::{rngs::StdRng, RngCore, SeedableRng}; +use serde::Deserialize; +use serde::Serialize; +use sled_agent_client::types::{CrucibleOpts, VolumeConstructionRequest}; +use std::convert::TryFrom; +use std::net::IpAddr; +use std::net::SocketAddrV6; +use steno::ActionError; +use steno::Node; +use uuid::Uuid; + +use sled_agent_client::types::InstanceExternalIpBody; + +#[derive(Debug, Deserialize, Serialize)] +enum ExternalIp { + Ephemeral(IpAddr, Uuid), + Floating(IpAddr, Uuid), +} + +impl From for InstanceExternalIpBody { + fn from(value: ExternalIp) -> Self { + match value { + ExternalIp::Ephemeral(ip, _) => { + InstanceExternalIpBody::Ephemeral(ip) + } + ExternalIp::Floating(ip, _) => InstanceExternalIpBody::Floating(ip), + } + } +} + +impl From for Uuid { + fn from(value: ExternalIp) -> Self { + match value { + ExternalIp::Ephemeral(_, id) => id, + ExternalIp::Floating(_, id) => id, + } + } +} + +// rough sequence of evts: +// - take temp ownership of instance while interacting w/ sled agent +// -> mark instance migration id as Some(0) if None +// - Attach+alloc EIP to instance +// - Register routes +// -> ensure_dpd... +// -> must precede OPTE: host may change its sending +// behaviour prematurely +// - Register addr in OPTE +// -> Put addr in sled-agent endpoint +// - free up migration_id of instance. 
+// -> mark instance migration id as None + +declare_saga_actions! { + instance_ip_attach; + LOCK_MIGRATION -> "sled_id" { + + siia_migration_lock + - siia_migration_lock_undo + } + + ATTACH_EXTERNAL_IP -> "new_ip" { + + siia_attach_ip + - siia_attach_ip_undo + } + + REGISTER_NAT -> "no_result3" { + + siia_nat + - siia_nat + } + + ENSURE_OPTE_PORT -> "no_result4" { + + siia_update_opte + - siia_update_opte_undo + } + + UNLOCK_MIGRATION -> "no_result1" { + + siia_migration_unlock + - siia_migration_unlock_undo + } +} + +#[derive(Debug, Deserialize, Serialize)] +pub struct Params { + pub create_params: params::ExternalIpCreate, + pub authz_instance: authz::Instance, + pub instance: db::model::Instance, + pub ephemeral_ip_id: Uuid, + /// Authentication context to use to fetch the instance's current state from + /// the database. + pub serialized_authn: authn::saga::Serialized, +} + +#[derive(Debug)] +pub struct SagaInstanceIpAttach; +impl NexusSaga for SagaInstanceIpAttach { + const NAME: &'static str = "external-ip-attach"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + instance_ip_attach_register_actions(registry); + } + + fn make_saga_dag( + _params: &Self::Params, + mut builder: steno::DagBuilder, + ) -> Result { + builder.append(lock_migration_action()); + builder.append(attach_external_ip_action()); + builder.append(register_nat_action()); + builder.append(ensure_opte_port_action()); + builder.append(unlock_migration_action()); + Ok(builder.build()?) + } +} + +async fn siia_migration_lock( + sagactx: NexusActionContext, +) -> Result, ActionError> { + // TODO: do this. + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let params = sagactx.saga_params::()?; + + let inst_and_vmm = datastore + .instance_fetch_with_vmm( + &osagactx.nexus().opctx_alloc, + &params.authz_instance, + ) + .await + .map_err(ActionError::action_failed)?; + + // TODO: actually lock? 
+ // TODO: fail out in a user-friendly way if migrating? + + Ok(inst_and_vmm.vmm().as_ref().map(|v| v.sled_id)) +} + +async fn siia_migration_lock_undo( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + // TODO: do this iff. we implement migration lock. + Ok(()) +} + +// TODO: factor this out for attach, detach, and instance create +// to share an impl. + +async fn siia_attach_ip( + sagactx: NexusActionContext, +) -> Result { + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + &params.serialized_authn, + ); + + match params.create_params { + // Allocate a new IP address from the target, possibly default, pool + params::ExternalIpCreate::Ephemeral { ref pool_name } => { + let pool_name = + pool_name.as_ref().map(|name| db::model::Name(name.clone())); + let eip = datastore + .allocate_instance_ephemeral_ip( + &opctx, + params.ephemeral_ip_id, + params.instance.id(), + pool_name, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(ExternalIp::Ephemeral(eip.ip.ip(), params.ephemeral_ip_id)) + } + // Set the parent of an existing floating IP to the new instance's ID. 
+ params::ExternalIpCreate::Floating { ref floating_ip_name } => { + let floating_ip_name = db::model::Name(floating_ip_name.clone()); + let (.., authz_fip, db_fip) = LookupPath::new(&opctx, &datastore) + .project_id(params.instance.project_id) + .floating_ip_name(&floating_ip_name) + .fetch_for(authz::Action::Modify) + .await + .map_err(ActionError::action_failed)?; + + let eip = datastore + .floating_ip_attach( + &opctx, + &authz_fip, + &db_fip, + params.instance.id(), + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(ExternalIp::Floating(eip.ip.ip(), db_fip.id())) + } + } +} + +async fn siia_attach_ip_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + &params.serialized_authn, + ); + + // TODO: should not be looking up by name here for FIP. + match params.create_params { + params::ExternalIpCreate::Ephemeral { .. } => { + datastore + .deallocate_external_ip(&opctx, params.ephemeral_ip_id) + .await?; + } + params::ExternalIpCreate::Floating { floating_ip_name } => { + let floating_ip_name = db::model::Name(floating_ip_name.clone()); + let (.., authz_fip, db_fip) = LookupPath::new(&opctx, &datastore) + .project_id(params.instance.project_id) + .floating_ip_name(&floating_ip_name) + .fetch_for(authz::Action::Modify) + .await?; + + datastore + .floating_ip_detach( + &opctx, + &authz_fip, + &db_fip, + Some(params.instance.id()), + ) + .await?; + } + } + Ok(()) +} + +async fn siia_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { + // NOTE: on undo we want to do this after unbind. + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + &params.serialized_authn, + ); + + // NOTE: mostly copied from instance_start. 
+ + // No physical sled? Don't push NAT. + let Some(sled_uuid) = sagactx.lookup::>("sled_id")? else { + return Ok(()); + }; + + let new_ip = sagactx.lookup::("new_ip")?; + + // Querying sleds requires fleet access; use the instance allocator context + // for this. + let (.., sled) = LookupPath::new(&osagactx.nexus().opctx_alloc, &datastore) + .sled_id(sled_uuid) + .fetch() + .await + .map_err(ActionError::action_failed)?; + + // Querying boundary switches also requires fleet access and the use of the + // instance allocator context. + let boundary_switches = osagactx + .nexus() + .boundary_switches(&osagactx.nexus().opctx_alloc) + .await + .map_err(ActionError::action_failed)?; + + for switch in boundary_switches { + let dpd_client = + osagactx.nexus().dpd_clients.get(&switch).ok_or_else(|| { + ActionError::action_failed(Error::internal_error(&format!( + "unable to find client for switch {switch}" + ))) + })?; + + osagactx + .nexus() + .instance_ensure_dpd_config( + &opctx, + params.instance.id(), + &sled.address(), + Some(new_ip.into()), + dpd_client, + ) + .await + .map_err(ActionError::action_failed)?; + } + + Ok(()) +} + +async fn siia_update_opte( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + &params.serialized_authn, + ); + + // No physical sled? Don't inform OPTE. + let Some(sled_uuid) = sagactx.lookup::>("sled_id")? else { + return Ok(()); + }; + + let new_ip = sagactx.lookup::("new_ip")?; + + // TODO: disambiguate the various sled agent errors etc. + osagactx + .nexus() + .sled_client(&sled_uuid) + .await + .map_err(ActionError::action_failed)? 
+ .instance_put_external_ip(&params.instance.id(), &new_ip.into()) + .await + .map_err(|_| { + ActionError::action_failed(Error::invalid_request("hmm")) + })?; + + Ok(()) +} + +async fn siia_update_opte_undo( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + todo!() +} + +// TODO +async fn siia_todo(sagactx: NexusActionContext) -> Result<(), ActionError> { + todo!() +} + +async fn siia_migration_unlock( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + todo!() +} + +async fn siia_migration_unlock_undo( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + todo!() +} + +// TODO: backout changes if run state changed illegally? + +#[cfg(test)] +pub(crate) mod test { + use crate::{ + app::saga::create_saga_dag, app::sagas::disk_create::Params, + app::sagas::disk_create::SagaDiskCreate, external_api::params, + }; + use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; + use diesel::{ + ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, + }; + use dropshot::test_util::ClientTestContext; + use nexus_db_queries::context::OpContext; + use nexus_db_queries::{authn::saga::Serialized, db::datastore::DataStore}; + use nexus_test_utils::resource_helpers::create_ip_pool; + use nexus_test_utils::resource_helpers::create_project; + use nexus_test_utils::resource_helpers::DiskTest; + use nexus_test_utils_macros::nexus_test; + use omicron_common::api::external::ByteCount; + use omicron_common::api::external::IdentityMetadataCreateParams; + use omicron_common::api::external::Name; + use omicron_sled_agent::sim::SledAgent; + use uuid::Uuid; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + #[nexus_test(server = crate::Server)] + async fn test_saga_basic_usage_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + todo!() + } + + #[nexus_test(server = crate::Server)] + async fn test_action_failure_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + todo!() + } + + 
#[nexus_test(server = crate::Server)] + async fn test_action_failure_can_unwind_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + todo!() + } + + #[nexus_test(server = crate::Server)] + async fn test_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + todo!() + } +} diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs new file mode 100644 index 0000000000..1a4725d42b --- /dev/null +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -0,0 +1,143 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::{ + common_storage::{ + call_pantry_attach_for_disk, call_pantry_detach_for_disk, + delete_crucible_regions, ensure_all_datasets_and_regions, + get_pantry_address, + }, + ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, + ACTION_GENERATE_ID, +}; +use crate::app::sagas::declare_saga_actions; +use crate::app::{authn, authz, db}; +use crate::external_api::params; +use nexus_db_queries::db::identity::{Asset, Resource}; +use nexus_db_queries::db::lookup::LookupPath; +use omicron_common::api::external::DiskState; +use omicron_common::api::external::Error; +use rand::{rngs::StdRng, RngCore, SeedableRng}; +use serde::Deserialize; +use serde::Serialize; +use sled_agent_client::types::{CrucibleOpts, VolumeConstructionRequest}; +use std::convert::TryFrom; +use std::net::SocketAddrV6; +use steno::ActionError; +use steno::Node; +use uuid::Uuid; + +// rough sequence of evts: +// - take temp ownership of instance while interacting w/ sled agent +// -> mark instance migration id as Some(0) if None +// - Detach EIP from instance, hang onto its ID. +// - Withdraw routes +// -> ensure_dpd... (?) Do we actually need to? 
+// -> must precede OPTE: host may change its sending +// behaviour prematurely +// - Deregister addr in OPTE +// -> Put addr in sled-agent endpoint +// - Delete EIP iff. Ephemeral +// -> why so late? Risk that we can't recover our IP in an unwind. +// - free up migration_id of instance. +// -> mark instance migration id as None + +declare_saga_actions! { + instance_ip_detach; + STAGE1 -> "result1" { + + do_fn + - undo_fn + } +} + +#[derive(Debug, Deserialize, Serialize)] +pub struct Params { + pub authz_instance: authz::Instance, + pub instance: db::model::Instance, + /// Authentication context to use to fetch the instance's current state from + /// the database. + pub serialized_authn: authn::saga::Serialized, +} + +#[derive(Debug)] +pub struct SagaInstanceIpDetach; +impl NexusSaga for SagaInstanceIpDetach { + const NAME: &'static str = "external-ip-detach"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + instance_ip_detach_register_actions(registry); + } + + fn make_saga_dag( + _params: &Self::Params, + mut builder: steno::DagBuilder, + ) -> Result { + builder.append(stage1_action()); + Ok(builder.build()?) 
+ } +} + +async fn do_fn(sagactx: NexusActionContext) -> Result<(), ActionError> { + todo!() +} + +async fn undo_fn(sagactx: NexusActionContext) -> Result<(), ActionError> { + todo!() +} + +#[cfg(test)] +pub(crate) mod test { + use crate::{ + app::saga::create_saga_dag, app::sagas::disk_create::Params, + app::sagas::disk_create::SagaDiskCreate, external_api::params, + }; + use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; + use diesel::{ + ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, + }; + use dropshot::test_util::ClientTestContext; + use nexus_db_queries::context::OpContext; + use nexus_db_queries::{authn::saga::Serialized, db::datastore::DataStore}; + use nexus_test_utils::resource_helpers::create_ip_pool; + use nexus_test_utils::resource_helpers::create_project; + use nexus_test_utils::resource_helpers::DiskTest; + use nexus_test_utils_macros::nexus_test; + use omicron_common::api::external::ByteCount; + use omicron_common::api::external::IdentityMetadataCreateParams; + use omicron_common::api::external::Name; + use omicron_sled_agent::sim::SledAgent; + use uuid::Uuid; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + #[nexus_test(server = crate::Server)] + async fn test_saga_basic_usage_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + todo!() + } + + #[nexus_test(server = crate::Server)] + async fn test_action_failure_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + todo!() + } + + #[nexus_test(server = crate::Server)] + async fn test_action_failure_can_unwind_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + todo!() + } + + #[nexus_test(server = crate::Server)] + async fn test_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + todo!() + } +} diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index c5918d32ef..1bd85ecf32 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -26,6 +26,8 @@ pub mod 
image_delete; mod instance_common; pub mod instance_create; pub mod instance_delete; +pub mod instance_ip_attach; +pub mod instance_ip_detach; pub mod instance_migrate; pub mod instance_start; pub mod loopback_address_create; @@ -130,6 +132,12 @@ fn make_action_registry() -> ActionRegistry { ::register_actions( &mut registry, ); + ::register_actions( + &mut registry, + ); + ::register_actions( + &mut registry, + ); ::register_actions( &mut registry, ); diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 1d52277a96..12050ac54b 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -2480,8 +2480,6 @@ async fn instance_disk_detach( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } - - // Certificates /// List certificates for external endpoints @@ -3671,8 +3669,13 @@ async fn instance_external_ip_attach( }; let instance_lookup = nexus.instance_lookup(&opctx, instance_selector)?; - let disk = - nexus.instance_attach_external_ip(&opctx, &instance_lookup, &ip_to_detach.into_inner()).await?; + let disk = nexus + .instance_attach_external_ip( + &opctx, + &instance_lookup, + &ip_to_detach.into_inner(), + ) + .await?; Ok(HttpResponseAccepted(disk.into())) }; apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await @@ -3702,8 +3705,13 @@ async fn instance_external_ip_detach( }; let instance_lookup = nexus.instance_lookup(&opctx, instance_selector)?; - let disk = - nexus.instance_detach_external_ip(&opctx, &instance_lookup, &ip_to_detach.into_inner()).await?; + let disk = nexus + .instance_detach_external_ip( + &opctx, + &instance_lookup, + &ip_to_detach.into_inner(), + ) + .await?; Ok(HttpResponseAccepted(disk.into())) }; apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 2fe044530e..c4998cfbf7 100644 --- 
a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -16,10 +16,10 @@ use crate::long_running_tasks::LongRunningTaskHandles; use crate::metrics::MetricsManager; use crate::nexus::{ConvertInto, NexusClientWithResolver, NexusRequestQueue}; use crate::params::{ - DiskStateRequested, InstanceExternalIpBody, InstanceHardware, InstanceMigrationSourceParams, - InstancePutStateResponse, InstanceStateRequested, - InstanceUnregisterResponse, ServiceEnsureBody, SledRole, TimeSync, - VpcFirewallRule, ZoneBundleMetadata, Zpool, + DiskStateRequested, InstanceExternalIpBody, InstanceHardware, + InstanceMigrationSourceParams, InstancePutStateResponse, + InstanceStateRequested, InstanceUnregisterResponse, ServiceEnsureBody, + SledRole, TimeSync, VpcFirewallRule, ZoneBundleMetadata, Zpool, }; use crate::services::{self, ServiceManager}; use crate::storage_monitor::UnderlayAccess; From c6235e3fc6e780fa549466df5da683e60a683af5 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 14 Dec 2023 15:55:32 +0000 Subject: [PATCH 03/56] Theoretically in a testable state post-sagaization Still to solve: migration blocks, handling unexpt runstate change, preventing concurrent attach/detach of a given EIP. Takes the time to refactor the dpd_ensure and nat_removal so that we can target it to a single IP address on a given device. 
--- nexus/src/app/instance.rs | 66 ++-- nexus/src/app/instance_network.rs | 108 +++--- nexus/src/app/sagas/instance_delete.rs | 2 +- nexus/src/app/sagas/instance_ip_attach.rs | 198 +++++++---- nexus/src/app/sagas/instance_ip_detach.rs | 392 ++++++++++++++++++++- nexus/src/app/sagas/instance_start.rs | 29 +- nexus/src/external_api/http_entrypoints.rs | 8 +- sled-agent/src/http_entrypoints.rs | 2 +- sled-agent/src/sled_agent.rs | 2 +- 9 files changed, 610 insertions(+), 197 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index ea64fd380e..bd47f10a72 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1876,7 +1876,7 @@ impl super::Nexus { /// Attach a disk to an instance. pub(crate) async fn instance_attach_external_ip( - self: Arc, + self: &Arc, opctx: &OpContext, instance_lookup: &lookup::Instance<'_>, ext_ip: &params::ExternalIpCreate, @@ -1888,72 +1888,48 @@ impl super::Nexus { create_params: ext_ip.clone(), authz_instance, instance, - ephemeral_ip_id: Uuid::new_v4(), serialized_authn: authn::saga::Serialized::for_opctx(opctx), }; - let saga_results = self + let saga_outputs = self .execute_saga::( saga_params, ) .await?; - todo!() - - // XXX: add a From for views::External - // Ok(views::ExternalIp::from(views::FloatingIp::from(eip))) + saga_outputs + .lookup_node_output::("output") + .map_err(|e| Error::internal_error(&format!("{:#}", &e))) + .internal_context("looking up output from ip attach saga") } /// Detach an external IP from an instance. 
pub(crate) async fn instance_detach_external_ip( - &self, + self: &Arc, opctx: &OpContext, instance_lookup: &lookup::Instance<'_>, ext_ip: &params::ExternalIpDelete, ) -> UpdateResult { - let (.., authz_project, authz_instance) = - instance_lookup.lookup_for(authz::Action::Modify).await?; + let (.., authz_project, authz_instance, instance) = + instance_lookup.fetch_for(authz::Action::Modify).await?; - let (authz_fip, db_fip) = match ext_ip { - params::ExternalIpDelete::Ephemeral => Err(Error::internal_error( - "ephemeral IP attach/detach not yet supported", - ))?, - params::ExternalIpDelete::Floating { floating_ip_name } => { - let floating_ip_name = - db::model::Name(floating_ip_name.clone()); - let (.., authz_fip, db_fip) = - LookupPath::new(&opctx, &self.datastore()) - .project_id(authz_project.id()) - .floating_ip_name(&floating_ip_name) - .fetch_for(authz::Action::Modify) - .await?; - (authz_fip, db_fip) - } + let saga_params = sagas::instance_ip_detach::Params { + delete_params: ext_ip.clone(), + authz_instance, + instance, + serialized_authn: authn::saga::Serialized::for_opctx(opctx), }; - let (eip, sled_uuid) = self - .datastore() - .floating_ip_detach( - opctx, - &authz_fip, - &db_fip, - Some(authz_instance.id()), + let saga_outputs = self + .execute_saga::( + saga_params, ) .await?; - if let Some(uuid) = sled_uuid { - self.sled_client(&uuid) - .await? 
- .instance_delete_external_ip( - &authz_instance.id(), - &sled_agent_client::types::InstanceExternalIpBody::Floating( - db_fip.ip.ip(), - ), - ) - .await?; - } - - Ok(views::ExternalIp::from(views::FloatingIp::from(eip))) + saga_outputs + .lookup_node_output::("output") + .map_err(|e| Error::internal_error(&format!("{:#}", &e))) + .internal_context("looking up output from ip attach saga") } } diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index 65430aec12..e6270457ba 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -283,12 +283,10 @@ impl super::Nexus { /// - `instance_id`: The ID of the instance to act on. /// - `sled_ip_address`: The internal IP address assigned to the sled's /// sled agent. - /// - `ip_index_filter`: An optional filter on the index into the instance's + /// - `ip_filter`: An optional filter on the index into the instance's /// external IP array. - /// - If this is `Some(n)`, this routine configures DPD state for only the - /// Nth external IP in the collection returned from CRDB. The caller is - /// responsible for ensuring that the IP collection has stable indices - /// when making this call. + /// - If this is `Some(id)`, this routine configures DPD state for only the + /// external IP with `id` in the collection returned from CRDB. /// - If this is `None`, this routine configures DPD for all external /// IPs. 
pub(crate) async fn instance_ensure_dpd_config( @@ -297,7 +295,6 @@ impl super::Nexus { instance_id: Uuid, sled_ip_address: &std::net::SocketAddrV6, ip_filter: Option, - dpd_client: &Arc, ) -> Result<(), Error> { let log = &self.log; @@ -359,24 +356,41 @@ impl super::Nexus { let sled_address = Ipv6Net(Ipv6Network::new(*sled_ip_address.ip(), 128).unwrap()); - for external_ip in ips_of_interest { - // For each external ip, add a nat entry to the database - self.ensure_nat_entry( - external_ip, - sled_address, - &network_interface, - mac_address, - opctx, - ) - .await?; - } + // Querying boundary switches also requires fleet access and the use of the + // instance allocator context. + let boundary_switches = + self.boundary_switches(&self.opctx_alloc).await?; - // Notify dendrite that there are changes for it to reconcile. - // In the event of a failure to notify dendrite, we'll log an error - // and rely on dendrite's RPW timer to catch it up. - if let Err(e) = dpd_client.ipv4_nat_trigger_update().await { - error!(self.log, "failed to notify dendrite of nat updates"; "error" => ?e); - }; + for switch in &boundary_switches { + debug!(&self.log, "notifying dendrite of updates"; + "instance_id" => %authz_instance.id(), + "switch" => switch.to_string()); + + let dpd_client = self.dpd_clients.get(switch).ok_or_else(|| { + Error::internal_error(&format!( + "unable to find dendrite client for {switch}" + )) + })?; + + for external_ip in ips_of_interest { + // For each external ip, add a nat entry to the database + self.ensure_nat_entry( + external_ip, + sled_address, + &network_interface, + mac_address, + opctx, + ) + .await?; + } + + // Notify dendrite that there are changes for it to reconcile. + // In the event of a failure to notify dendrite, we'll log an error + // and rely on dendrite's RPW timer to catch it up. 
+ if let Err(e) = dpd_client.ipv4_nat_trigger_update().await { + error!(self.log, "failed to notify dendrite of nat updates"; "error" => ?e); + }; + } Ok(()) } @@ -427,10 +441,17 @@ impl super::Nexus { /// - If an operation fails while this routine is walking NAT entries, it /// will continue trying to delete subsequent entries but will return the /// first error it encountered. + /// - `ip_filter`: An optional filter on the index into the instance's + /// external IP array. + /// - If this is `Some(id)`, this routine configures DPD state for only the + /// external IP with `id` in the collection returned from CRDB. + /// - If this is `None`, this routine configures DPD for all external + /// IPs. pub(crate) async fn instance_delete_dpd_config( &self, opctx: &OpContext, authz_instance: &authz::Instance, + ip_filter: Option, ) -> Result<(), Error> { let log = &self.log; let instance_id = authz_instance.id(); @@ -443,8 +464,20 @@ impl super::Nexus { .instance_lookup_external_ips(opctx, instance_id) .await?; + let ips_of_interest = if let Some(wanted_id) = ip_filter { + if let Some(ip) = external_ips.iter().find(|v| v.id == wanted_id) { + std::slice::from_ref(ip) + } else { + return Err(Error::internal_error(&format!( + "failed to find external ip address with id: {wanted_id}", + ))); + } + } else { + &external_ips[..] 
+ }; + let mut errors = vec![]; - for entry in external_ips { + for entry in ips_of_interest { // Soft delete the NAT entry match self .db_datastore @@ -498,7 +531,7 @@ impl super::Nexus { }; } - if let Some(e) = errors.into_iter().nth(0) { + if let Some(e) = errors.into_iter().next() { return Err(e); } @@ -707,24 +740,13 @@ impl super::Nexus { .fetch() .await?; - let boundary_switches = - self.boundary_switches(&self.opctx_alloc).await?; - - for switch in &boundary_switches { - let dpd_client = self.dpd_clients.get(switch).ok_or_else(|| { - Error::internal_error(&format!( - "could not find dpd client for {switch}" - )) - })?; - self.instance_ensure_dpd_config( - opctx, - instance_id, - &sled.address(), - None, - dpd_client, - ) - .await?; - } + self.instance_ensure_dpd_config( + opctx, + instance_id, + &sled.address(), + None, + ) + .await?; Ok(()) } diff --git a/nexus/src/app/sagas/instance_delete.rs b/nexus/src/app/sagas/instance_delete.rs index 7802312b10..8111ee6bd7 100644 --- a/nexus/src/app/sagas/instance_delete.rs +++ b/nexus/src/app/sagas/instance_delete.rs @@ -134,7 +134,7 @@ async fn sid_delete_nat( osagactx .nexus() - .instance_delete_dpd_config(&opctx, &authz_instance) + .instance_delete_dpd_config(&opctx, &authz_instance, None) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 28fe02acce..6f22d9bf82 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -17,6 +17,7 @@ use crate::external_api::params; use futures::TryFutureExt; use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; +use nexus_types::external_api::views; use omicron_common::api::external::DiskState; use omicron_common::api::external::Error; use rand::{rngs::StdRng, RngCore, SeedableRng}; @@ -32,12 +33,27 @@ use uuid::Uuid; use sled_agent_client::types::InstanceExternalIpBody; -#[derive(Debug, 
Deserialize, Serialize)] +#[derive(Copy, Clone, Debug, Deserialize, Serialize)] enum ExternalIp { Ephemeral(IpAddr, Uuid), Floating(IpAddr, Uuid), } +impl From for views::ExternalIp { + fn from(value: ExternalIp) -> Self { + match value { + ExternalIp::Ephemeral(ip, _) => views::ExternalIp { + ip, + kind: nexus_types::external_api::shared::IpKind::Ephemeral, + }, + ExternalIp::Floating(ip, _) => views::ExternalIp { + ip, + kind: nexus_types::external_api::shared::IpKind::Floating, + }, + } + } +} + impl From for InstanceExternalIpBody { fn from(value: ExternalIp) -> Self { match value { @@ -78,22 +94,26 @@ declare_saga_actions! { - siia_migration_lock_undo } + RESOLVE_EXTERNAL_IP -> "new_ip_uuid" { + + siia_resolve_ip + } + ATTACH_EXTERNAL_IP -> "new_ip" { + siia_attach_ip - siia_attach_ip_undo } - REGISTER_NAT -> "no_result3" { + REGISTER_NAT -> "no_result0" { + siia_nat - - siia_nat + - siia_nat_undo } - ENSURE_OPTE_PORT -> "no_result4" { + ENSURE_OPTE_PORT -> "no_result1" { + siia_update_opte - siia_update_opte_undo } - UNLOCK_MIGRATION -> "no_result1" { + UNLOCK_MIGRATION -> "output" { + siia_migration_unlock - siia_migration_unlock_undo } @@ -104,7 +124,6 @@ pub struct Params { pub create_params: params::ExternalIpCreate, pub authz_instance: authz::Instance, pub instance: db::model::Instance, - pub ephemeral_ip_id: Uuid, /// Authentication context to use to fetch the instance's current state from /// the database. 
pub serialized_authn: authn::saga::Serialized, @@ -125,6 +144,7 @@ impl NexusSaga for SagaInstanceIpAttach { mut builder: steno::DagBuilder, ) -> Result { builder.append(lock_migration_action()); + builder.append(resolve_external_ip_action()); builder.append(attach_external_ip_action()); builder.append(register_nat_action()); builder.append(ensure_opte_port_action()); @@ -156,12 +176,43 @@ async fn siia_migration_lock( } async fn siia_migration_lock_undo( - sagactx: NexusActionContext, -) -> Result<(), ActionError> { + _sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { // TODO: do this iff. we implement migration lock. Ok(()) } +// This is split out to prevent double name lookup in event that we +// need to undo `siia_attach_ip`. +async fn siia_resolve_ip( + sagactx: NexusActionContext, +) -> Result { + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + match params.create_params { + // Allocate a new IP address from the target, possibly default, pool + params::ExternalIpCreate::Ephemeral { .. } => Ok(Uuid::new_v4()), + // Set the parent of an existing floating IP to the new instance's ID. + params::ExternalIpCreate::Floating { ref floating_ip_name } => { + let floating_ip_name = db::model::Name(floating_ip_name.clone()); + let (.., authz_fip) = LookupPath::new(&opctx, &datastore) + .project_id(params.instance.project_id) + .floating_ip_name(&floating_ip_name) + .lookup_for(authz::Action::Modify) + .await + .map_err(ActionError::action_failed)?; + + Ok(authz_fip.id()) + } + } +} + // TODO: factor this out for attach, detach, and instance create // to share an impl. 
@@ -176,6 +227,8 @@ async fn siia_attach_ip( ¶ms.serialized_authn, ); + let new_ip_uuid = sagactx.lookup::("new_ip_uuid")?; + match params.create_params { // Allocate a new IP address from the target, possibly default, pool params::ExternalIpCreate::Ephemeral { ref pool_name } => { @@ -184,21 +237,19 @@ async fn siia_attach_ip( let eip = datastore .allocate_instance_ephemeral_ip( &opctx, - params.ephemeral_ip_id, + new_ip_uuid, params.instance.id(), pool_name, ) .await .map_err(ActionError::action_failed)?; - Ok(ExternalIp::Ephemeral(eip.ip.ip(), params.ephemeral_ip_id)) + Ok(ExternalIp::Ephemeral(eip.ip.ip(), new_ip_uuid)) } // Set the parent of an existing floating IP to the new instance's ID. - params::ExternalIpCreate::Floating { ref floating_ip_name } => { - let floating_ip_name = db::model::Name(floating_ip_name.clone()); + params::ExternalIpCreate::Floating { .. } => { let (.., authz_fip, db_fip) = LookupPath::new(&opctx, &datastore) - .project_id(params.instance.project_id) - .floating_ip_name(&floating_ip_name) + .floating_ip_id(new_ip_uuid) .fetch_for(authz::Action::Modify) .await .map_err(ActionError::action_failed)?; @@ -229,18 +280,15 @@ async fn siia_attach_ip_undo( ¶ms.serialized_authn, ); - // TODO: should not be looking up by name here for FIP. + let new_ip_uuid = sagactx.lookup::("new_ip_uuid")?; + match params.create_params { params::ExternalIpCreate::Ephemeral { .. } => { - datastore - .deallocate_external_ip(&opctx, params.ephemeral_ip_id) - .await?; + datastore.deallocate_external_ip(&opctx, new_ip_uuid).await?; } - params::ExternalIpCreate::Floating { floating_ip_name } => { - let floating_ip_name = db::model::Name(floating_ip_name.clone()); + params::ExternalIpCreate::Floating { .. 
} => { let (.., authz_fip, db_fip) = LookupPath::new(&opctx, &datastore) - .project_id(params.instance.project_id) - .floating_ip_name(&floating_ip_name) + .floating_ip_id(new_ip_uuid) .fetch_for(authz::Action::Modify) .await?; @@ -258,7 +306,6 @@ async fn siia_attach_ip_undo( } async fn siia_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { - // NOTE: on undo we want to do this after unbind. let osagactx = sagactx.user_data(); let datastore = osagactx.datastore(); let params = sagactx.saga_params::()?; @@ -275,6 +322,7 @@ async fn siia_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { }; let new_ip = sagactx.lookup::("new_ip")?; + let ip_id = new_ip.into(); // Querying sleds requires fleet access; use the instance allocator context // for this. @@ -284,49 +332,52 @@ async fn siia_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { .await .map_err(ActionError::action_failed)?; - // Querying boundary switches also requires fleet access and the use of the - // instance allocator context. 
- let boundary_switches = osagactx + osagactx .nexus() - .boundary_switches(&osagactx.nexus().opctx_alloc) + .instance_ensure_dpd_config( + &opctx, + params.instance.id(), + &sled.address(), + Some(ip_id), + ) .await .map_err(ActionError::action_failed)?; - for switch in boundary_switches { - let dpd_client = - osagactx.nexus().dpd_clients.get(&switch).ok_or_else(|| { - ActionError::action_failed(Error::internal_error(&format!( - "unable to find client for switch {switch}" - ))) - })?; - - osagactx - .nexus() - .instance_ensure_dpd_config( - &opctx, - params.instance.id(), - &sled.address(), - Some(new_ip.into()), - dpd_client, - ) - .await - .map_err(ActionError::action_failed)?; - } - Ok(()) } -async fn siia_update_opte( +async fn siia_nat_undo( sagactx: NexusActionContext, -) -> Result<(), ActionError> { +) -> Result<(), anyhow::Error> { let osagactx = sagactx.user_data(); - let datastore = osagactx.datastore(); let params = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action( &sagactx, ¶ms.serialized_authn, ); + // If we didn't push NAT before, don't undo it. + if sagactx.lookup::>("sled_id")?.is_none() { + return Ok(()); + } + + let new_ip = sagactx.lookup::("new_ip")?; + let ip_id = new_ip.into(); + + osagactx + .nexus() + .instance_delete_dpd_config(&opctx, ¶ms.authz_instance, Some(ip_id)) + .await?; + + Ok(()) +} + +async fn siia_update_opte( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + // No physical sled? Don't inform OPTE. let Some(sled_uuid) = sagactx.lookup::>("sled_id")? else { return Ok(()); @@ -351,25 +402,48 @@ async fn siia_update_opte( async fn siia_update_opte_undo( sagactx: NexusActionContext, -) -> Result<(), ActionError> { - todo!() -} +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + + // If we didn't push OPTE before, don't undo it. 
+ let Some(sled_uuid) = sagactx.lookup::>("sled_id")? else { + return Ok(()); + }; + + let new_ip = sagactx.lookup::("new_ip")?; + + // TODO: disambiguate the various sled agent errors etc. + osagactx + .nexus() + .sled_client(&sled_uuid) + .await + .map_err(ActionError::action_failed)? + .instance_delete_external_ip(¶ms.instance.id(), &new_ip.into()) + .await + .map_err(|_| { + ActionError::action_failed(Error::invalid_request("hmm")) + })?; -// TODO -async fn siia_todo(sagactx: NexusActionContext) -> Result<(), ActionError> { - todo!() + Ok(()) } async fn siia_migration_unlock( sagactx: NexusActionContext, -) -> Result<(), ActionError> { - todo!() +) -> Result { + // TODO: do this iff. we implement migration lock. + // TODO: Backtrack if there's an unexpected change to runstate? + + let new_ip = sagactx.lookup::("new_ip")?; + + Ok(new_ip.into()) } async fn siia_migration_unlock_undo( - sagactx: NexusActionContext, -) -> Result<(), ActionError> { - todo!() + _sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + // TODO: do this iff. we implement migration lock. + Ok(()) } // TODO: backout changes if run state changed illegally? 
diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 1a4725d42b..8195933b84 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -16,6 +16,7 @@ use crate::app::{authn, authz, db}; use crate::external_api::params; use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; +use nexus_types::external_api::views; use omicron_common::api::external::DiskState; use omicron_common::api::external::Error; use rand::{rngs::StdRng, RngCore, SeedableRng}; @@ -23,36 +24,104 @@ use serde::Deserialize; use serde::Serialize; use sled_agent_client::types::{CrucibleOpts, VolumeConstructionRequest}; use std::convert::TryFrom; +use std::net::IpAddr; use std::net::SocketAddrV6; use steno::ActionError; use steno::Node; use uuid::Uuid; +use sled_agent_client::types::InstanceExternalIpBody; + +#[derive(Copy, Clone, Debug, Deserialize, Serialize)] +enum ExternalIp { + Ephemeral(IpAddr, Uuid), + Floating(IpAddr, Uuid), +} + +impl From for views::ExternalIp { + fn from(value: ExternalIp) -> Self { + match value { + ExternalIp::Ephemeral(ip, _) => views::ExternalIp { + ip, + kind: nexus_types::external_api::shared::IpKind::Ephemeral, + }, + ExternalIp::Floating(ip, _) => views::ExternalIp { + ip, + kind: nexus_types::external_api::shared::IpKind::Floating, + }, + } + } +} + +impl From for InstanceExternalIpBody { + fn from(value: ExternalIp) -> Self { + match value { + ExternalIp::Ephemeral(ip, _) => { + InstanceExternalIpBody::Ephemeral(ip) + } + ExternalIp::Floating(ip, _) => InstanceExternalIpBody::Floating(ip), + } + } +} + +impl From for Uuid { + fn from(value: ExternalIp) -> Self { + match value { + ExternalIp::Ephemeral(_, id) => id, + ExternalIp::Floating(_, id) => id, + } + } +} + // rough sequence of evts: // - take temp ownership of instance while interacting w/ sled agent // -> mark instance migration id as Some(0) if None -// - Detach 
EIP from instance, hang onto its ID. // - Withdraw routes // -> ensure_dpd... (?) Do we actually need to? // -> must precede OPTE: host may change its sending // behaviour prematurely // - Deregister addr in OPTE // -> Put addr in sled-agent endpoint -// - Delete EIP iff. Ephemeral +// - Detach and Delete EIP iff. Ephemeral // -> why so late? Risk that we can't recover our IP in an unwind. // - free up migration_id of instance. // -> mark instance migration id as None declare_saga_actions! { instance_ip_detach; - STAGE1 -> "result1" { - + do_fn - - undo_fn + LOCK_MIGRATION -> "sled_id" { + + siid_migration_lock + - siid_migration_lock_undo + } + + RESOLVE_EXTERNAL_IP -> "new_ip_uuid" { + + siid_resolve_ip + } + + REMOVE_NAT -> "no_result0" { + + siid_nat + - siid_nat_undo + } + + REMOVE_OPTE_PORT -> "no_result1" { + + siid_update_opte + - siid_update_opte_undo + } + + DETACH_EXTERNAL_IP -> "new_ip" { + + siid_detach_ip + - siid_detach_ip_undo + } + + UNLOCK_MIGRATION -> "output" { + + siid_migration_unlock + - siid_migration_unlock_undo } } #[derive(Debug, Deserialize, Serialize)] pub struct Params { + pub delete_params: params::ExternalIpDelete, pub authz_instance: authz::Instance, pub instance: db::model::Instance, /// Authentication context to use to fetch the instance's current state from @@ -60,6 +129,304 @@ pub struct Params { pub serialized_authn: authn::saga::Serialized, } +async fn siid_migration_lock( + sagactx: NexusActionContext, +) -> Result, ActionError> { + // TODO: do this. + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let params = sagactx.saga_params::()?; + + let inst_and_vmm = datastore + .instance_fetch_with_vmm( + &osagactx.nexus().opctx_alloc, + ¶ms.authz_instance, + ) + .await + .map_err(ActionError::action_failed)?; + + // TODO: actually lock? + // TODO: fail out in a user-friendly way if migrating? 
+ + Ok(inst_and_vmm.vmm().as_ref().map(|v| v.sled_id)) +} + +async fn siid_migration_lock_undo( + _sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + // TODO: do this iff. we implement migration lock. + Ok(()) +} + +// This is split out to prevent double name lookup in event that we +// need to undo `siid_attach_ip`. +async fn siid_resolve_ip( + sagactx: NexusActionContext, +) -> Result { + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + match params.delete_params { + // Allocate a new IP address from the target, possibly default, pool + params::ExternalIpDelete::Ephemeral => Ok(Uuid::new_v4()), + // Set the parent of an existing floating IP to the new instance's ID. + params::ExternalIpDelete::Floating { ref floating_ip_name } => { + let floating_ip_name = db::model::Name(floating_ip_name.clone()); + let (.., authz_fip) = LookupPath::new(&opctx, &datastore) + .project_id(params.instance.project_id) + .floating_ip_name(&floating_ip_name) + .lookup_for(authz::Action::Modify) + .await + .map_err(ActionError::action_failed)?; + + Ok(authz_fip.id()) + } + } +} + +async fn siid_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + // No physical sled? Don't push NAT. 
+ if sagactx.lookup::>("sled_id")?.is_none() { + return Ok(()); + } + + let new_ip = sagactx.lookup::("new_ip")?; + let ip_id = new_ip.into(); + + osagactx + .nexus() + .instance_delete_dpd_config(&opctx, ¶ms.authz_instance, Some(ip_id)) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn siid_nat_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + // NOTE: mostly copied from instance_start. + + // If we didn't push NAT before, don't undo it. + let Some(sled_uuid) = sagactx.lookup::>("sled_id")? else { + return Ok(()); + }; + + let new_ip = sagactx.lookup::("new_ip")?; + let ip_id = new_ip.into(); + + // Querying sleds requires fleet access; use the instance allocator context + // for this. + let (.., sled) = LookupPath::new(&osagactx.nexus().opctx_alloc, &datastore) + .sled_id(sled_uuid) + .fetch() + .await?; + + osagactx + .nexus() + .instance_ensure_dpd_config( + &opctx, + params.instance.id(), + &sled.address(), + Some(ip_id), + ) + .await?; + + Ok(()) +} + +async fn siid_update_opte( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + + // No physical sled? Don't inform OPTE. + let Some(sled_uuid) = sagactx.lookup::>("sled_id")? else { + return Ok(()); + }; + + let new_ip = sagactx.lookup::("new_ip")?; + + // TODO: disambiguate the various sled agent errors etc. + osagactx + .nexus() + .sled_client(&sled_uuid) + .await + .map_err(ActionError::action_failed)? 
+ .instance_delete_external_ip(¶ms.instance.id(), &new_ip.into()) + .await + .map_err(|_| { + ActionError::action_failed(Error::invalid_request("hmm")) + })?; + + Ok(()) +} + +async fn siid_update_opte_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + + // If we didn't push OPTE before, don't undo it. + let Some(sled_uuid) = sagactx.lookup::>("sled_id")? else { + return Ok(()); + }; + + let new_ip = sagactx.lookup::("new_ip")?; + + // TODO: disambiguate the various sled agent errors etc. + osagactx + .nexus() + .sled_client(&sled_uuid) + .await + .map_err(ActionError::action_failed)? + .instance_put_external_ip(¶ms.instance.id(), &new_ip.into()) + .await?; + + Ok(()) +} + +async fn siid_detach_ip( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let new_ip = sagactx.lookup::("new_ip")?; + let new_ip_uuid = sagactx.lookup::("new_ip_uuid")?; + + match params.delete_params { + params::ExternalIpDelete::Ephemeral => { + datastore + .deallocate_external_ip(&opctx, new_ip_uuid) + .await + .map_err(ActionError::action_failed)?; + } + params::ExternalIpDelete::Floating { .. 
} => { + let (.., authz_fip, db_fip) = LookupPath::new(&opctx, &datastore) + .floating_ip_id(new_ip_uuid) + .fetch_for(authz::Action::Modify) + .await + .map_err(ActionError::action_failed)?; + + datastore + .floating_ip_detach( + &opctx, + &authz_fip, + &db_fip, + Some(params.instance.id()), + ) + .await + .map_err(ActionError::action_failed)?; + } + } + + Ok(()) +} + +async fn siid_detach_ip_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let new_ip_uuid = sagactx.lookup::("new_ip_uuid")?; + + match params.delete_params { + // Allocate a new IP address from the target, possibly default, pool + params::ExternalIpDelete::Ephemeral => { + // let pool_name = + // pool_name.as_ref().map(|name| db::model::Name(name.clone())); + // let eip = datastore + // .allocate_instance_ephemeral_ip( + // &opctx, + // new_ip_uuid, + // params.instance.id(), + // pool_name, + // ) + // .await + // .map_err(ActionError::action_failed)?; + + // Ok(ExternalIp::Ephemeral(eip.ip.ip(), new_ip_uuid)) + + // TODO::: + // need to think over... can we even reallocate the same IP? + // We can try, and fail, and then completely unwind if so. + // Can we even fail at this point? + Ok(()) + } + // Set the parent of an existing floating IP to the new instance's ID. + params::ExternalIpDelete::Floating { .. 
} => { + let (.., authz_fip, db_fip) = LookupPath::new(&opctx, &datastore) + .floating_ip_id(new_ip_uuid) + .fetch_for(authz::Action::Modify) + .await + .map_err(ActionError::action_failed)?; + + let eip = datastore + .floating_ip_attach( + &opctx, + &authz_fip, + &db_fip, + params.instance.id(), + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) + } + } +} + +async fn siid_migration_unlock( + sagactx: NexusActionContext, +) -> Result { + // TODO: do this iff. we implement migration lock. + // TODO: Backtrack if there's an unexpected change to runstate? + let new_ip = sagactx.lookup::("new_ip")?; + + Ok(new_ip.into()) +} + +async fn siid_migration_unlock_undo( + _sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + // TODO: do this iff. we implement migration lock. + Ok(()) +} + #[derive(Debug)] pub struct SagaInstanceIpDetach; impl NexusSaga for SagaInstanceIpDetach { @@ -74,19 +441,16 @@ impl NexusSaga for SagaInstanceIpDetach { _params: &Self::Params, mut builder: steno::DagBuilder, ) -> Result { - builder.append(stage1_action()); + builder.append(lock_migration_action()); + builder.append(resolve_external_ip_action()); + builder.append(remove_nat_action()); + builder.append(remove_opte_port_action()); + builder.append(detach_external_ip_action()); + builder.append(unlock_migration_action()); Ok(builder.build()?) 
} } -async fn do_fn(sagactx: NexusActionContext) -> Result<(), ActionError> { - todo!() -} - -async fn undo_fn(sagactx: NexusActionContext) -> Result<(), ActionError> { - todo!() -} - #[cfg(test)] pub(crate) mod test { use crate::{ diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index e6717b0164..a96e3c8793 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -405,35 +405,12 @@ async fn sis_dpd_ensure( .await .map_err(ActionError::action_failed)?; - // Querying boundary switches also requires fleet access and the use of the - // instance allocator context. - let boundary_switches = osagactx + osagactx .nexus() - .boundary_switches(&osagactx.nexus().opctx_alloc) + .instance_ensure_dpd_config(&opctx, instance_id, &sled.address(), None) .await .map_err(ActionError::action_failed)?; - for switch in boundary_switches { - let dpd_client = - osagactx.nexus().dpd_clients.get(&switch).ok_or_else(|| { - ActionError::action_failed(Error::internal_error(&format!( - "unable to find client for switch {switch}" - ))) - })?; - - osagactx - .nexus() - .instance_ensure_dpd_config( - &opctx, - instance_id, - &sled.address(), - None, - dpd_client, - ) - .await - .map_err(ActionError::action_failed)?; - } - Ok(()) } @@ -460,7 +437,7 @@ async fn sis_dpd_ensure_undo( osagactx .nexus() - .instance_delete_dpd_config(&opctx, &authz_instance) + .instance_delete_dpd_config(&opctx, &authz_instance, None) .await?; Ok(()) diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 12050ac54b..c6b954e46e 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -3669,14 +3669,14 @@ async fn instance_external_ip_attach( }; let instance_lookup = nexus.instance_lookup(&opctx, instance_selector)?; - let disk = nexus + let ip = nexus .instance_attach_external_ip( &opctx, &instance_lookup, &ip_to_detach.into_inner(), ) 
.await?; - Ok(HttpResponseAccepted(disk.into())) + Ok(HttpResponseAccepted(ip.into())) }; apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } @@ -3705,14 +3705,14 @@ async fn instance_external_ip_detach( }; let instance_lookup = nexus.instance_lookup(&opctx, instance_selector)?; - let disk = nexus + let ip = nexus .instance_detach_external_ip( &opctx, &instance_lookup, &ip_to_detach.into_inner(), ) .await?; - Ok(HttpResponseAccepted(disk.into())) + Ok(HttpResponseAccepted(ip.into())) }; apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 8fb3fb0d09..1794d69e30 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -10,7 +10,7 @@ use crate::bootstrap::params::AddSledRequest; use crate::params::{ CleanupContextUpdate, DiskEnsureBody, InstanceEnsureBody, InstanceExternalIpBody, InstancePutMigrationIdsBody, InstancePutStateBody, - InstancePutStateResponse, InstanceUnregisterResponse, ServiceEnsureBody, + InstancePutStateResponse, InstanceUnregisterResponse, OmicronZonesConfig, SledRole, TimeSync, VpcFirewallRulesEnsureBody, ZoneBundleId, ZoneBundleMetadata, Zpool, }; diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index c4998cfbf7..d59a614b8a 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -18,7 +18,7 @@ use crate::nexus::{ConvertInto, NexusClientWithResolver, NexusRequestQueue}; use crate::params::{ DiskStateRequested, InstanceExternalIpBody, InstanceHardware, InstanceMigrationSourceParams, InstancePutStateResponse, - InstanceStateRequested, InstanceUnregisterResponse, ServiceEnsureBody, + InstanceStateRequested, InstanceUnregisterResponse, OmicronZonesConfig, SledRole, TimeSync, VpcFirewallRule, ZoneBundleMetadata, Zpool, }; use crate::services::{self, ServiceManager}; From 89acdba5d8a158adbe113825e4b4217f0687ca4e Mon Sep 17 
00:00:00 2001 From: Kyle Simpson Date: Fri, 15 Dec 2023 14:31:41 +0000 Subject: [PATCH 04/56] Working sagas ...and now the cleanup + locking begins --- nexus/src/app/sagas/instance_ip_attach.rs | 6 ++- nexus/src/app/sagas/instance_ip_detach.rs | 45 +++++++++++++++-------- 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 6f22d9bf82..e62795e29f 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -160,10 +160,14 @@ async fn siia_migration_lock( let osagactx = sagactx.user_data(); let datastore = osagactx.datastore(); let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); let inst_and_vmm = datastore .instance_fetch_with_vmm( - &osagactx.nexus().opctx_alloc, + &opctx, ¶ms.authz_instance, ) .await diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 8195933b84..55c3723c16 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -14,6 +14,7 @@ use super::{ use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; use crate::external_api::params; +use nexus_db_model::IpKind; use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; @@ -94,7 +95,7 @@ declare_saga_actions! { - siid_migration_lock_undo } - RESOLVE_EXTERNAL_IP -> "new_ip_uuid" { + RESOLVE_EXTERNAL_IP -> "target_ip" { + siid_resolve_ip } @@ -108,7 +109,7 @@ declare_saga_actions! 
{ - siid_update_opte_undo } - DETACH_EXTERNAL_IP -> "new_ip" { + DETACH_EXTERNAL_IP -> "no_result2" { + siid_detach_ip - siid_detach_ip_undo } @@ -136,10 +137,14 @@ async fn siid_migration_lock( let osagactx = sagactx.user_data(); let datastore = osagactx.datastore(); let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); let inst_and_vmm = datastore .instance_fetch_with_vmm( - &osagactx.nexus().opctx_alloc, + &opctx, ¶ms.authz_instance, ) .await @@ -162,7 +167,7 @@ async fn siid_migration_lock_undo( // need to undo `siid_attach_ip`. async fn siid_resolve_ip( sagactx: NexusActionContext, -) -> Result { +) -> Result { let osagactx = sagactx.user_data(); let datastore = osagactx.datastore(); let params = sagactx.saga_params::()?; @@ -173,18 +178,27 @@ async fn siid_resolve_ip( match params.delete_params { // Allocate a new IP address from the target, possibly default, pool - params::ExternalIpDelete::Ephemeral => Ok(Uuid::new_v4()), + params::ExternalIpDelete::Ephemeral => { + let eips = datastore.instance_lookup_external_ips(&opctx, params.instance.id()) + .await + .map_err(ActionError::action_failed)?; + + let eph_ip = eips.iter().find(|e| e.kind == IpKind::Ephemeral) + .ok_or_else(|| ActionError::action_failed(Error::invalid_request("instance does not have an attached ephemeral IP address")))?; + + Ok(ExternalIp::Ephemeral(eph_ip.ip.ip(), eph_ip.id)) + }, // Set the parent of an existing floating IP to the new instance's ID. 
params::ExternalIpDelete::Floating { ref floating_ip_name } => { let floating_ip_name = db::model::Name(floating_ip_name.clone()); - let (.., authz_fip) = LookupPath::new(&opctx, &datastore) + let (.., fip) = LookupPath::new(&opctx, &datastore) .project_id(params.instance.project_id) .floating_ip_name(&floating_ip_name) - .lookup_for(authz::Action::Modify) + .fetch_for(authz::Action::Modify) .await .map_err(ActionError::action_failed)?; - Ok(authz_fip.id()) + Ok(ExternalIp::Floating(fip.ip.ip(), fip.id())) } } } @@ -202,7 +216,7 @@ async fn siid_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { return Ok(()); } - let new_ip = sagactx.lookup::("new_ip")?; + let new_ip = sagactx.lookup::("target_ip")?; let ip_id = new_ip.into(); osagactx @@ -232,7 +246,7 @@ async fn siid_nat_undo( return Ok(()); }; - let new_ip = sagactx.lookup::("new_ip")?; + let new_ip = sagactx.lookup::("target_ip")?; let ip_id = new_ip.into(); // Querying sleds requires fleet access; use the instance allocator context @@ -266,7 +280,7 @@ async fn siid_update_opte( return Ok(()); }; - let new_ip = sagactx.lookup::("new_ip")?; + let new_ip = sagactx.lookup::("target_ip")?; // TODO: disambiguate the various sled agent errors etc. osagactx @@ -294,7 +308,7 @@ async fn siid_update_opte_undo( return Ok(()); }; - let new_ip = sagactx.lookup::("new_ip")?; + let new_ip = sagactx.lookup::("target_ip")?; // TODO: disambiguate the various sled agent errors etc. 
osagactx @@ -319,8 +333,7 @@ async fn siid_detach_ip( ¶ms.serialized_authn, ); - let new_ip = sagactx.lookup::("new_ip")?; - let new_ip_uuid = sagactx.lookup::("new_ip_uuid")?; + let new_ip_uuid = sagactx.lookup::("target_ip")?.into(); match params.delete_params { params::ExternalIpDelete::Ephemeral => { @@ -362,7 +375,7 @@ async fn siid_detach_ip_undo( ¶ms.serialized_authn, ); - let new_ip_uuid = sagactx.lookup::("new_ip_uuid")?; + let new_ip_uuid = sagactx.lookup::("target_ip")?.into(); match params.delete_params { // Allocate a new IP address from the target, possibly default, pool @@ -415,7 +428,7 @@ async fn siid_migration_unlock( ) -> Result { // TODO: do this iff. we implement migration lock. // TODO: Backtrack if there's an unexpected change to runstate? - let new_ip = sagactx.lookup::("new_ip")?; + let new_ip = sagactx.lookup::("target_ip")?; Ok(new_ip.into()) } From 536616221599156957cbbed14b2678623bca1355 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 15 Dec 2023 14:37:39 +0000 Subject: [PATCH 05/56] `cargo fix` --- nexus/src/app/instance.rs | 4 +- nexus/src/app/instance_network.rs | 2 +- nexus/src/app/sagas/instance_ip_attach.rs | 61 +++++++++------------- nexus/src/app/sagas/instance_ip_detach.rs | 63 ++++++++++------------- sled-agent/src/instance_manager.rs | 4 +- 5 files changed, 56 insertions(+), 78 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index bd47f10a72..d4e67458cb 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1881,7 +1881,7 @@ impl super::Nexus { instance_lookup: &lookup::Instance<'_>, ext_ip: ¶ms::ExternalIpCreate, ) -> UpdateResult { - let (.., authz_project, authz_instance, instance) = + let (.., _authz_project, authz_instance, instance) = instance_lookup.fetch_for(authz::Action::Modify).await?; let saga_params = sagas::instance_ip_attach::Params { @@ -1910,7 +1910,7 @@ impl super::Nexus { instance_lookup: &lookup::Instance<'_>, ext_ip: ¶ms::ExternalIpDelete, ) -> 
UpdateResult { - let (.., authz_project, authz_instance, instance) = + let (.., _authz_project, authz_instance, instance) = instance_lookup.fetch_for(authz::Action::Modify).await?; let saga_params = sagas::instance_ip_detach::Params { diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index e6270457ba..8941cec24d 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -24,7 +24,7 @@ use sled_agent_client::types::DeleteVirtualNetworkInterfaceHost; use sled_agent_client::types::SetVirtualNetworkInterfaceHost; use std::collections::HashSet; use std::str::FromStr; -use std::sync::Arc; + use uuid::Uuid; impl super::Nexus { diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index e62795e29f..64b692a4be 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -3,13 +3,7 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use super::{ - common_storage::{ - call_pantry_attach_for_disk, call_pantry_detach_for_disk, - delete_crucible_regions, ensure_all_datasets_and_regions, - get_pantry_address, - }, - ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, - ACTION_GENERATE_ID, + ActionRegistry, NexusActionContext, NexusSaga, }; use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; @@ -18,17 +12,17 @@ use futures::TryFutureExt; use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; -use omicron_common::api::external::DiskState; + use omicron_common::api::external::Error; -use rand::{rngs::StdRng, RngCore, SeedableRng}; + use serde::Deserialize; use serde::Serialize; -use sled_agent_client::types::{CrucibleOpts, VolumeConstructionRequest}; -use std::convert::TryFrom; + + use std::net::IpAddr; -use std::net::SocketAddrV6; + use steno::ActionError; -use steno::Node; + use uuid::Uuid; use 
sled_agent_client::types::InstanceExternalIpBody; @@ -454,54 +448,49 @@ async fn siia_migration_unlock_undo( #[cfg(test)] pub(crate) mod test { - use crate::{ - app::saga::create_saga_dag, app::sagas::disk_create::Params, - app::sagas::disk_create::SagaDiskCreate, external_api::params, - }; - use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; - use diesel::{ - ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, - }; - use dropshot::test_util::ClientTestContext; - use nexus_db_queries::context::OpContext; - use nexus_db_queries::{authn::saga::Serialized, db::datastore::DataStore}; - use nexus_test_utils::resource_helpers::create_ip_pool; - use nexus_test_utils::resource_helpers::create_project; - use nexus_test_utils::resource_helpers::DiskTest; + + + + + + + + + use nexus_test_utils_macros::nexus_test; - use omicron_common::api::external::ByteCount; - use omicron_common::api::external::IdentityMetadataCreateParams; - use omicron_common::api::external::Name; - use omicron_sled_agent::sim::SledAgent; - use uuid::Uuid; + + + + + type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; #[nexus_test(server = crate::Server)] async fn test_saga_basic_usage_succeeds( - cptestctx: &ControlPlaneTestContext, + _cptestctx: &ControlPlaneTestContext, ) { todo!() } #[nexus_test(server = crate::Server)] async fn test_action_failure_can_unwind( - cptestctx: &ControlPlaneTestContext, + _cptestctx: &ControlPlaneTestContext, ) { todo!() } #[nexus_test(server = crate::Server)] async fn test_action_failure_can_unwind_idempotently( - cptestctx: &ControlPlaneTestContext, + _cptestctx: &ControlPlaneTestContext, ) { todo!() } #[nexus_test(server = crate::Server)] async fn test_actions_succeed_idempotently( - cptestctx: &ControlPlaneTestContext, + _cptestctx: &ControlPlaneTestContext, ) { todo!() } diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 55c3723c16..1d961377ee 100644 --- 
a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -3,13 +3,7 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use super::{ - common_storage::{ - call_pantry_attach_for_disk, call_pantry_detach_for_disk, - delete_crucible_regions, ensure_all_datasets_and_regions, - get_pantry_address, - }, - ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, - ACTION_GENERATE_ID, + ActionRegistry, NexusActionContext, NexusSaga, }; use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; @@ -18,17 +12,17 @@ use nexus_db_model::IpKind; use nexus_db_queries::db::identity::{Asset, Resource}; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; -use omicron_common::api::external::DiskState; + use omicron_common::api::external::Error; -use rand::{rngs::StdRng, RngCore, SeedableRng}; + use serde::Deserialize; use serde::Serialize; -use sled_agent_client::types::{CrucibleOpts, VolumeConstructionRequest}; -use std::convert::TryFrom; + + use std::net::IpAddr; -use std::net::SocketAddrV6; + use steno::ActionError; -use steno::Node; + use uuid::Uuid; use sled_agent_client::types::InstanceExternalIpBody; @@ -408,7 +402,7 @@ async fn siid_detach_ip_undo( .await .map_err(ActionError::action_failed)?; - let eip = datastore + let _eip = datastore .floating_ip_attach( &opctx, &authz_fip, @@ -466,54 +460,49 @@ impl NexusSaga for SagaInstanceIpDetach { #[cfg(test)] pub(crate) mod test { - use crate::{ - app::saga::create_saga_dag, app::sagas::disk_create::Params, - app::sagas::disk_create::SagaDiskCreate, external_api::params, - }; - use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; - use diesel::{ - ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, - }; - use dropshot::test_util::ClientTestContext; - use nexus_db_queries::context::OpContext; - use nexus_db_queries::{authn::saga::Serialized, db::datastore::DataStore}; - use 
nexus_test_utils::resource_helpers::create_ip_pool; - use nexus_test_utils::resource_helpers::create_project; - use nexus_test_utils::resource_helpers::DiskTest; + + + + + + + + + use nexus_test_utils_macros::nexus_test; - use omicron_common::api::external::ByteCount; - use omicron_common::api::external::IdentityMetadataCreateParams; - use omicron_common::api::external::Name; - use omicron_sled_agent::sim::SledAgent; - use uuid::Uuid; + + + + + type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; #[nexus_test(server = crate::Server)] async fn test_saga_basic_usage_succeeds( - cptestctx: &ControlPlaneTestContext, + _cptestctx: &ControlPlaneTestContext, ) { todo!() } #[nexus_test(server = crate::Server)] async fn test_action_failure_can_unwind( - cptestctx: &ControlPlaneTestContext, + _cptestctx: &ControlPlaneTestContext, ) { todo!() } #[nexus_test(server = crate::Server)] async fn test_action_failure_can_unwind_idempotently( - cptestctx: &ControlPlaneTestContext, + _cptestctx: &ControlPlaneTestContext, ) { todo!() } #[nexus_test(server = crate::Server)] async fn test_actions_succeed_idempotently( - cptestctx: &ControlPlaneTestContext, + _cptestctx: &ControlPlaneTestContext, ) { todo!() } diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index eddf98c06c..b66b0400e1 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -443,7 +443,7 @@ impl InstanceManager { ) -> Result<(), Error> { let instance = { let instances = self.inner.instances.lock().unwrap(); - instances.get(&instance_id).map(|(id, v)| v.clone()) + instances.get(&instance_id).map(|(_id, v)| v.clone()) }; let Some(instance) = instance else { @@ -461,7 +461,7 @@ impl InstanceManager { ) -> Result<(), Error> { let instance = { let instances = self.inner.instances.lock().unwrap(); - instances.get(&instance_id).map(|(id, v)| v.clone()) + instances.get(&instance_id).map(|(_id, v)| v.clone()) }; let Some(instance) = 
instance else { From b2d08f750e0bf083f0f4917fea66a04ac001e85b Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 15 Dec 2023 17:57:17 +0000 Subject: [PATCH 06/56] Back out iff. migration in progress. --- nexus/src/app/sagas/instance_ip_attach.rs | 38 ++++++------------ nexus/src/app/sagas/instance_ip_detach.rs | 48 +++++++---------------- 2 files changed, 28 insertions(+), 58 deletions(-) diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 64b692a4be..b04a02bea9 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -2,14 +2,11 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use super::{ - ActionRegistry, NexusActionContext, NexusSaga, -}; +use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; use crate::external_api::params; -use futures::TryFutureExt; -use nexus_db_queries::db::identity::{Asset, Resource}; +use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; @@ -18,7 +15,6 @@ use omicron_common::api::external::Error; use serde::Deserialize; use serde::Serialize; - use std::net::IpAddr; use steno::ActionError; @@ -160,17 +156,22 @@ async fn siia_migration_lock( ); let inst_and_vmm = datastore - .instance_fetch_with_vmm( - &opctx, - ¶ms.authz_instance, - ) + .instance_fetch_with_vmm(&opctx, ¶ms.authz_instance) .await .map_err(ActionError::action_failed)?; + // TODO: Currently stop if there's a migration. This may be a good case + // for RPW'ing ext_ip_state -> { NAT RPW, sled-agent } in future. 
+ if inst_and_vmm.instance().runtime_state.migration_id.is_some() { + return Err(ActionError::action_failed(Error::ServiceUnavailable { + internal_message: "target instance is migrating".into(), + })); + } + // TODO: actually lock? // TODO: fail out in a user-friendly way if migrating? - Ok(inst_and_vmm.vmm().as_ref().map(|v| v.sled_id)) + Ok(inst_and_vmm.sled_id()) } async fn siia_migration_lock_undo( @@ -448,21 +449,8 @@ async fn siia_migration_unlock_undo( #[cfg(test)] pub(crate) mod test { - - - - - - - - - + use nexus_test_utils_macros::nexus_test; - - - - - type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 1d961377ee..a14d64643f 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -2,31 +2,22 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-use super::{ - ActionRegistry, NexusActionContext, NexusSaga, -}; +use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; use crate::external_api::params; use nexus_db_model::IpKind; -use nexus_db_queries::db::identity::{Asset, Resource}; +use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; - use omicron_common::api::external::Error; - use serde::Deserialize; use serde::Serialize; - - +use sled_agent_client::types::InstanceExternalIpBody; use std::net::IpAddr; - use steno::ActionError; - use uuid::Uuid; -use sled_agent_client::types::InstanceExternalIpBody; - #[derive(Copy, Clone, Debug, Deserialize, Serialize)] enum ExternalIp { Ephemeral(IpAddr, Uuid), @@ -137,17 +128,20 @@ async fn siid_migration_lock( ); let inst_and_vmm = datastore - .instance_fetch_with_vmm( - &opctx, - ¶ms.authz_instance, - ) + .instance_fetch_with_vmm(&opctx, ¶ms.authz_instance) .await .map_err(ActionError::action_failed)?; + if inst_and_vmm.instance().runtime_state.migration_id.is_some() { + return Err(ActionError::action_failed(Error::ServiceUnavailable { + internal_message: "target instance is migrating".into(), + })); + } + // TODO: actually lock? // TODO: fail out in a user-friendly way if migrating? 
- Ok(inst_and_vmm.vmm().as_ref().map(|v| v.sled_id)) + Ok(inst_and_vmm.sled_id()) } async fn siid_migration_lock_undo( @@ -173,7 +167,8 @@ async fn siid_resolve_ip( match params.delete_params { // Allocate a new IP address from the target, possibly default, pool params::ExternalIpDelete::Ephemeral => { - let eips = datastore.instance_lookup_external_ips(&opctx, params.instance.id()) + let eips = datastore + .instance_lookup_external_ips(&opctx, params.instance.id()) .await .map_err(ActionError::action_failed)?; @@ -181,7 +176,7 @@ async fn siid_resolve_ip( .ok_or_else(|| ActionError::action_failed(Error::invalid_request("instance does not have an attached ephemeral IP address")))?; Ok(ExternalIp::Ephemeral(eph_ip.ip.ip(), eph_ip.id)) - }, + } // Set the parent of an existing floating IP to the new instance's ID. params::ExternalIpDelete::Floating { ref floating_ip_name } => { let floating_ip_name = db::model::Name(floating_ip_name.clone()); @@ -460,21 +455,8 @@ impl NexusSaga for SagaInstanceIpDetach { #[cfg(test)] pub(crate) mod test { - - - - - - - - - + use nexus_test_utils_macros::nexus_test; - - - - - type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; From 81c7d2559d7e08eb4fa67bd59a0589f072f3f278 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Mon, 18 Dec 2023 18:33:53 +0000 Subject: [PATCH 07/56] Make use of `check_and_update` for attach/detach Should close #4628. 
--- .../src/db/datastore/external_ip.rs | 108 ++++++++---------- nexus/src/app/external_ip.rs | 6 +- nexus/src/app/sagas/instance_create.rs | 13 +-- nexus/src/app/sagas/instance_ip_attach.rs | 24 ++-- nexus/src/app/sagas/instance_ip_detach.rs | 22 +--- 5 files changed, 72 insertions(+), 101 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 1e40708b3a..1bb8981625 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -436,26 +436,19 @@ impl DataStore { &self, opctx: &OpContext, authz_fip: &authz::FloatingIp, - db_fip: &FloatingIp, ) -> DeleteResult { use db::schema::external_ip::dsl; - // Verify this FIP is not attached to any instances/services. - if db_fip.parent_id.is_some() { - return Err(Error::invalid_request( - "Floating IP cannot be deleted while attached to an instance", - )); - } - opctx.authorize(authz::Action::Delete, authz_fip).await?; let now = Utc::now(); - let updated_rows = diesel::update(dsl::external_ip) - .filter(dsl::id.eq(db_fip.id())) + let result = diesel::update(dsl::external_ip) + .filter(dsl::id.eq(authz_fip.id())) .filter(dsl::time_deleted.is_null()) .filter(dsl::parent_id.is_null()) .set(dsl::time_deleted.eq(now)) - .execute_async(&*self.pool_connection_authorized(opctx).await?) + .check_if_exists::(authz_fip.id()) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) .await .map_err(|e| { public_error_from_diesel( @@ -464,12 +457,15 @@ impl DataStore { ) })?; - if updated_rows == 0 { - return Err(Error::invalid_request( - "deletion failed due to concurrent modification", - )); + match result.status { + // Verify this FIP is not attached to any instances/services. 
+ UpdateStatus::NotUpdatedButExists if result.found.parent_id.is_some() => Err(Error::invalid_request( + "Floating IP cannot be deleted while attached to an instance", + )), + // Only remaining cause of `NotUpdated` is earlier soft-deletion. + // Return success in this case to maintain idempotency. + UpdateStatus::Updated | UpdateStatus::NotUpdatedButExists => Ok(()), } - Ok(()) } /// Attaches a Floating IP address to an instance. @@ -477,18 +473,10 @@ impl DataStore { &self, opctx: &OpContext, authz_fip: &authz::FloatingIp, - db_fip: &FloatingIp, instance_id: Uuid, ) -> UpdateResult { use db::schema::external_ip::dsl; - // Verify this FIP is not attached to any instances/services. - if db_fip.parent_id.is_some() { - return Err(Error::invalid_request( - "Floating IP cannot be attached to one instance while still attached to another", - )); - } - let (.., authz_instance, _db_instance) = LookupPath::new(&opctx, self) .instance_id(instance_id) .fetch_for(authz::Action::Modify) @@ -497,8 +485,10 @@ impl DataStore { opctx.authorize(authz::Action::Modify, authz_fip).await?; opctx.authorize(authz::Action::Modify, &authz_instance).await?; + let fip_id = authz_fip.id(); + let out = diesel::update(dsl::external_ip) - .filter(dsl::id.eq(db_fip.id())) + .filter(dsl::id.eq(fip_id)) .filter(dsl::kind.eq(IpKind::Floating)) .filter(dsl::time_deleted.is_null()) .filter(dsl::parent_id.is_null()) @@ -506,19 +496,23 @@ impl DataStore { dsl::parent_id.eq(Some(instance_id)), dsl::time_modified.eq(Utc::now()), )) - .returning(ExternalIp::as_returning()) - .get_result_async(&*self.pool_connection_authorized(opctx).await?) + .check_if_exists::(fip_id) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) 
.await .map_err(|e| { public_error_from_diesel( e, ErrorHandler::NotFoundByResource(authz_fip), ) - }) - .and_then(|r| FloatingIp::try_from(r)) - .map_err(|e| Error::internal_error(&format!("{e}")))?; + })?; - Ok(out) + match (out.status, out.found.parent_id) { + (UpdateStatus::NotUpdatedButExists, Some(_)) => Err(Error::invalid_request( + "Floating IP cannot be attached to one instance while still attached to another", + )), + (UpdateStatus::Updated, _) => Ok(out.found.into()), + _ => unreachable!(), + } } /// Detaches a Floating IP address from an instance. @@ -526,37 +520,22 @@ impl DataStore { &self, opctx: &OpContext, authz_fip: &authz::FloatingIp, - db_fip: &FloatingIp, - target_instance_id: Option, - ) -> UpdateResult<(FloatingIp, Option)> { + instance_id: Uuid, + ) -> UpdateResult { use db::schema::external_ip::dsl; - let Some(instance_id) = db_fip.parent_id else { - return Err(Error::invalid_request( - "Floating IP is not attached to an instance", - )); - }; - - if let Some(target_instance_id) = target_instance_id { - if target_instance_id != instance_id { - return Err(Error::invalid_request( - "Floating IP is not attached to the target instance", - )); - } - } - - let (.., authz_instance, _db_instance) = LookupPath::new(&opctx, self) + let (.., authz_instance) = LookupPath::new(&opctx, self) .instance_id(instance_id) - .fetch_for(authz::Action::Modify) + .lookup_for(authz::Action::Modify) .await?; opctx.authorize(authz::Action::Modify, authz_fip).await?; opctx.authorize(authz::Action::Modify, &authz_instance).await?; - let i = self.instance_fetch_with_vmm(opctx, &authz_instance).await?; + let fip_id = authz_fip.id(); let out = diesel::update(dsl::external_ip) - .filter(dsl::id.eq(db_fip.id())) + .filter(dsl::id.eq(fip_id)) .filter(dsl::kind.eq(IpKind::Floating)) .filter(dsl::time_deleted.is_null()) .filter(dsl::parent_id.eq(instance_id)) @@ -564,18 +543,31 @@ impl DataStore { dsl::parent_id.eq(Option::::None), dsl::time_modified.eq(Utc::now()), )) - 
.returning(ExternalIp::as_returning()) - .get_result_async(&*self.pool_connection_authorized(opctx).await?) + .check_if_exists::(fip_id) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) .await .map_err(|e| { public_error_from_diesel( e, ErrorHandler::NotFoundByResource(authz_fip), ) - }) - .and_then(|r| FloatingIp::try_from(r)) - .map_err(|e| Error::internal_error(&format!("{e}")))?; + })?; - Ok((out, i.sled_id())) + match (out.status, out.found.parent_id) { + (UpdateStatus::NotUpdatedButExists, Some(id)) + if id != instance_id => + { + Err(Error::invalid_request( + "Floating IP is not attached to the target instance", + )) + } + (UpdateStatus::NotUpdatedButExists, None) => { + Err(Error::invalid_request( + "Floating IP is not attached to an instance", + )) + } + (UpdateStatus::Updated, _) => Ok(out.found.into()), + _ => unreachable!(), + } } } diff --git a/nexus/src/app/external_ip.rs b/nexus/src/app/external_ip.rs index 404f597288..fba34f767d 100644 --- a/nexus/src/app/external_ip.rs +++ b/nexus/src/app/external_ip.rs @@ -115,9 +115,9 @@ impl super::Nexus { opctx: &OpContext, ip_lookup: lookup::FloatingIp<'_>, ) -> DeleteResult { - let (.., authz_fip, db_fip) = - ip_lookup.fetch_for(authz::Action::Delete).await?; + let (.., authz_fip) = + ip_lookup.lookup_for(authz::Action::Delete).await?; - self.db_datastore.floating_ip_delete(opctx, &authz_fip, &db_fip).await + self.db_datastore.floating_ip_delete(opctx, &authz_fip).await } } diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 89948d4db5..c5e7adcadc 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -634,15 +634,15 @@ async fn sic_allocate_instance_external_ip( // Set the parent of an existing floating IP to the new instance's ID. 
params::ExternalIpCreate::Floating { ref floating_ip_name } => { let floating_ip_name = db::model::Name(floating_ip_name.clone()); - let (.., authz_fip, db_fip) = LookupPath::new(&opctx, &datastore) + let (.., authz_fip) = LookupPath::new(&opctx, &datastore) .project_id(saga_params.project_id) .floating_ip_name(&floating_ip_name) - .fetch_for(authz::Action::Modify) + .lookup_for(authz::Action::Modify) .await .map_err(ActionError::action_failed)?; datastore - .floating_ip_attach(&opctx, &authz_fip, &db_fip, instance_id) + .floating_ip_attach(&opctx, &authz_fip, instance_id) .await .map_err(ActionError::action_failed)?; } @@ -674,18 +674,17 @@ async fn sic_allocate_instance_external_ip_undo( } params::ExternalIpCreate::Floating { floating_ip_name } => { let floating_ip_name = db::model::Name(floating_ip_name.clone()); - let (.., authz_fip, db_fip) = LookupPath::new(&opctx, &datastore) + let (.., authz_fip) = LookupPath::new(&opctx, &datastore) .project_id(saga_params.project_id) .floating_ip_name(&floating_ip_name) - .fetch_for(authz::Action::Modify) + .lookup_for(authz::Action::Modify) .await?; datastore .floating_ip_detach( &opctx, &authz_fip, - &db_fip, - Some(repeat_saga_params.instance_id), + repeat_saga_params.instance_id, ) .await?; } diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index b04a02bea9..5d01a4fc02 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -247,23 +247,18 @@ async fn siia_attach_ip( } // Set the parent of an existing floating IP to the new instance's ID. params::ExternalIpCreate::Floating { .. 
} => { - let (.., authz_fip, db_fip) = LookupPath::new(&opctx, &datastore) + let (.., authz_fip) = LookupPath::new(&opctx, &datastore) .floating_ip_id(new_ip_uuid) - .fetch_for(authz::Action::Modify) + .lookup_for(authz::Action::Modify) .await .map_err(ActionError::action_failed)?; let eip = datastore - .floating_ip_attach( - &opctx, - &authz_fip, - &db_fip, - params.instance.id(), - ) + .floating_ip_attach(&opctx, &authz_fip, params.instance.id()) .await .map_err(ActionError::action_failed)?; - Ok(ExternalIp::Floating(eip.ip.ip(), db_fip.id())) + Ok(ExternalIp::Floating(eip.ip.ip(), authz_fip.id())) } } } @@ -286,18 +281,13 @@ async fn siia_attach_ip_undo( datastore.deallocate_external_ip(&opctx, new_ip_uuid).await?; } params::ExternalIpCreate::Floating { .. } => { - let (.., authz_fip, db_fip) = LookupPath::new(&opctx, &datastore) + let (.., authz_fip) = LookupPath::new(&opctx, &datastore) .floating_ip_id(new_ip_uuid) - .fetch_for(authz::Action::Modify) + .lookup_for(authz::Action::Modify) .await?; datastore - .floating_ip_detach( - &opctx, - &authz_fip, - &db_fip, - Some(params.instance.id()), - ) + .floating_ip_detach(&opctx, &authz_fip, params.instance.id()) .await?; } } diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index a14d64643f..ac948ad364 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -332,19 +332,14 @@ async fn siid_detach_ip( .map_err(ActionError::action_failed)?; } params::ExternalIpDelete::Floating { .. 
} => { - let (.., authz_fip, db_fip) = LookupPath::new(&opctx, &datastore) + let (.., authz_fip) = LookupPath::new(&opctx, &datastore) .floating_ip_id(new_ip_uuid) - .fetch_for(authz::Action::Modify) + .lookup_for(authz::Action::Modify) .await .map_err(ActionError::action_failed)?; datastore - .floating_ip_detach( - &opctx, - &authz_fip, - &db_fip, - Some(params.instance.id()), - ) + .floating_ip_detach(&opctx, &authz_fip, params.instance.id()) .await .map_err(ActionError::action_failed)?; } @@ -391,19 +386,14 @@ async fn siid_detach_ip_undo( } // Set the parent of an existing floating IP to the new instance's ID. params::ExternalIpDelete::Floating { .. } => { - let (.., authz_fip, db_fip) = LookupPath::new(&opctx, &datastore) + let (.., authz_fip) = LookupPath::new(&opctx, &datastore) .floating_ip_id(new_ip_uuid) - .fetch_for(authz::Action::Modify) + .lookup_for(authz::Action::Modify) .await .map_err(ActionError::action_failed)?; let _eip = datastore - .floating_ip_attach( - &opctx, - &authz_fip, - &db_fip, - params.instance.id(), - ) + .floating_ip_attach(&opctx, &authz_fip, params.instance.id()) .await .map_err(ActionError::action_failed)?; From a0fe2fd636835b4aa873e32b9e5d1816ff7e394e Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Tue, 19 Dec 2023 11:08:18 +0000 Subject: [PATCH 08/56] Fixes to atomic attach/detach/delete --- nexus/db-queries/src/db/datastore/external_ip.rs | 13 ++++++++----- nexus/src/app/sagas/instance_ip_detach.rs | 9 +++++++-- nexus/src/external_api/http_entrypoints.rs | 4 ++-- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 1bb8981625..a600844fe4 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -447,7 +447,7 @@ impl DataStore { .filter(dsl::time_deleted.is_null()) .filter(dsl::parent_id.is_null()) .set(dsl::time_deleted.eq(now)) - 
.check_if_exists::(authz_fip.id()) + .check_if_exists::(authz_fip.id()) .execute_and_check(&*self.pool_connection_authorized(opctx).await?) .await .map_err(|e| { @@ -496,7 +496,7 @@ impl DataStore { dsl::parent_id.eq(Some(instance_id)), dsl::time_modified.eq(Utc::now()), )) - .check_if_exists::(fip_id) + .check_if_exists::(fip_id) .execute_and_check(&*self.pool_connection_authorized(opctx).await?) .await .map_err(|e| { @@ -510,7 +510,7 @@ impl DataStore { (UpdateStatus::NotUpdatedButExists, Some(_)) => Err(Error::invalid_request( "Floating IP cannot be attached to one instance while still attached to another", )), - (UpdateStatus::Updated, _) => Ok(out.found.into()), + (UpdateStatus::Updated, _) => Ok(out.found.try_into().map_err(|e| Error::internal_error(&format!("{e}")))?), _ => unreachable!(), } } @@ -543,7 +543,7 @@ impl DataStore { dsl::parent_id.eq(Option::::None), dsl::time_modified.eq(Utc::now()), )) - .check_if_exists::(fip_id) + .check_if_exists::(fip_id) .execute_and_check(&*self.pool_connection_authorized(opctx).await?) .await .map_err(|e| { @@ -566,7 +566,10 @@ impl DataStore { "Floating IP is not attached to an instance", )) } - (UpdateStatus::Updated, _) => Ok(out.found.into()), + (UpdateStatus::Updated, _) => Ok(out + .found + .try_into() + .map_err(|e| Error::internal_error(&format!("{e}")))?), _ => unreachable!(), } } diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index ac948ad364..7084a23b1f 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -208,6 +208,11 @@ async fn siid_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { let new_ip = sagactx.lookup::("target_ip")?; let ip_id = new_ip.into(); + // Currently getting an unfortunate error from here since 'detach' + // comes so late. + // Possible soln: use states, capture logic in 'begin_detach/attach' + // and call early? 
+ osagactx .nexus() .instance_delete_dpd_config(&opctx, ¶ms.authz_instance, Some(ip_id)) @@ -359,7 +364,7 @@ async fn siid_detach_ip_undo( ¶ms.serialized_authn, ); - let new_ip_uuid = sagactx.lookup::("target_ip")?.into(); + let new_ip = sagactx.lookup::("target_ip")?; match params.delete_params { // Allocate a new IP address from the target, possibly default, pool @@ -387,7 +392,7 @@ async fn siid_detach_ip_undo( // Set the parent of an existing floating IP to the new instance's ID. params::ExternalIpDelete::Floating { .. } => { let (.., authz_fip) = LookupPath::new(&opctx, &datastore) - .floating_ip_id(new_ip_uuid) + .floating_ip_id(new_ip.into()) .lookup_for(authz::Action::Modify) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index f178e020b5..8c94e1e6f7 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -3760,7 +3760,7 @@ async fn instance_external_ip_attach( &ip_to_detach.into_inner(), ) .await?; - Ok(HttpResponseAccepted(ip.into())) + Ok(HttpResponseAccepted(ip)) }; apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } @@ -3796,7 +3796,7 @@ async fn instance_external_ip_detach( &ip_to_detach.into_inner(), ) .await?; - Ok(HttpResponseAccepted(ip.into())) + Ok(HttpResponseAccepted(ip)) }; apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } From 5bc4789c168640222f1f1ff33c59749598628bfb Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Tue, 19 Dec 2023 12:28:03 +0000 Subject: [PATCH 09/56] Better revert of external IPs on sled-agent err --- illumos-utils/src/opte/illumos.rs | 10 ++ illumos-utils/src/opte/non_illumos.rs | 10 ++ illumos-utils/src/opte/port_manager.rs | 14 +- sled-agent/src/instance.rs | 201 ++++++++++++++++--------- 4 files changed, 156 insertions(+), 79 deletions(-) diff --git a/illumos-utils/src/opte/illumos.rs 
b/illumos-utils/src/opte/illumos.rs index 88e8d343b1..527172b976 100644 --- a/illumos-utils/src/opte/illumos.rs +++ b/illumos-utils/src/opte/illumos.rs @@ -11,6 +11,7 @@ use omicron_common::api::internal::shared::NetworkInterfaceKind; use opte_ioctl::OpteHdl; use slog::info; use slog::Logger; +use std::net::IpAddr; #[derive(thiserror::Error, Debug)] pub enum Error { @@ -46,6 +47,15 @@ pub enum Error { #[error("Tried to release non-existent port ({0}, {1:?})")] ReleaseMissingPort(uuid::Uuid, NetworkInterfaceKind), + + #[error("Tried to update external IPs on non-existent port ({0}, {1:?})")] + ExternalIpUpdateMissingPort(uuid::Uuid, NetworkInterfaceKind), + + #[error("Could not find Primary NIC")] + NoPrimaryNic, + + #[error("Can't attach new ephemeral IP {0}, currently have {1}")] + ImplicitEphemeralIpDetach(IpAddr, IpAddr), } /// Delete all xde devices on the system. diff --git a/illumos-utils/src/opte/non_illumos.rs b/illumos-utils/src/opte/non_illumos.rs index ccd4990d5f..bf61249fb1 100644 --- a/illumos-utils/src/opte/non_illumos.rs +++ b/illumos-utils/src/opte/non_illumos.rs @@ -8,6 +8,7 @@ use slog::Logger; use crate::addrobj::AddrObject; use omicron_common::api::internal::shared::NetworkInterfaceKind; +use std::net::IpAddr; #[derive(thiserror::Error, Debug)] pub enum Error { @@ -16,6 +17,15 @@ pub enum Error { #[error("Tried to release non-existent port ({0}, {1:?})")] ReleaseMissingPort(uuid::Uuid, NetworkInterfaceKind), + + #[error("Tried to update external IPs on non-existent port ({0}, {1:?})")] + ExternalIpUpdateMissingPort(uuid::Uuid, NetworkInterfaceKind), + + #[error("Could not find Primary NIC")] + NoPrimaryNic, + + #[error("Can't attach new ephemeral IP {0}, currently have {1}")] + ImplicitEphemeralIpDetach(IpAddr, IpAddr), } pub fn initialize_xde_driver( diff --git a/illumos-utils/src/opte/port_manager.rs b/illumos-utils/src/opte/port_manager.rs index f2634026a6..ef848dcbf1 100644 --- a/illumos-utils/src/opte/port_manager.rs +++ 
b/illumos-utils/src/opte/port_manager.rs @@ -412,14 +412,14 @@ impl PortManager { ephemeral_ip: Option, floating_ips: &[IpAddr], ) -> Result<(), Error> { - // TODO: new errors let ports = self.inner.ports.lock().unwrap(); - let port = ports - .get(&(nic_id, nic_kind)) - .ok_or_else(|| Error::ReleaseMissingPort(nic_id, nic_kind))?; + let port = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { + Error::ExternalIpUpdateMissingPort(nic_id, nic_kind) + })?; + // TODO: massively cleanup. // Describe the external IP addresses for this port. - macro_rules! ip_cfg { + macro_rules! ext_ip_cfg { ($ip:expr, $log_prefix:literal, $ip_t:path, $cidr_t:path, $ipcfg_e:path, $ipcfg_t:ident, $snat_t:ident) => {{ let snat = match source_nat { @@ -477,7 +477,7 @@ impl PortManager { let mut v6_cfg = None; match port.gateway().ip { IpAddr::V4(_) => { - v4_cfg = Some(ip_cfg!( + v4_cfg = Some(ext_ip_cfg!( ip, "Expected IPv4", IpAddr::V4, @@ -488,7 +488,7 @@ impl PortManager { )) } IpAddr::V6(_) => { - v6_cfg = Some(ip_cfg!( + v6_cfg = Some(ext_ip_cfg!( ip, "Expected IPv6", IpAddr::V6, diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 797b01334c..3bbe0762f8 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -558,6 +558,110 @@ impl InstanceInner { Ok(()) } + + pub async fn add_external_ip( + &mut self, + ip: &InstanceExternalIpBody, + ) -> Result<(), Error> { + // v4 + v6 handling is delegated to `external_ips_ensure`. + // If OPTE is unhappy, we undo at `Instance` level. + + match ip { + // For idempotency of add/delete, we want to return + // success on 'already done'. + InstanceExternalIpBody::Ephemeral(ip) + if Some(ip) == self.ephemeral_ip.as_ref() => + { + return Ok(()); + } + InstanceExternalIpBody::Floating(ip) + if self.floating_ips.contains(ip) => + { + return Ok(()); + } + // New Ephemeral IP while current exists -- error without + // explicit delete. 
+ InstanceExternalIpBody::Ephemeral(ip) + if self.ephemeral_ip.is_some() => + { + return Err(Error::Opte( + illumos_utils::opte::Error::ImplicitEphemeralIpDetach( + *ip, + self.ephemeral_ip.unwrap(), + ), + )); + } + // Not found, proceed with OPTE update. + InstanceExternalIpBody::Ephemeral(ip) => { + self.ephemeral_ip = Some(*ip); + } + InstanceExternalIpBody::Floating(ip) => { + self.floating_ips.push(*ip); + } + } + + let Some(primary_nic) = self.requested_nics.get(0) else { + return Err(Error::Opte(illumos_utils::opte::Error::NoPrimaryNic)); + }; + + self.port_manager.external_ips_ensure( + primary_nic.id, + primary_nic.kind, + Some(self.source_nat), + self.ephemeral_ip, + &self.floating_ips, + )?; + + Ok(()) + } + + pub async fn delete_external_ip( + &mut self, + ip: &InstanceExternalIpBody, + ) -> Result<(), Error> { + // v4 + v6 handling is delegated to `external_ips_ensure`. + // If OPTE is unhappy, we undo at `Instance` level. + + match ip { + // For idempotency of add/delete, we want to return + // success on 'already done'. + // IP Mismatch and 'deleted in past' can't really be + // disambiguated here. + InstanceExternalIpBody::Ephemeral(ip) + if self.ephemeral_ip != Some(*ip) => + { + return Ok(()); + } + InstanceExternalIpBody::Ephemeral(_) => { + self.ephemeral_ip = None; + } + InstanceExternalIpBody::Floating(ip) => { + let floating_index = + self.floating_ips.iter().position(|v| v == ip); + if let Some(pos) = floating_index { + // Swap remove is valid here, OPTE is not sensitive + // to Floating Ip ordering. 
+ self.floating_ips.swap_remove(pos); + } else { + return Ok(()); + } + } + } + + let Some(primary_nic) = self.requested_nics.get(0) else { + return Err(Error::Opte(illumos_utils::opte::Error::NoPrimaryNic)); + }; + + self.port_manager.external_ips_ensure( + primary_nic.id, + primary_nic.kind, + Some(self.source_nat), + self.ephemeral_ip, + &self.floating_ips, + )?; + + Ok(()) + } } /// A reference to a single instance running a running Propolis server. @@ -1101,45 +1205,22 @@ impl Instance { ) -> Result<(), Error> { let mut inner = self.inner.lock().await; - // TODO: not enforcing v4 + v6 very well here. - // TODO: reset state on fail. - // TODO: error handling is garbage - match ip { - InstanceExternalIpBody::Ephemeral(_) - if inner.ephemeral_ip.is_some() => - { - return Err(Error::Timeout( - "Ephemeral IP already attached".into(), - )); - } - InstanceExternalIpBody::Ephemeral(ip) => { - inner.ephemeral_ip = Some(*ip); - } - InstanceExternalIpBody::Floating(ip) - if inner.floating_ips.contains(ip) => - { - return Err(Error::Timeout( - "Floating IP currently attached to self".into(), - )); - } - InstanceExternalIpBody::Floating(ip) => { - inner.floating_ips.push(*ip); + // The internal call can either fail on adding the IP + // to the list, or on the OPTE step. + // Be cautious and reset state if either fails. + // Note we don't need to re-ensure port manager/OPTE state + // since that's the last call we make internally. + let old_eph = inner.ephemeral_ip; + let out = inner.add_external_ip(ip).await; + + if out.is_err() { + inner.ephemeral_ip = old_eph; + if let InstanceExternalIpBody::Floating(ip) = ip { + inner.floating_ips.retain(|v| v != ip); } } - // TODO: actually care about multiple NICs in a sane way. 
- let nic_id = inner.requested_nics[0].id; - let nic_kind = inner.requested_nics[0].kind; - - inner.port_manager.external_ips_ensure( - nic_id, - nic_kind, - Some(inner.source_nat), - inner.ephemeral_ip, - &inner.floating_ips, - )?; - - Ok(()) + out } pub async fn delete_external_ip( @@ -1148,45 +1229,21 @@ impl Instance { ) -> Result<(), Error> { let mut inner = self.inner.lock().await; - // TODO: not enforcing v4 + v6 very well here. - // TODO: error handling is garbage - // TODO: reset state on fail. - match ip { - InstanceExternalIpBody::Ephemeral(ip) - if inner.ephemeral_ip != Some(*ip) => - { - return Err(Error::Timeout( - "Couldn't detach intended Ephemeral IP: mismatch".into(), - )); - } - InstanceExternalIpBody::Ephemeral(_) => { - inner.ephemeral_ip = None; - } - InstanceExternalIpBody::Floating(ip) => { - let floating_index = - inner.floating_ips.iter().position(|v| v == ip); - if let Some(pos) = floating_index { - inner.floating_ips.swap_remove(pos); - } else { - return Err(Error::Timeout( - "Target Floating IP not attached to self".into(), - )); + // Similar logic to `add_external_ip`, except here we + // need to readd the floating IP if it was removed. + // OPTE doesn't care about the order of floating IPs. + let old_eph = inner.ephemeral_ip; + let out = inner.delete_external_ip(ip).await; + + if out.is_err() { + inner.ephemeral_ip = old_eph; + if let InstanceExternalIpBody::Floating(ip) = ip { + if !inner.floating_ips.contains(ip) { + inner.floating_ips.push(*ip); } } } - // TODO: actually care about multiple NICs in a sane way. 
- let nic_id = inner.requested_nics[0].id; - let nic_kind = inner.requested_nics[0].kind; - - inner.port_manager.external_ips_ensure( - nic_id, - nic_kind, - Some(inner.source_nat), - inner.ephemeral_ip, - &inner.floating_ips, - )?; - - Ok(()) + out } } From 7034efdd2110ace3165293fb14b14fd9129db429 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 20 Dec 2023 18:04:20 +0000 Subject: [PATCH 10/56] Add attach state to external IPs. a) needs further testing b) instance stop/delete need to be made state-aware --- nexus/db-model/src/external_ip.rs | 84 +++++- nexus/db-model/src/schema.rs | 3 +- .../src/db/datastore/external_ip.rs | 131 ++++++++- nexus/db-queries/src/db/datastore/mod.rs | 3 + .../db-queries/src/db/queries/external_ip.rs | 13 +- nexus/src/app/sagas/instance_create.rs | 72 +++-- nexus/src/app/sagas/instance_ip_attach.rs | 254 +++++++---------- nexus/src/app/sagas/instance_ip_detach.rs | 262 +++++++----------- schema/crdb/22.0.0/up01.sql | 6 + schema/crdb/22.0.0/up02.sql | 4 + schema/crdb/22.0.0/up03.sql | 7 + schema/crdb/22.0.0/up04.sql | 7 + schema/crdb/22.0.0/up05.sql | 2 + schema/crdb/22.0.0/up06.sql | 4 + schema/crdb/dbinit.sql | 20 +- 15 files changed, 514 insertions(+), 358 deletions(-) create mode 100644 schema/crdb/22.0.0/up01.sql create mode 100644 schema/crdb/22.0.0/up02.sql create mode 100644 schema/crdb/22.0.0/up03.sql create mode 100644 schema/crdb/22.0.0/up04.sql create mode 100644 schema/crdb/22.0.0/up05.sql create mode 100644 schema/crdb/22.0.0/up06.sql diff --git a/nexus/db-model/src/external_ip.rs b/nexus/db-model/src/external_ip.rs index 1a755f0396..8c76911781 100644 --- a/nexus/db-model/src/external_ip.rs +++ b/nexus/db-model/src/external_ip.rs @@ -23,6 +23,7 @@ use omicron_common::api::external::Error; use omicron_common::api::external::IdentityMetadata; use serde::Deserialize; use serde::Serialize; +use sled_agent_client::types::InstanceExternalIpBody; use std::convert::TryFrom; use std::net::IpAddr; use uuid::Uuid; @@ -32,7 
+33,7 @@ impl_enum_type!( #[diesel(postgres_type(name = "ip_kind"))] pub struct IpKindEnum; - #[derive(Clone, Copy, Debug, AsExpression, FromSqlRow, PartialEq)] + #[derive(Clone, Copy, Debug, AsExpression, FromSqlRow, PartialEq, Deserialize, Serialize)] #[diesel(sql_type = IpKindEnum)] pub enum IpKind; @@ -41,6 +42,21 @@ impl_enum_type!( Floating => b"floating" ); +impl_enum_type!( + #[derive(SqlType, Debug, Clone, Copy, QueryId)] + #[diesel(postgres_type(name = "ip_attach_state"))] + pub struct IpAttachStateEnum; + + #[derive(Clone, Copy, Debug, AsExpression, FromSqlRow, PartialEq, Deserialize, Serialize)] + #[diesel(sql_type = IpAttachStateEnum)] + pub enum IpAttachState; + + Detached => b"detached" + Attached => b"attached" + Detaching => b"detaching" + Attaching => b"attaching" +); + /// The main model type for external IP addresses for instances /// and externally-facing services. /// @@ -51,7 +67,9 @@ impl_enum_type!( /// addresses and port ranges, while source NAT IPs are not discoverable in the /// API at all, and only provide outbound connectivity to instances, not /// inbound. -#[derive(Debug, Clone, Selectable, Queryable, Insertable)] +#[derive( + Debug, Clone, Selectable, Queryable, Insertable, Deserialize, Serialize, +)] #[diesel(table_name = external_ip)] pub struct ExternalIp { pub id: Uuid, @@ -76,6 +94,7 @@ pub struct ExternalIp { pub last_port: SqlU16, // Only Some(_) for instance Floating IPs pub project_id: Option, + pub state: IpAttachState, } /// A view type constructed from `ExternalIp` used to represent Floating IP @@ -123,6 +142,7 @@ pub struct IncompleteExternalIp { parent_id: Option, pool_id: Uuid, project_id: Option, + state: IpAttachState, // Optional address requesting that a specific IP address be allocated. explicit_ip: Option, // Optional range when requesting a specific SNAT range be allocated. 
@@ -135,34 +155,38 @@ impl IncompleteExternalIp { instance_id: Uuid, pool_id: Uuid, ) -> Self { + let kind = IpKind::SNat; Self { id, name: None, description: None, time_created: Utc::now(), - kind: IpKind::SNat, + kind, is_service: false, parent_id: Some(instance_id), pool_id, project_id: None, explicit_ip: None, explicit_port_range: None, + state: kind.initial_state(), } } pub fn for_ephemeral(id: Uuid, instance_id: Uuid, pool_id: Uuid) -> Self { + let kind = IpKind::Ephemeral; Self { id, name: None, description: None, time_created: Utc::now(), - kind: IpKind::Ephemeral, + kind, is_service: false, parent_id: Some(instance_id), pool_id, project_id: None, explicit_ip: None, explicit_port_range: None, + state: kind.initial_state(), } } @@ -173,18 +197,20 @@ impl IncompleteExternalIp { project_id: Uuid, pool_id: Uuid, ) -> Self { + let kind = IpKind::Floating; Self { id, name: Some(name.clone()), description: Some(description.to_string()), time_created: Utc::now(), - kind: IpKind::Floating, + kind, is_service: false, parent_id: None, pool_id, project_id: Some(project_id), explicit_ip: None, explicit_port_range: None, + state: kind.initial_state(), } } @@ -196,18 +222,20 @@ impl IncompleteExternalIp { explicit_ip: IpAddr, pool_id: Uuid, ) -> Self { + let kind = IpKind::Floating; Self { id, name: Some(name.clone()), description: Some(description.to_string()), time_created: Utc::now(), - kind: IpKind::Floating, + kind, is_service: false, parent_id: None, pool_id, project_id: Some(project_id), explicit_ip: Some(explicit_ip.into()), explicit_port_range: None, + state: kind.initial_state(), } } @@ -231,6 +259,7 @@ impl IncompleteExternalIp { project_id: None, explicit_ip: Some(IpNetwork::from(address)), explicit_port_range: None, + state: IpAttachState::Attached, } } @@ -248,18 +277,20 @@ impl IncompleteExternalIp { NUM_SOURCE_NAT_PORTS, ); let explicit_port_range = Some((first_port.into(), last_port.into())); + let kind = IpKind::SNat; Self { id, name: None, description: 
None, time_created: Utc::now(), - kind: IpKind::SNat, + kind, is_service: true, parent_id: Some(service_id), pool_id, project_id: None, explicit_ip: Some(IpNetwork::from(address)), explicit_port_range, + state: kind.initial_state(), } } @@ -270,34 +301,38 @@ impl IncompleteExternalIp { service_id: Uuid, pool_id: Uuid, ) -> Self { + let kind = IpKind::Floating; Self { id, name: Some(name.clone()), description: Some(description.to_string()), time_created: Utc::now(), - kind: IpKind::Floating, + kind, is_service: true, parent_id: Some(service_id), pool_id, project_id: None, explicit_ip: None, explicit_port_range: None, + state: IpAttachState::Attached, } } pub fn for_service_snat(id: Uuid, service_id: Uuid, pool_id: Uuid) -> Self { + let kind = IpKind::SNat; Self { id, name: None, description: None, time_created: Utc::now(), - kind: IpKind::SNat, + kind, is_service: true, parent_id: Some(service_id), pool_id, project_id: None, explicit_ip: None, explicit_port_range: None, + state: kind.initial_state(), } } @@ -337,6 +372,10 @@ impl IncompleteExternalIp { &self.project_id } + pub fn state(&self) -> &IpAttachState { + &self.state + } + pub fn explicit_ip(&self) -> &Option { &self.explicit_ip } @@ -346,6 +385,18 @@ impl IncompleteExternalIp { } } +impl IpKind { + /// The initial state which a new non-service IP should + /// be allocated in. 
+ pub fn initial_state(&self) -> IpAttachState { + match &self { + IpKind::SNat => IpAttachState::Attached, + IpKind::Ephemeral => IpAttachState::Attaching, + IpKind::Floating => IpAttachState::Detached, + } + } +} + impl TryFrom for shared::IpKind { type Error = Error; @@ -448,3 +499,18 @@ impl From for views::FloatingIp { } } } + +impl TryFrom for InstanceExternalIpBody { + type Error = Error; + + fn try_from(value: ExternalIp) -> Result { + let ip = value.ip.ip(); + match value.kind { + IpKind::Ephemeral => Ok(InstanceExternalIpBody::Ephemeral(ip)), + IpKind::Floating => Ok(InstanceExternalIpBody::Floating(ip)), + IpKind::SNat => Err(Error::invalid_request( + "cannot dynamically add/remove SNAT allocation", + )), + } + } +} diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 7f4bf51487..7af74036b2 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -13,7 +13,7 @@ use omicron_common::api::external::SemverVersion; /// /// This should be updated whenever the schema is changed. For more details, /// refer to: schema/crdb/README.adoc -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(21, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(22, 0, 0); table! { disk (id) { @@ -567,6 +567,7 @@ table! 
{ last_port -> Int4, project_id -> Nullable, + state -> crate::IpAttachStateEnum, } } diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index a600844fe4..f25d4c44d0 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -27,6 +27,7 @@ use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; +use nexus_db_model::IpAttachState; use nexus_types::external_api::params; use nexus_types::identity::Resource; use omicron_common::api::external::http_pagination::PaginatedBy; @@ -60,6 +61,12 @@ impl DataStore { } /// Create an Ephemeral IP address for an instance. + /// + /// For consistency between instance create and External IP attach/detach + /// operations, this IP will be created in the `Attaching` state to block + /// concurrent access. + /// Callers must call `external_ip_complete_op` on saga completion to move + /// the IP to `Attached`. pub async fn allocate_instance_ephemeral_ip( &self, opctx: &OpContext, @@ -340,6 +347,33 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + pub async fn begin_deallocate_ephemeral_ip( + &self, + opctx: &OpContext, + ip_id: Uuid, + ) -> Result { + use db::schema::external_ip::dsl; + let now = Utc::now(); + let result = diesel::update(dsl::external_ip) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::id.eq(ip_id)) + .filter(dsl::kind.eq(IpKind::Ephemeral)) + .filter(dsl::state.eq(IpAttachState::Attached)) + .set(( + dsl::time_modified.eq(now), + dsl::state.eq(IpAttachState::Detaching), + )) + .check_if_exists::(ip_id) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) 
+ .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + match result.status { + UpdateStatus::NotUpdatedButExists => todo!(), + UpdateStatus::Updated => todo!(), + } + } + /// Delete all external IP addresses associated with the provided instance /// ID. /// @@ -446,6 +480,7 @@ impl DataStore { .filter(dsl::id.eq(authz_fip.id())) .filter(dsl::time_deleted.is_null()) .filter(dsl::parent_id.is_null()) + .filter(dsl::state.eq(IpAttachState::Detached)) .set(dsl::time_deleted.eq(now)) .check_if_exists::(authz_fip.id()) .execute_and_check(&*self.pool_connection_authorized(opctx).await?) @@ -469,12 +504,16 @@ impl DataStore { } /// Attaches a Floating IP address to an instance. - pub async fn floating_ip_attach( + /// + /// This moves a floating IP into the 'attaching' state. Callers are + /// responsible for calling `external_ip_complete_op` to finalise the + /// IP in 'attached' state at saga completion. + pub async fn floating_ip_begin_attach( &self, opctx: &OpContext, authz_fip: &authz::FloatingIp, instance_id: Uuid, - ) -> UpdateResult { + ) -> UpdateResult { use db::schema::external_ip::dsl; let (.., authz_instance, _db_instance) = LookupPath::new(&opctx, self) @@ -492,9 +531,11 @@ impl DataStore { .filter(dsl::kind.eq(IpKind::Floating)) .filter(dsl::time_deleted.is_null()) .filter(dsl::parent_id.is_null()) + .filter(dsl::state.eq(IpAttachState::Detached)) .set(( dsl::parent_id.eq(Some(instance_id)), dsl::time_modified.eq(Utc::now()), + dsl::state.eq(IpAttachState::Attaching), )) .check_if_exists::(fip_id) .execute_and_check(&*self.pool_connection_authorized(opctx).await?) @@ -506,22 +547,27 @@ impl DataStore { ) })?; + // TODO: include state checks. 
match (out.status, out.found.parent_id) { (UpdateStatus::NotUpdatedButExists, Some(_)) => Err(Error::invalid_request( "Floating IP cannot be attached to one instance while still attached to another", )), - (UpdateStatus::Updated, _) => Ok(out.found.try_into().map_err(|e| Error::internal_error(&format!("{e}")))?), + (UpdateStatus::Updated, _) => Ok(out.found), _ => unreachable!(), } } /// Detaches a Floating IP address from an instance. - pub async fn floating_ip_detach( + /// + /// This moves a floating IP into the 'detaching' state. Callers are + /// responsible for calling `external_ip_complete_op` to finalise the + /// IP in 'detached' state at saga completion. + pub async fn floating_ip_begin_detach( &self, opctx: &OpContext, authz_fip: &authz::FloatingIp, instance_id: Uuid, - ) -> UpdateResult { + ) -> UpdateResult { use db::schema::external_ip::dsl; let (.., authz_instance) = LookupPath::new(&opctx, self) @@ -539,9 +585,11 @@ impl DataStore { .filter(dsl::kind.eq(IpKind::Floating)) .filter(dsl::time_deleted.is_null()) .filter(dsl::parent_id.eq(instance_id)) + .filter(dsl::state.eq(IpAttachState::Attached)) .set(( dsl::parent_id.eq(Option::::None), dsl::time_modified.eq(Utc::now()), + dsl::state.eq(IpAttachState::Attaching), )) .check_if_exists::(fip_id) .execute_and_check(&*self.pool_connection_authorized(opctx).await?) @@ -553,6 +601,7 @@ impl DataStore { ) })?; + // TODO: include state checks. match (out.status, out.found.parent_id) { (UpdateStatus::NotUpdatedButExists, Some(id)) if id != instance_id => @@ -573,4 +622,76 @@ impl DataStore { _ => unreachable!(), } } + + /// Move an external IP from a transitional state (attaching, detaching) + /// to its intended end state. + // FIXME: what do do in case of undo? 
+ pub async fn external_ip_complete_op( + &self, + opctx: &OpContext, + ip_id: Uuid, + ip_kind: IpKind, + expected_state: IpAttachState, + target_state: IpAttachState, + ) -> Result { + use db::schema::external_ip::dsl; + + if matches!( + expected_state, + IpAttachState::Attached | IpAttachState::Detached + ) { + return Err(Error::internal_error(&format!( + "{expected_state:?} is not a valid transition state for attach/detach" + ))); + } + + let part_out = diesel::update(dsl::external_ip) + .filter(dsl::id.eq(ip_id)) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::state.eq(expected_state)); + + // This leaves out SNat for now, double check where it fits in with + // instance destroy. + let now = Utc::now(); + let conn = self.pool_connection_authorized(opctx).await?; + match (ip_kind, target_state) { + (IpKind::SNat, _) => { + return Err(Error::internal_error( + "shouldn't need to multistage for SNAT", + )) + } + (IpKind::Ephemeral, IpAttachState::Detached) => { + part_out + .set(( + dsl::parent_id.eq(Option::::None), + dsl::time_modified.eq(now), + dsl::time_deleted.eq(now), + dsl::state.eq(target_state), + )) + .execute_async(&*conn) + .await + } + (IpKind::Floating, IpAttachState::Detached) => { + part_out + .set(( + dsl::parent_id.eq(Option::::None), + dsl::time_modified.eq(now), + dsl::state.eq(target_state), + )) + .execute_async(&*conn) + .await + } + (_, IpAttachState::Attached) => { + part_out + .set(( + dsl::time_modified.eq(Utc::now()), + dsl::state.eq(target_state), + )) + .execute_async(&*conn) + .await + } + _ => return Err(Error::internal_error("unreachable")), + } + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } } diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 93486771b5..b08310c0d7 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -1672,6 +1672,7 @@ mod test { ))), first_port: crate::db::model::SqlU16(0), 
last_port: crate::db::model::SqlU16(10), + state: nexus_db_model::IpAttachState::Attached, }) .collect::>(); diesel::insert_into(dsl::external_ip) @@ -1733,6 +1734,7 @@ mod test { ))), first_port: crate::db::model::SqlU16(0), last_port: crate::db::model::SqlU16(10), + state: nexus_db_model::IpAttachState::Attached, }; diesel::insert_into(dsl::external_ip) .values(ip.clone()) @@ -1803,6 +1805,7 @@ mod test { ip: addresses.next().unwrap().into(), first_port: crate::db::model::SqlU16(0), last_port: crate::db::model::SqlU16(10), + state: nexus_db_model::IpAttachState::Attached, }; // Combinations of NULL and non-NULL for: diff --git a/nexus/db-queries/src/db/queries/external_ip.rs b/nexus/db-queries/src/db/queries/external_ip.rs index 2a76ea7408..d4de9a99f0 100644 --- a/nexus/db-queries/src/db/queries/external_ip.rs +++ b/nexus/db-queries/src/db/queries/external_ip.rs @@ -26,6 +26,8 @@ use diesel::Column; use diesel::Expression; use diesel::QueryResult; use diesel::RunQueryDsl; +use nexus_db_model::IpAttachState; +use nexus_db_model::IpAttachStateEnum; use omicron_common::address::NUM_SOURCE_NAT_PORTS; use omicron_common::api::external; use uuid::Uuid; @@ -99,7 +101,8 @@ const MAX_PORT: u16 = u16::MAX; /// candidate_ip AS ip, /// CAST(candidate_first_port AS INT4) AS first_port, /// CAST(candidate_last_port AS INT4) AS last_port, -/// AS project_id +/// AS project_id, +/// AS state /// FROM /// SELECT * FROM ( /// -- Select all IP addresses by pool and range. @@ -378,6 +381,14 @@ impl NextExternalIp { out.push_bind_param::, Option>(self.ip.project_id())?; out.push_sql(" AS "); out.push_identifier(dsl::project_id::NAME)?; + out.push_sql(", "); + + // Initial state, mainly needed by Ephemeral/Floating IPs. 
+ out.push_bind_param::( + self.ip.state(), + )?; + out.push_sql(" AS "); + out.push_identifier(dsl::state::NAME)?; out.push_sql(" FROM ("); self.push_address_sequence_subquery(out.reborrow())?; diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index c5e7adcadc..1db0b6f45d 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -10,7 +10,7 @@ use crate::app::{ MAX_NICS_PER_INSTANCE, }; use crate::external_api::params; -use nexus_db_model::NetworkInterfaceKind; +use nexus_db_model::{ExternalIp, NetworkInterfaceKind}; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::queries::network_interface::InsertError as InsertNicError; @@ -597,7 +597,7 @@ async fn sic_allocate_instance_snat_ip_undo( /// index `ip_index`, and return its ID if one is created (or None). async fn sic_allocate_instance_external_ip( sagactx: NexusActionContext, -) -> Result<(), ActionError> { +) -> Result, ActionError> { // XXX: may wish to restructure partially: we have at most one ephemeral // and then at most $n$ floating. let osagactx = sagactx.user_data(); @@ -607,7 +607,7 @@ async fn sic_allocate_instance_external_ip( let ip_index = repeat_saga_params.which; let Some(ip_params) = saga_params.create_params.external_ips.get(ip_index) else { - return Ok(()); + return Ok(None); }; let opctx = crate::context::op_context_for_saga_action( &sagactx, @@ -615,7 +615,17 @@ async fn sic_allocate_instance_external_ip( ); let instance_id = repeat_saga_params.instance_id; - match ip_params { + // We need two things here: + // - permanently exfil data + + // We perform the 'complete_op' in this saga stage because our IPs are + // created in the attaching state, and we need to move them to attached. 
+ // We *can* do so because the `creating` state will block the IP attach/detach + // sagas from running, so we can safely undo without worrying they have been + // detached by another API call. + // Runtime state should never be able to make 'complete_op' fallible. + + let ip = match ip_params { // Allocate a new IP address from the target, possibly default, pool params::ExternalIpCreate::Ephemeral { ref pool_name } => { let pool_name = @@ -629,7 +639,7 @@ async fn sic_allocate_instance_external_ip( pool_name, ) .await - .map_err(ActionError::action_failed)?; + .map_err(ActionError::action_failed)? } // Set the parent of an existing floating IP to the new instance's ID. params::ExternalIpCreate::Floating { ref floating_ip_name } => { @@ -642,12 +652,24 @@ async fn sic_allocate_instance_external_ip( .map_err(ActionError::action_failed)?; datastore - .floating_ip_attach(&opctx, &authz_fip, instance_id) + .floating_ip_begin_attach(&opctx, &authz_fip, instance_id) .await - .map_err(ActionError::action_failed)?; + .map_err(ActionError::action_failed)? } - } - Ok(()) + }; + + let n_rows = datastore + .external_ip_complete_op( + &opctx, + ip.id, + ip.kind, + nexus_db_model::IpAttachState::Attaching, + nexus_db_model::IpAttachState::Attached, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(Some(ip)) } async fn sic_allocate_instance_external_ip_undo( @@ -662,6 +684,16 @@ async fn sic_allocate_instance_external_ip_undo( &sagactx, &saga_params.serialized_authn, ); + + // We store and lookup `ExternalIp` so that we can do the detach + // and/or deallocate without double name resolution. + let new_ip = sagactx + .lookup::>(&format!("external_ip{ip_index}"))?; + + let Some(ip) = new_ip else { + return Ok(()); + }; + let Some(ip_params) = saga_params.create_params.external_ips.get(ip_index) else { return Ok(()); @@ -669,24 +701,32 @@ async fn sic_allocate_instance_external_ip_undo( match ip_params { params::ExternalIpCreate::Ephemeral { .. 
} => { - let ip_id = repeat_saga_params.new_id; - datastore.deallocate_external_ip(&opctx, ip_id).await?; + datastore.deallocate_external_ip(&opctx, ip.id).await?; } - params::ExternalIpCreate::Floating { floating_ip_name } => { - let floating_ip_name = db::model::Name(floating_ip_name.clone()); + params::ExternalIpCreate::Floating { .. } => { let (.., authz_fip) = LookupPath::new(&opctx, &datastore) - .project_id(saga_params.project_id) - .floating_ip_name(&floating_ip_name) + .floating_ip_id(ip.id) .lookup_for(authz::Action::Modify) .await?; datastore - .floating_ip_detach( + .floating_ip_begin_detach( &opctx, &authz_fip, repeat_saga_params.instance_id, ) .await?; + + let n_rows = datastore + .external_ip_complete_op( + &opctx, + ip.id, + ip.kind, + nexus_db_model::IpAttachState::Detaching, + nexus_db_model::IpAttachState::Detached, + ) + .await + .map_err(ActionError::action_failed)?; } } Ok(()) diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 5d01a4fc02..c0234bf15d 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -6,64 +6,16 @@ use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; use crate::external_api::params; +use nexus_db_model::ExternalIp; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; - -use omicron_common::api::external::Error; - +use omicron_common::api::external::{Error, InstanceState}; use serde::Deserialize; use serde::Serialize; - -use std::net::IpAddr; - use steno::ActionError; - use uuid::Uuid; -use sled_agent_client::types::InstanceExternalIpBody; - -#[derive(Copy, Clone, Debug, Deserialize, Serialize)] -enum ExternalIp { - Ephemeral(IpAddr, Uuid), - Floating(IpAddr, Uuid), -} - -impl From for views::ExternalIp { - fn from(value: ExternalIp) -> Self { - match value { 
- ExternalIp::Ephemeral(ip, _) => views::ExternalIp { - ip, - kind: nexus_types::external_api::shared::IpKind::Ephemeral, - }, - ExternalIp::Floating(ip, _) => views::ExternalIp { - ip, - kind: nexus_types::external_api::shared::IpKind::Floating, - }, - } - } -} - -impl From for InstanceExternalIpBody { - fn from(value: ExternalIp) -> Self { - match value { - ExternalIp::Ephemeral(ip, _) => { - InstanceExternalIpBody::Ephemeral(ip) - } - ExternalIp::Floating(ip, _) => InstanceExternalIpBody::Floating(ip), - } - } -} - -impl From for Uuid { - fn from(value: ExternalIp) -> Self { - match value { - ExternalIp::Ephemeral(_, id) => id, - ExternalIp::Floating(_, id) => id, - } - } -} - // rough sequence of evts: // - take temp ownership of instance while interacting w/ sled agent // -> mark instance migration id as Some(0) if None @@ -84,13 +36,9 @@ declare_saga_actions! { - siia_migration_lock_undo } - RESOLVE_EXTERNAL_IP -> "new_ip_uuid" { - + siia_resolve_ip - } - ATTACH_EXTERNAL_IP -> "new_ip" { - + siia_attach_ip - - siia_attach_ip_undo + + siia_begin_attach_ip + - siia_begin_attach_ip_undo } REGISTER_NAT -> "no_result0" { @@ -105,7 +53,6 @@ declare_saga_actions! 
{ UNLOCK_MIGRATION -> "output" { + siia_migration_unlock - - siia_migration_unlock_undo } } @@ -119,30 +66,6 @@ pub struct Params { pub serialized_authn: authn::saga::Serialized, } -#[derive(Debug)] -pub struct SagaInstanceIpAttach; -impl NexusSaga for SagaInstanceIpAttach { - const NAME: &'static str = "external-ip-attach"; - type Params = Params; - - fn register_actions(registry: &mut ActionRegistry) { - instance_ip_attach_register_actions(registry); - } - - fn make_saga_dag( - _params: &Self::Params, - mut builder: steno::DagBuilder, - ) -> Result { - builder.append(lock_migration_action()); - builder.append(resolve_external_ip_action()); - builder.append(attach_external_ip_action()); - builder.append(register_nat_action()); - builder.append(ensure_opte_port_action()); - builder.append(unlock_migration_action()); - Ok(builder.build()?) - } -} - async fn siia_migration_lock( sagactx: NexusActionContext, ) -> Result, ActionError> { @@ -168,8 +91,21 @@ async fn siia_migration_lock( })); } + let valid_instance_states = [ + InstanceState::Running, + InstanceState::Stopped, + // InstanceState::Rebooting is safe in principle, but likely + // to trip up when backing out iff. state change. + ]; + + let state = inst_and_vmm.instance().runtime_state.nexus_state.0; + if !valid_instance_states.contains(&state) { + return Err(ActionError::action_failed(Error::ServiceUnavailable { + internal_message: "instance must be 'Running' or 'Stopped'".into(), + })); + } + // TODO: actually lock? - // TODO: fail out in a user-friendly way if migrating? Ok(inst_and_vmm.sled_id()) } @@ -181,41 +117,10 @@ async fn siia_migration_lock_undo( Ok(()) } -// This is split out to prevent double name lookup in event that we -// need to undo `siia_attach_ip`. 
-async fn siia_resolve_ip( - sagactx: NexusActionContext, -) -> Result { - let osagactx = sagactx.user_data(); - let datastore = osagactx.datastore(); - let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( - &sagactx, - ¶ms.serialized_authn, - ); - - match params.create_params { - // Allocate a new IP address from the target, possibly default, pool - params::ExternalIpCreate::Ephemeral { .. } => Ok(Uuid::new_v4()), - // Set the parent of an existing floating IP to the new instance's ID. - params::ExternalIpCreate::Floating { ref floating_ip_name } => { - let floating_ip_name = db::model::Name(floating_ip_name.clone()); - let (.., authz_fip) = LookupPath::new(&opctx, &datastore) - .project_id(params.instance.project_id) - .floating_ip_name(&floating_ip_name) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; - - Ok(authz_fip.id()) - } - } -} - // TODO: factor this out for attach, detach, and instance create // to share an impl. -async fn siia_attach_ip( +async fn siia_begin_attach_ip( sagactx: NexusActionContext, ) -> Result { let osagactx = sagactx.user_data(); @@ -226,44 +131,44 @@ async fn siia_attach_ip( ¶ms.serialized_authn, ); - let new_ip_uuid = sagactx.lookup::("new_ip_uuid")?; - match params.create_params { // Allocate a new IP address from the target, possibly default, pool params::ExternalIpCreate::Ephemeral { ref pool_name } => { let pool_name = pool_name.as_ref().map(|name| db::model::Name(name.clone())); - let eip = datastore + datastore .allocate_instance_ephemeral_ip( &opctx, - new_ip_uuid, + Uuid::new_v4(), params.instance.id(), pool_name, ) .await - .map_err(ActionError::action_failed)?; - - Ok(ExternalIp::Ephemeral(eip.ip.ip(), new_ip_uuid)) + .map_err(ActionError::action_failed) } // Set the parent of an existing floating IP to the new instance's ID. - params::ExternalIpCreate::Floating { .. 
} => { + params::ExternalIpCreate::Floating { ref floating_ip_name } => { + let floating_ip_name = db::model::Name(floating_ip_name.clone()); let (.., authz_fip) = LookupPath::new(&opctx, &datastore) - .floating_ip_id(new_ip_uuid) + .project_id(params.instance.project_id) + .floating_ip_name(&floating_ip_name) .lookup_for(authz::Action::Modify) .await .map_err(ActionError::action_failed)?; - let eip = datastore - .floating_ip_attach(&opctx, &authz_fip, params.instance.id()) + datastore + .floating_ip_begin_attach( + &opctx, + &authz_fip, + params.instance.id(), + ) .await - .map_err(ActionError::action_failed)?; - - Ok(ExternalIp::Floating(eip.ip.ip(), authz_fip.id())) + .map_err(ActionError::action_failed) } } } -async fn siia_attach_ip_undo( +async fn siia_begin_attach_ip_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let osagactx = sagactx.user_data(); @@ -274,23 +179,19 @@ async fn siia_attach_ip_undo( ¶ms.serialized_authn, ); - let new_ip_uuid = sagactx.lookup::("new_ip_uuid")?; + let new_ip = sagactx.lookup::("new_ip")?; - match params.create_params { - params::ExternalIpCreate::Ephemeral { .. } => { - datastore.deallocate_external_ip(&opctx, new_ip_uuid).await?; - } - params::ExternalIpCreate::Floating { .. } => { - let (.., authz_fip) = LookupPath::new(&opctx, &datastore) - .floating_ip_id(new_ip_uuid) - .lookup_for(authz::Action::Modify) - .await?; + let n_rows = datastore + .external_ip_complete_op( + &opctx, + new_ip.id, + new_ip.kind, + nexus_db_model::IpAttachState::Attaching, + nexus_db_model::IpAttachState::Detached, + ) + .await + .map_err(ActionError::action_failed)?; - datastore - .floating_ip_detach(&opctx, &authz_fip, params.instance.id()) - .await?; - } - } Ok(()) } @@ -311,7 +212,6 @@ async fn siia_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { }; let new_ip = sagactx.lookup::("new_ip")?; - let ip_id = new_ip.into(); // Querying sleds requires fleet access; use the instance allocator context // for this. 
@@ -327,7 +227,7 @@ async fn siia_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { &opctx, params.instance.id(), &sled.address(), - Some(ip_id), + Some(new_ip.id), ) .await .map_err(ActionError::action_failed)?; @@ -351,11 +251,14 @@ async fn siia_nat_undo( } let new_ip = sagactx.lookup::("new_ip")?; - let ip_id = new_ip.into(); osagactx .nexus() - .instance_delete_dpd_config(&opctx, ¶ms.authz_instance, Some(ip_id)) + .instance_delete_dpd_config( + &opctx, + ¶ms.authz_instance, + Some(new_ip.id), + ) .await?; Ok(()) @@ -373,6 +276,8 @@ async fn siia_update_opte( }; let new_ip = sagactx.lookup::("new_ip")?; + let sled_agent_body = + new_ip.try_into().map_err(ActionError::action_failed)?; // TODO: disambiguate the various sled agent errors etc. osagactx @@ -380,7 +285,7 @@ async fn siia_update_opte( .sled_client(&sled_uuid) .await .map_err(ActionError::action_failed)? - .instance_put_external_ip(¶ms.instance.id(), &new_ip.into()) + .instance_put_external_ip(¶ms.instance.id(), &sled_agent_body) .await .map_err(|_| { ActionError::action_failed(Error::invalid_request("hmm")) @@ -401,6 +306,7 @@ async fn siia_update_opte_undo( }; let new_ip = sagactx.lookup::("new_ip")?; + let sled_agent_body = new_ip.try_into()?; // TODO: disambiguate the various sled agent errors etc. osagactx @@ -408,7 +314,7 @@ async fn siia_update_opte_undo( .sled_client(&sled_uuid) .await .map_err(ActionError::action_failed)? 
- .instance_delete_external_ip(¶ms.instance.id(), &new_ip.into()) + .instance_delete_external_ip(¶ms.instance.id(), &sled_agent_body) .await .map_err(|_| { ActionError::action_failed(Error::invalid_request("hmm")) @@ -420,23 +326,57 @@ async fn siia_update_opte_undo( async fn siia_migration_unlock( sagactx: NexusActionContext, ) -> Result { + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); // TODO: do this iff. we implement migration lock. // TODO: Backtrack if there's an unexpected change to runstate? let new_ip = sagactx.lookup::("new_ip")?; - Ok(new_ip.into()) -} + let n_rows = datastore + .external_ip_complete_op( + &opctx, + new_ip.id, + new_ip.kind, + nexus_db_model::IpAttachState::Attaching, + nexus_db_model::IpAttachState::Attached, + ) + .await + .map_err(ActionError::action_failed)?; -async fn siia_migration_unlock_undo( - _sagactx: NexusActionContext, -) -> Result<(), anyhow::Error> { - // TODO: do this iff. we implement migration lock. - Ok(()) + new_ip.try_into().map_err(ActionError::action_failed) } // TODO: backout changes if run state changed illegally? +#[derive(Debug)] +pub struct SagaInstanceIpAttach; +impl NexusSaga for SagaInstanceIpAttach { + const NAME: &'static str = "external-ip-attach"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + instance_ip_attach_register_actions(registry); + } + + fn make_saga_dag( + _params: &Self::Params, + mut builder: steno::DagBuilder, + ) -> Result { + builder.append(lock_migration_action()); + builder.append(attach_external_ip_action()); + builder.append(register_nat_action()); + builder.append(ensure_opte_port_action()); + builder.append(unlock_migration_action()); + Ok(builder.build()?) 
+ } +} + #[cfg(test)] pub(crate) mod test { diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 7084a23b1f..16161e660d 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -6,59 +6,17 @@ use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; use crate::external_api::params; -use nexus_db_model::IpKind; +use futures::TryFutureExt; +use nexus_db_model::{ExternalIp, IpKind}; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; -use omicron_common::api::external::Error; +use omicron_common::api::external::{Error, InstanceState}; use serde::Deserialize; use serde::Serialize; -use sled_agent_client::types::InstanceExternalIpBody; -use std::net::IpAddr; use steno::ActionError; use uuid::Uuid; -#[derive(Copy, Clone, Debug, Deserialize, Serialize)] -enum ExternalIp { - Ephemeral(IpAddr, Uuid), - Floating(IpAddr, Uuid), -} - -impl From for views::ExternalIp { - fn from(value: ExternalIp) -> Self { - match value { - ExternalIp::Ephemeral(ip, _) => views::ExternalIp { - ip, - kind: nexus_types::external_api::shared::IpKind::Ephemeral, - }, - ExternalIp::Floating(ip, _) => views::ExternalIp { - ip, - kind: nexus_types::external_api::shared::IpKind::Floating, - }, - } - } -} - -impl From for InstanceExternalIpBody { - fn from(value: ExternalIp) -> Self { - match value { - ExternalIp::Ephemeral(ip, _) => { - InstanceExternalIpBody::Ephemeral(ip) - } - ExternalIp::Floating(ip, _) => InstanceExternalIpBody::Floating(ip), - } - } -} - -impl From for Uuid { - fn from(value: ExternalIp) -> Self { - match value { - ExternalIp::Ephemeral(_, id) => id, - ExternalIp::Floating(_, id) => id, - } - } -} - // rough sequence of evts: // - take temp ownership of instance while interacting w/ sled agent // -> mark instance 
migration id as Some(0) if None @@ -80,8 +38,9 @@ declare_saga_actions! { - siid_migration_lock_undo } - RESOLVE_EXTERNAL_IP -> "target_ip" { - + siid_resolve_ip + DETACH_EXTERNAL_IP -> "target_ip" { + + siid_begin_detach_ip + - siid_begin_detach_ip_undo } REMOVE_NAT -> "no_result0" { @@ -94,14 +53,8 @@ declare_saga_actions! { - siid_update_opte_undo } - DETACH_EXTERNAL_IP -> "no_result2" { - + siid_detach_ip - - siid_detach_ip_undo - } - UNLOCK_MIGRATION -> "output" { + siid_migration_unlock - - siid_migration_unlock_undo } } @@ -138,6 +91,20 @@ async fn siid_migration_lock( })); } + let valid_instance_states = [ + InstanceState::Running, + InstanceState::Stopped, + // InstanceState::Rebooting is safe in principle, but likely + // to trip up when backing out iff. state change. + ]; + + let state = inst_and_vmm.instance().runtime_state.nexus_state.0; + if !valid_instance_states.contains(&state) { + return Err(ActionError::action_failed(Error::ServiceUnavailable { + internal_message: "instance must be 'Running' or 'Stopped'".into(), + })); + } + // TODO: actually lock? // TODO: fail out in a user-friendly way if migrating? @@ -151,9 +118,7 @@ async fn siid_migration_lock_undo( Ok(()) } -// This is split out to prevent double name lookup in event that we -// need to undo `siid_attach_ip`. 
-async fn siid_resolve_ip( +async fn siid_begin_detach_ip( sagactx: NexusActionContext, ) -> Result { let osagactx = sagactx.user_data(); @@ -165,7 +130,6 @@ async fn siid_resolve_ip( ); match params.delete_params { - // Allocate a new IP address from the target, possibly default, pool params::ExternalIpDelete::Ephemeral => { let eips = datastore .instance_lookup_external_ips(&opctx, params.instance.id()) @@ -175,23 +139,59 @@ async fn siid_resolve_ip( let eph_ip = eips.iter().find(|e| e.kind == IpKind::Ephemeral) .ok_or_else(|| ActionError::action_failed(Error::invalid_request("instance does not have an attached ephemeral IP address")))?; - Ok(ExternalIp::Ephemeral(eph_ip.ip.ip(), eph_ip.id)) + datastore + .begin_deallocate_ephemeral_ip(&opctx, eph_ip.id) + .await + .map_err(ActionError::action_failed) } - // Set the parent of an existing floating IP to the new instance's ID. params::ExternalIpDelete::Floating { ref floating_ip_name } => { let floating_ip_name = db::model::Name(floating_ip_name.clone()); - let (.., fip) = LookupPath::new(&opctx, &datastore) + let (.., authz_fip) = LookupPath::new(&opctx, &datastore) .project_id(params.instance.project_id) .floating_ip_name(&floating_ip_name) - .fetch_for(authz::Action::Modify) + .lookup_for(authz::Action::Modify) .await .map_err(ActionError::action_failed)?; - Ok(ExternalIp::Floating(fip.ip.ip(), fip.id())) + datastore + .floating_ip_begin_detach( + &opctx, + &authz_fip, + params.instance.id(), + ) + .await + .map_err(ActionError::action_failed) } } } +async fn siid_begin_detach_ip_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let target_ip = sagactx.lookup::("target_ip")?; + + let n_rows = datastore + .external_ip_complete_op( + &opctx, + target_ip.id, + target_ip.kind, 
+ nexus_db_model::IpAttachState::Detaching, + nexus_db_model::IpAttachState::Attached, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + async fn siid_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; @@ -205,8 +205,7 @@ async fn siid_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { return Ok(()); } - let new_ip = sagactx.lookup::("target_ip")?; - let ip_id = new_ip.into(); + let target_ip = sagactx.lookup::("target_ip")?; // Currently getting an unfortunate error from here since 'detach' // comes so late. @@ -215,7 +214,11 @@ async fn siid_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { osagactx .nexus() - .instance_delete_dpd_config(&opctx, ¶ms.authz_instance, Some(ip_id)) + .instance_delete_dpd_config( + &opctx, + ¶ms.authz_instance, + Some(target_ip.id), + ) .await .map_err(ActionError::action_failed)?; @@ -240,8 +243,7 @@ async fn siid_nat_undo( return Ok(()); }; - let new_ip = sagactx.lookup::("target_ip")?; - let ip_id = new_ip.into(); + let target_ip = sagactx.lookup::("target_ip")?; // Querying sleds requires fleet access; use the instance allocator context // for this. @@ -256,7 +258,7 @@ async fn siid_nat_undo( &opctx, params.instance.id(), &sled.address(), - Some(ip_id), + Some(target_ip.id), ) .await?; @@ -274,7 +276,9 @@ async fn siid_update_opte( return Ok(()); }; - let new_ip = sagactx.lookup::("target_ip")?; + let target_ip = sagactx.lookup::("target_ip")?; + let sled_agent_body = + target_ip.try_into().map_err(ActionError::action_failed)?; // TODO: disambiguate the various sled agent errors etc. osagactx @@ -282,7 +286,7 @@ async fn siid_update_opte( .sled_client(&sled_uuid) .await .map_err(ActionError::action_failed)? 
- .instance_delete_external_ip(¶ms.instance.id(), &new_ip.into()) + .instance_delete_external_ip(¶ms.instance.id(), &sled_agent_body) .await .map_err(|_| { ActionError::action_failed(Error::invalid_request("hmm")) @@ -302,7 +306,9 @@ async fn siid_update_opte_undo( return Ok(()); }; - let new_ip = sagactx.lookup::("target_ip")?; + let target_ip = sagactx.lookup::("target_ip")?; + let sled_agent_body = + target_ip.try_into().map_err(ActionError::action_failed)?; // TODO: disambiguate the various sled agent errors etc. osagactx @@ -310,52 +316,15 @@ async fn siid_update_opte_undo( .sled_client(&sled_uuid) .await .map_err(ActionError::action_failed)? - .instance_put_external_ip(¶ms.instance.id(), &new_ip.into()) + .instance_put_external_ip(¶ms.instance.id(), &sled_agent_body) .await?; Ok(()) } -async fn siid_detach_ip( - sagactx: NexusActionContext, -) -> Result<(), ActionError> { - let osagactx = sagactx.user_data(); - let datastore = osagactx.datastore(); - let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( - &sagactx, - ¶ms.serialized_authn, - ); - - let new_ip_uuid = sagactx.lookup::("target_ip")?.into(); - - match params.delete_params { - params::ExternalIpDelete::Ephemeral => { - datastore - .deallocate_external_ip(&opctx, new_ip_uuid) - .await - .map_err(ActionError::action_failed)?; - } - params::ExternalIpDelete::Floating { .. 
} => { - let (.., authz_fip) = LookupPath::new(&opctx, &datastore) - .floating_ip_id(new_ip_uuid) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; - - datastore - .floating_ip_detach(&opctx, &authz_fip, params.instance.id()) - .await - .map_err(ActionError::action_failed)?; - } - } - - Ok(()) -} - -async fn siid_detach_ip_undo( +async fn siid_migration_unlock( sagactx: NexusActionContext, -) -> Result<(), anyhow::Error> { +) -> Result { let osagactx = sagactx.user_data(); let datastore = osagactx.datastore(); let params = sagactx.saga_params::()?; @@ -363,65 +332,23 @@ async fn siid_detach_ip_undo( &sagactx, ¶ms.serialized_authn, ); - - let new_ip = sagactx.lookup::("target_ip")?; - - match params.delete_params { - // Allocate a new IP address from the target, possibly default, pool - params::ExternalIpDelete::Ephemeral => { - // let pool_name = - // pool_name.as_ref().map(|name| db::model::Name(name.clone())); - // let eip = datastore - // .allocate_instance_ephemeral_ip( - // &opctx, - // new_ip_uuid, - // params.instance.id(), - // pool_name, - // ) - // .await - // .map_err(ActionError::action_failed)?; - - // Ok(ExternalIp::Ephemeral(eip.ip.ip(), new_ip_uuid)) - - // TODO::: - // need to think over... can we even reallocate the same IP? - // We can try, and fail, and then completely unwind if so. - // Can we even fail at this point? - Ok(()) - } - // Set the parent of an existing floating IP to the new instance's ID. - params::ExternalIpDelete::Floating { .. } => { - let (.., authz_fip) = LookupPath::new(&opctx, &datastore) - .floating_ip_id(new_ip.into()) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; - - let _eip = datastore - .floating_ip_attach(&opctx, &authz_fip, params.instance.id()) - .await - .map_err(ActionError::action_failed)?; - - Ok(()) - } - } -} - -async fn siid_migration_unlock( - sagactx: NexusActionContext, -) -> Result { // TODO: do this iff. 
we implement migration lock. // TODO: Backtrack if there's an unexpected change to runstate? - let new_ip = sagactx.lookup::("target_ip")?; - Ok(new_ip.into()) -} + let target_ip = sagactx.lookup::("target_ip")?; -async fn siid_migration_unlock_undo( - _sagactx: NexusActionContext, -) -> Result<(), anyhow::Error> { - // TODO: do this iff. we implement migration lock. - Ok(()) + let n_rows = datastore + .external_ip_complete_op( + &opctx, + target_ip.id, + target_ip.kind, + nexus_db_model::IpAttachState::Detaching, + nexus_db_model::IpAttachState::Detached, + ) + .await + .map_err(ActionError::action_failed)?; + + target_ip.try_into().map_err(ActionError::action_failed) } #[derive(Debug)] @@ -439,10 +366,9 @@ impl NexusSaga for SagaInstanceIpDetach { mut builder: steno::DagBuilder, ) -> Result { builder.append(lock_migration_action()); - builder.append(resolve_external_ip_action()); + builder.append(detach_external_ip_action()); builder.append(remove_nat_action()); builder.append(remove_opte_port_action()); - builder.append(detach_external_ip_action()); builder.append(unlock_migration_action()); Ok(builder.build()?) } diff --git a/schema/crdb/22.0.0/up01.sql b/schema/crdb/22.0.0/up01.sql new file mode 100644 index 0000000000..0cb511fb91 --- /dev/null +++ b/schema/crdb/22.0.0/up01.sql @@ -0,0 +1,6 @@ +CREATE TYPE IF NOT EXISTS omicron.public.ip_attach_state AS ENUM ( + 'detached', + 'attached', + 'detaching', + 'attaching' +); diff --git a/schema/crdb/22.0.0/up02.sql b/schema/crdb/22.0.0/up02.sql new file mode 100644 index 0000000000..324a907dd4 --- /dev/null +++ b/schema/crdb/22.0.0/up02.sql @@ -0,0 +1,4 @@ +-- Intentionally nullable for now as we need to backfill using the current +-- value of parent_id. 
+ALTER TABLE omicron.public.external_ip +ADD COLUMN IF NOT EXISTS state omicron.public.ip_attach_state; diff --git a/schema/crdb/22.0.0/up03.sql b/schema/crdb/22.0.0/up03.sql new file mode 100644 index 0000000000..7d6a62e4c0 --- /dev/null +++ b/schema/crdb/22.0.0/up03.sql @@ -0,0 +1,7 @@ +-- initialise external ip state for detached IPs. +set + local disallow_full_table_scans = off; + +UPDATE omicron.public.external_ip +SET state = 'detached' +WHERE parent_id IS NULL; \ No newline at end of file diff --git a/schema/crdb/22.0.0/up04.sql b/schema/crdb/22.0.0/up04.sql new file mode 100644 index 0000000000..e93e151202 --- /dev/null +++ b/schema/crdb/22.0.0/up04.sql @@ -0,0 +1,7 @@ +-- initialise external ip state for attached IPs. +set + local disallow_full_table_scans = off; + +UPDATE omicron.public.external_ip +SET state = 'attached' +WHERE parent_id IS NOT NULL; \ No newline at end of file diff --git a/schema/crdb/22.0.0/up05.sql b/schema/crdb/22.0.0/up05.sql new file mode 100644 index 0000000000..894806a3dc --- /dev/null +++ b/schema/crdb/22.0.0/up05.sql @@ -0,0 +1,2 @@ +-- Now move the new column to its intended state of non-nullable. 
+ALTER TABLE omicron.public.external_ip ALTER COLUMN state SET NOT NULL; diff --git a/schema/crdb/22.0.0/up06.sql b/schema/crdb/22.0.0/up06.sql new file mode 100644 index 0000000000..48fd3402ce --- /dev/null +++ b/schema/crdb/22.0.0/up06.sql @@ -0,0 +1,4 @@ +ALTER TABLE omicron.public.external_ip +ADD CONSTRAINT detached_null_parent_id CHECK ( + (state = 'detached') != (parent_id IS NOT NULL) +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index cc61148048..b3d96a1595 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1654,6 +1654,13 @@ CREATE TYPE IF NOT EXISTS omicron.public.ip_kind AS ENUM ( 'floating' ); +CREATE TYPE IF NOT EXISTS omicron.public.ip_attach_state AS ENUM ( + 'detached', + 'attached', + 'detaching', + 'attaching' +); + /* * External IP addresses used for guest instances and externally-facing * services. @@ -1699,6 +1706,12 @@ CREATE TABLE IF NOT EXISTS omicron.public.external_ip ( /* FK to the `project` table. */ project_id UUID, + /* State of this IP with regard to instance attach/detach + * operations. This is mainly used to prevent concurrent use + * across sagas and allow rollback to correct state. + */ + state omicron.public.ip_attach_state NOT NULL, + /* The name must be non-NULL iff this is a floating IP. */ CONSTRAINT null_fip_name CHECK ( (kind != 'floating' AND name IS NULL) OR @@ -1730,6 +1743,11 @@ CREATE TABLE IF NOT EXISTS omicron.public.external_ip ( /* Ephemeral IPs are not supported for services. 
*/ CONSTRAINT ephemeral_kind_service CHECK ( (kind = 'ephemeral' AND is_service = FALSE) OR (kind != 'ephemeral') + ), + + /* parent_id must be null if detached, non-null if not detached */ + CONSTRAINT detached_null_parent_id CHECK ( + (state = 'detached') != (parent_id IS NOT NULL) ) ); @@ -3096,7 +3114,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '21.0.0', NULL) + ( TRUE, NOW(), NOW(), '22.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From ce1c92d6c092da3cad0c515ef3eb0816d78d81bd Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 20 Dec 2023 19:45:52 +0000 Subject: [PATCH 11/56] Bad state in detach --- nexus/db-queries/src/db/datastore/external_ip.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index f25d4c44d0..8137028f16 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -589,7 +589,7 @@ impl DataStore { .set(( dsl::parent_id.eq(Option::::None), dsl::time_modified.eq(Utc::now()), - dsl::state.eq(IpAttachState::Attaching), + dsl::state.eq(IpAttachState::Detaching), )) .check_if_exists::(fip_id) .execute_and_check(&*self.pool_connection_authorized(opctx).await?) From cc130c3f22b971eb765b43ab676c620c5ebcee45 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 21 Dec 2023 19:39:05 +0000 Subject: [PATCH 12/56] Make use of `Instance::attach_resource` and friends Still need to fixup start/stop/delete to block on ip-progress external IPs, but then we'll be sound! 
--- nexus/db-model/src/instance.rs | 17 +- nexus/db-queries/src/db/datastore/disk.rs | 4 +- .../src/db/datastore/external_ip.rs | 330 +++++++++++++----- nexus/db-queries/src/db/datastore/instance.rs | 102 +++--- nexus/src/app/sagas/instance_create.rs | 10 +- nexus/src/app/sagas/instance_ip_attach.rs | 6 + nexus/src/app/sagas/instance_ip_detach.rs | 7 +- schema/crdb/22.0.0/up06.sql | 2 +- schema/crdb/dbinit.sql | 8 +- 9 files changed, 343 insertions(+), 143 deletions(-) diff --git a/nexus/db-model/src/instance.rs b/nexus/db-model/src/instance.rs index 9252926547..e10f8c2603 100644 --- a/nexus/db-model/src/instance.rs +++ b/nexus/db-model/src/instance.rs @@ -2,9 +2,11 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use super::{ByteCount, Disk, Generation, InstanceCpuCount, InstanceState}; +use super::{ + ByteCount, Disk, ExternalIp, Generation, InstanceCpuCount, InstanceState, +}; use crate::collection::DatastoreAttachTargetConfig; -use crate::schema::{disk, instance}; +use crate::schema::{disk, external_ip, instance}; use chrono::{DateTime, Utc}; use db_macros::Resource; use nexus_types::external_api::params; @@ -101,6 +103,17 @@ impl DatastoreAttachTargetConfig for Instance { type ResourceTimeDeletedColumn = disk::dsl::time_deleted; } +impl DatastoreAttachTargetConfig for Instance { + type Id = Uuid; + + type CollectionIdColumn = instance::dsl::id; + type CollectionTimeDeletedColumn = instance::dsl::time_deleted; + + type ResourceIdColumn = external_ip::dsl::id; + type ResourceCollectionIdColumn = external_ip::dsl::parent_id; + type ResourceTimeDeletedColumn = external_ip::dsl::time_deleted; +} + /// Runtime state of the Instance, including the actual running state and minimal /// metadata /// diff --git a/nexus/db-queries/src/db/datastore/disk.rs b/nexus/db-queries/src/db/datastore/disk.rs index 2055287e62..390376e627 100644 --- a/nexus/db-queries/src/db/datastore/disk.rs +++ 
b/nexus/db-queries/src/db/datastore/disk.rs @@ -206,7 +206,7 @@ impl DataStore { let (instance, disk) = query.attach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) .await - .or_else(|e| { + .or_else(|e: AttachError| { match e { AttachError::CollectionNotFound => { Err(Error::not_found_by_id( @@ -348,7 +348,7 @@ impl DataStore { ) .detach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) .await - .or_else(|e| { + .or_else(|e: DetachError| { match e { DetachError::CollectionNotFound => { Err(Error::not_found_by_id( diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 8137028f16..905d95aee4 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -9,6 +9,10 @@ use crate::authz; use crate::authz::ApiResource; use crate::context::OpContext; use crate::db; +use crate::db::collection_attach::AttachError; +use crate::db::collection_attach::DatastoreAttachTarget; +use crate::db::collection_detach::DatastoreDetachTarget; +use crate::db::collection_detach::DetachError; use crate::db::error::public_error_from_diesel; use crate::db::error::retryable; use crate::db::error::ErrorHandler; @@ -26,14 +30,18 @@ use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; +use db::model::InstanceState as DbInstanceState; use diesel::prelude::*; +use nexus_db_model::Instance; use nexus_db_model::IpAttachState; use nexus_types::external_api::params; use nexus_types::identity::Resource; +use omicron_common::api; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; +use omicron_common::api::external::InstanceState as ApiInstanceState; use 
omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::NameOrId; @@ -43,6 +51,29 @@ use ref_cast::RefCast; use std::net::IpAddr; use uuid::Uuid; +// Broadly, we want users to be able to attach/detach at will +// once an instance is created and functional. +// If we're in a state which will naturally resolve to either +// stopped/running, we want users to know that the request can be +// retried safely. +const SAFE_TO_ATTACH_INSTANCE_STATES_CREATING: [DbInstanceState; 3] = [ + DbInstanceState(ApiInstanceState::Stopped), + DbInstanceState(ApiInstanceState::Running), + DbInstanceState(ApiInstanceState::Creating), +]; +const SAFE_TO_ATTACH_INSTANCE_STATES: [DbInstanceState; 2] = [ + DbInstanceState(ApiInstanceState::Stopped), + DbInstanceState(ApiInstanceState::Running), +]; +const SAFE_TRANSITORY_INSTANCE_STATES: [DbInstanceState; 3] = [ + DbInstanceState(ApiInstanceState::Starting), + DbInstanceState(ApiInstanceState::Stopping), + DbInstanceState(ApiInstanceState::Creating), +]; +// FIXME: should be exported from a shared location, original lives in +// nexus app. +const MAX_EXTERNAL_IPS_PER_INSTANCE: u32 = 32; + impl DataStore { /// Create an external IP address for source NAT for an instance. 
pub async fn allocate_instance_snat_ip( @@ -351,27 +382,78 @@ impl DataStore { &self, opctx: &OpContext, ip_id: Uuid, + instance_id: Uuid, ) -> Result { use db::schema::external_ip::dsl; - let now = Utc::now(); - let result = diesel::update(dsl::external_ip) - .filter(dsl::time_deleted.is_null()) - .filter(dsl::id.eq(ip_id)) - .filter(dsl::kind.eq(IpKind::Ephemeral)) - .filter(dsl::state.eq(IpAttachState::Attached)) - .set(( - dsl::time_modified.eq(now), + use db::schema::external_ip::table; + use db::schema::instance::dsl as inst_dsl; + use db::schema::instance::table as inst_table; + + let _ = LookupPath::new(&opctx, self) + .instance_id(instance_id) + .lookup_for(authz::Action::Modify) + .await?; + + let query = Instance::detach_resource( + instance_id, + ip_id, + inst_table + .into_boxed() + .filter(inst_dsl::state.eq_any(SAFE_TO_ATTACH_INSTANCE_STATES)), + table + .into_boxed() + .filter(dsl::state.eq(IpAttachState::Attached)) + .filter(dsl::kind.eq(IpKind::Ephemeral)), + diesel::update(dsl::external_ip).set(( + dsl::time_modified.eq(Utc::now()), dsl::state.eq(IpAttachState::Detaching), - )) - .check_if_exists::(ip_id) - .execute_and_check(&*self.pool_connection_authorized(opctx).await?) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + )), + ); - match result.status { - UpdateStatus::NotUpdatedButExists => todo!(), - UpdateStatus::Updated => todo!(), - } + let eip = query.detach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) 
+ .await + .or_else(|e: DetachError| match e { + DetachError::CollectionNotFound => { + Err(Error::not_found_by_id( + ResourceType::Instance, + &instance_id, + )) + }, + DetachError::ResourceNotFound => { + Err(Error::invalid_request("instance has no ephemeral IP to detach")) + }, + DetachError::NoUpdate { resource, collection } => { + match resource.state { + IpAttachState::Attached if resource.parent_id != Some(instance_id) => return Err(Error::internal_error( + "Ephemeral IP is not attached to the target instance", + )), + // User can reattempt depending on how the current saga unfolds. + IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::ServiceUnavailable { + internal_message: "tried to detach ephemeral IP mid-attach/detach".into() + }), + IpAttachState::Attached => {}, + IpAttachState::Detached => return Err(Error::internal_error( + "Ephemeral IP cannot exist in 'detached' state", + )), + } + + Err(match collection.runtime_state.nexus_state { + state if SAFE_TRANSITORY_INSTANCE_STATES.contains(&state) => Error::ServiceUnavailable { + internal_message: "tried to detach ephemeral IP while instance was changing state".into() + }, + state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { + Error::internal_error("failed to detach ephemeral IP") + }, + state => Error::invalid_request(&format!("cannot attach ephemeral IP to instance in {state} state")), + }) + }, + DetachError::DatabaseError(e) => { + Err(public_error_from_diesel(e, ErrorHandler::Server)) + }, + + })?; + + Ok(eip) } /// Delete all external IP addresses associated with the provided instance @@ -513,8 +595,12 @@ impl DataStore { opctx: &OpContext, authz_fip: &authz::FloatingIp, instance_id: Uuid, + creating_instance: bool, ) -> UpdateResult { use db::schema::external_ip::dsl; + use db::schema::external_ip::table; + use db::schema::instance::dsl as inst_dsl; + use db::schema::instance::table as inst_table; let (.., authz_instance, _db_instance) = LookupPath::new(&opctx, 
self) .instance_id(instance_id) @@ -526,35 +612,86 @@ impl DataStore { let fip_id = authz_fip.id(); - let out = diesel::update(dsl::external_ip) - .filter(dsl::id.eq(fip_id)) - .filter(dsl::kind.eq(IpKind::Floating)) - .filter(dsl::time_deleted.is_null()) - .filter(dsl::parent_id.is_null()) - .filter(dsl::state.eq(IpAttachState::Detached)) - .set(( + let safe_states = if creating_instance { + &SAFE_TO_ATTACH_INSTANCE_STATES_CREATING[..] + } else { + &SAFE_TO_ATTACH_INSTANCE_STATES[..] + }; + + let query = Instance::attach_resource( + instance_id, + fip_id, + inst_table.into_boxed().filter(inst_dsl::state.eq_any(safe_states)), + table + .into_boxed() + .filter(dsl::state.eq(IpAttachState::Detached)) + .filter(dsl::kind.eq(IpKind::Floating)) + .filter(dsl::parent_id.is_null()), + // +1 to account for SNat + MAX_EXTERNAL_IPS_PER_INSTANCE + 1, + diesel::update(dsl::external_ip).set(( dsl::parent_id.eq(Some(instance_id)), dsl::time_modified.eq(Utc::now()), dsl::state.eq(IpAttachState::Attaching), - )) - .check_if_exists::(fip_id) - .execute_and_check(&*self.pool_connection_authorized(opctx).await?) - .await - .map_err(|e| { - public_error_from_diesel( - e, - ErrorHandler::NotFoundByResource(authz_fip), - ) - })?; - - // TODO: include state checks. - match (out.status, out.found.parent_id) { - (UpdateStatus::NotUpdatedButExists, Some(_)) => Err(Error::invalid_request( - "Floating IP cannot be attached to one instance while still attached to another", )), - (UpdateStatus::Updated, _) => Ok(out.found), - _ => unreachable!(), - } + ); + + let (_, eip) = query.attach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) 
+ .await + .or_else(|e: AttachError| match e { + AttachError::CollectionNotFound => { + Err(Error::not_found_by_id( + ResourceType::Instance, + &instance_id, + )) + }, + AttachError::ResourceNotFound => { + Err(Error::not_found_by_id( + ResourceType::FloatingIp, + &fip_id, + )) + }, + AttachError::NoUpdate { attached_count, resource, collection } => { + match resource.state { + // Idempotent errors: attach succeeded or is in progress for + // same resource pair -- this is fine. + IpAttachState::Attached | IpAttachState::Attaching if resource.parent_id == Some(instance_id) => return Ok((collection, resource)), + IpAttachState::Attached => return Err(Error::invalid_request( + "floating IP cannot be attached to one \ + instance while still attached to another" + )), + // User can reattempt depending on how the current saga unfolds. + IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::ServiceUnavailable { + internal_message: "tried to attach floating IP mid-attach/detach".into() + }), + + IpAttachState::Detached => {}, + } + + Err(match collection.runtime_state.nexus_state { + state if SAFE_TRANSITORY_INSTANCE_STATES.contains(&state) => Error::ServiceUnavailable { + internal_message: "tried to attach floating IP while instance was changing state".into() + }, + state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { + if attached_count >= MAX_EXTERNAL_IPS_PER_INSTANCE as i64 + 1 { + Error::invalid_request(&format!( + "an instance may not have more than {} external IP addresses", + MAX_EXTERNAL_IPS_PER_INSTANCE, + )) + } else { + Error::internal_error("failed to attach floating IP") + } + }, + state => Error::invalid_request(&format!("cannot attach floating IP to instance in {state} state")), + }) + }, + AttachError::DatabaseError(e) => { + Err(public_error_from_diesel(e, ErrorHandler::Server)) + }, + + })?; + + Ok(eip) } /// Detaches a Floating IP address from an instance. 
@@ -567,12 +704,16 @@ impl DataStore { opctx: &OpContext, authz_fip: &authz::FloatingIp, instance_id: Uuid, + creating_instance: bool, ) -> UpdateResult { use db::schema::external_ip::dsl; + use db::schema::external_ip::table; + use db::schema::instance::dsl as inst_dsl; + use db::schema::instance::table as inst_table; - let (.., authz_instance) = LookupPath::new(&opctx, self) + let (.., authz_instance, _db_instance) = LookupPath::new(&opctx, self) .instance_id(instance_id) - .lookup_for(authz::Action::Modify) + .fetch_for(authz::Action::Modify) .await?; opctx.authorize(authz::Action::Modify, authz_fip).await?; @@ -580,47 +721,74 @@ impl DataStore { let fip_id = authz_fip.id(); - let out = diesel::update(dsl::external_ip) - .filter(dsl::id.eq(fip_id)) - .filter(dsl::kind.eq(IpKind::Floating)) - .filter(dsl::time_deleted.is_null()) - .filter(dsl::parent_id.eq(instance_id)) - .filter(dsl::state.eq(IpAttachState::Attached)) - .set(( - dsl::parent_id.eq(Option::::None), + let safe_states = if creating_instance { + &SAFE_TO_ATTACH_INSTANCE_STATES_CREATING[..] + } else { + &SAFE_TO_ATTACH_INSTANCE_STATES[..] + }; + + let query = Instance::detach_resource( + instance_id, + fip_id, + inst_table.into_boxed().filter(inst_dsl::state.eq_any(safe_states)), + table + .into_boxed() + .filter(dsl::state.eq(IpAttachState::Attached)) + .filter(dsl::kind.eq(IpKind::Floating)), + diesel::update(dsl::external_ip).set(( dsl::time_modified.eq(Utc::now()), dsl::state.eq(IpAttachState::Detaching), - )) - .check_if_exists::(fip_id) - .execute_and_check(&*self.pool_connection_authorized(opctx).await?) - .await - .map_err(|e| { - public_error_from_diesel( - e, - ErrorHandler::NotFoundByResource(authz_fip), - ) - })?; + )), + ); - // TODO: include state checks. 
- match (out.status, out.found.parent_id) { - (UpdateStatus::NotUpdatedButExists, Some(id)) - if id != instance_id => - { - Err(Error::invalid_request( - "Floating IP is not attached to the target instance", + let eip = query.detach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) + .await + .or_else(|e: DetachError| match e { + DetachError::CollectionNotFound => { + Err(Error::not_found_by_id( + ResourceType::Instance, + &instance_id, )) - } - (UpdateStatus::NotUpdatedButExists, None) => { - Err(Error::invalid_request( - "Floating IP is not attached to an instance", + }, + DetachError::ResourceNotFound => { + Err(Error::not_found_by_id( + ResourceType::FloatingIp, + &fip_id, )) - } - (UpdateStatus::Updated, _) => Ok(out - .found - .try_into() - .map_err(|e| Error::internal_error(&format!("{e}")))?), - _ => unreachable!(), - } + }, + DetachError::NoUpdate { resource, collection } => { + match resource.state { + IpAttachState::Attached if resource.parent_id != Some(instance_id) => return Err(Error::invalid_request( + "Floating IP is not attached to the target instance", + )), + // TODO: should we just... let this one through? + IpAttachState::Detached => return Err(Error::invalid_request( + "Floating IP is not attached to an instance", + )), + // User can reattempt depending on how the current saga unfolds. 
+ IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::ServiceUnavailable { + internal_message: "tried to detach floating IP mid-attach/detach".into() + }), + IpAttachState::Attached => {}, + } + + Err(match collection.runtime_state.nexus_state { + state if SAFE_TRANSITORY_INSTANCE_STATES.contains(&state) => Error::ServiceUnavailable { + internal_message: "tried to detach floating IP while instance was changing state".into() + }, + state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { + Error::internal_error("failed to detach floating IP") + }, + state => Error::invalid_request(&format!("cannot detach floating IP to instance in {state} state")), + }) + }, + DetachError::DatabaseError(e) => { + Err(public_error_from_diesel(e, ErrorHandler::Server)) + }, + + })?; + + Ok(eip) } /// Move an external IP from a transitional state (attaching, detaching) @@ -663,7 +831,7 @@ impl DataStore { (IpKind::Ephemeral, IpAttachState::Detached) => { part_out .set(( - dsl::parent_id.eq(Option::::None), + // dsl::parent_id.eq(Option::::None), dsl::time_modified.eq(now), dsl::time_deleted.eq(now), dsl::state.eq(target_state), diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 188f5c30c9..c01f40e791 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -11,6 +11,7 @@ use crate::context::OpContext; use crate::db; use crate::db::collection_detach_many::DatastoreDetachManyTarget; use crate::db::collection_detach_many::DetachManyError; +use crate::db::collection_detach_many::DetachManyFromCollectionStatement; use crate::db::collection_insert::AsyncInsertError; use crate::db::collection_insert::DatastoreCollection; use crate::db::error::public_error_from_diesel; @@ -28,6 +29,7 @@ use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; +use nexus_db_model::Disk; use 
nexus_db_model::VmmRuntimeState; use omicron_common::api; use omicron_common::api::external::http_pagination::PaginatedBy; @@ -405,59 +407,63 @@ impl DataStore { let ok_to_detach_disk_state_labels: Vec<_> = ok_to_detach_disk_states.iter().map(|s| s.label()).collect(); - let _instance = Instance::detach_resources( - authz_instance.id(), - instance::table.into_boxed().filter( - instance::dsl::state - .eq_any(ok_to_delete_instance_states) - .and(instance::dsl::active_propolis_id.is_null()), - ), - disk::table.into_boxed().filter( - disk::dsl::disk_state.eq_any(ok_to_detach_disk_state_labels), - ), - diesel::update(instance::dsl::instance).set(( - instance::dsl::state.eq(destroyed), - instance::dsl::time_deleted.eq(Utc::now()), - )), - diesel::update(disk::dsl::disk).set(( - disk::dsl::disk_state.eq(detached_label), - disk::dsl::attach_instance_id.eq(Option::::None), - disk::dsl::slot.eq(Option::::None), - )), - ) - .detach_and_get_result_async( - &*self.pool_connection_authorized(opctx).await?, - ) - .await - .map_err(|e| match e { - DetachManyError::CollectionNotFound => Error::not_found_by_id( - ResourceType::Instance, - &authz_instance.id(), - ), - DetachManyError::NoUpdate { collection } => { - if collection.runtime_state.propolis_id.is_some() { - return Error::invalid_request( + let stmt: DetachManyFromCollectionStatement = + Instance::detach_resources( + authz_instance.id(), + instance::table.into_boxed().filter( + instance::dsl::state + .eq_any(ok_to_delete_instance_states) + .and(instance::dsl::active_propolis_id.is_null()), + ), + disk::table.into_boxed().filter( + disk::dsl::disk_state + .eq_any(ok_to_detach_disk_state_labels), + ), + diesel::update(instance::dsl::instance).set(( + instance::dsl::state.eq(destroyed), + instance::dsl::time_deleted.eq(Utc::now()), + )), + diesel::update(disk::dsl::disk).set(( + disk::dsl::disk_state.eq(detached_label), + disk::dsl::attach_instance_id.eq(Option::::None), + disk::dsl::slot.eq(Option::::None), + )), + ); + + let 
_instance = stmt + .detach_and_get_result_async( + &*self.pool_connection_authorized(opctx).await?, + ) + .await + .map_err(|e| match e { + DetachManyError::CollectionNotFound => Error::not_found_by_id( + ResourceType::Instance, + &authz_instance.id(), + ), + DetachManyError::NoUpdate { collection } => { + if collection.runtime_state.propolis_id.is_some() { + return Error::invalid_request( "cannot delete instance: instance is running or has \ not yet fully stopped", ); - } - let instance_state = - collection.runtime_state.nexus_state.state(); - match instance_state { - api::external::InstanceState::Stopped - | api::external::InstanceState::Failed => { - Error::internal_error("cannot delete instance") } - _ => Error::invalid_request(&format!( - "instance cannot be deleted in state \"{}\"", - instance_state, - )), + let instance_state = + collection.runtime_state.nexus_state.state(); + match instance_state { + api::external::InstanceState::Stopped + | api::external::InstanceState::Failed => { + Error::internal_error("cannot delete instance") + } + _ => Error::invalid_request(&format!( + "instance cannot be deleted in state \"{}\"", + instance_state, + )), + } } - } - DetachManyError::DatabaseError(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - } - })?; + DetachManyError::DatabaseError(e) => { + public_error_from_diesel(e, ErrorHandler::Server) + } + })?; Ok(()) } diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 1db0b6f45d..088e94f197 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -223,7 +223,7 @@ impl NexusSaga for SagaInstanceCreate { SagaName::new(&format!("instance-create-external-ip{i}")); let mut subsaga_builder = DagBuilder::new(subsaga_name); subsaga_builder.append(Node::action( - "output", + format!("external-ip-{i}").as_str(), format!("CreateExternalIp{i}").as_str(), CREATE_EXTERNAL_IP.as_ref(), )); @@ -615,9 +615,6 @@ async fn 
sic_allocate_instance_external_ip( ); let instance_id = repeat_saga_params.instance_id; - // We need two things here: - // - permanently exfil data - // We perform the 'complete_op' in this saga stage because our IPs are // created in the attaching state, and we need to move them to attached. // We *can* do so because the `creating` state will block the IP attach/detach @@ -652,7 +649,7 @@ async fn sic_allocate_instance_external_ip( .map_err(ActionError::action_failed)?; datastore - .floating_ip_begin_attach(&opctx, &authz_fip, instance_id) + .floating_ip_begin_attach(&opctx, &authz_fip, instance_id, true) .await .map_err(ActionError::action_failed)? } @@ -688,7 +685,7 @@ async fn sic_allocate_instance_external_ip_undo( // We store and lookup `ExternalIp` so that we can do the detach // and/or deallocate without double name resolution. let new_ip = sagactx - .lookup::>(&format!("external_ip{ip_index}"))?; + .lookup::>(&format!("external-ip-{ip_index}"))?; let Some(ip) = new_ip else { return Ok(()); @@ -714,6 +711,7 @@ async fn sic_allocate_instance_external_ip_undo( &opctx, &authz_fip, repeat_saga_params.instance_id, + true, ) .await?; diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index c0234bf15d..be36f19e0f 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -16,6 +16,9 @@ use serde::Serialize; use steno::ActionError; use uuid::Uuid; +// TODO: explain in-depth here how locking works in practice for +// attach and detach wrt create/stop/start. 
+ // rough sequence of evts: // - take temp ownership of instance while interacting w/ sled agent // -> mark instance migration id as Some(0) if None @@ -161,6 +164,7 @@ async fn siia_begin_attach_ip( &opctx, &authz_fip, params.instance.id(), + false, ) .await .map_err(ActionError::action_failed) @@ -349,6 +353,8 @@ async fn siia_migration_unlock( .await .map_err(ActionError::action_failed)?; + // TODO: explain why it is safe to not back out on state change. + new_ip.try_into().map_err(ActionError::action_failed) } diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 16161e660d..7607c8abaa 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -140,7 +140,11 @@ async fn siid_begin_detach_ip( .ok_or_else(|| ActionError::action_failed(Error::invalid_request("instance does not have an attached ephemeral IP address")))?; datastore - .begin_deallocate_ephemeral_ip(&opctx, eph_ip.id) + .begin_deallocate_ephemeral_ip( + &opctx, + eph_ip.id, + params.instance.id(), + ) .await .map_err(ActionError::action_failed) } @@ -158,6 +162,7 @@ async fn siid_begin_detach_ip( &opctx, &authz_fip, params.instance.id(), + false, ) .await .map_err(ActionError::action_failed) diff --git a/schema/crdb/22.0.0/up06.sql b/schema/crdb/22.0.0/up06.sql index 48fd3402ce..a224588a37 100644 --- a/schema/crdb/22.0.0/up06.sql +++ b/schema/crdb/22.0.0/up06.sql @@ -1,4 +1,4 @@ ALTER TABLE omicron.public.external_ip ADD CONSTRAINT detached_null_parent_id CHECK ( - (state = 'detached') != (parent_id IS NOT NULL) + (state = 'detached') OR (parent_id IS NOT NULL) ); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index b3d96a1595..68c4fd0b43 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1745,9 +1745,13 @@ CREATE TABLE IF NOT EXISTS omicron.public.external_ip ( (kind = 'ephemeral' AND is_service = FALSE) OR (kind != 'ephemeral') ), - /* parent_id must be null if detached, 
non-null if not detached */ + /* + * (Not detached) => non-null parent_id. + * This is not a two-way implication because SNAT/Ephemeral IPs + * cannot have a null parent_id. + */ CONSTRAINT detached_null_parent_id CHECK ( - (state = 'detached') != (parent_id IS NOT NULL) + (state = 'detached') OR (parent_id IS NOT NULL) ) ); From 51f8baeb2b86d1b66e9197922c01159fb8a0f8f2 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 21 Dec 2023 20:13:23 +0000 Subject: [PATCH 13/56] Block instance_start while attaching/detaching Instance stop seems to do nothing, so that's fine -- probably need to make the attach/detach undo pass by failures to communicate with sled since we can't block it directly as-is. Delete is a bit trickier, need to see what Disk does. --- nexus/src/app/instance_network.rs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index 8941cec24d..4f6dbb8f12 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -7,6 +7,7 @@ use crate::app::sagas::retry_until_known_result; use ipnetwork::IpNetwork; use ipnetwork::Ipv6Network; +use nexus_db_model::IpAttachState; use nexus_db_model::Ipv4NatValues; use nexus_db_model::Vni as DbVni; use nexus_db_queries::authz; @@ -341,18 +342,27 @@ impl super::Nexus { .instance_lookup_external_ips(&opctx, instance_id) .await?; - let ips_of_interest = if let Some(wanted_id) = ip_filter { + let (ips_of_interest, must_all_be_attached) = if let Some(wanted_id) = ip_filter { if let Some(ip) = ips.iter().find(|v| v.id == wanted_id) { - std::slice::from_ref(ip) + (std::slice::from_ref(ip), false) } else { return Err(Error::internal_error(&format!( "failed to find external ip address with id: {wanted_id}", ))); } } else { - &ips[..] + (&ips[..], true) }; + // This is performed so that an IP attach/detach will block the + // instance_start saga. 
Return service unavailable to indicate + // the request is retryable. + if ips_of_interest.iter().find(|ip| must_all_be_attached && ip.state != IpAttachState::Attached).is_some() { + return Err(Error::ServiceUnavailable { + internal_message: "cannot push all DPD state: IP attach/detach in progress".into(), + }); + } + let sled_address = Ipv6Net(Ipv6Network::new(*sled_ip_address.ip(), 128).unwrap()); From a1b558e3541fe70cfad1416e7e24c63610c939e3 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 22 Dec 2023 13:39:21 +0000 Subject: [PATCH 14/56] Refactor, resolve interaction with instance delete saga. --- nexus/db-model/src/instance_state.rs | 6 + .../src/db/datastore/external_ip.rs | 41 +-- .../db-queries/src/db/queries/external_ip.rs | 27 ++ nexus/src/app/instance.rs | 12 +- nexus/src/app/instance_network.rs | 33 +- nexus/src/app/sagas/instance_common.rs | 266 +++++++++++++++- nexus/src/app/sagas/instance_ip_attach.rs | 276 +++++------------ nexus/src/app/sagas/instance_ip_detach.rs | 291 +++++------------- 8 files changed, 497 insertions(+), 455 deletions(-) diff --git a/nexus/db-model/src/instance_state.rs b/nexus/db-model/src/instance_state.rs index 6b4c71da79..644474257a 100644 --- a/nexus/db-model/src/instance_state.rs +++ b/nexus/db-model/src/instance_state.rs @@ -65,3 +65,9 @@ impl From for sled_agent_client::types::InstanceState { } } } + +impl From for InstanceState { + fn from(state: external::InstanceState) -> Self { + Self::new(state) + } +} diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 905d95aee4..dc8c886fea 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -26,6 +26,9 @@ use crate::db::model::Name; use crate::db::pagination::paginated; use crate::db::pool::DbConnection; use crate::db::queries::external_ip::NextExternalIp; +use crate::db::queries::external_ip::SAFE_TO_ATTACH_INSTANCE_STATES; +use 
crate::db::queries::external_ip::SAFE_TO_ATTACH_INSTANCE_STATES_CREATING; +use crate::db::queries::external_ip::SAFE_TRANSITORY_INSTANCE_STATES; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; @@ -51,27 +54,8 @@ use ref_cast::RefCast; use std::net::IpAddr; use uuid::Uuid; -// Broadly, we want users to be able to attach/detach at will -// once an instance is created and functional. -// If we're in a state which will naturally resolve to either -// stopped/running, we want users to know that the request can be -// retried safely. -const SAFE_TO_ATTACH_INSTANCE_STATES_CREATING: [DbInstanceState; 3] = [ - DbInstanceState(ApiInstanceState::Stopped), - DbInstanceState(ApiInstanceState::Running), - DbInstanceState(ApiInstanceState::Creating), -]; -const SAFE_TO_ATTACH_INSTANCE_STATES: [DbInstanceState; 2] = [ - DbInstanceState(ApiInstanceState::Stopped), - DbInstanceState(ApiInstanceState::Running), -]; -const SAFE_TRANSITORY_INSTANCE_STATES: [DbInstanceState; 3] = [ - DbInstanceState(ApiInstanceState::Starting), - DbInstanceState(ApiInstanceState::Stopping), - DbInstanceState(ApiInstanceState::Creating), -]; // FIXME: should be exported from a shared location, original lives in -// nexus app. +// nexus/app. const MAX_EXTERNAL_IPS_PER_INSTANCE: u32 = 32; impl DataStore { @@ -456,7 +440,7 @@ impl DataStore { Ok(eip) } - /// Delete all external IP addresses associated with the provided instance + /// Delete all non-floating IP addresses associated with the provided instance /// ID. 
/// /// This method returns the number of records deleted, rather than the usual @@ -474,16 +458,22 @@ impl DataStore { .filter(dsl::is_service.eq(false)) .filter(dsl::parent_id.eq(instance_id)) .filter(dsl::kind.ne(IpKind::Floating)) - .set(dsl::time_deleted.eq(now)) + .set(( + dsl::time_deleted.eq(now), + dsl::state.eq(IpAttachState::Detached), + )) .execute_async(&*self.pool_connection_authorized(opctx).await?) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } - /// Detach an individual Floating IP address from its parent instance. + /// Detach all Floating IP addresses from their parent instance. /// /// As in `deallocate_external_ip_by_instance_id`, this method returns the /// number of records altered, rather than an `UpdateResult`. + /// + /// This method ignores ongoing state transitions, and is only safely + /// usable from within the instance_delete saga. pub async fn detach_floating_ips_by_instance_id( &self, opctx: &OpContext, @@ -495,7 +485,10 @@ impl DataStore { .filter(dsl::is_service.eq(false)) .filter(dsl::parent_id.eq(instance_id)) .filter(dsl::kind.eq(IpKind::Floating)) - .set(dsl::parent_id.eq(Option::::None)) + .set(( + dsl::parent_id.eq(Option::::None), + dsl::state.eq(IpAttachState::Detached), + )) .execute_async(&*self.pool_connection_authorized(opctx).await?) 
.await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) diff --git a/nexus/db-queries/src/db/queries/external_ip.rs b/nexus/db-queries/src/db/queries/external_ip.rs index d4de9a99f0..53e0738b12 100644 --- a/nexus/db-queries/src/db/queries/external_ip.rs +++ b/nexus/db-queries/src/db/queries/external_ip.rs @@ -26,12 +26,39 @@ use diesel::Column; use diesel::Expression; use diesel::QueryResult; use diesel::RunQueryDsl; +use nexus_db_model::InstanceState as DbInstanceState; use nexus_db_model::IpAttachState; use nexus_db_model::IpAttachStateEnum; use omicron_common::address::NUM_SOURCE_NAT_PORTS; use omicron_common::api::external; +use omicron_common::api::external::InstanceState as ApiInstanceState; use uuid::Uuid; +// Broadly, we want users to be able to attach/detach at will +// once an instance is created and functional. +// If we're in a state which will naturally resolve to either +// stopped/running, we want users to know that the request can be +// retried safely. +pub const SAFE_TO_ATTACH_INSTANCE_STATES_CREATING: [DbInstanceState; 3] = [ + DbInstanceState(ApiInstanceState::Stopped), + DbInstanceState(ApiInstanceState::Running), + DbInstanceState(ApiInstanceState::Creating), +]; +pub const SAFE_TO_ATTACH_INSTANCE_STATES: [DbInstanceState; 2] = [ + DbInstanceState(ApiInstanceState::Stopped), + DbInstanceState(ApiInstanceState::Running), +]; +// TODO: Currently stop if there's a migration or other state change. +// This may be a good case for RPWing +// external_ip_state -> { NAT RPW, sled-agent } in future. 
+pub const SAFE_TRANSITORY_INSTANCE_STATES: [DbInstanceState; 5] = [ + DbInstanceState(ApiInstanceState::Starting), + DbInstanceState(ApiInstanceState::Stopping), + DbInstanceState(ApiInstanceState::Creating), + DbInstanceState(ApiInstanceState::Rebooting), + DbInstanceState(ApiInstanceState::Migrating), +]; + type FromClause = diesel::internal::table_macro::StaticQueryFragmentInstance; type IpPoolRangeFromClause = FromClause; diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index e2726e1510..932e10468e 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1914,13 +1914,13 @@ impl super::Nexus { instance_lookup: &lookup::Instance<'_>, ext_ip: ¶ms::ExternalIpCreate, ) -> UpdateResult { - let (.., _authz_project, authz_instance, instance) = - instance_lookup.fetch_for(authz::Action::Modify).await?; + let (.., authz_project, authz_instance) = + instance_lookup.lookup_for(authz::Action::Modify).await?; let saga_params = sagas::instance_ip_attach::Params { create_params: ext_ip.clone(), authz_instance, - instance, + project_id: authz_project.id(), serialized_authn: authn::saga::Serialized::for_opctx(opctx), }; @@ -1943,13 +1943,13 @@ impl super::Nexus { instance_lookup: &lookup::Instance<'_>, ext_ip: ¶ms::ExternalIpDelete, ) -> UpdateResult { - let (.., _authz_project, authz_instance, instance) = - instance_lookup.fetch_for(authz::Action::Modify).await?; + let (.., authz_project, authz_instance) = + instance_lookup.lookup_for(authz::Action::Modify).await?; let saga_params = sagas::instance_ip_detach::Params { delete_params: ext_ip.clone(), authz_instance, - instance, + project_id: authz_project.id(), serialized_authn: authn::saga::Serialized::for_opctx(opctx), }; diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index 4f6dbb8f12..f2c3eff7c5 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -342,24 +342,33 @@ impl super::Nexus { 
.instance_lookup_external_ips(&opctx, instance_id) .await?; - let (ips_of_interest, must_all_be_attached) = if let Some(wanted_id) = ip_filter { - if let Some(ip) = ips.iter().find(|v| v.id == wanted_id) { - (std::slice::from_ref(ip), false) - } else { - return Err(Error::internal_error(&format!( + let (ips_of_interest, must_all_be_attached) = + if let Some(wanted_id) = ip_filter { + if let Some(ip) = ips.iter().find(|v| v.id == wanted_id) { + (std::slice::from_ref(ip), false) + } else { + return Err(Error::internal_error(&format!( "failed to find external ip address with id: {wanted_id}", ))); - } - } else { - (&ips[..], true) - }; + } + } else { + (&ips[..], true) + }; // This is performed so that an IP attach/detach will block the // instance_start saga. Return service unavailable to indicate // the request is retryable. - if ips_of_interest.iter().find(|ip| must_all_be_attached && ip.state != IpAttachState::Attached).is_some() { - return Err(Error::ServiceUnavailable { - internal_message: "cannot push all DPD state: IP attach/detach in progress".into(), + if ips_of_interest + .iter() + .find(|ip| { + must_all_be_attached && ip.state != IpAttachState::Attached + }) + .is_some() + { + return Err(Error::ServiceUnavailable { + internal_message: + "cannot push all DPD state: IP attach/detach in progress" + .into(), }); } diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index 438b92cb84..86e8e35a39 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -8,12 +8,22 @@ use std::net::{IpAddr, Ipv6Addr}; use crate::Nexus; use chrono::Utc; -use nexus_db_model::{ByteCount, SledReservationConstraints, SledResource}; -use nexus_db_queries::{context::OpContext, db, db::DataStore}; +use nexus_db_model::{ + ByteCount, ExternalIp, IpAttachState, SledReservationConstraints, + SledResource, +}; +use nexus_db_queries::authz; +use nexus_db_queries::db::lookup::LookupPath; +use 
nexus_db_queries::db::queries::external_ip::SAFE_TRANSITORY_INSTANCE_STATES; +use nexus_db_queries::{authn, context::OpContext, db, db::DataStore}; +use omicron_common::api::external::Error; use omicron_common::api::external::InstanceState; +use serde::{Deserialize, Serialize}; use steno::ActionError; use uuid::Uuid; +use super::NexusActionContext; + /// Reserves resources for a new VMM whose instance has `ncpus` guest logical /// processors and `guest_memory` bytes of guest RAM. The selected sled is /// random within the set of sleds allowed by the supplied `constraints`. @@ -133,3 +143,255 @@ pub(super) async fn allocate_sled_ipv6( .await .map_err(ActionError::action_failed) } + +/// Instance state needed for IP attach/detachment. +#[derive(Debug, Deserialize, Serialize)] +pub struct InstanceStateForIp { + pub sled_id: Option, + pub state: InstanceState, +} + +/// Move an external IP from one state to another as a saga operation, +/// returning `Ok(true)` if the record was successfully moved and `Ok(false)` +/// if the record was lost. +/// +/// Returns `Err` if given an illegal state transition or several rows +/// were updated, which are programmer errors. +pub async fn instance_ip_move_state( + sagactx: &NexusActionContext, + serialized_authn: &authn::saga::Serialized, + from: IpAttachState, + to: IpAttachState, +) -> Result { + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + + let new_ip = sagactx.lookup::("target_ip")?; + + match datastore + .external_ip_complete_op(&opctx, new_ip.id, new_ip.kind, from, to) + .await + .map_err(ActionError::action_failed)? 
+ { + 0 => Ok(false), + 1 => Ok(true), + _ => Err(ActionError::action_failed(Error::internal_error( + "ip state change affected > 1 row", + ))), + } +} + +pub async fn instance_ip_get_instance_state( + sagactx: &NexusActionContext, + serialized_authn: &authn::saga::Serialized, + authz_instance: &authz::Instance, + verb: &str, +) -> Result { + // XXX: we can get instance state (but not sled ID) in same transaction + // as attach (but not detach) with current design. We need to re-query + // for sled ID anyhow, so keep consistent between attach/detach. + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + + let inst_and_vmm = datastore + .instance_fetch_with_vmm(&opctx, authz_instance) + .await + .map_err(ActionError::action_failed)?; + + let found_state = inst_and_vmm.instance().runtime_state.nexus_state.0; + let mut sled_id = inst_and_vmm.sled_id(); + + // Arriving here means we started in a correct state (running/stopped). + // We need to consider how we interact with the other sagas/ops: + // - starting: our claim on an IP will block it from moving past + // DPD_ensure and instance_start will undo. If we complete + // before then, it can move past and will fill in routes/opte. + // Act as though we have no sled_id. + // - stopping: this is not sagaized, and the propolis/sled-agent might + // go away. Act as though stopped if we catch it here, + // otherwise convert OPTE ensure to 'service unavailable' + // and undo. + // - deleting: can only be called from stopped -- we won't push to dpd + // or sled-agent, and IP record might be deleted or forcibly + // detached. Catch here just in case. 
+ let state = match found_state { + InstanceState::Stopped + | InstanceState::Starting + | InstanceState::Stopping => { + sled_id = None; + InstanceState::Stopped + } + InstanceState::Running => InstanceState::Running, + state if SAFE_TRANSITORY_INSTANCE_STATES.contains(&state.into()) => { + return Err(ActionError::action_failed(Error::unavail(&format!( + "can't {verb} in transient state {state}" + )))) + } + InstanceState::Destroyed => { + return Err(ActionError::action_failed(Error::not_found_by_id( + omicron_common::api::external::ResourceType::Instance, + &authz_instance.id(), + ))) + } + // Final cases are rebooting/failed. + _ => { + return Err(ActionError::action_failed(Error::invalid_request( + "cannot modify instance IPs, instance is in unhealthy state", + ))) + } + }; + + Ok(InstanceStateForIp { sled_id, state }) +} + +pub async fn instance_ip_add_nat( + sagactx: &NexusActionContext, + serialized_authn: &authn::saga::Serialized, + authz_instance: &authz::Instance, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + + // No physical sled? Don't push NAT. + let Some(sled_uuid) = + sagactx.lookup::("instance_state")?.sled_id + else { + return Ok(()); + }; + + let target_ip = sagactx.lookup::("target_ip")?; + + // Querying sleds requires fleet access; use the instance allocator context + // for this. 
+ let (.., sled) = LookupPath::new(&osagactx.nexus().opctx_alloc, &datastore) + .sled_id(sled_uuid) + .fetch() + .await + .map_err(ActionError::action_failed)?; + + osagactx + .nexus() + .instance_ensure_dpd_config( + &opctx, + authz_instance.id(), + &sled.address(), + Some(target_ip.id), + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +pub async fn instance_ip_remove_nat( + sagactx: &NexusActionContext, + serialized_authn: &authn::saga::Serialized, + authz_instance: &authz::Instance, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + + // No physical sled? Don't push NAT. + let Some(_) = + sagactx.lookup::("instance_state")?.sled_id + else { + return Ok(()); + }; + + let target_ip = sagactx.lookup::("target_ip")?; + + osagactx + .nexus() + .instance_delete_dpd_config(&opctx, authz_instance, Some(target_ip.id)) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +pub async fn instance_ip_add_opte( + sagactx: &NexusActionContext, + authz_instance: &authz::Instance, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + + // No physical sled? Don't inform OPTE. + let Some(sled_uuid) = + sagactx.lookup::("instance_state")?.sled_id + else { + return Ok(()); + }; + + let new_ip = sagactx.lookup::("target_ip")?; + let sled_agent_body = + new_ip.try_into().map_err(ActionError::action_failed)?; + + osagactx + .nexus() + .sled_client(&sled_uuid) + .await + .map_err(|_| { + ActionError::action_failed(Error::unavail( + "sled agent client went away mid-attach", + )) + })? 
+ .instance_put_external_ip(&authz_instance.id(), &sled_agent_body) + .await + .map_err(|e| { + ActionError::action_failed(match e { + progenitor_client::Error::CommunicationError(_) => { + Error::unavail("sled agent client went away mid-attach") + } + e => Error::internal_error(&format!("{e}")), + }) + })?; + + Ok(()) +} + +pub async fn instance_ip_remove_opte( + sagactx: &NexusActionContext, + authz_instance: &authz::Instance, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + + // If we didn't push OPTE before, don't undo it. + let Some(sled_uuid) = + sagactx.lookup::("instance_state")?.sled_id + else { + return Ok(()); + }; + + let new_ip = sagactx.lookup::("target_ip")?; + let sled_agent_body = + new_ip.try_into().map_err(ActionError::action_failed)?; + + osagactx + .nexus() + .sled_client(&sled_uuid) + .await + .map_err(|_| { + ActionError::action_failed(Error::unavail( + "sled agent client went away mid-attach", + )) + })? + .instance_put_external_ip(&authz_instance.id(), &sled_agent_body) + .await + .map_err(|e| { + ActionError::action_failed(match e { + progenitor_client::Error::CommunicationError(_) => { + Error::unavail("sled agent client went away mid-attach") + } + e => Error::internal_error(&format!("{e}")), + }) + })?; + + Ok(()) +} diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index be36f19e0f..a5ca2606c4 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -2,13 +2,19 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+use super::instance_common::{ + instance_ip_add_nat, instance_ip_add_opte, instance_ip_get_instance_state, + instance_ip_move_state, instance_ip_remove_nat, instance_ip_remove_opte, + InstanceStateForIp, +}; use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; use crate::external_api::params; -use nexus_db_model::ExternalIp; +use nexus_db_model::{ExternalIp, IpAttachState}; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup::LookupPath; +use nexus_db_queries::db::queries::external_ip::SAFE_TRANSITORY_INSTANCE_STATES; use nexus_types::external_api::views; use omicron_common::api::external::{Error, InstanceState}; use serde::Deserialize; @@ -34,16 +40,15 @@ use uuid::Uuid; declare_saga_actions! { instance_ip_attach; - LOCK_MIGRATION -> "sled_id" { - + siia_migration_lock - - siia_migration_lock_undo - } - - ATTACH_EXTERNAL_IP -> "new_ip" { + ATTACH_EXTERNAL_IP -> "target_ip" { + siia_begin_attach_ip - siia_begin_attach_ip_undo } + INSTANCE_STATE -> "instance_state" { + + siia_get_instance_state + } + REGISTER_NAT -> "no_result0" { + siia_nat - siia_nat_undo @@ -54,8 +59,8 @@ declare_saga_actions! { - siia_update_opte_undo } - UNLOCK_MIGRATION -> "output" { - + siia_migration_unlock + COMPLETE_ATTACH -> "output" { + + siia_complete_attach } } @@ -63,63 +68,12 @@ declare_saga_actions! { pub struct Params { pub create_params: params::ExternalIpCreate, pub authz_instance: authz::Instance, - pub instance: db::model::Instance, + pub project_id: Uuid, /// Authentication context to use to fetch the instance's current state from /// the database. pub serialized_authn: authn::saga::Serialized, } -async fn siia_migration_lock( - sagactx: NexusActionContext, -) -> Result, ActionError> { - // TODO: do this. 
- let osagactx = sagactx.user_data(); - let datastore = osagactx.datastore(); - let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( - &sagactx, - ¶ms.serialized_authn, - ); - - let inst_and_vmm = datastore - .instance_fetch_with_vmm(&opctx, ¶ms.authz_instance) - .await - .map_err(ActionError::action_failed)?; - - // TODO: Currently stop if there's a migration. This may be a good case - // for RPW'ing ext_ip_state -> { NAT RPW, sled-agent } in future. - if inst_and_vmm.instance().runtime_state.migration_id.is_some() { - return Err(ActionError::action_failed(Error::ServiceUnavailable { - internal_message: "target instance is migrating".into(), - })); - } - - let valid_instance_states = [ - InstanceState::Running, - InstanceState::Stopped, - // InstanceState::Rebooting is safe in principle, but likely - // to trip up when backing out iff. state change. - ]; - - let state = inst_and_vmm.instance().runtime_state.nexus_state.0; - if !valid_instance_states.contains(&state) { - return Err(ActionError::action_failed(Error::ServiceUnavailable { - internal_message: "instance must be 'Running' or 'Stopped'".into(), - })); - } - - // TODO: actually lock? - - Ok(inst_and_vmm.sled_id()) -} - -async fn siia_migration_lock_undo( - _sagactx: NexusActionContext, -) -> Result<(), anyhow::Error> { - // TODO: do this iff. we implement migration lock. - Ok(()) -} - // TODO: factor this out for attach, detach, and instance create // to share an impl. 
@@ -143,7 +97,7 @@ async fn siia_begin_attach_ip( .allocate_instance_ephemeral_ip( &opctx, Uuid::new_v4(), - params.instance.id(), + params.authz_instance.id(), pool_name, ) .await @@ -153,7 +107,7 @@ async fn siia_begin_attach_ip( params::ExternalIpCreate::Floating { ref floating_ip_name } => { let floating_ip_name = db::model::Name(floating_ip_name.clone()); let (.., authz_fip) = LookupPath::new(&opctx, &datastore) - .project_id(params.instance.project_id) + .project_id(params.project_id) .floating_ip_name(&floating_ip_name) .lookup_for(authz::Action::Modify) .await @@ -163,7 +117,7 @@ async fn siia_begin_attach_ip( .floating_ip_begin_attach( &opctx, &authz_fip, - params.instance.id(), + params.authz_instance.id(), false, ) .await @@ -175,95 +129,56 @@ async fn siia_begin_attach_ip( async fn siia_begin_attach_ip_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { - let osagactx = sagactx.user_data(); - let datastore = osagactx.datastore(); + let log = sagactx.user_data().log(); + warn!(log, "siia_begin_attach_ip_undo: Reverting detached->attaching"); let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( + if !instance_ip_move_state( &sagactx, ¶ms.serialized_authn, - ); - - let new_ip = sagactx.lookup::("new_ip")?; - - let n_rows = datastore - .external_ip_complete_op( - &opctx, - new_ip.id, - new_ip.kind, - nexus_db_model::IpAttachState::Attaching, - nexus_db_model::IpAttachState::Detached, - ) - .await - .map_err(ActionError::action_failed)?; + IpAttachState::Attaching, + IpAttachState::Detached, + ) + .await? 
+ { + error!(log, "siia_begin_attach_ip_undo: external IP was deleted") + } Ok(()) } -async fn siia_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { - let osagactx = sagactx.user_data(); - let datastore = osagactx.datastore(); +async fn siia_get_instance_state( + sagactx: NexusActionContext, +) -> Result { let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( + instance_ip_get_instance_state( &sagactx, ¶ms.serialized_authn, - ); - - // NOTE: mostly copied from instance_start. - - // No physical sled? Don't push NAT. - let Some(sled_uuid) = sagactx.lookup::>("sled_id")? else { - return Ok(()); - }; - - let new_ip = sagactx.lookup::("new_ip")?; - - // Querying sleds requires fleet access; use the instance allocator context - // for this. - let (.., sled) = LookupPath::new(&osagactx.nexus().opctx_alloc, &datastore) - .sled_id(sled_uuid) - .fetch() - .await - .map_err(ActionError::action_failed)?; - - osagactx - .nexus() - .instance_ensure_dpd_config( - &opctx, - params.instance.id(), - &sled.address(), - Some(new_ip.id), - ) - .await - .map_err(ActionError::action_failed)?; + ¶ms.authz_instance, + "attach", + ) + .await +} - Ok(()) +async fn siia_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { + let params = sagactx.saga_params::()?; + instance_ip_add_nat( + &sagactx, + ¶ms.serialized_authn, + ¶ms.authz_instance, + ) + .await } async fn siia_nat_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { - let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( + instance_ip_remove_nat( &sagactx, ¶ms.serialized_authn, - ); - - // If we didn't push NAT before, don't undo it. 
- if sagactx.lookup::>("sled_id")?.is_none() { - return Ok(()); - } - - let new_ip = sagactx.lookup::("new_ip")?; - - osagactx - .nexus() - .instance_delete_dpd_config( - &opctx, - ¶ms.authz_instance, - Some(new_ip.id), - ) - .await?; + ¶ms.authz_instance, + ) + .await?; Ok(()) } @@ -271,91 +186,44 @@ async fn siia_nat_undo( async fn siia_update_opte( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; - - // No physical sled? Don't inform OPTE. - let Some(sled_uuid) = sagactx.lookup::>("sled_id")? else { - return Ok(()); - }; - - let new_ip = sagactx.lookup::("new_ip")?; - let sled_agent_body = - new_ip.try_into().map_err(ActionError::action_failed)?; - - // TODO: disambiguate the various sled agent errors etc. - osagactx - .nexus() - .sled_client(&sled_uuid) - .await - .map_err(ActionError::action_failed)? - .instance_put_external_ip(¶ms.instance.id(), &sled_agent_body) - .await - .map_err(|_| { - ActionError::action_failed(Error::invalid_request("hmm")) - })?; - - Ok(()) + instance_ip_add_opte(&sagactx, ¶ms.authz_instance).await } async fn siia_update_opte_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { - let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; - - // If we didn't push OPTE before, don't undo it. - let Some(sled_uuid) = sagactx.lookup::>("sled_id")? else { - return Ok(()); - }; - - let new_ip = sagactx.lookup::("new_ip")?; - let sled_agent_body = new_ip.try_into()?; - - // TODO: disambiguate the various sled agent errors etc. - osagactx - .nexus() - .sled_client(&sled_uuid) - .await - .map_err(ActionError::action_failed)? 
- .instance_delete_external_ip(¶ms.instance.id(), &sled_agent_body) - .await - .map_err(|_| { - ActionError::action_failed(Error::invalid_request("hmm")) - })?; - + instance_ip_remove_opte(&sagactx, ¶ms.authz_instance).await?; Ok(()) } -async fn siia_migration_unlock( +async fn siia_complete_attach( sagactx: NexusActionContext, ) -> Result { - let osagactx = sagactx.user_data(); - let datastore = osagactx.datastore(); let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( + let initial_state = + sagactx.lookup::("instance_state")?.state; + let target_ip = sagactx.lookup::("target_ip")?; + + let update_occurred = instance_ip_move_state( &sagactx, ¶ms.serialized_authn, - ); - // TODO: do this iff. we implement migration lock. - // TODO: Backtrack if there's an unexpected change to runstate? - - let new_ip = sagactx.lookup::("new_ip")?; - - let n_rows = datastore - .external_ip_complete_op( - &opctx, - new_ip.id, - new_ip.kind, - nexus_db_model::IpAttachState::Attaching, - nexus_db_model::IpAttachState::Attached, - ) - .await - .map_err(ActionError::action_failed)?; + IpAttachState::Attaching, + IpAttachState::Detached, + ) + .await?; // TODO: explain why it is safe to not back out on state change. - - new_ip.try_into().map_err(ActionError::action_failed) + match (update_occurred, initial_state) { + // Allow failure here on stopped because the instance_delete saga + // may have been concurrently fired off and removed the row. + (false, InstanceState::Stopped) | (true, _) => { + target_ip.try_into().map_err(ActionError::action_failed) + } + _ => Err(Error::internal_error("failed to complete IP attach")) + .map_err(ActionError::action_failed), + } } // TODO: backout changes if run state changed illegally? 
@@ -374,11 +242,11 @@ impl NexusSaga for SagaInstanceIpAttach { _params: &Self::Params, mut builder: steno::DagBuilder, ) -> Result { - builder.append(lock_migration_action()); builder.append(attach_external_ip_action()); + builder.append(instance_state_action()); builder.append(register_nat_action()); builder.append(ensure_opte_port_action()); - builder.append(unlock_migration_action()); + builder.append(complete_attach_action()); Ok(builder.build()?) } } diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 7607c8abaa..8145009efe 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -2,12 +2,17 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use super::instance_common::{ + instance_ip_add_nat, instance_ip_add_opte, instance_ip_get_instance_state, + instance_ip_move_state, instance_ip_remove_nat, instance_ip_remove_opte, + InstanceStateForIp, +}; use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; use crate::external_api::params; use futures::TryFutureExt; -use nexus_db_model::{ExternalIp, IpKind}; +use nexus_db_model::{ExternalIp, IpAttachState, IpKind}; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; @@ -33,16 +38,15 @@ use uuid::Uuid; declare_saga_actions! { instance_ip_detach; - LOCK_MIGRATION -> "sled_id" { - + siid_migration_lock - - siid_migration_lock_undo - } - DETACH_EXTERNAL_IP -> "target_ip" { + siid_begin_detach_ip - siid_begin_detach_ip_undo } + INSTANCE_STATE -> "instance_state" { + + siid_get_instance_state + } + REMOVE_NAT -> "no_result0" { + siid_nat - siid_nat_undo @@ -53,8 +57,8 @@ declare_saga_actions! 
{ - siid_update_opte_undo } - UNLOCK_MIGRATION -> "output" { - + siid_migration_unlock + COMPLETE_ATTACH -> "output" { + + siid_complete_attach } } @@ -62,62 +66,12 @@ declare_saga_actions! { pub struct Params { pub delete_params: params::ExternalIpDelete, pub authz_instance: authz::Instance, - pub instance: db::model::Instance, + pub project_id: Uuid, /// Authentication context to use to fetch the instance's current state from /// the database. pub serialized_authn: authn::saga::Serialized, } -async fn siid_migration_lock( - sagactx: NexusActionContext, -) -> Result, ActionError> { - // TODO: do this. - let osagactx = sagactx.user_data(); - let datastore = osagactx.datastore(); - let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( - &sagactx, - ¶ms.serialized_authn, - ); - - let inst_and_vmm = datastore - .instance_fetch_with_vmm(&opctx, ¶ms.authz_instance) - .await - .map_err(ActionError::action_failed)?; - - if inst_and_vmm.instance().runtime_state.migration_id.is_some() { - return Err(ActionError::action_failed(Error::ServiceUnavailable { - internal_message: "target instance is migrating".into(), - })); - } - - let valid_instance_states = [ - InstanceState::Running, - InstanceState::Stopped, - // InstanceState::Rebooting is safe in principle, but likely - // to trip up when backing out iff. state change. - ]; - - let state = inst_and_vmm.instance().runtime_state.nexus_state.0; - if !valid_instance_states.contains(&state) { - return Err(ActionError::action_failed(Error::ServiceUnavailable { - internal_message: "instance must be 'Running' or 'Stopped'".into(), - })); - } - - // TODO: actually lock? - // TODO: fail out in a user-friendly way if migrating? - - Ok(inst_and_vmm.sled_id()) -} - -async fn siid_migration_lock_undo( - _sagactx: NexusActionContext, -) -> Result<(), anyhow::Error> { - // TODO: do this iff. we implement migration lock. 
- Ok(()) -} - async fn siid_begin_detach_ip( sagactx: NexusActionContext, ) -> Result { @@ -132,18 +86,27 @@ async fn siid_begin_detach_ip( match params.delete_params { params::ExternalIpDelete::Ephemeral => { let eips = datastore - .instance_lookup_external_ips(&opctx, params.instance.id()) + .instance_lookup_external_ips( + &opctx, + params.authz_instance.id(), + ) .await .map_err(ActionError::action_failed)?; - let eph_ip = eips.iter().find(|e| e.kind == IpKind::Ephemeral) - .ok_or_else(|| ActionError::action_failed(Error::invalid_request("instance does not have an attached ephemeral IP address")))?; + let eph_ip = eips + .iter() + .find(|e| e.kind == IpKind::Ephemeral) + .ok_or_else(|| { + ActionError::action_failed(Error::invalid_request( + "instance does not have an attached ephemeral IP address" + )) + })?; datastore .begin_deallocate_ephemeral_ip( &opctx, eph_ip.id, - params.instance.id(), + params.authz_instance.id(), ) .await .map_err(ActionError::action_failed) @@ -151,7 +114,7 @@ async fn siid_begin_detach_ip( params::ExternalIpDelete::Floating { ref floating_ip_name } => { let floating_ip_name = db::model::Name(floating_ip_name.clone()); let (.., authz_fip) = LookupPath::new(&opctx, &datastore) - .project_id(params.instance.project_id) + .project_id(params.project_id) .floating_ip_name(&floating_ip_name) .lookup_for(authz::Action::Modify) .await @@ -161,7 +124,7 @@ async fn siid_begin_detach_ip( .floating_ip_begin_detach( &opctx, &authz_fip, - params.instance.id(), + params.authz_instance.id(), false, ) .await @@ -173,99 +136,56 @@ async fn siid_begin_detach_ip( async fn siid_begin_detach_ip_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { - let osagactx = sagactx.user_data(); - let datastore = osagactx.datastore(); + let log = sagactx.user_data().log(); + warn!(log, "siid_begin_detach_ip_undo: Reverting attached->detaching"); let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( + if 
!instance_ip_move_state( &sagactx, ¶ms.serialized_authn, - ); - - let target_ip = sagactx.lookup::("target_ip")?; - - let n_rows = datastore - .external_ip_complete_op( - &opctx, - target_ip.id, - target_ip.kind, - nexus_db_model::IpAttachState::Detaching, - nexus_db_model::IpAttachState::Attached, - ) - .await - .map_err(ActionError::action_failed)?; + IpAttachState::Detaching, + IpAttachState::Attached, + ) + .await? + { + error!(log, "siid_begin_detach_ip_undo: external IP was deleted") + } Ok(()) } -async fn siid_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { - let osagactx = sagactx.user_data(); +async fn siid_get_instance_state( + sagactx: NexusActionContext, +) -> Result { let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( + instance_ip_get_instance_state( &sagactx, ¶ms.serialized_authn, - ); - - // No physical sled? Don't push NAT. - if sagactx.lookup::>("sled_id")?.is_none() { - return Ok(()); - } - - let target_ip = sagactx.lookup::("target_ip")?; - - // Currently getting an unfortunate error from here since 'detach' - // comes so late. - // Possible soln: use states, capture logic in 'begin_detach/attach' - // and call early? 
- - osagactx - .nexus() - .instance_delete_dpd_config( - &opctx, - ¶ms.authz_instance, - Some(target_ip.id), - ) - .await - .map_err(ActionError::action_failed)?; + ¶ms.authz_instance, + "attach", + ) + .await +} - Ok(()) +async fn siid_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { + let params = sagactx.saga_params::()?; + instance_ip_remove_nat( + &sagactx, + ¶ms.serialized_authn, + ¶ms.authz_instance, + ) + .await } async fn siid_nat_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { - let osagactx = sagactx.user_data(); - let datastore = osagactx.datastore(); let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( + instance_ip_add_nat( &sagactx, ¶ms.serialized_authn, - ); - - // NOTE: mostly copied from instance_start. - - // If we didn't push NAT before, don't undo it. - let Some(sled_uuid) = sagactx.lookup::>("sled_id")? else { - return Ok(()); - }; - - let target_ip = sagactx.lookup::("target_ip")?; - - // Querying sleds requires fleet access; use the instance allocator context - // for this. - let (.., sled) = LookupPath::new(&osagactx.nexus().opctx_alloc, &datastore) - .sled_id(sled_uuid) - .fetch() - .await?; - - osagactx - .nexus() - .instance_ensure_dpd_config( - &opctx, - params.instance.id(), - &sled.address(), - Some(target_ip.id), - ) - .await?; + ¶ms.authz_instance, + ) + .await?; Ok(()) } @@ -273,87 +193,44 @@ async fn siid_nat_undo( async fn siid_update_opte( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; - - // No physical sled? Don't inform OPTE. - let Some(sled_uuid) = sagactx.lookup::>("sled_id")? else { - return Ok(()); - }; - - let target_ip = sagactx.lookup::("target_ip")?; - let sled_agent_body = - target_ip.try_into().map_err(ActionError::action_failed)?; - - // TODO: disambiguate the various sled agent errors etc. 
- osagactx - .nexus() - .sled_client(&sled_uuid) - .await - .map_err(ActionError::action_failed)? - .instance_delete_external_ip(¶ms.instance.id(), &sled_agent_body) - .await - .map_err(|_| { - ActionError::action_failed(Error::invalid_request("hmm")) - })?; - - Ok(()) + instance_ip_remove_opte(&sagactx, ¶ms.authz_instance).await } async fn siid_update_opte_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { - let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; - - // If we didn't push OPTE before, don't undo it. - let Some(sled_uuid) = sagactx.lookup::>("sled_id")? else { - return Ok(()); - }; - - let target_ip = sagactx.lookup::("target_ip")?; - let sled_agent_body = - target_ip.try_into().map_err(ActionError::action_failed)?; - - // TODO: disambiguate the various sled agent errors etc. - osagactx - .nexus() - .sled_client(&sled_uuid) - .await - .map_err(ActionError::action_failed)? - .instance_put_external_ip(¶ms.instance.id(), &sled_agent_body) - .await?; - + instance_ip_add_opte(&sagactx, ¶ms.authz_instance).await?; Ok(()) } -async fn siid_migration_unlock( +async fn siid_complete_attach( sagactx: NexusActionContext, ) -> Result { - let osagactx = sagactx.user_data(); - let datastore = osagactx.datastore(); let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( - &sagactx, - ¶ms.serialized_authn, - ); - // TODO: do this iff. we implement migration lock. - // TODO: Backtrack if there's an unexpected change to runstate? 
- + let initial_state = + sagactx.lookup::("instance_state")?.state; let target_ip = sagactx.lookup::("target_ip")?; - let n_rows = datastore - .external_ip_complete_op( - &opctx, - target_ip.id, - target_ip.kind, - nexus_db_model::IpAttachState::Detaching, - nexus_db_model::IpAttachState::Detached, - ) - .await - .map_err(ActionError::action_failed)?; - - target_ip.try_into().map_err(ActionError::action_failed) + let update_occurred = instance_ip_move_state( + &sagactx, + ¶ms.serialized_authn, + IpAttachState::Detaching, + IpAttachState::Detached, + ) + .await?; + + // TODO: explain why it is safe to not back out on state change. + match (update_occurred, initial_state) { + // Allow failure here on stopped because the instance_delete saga + // may have been concurrently fired off and removed the row. + (false, InstanceState::Stopped) | (true, _) => { + target_ip.try_into().map_err(ActionError::action_failed) + } + _ => Err(Error::internal_error("failed to complete IP attach")) + .map_err(ActionError::action_failed), + } } #[derive(Debug)] @@ -370,11 +247,11 @@ impl NexusSaga for SagaInstanceIpDetach { _params: &Self::Params, mut builder: steno::DagBuilder, ) -> Result { - builder.append(lock_migration_action()); builder.append(detach_external_ip_action()); + builder.append(instance_state_action()); builder.append(remove_nat_action()); builder.append(remove_opte_port_action()); - builder.append(unlock_migration_action()); + builder.append(complete_attach_action()); Ok(builder.build()?) 
} } From c0ffb93c355007390abc5e06c38605baa89cda5c Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 22 Dec 2023 16:09:05 +0000 Subject: [PATCH 15/56] Add EIP state to omdb, clean up error msg on double attach/detach --- dev-tools/omdb/src/bin/omdb/db.rs | 3 +++ nexus/db-model/src/external_ip.rs | 11 +++++++++++ nexus/db-queries/src/db/datastore/external_ip.rs | 15 +++++++++------ nexus/src/app/sagas/instance_ip_attach.rs | 4 +--- nexus/src/app/sagas/instance_ip_detach.rs | 4 ++-- schema/crdb/22.0.0/up07.sql | 4 ++++ schema/crdb/dbinit.sql | 6 ++++++ 7 files changed, 36 insertions(+), 11 deletions(-) create mode 100644 schema/crdb/22.0.0/up07.sql diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 08a783d8c8..15b16d515e 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -44,6 +44,7 @@ use nexus_db_model::ExternalIp; use nexus_db_model::HwBaseboardId; use nexus_db_model::Instance; use nexus_db_model::InvCollection; +use nexus_db_model::IpAttachState; use nexus_db_model::Project; use nexus_db_model::Region; use nexus_db_model::RegionSnapshot; @@ -1653,6 +1654,7 @@ async fn cmd_db_eips( ip: ipnetwork::IpNetwork, ports: PortRange, kind: String, + state: IpAttachState, owner: Owner, } @@ -1737,6 +1739,7 @@ async fn cmd_db_eips( first: ip.first_port.into(), last: ip.last_port.into(), }, + state: ip.state, kind: format!("{:?}", ip.kind), owner, }; diff --git a/nexus/db-model/src/external_ip.rs b/nexus/db-model/src/external_ip.rs index 8c76911781..b6f556ab61 100644 --- a/nexus/db-model/src/external_ip.rs +++ b/nexus/db-model/src/external_ip.rs @@ -57,6 +57,17 @@ impl_enum_type!( Attaching => b"attaching" ); +impl std::fmt::Display for IpAttachState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { + IpAttachState::Detached => "Detached", + IpAttachState::Attached => "Attached", + IpAttachState::Detaching => "Detaching", + 
IpAttachState::Attaching => "Attaching", + }) + } +} + /// The main model type for external IP addresses for instances /// and externally-facing services. /// diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index dc8c886fea..81f56596b5 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -33,18 +33,15 @@ use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; -use db::model::InstanceState as DbInstanceState; use diesel::prelude::*; use nexus_db_model::Instance; use nexus_db_model::IpAttachState; use nexus_types::external_api::params; use nexus_types::identity::Resource; -use omicron_common::api; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; -use omicron_common::api::external::InstanceState as ApiInstanceState; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::NameOrId; @@ -261,6 +258,7 @@ impl DataStore { ) } } + // Floating IP: name conflict DatabaseError(UniqueViolation, ..) if name.is_some() => { TransactionError::CustomError(public_error_from_diesel( e, @@ -272,6 +270,12 @@ impl DataStore { ), )) } + // Ephemeral IP: violated one-per-instance rule. + DatabaseError(UniqueViolation, ..) 
=> { + TransactionError::CustomError(Error::invalid_request( + "instance/service cannot have more than one ephemeral IP" + )) + } _ => { if retryable(&e) { return TransactionError::Database(e); @@ -646,9 +650,8 @@ impl DataStore { }, AttachError::NoUpdate { attached_count, resource, collection } => { match resource.state { - // Idempotent errors: attach succeeded or is in progress for - // same resource pair -- this is fine. - IpAttachState::Attached | IpAttachState::Attaching if resource.parent_id == Some(instance_id) => return Ok((collection, resource)), + // Idempotent errors: is in progress forsame resource pair -- this is fine. + IpAttachState::Attaching if resource.parent_id == Some(instance_id) => return Ok((collection, resource)), IpAttachState::Attached => return Err(Error::invalid_request( "floating IP cannot be attached to one \ instance while still attached to another" diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index a5ca2606c4..61bf2980a2 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -12,9 +12,7 @@ use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; use crate::external_api::params; use nexus_db_model::{ExternalIp, IpAttachState}; -use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup::LookupPath; -use nexus_db_queries::db::queries::external_ip::SAFE_TRANSITORY_INSTANCE_STATES; use nexus_types::external_api::views; use omicron_common::api::external::{Error, InstanceState}; use serde::Deserialize; @@ -210,7 +208,7 @@ async fn siia_complete_attach( &sagactx, ¶ms.serialized_authn, IpAttachState::Attaching, - IpAttachState::Detached, + IpAttachState::Attached, ) .await?; diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 8145009efe..7715ec19a0 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ 
b/nexus/src/app/sagas/instance_ip_detach.rs @@ -161,7 +161,7 @@ async fn siid_get_instance_state( &sagactx, ¶ms.serialized_authn, ¶ms.authz_instance, - "attach", + "detach", ) .await } @@ -228,7 +228,7 @@ async fn siid_complete_attach( (false, InstanceState::Stopped) | (true, _) => { target_ip.try_into().map_err(ActionError::action_failed) } - _ => Err(Error::internal_error("failed to complete IP attach")) + _ => Err(Error::internal_error("failed to complete IP detach")) .map_err(ActionError::action_failed), } } diff --git a/schema/crdb/22.0.0/up07.sql b/schema/crdb/22.0.0/up07.sql new file mode 100644 index 0000000000..b4acedaf60 --- /dev/null +++ b/schema/crdb/22.0.0/up07.sql @@ -0,0 +1,4 @@ +CREATE UNIQUE INDEX IF NOT EXISTS one_ephemeral_ip_per_instance ON omicron.public.external_ip ( + parent_id +) + WHERE kind = 'ephemeral' AND parent_id IS NOT NULL AND time_deleted IS NULL; \ No newline at end of file diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 68c4fd0b43..6580d39cf7 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1784,6 +1784,12 @@ CREATE UNIQUE INDEX IF NOT EXISTS lookup_external_ip_by_parent ON omicron.public ) WHERE parent_id IS NOT NULL AND time_deleted IS NULL; +/* Enforce a limit of one Ephemeral IP per instance */ +CREATE UNIQUE INDEX IF NOT EXISTS one_ephemeral_ip_per_instance ON omicron.public.external_ip ( + parent_id +) + WHERE kind = 'ephemeral' AND parent_id IS NOT NULL AND time_deleted IS NULL; + /* Enforce name-uniqueness of floating (service) IPs at fleet level. 
*/ CREATE UNIQUE INDEX IF NOT EXISTS lookup_floating_ip_by_name on omicron.public.external_ip ( name From ee6a79073402030c301f5afe94809e5cec1cab6b Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 22 Dec 2023 17:39:46 +0000 Subject: [PATCH 16/56] The great clippy appeasement --- .../src/db/datastore/external_ip.rs | 90 +++++++++---------- .../db-queries/src/db/queries/external_ip.rs | 2 +- nexus/src/app/instance_network.rs | 11 +-- nexus/src/app/sagas/instance_common.rs | 4 +- nexus/src/app/sagas/instance_create.rs | 18 +++- nexus/src/app/sagas/instance_ip_attach.rs | 5 +- nexus/src/app/sagas/instance_ip_detach.rs | 7 +- 7 files changed, 75 insertions(+), 62 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 81f56596b5..8da40440eb 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -28,7 +28,7 @@ use crate::db::pool::DbConnection; use crate::db::queries::external_ip::NextExternalIp; use crate::db::queries::external_ip::SAFE_TO_ATTACH_INSTANCE_STATES; use crate::db::queries::external_ip::SAFE_TO_ATTACH_INSTANCE_STATES_CREATING; -use crate::db::queries::external_ip::SAFE_TRANSITORY_INSTANCE_STATES; +use crate::db::queries::external_ip::SAFE_TRANSIENT_INSTANCE_STATES; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; @@ -54,6 +54,7 @@ use uuid::Uuid; // FIXME: should be exported from a shared location, original lives in // nexus/app. const MAX_EXTERNAL_IPS_PER_INSTANCE: u32 = 32; +const MAX_EXTERNAL_IPS_PLUS_SNAT: u32 = MAX_EXTERNAL_IPS_PER_INSTANCE + 1; impl DataStore { /// Create an external IP address for source NAT for an instance. @@ -400,43 +401,43 @@ impl DataStore { let eip = query.detach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) 
.await - .or_else(|e: DetachError| match e { + .map_err(|e: DetachError| match e { DetachError::CollectionNotFound => { - Err(Error::not_found_by_id( + Error::not_found_by_id( ResourceType::Instance, &instance_id, - )) + ) }, DetachError::ResourceNotFound => { - Err(Error::invalid_request("instance has no ephemeral IP to detach")) + Error::invalid_request("instance has no ephemeral IP to detach") }, DetachError::NoUpdate { resource, collection } => { match resource.state { - IpAttachState::Attached if resource.parent_id != Some(instance_id) => return Err(Error::internal_error( + IpAttachState::Attached if resource.parent_id != Some(instance_id) => return Error::internal_error( "Ephemeral IP is not attached to the target instance", - )), + ), // User can reattempt depending on how the current saga unfolds. - IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::ServiceUnavailable { - internal_message: "tried to detach ephemeral IP mid-attach/detach".into() - }), + IpAttachState::Attaching | IpAttachState::Detaching => return Error::unavail ( + "tried to detach ephemeral IP mid-attach/detach" + ), IpAttachState::Attached => {}, - IpAttachState::Detached => return Err(Error::internal_error( + IpAttachState::Detached => return Error::internal_error( "Ephemeral IP cannot exist in 'detached' state", - )), + ), } - Err(match collection.runtime_state.nexus_state { - state if SAFE_TRANSITORY_INSTANCE_STATES.contains(&state) => Error::ServiceUnavailable { - internal_message: "tried to detach ephemeral IP while instance was changing state".into() - }, + match collection.runtime_state.nexus_state { + state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail ( + "tried to detach ephemeral IP while instance was changing state" + ), state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { Error::internal_error("failed to detach ephemeral IP") }, state => Error::invalid_request(&format!("cannot attach ephemeral IP to instance in {state} 
state")), - }) + } }, DetachError::DatabaseError(e) => { - Err(public_error_from_diesel(e, ErrorHandler::Server)) + public_error_from_diesel(e, ErrorHandler::Server) }, })?; @@ -624,8 +625,7 @@ impl DataStore { .filter(dsl::state.eq(IpAttachState::Detached)) .filter(dsl::kind.eq(IpKind::Floating)) .filter(dsl::parent_id.is_null()), - // +1 to account for SNat - MAX_EXTERNAL_IPS_PER_INSTANCE + 1, + MAX_EXTERNAL_IPS_PLUS_SNAT, diesel::update(dsl::external_ip).set(( dsl::parent_id.eq(Some(instance_id)), dsl::time_modified.eq(Utc::now()), @@ -657,19 +657,19 @@ impl DataStore { instance while still attached to another" )), // User can reattempt depending on how the current saga unfolds. - IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::ServiceUnavailable { - internal_message: "tried to attach floating IP mid-attach/detach".into() - }), + IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::unavail( + "tried to attach floating IP mid-attach/detach" + )), IpAttachState::Detached => {}, } Err(match collection.runtime_state.nexus_state { - state if SAFE_TRANSITORY_INSTANCE_STATES.contains(&state) => Error::ServiceUnavailable { - internal_message: "tried to attach floating IP while instance was changing state".into() - }, + state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail( + "tried to attach floating IP while instance was changing state" + ), state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { - if attached_count >= MAX_EXTERNAL_IPS_PER_INSTANCE as i64 + 1 { + if attached_count >= MAX_EXTERNAL_IPS_PLUS_SNAT as i64 { Error::invalid_request(&format!( "an instance may not have more than {} external IP addresses", MAX_EXTERNAL_IPS_PER_INSTANCE, @@ -739,47 +739,47 @@ impl DataStore { let eip = query.detach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) 
.await - .or_else(|e: DetachError| match e { + .map_err(|e: DetachError| match e { DetachError::CollectionNotFound => { - Err(Error::not_found_by_id( + Error::not_found_by_id( ResourceType::Instance, &instance_id, - )) + ) }, DetachError::ResourceNotFound => { - Err(Error::not_found_by_id( + Error::not_found_by_id( ResourceType::FloatingIp, &fip_id, - )) + ) }, DetachError::NoUpdate { resource, collection } => { match resource.state { - IpAttachState::Attached if resource.parent_id != Some(instance_id) => return Err(Error::invalid_request( + IpAttachState::Attached if resource.parent_id != Some(instance_id) => return Error::invalid_request( "Floating IP is not attached to the target instance", - )), + ), // TODO: should we just... let this one through? - IpAttachState::Detached => return Err(Error::invalid_request( + IpAttachState::Detached => return Error::invalid_request( "Floating IP is not attached to an instance", - )), + ), // User can reattempt depending on how the current saga unfolds. 
- IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::ServiceUnavailable { - internal_message: "tried to detach floating IP mid-attach/detach".into() - }), + IpAttachState::Attaching | IpAttachState::Detaching => return Error::unavail( + "tried to detach floating IP mid-attach/detach" + ), IpAttachState::Attached => {}, } - Err(match collection.runtime_state.nexus_state { - state if SAFE_TRANSITORY_INSTANCE_STATES.contains(&state) => Error::ServiceUnavailable { - internal_message: "tried to detach floating IP while instance was changing state".into() - }, + match collection.runtime_state.nexus_state { + state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail ( + "tried to detach floating IP while instance was changing state" + ), state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { Error::internal_error("failed to detach floating IP") }, state => Error::invalid_request(&format!("cannot detach floating IP to instance in {state} state")), - }) + } }, DetachError::DatabaseError(e) => { - Err(public_error_from_diesel(e, ErrorHandler::Server)) + public_error_from_diesel(e, ErrorHandler::Server) }, })?; diff --git a/nexus/db-queries/src/db/queries/external_ip.rs b/nexus/db-queries/src/db/queries/external_ip.rs index 53e0738b12..7520a34dc3 100644 --- a/nexus/db-queries/src/db/queries/external_ip.rs +++ b/nexus/db-queries/src/db/queries/external_ip.rs @@ -51,7 +51,7 @@ pub const SAFE_TO_ATTACH_INSTANCE_STATES: [DbInstanceState; 2] = [ // TODO: Currently stop if there's a migration or other state change. // This may be a good case for RPWing // external_ip_state -> { NAT RPW, sled-agent } in future. 
-pub const SAFE_TRANSITORY_INSTANCE_STATES: [DbInstanceState; 5] = [ +pub const SAFE_TRANSIENT_INSTANCE_STATES: [DbInstanceState; 5] = [ DbInstanceState(ApiInstanceState::Starting), DbInstanceState(ApiInstanceState::Stopping), DbInstanceState(ApiInstanceState::Creating), diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index f2c3eff7c5..a0fb217a4e 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -360,16 +360,13 @@ impl super::Nexus { // the request is retryable. if ips_of_interest .iter() - .find(|ip| { + .any(|ip| { must_all_be_attached && ip.state != IpAttachState::Attached }) - .is_some() { - return Err(Error::ServiceUnavailable { - internal_message: - "cannot push all DPD state: IP attach/detach in progress" - .into(), - }); + return Err(Error::unavail( + "cannot push all DPD state: IP attach/detach in progress", + )); } let sled_address = diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index 86e8e35a39..b325a2da7a 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -14,7 +14,7 @@ use nexus_db_model::{ }; use nexus_db_queries::authz; use nexus_db_queries::db::lookup::LookupPath; -use nexus_db_queries::db::queries::external_ip::SAFE_TRANSITORY_INSTANCE_STATES; +use nexus_db_queries::db::queries::external_ip::SAFE_TRANSIENT_INSTANCE_STATES; use nexus_db_queries::{authn, context::OpContext, db, db::DataStore}; use omicron_common::api::external::Error; use omicron_common::api::external::InstanceState; @@ -226,7 +226,7 @@ pub async fn instance_ip_get_instance_state( InstanceState::Stopped } InstanceState::Running => InstanceState::Running, - state if SAFE_TRANSITORY_INSTANCE_STATES.contains(&state.into()) => { + state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state.into()) => { return Err(ActionError::action_failed(Error::unavail(&format!( "can't {verb} in transient state {state}" )))) diff --git 
a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 088e94f197..60f303470a 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -666,7 +666,14 @@ async fn sic_allocate_instance_external_ip( .await .map_err(ActionError::action_failed)?; - Ok(Some(ip)) + if n_rows != 1 { + Err(ActionError::action_failed(Error::internal_error(&format!( + "failed to completely attach ip address {}", + ip.id + )))) + } else { + Ok(Some(ip)) + } } async fn sic_allocate_instance_external_ip_undo( @@ -725,6 +732,15 @@ async fn sic_allocate_instance_external_ip_undo( ) .await .map_err(ActionError::action_failed)?; + + if n_rows != 1 { + let id = ip.id; + error!( + osagactx.log(), + "sic_allocate_instance_external_ip_undo: failed to \ + completely detach ip {id}" + ); + } } } Ok(()) diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 61bf2980a2..92759cbc9a 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -219,8 +219,9 @@ async fn siia_complete_attach( (false, InstanceState::Stopped) | (true, _) => { target_ip.try_into().map_err(ActionError::action_failed) } - _ => Err(Error::internal_error("failed to complete IP attach")) - .map_err(ActionError::action_failed), + _ => Err(ActionError::action_failed(Error::internal_error( + "failed to complete IP attach", + ))), } } diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 7715ec19a0..7cce0e1c7f 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -11,9 +11,7 @@ use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; use crate::external_api::params; -use futures::TryFutureExt; use nexus_db_model::{ExternalIp, IpAttachState, IpKind}; -use 
nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; use omicron_common::api::external::{Error, InstanceState}; @@ -228,8 +226,9 @@ async fn siid_complete_attach( (false, InstanceState::Stopped) | (true, _) => { target_ip.try_into().map_err(ActionError::action_failed) } - _ => Err(Error::internal_error("failed to complete IP detach")) - .map_err(ActionError::action_failed), + _ => Err(ActionError::action_failed(Error::internal_error( + "failed to complete IP detach", + ))), } } From 0001eb3073fcf8cb0cf911cfd45dcbcd303319a7 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 22 Dec 2023 19:38:45 +0000 Subject: [PATCH 17/56] WIP test fixes, resume the quest for idempotency --- .../src/db/datastore/external_ip.rs | 33 +++++++++++-------- nexus/db-queries/src/db/datastore/mod.rs | 20 +++++++++-- .../db-queries/src/db/queries/external_ip.rs | 4 +-- nexus/src/app/sagas/instance_create.rs | 17 ++++------ 4 files changed, 45 insertions(+), 29 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 8da40440eb..b021de9580 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -417,7 +417,7 @@ impl DataStore { "Ephemeral IP is not attached to the target instance", ), // User can reattempt depending on how the current saga unfolds. 
- IpAttachState::Attaching | IpAttachState::Detaching => return Error::unavail ( + IpAttachState::Attaching | IpAttachState::Detaching => return Error::unavail( "tried to detach ephemeral IP mid-attach/detach" ), IpAttachState::Attached => {}, @@ -427,7 +427,7 @@ impl DataStore { } match collection.runtime_state.nexus_state { - state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail ( + state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail( "tried to detach ephemeral IP while instance was changing state" ), state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { @@ -739,7 +739,7 @@ impl DataStore { let eip = query.detach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) .await - .map_err(|e: DetachError| match e { + .or_else(|e: DetachError| Err(match e { DetachError::CollectionNotFound => { Error::not_found_by_id( ResourceType::Instance, @@ -753,23 +753,23 @@ impl DataStore { ) }, DetachError::NoUpdate { resource, collection } => { + let parent_match = resource.parent_id == Some(instance_id); match resource.state { - IpAttachState::Attached if resource.parent_id != Some(instance_id) => return Error::invalid_request( + // Idempotent cases: already detached OR detaching from same instance. + IpAttachState::Detached => return Ok(resource), + IpAttachState::Detaching if parent_match => return Ok(resource), + IpAttachState::Attached if !parent_match => return Err(Error::invalid_request( "Floating IP is not attached to the target instance", - ), - // TODO: should we just... let this one through? - IpAttachState::Detached => return Error::invalid_request( - "Floating IP is not attached to an instance", - ), + )), // User can reattempt depending on how the current saga unfolds. 
- IpAttachState::Attaching | IpAttachState::Detaching => return Error::unavail( + IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::unavail( "tried to detach floating IP mid-attach/detach" - ), + )), IpAttachState::Attached => {}, } match collection.runtime_state.nexus_state { - state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail ( + state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail( "tried to detach floating IP while instance was changing state" ), state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { @@ -782,14 +782,19 @@ impl DataStore { public_error_from_diesel(e, ErrorHandler::Server) }, - })?; + }))?; Ok(eip) } /// Move an external IP from a transitional state (attaching, detaching) /// to its intended end state. - // FIXME: what do do in case of undo? + /// + /// Returns the number of rows modified, this may be zero on: + /// - instance delete by another saga + /// - saga action rerun + /// + /// This is valid in both cases for idempotency. pub async fn external_ip_complete_op( &self, opctx: &OpContext, diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index b08310c0d7..2fe6bcefb2 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -408,6 +408,7 @@ mod test { use chrono::{Duration, Utc}; use futures::stream; use futures::StreamExt; + use nexus_db_model::IpAttachState; use nexus_test_utils::db::test_setup_database; use nexus_types::external_api::params; use omicron_common::api::external::DataPageParams; @@ -1653,7 +1654,8 @@ mod test { // Create a few records. 
let now = Utc::now(); let instance_id = Uuid::new_v4(); - let ips = (0..4) + let kinds = [IpKind::SNat, IpKind::Ephemeral]; + let ips = (0..2) .map(|i| ExternalIp { id: Uuid::new_v4(), name: None, @@ -1666,7 +1668,7 @@ mod test { project_id: None, is_service: false, parent_id: Some(instance_id), - kind: IpKind::Ephemeral, + kind: kinds[i as usize], ip: ipnetwork::IpNetwork::from(IpAddr::from(Ipv4Addr::new( 10, 0, 0, i, ))), @@ -1813,6 +1815,7 @@ mod test { // - description // - parent (instance / service) UUID // - project UUID + // - attach state let names = [None, Some("foo")]; let descriptions = [None, Some("foo".to_string())]; let parent_ids = [None, Some(Uuid::new_v4())]; @@ -1853,6 +1856,12 @@ mod test { continue; } + let state = if parent_id.is_some() { + IpAttachState::Attached + } else { + IpAttachState::Detached + }; + let new_ip = ExternalIp { id: Uuid::new_v4(), name: name_local.clone(), @@ -1861,6 +1870,7 @@ mod test { is_service, parent_id: *parent_id, project_id: *project_id, + state, ..ip }; @@ -1933,6 +1943,11 @@ mod test { let name_local = name.map(|v| { db::model::Name(Name::try_from(v.to_string()).unwrap()) }); + let state = if parent_id.is_some() { + IpAttachState::Attached + } else { + IpAttachState::Detached + }; let new_ip = ExternalIp { id: Uuid::new_v4(), name: name_local, @@ -1942,6 +1957,7 @@ mod test { is_service, parent_id: *parent_id, project_id: *project_id, + state, ..ip }; let res = diesel::insert_into(dsl::external_ip) diff --git a/nexus/db-queries/src/db/queries/external_ip.rs b/nexus/db-queries/src/db/queries/external_ip.rs index 7520a34dc3..415750165b 100644 --- a/nexus/db-queries/src/db/queries/external_ip.rs +++ b/nexus/db-queries/src/db/queries/external_ip.rs @@ -1811,11 +1811,11 @@ mod tests { context.create_ip_pool("p1", second_range, /* default */ false).await; // Allocate all available addresses in the second pool. 
- let instance_id = Uuid::new_v4(); let pool_name = Some(Name("p1".parse().unwrap())); let first_octet = first_address.octets()[3]; let last_octet = last_address.octets()[3]; for octet in first_octet..=last_octet { + let instance_id = Uuid::new_v4(); let ip = context .db_datastore .allocate_instance_ephemeral_ip( @@ -1840,7 +1840,7 @@ mod tests { .allocate_instance_ephemeral_ip( &context.opctx, Uuid::new_v4(), - instance_id, + Uuid::new_v4(), pool_name, ) .await diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 60f303470a..779fa140a3 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -655,7 +655,10 @@ async fn sic_allocate_instance_external_ip( } }; - let n_rows = datastore + // Ignore row count here, this is infallible with correct + // (state, state', kind) but may be zero on repeat call for + // idempotency. + _ = datastore .external_ip_complete_op( &opctx, ip.id, @@ -666,14 +669,7 @@ async fn sic_allocate_instance_external_ip( .await .map_err(ActionError::action_failed)?; - if n_rows != 1 { - Err(ActionError::action_failed(Error::internal_error(&format!( - "failed to completely attach ip address {}", - ip.id - )))) - } else { - Ok(Some(ip)) - } + Ok(Some(ip)) } async fn sic_allocate_instance_external_ip_undo( @@ -734,11 +730,10 @@ async fn sic_allocate_instance_external_ip_undo( .map_err(ActionError::action_failed)?; if n_rows != 1 { - let id = ip.id; error!( osagactx.log(), "sic_allocate_instance_external_ip_undo: failed to \ - completely detach ip {id}" + completely detach ip {}", ip.id ); } } From 89ddffae6f22c0775dd98cecff687a502bd4195c Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 22 Dec 2023 20:34:20 +0000 Subject: [PATCH 18/56] Large block comment for myself/future historians --- nexus/src/app/sagas/instance_ip_attach.rs | 144 +++++++++++++++++----- nexus/src/app/sagas/instance_ip_detach.rs | 33 +++-- 2 files changed, 126 insertions(+), 51 
deletions(-) diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 92759cbc9a..8a6841b339 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -14,27 +14,26 @@ use crate::external_api::params; use nexus_db_model::{ExternalIp, IpAttachState}; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; -use omicron_common::api::external::{Error, InstanceState}; use serde::Deserialize; use serde::Serialize; use steno::ActionError; use uuid::Uuid; -// TODO: explain in-depth here how locking works in practice for -// attach and detach wrt create/stop/start. - -// rough sequence of evts: -// - take temp ownership of instance while interacting w/ sled agent -// -> mark instance migration id as Some(0) if None -// - Attach+alloc EIP to instance -// - Register routes -// -> ensure_dpd... -// -> must precede OPTE: host may change its sending -// behaviour prematurely -// - Register addr in OPTE -// -> Put addr in sled-agent endpoint -// - free up migration_id of instance. -// -> mark instance migration id as None +// The IP attach/detach sagas do some resource locking -- because we +// allow them to be called in [Running, Stopped], they must contend +// with each other/themselves, instance start, instance delete, and +// the instance stop action (noting the latter is not a saga). +// +// The main means of access control here is an external IP's `state`. +// Entering either saga begins with an atomic swap from Attached/Detached +// to Attaching/Detaching. This prevents concurrent attach/detach on the +// same EIP, and prevents instance start from executing with an +// Error::unavail. +// +// Overlap with stop is handled by treating comms failures with +// sled-agent as temporary errors and unwinding. For the delete case, we +// allow the attach/detach completion to have a missing record.
+// See `instance_common::instance_ip_get_instance_state` for more info. declare_saga_actions! { instance_ip_attach; @@ -199,30 +198,25 @@ async fn siia_update_opte_undo( async fn siia_complete_attach( sagactx: NexusActionContext, ) -> Result { + let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; - let initial_state = - sagactx.lookup::("instance_state")?.state; let target_ip = sagactx.lookup::("target_ip")?; - let update_occurred = instance_ip_move_state( + if !instance_ip_move_state( &sagactx, ¶ms.serialized_authn, IpAttachState::Attaching, IpAttachState::Attached, ) - .await?; - - // TODO: explain why it is safe to not back out on state change. - match (update_occurred, initial_state) { - // Allow failure here on stopped because the instance_delete saga - // may have been concurrently fired off and removed the row. - (false, InstanceState::Stopped) | (true, _) => { - target_ip.try_into().map_err(ActionError::action_failed) - } - _ => Err(ActionError::action_failed(Error::internal_error( - "failed to complete IP attach", - ))), + .await? + { + warn!( + log, + "siia_complete_attach: external IP was deleted or call was idempotent" + ) } + + target_ip.try_into().map_err(ActionError::action_failed) } // TODO: backout changes if run state changed illegally? 
@@ -252,16 +246,102 @@ impl NexusSaga for SagaInstanceIpAttach { #[cfg(test)] pub(crate) mod test { + use crate::app::sagas::test_helpers; + use super::*; + use dropshot::test_util::ClientTestContext; + use nexus_db_model::Name; + use nexus_db_queries::context::OpContext; + use nexus_test_utils::resource_helpers::{populate_ip_pool, create_project, create_disk, create_floating_ip, object_create}; use nexus_test_utils_macros::nexus_test; + use omicron_common::api::external::{SimpleIdentity, IdentityMetadataCreateParams}; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; + const PROJECT_NAME: &str = "cafe"; + const INSTANCE_NAME: &str = "menu"; + const FIP_NAME: &str = "affogato"; + const DISK_NAME: &str = "my-disk"; + + // Test matrix: + // - instance started/stopped + // - fip vs ephemeral + + // async fn create_instance( + // client: &ClientTestContext, + // ) -> omicron_common::api::external::Instance { + // let instances_url = format!("/v1/instances?project={}", PROJECT_NAME); + // object_create( + // client, + // &instances_url, + // ¶ms::InstanceCreate { + // identity: IdentityMetadataCreateParams { + // name: INSTANCE_NAME.parse().unwrap(), + // description: format!("instance {:?}", INSTANCE_NAME), + // }, + // ncpus: InstanceCpuCount(2), + // memory: ByteCount::from_gibibytes_u32(2), + // hostname: String::from(INSTANCE_NAME), + // user_data: b"#cloud-config".to_vec(), + // network_interfaces: + // params::InstanceNetworkInterfaceAttachment::None, + // external_ips: vec![], + // disks: vec![], + // start: false, + // }, + // ) + // .await + // } + + pub async fn ip_manip_test_setup(client: &ClientTestContext) -> Uuid { + populate_ip_pool(&client, "default", None).await; + let project = create_project(client, PROJECT_NAME).await; + create_disk(&client, PROJECT_NAME, DISK_NAME).await; + create_floating_ip( + client, + FIP_NAME, + &project.identity.id.to_string(), + None, + None, + ) + .await; + + + project.id() + } + + async fn 
new_test_params(opctx: &OpContext, datastore: &db::DataStore, project_id: Uuid, use_floating: bool) -> Params { + let create_params = if use_floating { + params::ExternalIpCreate::Floating { floating_ip_name: FIP_NAME.parse().unwrap() } + } else { + params::ExternalIpCreate::Ephemeral { pool_name: None } + }; + + let (.., authz_instance) = LookupPath::new(opctx, datastore).project_id(project_id) + .instance_name(&Name(INSTANCE_NAME.parse().unwrap())).lookup_for(authz::Action::Modify).await.unwrap(); + Params { + serialized_authn: authn::saga::Serialized::for_opctx(opctx), + project_id, + create_params, + authz_instance, + } + } + #[nexus_test(server = crate::Server)] async fn test_saga_basic_usage_succeeds( - _cptestctx: &ControlPlaneTestContext, + cptestctx: &ControlPlaneTestContext, ) { + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.apictx().nexus; + let opctx = test_helpers::test_opctx(cptestctx); + let instance = create_instance(client).await; + let db_instance = + test_helpers::instance_fetch(cptestctx, instance.identity.id) + .await + .instance() + .clone(); + let project_id = ip_manip_test_setup(&client); todo!() } diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 7cce0e1c7f..3db0e92700 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -55,8 +55,8 @@ declare_saga_actions! 
{ - siid_update_opte_undo } - COMPLETE_ATTACH -> "output" { - + siid_complete_attach + COMPLETE_DETACH -> "output" { + + siid_complete_detach } } @@ -203,33 +203,28 @@ async fn siid_update_opte_undo( Ok(()) } -async fn siid_complete_attach( +async fn siid_complete_detach( sagactx: NexusActionContext, ) -> Result { + let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; - let initial_state = - sagactx.lookup::("instance_state")?.state; let target_ip = sagactx.lookup::("target_ip")?; - let update_occurred = instance_ip_move_state( + if !instance_ip_move_state( &sagactx, ¶ms.serialized_authn, IpAttachState::Detaching, IpAttachState::Detached, ) - .await?; - - // TODO: explain why it is safe to not back out on state change. - match (update_occurred, initial_state) { - // Allow failure here on stopped because the instance_delete saga - // may have been concurrently fired off and removed the row. - (false, InstanceState::Stopped) | (true, _) => { - target_ip.try_into().map_err(ActionError::action_failed) - } - _ => Err(ActionError::action_failed(Error::internal_error( - "failed to complete IP detach", - ))), + .await? + { + warn!( + log, + "siia_complete_attach: external IP was deleted or call was idempotent" + ) } + + target_ip.try_into().map_err(ActionError::action_failed) } #[derive(Debug)] @@ -250,7 +245,7 @@ impl NexusSaga for SagaInstanceIpDetach { builder.append(instance_state_action()); builder.append(remove_nat_action()); builder.append(remove_opte_port_action()); - builder.append(complete_attach_action()); + builder.append(complete_detach_action()); Ok(builder.build()?) } } From 9db79b903e5fb2e6d2dfb5b4d04641a4e9244c26 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 27 Dec 2023 16:12:53 +0000 Subject: [PATCH 19/56] Test harness progress Next up: putting into action my thoughts on improved idempotency. 
--- nexus/src/app/instance_network.rs | 9 +- nexus/src/app/sagas/instance_common.rs | 4 +- nexus/src/app/sagas/instance_create.rs | 3 +- nexus/src/app/sagas/instance_ip_attach.rs | 215 ++++++++++++++------ nexus/src/app/sagas/instance_ip_detach.rs | 228 +++++++++++++++++++++- nexus/test-utils/src/resource_helpers.rs | 4 +- sled-agent/src/params.rs | 4 +- sled-agent/src/sim/http_entrypoints.rs | 39 +++- sled-agent/src/sim/sled_agent.rs | 67 ++++++- 9 files changed, 488 insertions(+), 85 deletions(-) diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index a0fb217a4e..ca45025b5e 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -358,12 +358,9 @@ impl super::Nexus { // This is performed so that an IP attach/detach will block the // instance_start saga. Return service unavailable to indicate // the request is retryable. - if ips_of_interest - .iter() - .any(|ip| { - must_all_be_attached && ip.state != IpAttachState::Attached - }) - { + if ips_of_interest.iter().any(|ip| { + must_all_be_attached && ip.state != IpAttachState::Attached + }) { return Err(Error::unavail( "cannot push all DPD state: IP attach/detach in progress", )); diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index b325a2da7a..c94aea8fb3 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -362,7 +362,7 @@ pub async fn instance_ip_remove_opte( ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - // If we didn't push OPTE before, don't undo it. + // No physical sled? Don't inform OPTE. let Some(sled_uuid) = sagactx.lookup::("instance_state")?.sled_id else { @@ -382,7 +382,7 @@ pub async fn instance_ip_remove_opte( "sled agent client went away mid-attach", )) })? 
- .instance_put_external_ip(&authz_instance.id(), &sled_agent_body) + .instance_delete_external_ip(&authz_instance.id(), &sled_agent_body) .await .map_err(|e| { ActionError::action_failed(match e { diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 779fa140a3..d921275402 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -733,7 +733,8 @@ async fn sic_allocate_instance_external_ip_undo( error!( osagactx.log(), "sic_allocate_instance_external_ip_undo: failed to \ - completely detach ip {}", ip.id + completely detach ip {}", + ip.id ); } } diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 8a6841b339..2e877cc065 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -246,15 +246,28 @@ impl NexusSaga for SagaInstanceIpAttach { #[cfg(test)] pub(crate) mod test { - use crate::app::sagas::test_helpers; - use super::*; + use crate::app::{ + saga::create_saga_dag, + sagas::test_helpers::{self, instance_simulate}, + }; + use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; + use diesel::{ + ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, + }; use dropshot::test_util::ClientTestContext; - use nexus_db_model::Name; + use nexus_db_model::{IpKind, Name}; use nexus_db_queries::context::OpContext; - use nexus_test_utils::resource_helpers::{populate_ip_pool, create_project, create_disk, create_floating_ip, object_create}; + use nexus_test_utils::resource_helpers::{ + create_disk, create_floating_ip, create_instance, create_project, + object_create, populate_ip_pool, + }; use nexus_test_utils_macros::nexus_test; - use omicron_common::api::external::{SimpleIdentity, IdentityMetadataCreateParams}; + use nexus_types::external_api::params::ExternalIpCreate; + use omicron_common::api::external::{ + ByteCount, IdentityMetadataCreateParams, InstanceCpuCount, + 
SimpleIdentity, + }; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -262,42 +275,10 @@ pub(crate) mod test { const PROJECT_NAME: &str = "cafe"; const INSTANCE_NAME: &str = "menu"; const FIP_NAME: &str = "affogato"; - const DISK_NAME: &str = "my-disk"; - - // Test matrix: - // - instance started/stopped - // - fip vs ephemeral - - // async fn create_instance( - // client: &ClientTestContext, - // ) -> omicron_common::api::external::Instance { - // let instances_url = format!("/v1/instances?project={}", PROJECT_NAME); - // object_create( - // client, - // &instances_url, - // ¶ms::InstanceCreate { - // identity: IdentityMetadataCreateParams { - // name: INSTANCE_NAME.parse().unwrap(), - // description: format!("instance {:?}", INSTANCE_NAME), - // }, - // ncpus: InstanceCpuCount(2), - // memory: ByteCount::from_gibibytes_u32(2), - // hostname: String::from(INSTANCE_NAME), - // user_data: b"#cloud-config".to_vec(), - // network_interfaces: - // params::InstanceNetworkInterfaceAttachment::None, - // external_ips: vec![], - // disks: vec![], - // start: false, - // }, - // ) - // .await - // } pub async fn ip_manip_test_setup(client: &ClientTestContext) -> Uuid { populate_ip_pool(&client, "default", None).await; let project = create_project(client, PROJECT_NAME).await; - create_disk(&client, PROJECT_NAME, DISK_NAME).await; create_floating_ip( client, FIP_NAME, @@ -307,22 +288,33 @@ pub(crate) mod test { ) .await; - project.id() } - async fn new_test_params(opctx: &OpContext, datastore: &db::DataStore, project_id: Uuid, use_floating: bool) -> Params { + pub async fn new_test_params( + opctx: &OpContext, + datastore: &db::DataStore, + use_floating: bool, + ) -> Params { let create_params = if use_floating { - params::ExternalIpCreate::Floating { floating_ip_name: FIP_NAME.parse().unwrap() } + params::ExternalIpCreate::Floating { + floating_ip_name: FIP_NAME.parse().unwrap(), + } } else { params::ExternalIpCreate::Ephemeral { pool_name: None } 
}; - let (.., authz_instance) = LookupPath::new(opctx, datastore).project_id(project_id) - .instance_name(&Name(INSTANCE_NAME.parse().unwrap())).lookup_for(authz::Action::Modify).await.unwrap(); + let (.., authz_project, authz_instance) = + LookupPath::new(opctx, datastore) + .project_name(&Name(PROJECT_NAME.parse().unwrap())) + .instance_name(&Name(INSTANCE_NAME.parse().unwrap())) + .lookup_for(authz::Action::Modify) + .await + .unwrap(); + Params { serialized_authn: authn::saga::Serialized::for_opctx(opctx), - project_id, + project_id: authz_project.id(), create_params, authz_instance, } @@ -333,36 +325,143 @@ pub(crate) mod test { cptestctx: &ControlPlaneTestContext, ) { let client = &cptestctx.external_client; - let nexus = &cptestctx.server.apictx().nexus; + let apictx = &cptestctx.server.apictx(); + let nexus = &apictx.nexus; + let opctx = test_helpers::test_opctx(cptestctx); - let instance = create_instance(client).await; - let db_instance = - test_helpers::instance_fetch(cptestctx, instance.identity.id) - .await - .instance() - .clone(); - let project_id = ip_manip_test_setup(&client); - todo!() + let datastore = &nexus.db_datastore; + let project_id = ip_manip_test_setup(&client).await; + let _instance = + create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; + + for use_float in [false, true] { + let params = new_test_params(&opctx, datastore, use_float).await; + + let dag = create_saga_dag::(params).unwrap(); + let saga = nexus.create_runnable_saga(dag).await.unwrap(); + nexus.run_saga(saga).await.expect("Attach saga should succeed"); + + // TODO: assert sled agent, dpd happy, ... 
+ } + } + + pub(crate) async fn verify_clean_slate( + cptestctx: &ControlPlaneTestContext, + instance_id: Uuid, + ) { + use nexus_db_queries::db::schema::external_ip::dsl; + + let sled_agent = &cptestctx.sled_agent.sled_agent; + let datastore = cptestctx.server.apictx().nexus.datastore(); + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + // No Floating IPs exist in states other than 'detached'. + assert!(dsl::external_ip + .filter(dsl::kind.eq(IpKind::Floating)) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::parent_id.eq(instance_id)) + .filter(dsl::state.ne(IpAttachState::Detached)) + .select(ExternalIp::as_select()) + .first_async::(&*conn,) + .await + .optional() + .unwrap() + .is_none()); + + // All ephemeral IPs are removed. + assert!(dsl::external_ip + .filter(dsl::kind.eq(IpKind::Ephemeral)) + .filter(dsl::time_deleted.is_null()) + .select(ExternalIp::as_select()) + .first_async::(&*conn,) + .await + .optional() + .unwrap() + .is_none()); + + // No IP bindings remain on sled-agent. 
+ let eips = &*sled_agent.external_ips.lock().await; + for (_nic_id, eip_set) in eips { + assert!(eip_set.is_empty()); + } } #[nexus_test(server = crate::Server)] async fn test_action_failure_can_unwind( - _cptestctx: &ControlPlaneTestContext, + cptestctx: &ControlPlaneTestContext, ) { - todo!() + let log = &cptestctx.logctx.log; + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.apictx(); + let nexus = &apictx.nexus; + + let opctx = test_helpers::test_opctx(cptestctx); + let datastore = &nexus.db_datastore; + let project_id = ip_manip_test_setup(&client).await; + let instance = + create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; + + for use_float in [false, true] { + test_helpers::action_failure_can_unwind::( + nexus, + || Box::pin(new_test_params(&opctx, datastore, use_float) ), + || Box::pin(verify_clean_slate(&cptestctx, instance.id())), + log, + ) + .await; + } } #[nexus_test(server = crate::Server)] async fn test_action_failure_can_unwind_idempotently( - _cptestctx: &ControlPlaneTestContext, + cptestctx: &ControlPlaneTestContext, ) { - todo!() + let log = &cptestctx.logctx.log; + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.apictx(); + let nexus = &apictx.nexus; + + let opctx = test_helpers::test_opctx(cptestctx); + let datastore = &nexus.db_datastore; + let project_id = ip_manip_test_setup(&client).await; + let instance = + create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; + + for use_float in [false, true] { + test_helpers::action_failure_can_unwind_idempotently::< + SagaInstanceIpAttach, + _, + _, + >( + nexus, + || Box::pin(new_test_params(&opctx, datastore, use_float)), + || Box::pin(verify_clean_slate(&cptestctx, instance.id())), + log, + ) + .await; + } } #[nexus_test(server = crate::Server)] async fn test_actions_succeed_idempotently( - _cptestctx: &ControlPlaneTestContext, + cptestctx: &ControlPlaneTestContext, ) { - todo!() + let log = &cptestctx.logctx.log; + let client = 
&cptestctx.external_client; + let apictx = &cptestctx.server.apictx(); + let nexus = &apictx.nexus; + + let opctx = test_helpers::test_opctx(cptestctx); + let datastore = &nexus.db_datastore; + let project_id = ip_manip_test_setup(&client).await; + let _instance = + create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; + + for use_float in [false, true] { + let params = new_test_params(&opctx, datastore, use_float).await; + let dag = create_saga_dag::(params).unwrap(); + test_helpers::actions_succeed_idempotently(nexus, dag).await; + } } } diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 3db0e92700..46edca1d0c 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -252,37 +252,247 @@ impl NexusSaga for SagaInstanceIpDetach { #[cfg(test)] pub(crate) mod test { - + use super::*; + use crate::{ + app::{ + saga::create_saga_dag, + sagas::{ + instance_ip_attach::{self, test::ip_manip_test_setup}, + test_helpers, + }, + }, + Nexus, + }; + use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; + use diesel::{ + ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, + }; + use nexus_db_model::Name; + use nexus_db_queries::context::OpContext; + use nexus_test_utils::resource_helpers::create_instance; use nexus_test_utils_macros::nexus_test; + use omicron_common::api::external::SimpleIdentity; + use std::sync::Arc; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; + const PROJECT_NAME: &str = "cafe"; + const INSTANCE_NAME: &str = "menu"; + const FIP_NAME: &str = "affogato"; + + async fn new_test_params( + opctx: &OpContext, + datastore: &db::DataStore, + use_floating: bool, + ) -> Params { + let delete_params = if use_floating { + params::ExternalIpDelete::Floating { + floating_ip_name: FIP_NAME.parse().unwrap(), + } + } else { + params::ExternalIpDelete::Ephemeral + }; + + let (.., authz_project, authz_instance) = + 
LookupPath::new(opctx, datastore) + .project_name(&Name(PROJECT_NAME.parse().unwrap())) + .instance_name(&Name(INSTANCE_NAME.parse().unwrap())) + .lookup_for(authz::Action::Modify) + .await + .unwrap(); + + Params { + serialized_authn: authn::saga::Serialized::for_opctx(opctx), + project_id: authz_project.id(), + delete_params, + authz_instance, + } + } + + async fn attach_instance_ips(nexus: &Arc<Nexus>, opctx: &OpContext) { + let datastore = &nexus.db_datastore; + + let proj_name = Name(PROJECT_NAME.parse().unwrap()); + let inst_name = Name(INSTANCE_NAME.parse().unwrap()); + let lookup = LookupPath::new(opctx, datastore) + .project_name(&proj_name) + .instance_name(&inst_name); + + for use_float in [false, true] { + let params = instance_ip_attach::test::new_test_params( + opctx, datastore, use_float, + ) + .await; + nexus + .instance_attach_external_ip( + opctx, + &lookup, + &params.create_params, + ) + .await + .unwrap(); + } + } + #[nexus_test(server = crate::Server)] async fn test_saga_basic_usage_succeeds( - _cptestctx: &ControlPlaneTestContext, + cptestctx: &ControlPlaneTestContext, ) { - todo!() + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.apictx(); + let nexus = &apictx.nexus; + + let opctx = test_helpers::test_opctx(cptestctx); + let datastore = &nexus.db_datastore; + let _ = ip_manip_test_setup(&client).await; + let _instance = + create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; + + attach_instance_ips(nexus, &opctx).await; + + for use_float in [false, true] { + let params = new_test_params(&opctx, datastore, use_float).await; + + let dag = create_saga_dag::<SagaInstanceIpDetach>(params).unwrap(); + let saga = nexus.create_runnable_saga(dag).await.unwrap(); + nexus.run_saga(saga).await.expect("Detach saga should succeed"); + } + } + + pub(crate) async fn verify_clean_slate( + cptestctx: &ControlPlaneTestContext, + instance_id: Uuid, + ) { + use nexus_db_queries::db::schema::external_ip::dsl; + + let opctx = test_helpers::test_opctx(cptestctx); + 
let sled_agent = &cptestctx.sled_agent.sled_agent; + let datastore = cptestctx.server.apictx().nexus.datastore(); + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + // No IPs in transitional states w/ current instance. + assert!(dsl::external_ip + .filter(dsl::time_deleted.is_null()) + .filter(dsl::parent_id.eq(instance_id)) + .filter(dsl::state.ne(IpAttachState::Attached)) + .select(ExternalIp::as_select()) + .first_async::(&*conn,) + .await + .optional() + .unwrap() + .is_none()); + + // No external IPs in detached state. + assert!(dsl::external_ip + .filter(dsl::time_deleted.is_null()) + .filter(dsl::state.eq(IpAttachState::Detached)) + .select(ExternalIp::as_select()) + .first_async::(&*conn,) + .await + .optional() + .unwrap() + .is_none()); + + // Instance still has one Ephemeral IP, and one Floating IP. + let db_eips = datastore + .instance_lookup_external_ips(&opctx, instance_id) + .await + .unwrap(); + assert_eq!(db_eips.len(), 3); + assert!(db_eips.iter().find(|v| v.kind == IpKind::Ephemeral).is_some()); + assert!(db_eips.iter().find(|v| v.kind == IpKind::Floating).is_some()); + assert!(db_eips.iter().find(|v| v.kind == IpKind::SNat).is_some()); + + // No IP bindings remain on sled-agent. 
+ let eips = &*sled_agent.external_ips.lock().await; + for (_nic_id, eip_set) in eips { + assert_eq!(eip_set.len(), 2); + } } #[nexus_test(server = crate::Server)] async fn test_action_failure_can_unwind( - _cptestctx: &ControlPlaneTestContext, + cptestctx: &ControlPlaneTestContext, ) { - todo!() + let log = &cptestctx.logctx.log; + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.apictx(); + let nexus = &apictx.nexus; + + let opctx = test_helpers::test_opctx(cptestctx); + let datastore = &nexus.db_datastore; + let project_id = ip_manip_test_setup(&client).await; + let instance = + create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; + + attach_instance_ips(nexus, &opctx).await; + + for use_float in [false, true] { + test_helpers::action_failure_can_unwind::( + nexus, + || Box::pin(new_test_params(&opctx, datastore, use_float) ), + || Box::pin(verify_clean_slate(&cptestctx, instance.id())), + log, + ) + .await; + } } #[nexus_test(server = crate::Server)] async fn test_action_failure_can_unwind_idempotently( - _cptestctx: &ControlPlaneTestContext, + cptestctx: &ControlPlaneTestContext, ) { - todo!() + let log = &cptestctx.logctx.log; + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.apictx(); + let nexus = &apictx.nexus; + + let opctx = test_helpers::test_opctx(cptestctx); + let datastore = &nexus.db_datastore; + let project_id = ip_manip_test_setup(&client).await; + let instance = + create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; + + attach_instance_ips(nexus, &opctx).await; + + for use_float in [false, true] { + test_helpers::action_failure_can_unwind_idempotently::< + SagaInstanceIpDetach, + _, + _, + >( + nexus, + || Box::pin(new_test_params(&opctx, datastore, use_float)), + || Box::pin(verify_clean_slate(&cptestctx, instance.id())), + log, + ) + .await; + } } #[nexus_test(server = crate::Server)] async fn test_actions_succeed_idempotently( - _cptestctx: &ControlPlaneTestContext, + cptestctx: 
&ControlPlaneTestContext, ) { - todo!() + let log = &cptestctx.logctx.log; + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.apictx(); + let nexus = &apictx.nexus; + + let opctx = test_helpers::test_opctx(cptestctx); + let datastore = &nexus.db_datastore; + let project_id = ip_manip_test_setup(&client).await; + let _instance = + create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; + + attach_instance_ips(nexus, &opctx).await; + + for use_float in [false, true] { + let params = new_test_params(&opctx, datastore, use_float).await; + let dag = create_saga_dag::(params).unwrap(); + test_helpers::actions_succeed_idempotently(nexus, dag).await; + } } } diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index c72c7ad780..f564d73119 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -68,8 +68,8 @@ where .authn_as(AuthnMode::PrivilegedUser) .execute() .await - .unwrap_or_else(|_| { - panic!("failed to make \"create\" request to {path}") + .unwrap_or_else(|e| { + panic!("failed to make \"create\" request to {path}: {e}") }) .parsed_body() .unwrap() diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index e5e1b82977..2263aa725d 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -827,7 +827,9 @@ pub struct CleanupContextUpdate { } /// Used to dynamically update external IPs attached to an instance. 
-#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +#[derive( + Copy, Clone, Debug, Eq, PartialEq, Hash, Deserialize, JsonSchema, Serialize, +)] #[serde(rename_all = "snake_case", tag = "type", content = "value")] pub enum InstanceExternalIpBody { Ephemeral(IpAddr), diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index f77da11b0e..d533db3252 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -8,8 +8,9 @@ use crate::bootstrap::early_networking::{ EarlyNetworkConfig, EarlyNetworkConfigBody, }; use crate::params::{ - DiskEnsureBody, InstanceEnsureBody, InstancePutMigrationIdsBody, - InstancePutStateBody, InstancePutStateResponse, InstanceUnregisterResponse, + DiskEnsureBody, InstanceEnsureBody, InstanceExternalIpBody, + InstancePutMigrationIdsBody, InstancePutStateBody, + InstancePutStateResponse, InstanceUnregisterResponse, VpcFirewallRulesEnsureBody, }; use dropshot::endpoint; @@ -45,6 +46,8 @@ pub fn api() -> SledApiDescription { api.register(instance_put_state)?; api.register(instance_register)?; api.register(instance_unregister)?; + api.register(instance_put_external_ip)?; + api.register(instance_delete_external_ip)?; api.register(instance_poke_post)?; api.register(disk_put)?; api.register(disk_poke_post)?; @@ -149,6 +152,38 @@ async fn instance_put_migration_ids( )) } +#[endpoint { + method = PUT, + path = "/instances/{instance_id}/external-ip", +}] +async fn instance_put_external_ip( + rqctx: RequestContext>, + path_params: Path, + body: TypedBody, +) -> Result { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + let body_args = body.into_inner(); + sa.instance_put_external_ip(instance_id, &body_args).await?; + Ok(HttpResponseUpdatedNoContent()) +} + +#[endpoint { + method = DELETE, + path = "/instances/{instance_id}/external-ip", +}] +async fn instance_delete_external_ip( + rqctx: RequestContext>, + path_params: Path, + 
body: TypedBody, +) -> Result { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + let body_args = body.into_inner(); + sa.instance_delete_external_ip(instance_id, &body_args).await?; + Ok(HttpResponseUpdatedNoContent()) +} + #[endpoint { method = POST, path = "/instances/{instance_id}/poke", diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index a16049dd2f..3a5633c6c3 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -13,9 +13,9 @@ use super::storage::Storage; use crate::nexus::NexusClient; use crate::params::{ - DiskStateRequested, InstanceHardware, InstanceMigrationSourceParams, - InstancePutStateResponse, InstanceStateRequested, - InstanceUnregisterResponse, + DiskStateRequested, InstanceExternalIpBody, InstanceHardware, + InstanceMigrationSourceParams, InstancePutStateResponse, + InstanceStateRequested, InstanceUnregisterResponse, }; use crate::sim::simulatable::Simulatable; use crate::updates::UpdateManager; @@ -32,7 +32,7 @@ use std::net::{IpAddr, Ipv6Addr, SocketAddr}; use std::sync::Arc; use uuid::Uuid; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::str::FromStr; use dropshot::HttpServer; @@ -68,6 +68,8 @@ pub struct SledAgent { pub v2p_mappings: Mutex>>, mock_propolis: Mutex>, PropolisClient)>>, + /// lists of external IPs assigned to instances + pub external_ips: Mutex>>, instance_ensure_state_error: Mutex>, } @@ -160,6 +162,7 @@ impl SledAgent { nexus_client, disk_id_to_region_ids: Mutex::new(HashMap::new()), v2p_mappings: Mutex::new(HashMap::new()), + external_ips: Mutex::new(HashMap::new()), mock_propolis: Mutex::new(None), instance_ensure_state_error: Mutex::new(None), }) @@ -620,6 +623,62 @@ impl SledAgent { Ok(()) } + pub async fn instance_put_external_ip( + &self, + instance_id: Uuid, + body_args: &InstanceExternalIpBody, + ) -> Result<(), Error> { + if !self.instances.contains_key(&instance_id).await { + 
return Err(Error::internal_error( + "can't alter IP state for nonexistant instance", + )); + } + + let mut eips = self.external_ips.lock().await; + let my_eips = eips.entry(instance_id).or_default(); + + // High-level behaviour: this should always succeed UNLESS + // trying to add a double ephemeral. + if let InstanceExternalIpBody::Ephemeral(curr_ip) = &body_args { + if my_eips + .iter() + .find(|v| { + if let InstanceExternalIpBody::Ephemeral(other_ip) = v { + curr_ip != other_ip + } else { + false + } + }) + .is_some() + { + return Err(Error::invalid_request("cannot replace exisitng ephemeral IP without explicit removal")); + } + } + + my_eips.insert(*body_args); + + Ok(()) + } + + pub async fn instance_delete_external_ip( + &self, + instance_id: Uuid, + body_args: &InstanceExternalIpBody, + ) -> Result<(), Error> { + if !self.instances.contains_key(&instance_id).await { + return Err(Error::internal_error( + "can't alter IP state for nonexistant instance", + )); + } + + let mut eips = self.external_ips.lock().await; + let my_eips = eips.entry(instance_id).or_default(); + + my_eips.remove(&body_args); + + Ok(()) + } + /// Used for integration tests that require a component to talk to a /// mocked propolis-server API. 
// TODO: fix schemas so propolis-server's port isn't hardcoded in nexus From c25934293698dacf63edd35cfbcd1ac2dc1d7c6c Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 28 Dec 2023 12:07:25 +0000 Subject: [PATCH 20/56] Clippy + neuter errors in undo path --- nexus/src/app/sagas/instance_ip_attach.rs | 88 ++++++++++++++--------- nexus/src/app/sagas/instance_ip_detach.rs | 66 ++++++++++------- sled-agent/src/sim/sled_agent.rs | 18 ++--- 3 files changed, 102 insertions(+), 70 deletions(-) diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 2e877cc065..83fb8184dc 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -34,6 +34,13 @@ use uuid::Uuid; // sled-agent as temporary errors and unwinding. For the delete case, we // allow the attach/detach completion to have a missing record. // See `instance_common::instance_ip_get_instance_state` for more info. +// +// One more consequence of sled state being able to change beneath us +// is that the central undo actions (DPD/OPTE state) *must* be best-effort. +// This is not bad per-se: instance stop does not itself remove NAT routing +// rules. The only reason these should fail is because an instance has stopped, +// at which point there's no good in e.g. adding another entry to a non-existent +// sled-agent regardless. declare_saga_actions! { instance_ip_attach; @@ -71,9 +78,6 @@ pub struct Params { pub serialized_authn: authn::saga::Serialized, } -// TODO: factor this out for attach, detach, and instance create -// to share an impl. 
- async fn siia_begin_attach_ip( sagactx: NexusActionContext, ) -> Result { @@ -169,13 +173,17 @@ async fn siia_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { async fn siia_nat_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { + let log = sagactx.user_data().log(); let params = sagactx.saga_params::<Params>()?; - instance_ip_remove_nat( + if let Err(e) = instance_ip_remove_nat( &sagactx, &params.serialized_authn, &params.authz_instance, ) - .await?; + .await + { + error!(log, "siia_nat_undo: failed to notify DPD: {e}"); + } Ok(()) } @@ -190,8 +198,13 @@ async fn siia_update_opte( async fn siia_update_opte_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { + let log = sagactx.user_data().log(); let params = sagactx.saga_params::<Params>()?; - instance_ip_remove_opte(&sagactx, &params.authz_instance).await?; + if let Err(e) = + instance_ip_remove_opte(&sagactx, &params.authz_instance).await + { + error!(log, "siia_update_opte_undo: failed to notify sled-agent: {e}"); + } Ok(()) } @@ -219,8 +232,6 @@ async fn siia_complete_attach( target_ip.try_into().map_err(ActionError::action_failed) } -// TODO: backout changes if run state changed illegally? 
- #[derive(Debug)] pub struct SagaInstanceIpAttach; impl NexusSaga for SagaInstanceIpAttach { @@ -247,11 +258,8 @@ impl NexusSaga for SagaInstanceIpAttach { #[cfg(test)] pub(crate) mod test { use super::*; - use crate::app::{ - saga::create_saga_dag, - sagas::test_helpers::{self, instance_simulate}, - }; - use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; + use crate::app::{saga::create_saga_dag, sagas::test_helpers}; + use async_bb8_diesel::AsyncRunQueryDsl; use diesel::{ ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; @@ -259,15 +267,10 @@ pub(crate) mod test { use nexus_db_model::{IpKind, Name}; use nexus_db_queries::context::OpContext; use nexus_test_utils::resource_helpers::{ - create_disk, create_floating_ip, create_instance, create_project, - object_create, populate_ip_pool, + create_floating_ip, create_instance, create_project, populate_ip_pool, }; use nexus_test_utils_macros::nexus_test; - use nexus_types::external_api::params::ExternalIpCreate; - use omicron_common::api::external::{ - ByteCount, IdentityMetadataCreateParams, InstanceCpuCount, - SimpleIdentity, - }; + use omicron_common::api::external::SimpleIdentity; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -327,11 +330,12 @@ pub(crate) mod test { let client = &cptestctx.external_client; let apictx = &cptestctx.server.apictx(); let nexus = &apictx.nexus; + let sled_agent = &cptestctx.sled_agent.sled_agent; let opctx = test_helpers::test_opctx(cptestctx); let datastore = &nexus.db_datastore; - let project_id = ip_manip_test_setup(&client).await; - let _instance = + let _project_id = ip_manip_test_setup(&client).await; + let instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; for use_float in [false, true] { @@ -340,9 +344,31 @@ pub(crate) mod test { let dag = create_saga_dag::(params).unwrap(); let saga = nexus.create_runnable_saga(dag).await.unwrap(); nexus.run_saga(saga).await.expect("Attach saga should succeed"); 
- - // TODO: assert sled agent, dpd happy, ... } + + let instance_id = instance.id(); + + // Sled agent has a record of the new external IPs. + let mut eips = sled_agent.external_ips.lock().await; + let my_eips = eips.entry(instance_id).or_default(); + assert!(my_eips.iter().any(|v| matches!( + v, + omicron_sled_agent::params::InstanceExternalIpBody::Floating(_) + ))); + assert!(my_eips.iter().any(|v| matches!( + v, + omicron_sled_agent::params::InstanceExternalIpBody::Ephemeral(_) + ))); + + // DB has records for SNAT plus the new IPs. + let db_eips = datastore + .instance_lookup_external_ips(&opctx, instance_id) + .await + .unwrap(); + assert_eq!(db_eips.len(), 3); + assert!(db_eips.iter().any(|v| v.kind == IpKind::Ephemeral)); + assert!(db_eips.iter().any(|v| v.kind == IpKind::Floating)); + assert!(db_eips.iter().any(|v| v.kind == IpKind::SNat)); } pub(crate) async fn verify_clean_slate( @@ -381,10 +407,9 @@ pub(crate) mod test { .is_none()); // No IP bindings remain on sled-agent. - let eips = &*sled_agent.external_ips.lock().await; - for (_nic_id, eip_set) in eips { - assert!(eip_set.is_empty()); - } + let mut eips = sled_agent.external_ips.lock().await; + let my_eips = eips.entry(instance_id).or_default(); + assert!(my_eips.is_empty()); } #[nexus_test(server = crate::Server)] @@ -398,7 +423,7 @@ pub(crate) mod test { let opctx = test_helpers::test_opctx(cptestctx); let datastore = &nexus.db_datastore; - let project_id = ip_manip_test_setup(&client).await; + let _project_id = ip_manip_test_setup(&client).await; let instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; @@ -424,7 +449,7 @@ pub(crate) mod test { let opctx = test_helpers::test_opctx(cptestctx); let datastore = &nexus.db_datastore; - let project_id = ip_manip_test_setup(&client).await; + let _project_id = ip_manip_test_setup(&client).await; let instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; @@ -447,14 +472,13 @@ pub(crate) mod test { async fn 
test_actions_succeed_idempotently( cptestctx: &ControlPlaneTestContext, ) { - let log = &cptestctx.logctx.log; let client = &cptestctx.external_client; let apictx = &cptestctx.server.apictx(); let nexus = &apictx.nexus; let opctx = test_helpers::test_opctx(cptestctx); let datastore = &nexus.db_datastore; - let project_id = ip_manip_test_setup(&client).await; + let _project_id = ip_manip_test_setup(&client).await; let _instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 46edca1d0c..0545bd3c71 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -14,25 +14,14 @@ use crate::external_api::params; use nexus_db_model::{ExternalIp, IpAttachState, IpKind}; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; -use omicron_common::api::external::{Error, InstanceState}; +use omicron_common::api::external::Error; use serde::Deserialize; use serde::Serialize; use steno::ActionError; use uuid::Uuid; -// rough sequence of evts: -// - take temp ownership of instance while interacting w/ sled agent -// -> mark instance migration id as Some(0) if None -// - Withdraw routes -// -> ensure_dpd... (?) Do we actually need to? -// -> must precede OPTE: host may change its sending -// behaviour prematurely -// - Deregister addr in OPTE -// -> Put addr in sled-agent endpoint -// - Detach and Delete EIP iff. Ephemeral -// -> why so late? Risk that we can't recover our IP in an unwind. -// - free up migration_id of instance. -// -> mark instance migration id as None +// This runs on similar logic to instance IP attach: see its head +// comment for an explanation of the structure wrt. other sagas. declare_saga_actions! 
{ instance_ip_detach; @@ -177,13 +166,17 @@ async fn siid_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { async fn siid_nat_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { + let log = sagactx.user_data().log(); let params = sagactx.saga_params::<Params>()?; - instance_ip_add_nat( + if let Err(e) = instance_ip_add_nat( &sagactx, &params.serialized_authn, &params.authz_instance, ) - .await?; + .await + { + error!(log, "siid_nat_undo: failed to notify DPD: {e}"); + } Ok(()) } @@ -198,8 +191,12 @@ async fn siid_update_opte( async fn siid_update_opte_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { + let log = sagactx.user_data().log(); let params = sagactx.saga_params::<Params>()?; - instance_ip_add_opte(&sagactx, &params.authz_instance).await?; + if let Err(e) = instance_ip_add_opte(&sagactx, &params.authz_instance).await + { + error!(log, "siid_update_opte_undo: failed to notify sled-agent: {e}"); + } Ok(()) } @@ -220,7 +217,7 @@ async fn siid_complete_detach( { warn!( log, - "siia_complete_attach: external IP was deleted or call was idempotent" + "siid_complete_attach: external IP was deleted or call was idempotent" ) } @@ -263,7 +260,7 @@ pub(crate) mod test { }, Nexus, }; - use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; + use async_bb8_diesel::AsyncRunQueryDsl; use diesel::{ ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; @@ -342,11 +339,12 @@ pub(crate) mod test { let client = &cptestctx.external_client; let apictx = &cptestctx.server.apictx(); let nexus = &apictx.nexus; + let sled_agent = &cptestctx.sled_agent.sled_agent; let opctx = test_helpers::test_opctx(cptestctx); let datastore = &nexus.db_datastore; let _ = ip_manip_test_setup(&client).await; - let _instance = + let instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; attach_instance_ips(nexus, &opctx).await; @@ -358,6 +356,21 @@ pub(crate) mod test { let saga = nexus.create_runnable_saga(dag).await.unwrap(); 
nexus.run_saga(saga).await.expect("Detach saga should succeed"); } + + let instance_id = instance.id(); + + // Sled agent has removed its records of the external IPs. + let mut eips = sled_agent.external_ips.lock().await; + let my_eips = eips.entry(instance_id).or_default(); + assert!(my_eips.is_empty()); + + // DB only has record for SNAT. + let db_eips = datastore + .instance_lookup_external_ips(&opctx, instance_id) + .await + .unwrap(); + assert_eq!(db_eips.len(), 1); + assert!(db_eips.iter().any(|v| v.kind == IpKind::SNat)); } pub(crate) async fn verify_clean_slate( @@ -401,9 +414,9 @@ pub(crate) mod test { .await .unwrap(); assert_eq!(db_eips.len(), 3); - assert!(db_eips.iter().find(|v| v.kind == IpKind::Ephemeral).is_some()); - assert!(db_eips.iter().find(|v| v.kind == IpKind::Floating).is_some()); - assert!(db_eips.iter().find(|v| v.kind == IpKind::SNat).is_some()); + assert!(db_eips.iter().any(|v| v.kind == IpKind::Ephemeral)); + assert!(db_eips.iter().any(|v| v.kind == IpKind::Floating)); + assert!(db_eips.iter().any(|v| v.kind == IpKind::SNat)); // No IP bindings remain on sled-agent. 
let eips = &*sled_agent.external_ips.lock().await; @@ -423,7 +436,7 @@ pub(crate) mod test { let opctx = test_helpers::test_opctx(cptestctx); let datastore = &nexus.db_datastore; - let project_id = ip_manip_test_setup(&client).await; + let _project_id = ip_manip_test_setup(&client).await; let instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; @@ -451,7 +464,7 @@ pub(crate) mod test { let opctx = test_helpers::test_opctx(cptestctx); let datastore = &nexus.db_datastore; - let project_id = ip_manip_test_setup(&client).await; + let _project_id = ip_manip_test_setup(&client).await; let instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; @@ -476,14 +489,13 @@ pub(crate) mod test { async fn test_actions_succeed_idempotently( cptestctx: &ControlPlaneTestContext, ) { - let log = &cptestctx.logctx.log; let client = &cptestctx.external_client; let apictx = &cptestctx.server.apictx(); let nexus = &apictx.nexus; let opctx = test_helpers::test_opctx(cptestctx); let datastore = &nexus.db_datastore; - let project_id = ip_manip_test_setup(&client).await; + let _project_id = ip_manip_test_setup(&client).await; let _instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 3a5633c6c3..21a9d39220 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -640,17 +640,13 @@ impl SledAgent { // High-level behaviour: this should always succeed UNLESS // trying to add a double ephemeral. 
if let InstanceExternalIpBody::Ephemeral(curr_ip) = &body_args { - if my_eips - .iter() - .find(|v| { - if let InstanceExternalIpBody::Ephemeral(other_ip) = v { - curr_ip != other_ip - } else { - false - } - }) - .is_some() - { + if my_eips.iter().any(|v| { + if let InstanceExternalIpBody::Ephemeral(other_ip) = v { + curr_ip != other_ip + } else { + false + } + }) { return Err(Error::invalid_request("cannot replace exisitng ephemeral IP without explicit removal")); } } From 21d57767f425bffdb58ed5e71fe47614a150e405 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 28 Dec 2023 18:08:42 +0000 Subject: [PATCH 21/56] Working idempotent double attach/detach. This required that we drop the non-null parent constraint on ephemeral IPs, but I think that's worth it in the name of consistency. --- nexus/db-model/src/external_ip.rs | 6 +- nexus/db-model/src/schema.rs | 2 + .../src/db/datastore/external_ip.rs | 266 ++++++++++++++---- .../db-queries/src/db/queries/external_ip.rs | 18 +- nexus/src/app/instance.rs | 4 +- nexus/src/app/instance_network.rs | 25 +- nexus/src/app/sagas/instance_common.rs | 70 ++++- nexus/src/app/sagas/instance_create.rs | 3 + nexus/src/app/sagas/instance_ip_attach.rs | 30 +- nexus/src/app/sagas/instance_ip_detach.rs | 64 +++-- nexus/src/external_api/http_entrypoints.rs | 2 +- schema/crdb/22.0.0/up08.sql | 2 + schema/crdb/22.0.0/up09.sql | 4 + schema/crdb/dbinit.sql | 10 +- 14 files changed, 382 insertions(+), 124 deletions(-) create mode 100644 schema/crdb/22.0.0/up08.sql create mode 100644 schema/crdb/22.0.0/up09.sql diff --git a/nexus/db-model/src/external_ip.rs b/nexus/db-model/src/external_ip.rs index b6f556ab61..d762d0bb4a 100644 --- a/nexus/db-model/src/external_ip.rs +++ b/nexus/db-model/src/external_ip.rs @@ -183,7 +183,7 @@ impl IncompleteExternalIp { } } - pub fn for_ephemeral(id: Uuid, instance_id: Uuid, pool_id: Uuid) -> Self { + pub fn for_ephemeral(id: Uuid, pool_id: Uuid) -> Self { let kind = IpKind::Ephemeral; Self { id, @@ -192,7 
+192,7 @@ impl IncompleteExternalIp { time_created: Utc::now(), kind, is_service: false, - parent_id: Some(instance_id), + parent_id: None, pool_id, project_id: None, explicit_ip: None, @@ -402,7 +402,7 @@ impl IpKind { pub fn initial_state(&self) -> IpAttachState { match &self { IpKind::SNat => IpAttachState::Attached, - IpKind::Ephemeral => IpAttachState::Attaching, + IpKind::Ephemeral => IpAttachState::Detached, IpKind::Floating => IpAttachState::Detached, } } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 7af74036b2..ba12c9d041 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1406,6 +1406,8 @@ allow_tables_to_appear_in_same_query!( allow_tables_to_appear_in_same_query!(dns_zone, dns_version, dns_name); allow_tables_to_appear_in_same_query!(external_ip, service); +allow_tables_to_appear_in_same_query!(external_ip, instance); +joinable!(external_ip -> instance (parent_id)); allow_tables_to_appear_in_same_query!( switch_port, diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index b021de9580..2364cbf341 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -80,13 +80,33 @@ impl DataStore { /// concurrent access. /// Callers must call `external_ip_complete_op` on saga completion to move /// the IP to `Attached`. + /// + /// To better handle idempotent attachment, this method returns an + /// additional bool: + /// - true: EIP was detached or attaching. proceed with saga. + /// - false: EIP was attached. No-op for remainder of saga. 
pub async fn allocate_instance_ephemeral_ip( &self, opctx: &OpContext, ip_id: Uuid, instance_id: Uuid, pool_name: Option, - ) -> CreateResult { + creating_instance: bool, + ) -> CreateResult<(ExternalIp, bool)> { + use db::schema::external_ip::dsl; + use db::schema::external_ip::table; + use db::schema::instance::dsl as inst_dsl; + use db::schema::instance::table as inst_table; + use diesel::result::DatabaseErrorKind::UniqueViolation; + + // This is slightly hacky: we need to create an unbound ephemeral IP, and + // then attempt to bind it to respect two separate constraints: + // - At most one Ephemeral IP per instance + // - At most MAX external IPs per instance + // We already catch and convert a UniqueViolation on ephemeral IPs: + // if we see this occur, then + // Naturally, we now *need* to destroy the ephemeral IP if the newly alloc'd + // IP was not attached, including on idempotent success. let pool = match pool_name { Some(name) => { let (.., authz_pool, pool) = LookupPath::new(opctx, &self) @@ -115,9 +135,116 @@ impl DataStore { }; let pool_id = pool.identity.id; - let data = - IncompleteExternalIp::for_ephemeral(ip_id, instance_id, pool_id); - self.allocate_external_ip(opctx, data).await + let data = IncompleteExternalIp::for_ephemeral(ip_id, pool_id); + let temp_ip = self.allocate_external_ip(opctx, data).await?; + + let safe_states = if creating_instance { + &SAFE_TO_ATTACH_INSTANCE_STATES_CREATING[..] + } else { + &SAFE_TO_ATTACH_INSTANCE_STATES[..] 
+ }; + + let query = Instance::attach_resource( + instance_id, + temp_ip.id, + inst_table.into_boxed().filter(inst_dsl::state.eq_any(safe_states)), + table + .into_boxed() + .filter(dsl::state.eq(IpAttachState::Detached)) + .filter(dsl::kind.eq(IpKind::Ephemeral)), + MAX_EXTERNAL_IPS_PLUS_SNAT, + diesel::update(dsl::external_ip).set(( + dsl::parent_id.eq(Some(instance_id)), + dsl::time_modified.eq(Utc::now()), + dsl::state.eq(IpAttachState::Attaching), + )), + ); + + let result = query.attach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) + .await + .map(Some) + .or_else(|e: AttachError| match e { + AttachError::CollectionNotFound => { + Err(Error::not_found_by_id( + ResourceType::Instance, + &instance_id, + )) + }, + AttachError::ResourceNotFound => { + Err(Error::internal_error("call-scoped ephemeral IP was lost")) + }, + AttachError::NoUpdate { attached_count, resource, collection } => { + match resource.state { + // Idempotent errors: is in progress forsame resource pair -- this is fine. + IpAttachState::Attaching if resource.parent_id == Some(instance_id) => return Ok(Some((collection, resource))), + IpAttachState::Attached => return Err(Error::invalid_request( + "floating IP cannot be attached to one \ + instance while still attached to another" + )), + // User can reattempt depending on how the current saga unfolds. 
+ IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::unavail( + "tried to attach floating IP mid-attach/detach" + )), + + IpAttachState::Detached => {}, + } + + Err(match &collection.runtime_state.nexus_state { + state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail( + "tried to attach floating IP while instance was changing state" + ), + state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { + if attached_count >= MAX_EXTERNAL_IPS_PLUS_SNAT as i64 { + Error::invalid_request(&format!( + "an instance may not have more than {} external IP addresses", + MAX_EXTERNAL_IPS_PER_INSTANCE, + )) + } else { + eprintln!("{resource:?}, {collection:?}"); + Error::internal_error("failed to attach ephemeral IP") + } + }, + state => Error::invalid_request(&format!("cannot attach floating IP to instance in {state} state")), + }) + }, + // This case occurs for both currently attaching and attached IPs: + AttachError::DatabaseError(diesel::result::Error::DatabaseError(UniqueViolation, ..)) => { + Ok(None) + }, + AttachError::DatabaseError(e) => { + Err(public_error_from_diesel(e, ErrorHandler::Server)) + }, + }); + + // if completed (!do_saga), we need to attempt + + match result { + Err(e) => { + self.deallocate_external_ip(opctx, temp_ip.id).await?; + Err(e) + } + // Idempotent cases: + Ok(Some((_, eip))) if eip.id != temp_ip.id => { + // Is this even possible? + eprintln!("mismatch?"); + self.deallocate_external_ip(opctx, temp_ip.id).await?; + Ok((eip, true)) + } + Ok(None) => { + self.deallocate_external_ip(opctx, temp_ip.id).await?; + let eip = self + .instance_lookup_external_ips(opctx, instance_id) + .await? + .into_iter() + .find(|v| v.kind == IpKind::Ephemeral) + .ok_or_else(|| Error::internal_error("hmm"))?; + Ok((eip, false)) + } + Ok(Some((_, eip))) => { + eprintln!(""); + Ok((eip, true)) + } + } } /// Allocates an IP address for internal service usage. 
@@ -259,8 +386,8 @@ impl DataStore { ) } } - // Floating IP: name conflict - DatabaseError(UniqueViolation, ..) if name.is_some() => { + // Floating IP: name conflict + DatabaseError(UniqueViolation, ..) => { TransactionError::CustomError(public_error_from_diesel( e, ErrorHandler::Conflict( @@ -271,12 +398,6 @@ impl DataStore { ), )) } - // Ephemeral IP: violated one-per-instance rule. - DatabaseError(UniqueViolation, ..) => { - TransactionError::CustomError(Error::invalid_request( - "instance/service cannot have more than one ephemeral IP" - )) - } _ => { if retryable(&e) { return TransactionError::Database(e); @@ -367,12 +488,17 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + /// Moves an instance's ephemeral IP from 'Attached' to 'Detaching'. + /// + /// To support idempotency, this method will succeed if the instance + /// has no ephemeral IP or one is actively being removed. As a result, + /// information on an actual ExternalIp is best-effort. pub async fn begin_deallocate_ephemeral_ip( &self, opctx: &OpContext, ip_id: Uuid, instance_id: Uuid, - ) -> Result { + ) -> Result, Error> { use db::schema::external_ip::dsl; use db::schema::external_ip::table; use db::schema::instance::dsl as inst_dsl; @@ -401,46 +527,48 @@ impl DataStore { let eip = query.detach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) .await - .map_err(|e: DetachError| match e { - DetachError::CollectionNotFound => { - Error::not_found_by_id( - ResourceType::Instance, - &instance_id, - ) - }, - DetachError::ResourceNotFound => { - Error::invalid_request("instance has no ephemeral IP to detach") - }, - DetachError::NoUpdate { resource, collection } => { - match resource.state { - IpAttachState::Attached if resource.parent_id != Some(instance_id) => return Error::internal_error( - "Ephemeral IP is not attached to the target instance", - ), - // User can reattempt depending on how the current saga unfolds. 
- IpAttachState::Attaching | IpAttachState::Detaching => return Error::unavail( - "tried to detach ephemeral IP mid-attach/detach" - ), - IpAttachState::Attached => {}, - IpAttachState::Detached => return Error::internal_error( - "Ephemeral IP cannot exist in 'detached' state", - ), - } - - match collection.runtime_state.nexus_state { - state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail( - "tried to detach ephemeral IP while instance was changing state" - ), - state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { - Error::internal_error("failed to detach ephemeral IP") + .map(Some) + .or_else(|e: DetachError| { + Err(match e { + DetachError::CollectionNotFound => { + Error::not_found_by_id( + ResourceType::Instance, + &instance_id, + ) }, - state => Error::invalid_request(&format!("cannot attach ephemeral IP to instance in {state} state")), - } - }, - DetachError::DatabaseError(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - }, - - })?; + DetachError::ResourceNotFound => { + return Ok(None); + }, + DetachError::NoUpdate { resource, collection } => { + match resource.state { + // XXX: internal error? + IpAttachState::Attached if resource.parent_id != Some(instance_id) => return Err(Error::internal_error( + "Ephemeral IP is not attached to the target instance", + )), + IpAttachState::Detaching => return Ok(Some(resource)), + // User can reattempt depending on how the current saga unfolds. 
+ IpAttachState::Attaching => return Err(Error::unavail( + "tried to detach ephemeral IP mid-attach/detach" + )), + IpAttachState::Attached => {}, + IpAttachState::Detached => return Err(Error::internal_error( + "Ephemeral IP cannot exist in 'detached' state", + )), + } + match collection.runtime_state.nexus_state { + state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail( + "tried to detach ephemeral IP while instance was changing state" + ), + state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { + Error::internal_error("failed to detach ephemeral IP") + }, + state => Error::invalid_request(&format!("cannot attach ephemeral IP to instance in {state} state")), + } + }, + DetachError::DatabaseError(e) => { + public_error_from_diesel(e, ErrorHandler::Server) + }, + })})?; Ok(eip) } @@ -588,13 +716,18 @@ impl DataStore { /// This moves a floating IP into the 'attaching' state. Callers are /// responsible for calling `external_ip_complete_op` to finalise the /// IP in 'attached' state at saga completion. + /// + /// To better handle idempotent attachment, this method returns an + /// additional bool: + /// - true: EIP was detached or attaching. proceed with saga. + /// - false: EIP was attached. No-op for remainder of saga. pub async fn floating_ip_begin_attach( &self, opctx: &OpContext, authz_fip: &authz::FloatingIp, instance_id: Uuid, creating_instance: bool, - ) -> UpdateResult { + ) -> UpdateResult<(ExternalIp, bool)> { use db::schema::external_ip::dsl; use db::schema::external_ip::table; use db::schema::instance::dsl as inst_dsl; @@ -633,6 +766,7 @@ impl DataStore { )), ); + let mut do_saga = true; let (_, eip) = query.attach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) 
.await .or_else(|e: AttachError| match e { @@ -650,8 +784,12 @@ impl DataStore { }, AttachError::NoUpdate { attached_count, resource, collection } => { match resource.state { - // Idempotent errors: is in progress forsame resource pair -- this is fine. + // Idempotent errors: is in progress or complete for same resource pair -- this is fine. IpAttachState::Attaching if resource.parent_id == Some(instance_id) => return Ok((collection, resource)), + IpAttachState::Attached if resource.parent_id == Some(instance_id) => { + do_saga = false; + return Ok((collection, resource)) + }, IpAttachState::Attached => return Err(Error::invalid_request( "floating IP cannot be attached to one \ instance while still attached to another" @@ -664,7 +802,7 @@ impl DataStore { IpAttachState::Detached => {}, } - Err(match collection.runtime_state.nexus_state { + Err(match &collection.runtime_state.nexus_state { state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail( "tried to attach floating IP while instance was changing state" ), @@ -675,6 +813,7 @@ impl DataStore { MAX_EXTERNAL_IPS_PER_INSTANCE, )) } else { + eprintln!("{resource:?}, {collection:?}"); Error::internal_error("failed to attach floating IP") } }, @@ -687,7 +826,7 @@ impl DataStore { })?; - Ok(eip) + Ok((eip, do_saga)) } /// Detaches a Floating IP address from an instance. @@ -695,13 +834,18 @@ impl DataStore { /// This moves a floating IP into the 'detaching' state. Callers are /// responsible for calling `external_ip_complete_op` to finalise the /// IP in 'detached' state at saga completion. + /// + /// To better handle idempotent detachment, this method returns an + /// additional bool: + /// - true: EIP was attached or detaching. proceed with saga. + /// - false: EIP was detached. No-op for remainder of saga. 
pub async fn floating_ip_begin_detach( &self, opctx: &OpContext, authz_fip: &authz::FloatingIp, instance_id: Uuid, creating_instance: bool, - ) -> UpdateResult { + ) -> UpdateResult<(ExternalIp, bool)> { use db::schema::external_ip::dsl; use db::schema::external_ip::table; use db::schema::instance::dsl as inst_dsl; @@ -737,6 +881,7 @@ impl DataStore { )), ); + let mut do_saga = true; let eip = query.detach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) .await .or_else(|e: DetachError| Err(match e { @@ -756,7 +901,10 @@ impl DataStore { let parent_match = resource.parent_id == Some(instance_id); match resource.state { // Idempotent cases: already detached OR detaching from same instance. - IpAttachState::Detached => return Ok(resource), + IpAttachState::Detached => { + do_saga = false; + return Ok(resource) + }, IpAttachState::Detaching if parent_match => return Ok(resource), IpAttachState::Attached if !parent_match => return Err(Error::invalid_request( "Floating IP is not attached to the target instance", @@ -784,7 +932,7 @@ impl DataStore { }))?; - Ok(eip) + Ok((eip, do_saga)) } /// Move an external IP from a transitional state (attaching, detaching) diff --git a/nexus/db-queries/src/db/queries/external_ip.rs b/nexus/db-queries/src/db/queries/external_ip.rs index 415750165b..0b5eb7c071 100644 --- a/nexus/db-queries/src/db/queries/external_ip.rs +++ b/nexus/db-queries/src/db/queries/external_ip.rs @@ -1065,9 +1065,11 @@ mod tests { Uuid::new_v4(), instance_id, /* pool_name = */ None, + true, ) .await - .expect("Failed to allocate Ephemeral IP when there is space"); + .expect("Failed to allocate Ephemeral IP when there is space") + .0; assert_eq!(ephemeral_ip.ip.ip(), range.last_address()); assert_eq!(ephemeral_ip.first_port.0, 0); assert_eq!(ephemeral_ip.last_port.0, super::MAX_PORT); @@ -1105,6 +1107,7 @@ mod tests { Uuid::new_v4(), instance_id, /* pool_name = */ None, + true, ) .await; assert!( @@ -1250,9 +1253,11 @@ mod tests { id, 
instance_id, pool_name, + true, ) .await - .expect("Failed to allocate instance ephemeral IP address"); + .expect("Failed to allocate instance ephemeral IP address") + .0; assert_eq!(ip.kind, IpKind::Ephemeral); assert_eq!(ip.ip.ip(), range.first_address()); assert_eq!(ip.first_port.0, 0); @@ -1780,9 +1785,11 @@ mod tests { id, instance_id, pool_name, + true, ) .await - .expect("Failed to allocate instance ephemeral IP address"); + .expect("Failed to allocate instance ephemeral IP address") + .0; assert_eq!(ip.kind, IpKind::Ephemeral); assert_eq!(ip.ip.ip(), second_range.first_address()); assert_eq!(ip.first_port.0, 0); @@ -1823,9 +1830,11 @@ mod tests { Uuid::new_v4(), instance_id, pool_name.clone(), + true, ) .await - .expect("Failed to allocate instance ephemeral IP address"); + .expect("Failed to allocate instance ephemeral IP address") + .0; println!("{ip:#?}"); if let IpAddr::V4(addr) = ip.ip.ip() { assert_eq!(addr.octets()[3], octet); @@ -1842,6 +1851,7 @@ mod tests { Uuid::new_v4(), Uuid::new_v4(), pool_name, + true, ) .await .expect_err("Should not use IP addresses from a different pool"); diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 932e10468e..28aa0dcca0 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1942,7 +1942,7 @@ impl super::Nexus { opctx: &OpContext, instance_lookup: &lookup::Instance<'_>, ext_ip: ¶ms::ExternalIpDelete, - ) -> UpdateResult { + ) -> UpdateResult> { let (.., authz_project, authz_instance) = instance_lookup.lookup_for(authz::Action::Modify).await?; @@ -1960,7 +1960,7 @@ impl super::Nexus { .await?; saga_outputs - .lookup_node_output::("output") + .lookup_node_output::>("output") .map_err(|e| Error::internal_error(&format!("{:#}", &e))) .internal_context("looking up output from ip attach saga") } diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index ca45025b5e..663d112673 100644 --- a/nexus/src/app/instance_network.rs +++ 
b/nexus/src/app/instance_network.rs @@ -342,18 +342,19 @@ impl super::Nexus { .instance_lookup_external_ips(&opctx, instance_id) .await?; - let (ips_of_interest, must_all_be_attached) = - if let Some(wanted_id) = ip_filter { - if let Some(ip) = ips.iter().find(|v| v.id == wanted_id) { - (std::slice::from_ref(ip), false) - } else { - return Err(Error::internal_error(&format!( - "failed to find external ip address with id: {wanted_id}", - ))); - } + let (ips_of_interest, must_all_be_attached) = if let Some(wanted_id) = + ip_filter + { + if let Some(ip) = ips.iter().find(|v| v.id == wanted_id) { + (std::slice::from_ref(ip), false) } else { - (&ips[..], true) - }; + return Err(Error::internal_error(&format!( + "failed to find external ip address with id: {wanted_id}, saw {ips:?}", + ))); + } + } else { + (&ips[..], true) + }; // This is performed so that an IP attach/detach will block the // instance_start saga. Return service unavailable to indicate @@ -482,7 +483,7 @@ impl super::Nexus { std::slice::from_ref(ip) } else { return Err(Error::internal_error(&format!( - "failed to find external ip address with id: {wanted_id}", + "failed to find external ip address with id: {wanted_id}, saw {external_ips:?}", ))); } } else { diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index c94aea8fb3..6fedd8d775 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -151,6 +151,19 @@ pub struct InstanceStateForIp { pub state: InstanceState, } +/// External IP state needed for IP attach/detachment. +/// +/// This holds a record of the mid-processing external IP, where possible. +/// there are cases where this might not be known (e.g., double detach of an +/// ephemeral IP). +/// In particular we need to explicitly no-op if not `do_saga`, to prevent +/// failures borne from instance state changes from knocking out a valid IP binding. 
+#[derive(Debug, Deserialize, Serialize)] +pub struct ModifyStateForExternalIp { + pub external_ip: Option, + pub do_saga: bool, +} + /// Move an external IP from one state to another as a saga operation, /// returning `Ok(true)` if the record was successfully moved and `Ok(false)` /// if the record was lost. @@ -168,7 +181,16 @@ pub async fn instance_ip_move_state( let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - let new_ip = sagactx.lookup::("target_ip")?; + let new_ip = sagactx.lookup::("target_ip")?; + + if !new_ip.do_saga { + return Ok(true); + } + let Some(new_ip) = new_ip.external_ip else { + return Err(ActionError::action_failed(Error::internal_error( + "tried to `do_saga` without valid external IP", + ))); + }; match datastore .external_ip_complete_op(&opctx, new_ip.id, new_ip.kind, from, to) @@ -265,7 +287,15 @@ pub async fn instance_ip_add_nat( return Ok(()); }; - let target_ip = sagactx.lookup::("target_ip")?; + let target_ip = sagactx.lookup::("target_ip")?; + if !target_ip.do_saga { + return Ok(()); + } + let Some(target_ip) = target_ip.external_ip else { + return Err(ActionError::action_failed(Error::internal_error( + "tried to `do_saga` without valid external IP", + ))); + }; // Querying sleds requires fleet access; use the instance allocator context // for this. 
@@ -305,7 +335,15 @@ pub async fn instance_ip_remove_nat( return Ok(()); }; - let target_ip = sagactx.lookup::("target_ip")?; + let target_ip = sagactx.lookup::("target_ip")?; + if !target_ip.do_saga { + return Ok(()); + } + let Some(target_ip) = target_ip.external_ip else { + return Err(ActionError::action_failed(Error::internal_error( + "tried to `do_saga` without valid external IP", + ))); + }; osagactx .nexus() @@ -329,9 +367,18 @@ pub async fn instance_ip_add_opte( return Ok(()); }; - let new_ip = sagactx.lookup::("target_ip")?; + let target_ip = sagactx.lookup::("target_ip")?; + if !target_ip.do_saga { + return Ok(()); + } + let Some(target_ip) = target_ip.external_ip else { + return Err(ActionError::action_failed(Error::internal_error( + "tried to `do_saga` without valid external IP", + ))); + }; + let sled_agent_body = - new_ip.try_into().map_err(ActionError::action_failed)?; + target_ip.try_into().map_err(ActionError::action_failed)?; osagactx .nexus() @@ -369,9 +416,18 @@ pub async fn instance_ip_remove_opte( return Ok(()); }; - let new_ip = sagactx.lookup::("target_ip")?; + let target_ip = sagactx.lookup::("target_ip")?; + if !target_ip.do_saga { + return Ok(()); + } + let Some(target_ip) = target_ip.external_ip else { + return Err(ActionError::action_failed(Error::internal_error( + "tried to `do_saga` without valid external IP", + ))); + }; + let sled_agent_body = - new_ip.try_into().map_err(ActionError::action_failed)?; + target_ip.try_into().map_err(ActionError::action_failed)?; osagactx .nexus() diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index d921275402..53169308c7 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -634,9 +634,11 @@ async fn sic_allocate_instance_external_ip( ip_id, instance_id, pool_name, + true, ) .await .map_err(ActionError::action_failed)? + .0 } // Set the parent of an existing floating IP to the new instance's ID. 
params::ExternalIpCreate::Floating { ref floating_ip_name } => { @@ -652,6 +654,7 @@ async fn sic_allocate_instance_external_ip( .floating_ip_begin_attach(&opctx, &authz_fip, instance_id, true) .await .map_err(ActionError::action_failed)? + .0 } }; diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 83fb8184dc..3df445b95e 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -5,15 +5,16 @@ use super::instance_common::{ instance_ip_add_nat, instance_ip_add_opte, instance_ip_get_instance_state, instance_ip_move_state, instance_ip_remove_nat, instance_ip_remove_opte, - InstanceStateForIp, + InstanceStateForIp, ModifyStateForExternalIp, }; use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; use crate::external_api::params; -use nexus_db_model::{ExternalIp, IpAttachState}; +use nexus_db_model::IpAttachState; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; +use omicron_common::api::external::Error; use serde::Deserialize; use serde::Serialize; use steno::ActionError; @@ -80,7 +81,7 @@ pub struct Params { async fn siia_begin_attach_ip( sagactx: NexusActionContext, -) -> Result { +) -> Result { let osagactx = sagactx.user_data(); let datastore = osagactx.datastore(); let params = sagactx.saga_params::()?; @@ -100,9 +101,14 @@ async fn siia_begin_attach_ip( Uuid::new_v4(), params.authz_instance.id(), pool_name, + false, ) .await .map_err(ActionError::action_failed) + .map(|(external_ip, do_saga)| ModifyStateForExternalIp { + external_ip: Some(external_ip), + do_saga, + }) } // Set the parent of an existing floating IP to the new instance's ID. 
params::ExternalIpCreate::Floating { ref floating_ip_name } => { @@ -123,6 +129,10 @@ async fn siia_begin_attach_ip( ) .await .map_err(ActionError::action_failed) + .map(|(external_ip, do_saga)| ModifyStateForExternalIp { + external_ip: Some(external_ip), + do_saga, + }) } } } @@ -213,7 +223,7 @@ async fn siia_complete_attach( ) -> Result { let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; - let target_ip = sagactx.lookup::("target_ip")?; + let target_ip = sagactx.lookup::("target_ip")?; if !instance_ip_move_state( &sagactx, @@ -229,7 +239,15 @@ async fn siia_complete_attach( ) } - target_ip.try_into().map_err(ActionError::action_failed) + target_ip + .external_ip + .ok_or_else(|| { + Error::internal_error( + "must always have a defined external IP during instance attach", + ) + }) + .and_then(TryInto::try_into) + .map_err(ActionError::action_failed) } #[derive(Debug)] @@ -264,7 +282,7 @@ pub(crate) mod test { ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; use dropshot::test_util::ClientTestContext; - use nexus_db_model::{IpKind, Name}; + use nexus_db_model::{ExternalIp, IpKind, Name}; use nexus_db_queries::context::OpContext; use nexus_test_utils::resource_helpers::{ create_floating_ip, create_instance, create_project, populate_ip_pool, diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 0545bd3c71..d460075f95 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -5,16 +5,15 @@ use super::instance_common::{ instance_ip_add_nat, instance_ip_add_opte, instance_ip_get_instance_state, instance_ip_move_state, instance_ip_remove_nat, instance_ip_remove_opte, - InstanceStateForIp, + InstanceStateForIp, ModifyStateForExternalIp, }; use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; use crate::external_api::params; -use 
nexus_db_model::{ExternalIp, IpAttachState, IpKind}; +use nexus_db_model::{IpAttachState, IpKind}; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; -use omicron_common::api::external::Error; use serde::Deserialize; use serde::Serialize; use steno::ActionError; @@ -61,7 +60,7 @@ pub struct Params { async fn siid_begin_detach_ip( sagactx: NexusActionContext, -) -> Result { +) -> Result { let osagactx = sagactx.user_data(); let datastore = osagactx.datastore(); let params = sagactx.saga_params::()?; @@ -80,23 +79,28 @@ async fn siid_begin_detach_ip( .await .map_err(ActionError::action_failed)?; - let eph_ip = eips - .iter() - .find(|e| e.kind == IpKind::Ephemeral) - .ok_or_else(|| { - ActionError::action_failed(Error::invalid_request( - "instance does not have an attached ephemeral IP address" - )) - })?; - - datastore - .begin_deallocate_ephemeral_ip( - &opctx, - eph_ip.id, - params.authz_instance.id(), - ) - .await - .map_err(ActionError::action_failed) + // XXX: cleanup. 
+ if let Some(eph_ip) = + eips.iter().find(|e| e.kind == IpKind::Ephemeral) + { + datastore + .begin_deallocate_ephemeral_ip( + &opctx, + eph_ip.id, + params.authz_instance.id(), + ) + .await + .map_err(ActionError::action_failed) + .map(|external_ip| ModifyStateForExternalIp { + do_saga: external_ip.is_some(), + external_ip, + }) + } else { + Ok(ModifyStateForExternalIp { + do_saga: false, + external_ip: None, + }) + } } params::ExternalIpDelete::Floating { ref floating_ip_name } => { let floating_ip_name = db::model::Name(floating_ip_name.clone()); @@ -116,6 +120,10 @@ async fn siid_begin_detach_ip( ) .await .map_err(ActionError::action_failed) + .map(|(external_ip, do_saga)| ModifyStateForExternalIp { + external_ip: Some(external_ip), + do_saga, + }) } } } @@ -202,10 +210,10 @@ async fn siid_update_opte_undo( async fn siid_complete_detach( sagactx: NexusActionContext, -) -> Result { +) -> Result, ActionError> { let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; - let target_ip = sagactx.lookup::("target_ip")?; + let target_ip = sagactx.lookup::("target_ip")?; if !instance_ip_move_state( &sagactx, @@ -217,11 +225,15 @@ async fn siid_complete_detach( { warn!( log, - "siid_complete_attach: external IP was deleted or call was idempotent" + "siid_complete_detach: external IP was deleted or call was idempotent" ) } - target_ip.try_into().map_err(ActionError::action_failed) + target_ip + .external_ip + .map(TryInto::try_into) + .transpose() + .map_err(ActionError::action_failed) } #[derive(Debug)] @@ -264,7 +276,7 @@ pub(crate) mod test { use diesel::{ ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; - use nexus_db_model::Name; + use nexus_db_model::{ExternalIp, Name}; use nexus_db_queries::context::OpContext; use nexus_test_utils::resource_helpers::create_instance; use nexus_test_utils_macros::nexus_test; diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 
ea8bbc39c8..8c2d220203 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -3776,7 +3776,7 @@ async fn instance_external_ip_detach( path_params: Path, query_params: Query, ip_to_detach: TypedBody, -) -> Result, HttpError> { +) -> Result>, HttpError> { let apictx = rqctx.context(); let handler = async { let opctx = crate::context::op_context_for_external_api(&rqctx).await?; diff --git a/schema/crdb/22.0.0/up08.sql b/schema/crdb/22.0.0/up08.sql new file mode 100644 index 0000000000..3d85aaad05 --- /dev/null +++ b/schema/crdb/22.0.0/up08.sql @@ -0,0 +1,2 @@ +ALTER TABLE IF EXISTS omicron.public.external_ip +DROP CONSTRAINT IF EXISTS null_non_fip_parent_id; diff --git a/schema/crdb/22.0.0/up09.sql b/schema/crdb/22.0.0/up09.sql new file mode 100644 index 0000000000..bac963cce5 --- /dev/null +++ b/schema/crdb/22.0.0/up09.sql @@ -0,0 +1,4 @@ +ALTER TABLE IF EXISTS omicron.public.external_ip +ADD CONSTRAINT IF NOT EXISTS null_snat_parent_id CHECK ( + (kind != 'snat') OR (parent_id IS NOT NULL) +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 6580d39cf7..a372403b28 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1733,11 +1733,13 @@ CREATE TABLE IF NOT EXISTS omicron.public.external_ip ( ), /* - * Only nullable if this is a floating IP, which may exist not - * attached to any instance or service yet. + * Only nullable if this is a floating/ephemeral IP, which may exist not + * attached to any instance or service yet. Ephemeral IPs should not exist + * without parent instances/services, but need to temporarily exist in this + * state for live attachment. */ - CONSTRAINT null_non_fip_parent_id CHECK ( - (kind != 'floating' AND parent_id is NOT NULL) OR (kind = 'floating') + CONSTRAINT null_snat_parent_id CHECK ( + (kind != 'snat') OR (parent_id IS NOT NULL) ), /* Ephemeral IPs are not supported for services. 
*/ From dcdec48e07767323b6ce145df9a5a83335a17d2c Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 28 Dec 2023 18:43:42 +0000 Subject: [PATCH 22/56] Revalidate consumers of `instance_lookup_external_ips` --- nexus/db-queries/src/db/datastore/external_ip.rs | 1 + nexus/src/app/external_ip.rs | 5 ++++- nexus/src/app/instance.rs | 10 ++++++++++ nexus/src/app/sagas/instance_ip_attach.rs | 4 ++-- 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 2364cbf341..3396eb9273 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -628,6 +628,7 @@ impl DataStore { } /// Fetch all external IP addresses of any kind for the provided instance + /// in all attachment states. pub async fn instance_lookup_external_ips( &self, opctx: &OpContext, diff --git a/nexus/src/app/external_ip.rs b/nexus/src/app/external_ip.rs index fba34f767d..7f41b7fd20 100644 --- a/nexus/src/app/external_ip.rs +++ b/nexus/src/app/external_ip.rs @@ -6,6 +6,7 @@ use crate::external_api::views::ExternalIp; use crate::external_api::views::FloatingIp; +use nexus_db_model::IpAttachState; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::lookup; @@ -34,7 +35,9 @@ impl super::Nexus { .await? 
.into_iter() .filter_map(|ip| { - if ip.kind == IpKind::SNat { + if ip.kind == IpKind::SNat + || ip.state != IpAttachState::Attached + { None } else { Some(ip.try_into().unwrap()) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 28aa0dcca0..19e41c9dbd 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -17,6 +17,7 @@ use crate::external_api::params; use cancel_safe_futures::prelude::*; use futures::future::Fuse; use futures::{FutureExt, SinkExt, StreamExt}; +use nexus_db_model::IpAttachState; use nexus_db_model::IpKind; use nexus_db_queries::authn; use nexus_db_queries::authz; @@ -1054,6 +1055,15 @@ impl super::Nexus { )); } + // If there are any external IPs not yet fully attached/detached, then + // there are attach/detach sagas in progress. That should complete in + // its own time, so return a 503 to indicate a possible retry. + if external_ips.iter().any(|v| v.state != IpAttachState::Attached) { + return Err(Error::unavail( + "External IP attach/detach is in progress during instance_ensure_registered" + )); + } + // Partition remaining external IPs by class: we can have at most // one ephemeral ip. let (ephemeral_ips, floating_ips): (Vec<_>, Vec<_>) = external_ips diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 3df445b95e..872d92eabf 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -28,8 +28,8 @@ use uuid::Uuid; // The main means of access control here is an external IP's `state`. // Entering either saga begins with an atomic swap from Attached/Detached // to Attaching/Detaching. This prevents concurrent attach/detach on the -// same EIP, and prevents instance start from executing with an -// Error::unavail. +// same EIP, and prevents instance start and migrate from completing with an +// Error::unavail via instance_ensure_registered and/or DPD.
// // Overlap with stop is handled by treating comms failures with // sled-agent as temporary errors and unwinding. For the delete case, we From 4b65a67f9d5e15325c929a82ebed6e74c1e1366f Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 28 Dec 2023 19:41:49 +0000 Subject: [PATCH 23/56] (Existing) Test fixup. --- .../src/db/datastore/external_ip.rs | 8 +-- nexus/db-queries/src/db/datastore/mod.rs | 3 +- .../db-queries/src/db/queries/external_ip.rs | 49 ++++++++++++++++--- 3 files changed, 50 insertions(+), 10 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 3396eb9273..5628a307bf 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -176,7 +176,11 @@ impl DataStore { AttachError::NoUpdate { attached_count, resource, collection } => { match resource.state { // Idempotent errors: is in progress forsame resource pair -- this is fine. - IpAttachState::Attaching if resource.parent_id == Some(instance_id) => return Ok(Some((collection, resource))), + // Double attach can be hit by, e.g., repeated call during instance create. + IpAttachState::Attaching + | IpAttachState::Attached + if resource.parent_id == Some(instance_id) => + return Ok(Some((collection, resource))), IpAttachState::Attached => return Err(Error::invalid_request( "floating IP cannot be attached to one \ instance while still attached to another" @@ -225,8 +229,6 @@ impl DataStore { } // Idempotent cases: Ok(Some((_, eip))) if eip.id != temp_ip.id => { - // Is this even possible? 
- eprintln!("mismatch?"); self.deallocate_external_ip(opctx, temp_ip.id).await?; Ok((eip, true)) } diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 2fe6bcefb2..212b1af2c4 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -1965,9 +1965,10 @@ mod test { .execute_async(&*conn) .await; let ip_type = if is_service { "Service" } else { "Instance" }; + let null_snat_parent = parent_id.is_none() && kind == IpKind::SNat; if name.is_none() && description.is_none() - && parent_id.is_some() + && !null_snat_parent && project_id.is_none() { // Name/description must be NULL, instance ID cannot diff --git a/nexus/db-queries/src/db/queries/external_ip.rs b/nexus/db-queries/src/db/queries/external_ip.rs index 0b5eb7c071..ad03053137 100644 --- a/nexus/db-queries/src/db/queries/external_ip.rs +++ b/nexus/db-queries/src/db/queries/external_ip.rs @@ -871,7 +871,11 @@ mod tests { use async_bb8_diesel::AsyncRunQueryDsl; use diesel::{ExpressionMethods, QueryDsl, SelectableHelper}; use dropshot::test_util::LogContext; + use nexus_db_model::ByteCount; + use nexus_db_model::Instance; + use nexus_db_model::InstanceCpuCount; use nexus_test_utils::db::test_setup_database; + use nexus_types::external_api::params::InstanceCreate; use nexus_types::external_api::shared::IpRange; use omicron_common::address::NUM_SOURCE_NAT_PORTS; use omicron_common::api::external::Error; @@ -973,6 +977,37 @@ mod tests { .expect("Failed to create IP Pool range"); } + async fn create_instance(&self, name: &str) -> Uuid { + let instance_id = Uuid::new_v4(); + let project_id = Uuid::new_v4(); + let instance = Instance::new(instance_id, project_id, &InstanceCreate { + identity: IdentityMetadataCreateParams { name: String::from(name).parse().unwrap(), description: format!("instance {}", name) }, + ncpus: InstanceCpuCount(omicron_common::api::external::InstanceCpuCount(1)).into(), + memory: 
ByteCount(omicron_common::api::external::ByteCount::from_gibibytes_u32(1)).into(), + hostname: "test".into(), + user_data: vec![], + network_interfaces: Default::default(), + external_ips: vec![], + disks: vec![], + start: false, + }); + + let conn = self + .db_datastore + .pool_connection_authorized(&self.opctx) + .await + .unwrap(); + + use crate::db::schema::instance::dsl as instance_dsl; + diesel::insert_into(instance_dsl::instance) + .values(instance.clone()) + .execute_async(&*conn) + .await + .expect("Failed to create Instance"); + + instance_id + } + async fn default_pool_id(&self) -> Uuid { let pool = self .db_datastore @@ -1057,7 +1092,7 @@ mod tests { // Allocate an Ephemeral IP, which should take the entire port range of // the only address in the pool. - let instance_id = Uuid::new_v4(); + let instance_id = context.create_instance("for-eph").await; let ephemeral_ip = context .db_datastore .allocate_instance_ephemeral_ip( @@ -1076,7 +1111,7 @@ mod tests { // At this point, we should be able to allocate neither a new Ephemeral // nor any SNAT IPs. - let instance_id = Uuid::new_v4(); + let instance_id = context.create_instance("for-snat").await; let res = context .db_datastore .allocate_instance_snat_ip( @@ -1242,7 +1277,7 @@ mod tests { .unwrap(); context.initialize_ip_pool("default", range).await; - let instance_id = Uuid::new_v4(); + let instance_id = context.create_instance("all-the-ports").await; let id = Uuid::new_v4(); let pool_name = None; @@ -1774,7 +1809,7 @@ mod tests { // Allocating an address on an instance in the second pool should be // respected, even though there are IPs available in the first. 
- let instance_id = Uuid::new_v4(); + let instance_id = context.create_instance("test").await; let id = Uuid::new_v4(); let pool_name = Some(Name("p1".parse().unwrap())); @@ -1822,7 +1857,8 @@ mod tests { let first_octet = first_address.octets()[3]; let last_octet = last_address.octets()[3]; for octet in first_octet..=last_octet { - let instance_id = Uuid::new_v4(); + let instance_id = + context.create_instance(&format!("o{octet}")).await; let ip = context .db_datastore .allocate_instance_ephemeral_ip( @@ -1844,12 +1880,13 @@ mod tests { } // Allocating another address should _fail_, and not use the first pool. + let instance_id = context.create_instance("final").await; context .db_datastore .allocate_instance_ephemeral_ip( &context.opctx, Uuid::new_v4(), - Uuid::new_v4(), + instance_id, pool_name, true, ) From ec3e01e9eb609dba955e510bdb79a38b6fa637d9 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 28 Dec 2023 22:09:44 +0000 Subject: [PATCH 24/56] Accidentally forgot an 'IF NOT EXISTS' --- schema/crdb/22.0.0/up06.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema/crdb/22.0.0/up06.sql b/schema/crdb/22.0.0/up06.sql index a224588a37..ca19081e37 100644 --- a/schema/crdb/22.0.0/up06.sql +++ b/schema/crdb/22.0.0/up06.sql @@ -1,4 +1,4 @@ ALTER TABLE omicron.public.external_ip -ADD CONSTRAINT detached_null_parent_id CHECK ( +ADD CONSTRAINT IF NOT EXISTS detached_null_parent_id CHECK ( (state = 'detached') OR (parent_id IS NOT NULL) ); From 7800b07b834183b8a33a3ee10cd46c122b9371af Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 28 Dec 2023 23:46:30 +0000 Subject: [PATCH 25/56] Fill out unauthorized endpoint tests. 
--- nexus/tests/integration_tests/endpoints.rs | 26 ++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index 545129d567..e78c7b3cb4 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -313,6 +313,10 @@ lazy_static! { format!("/v1/network-interfaces?project={}&instance={}", *DEMO_PROJECT_NAME, *DEMO_INSTANCE_NAME); pub static ref DEMO_INSTANCE_EXTERNAL_IPS_URL: String = format!("/v1/instances/{}/external-ips?{}", *DEMO_INSTANCE_NAME, *DEMO_PROJECT_SELECTOR); + pub static ref DEMO_INSTANCE_EXTERNAL_IP_ATTACH_URL: String = + format!("/v1/instances/{}/external-ips/attach?{}", *DEMO_INSTANCE_NAME, *DEMO_PROJECT_SELECTOR); + pub static ref DEMO_INSTANCE_EXTERNAL_IP_DETACH_URL: String = + format!("/v1/instances/{}/external-ips/detach?{}", *DEMO_INSTANCE_NAME, *DEMO_PROJECT_SELECTOR); pub static ref DEMO_INSTANCE_CREATE: params::InstanceCreate = params::InstanceCreate { identity: IdentityMetadataCreateParams { @@ -592,6 +596,10 @@ lazy_static! { address: Some(std::net::Ipv4Addr::new(10, 0, 0, 141).into()), pool: None, }; + pub static ref DEMO_FLOAT_IP_ATTACH: params::ExternalIpCreate = + params::ExternalIpCreate::Floating { floating_ip_name: DEMO_FLOAT_IP_NAME.clone() }; + pub static ref DEMO_FLOAT_IP_DETACH: params::ExternalIpDelete = + params::ExternalIpDelete::Floating { floating_ip_name: DEMO_FLOAT_IP_NAME.clone() }; } lazy_static! { @@ -1593,6 +1601,24 @@ lazy_static! 
{ allowed_methods: vec![AllowedMethod::Get], }, + VerifyEndpoint { + url: &DEMO_INSTANCE_EXTERNAL_IP_ATTACH_URL, + visibility: Visibility::Protected, + unprivileged_access: UnprivilegedAccess::None, + allowed_methods: vec![AllowedMethod::Post( + serde_json::to_value(&*DEMO_FLOAT_IP_ATTACH).unwrap() + )], + }, + + VerifyEndpoint { + url: &DEMO_INSTANCE_EXTERNAL_IP_DETACH_URL, + visibility: Visibility::Protected, + unprivileged_access: UnprivilegedAccess::None, + allowed_methods: vec![AllowedMethod::Post( + serde_json::to_value(&*DEMO_FLOAT_IP_DETACH).unwrap() + )], + }, + /* IAM */ VerifyEndpoint { From 24591b35d569a03a1b404ae9f9042e15b99b5091 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 29 Dec 2023 13:35:57 +0000 Subject: [PATCH 26/56] Additional integration tests. --- nexus/test-utils/src/resource_helpers.rs | 4 +- nexus/tests/integration_tests/disks.rs | 1 + nexus/tests/integration_tests/external_ips.rs | 344 +++++++++++++++++- nexus/tests/integration_tests/instances.rs | 15 +- nexus/tests/integration_tests/ip_pools.rs | 1 + .../integration_tests/subnet_allocation.rs | 1 + nexus/types/src/external_api/shared.rs | 4 +- nexus/types/src/external_api/views.rs | 2 +- 8 files changed, 354 insertions(+), 18 deletions(-) diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index f564d73119..fe2ccbec9d 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -382,6 +382,7 @@ pub async fn create_instance( Vec::::new(), // External IPs= Vec::::new(), + true, ) .await } @@ -394,6 +395,7 @@ pub async fn create_instance_with( nics: &params::InstanceNetworkInterfaceAttachment, disks: Vec, external_ips: Vec, + start: bool, ) -> Instance { let url = format!("/v1/instances?project={}", project_name); object_create( @@ -413,7 +415,7 @@ pub async fn create_instance_with( network_interfaces: nics.clone(), external_ips, disks, - start: true, + start, }, ) .await diff --git
a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index a7c9c99509..2c205181bd 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -1749,6 +1749,7 @@ async fn create_instance_with_disk(client: &ClientTestContext) { params::InstanceDiskAttach { name: DISK_NAME.parse().unwrap() }, )], Vec::::new(), + true, ) .await; } diff --git a/nexus/tests/integration_tests/external_ips.rs b/nexus/tests/integration_tests/external_ips.rs index daec8e2064..ec71e377fb 100644 --- a/nexus/tests/integration_tests/external_ips.rs +++ b/nexus/tests/integration_tests/external_ips.rs @@ -7,6 +7,7 @@ use std::net::IpAddr; use std::net::Ipv4Addr; +use crate::integration_tests::instances::fetch_instance_external_ips; use crate::integration_tests::instances::instance_simulate; use dropshot::test_util::ClientTestContext; use dropshot::HttpErrorResponseBody; @@ -24,6 +25,7 @@ use nexus_test_utils::resource_helpers::populate_ip_pool; use nexus_test_utils_macros::nexus_test; use nexus_types::external_api::params; use nexus_types::external_api::shared; +use nexus_types::external_api::views; use nexus_types::external_api::views::FloatingIp; use omicron_common::address::IpRange; use omicron_common::address::Ipv4Range; @@ -40,10 +42,26 @@ const PROJECT_NAME: &str = "rootbeer-float"; const FIP_NAMES: &[&str] = &["vanilla", "chocolate", "strawberry", "pistachio", "caramel"]; +const INSTANCE_NAMES: &[&str] = &["anonymous-diner", "anonymous-restaurant"]; + pub fn get_floating_ips_url(project_name: &str) -> String { format!("/v1/floating-ips?project={project_name}") } +pub fn attach_instance_external_ip_url( + instance_name: &str, + project_name: &str, +) -> String { + format!("/v1/instances/{instance_name}/external-ips/attach?project={project_name}") +} + +pub fn detach_instance_external_ip_url( + instance_name: &str, + project_name: &str, +) -> String { + 
format!("/v1/instances/{instance_name}/external-ips/detach?project={project_name}") +} + pub fn get_floating_ip_by_name_url( fip_name: &str, project_name: &str, @@ -364,7 +382,9 @@ async fn test_floating_ip_delete(cptestctx: &ControlPlaneTestContext) { } #[nexus_test] -async fn test_floating_ip_attachment(cptestctx: &ControlPlaneTestContext) { +async fn test_floating_ip_create_attachment( + cptestctx: &ControlPlaneTestContext, +) { let client = &cptestctx.external_client; let apictx = &cptestctx.server.apictx(); let nexus = &apictx.nexus; @@ -382,16 +402,13 @@ async fn test_floating_ip_attachment(cptestctx: &ControlPlaneTestContext) { .await; // Bind the floating IP to an instance at create time. - let instance_name = "anonymous-diner"; - let instance = create_instance_with( - &client, - PROJECT_NAME, + let instance_name = INSTANCE_NAMES[0]; + let instance = instance_for_external_ips( + client, instance_name, - &params::InstanceNetworkInterfaceAttachment::Default, - vec![], - vec![params::ExternalIpCreate::Floating { - floating_ip_name: FIP_NAMES[0].parse().unwrap(), - }], + true, + false, + &FIP_NAMES[..1], ) .await; @@ -469,6 +486,247 @@ async fn test_floating_ip_attachment(cptestctx: &ControlPlaneTestContext) { .unwrap(); } +#[nexus_test] +async fn test_external_ip_live_attach_detach( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.apictx(); + let nexus = &apictx.nexus; + + populate_ip_pool(&client, "default", None).await; + let project = create_project(client, PROJECT_NAME).await; + + // Create 2 instances, and a floating IP for each instance. + // One instance will be started, and one will be stopped.
+ let mut fips = vec![]; + for i in 0..2 { + fips.push( + create_floating_ip( + client, + FIP_NAMES[i], + project.identity.name.as_str(), + None, + None, + ) + .await, + ); + } + + let mut instances = vec![]; + for (i, start) in [false, true].iter().enumerate() { + let instance = instance_for_external_ips( + client, + INSTANCE_NAMES[i], + *start, + false, + &[], + ) + .await; + + if *start { + instance_simulate(nexus, &instance.identity.id).await; + instance_simulate(nexus, &instance.identity.id).await; + } + + // Verify that each instance has no external IPs. + assert_eq!( + fetch_instance_external_ips( + client, + INSTANCE_NAMES[i], + PROJECT_NAME + ) + .await + .len(), + 0 + ); + + instances.push(instance); + } + + // Attach a floating IP and ephemeral IP to each instance. + let mut recorded_ephs = vec![]; + for (instance, fip) in instances.iter().zip(&fips) { + let instance_name = instance.identity.name.as_str(); + let eph_resp = external_ip_attach( + client, + instance_name, + &params::ExternalIpCreate::Ephemeral { pool_name: None }, + ) + .await; + let fip_resp = external_ip_attach( + client, + instance_name, + &params::ExternalIpCreate::Floating { + floating_ip_name: fip.identity.name.clone(), + }, + ) + .await; + + // Verify both appear correctly. + // This implicitly checks FIP parent_id matches the instance, + // and state has fully moved into 'Attached'. + let eip_list = + fetch_instance_external_ips(client, instance_name, PROJECT_NAME) + .await; + + assert_eq!(eip_list.len(), 2); + assert!(eip_list.contains(&eph_resp)); + assert!(eip_list.contains(&fip_resp)); + assert_eq!(fip.ip, fip_resp.ip); + + // Check for idempotency: repeat requests should return same values.
+ let eph_resp_2 = external_ip_attach( + client, + instance_name, + &params::ExternalIpCreate::Ephemeral { pool_name: None }, + ) + .await; + let fip_resp_2 = external_ip_attach( + client, + instance_name, + &params::ExternalIpCreate::Floating { + floating_ip_name: fip.identity.name.clone(), + }, + ) + .await; + + assert_eq!(eph_resp, eph_resp_2); + assert_eq!(fip_resp, fip_resp_2); + + recorded_ephs.push(eph_resp); + } + + // Detach a floating IP and ephemeral IP from each instance. + for ((instance, fip), eph_ip) in + instances.iter().zip(&fips).zip(&recorded_ephs) + { + let instance_name = instance.identity.name.as_str(); + let eph_resp = external_ip_detach( + client, + instance_name, + &params::ExternalIpDelete::Ephemeral, + ) + .await + .unwrap(); + let fip_resp = external_ip_detach( + client, + instance_name, + &params::ExternalIpDelete::Floating { + floating_ip_name: fip.identity.name.clone(), + }, + ) + .await + .unwrap(); + + // Verify both are removed, and that their bodies match the known FIP/EIP combo. + let eip_list = + fetch_instance_external_ips(client, instance_name, PROJECT_NAME) + .await; + + assert_eq!(eip_list.len(), 0); + assert_eq!(fip.ip, fip_resp.ip); + assert_eq!(eph_ip, &eph_resp); + + // Check for idempotency: repeat requests should return same values + // for FIP, but in ephemeral case there is no currently known IP so we get None.
+ let eph_resp_2 = external_ip_detach( + client, + instance_name, + &params::ExternalIpDelete::Ephemeral, + ) + .await; + let fip_resp_2 = external_ip_detach( + client, + instance_name, + &params::ExternalIpDelete::Floating { + floating_ip_name: fip.identity.name.clone(), + }, + ) + .await; + + assert!(eph_resp_2.is_none()); + assert_eq!(Some(fip_resp), fip_resp_2); + } +} + +#[nexus_test] +async fn test_external_ip_attach_detach_fail_if_in_use_by_other( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.apictx(); + let nexus = &apictx.nexus; + + populate_ip_pool(&client, "default", None).await; + let project = create_project(client, PROJECT_NAME).await; + + // Create 2 instances, bind a FIP to each. + let mut instances = vec![]; + let mut fips = vec![]; + for i in 0..2 { + let fip = create_floating_ip( + client, + FIP_NAMES[i], + project.identity.name.as_str(), + None, + None, + ) + .await; + let instance = instance_for_external_ips( + client, + INSTANCE_NAMES[i], + true, + false, + &[FIP_NAMES[i]], + ) + .await; + + instance_simulate(nexus, &instance.identity.id).await; + instance_simulate(nexus, &instance.identity.id).await; + + instances.push(instance); + fips.push(fip); + } + + // Attach in-use FIP to *other* instance should fail. + let url = attach_instance_external_ip_url(INSTANCE_NAMES[0], PROJECT_NAME); + let error: HttpErrorResponseBody = NexusRequest::new( + RequestBuilder::new(client, Method::POST, &url) + .body(Some(&params::ExternalIpCreate::Floating { + floating_ip_name: fips[1].identity.name.clone(), + })) + .expect_status(Some(StatusCode::BAD_REQUEST)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap() + .parsed_body() + .unwrap(); + assert_eq!(error.message, "floating IP cannot be attached to one instance while still attached to another".to_string()); + + // Detach in-use FIP from *other* instance should fail.
+ let url = detach_instance_external_ip_url(INSTANCE_NAMES[0], PROJECT_NAME); + let error: HttpErrorResponseBody = NexusRequest::new( + RequestBuilder::new(client, Method::POST, &url) + .body(Some(&params::ExternalIpDelete::Floating { + floating_ip_name: fips[1].identity.name.clone(), + })) + .expect_status(Some(StatusCode::BAD_REQUEST)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap() + .parsed_body() + .unwrap(); + assert_eq!( + error.message, + "Floating IP is not attached to the target instance".to_string() + ); +} + pub async fn floating_ip_get( client: &ClientTestContext, fip_url: &str, @@ -493,3 +751,69 @@ async fn floating_ip_get_as( panic!("failed to make \"get\" request to {fip_url}: {e}") }) } + +async fn instance_for_external_ips( + client: &ClientTestContext, + instance_name: &str, + start: bool, + use_ephemeral_ip: bool, + floating_ip_names: &[&str], +) -> Instance { + let mut fips: Vec<_> = floating_ip_names + .iter() + .map(|s| params::ExternalIpCreate::Floating { + floating_ip_name: s.parse().unwrap(), + }) + .collect(); + if use_ephemeral_ip { + fips.push(params::ExternalIpCreate::Ephemeral { pool_name: None }) + } + create_instance_with( + &client, + PROJECT_NAME, + instance_name, + &params::InstanceNetworkInterfaceAttachment::Default, + vec![], + fips, + start, + ) + .await +} + +async fn external_ip_attach( + client: &ClientTestContext, + instance_name: &str, + eip: &params::ExternalIpCreate, +) -> views::ExternalIp { + let url = attach_instance_external_ip_url(instance_name, PROJECT_NAME); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &url) + .body(Some(eip)) + .expect_status(Some(StatusCode::ACCEPTED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap() + .parsed_body() + .unwrap() +} + +async fn external_ip_detach( + client: &ClientTestContext, + instance_name: &str, + eip: &params::ExternalIpDelete, +) -> Option<views::ExternalIp> { + let url = detach_instance_external_ip_url(instance_name,
PROJECT_NAME); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &url) + .body(Some(eip)) + .expect_status(Some(StatusCode::ACCEPTED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap() + .parsed_body() + .unwrap() +} diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 44b65fa67b..fd91efb487 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -651,6 +651,7 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { &params::InstanceNetworkInterfaceAttachment::Default, Vec::::new(), Vec::::new(), + true, ) .await; let instance_id = instance.identity.id; @@ -754,6 +755,7 @@ async fn test_instance_migrate_v2p(cptestctx: &ControlPlaneTestContext) { // located with their instances. Vec::::new(), Vec::::new(), + true, ) .await; let instance_id = instance.identity.id; @@ -1113,6 +1115,7 @@ async fn test_instance_metrics_with_migration( &params::InstanceNetworkInterfaceAttachment::Default, Vec::::new(), Vec::::new(), + true, ) .await; let instance_id = instance.identity.id; @@ -3752,13 +3755,14 @@ async fn test_instance_attach_several_external_ips( &params::InstanceNetworkInterfaceAttachment::Default, vec![], external_ip_create, + true, ) .await; // Verify that all external IPs are visible on the instance and have // been allocated in order.
let external_ips = - fetch_instance_external_ips(&client, instance_name).await; + fetch_instance_external_ips(&client, instance_name, PROJECT_NAME).await; assert_eq!(external_ips.len(), 8); eprintln!("{external_ips:?}"); for (i, eip) in external_ips @@ -3858,17 +3862,18 @@ async fn create_instance_with_pool( vec![params::ExternalIpCreate::Ephemeral { pool_name: pool_name.map(|name| name.parse().unwrap()), }], + true, ) .await } -async fn fetch_instance_external_ips( +pub async fn fetch_instance_external_ips( client: &ClientTestContext, instance_name: &str, + project_name: &str, ) -> Vec { let ips_url = format!( - "/v1/instances/{}/external-ips?project={}", - instance_name, PROJECT_NAME + "/v1/instances/{instance_name}/external-ips?project={project_name}", ); let ips = NexusRequest::object_get(client, &ips_url) .authn_as(AuthnMode::PrivilegedUser) @@ -3884,7 +3889,7 @@ async fn fetch_instance_ephemeral_ip( client: &ClientTestContext, instance_name: &str, ) -> views::ExternalIp { - fetch_instance_external_ips(client, instance_name) + fetch_instance_external_ips(client, instance_name, PROJECT_NAME) .await .into_iter() .find(|v| v.kind == IpKind::Ephemeral) diff --git a/nexus/tests/integration_tests/ip_pools.rs b/nexus/tests/integration_tests/ip_pools.rs index 6a633fc5e1..02aa18bc72 100644 --- a/nexus/tests/integration_tests/ip_pools.rs +++ b/nexus/tests/integration_tests/ip_pools.rs @@ -805,6 +805,7 @@ async fn test_ip_pool_list_usable_by_project( &InstanceNetworkInterfaceAttachment::Default, Vec::::new(), vec![ExternalIpCreate::Ephemeral { pool_name }], + true, ) .await; } diff --git a/nexus/tests/integration_tests/subnet_allocation.rs b/nexus/tests/integration_tests/subnet_allocation.rs index 7f5c27384c..03dbb2c6e5 100644 --- a/nexus/tests/integration_tests/subnet_allocation.rs +++ b/nexus/tests/integration_tests/subnet_allocation.rs @@ -143,6 +143,7 @@ async fn test_subnet_allocation(cptestctx: &ControlPlaneTestContext) { Vec::::new(), // External IPs= 
Vec::::new(), + true, ) .await; } diff --git a/nexus/types/src/external_api/shared.rs b/nexus/types/src/external_api/shared.rs index a4c5ae1e62..f6b4db18a3 100644 --- a/nexus/types/src/external_api/shared.rs +++ b/nexus/types/src/external_api/shared.rs @@ -221,7 +221,9 @@ pub enum ServiceUsingCertificate { } /// The kind of an external IP address for an instance -#[derive(Debug, Clone, Copy, Deserialize, Serialize, JsonSchema, PartialEq)] +#[derive( + Debug, Clone, Copy, Deserialize, Eq, Serialize, JsonSchema, PartialEq, +)] #[serde(rename_all = "snake_case")] pub enum IpKind { Ephemeral, diff --git a/nexus/types/src/external_api/views.rs b/nexus/types/src/external_api/views.rs index 39ca945e64..1555ab749a 100644 --- a/nexus/types/src/external_api/views.rs +++ b/nexus/types/src/external_api/views.rs @@ -313,7 +313,7 @@ pub struct IpPoolRange { // INSTANCE EXTERNAL IP ADDRESSES -#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)] +#[derive(Debug, Clone, Deserialize, Eq, PartialEq, Serialize, JsonSchema)] #[serde(rename_all = "snake_case")] pub struct ExternalIp { pub ip: IpAddr, From 019708fb02ec5134096cf8702f13515f6461e625 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 29 Dec 2023 15:47:29 +0000 Subject: [PATCH 27/56] Tests: one more final --- nexus/tests/integration_tests/external_ips.rs | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/nexus/tests/integration_tests/external_ips.rs b/nexus/tests/integration_tests/external_ips.rs index ec71e377fb..102c281de3 100644 --- a/nexus/tests/integration_tests/external_ips.rs +++ b/nexus/tests/integration_tests/external_ips.rs @@ -727,6 +727,88 @@ async fn test_external_ip_attach_detach_fail_if_in_use_by_other( ); } +#[nexus_test] +async fn test_external_ip_attach_fails_after_maximum( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + + populate_ip_pool(&client, "default", None).await; + let project = create_project(client, PROJECT_NAME).await; + + // 
Create 33 floating IPs, and bind the first 32 to an instance. + let mut fip_names = vec![]; + for i in 0..33 { + let fip_name = format!("fip-{i}"); + create_floating_ip( + client, + &fip_name, + project.identity.name.as_str(), + None, + None, + ) + .await; + fip_names.push(fip_name); + } + + let fip_name_slice = + fip_names.iter().map(String::as_str).collect::<Vec<_>>(); + let instance_name = INSTANCE_NAMES[0]; + instance_for_external_ips( + client, + instance_name, + true, + false, + &fip_name_slice[..32], + ) + .await; + + // Attempt to attach the final FIP should fail. + let url = attach_instance_external_ip_url(instance_name, PROJECT_NAME); + let error: HttpErrorResponseBody = NexusRequest::new( + RequestBuilder::new(client, Method::POST, &url) + .body(Some(&params::ExternalIpCreate::Floating { + floating_ip_name: fip_name_slice + .last() + .unwrap() + .parse() + .unwrap(), + })) + .expect_status(Some(StatusCode::BAD_REQUEST)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap() + .parsed_body() + .unwrap(); + assert_eq!( + error.message, + "an instance may not have more than 32 external IP addresses" + .to_string() + ); + + // Attempt to attach an ephemeral IP should fail. + let error: HttpErrorResponseBody = NexusRequest::new( + RequestBuilder::new(client, Method::POST, &url) + .body(Some(&params::ExternalIpCreate::Ephemeral { + pool_name: None, + })) + .expect_status(Some(StatusCode::BAD_REQUEST)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap() + .parsed_body() + .unwrap(); + assert_eq!( + error.message, + "an instance may not have more than 32 external IP addresses" + .to_string() + ); +} + pub async fn floating_ip_get( client: &ClientTestContext, fip_url: &str, From e5f549aa536b57d6fda99b6121628ac65633bbfc Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 29 Dec 2023 18:19:46 +0000 Subject: [PATCH 28/56] Partial self-review, skipping the tricky bits.
--- illumos-utils/src/opte/port_manager.rs | 6 ++- nexus/db-model/src/schema.rs | 2 - .../src/db/datastore/external_ip.rs | 6 +-- .../db-queries/src/db/queries/external_ip.rs | 10 ++--- nexus/src/app/instance.rs | 4 +- nexus/src/app/instance_network.rs | 19 +++++--- nexus/src/app/sagas/instance_common.rs | 45 ++++++++----------- nexus/src/app/sagas/instance_create.rs | 9 ++-- nexus/src/app/sagas/instance_ip_attach.rs | 4 +- nexus/src/app/sagas/instance_ip_detach.rs | 4 +- schema/crdb/22.0.0/up03.sql | 2 +- schema/crdb/22.0.0/up04.sql | 2 +- schema/crdb/22.0.0/up07.sql | 2 +- schema/crdb/dbinit.sql | 8 ++-- 14 files changed, 59 insertions(+), 64 deletions(-) diff --git a/illumos-utils/src/opte/port_manager.rs b/illumos-utils/src/opte/port_manager.rs index ef848dcbf1..c2b753d762 100644 --- a/illumos-utils/src/opte/port_manager.rs +++ b/illumos-utils/src/opte/port_manager.rs @@ -417,8 +417,7 @@ impl PortManager { Error::ExternalIpUpdateMissingPort(nic_id, nic_kind) })?; - // TODO: massively cleanup. - // Describe the external IP addresses for this port. + // XXX: duplicates parts of macro logic in `create_port`. macro_rules! ext_ip_cfg { ($ip:expr, $log_prefix:literal, $ip_t:path, $cidr_t:path, $ipcfg_e:path, $ipcfg_t:ident, $snat_t:ident) => {{ @@ -473,6 +472,9 @@ impl PortManager { }} } + // TODO-completeness: support dual-stack. We'll need to explicitly store + // a v4 and a v6 ephemeral IP + SNat + gateway + ... in `InstanceInner` + // to have enough info to build both. 
let mut v4_cfg = None; let mut v6_cfg = None; match port.gateway().ip { diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index ba12c9d041..7af74036b2 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1406,8 +1406,6 @@ allow_tables_to_appear_in_same_query!( allow_tables_to_appear_in_same_query!(dns_zone, dns_version, dns_name); allow_tables_to_appear_in_same_query!(external_ip, service); -allow_tables_to_appear_in_same_query!(external_ip, instance); -joinable!(external_ip -> instance (parent_id)); allow_tables_to_appear_in_same_query!( switch_port, diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 5628a307bf..8aff01642f 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -242,10 +242,7 @@ impl DataStore { .ok_or_else(|| Error::internal_error("hmm"))?; Ok((eip, false)) } - Ok(Some((_, eip))) => { - eprintln!(""); - Ok((eip, true)) - } + Ok(Some((_, eip))) => Ok((eip, true)), } } @@ -621,6 +618,7 @@ impl DataStore { .filter(dsl::parent_id.eq(instance_id)) .filter(dsl::kind.eq(IpKind::Floating)) .set(( + dsl::time_modified.eq(Utc::now()), dsl::parent_id.eq(Option::::None), dsl::state.eq(IpAttachState::Detached), )) diff --git a/nexus/db-queries/src/db/queries/external_ip.rs b/nexus/db-queries/src/db/queries/external_ip.rs index ad03053137..aee0d7a919 100644 --- a/nexus/db-queries/src/db/queries/external_ip.rs +++ b/nexus/db-queries/src/db/queries/external_ip.rs @@ -36,9 +36,6 @@ use uuid::Uuid; // Broadly, we want users to be able to attach/detach at will // once an instance is created and functional. -// If we're in a state which will naturally resolve to either -// stopped/running, we want users to know that the request can be -// retried safely. 
pub const SAFE_TO_ATTACH_INSTANCE_STATES_CREATING: [DbInstanceState; 3] = [ DbInstanceState(ApiInstanceState::Stopped), DbInstanceState(ApiInstanceState::Running), @@ -48,8 +45,11 @@ pub const SAFE_TO_ATTACH_INSTANCE_STATES: [DbInstanceState; 2] = [ DbInstanceState(ApiInstanceState::Stopped), DbInstanceState(ApiInstanceState::Running), ]; -// TODO: Currently stop if there's a migration or other state change. -// This may be a good case for RPWing +// If we're in a state which will naturally resolve to either +// stopped/running, we want users to know that the request can be +// retried safely via Error::unavail. +// TODO: We currently stop if there's a migration or other state change. +// There may be a good case for RPWing // external_ip_state -> { NAT RPW, sled-agent } in future. pub const SAFE_TRANSIENT_INSTANCE_STATES: [DbInstanceState; 5] = [ DbInstanceState(ApiInstanceState::Starting), diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 19e41c9dbd..7e6bc51229 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1917,7 +1917,7 @@ impl super::Nexus { Ok(()) } - /// Attach a disk to an instance. + /// Attach an external IP to an instance. 
pub(crate) async fn instance_attach_external_ip( self: &Arc, opctx: &OpContext, @@ -1972,7 +1972,7 @@ impl super::Nexus { saga_outputs .lookup_node_output::>("output") .map_err(|e| Error::internal_error(&format!("{:#}", &e))) - .internal_context("looking up output from ip attach saga") + .internal_context("looking up output from ip detach saga") } } diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index 663d112673..8d020a9a3b 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -25,7 +25,6 @@ use sled_agent_client::types::DeleteVirtualNetworkInterfaceHost; use sled_agent_client::types::SetVirtualNetworkInterfaceHost; use std::collections::HashSet; use std::str::FromStr; - use uuid::Uuid; impl super::Nexus { @@ -287,9 +286,11 @@ impl super::Nexus { /// - `ip_filter`: An optional filter on the index into the instance's /// external IP array. /// - If this is `Some(id)`, this routine configures DPD state for only the - /// external IP with `id` in the collection returned from CRDB. + /// external IP with `id` in the collection returned from CRDB. This will + /// proceed even when the target IP is 'attaching'. /// - If this is `None`, this routine configures DPD for all external - /// IPs. + /// IPs and *will back out* if any IPs are not yet fully attached to + /// the instance. pub(crate) async fn instance_ensure_dpd_config( &self, opctx: &OpContext, @@ -359,9 +360,11 @@ impl super::Nexus { // This is performed so that an IP attach/detach will block the // instance_start saga. Return service unavailable to indicate // the request is retryable. 
- if ips_of_interest.iter().any(|ip| { - must_all_be_attached && ip.state != IpAttachState::Attached - }) { + if must_all_be_attached + && ips_of_interest + .iter() + .any(|ip| ip.state != IpAttachState::Attached) + { return Err(Error::unavail( "cannot push all DPD state: IP attach/detach in progress", )); @@ -446,6 +449,10 @@ impl super::Nexus { /// Attempts to delete all of the Dendrite NAT configuration for the /// instance identified by `authz_instance`. /// + /// Unlike `instance_ensure_dpd_config`, this function will disregard the + /// attachment states of any external IPs because likely callers (instance + /// delete) cannot be piecewise undone. + /// /// # Return value /// /// - `Ok(())` if all NAT entries were successfully deleted. diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index 6fedd8d775..8118b9df8a 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -144,13 +144,6 @@ pub(super) async fn allocate_sled_ipv6( .map_err(ActionError::action_failed) } -/// Instance state needed for IP attach/detachment. -#[derive(Debug, Deserialize, Serialize)] -pub struct InstanceStateForIp { - pub sled_id: Option, - pub state: InstanceState, -} - /// External IP state needed for IP attach/detachment. /// /// This holds a record of the mid-processing external IP, where possible. @@ -210,7 +203,7 @@ pub async fn instance_ip_get_instance_state( serialized_authn: &authn::saga::Serialized, authz_instance: &authz::Instance, verb: &str, -) -> Result { +) -> Result, ActionError> { // XXX: we can get instance state (but not sled ID) in same transaction // as attach (but not detach) wth current design. We need to re-query // for sled ID anyhow, so keep consistent between attach/detach. 
@@ -240,14 +233,13 @@ pub async fn instance_ip_get_instance_state( // - deleting: can only be called from stopped -- we won't push to dpd // or sled-agent, and IP record might be deleted or forcibly // detached. Catch here just in case. - let state = match found_state { + match found_state { InstanceState::Stopped | InstanceState::Starting | InstanceState::Stopping => { sled_id = None; - InstanceState::Stopped } - InstanceState::Running => InstanceState::Running, + InstanceState::Running => {} state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state.into()) => { return Err(ActionError::action_failed(Error::unavail(&format!( "can't {verb} in transient state {state}" @@ -259,15 +251,15 @@ pub async fn instance_ip_get_instance_state( &authz_instance.id(), ))) } - // Final cases are rebooting/failed. + // Final cases are repairing/failed. _ => { return Err(ActionError::action_failed(Error::invalid_request( "cannot modify instance IPs, instance is in unhealthy state", ))) } - }; + } - Ok(InstanceStateForIp { sled_id, state }) + Ok(sled_id) } pub async fn instance_ip_add_nat( @@ -281,8 +273,7 @@ pub async fn instance_ip_add_nat( crate::context::op_context_for_saga_action(&sagactx, serialized_authn); // No physical sled? Don't push NAT. - let Some(sled_uuid) = - sagactx.lookup::("instance_state")?.sled_id + let Some(sled_uuid) = sagactx.lookup::>("instance_state")? else { return Ok(()); }; @@ -329,9 +320,7 @@ pub async fn instance_ip_remove_nat( crate::context::op_context_for_saga_action(&sagactx, serialized_authn); // No physical sled? Don't push NAT. - let Some(_) = - sagactx.lookup::("instance_state")?.sled_id - else { + let Some(_) = sagactx.lookup::>("instance_state")? else { return Ok(()); }; @@ -361,8 +350,7 @@ pub async fn instance_ip_add_opte( let osagactx = sagactx.user_data(); // No physical sled? Don't inform OPTE. - let Some(sled_uuid) = - sagactx.lookup::("instance_state")?.sled_id + let Some(sled_uuid) = sagactx.lookup::>("instance_state")? 
else { return Ok(()); }; @@ -386,7 +374,7 @@ pub async fn instance_ip_add_opte( .await .map_err(|_| { ActionError::action_failed(Error::unavail( - "sled agent client went away mid-attach", + "sled agent client went away mid-attach/detach", )) })? .instance_put_external_ip(&authz_instance.id(), &sled_agent_body) @@ -394,7 +382,9 @@ pub async fn instance_ip_add_opte( .map_err(|e| { ActionError::action_failed(match e { progenitor_client::Error::CommunicationError(_) => { - Error::unavail("sled agent client went away mid-attach") + Error::unavail( + "sled agent client went away mid-attach/detach", + ) } e => Error::internal_error(&format!("{e}")), }) @@ -410,8 +400,7 @@ pub async fn instance_ip_remove_opte( let osagactx = sagactx.user_data(); // No physical sled? Don't inform OPTE. - let Some(sled_uuid) = - sagactx.lookup::("instance_state")?.sled_id + let Some(sled_uuid) = sagactx.lookup::>("instance_state")? else { return Ok(()); }; @@ -435,7 +424,7 @@ pub async fn instance_ip_remove_opte( .await .map_err(|_| { ActionError::action_failed(Error::unavail( - "sled agent client went away mid-attach", + "sled agent client went away mid-attach/detach", )) })? 
.instance_delete_external_ip(&authz_instance.id(), &sled_agent_body) @@ -443,7 +432,9 @@ pub async fn instance_ip_remove_opte( .map_err(|e| { ActionError::action_failed(match e { progenitor_client::Error::CommunicationError(_) => { - Error::unavail("sled agent client went away mid-attach") + Error::unavail( + "sled agent client went away mid-attach/detach", + ) } e => Error::internal_error(&format!("{e}")), }) diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 53169308c7..7028e620b1 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -618,10 +618,9 @@ async fn sic_allocate_instance_external_ip( // We perform the 'complete_op' in this saga stage because our IPs are // created in the attaching state, and we need to move them to attached. // We *can* do so because the `creating` state will block the IP attach/detach - // sagas from running, so we can safely undo without worrying they have been - // detached by another API call. + // sagas from running, so we can safely undo in event of later error in this saga + // without worrying they have been detached by another API call. // Runtime state should never be able to make 'complete_op' fallible. - let ip = match ip_params { // Allocate a new IP address from the target, possibly default, pool params::ExternalIpCreate::Ephemeral { ref pool_name } => { @@ -659,7 +658,7 @@ async fn sic_allocate_instance_external_ip( }; // Ignore row count here, this is infallible with correct - // (state, state', kind) but may be zero on repeat call for + // (state, state', kind) but may be zero on repeat call for // idempotency. 
_ = datastore .external_ip_complete_op( @@ -688,7 +687,7 @@ async fn sic_allocate_instance_external_ip_undo( &saga_params.serialized_authn, ); - // We store and lookup `ExternalIp` so that we can do the detach + // We store and lookup `ExternalIp` so that we can detach // and/or deallocate without double name resolution. let new_ip = sagactx .lookup::>(&format!("external-ip-{ip_index}"))?; diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 872d92eabf..6b96db6e33 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -5,7 +5,7 @@ use super::instance_common::{ instance_ip_add_nat, instance_ip_add_opte, instance_ip_get_instance_state, instance_ip_move_state, instance_ip_remove_nat, instance_ip_remove_opte, - InstanceStateForIp, ModifyStateForExternalIp, + ModifyStateForExternalIp, }; use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; @@ -159,7 +159,7 @@ async fn siia_begin_attach_ip_undo( async fn siia_get_instance_state( sagactx: NexusActionContext, -) -> Result { +) -> Result, ActionError> { let params = sagactx.saga_params::()?; instance_ip_get_instance_state( &sagactx, diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index d460075f95..59d69f65a3 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -5,7 +5,7 @@ use super::instance_common::{ instance_ip_add_nat, instance_ip_add_opte, instance_ip_get_instance_state, instance_ip_move_state, instance_ip_remove_nat, instance_ip_remove_opte, - InstanceStateForIp, ModifyStateForExternalIp, + ModifyStateForExternalIp, }; use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; @@ -150,7 +150,7 @@ async fn siid_begin_detach_ip_undo( async fn siid_get_instance_state( sagactx: NexusActionContext, -) -> Result { +) -> 
Result, ActionError> { let params = sagactx.saga_params::()?; instance_ip_get_instance_state( &sagactx, diff --git a/schema/crdb/22.0.0/up03.sql b/schema/crdb/22.0.0/up03.sql index 7d6a62e4c0..ea1d461250 100644 --- a/schema/crdb/22.0.0/up03.sql +++ b/schema/crdb/22.0.0/up03.sql @@ -4,4 +4,4 @@ set UPDATE omicron.public.external_ip SET state = 'detached' -WHERE parent_id IS NULL; \ No newline at end of file +WHERE parent_id IS NULL; diff --git a/schema/crdb/22.0.0/up04.sql b/schema/crdb/22.0.0/up04.sql index e93e151202..7bf89d6626 100644 --- a/schema/crdb/22.0.0/up04.sql +++ b/schema/crdb/22.0.0/up04.sql @@ -4,4 +4,4 @@ set UPDATE omicron.public.external_ip SET state = 'attached' -WHERE parent_id IS NOT NULL; \ No newline at end of file +WHERE parent_id IS NOT NULL; diff --git a/schema/crdb/22.0.0/up07.sql b/schema/crdb/22.0.0/up07.sql index b4acedaf60..00f9310c2e 100644 --- a/schema/crdb/22.0.0/up07.sql +++ b/schema/crdb/22.0.0/up07.sql @@ -1,4 +1,4 @@ CREATE UNIQUE INDEX IF NOT EXISTS one_ephemeral_ip_per_instance ON omicron.public.external_ip ( parent_id ) - WHERE kind = 'ephemeral' AND parent_id IS NOT NULL AND time_deleted IS NULL; \ No newline at end of file + WHERE kind = 'ephemeral' AND parent_id IS NOT NULL AND time_deleted IS NULL; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index a372403b28..0e38859251 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1734,9 +1734,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.external_ip ( /* * Only nullable if this is a floating/ephemeral IP, which may exist not - * attached to any instance or service yet. Ephemeral IPs should not exist - * without parent instances/services, but need to temporarily exist in this - * state for live attachment. + * attached to any instance or service yet. Ephemeral IPs should not generally + * exist without parent instances/services, but need to temporarily exist in + * this state for live attachment. 
*/ CONSTRAINT null_snat_parent_id CHECK ( (kind != 'snat') OR (parent_id IS NOT NULL) @@ -1749,7 +1749,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.external_ip ( /* * (Not detached) => non-null parent_id. - * This is not a two-way implication because SNAT/Ephemeral IPs + * This is not a two-way implication because SNAT IPs * cannot have a null parent_id. */ CONSTRAINT detached_null_parent_id CHECK ( From 14bb57f1040fd16e7bef55c2df509f76ebf3071f Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 3 Jan 2024 11:09:15 +0000 Subject: [PATCH 29/56] Allow idempotent ephemeral IP attach if pool empty --- .../src/db/datastore/external_ip.rs | 16 ++++- nexus/tests/integration_tests/external_ips.rs | 69 +++++++++++++++++++ sled-agent/src/sim/sled_agent.rs | 6 +- 3 files changed, 87 insertions(+), 4 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 8aff01642f..97091d3c71 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -136,7 +136,21 @@ impl DataStore { let pool_id = pool.identity.id; let data = IncompleteExternalIp::for_ephemeral(ip_id, pool_id); - let temp_ip = self.allocate_external_ip(opctx, data).await?; + + // We might not be able to acquire a new IP, but in the event of an + // idempotent or double attach this failure is allowed. + let temp_ip = self.allocate_external_ip(opctx, data).await; + if let Err(e) = temp_ip { + let eip = self + .instance_lookup_external_ips(opctx, instance_id) + .await? + .into_iter() + .find(|v| v.kind == IpKind::Ephemeral) + .ok_or(e)?; + + return Ok((eip, false)); + } + let temp_ip = temp_ip?; let safe_states = if creating_instance { &SAFE_TO_ATTACH_INSTANCE_STATES_CREATING[..] 
diff --git a/nexus/tests/integration_tests/external_ips.rs b/nexus/tests/integration_tests/external_ips.rs index 102c281de3..e360329cc3 100644 --- a/nexus/tests/integration_tests/external_ips.rs +++ b/nexus/tests/integration_tests/external_ips.rs @@ -31,6 +31,7 @@ use omicron_common::address::IpRange; use omicron_common::address::Ipv4Range; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::Instance; +use omicron_common::api::external::Name; use omicron_common::api::external::NameOrId; use uuid::Uuid; @@ -809,6 +810,74 @@ async fn test_external_ip_attach_fails_after_maximum( ); } +#[nexus_test] +async fn test_external_ip_attach_ephemeral_at_pool_exhaustion( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + + populate_ip_pool(&client, "default", None).await; + let other_pool_range = IpRange::V4( + Ipv4Range::new(Ipv4Addr::new(10, 1, 0, 1), Ipv4Addr::new(10, 1, 0, 1)) + .unwrap(), + ); + create_ip_pool(&client, "other-pool", Some(other_pool_range), None).await; + + create_project(client, PROJECT_NAME).await; + + // Create two instances, to which we will later add eph IPs from 'other-pool'. + for name in &INSTANCE_NAMES[..2] { + instance_for_external_ips(client, name, false, false, &[]).await; + } + + let pool_name: Name = "other-pool".parse().unwrap(); + + // Attach a new EIP from other-pool to both instances. + // This should succeed for the first, and fail for the second + // due to pool exhaustion. 
+ let eph_resp = external_ip_attach( + client, + INSTANCE_NAMES[0], + &params::ExternalIpCreate::Ephemeral { + pool_name: Some(pool_name.clone()), + }, + ) + .await; + assert_eq!(eph_resp.ip, other_pool_range.first_address()); + assert_eq!(eph_resp.ip, other_pool_range.last_address()); + + let url = attach_instance_external_ip_url(INSTANCE_NAMES[1], PROJECT_NAME); + let error: HttpErrorResponseBody = NexusRequest::new( + RequestBuilder::new(client, Method::POST, &url) + .body(Some(&params::ExternalIpCreate::Ephemeral { + pool_name: Some(pool_name.clone()), + })) + .expect_status(Some(StatusCode::INSUFFICIENT_STORAGE)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap() + .parsed_body() + .unwrap(); + assert_eq!( + error.message, + "Insufficient capacity: No external IP addresses available".to_string() + ); + + // Idempotent re-add to the first instance should succeed even if + // an internal attempt to alloc a new EIP would fail. + let eph_resp_2 = external_ip_attach( + client, + INSTANCE_NAMES[0], + &params::ExternalIpCreate::Ephemeral { + pool_name: Some(pool_name.clone()), + }, + ) + .await; + assert_eq!(eph_resp_2, eph_resp); +} + pub async fn floating_ip_get( client: &ClientTestContext, fip_url: &str, diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 21a9d39220..d58fec384b 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -630,7 +630,7 @@ impl SledAgent { ) -> Result<(), Error> { if !self.instances.contains_key(&instance_id).await { return Err(Error::internal_error( - "can't alter IP state for nonexistant instance", + "can't alter IP state for nonexistent instance", )); } @@ -647,7 +647,7 @@ impl SledAgent { false } }) { - return Err(Error::invalid_request("cannot replace exisitng ephemeral IP without explicit removal")); + return Err(Error::invalid_request("cannot replace existing ephemeral IP without explicit removal")); } } @@ -663,7 +663,7 @@ impl SledAgent { ) ->
Result<(), Error> { if !self.instances.contains_key(&instance_id).await { return Err(Error::internal_error( - "can't alter IP state for nonexistant instance", + "can't alter IP state for nonexistent instance", )); } From 03929310ebd999734df3067f19e04be4829eec12 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 3 Jan 2024 12:22:24 +0000 Subject: [PATCH 30/56] Minor comment typo. --- nexus/src/app/sagas/instance_ip_attach.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 6b96db6e33..3ba6a5c2c3 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -23,7 +23,7 @@ use uuid::Uuid; // The IP attach/detach sagas do some resource locking -- because we // allow them to be called in [Running, Stopped], they must contend // with each other/themselves, instance start, instance delete, and -// the instance stop action (noting the latter is not a saga. +// the instance stop action (noting the latter is not a saga). // // The main means of access control here is an external IP's `state`. 
// Entering either saga begins with an atomic swap from Attached/Detached From 239fb7094f478c01421b72a07b3feb5cd7f70bbb Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 3 Jan 2024 12:33:33 +0000 Subject: [PATCH 31/56] Minor fn breakout before larger refactor --- .../src/db/datastore/external_ip.rs | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 97091d3c71..94cff2c4f8 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -142,10 +142,8 @@ impl DataStore { let temp_ip = self.allocate_external_ip(opctx, data).await; if let Err(e) = temp_ip { let eip = self - .instance_lookup_external_ips(opctx, instance_id) + .instance_lookup_ephemeral_ip(opctx, instance_id) .await? - .into_iter() - .find(|v| v.kind == IpKind::Ephemeral) .ok_or(e)?; return Ok((eip, false)); @@ -249,11 +247,11 @@ impl DataStore { Ok(None) => { self.deallocate_external_ip(opctx, temp_ip.id).await?; let eip = self - .instance_lookup_external_ips(opctx, instance_id) + .instance_lookup_ephemeral_ip(opctx, instance_id) .await? - .into_iter() - .find(|v| v.kind == IpKind::Ephemeral) - .ok_or_else(|| Error::internal_error("hmm"))?; + .ok_or_else(|| Error::internal_error( + "failed to lookup current ephemeral IP for idempotent attach" + ))?; Ok((eip, false)) } Ok(Some((_, eip))) => Ok((eip, true)), @@ -659,6 +657,20 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + /// Fetch the ephmeral IP address assigned to the provided instance, if this + /// has been configured. + pub async fn instance_lookup_ephemeral_ip( + &self, + opctx: &OpContext, + instance_id: Uuid, + ) -> LookupResult> { + Ok(self + .instance_lookup_external_ips(opctx, instance_id) + .await? 
+ .into_iter() + .find(|v| v.kind == IpKind::Ephemeral)) + } + /// Fetch all Floating IP addresses for the provided project. pub async fn floating_ips_list( &self, From 54c969a17b620b98187a9ec31379d040f2b6f7d1 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 3 Jan 2024 13:52:29 +0000 Subject: [PATCH 32/56] Factor out Floating+Ephemeral attach logic --- .../src/db/datastore/external_ip.rs | 349 ++++++++---------- 1 file changed, 162 insertions(+), 187 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 94cff2c4f8..9af2e85dc7 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -93,18 +93,10 @@ impl DataStore { pool_name: Option, creating_instance: bool, ) -> CreateResult<(ExternalIp, bool)> { - use db::schema::external_ip::dsl; - use db::schema::external_ip::table; - use db::schema::instance::dsl as inst_dsl; - use db::schema::instance::table as inst_table; - use diesel::result::DatabaseErrorKind::UniqueViolation; - // This is slightly hacky: we need to create an unbound ephemeral IP, and // then attempt to bind it to respect two separate constraints: // - At most one Ephemeral IP per instance // - At most MAX external IPs per instance - // We already catch and convert a UniqueViolation on ephemeral IPs: - // if we see this occur, then // Naturally, we now *need* to destroy the ephemeral IP if the newly alloc'd // IP was not attached, including on idempotent success. let pool = match pool_name { @@ -150,100 +142,22 @@ impl DataStore { } let temp_ip = temp_ip?; - let safe_states = if creating_instance { - &SAFE_TO_ATTACH_INSTANCE_STATES_CREATING[..] - } else { - &SAFE_TO_ATTACH_INSTANCE_STATES[..] 
- }; - - let query = Instance::attach_resource( - instance_id, - temp_ip.id, - inst_table.into_boxed().filter(inst_dsl::state.eq_any(safe_states)), - table - .into_boxed() - .filter(dsl::state.eq(IpAttachState::Detached)) - .filter(dsl::kind.eq(IpKind::Ephemeral)), - MAX_EXTERNAL_IPS_PLUS_SNAT, - diesel::update(dsl::external_ip).set(( - dsl::parent_id.eq(Some(instance_id)), - dsl::time_modified.eq(Utc::now()), - dsl::state.eq(IpAttachState::Attaching), - )), - ); - - let result = query.attach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) - .await - .map(Some) - .or_else(|e: AttachError| match e { - AttachError::CollectionNotFound => { - Err(Error::not_found_by_id( - ResourceType::Instance, - &instance_id, - )) - }, - AttachError::ResourceNotFound => { - Err(Error::internal_error("call-scoped ephemeral IP was lost")) - }, - AttachError::NoUpdate { attached_count, resource, collection } => { - match resource.state { - // Idempotent errors: is in progress forsame resource pair -- this is fine. - // Double attach can be hit by, e.g., repeated call during instance create. - IpAttachState::Attaching - | IpAttachState::Attached - if resource.parent_id == Some(instance_id) => - return Ok(Some((collection, resource))), - IpAttachState::Attached => return Err(Error::invalid_request( - "floating IP cannot be attached to one \ - instance while still attached to another" - )), - // User can reattempt depending on how the current saga unfolds. 
- IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::unavail( - "tried to attach floating IP mid-attach/detach" - )), - - IpAttachState::Detached => {}, - } - - Err(match &collection.runtime_state.nexus_state { - state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail( - "tried to attach floating IP while instance was changing state" - ), - state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { - if attached_count >= MAX_EXTERNAL_IPS_PLUS_SNAT as i64 { - Error::invalid_request(&format!( - "an instance may not have more than {} external IP addresses", - MAX_EXTERNAL_IPS_PER_INSTANCE, - )) - } else { - eprintln!("{resource:?}, {collection:?}"); - Error::internal_error("failed to attach ephemeral IP") - } - }, - state => Error::invalid_request(&format!("cannot attach floating IP to instance in {state} state")), - }) - }, - // This case occurs for both currently attaching and attached IPs: - AttachError::DatabaseError(diesel::result::Error::DatabaseError(UniqueViolation, ..)) => { - Ok(None) - }, - AttachError::DatabaseError(e) => { - Err(public_error_from_diesel(e, ErrorHandler::Server)) - }, - }); - - // if completed (!do_saga), we need to attempt + let result = self + .begin_attach_ip( + opctx, + temp_ip.id, + instance_id, + IpKind::Ephemeral, + creating_instance, + ) + .await; match result { Err(e) => { self.deallocate_external_ip(opctx, temp_ip.id).await?; Err(e) } - // Idempotent cases: - Ok(Some((_, eip))) if eip.id != temp_ip.id => { - self.deallocate_external_ip(opctx, temp_ip.id).await?; - Ok((eip, true)) - } + // Idempotent case: attach failed due to a caught UniqueViolation. 
Ok(None) => { self.deallocate_external_ip(opctx, temp_ip.id).await?; let eip = self @@ -252,9 +166,10 @@ impl DataStore { .ok_or_else(|| Error::internal_error( "failed to lookup current ephemeral IP for idempotent attach" ))?; - Ok((eip, false)) + let do_saga = eip.state != IpAttachState::Attached; + Ok((eip, do_saga)) } - Ok(Some((_, eip))) => Ok((eip, true)), + Ok(Some(v)) => Ok(v), } } @@ -469,6 +384,142 @@ impl DataStore { self.allocate_external_ip(opctx, data).await } + /// Attempt to move a target external IP from detached to attaching, + /// checking that its parent instance does not have too many addresses + /// and is in a valid state. + /// + /// Returns the `ExternalIp` which was modified, where possible. This + /// is only nullable when trying to double-attach ephemeral IPs. + /// To better handle idempotent attachment, this method returns an + /// additional bool: + /// - true: EIP was detached or attaching. proceed with saga. + /// - false: EIP was attached. No-op for remainder of saga. + async fn begin_attach_ip( + &self, + opctx: &OpContext, + ip_id: Uuid, + instance_id: Uuid, + kind: IpKind, + creating_instance: bool, + ) -> Result, Error> { + use db::schema::external_ip::dsl; + use db::schema::external_ip::table; + use db::schema::instance::dsl as inst_dsl; + use db::schema::instance::table as inst_table; + use diesel::result::DatabaseErrorKind::UniqueViolation; + use diesel::result::Error::DatabaseError; + + let label = match kind { + IpKind::Floating => "floating", + IpKind::Ephemeral => "ephemeral", + IpKind::SNat => { + return Err(Error::internal_error( + "cannot dynamically attach SNAT IP to instance", + )) + } + }; + + let safe_states = if creating_instance { + &SAFE_TO_ATTACH_INSTANCE_STATES_CREATING[..] + } else { + &SAFE_TO_ATTACH_INSTANCE_STATES[..] 
+ }; + + let query = Instance::attach_resource( + instance_id, + ip_id, + inst_table.into_boxed().filter(inst_dsl::state.eq_any(safe_states)), + table + .into_boxed() + .filter(dsl::state.eq(IpAttachState::Detached)) + .filter(dsl::kind.eq(kind)) + .filter(dsl::parent_id.is_null()), + MAX_EXTERNAL_IPS_PLUS_SNAT, + diesel::update(dsl::external_ip).set(( + dsl::parent_id.eq(Some(instance_id)), + dsl::time_modified.eq(Utc::now()), + dsl::state.eq(IpAttachState::Attaching), + )), + ); + + let mut do_saga = true; + query.attach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) + .await + .map(|(_, resource)| Some(resource)) + .or_else(|e: AttachError| match e { + AttachError::CollectionNotFound => { + Err(Error::not_found_by_id( + ResourceType::Instance, + &instance_id, + )) + }, + AttachError::ResourceNotFound => { + Err(if kind == IpKind::Ephemeral { + Error::internal_error("call-scoped ephemeral IP was lost") + } else { + Error::not_found_by_id( + ResourceType::FloatingIp, + &ip_id, + ) + }) + }, + AttachError::NoUpdate { attached_count, resource, collection } => { + match resource.state { + // Idempotent errors: is in progress or complete for same resource pair -- this is fine. + IpAttachState::Attaching if resource.parent_id == Some(instance_id) => + return Ok(Some(resource)), + IpAttachState::Attached if resource.parent_id == Some(instance_id) => { + do_saga = false; + return Ok(Some(resource)) + }, + IpAttachState::Attached => + return Err(Error::invalid_request(&format!( + "{label} IP cannot be attached to one \ + instance while still attached to another" + ))), + // User can reattempt depending on how the current saga unfolds. + // NB; only floating IP can return this case, eph will return + // a UniqueViolation. 
+ IpAttachState::Attaching | IpAttachState::Detaching + => return Err(Error::unavail(&format!( + "tried to attach {label} IP mid-attach/detach" + ))), + + IpAttachState::Detached => {}, + } + + Err(match &collection.runtime_state.nexus_state { + state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) + => Error::unavail(&format!( + "tried to attach {label} IP while instance was changing state" + )), + state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { + if attached_count >= MAX_EXTERNAL_IPS_PLUS_SNAT as i64 { + Error::invalid_request(&format!( + "an instance may not have more than {} external IP addresses", + MAX_EXTERNAL_IPS_PER_INSTANCE, + )) + } else { + Error::internal_error(&format!("failed to attach {label} IP")) + } + }, + state => Error::invalid_request(&format!( + "cannot attach {label} IP to instance in {state} state" + )), + }) + }, + // This case occurs for both currently attaching and attached ephemeral IPs: + AttachError::DatabaseError(DatabaseError(UniqueViolation, ..)) + if kind == IpKind::Ephemeral => { + Ok(None) + }, + AttachError::DatabaseError(e) => { + Err(public_error_from_diesel(e, ErrorHandler::Server)) + }, + }) + .map(|eip| eip.map(|v| (v, do_saga))) + } + /// Deallocate the external IP address with the provided ID. 
/// /// To support idempotency, such as in saga operations, this method returns @@ -755,11 +806,6 @@ impl DataStore { instance_id: Uuid, creating_instance: bool, ) -> UpdateResult<(ExternalIp, bool)> { - use db::schema::external_ip::dsl; - use db::schema::external_ip::table; - use db::schema::instance::dsl as inst_dsl; - use db::schema::instance::table as inst_table; - let (.., authz_instance, _db_instance) = LookupPath::new(&opctx, self) .instance_id(instance_id) .fetch_for(authz::Action::Modify) @@ -768,92 +814,21 @@ impl DataStore { opctx.authorize(authz::Action::Modify, authz_fip).await?; opctx.authorize(authz::Action::Modify, &authz_instance).await?; - let fip_id = authz_fip.id(); - - let safe_states = if creating_instance { - &SAFE_TO_ATTACH_INSTANCE_STATES_CREATING[..] - } else { - &SAFE_TO_ATTACH_INSTANCE_STATES[..] - }; - - let query = Instance::attach_resource( + self.begin_attach_ip( + opctx, + authz_fip.id(), instance_id, - fip_id, - inst_table.into_boxed().filter(inst_dsl::state.eq_any(safe_states)), - table - .into_boxed() - .filter(dsl::state.eq(IpAttachState::Detached)) - .filter(dsl::kind.eq(IpKind::Floating)) - .filter(dsl::parent_id.is_null()), - MAX_EXTERNAL_IPS_PLUS_SNAT, - diesel::update(dsl::external_ip).set(( - dsl::parent_id.eq(Some(instance_id)), - dsl::time_modified.eq(Utc::now()), - dsl::state.eq(IpAttachState::Attaching), - )), - ); - - let mut do_saga = true; - let (_, eip) = query.attach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) 
+ IpKind::Floating, + creating_instance, + ) .await - .or_else(|e: AttachError| match e { - AttachError::CollectionNotFound => { - Err(Error::not_found_by_id( - ResourceType::Instance, - &instance_id, - )) - }, - AttachError::ResourceNotFound => { - Err(Error::not_found_by_id( - ResourceType::FloatingIp, - &fip_id, - )) - }, - AttachError::NoUpdate { attached_count, resource, collection } => { - match resource.state { - // Idempotent errors: is in progress or complete for same resource pair -- this is fine. - IpAttachState::Attaching if resource.parent_id == Some(instance_id) => return Ok((collection, resource)), - IpAttachState::Attached if resource.parent_id == Some(instance_id) => { - do_saga = false; - return Ok((collection, resource)) - }, - IpAttachState::Attached => return Err(Error::invalid_request( - "floating IP cannot be attached to one \ - instance while still attached to another" - )), - // User can reattempt depending on how the current saga unfolds. - IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::unavail( - "tried to attach floating IP mid-attach/detach" - )), - - IpAttachState::Detached => {}, - } - - Err(match &collection.runtime_state.nexus_state { - state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail( - "tried to attach floating IP while instance was changing state" - ), - state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { - if attached_count >= MAX_EXTERNAL_IPS_PLUS_SNAT as i64 { - Error::invalid_request(&format!( - "an instance may not have more than {} external IP addresses", - MAX_EXTERNAL_IPS_PER_INSTANCE, - )) - } else { - eprintln!("{resource:?}, {collection:?}"); - Error::internal_error("failed to attach floating IP") - } - }, - state => Error::invalid_request(&format!("cannot attach floating IP to instance in {state} state")), - }) - }, - AttachError::DatabaseError(e) => { - Err(public_error_from_diesel(e, ErrorHandler::Server)) - }, - - })?; - - Ok((eip, do_saga)) + 
.and_then(|v| { + v.ok_or_else(|| { + Error::internal_error( + "floating IP should never return `None` from begin_attach", + ) + }) + }) } /// Detaches a Floating IP address from an instance. From 9200b4a78618c5ba9d954e74a9a84e289ab5621a Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 3 Jan 2024 16:46:36 +0000 Subject: [PATCH 33/56] Factor out detach method bodies. --- nexus/db-model/src/external_ip.rs | 10 + .../src/db/datastore/external_ip.rs | 297 ++++++++---------- nexus/src/app/sagas/instance_ip_detach.rs | 13 +- nexus/tests/integration_tests/external_ips.rs | 2 +- 4 files changed, 152 insertions(+), 170 deletions(-) diff --git a/nexus/db-model/src/external_ip.rs b/nexus/db-model/src/external_ip.rs index d762d0bb4a..62fa6393da 100644 --- a/nexus/db-model/src/external_ip.rs +++ b/nexus/db-model/src/external_ip.rs @@ -68,6 +68,16 @@ impl std::fmt::Display for IpAttachState { } } +impl std::fmt::Display for IpKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { + IpKind::Floating => "floating", + IpKind::Ephemeral => "ephemeral", + IpKind::SNat => "SNAT", + }) + } +} + /// The main model type for external IP addresses for instances /// and externally-facing services. /// diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 9af2e85dc7..4f7dfaee6f 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -409,16 +409,6 @@ impl DataStore { use diesel::result::DatabaseErrorKind::UniqueViolation; use diesel::result::Error::DatabaseError; - let label = match kind { - IpKind::Floating => "floating", - IpKind::Ephemeral => "ephemeral", - IpKind::SNat => { - return Err(Error::internal_error( - "cannot dynamically attach SNAT IP to instance", - )) - } - }; - let safe_states = if creating_instance { &SAFE_TO_ATTACH_INSTANCE_STATES_CREATING[..] 
} else { @@ -474,7 +464,7 @@ impl DataStore { }, IpAttachState::Attached => return Err(Error::invalid_request(&format!( - "{label} IP cannot be attached to one \ + "{kind} IP cannot be attached to one \ instance while still attached to another" ))), // User can reattempt depending on how the current saga unfolds. @@ -482,7 +472,7 @@ impl DataStore { // a UniqueViolation. IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::unavail(&format!( - "tried to attach {label} IP mid-attach/detach" + "tried to attach {kind} IP mid-attach/detach" ))), IpAttachState::Detached => {}, @@ -491,7 +481,7 @@ impl DataStore { Err(match &collection.runtime_state.nexus_state { state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail(&format!( - "tried to attach {label} IP while instance was changing state" + "tried to attach {kind} IP while instance was changing state" )), state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { if attached_count >= MAX_EXTERNAL_IPS_PLUS_SNAT as i64 { @@ -500,11 +490,11 @@ impl DataStore { MAX_EXTERNAL_IPS_PER_INSTANCE, )) } else { - Error::internal_error(&format!("failed to attach {label} IP")) + Error::internal_error(&format!("failed to attach {kind} IP")) } }, state => Error::invalid_request(&format!( - "cannot attach {label} IP to instance in {state} state" + "cannot attach {kind} IP to instance in {state} state" )), }) }, @@ -520,7 +510,114 @@ impl DataStore { .map(|eip| eip.map(|v| (v, do_saga))) } - /// Deallocate the external IP address with the provided ID. + /// Attempt to move a target external IP from attached to detaching, + /// checking that its parent instance is in a valid state. + /// + /// Returns the `ExternalIp` which was modified, where possible. This + /// is only nullable when trying to double-detach ephemeral IPs. + /// To better handle idempotent attachment, this method returns an + /// additional bool: + /// - true: EIP was detached or attaching. proceed with saga. 
+ /// - false: EIP was attached. No-op for remainder of saga. + async fn begin_detach_ip( + &self, + opctx: &OpContext, + ip_id: Uuid, + instance_id: Uuid, + kind: IpKind, + creating_instance: bool, + ) -> UpdateResult> { + use db::schema::external_ip::dsl; + use db::schema::external_ip::table; + use db::schema::instance::dsl as inst_dsl; + use db::schema::instance::table as inst_table; + + let safe_states = if creating_instance { + &SAFE_TO_ATTACH_INSTANCE_STATES_CREATING[..] + } else { + &SAFE_TO_ATTACH_INSTANCE_STATES[..] + }; + + let query = Instance::detach_resource( + instance_id, + ip_id, + inst_table.into_boxed().filter(inst_dsl::state.eq_any(safe_states)), + table + .into_boxed() + .filter(dsl::state.eq(IpAttachState::Attached)) + .filter(dsl::kind.eq(kind)), + diesel::update(dsl::external_ip).set(( + dsl::time_modified.eq(Utc::now()), + dsl::state.eq(IpAttachState::Detaching), + )), + ); + + let mut do_saga = true; + query.detach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) + .await + .map(Some) + .or_else(|e: DetachError| Err(match e { + DetachError::CollectionNotFound => { + Error::not_found_by_id( + ResourceType::Instance, + &instance_id, + ) + }, + DetachError::ResourceNotFound => { + if kind == IpKind::Ephemeral { + return Ok(None); + } else { + Error::not_found_by_id( + ResourceType::FloatingIp, + &ip_id, + ) + } + }, + DetachError::NoUpdate { resource, collection } => { + let parent_match = resource.parent_id == Some(instance_id); + match resource.state { + // Idempotent cases: already detached OR detaching from same instance. + IpAttachState::Detached => { + do_saga = false; + return Ok(Some(resource)) + }, + IpAttachState::Detaching if parent_match => return Ok(Some(resource)), + IpAttachState::Attached if !parent_match + => return Err(Error::invalid_request(&format!( + "{kind} IP is not attached to the target instance", + ))), + // User can reattempt depending on how the current saga unfolds. 
+ IpAttachState::Attaching + | IpAttachState::Detaching => return Err(Error::unavail(&format!( + "tried to detach {kind} IP mid-attach/detach" + ))), + IpAttachState::Attached => {}, + } + + match collection.runtime_state.nexus_state { + state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail(&format!( + "tried to detach {kind} IP while instance was changing state" + )), + state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { + Error::internal_error(&format!("failed to detach {kind} IP")) + }, + state => Error::invalid_request(&format!( + "cannot detach {kind} IP from instance in {state} state" + )), + } + }, + DetachError::DatabaseError(e) => { + public_error_from_diesel(e, ErrorHandler::Server) + }, + + })) + .map(|eip| eip.map(|v| (v, do_saga))) + } + + /// Deallocate the external IP address with the provided ID. This is a complete + /// removal of the IP entry, in contrast with `begin_deallocate_ephemeral_ip`, + /// and should only be used for SNAT entries or cleanup of short-lived ephemeral + /// IPs on failure. /// /// To support idempotency, such as in saga operations, this method returns /// an extra boolean, rather than the usual `DeleteResult`. 
The meaning of @@ -561,78 +658,20 @@ impl DataStore { ip_id: Uuid, instance_id: Uuid, ) -> Result, Error> { - use db::schema::external_ip::dsl; - use db::schema::external_ip::table; - use db::schema::instance::dsl as inst_dsl; - use db::schema::instance::table as inst_table; - let _ = LookupPath::new(&opctx, self) .instance_id(instance_id) .lookup_for(authz::Action::Modify) .await?; - let query = Instance::detach_resource( - instance_id, + self.begin_detach_ip( + opctx, ip_id, - inst_table - .into_boxed() - .filter(inst_dsl::state.eq_any(SAFE_TO_ATTACH_INSTANCE_STATES)), - table - .into_boxed() - .filter(dsl::state.eq(IpAttachState::Attached)) - .filter(dsl::kind.eq(IpKind::Ephemeral)), - diesel::update(dsl::external_ip).set(( - dsl::time_modified.eq(Utc::now()), - dsl::state.eq(IpAttachState::Detaching), - )), - ); - - let eip = query.detach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) + instance_id, + IpKind::Ephemeral, + false, + ) .await - .map(Some) - .or_else(|e: DetachError| { - Err(match e { - DetachError::CollectionNotFound => { - Error::not_found_by_id( - ResourceType::Instance, - &instance_id, - ) - }, - DetachError::ResourceNotFound => { - return Ok(None); - }, - DetachError::NoUpdate { resource, collection } => { - match resource.state { - // XXX: internal error? - IpAttachState::Attached if resource.parent_id != Some(instance_id) => return Err(Error::internal_error( - "Ephemeral IP is not attached to the target instance", - )), - IpAttachState::Detaching => return Ok(Some(resource)), - // User can reattempt depending on how the current saga unfolds. 
- IpAttachState::Attaching => return Err(Error::unavail( - "tried to detach ephemeral IP mid-attach/detach" - )), - IpAttachState::Attached => {}, - IpAttachState::Detached => return Err(Error::internal_error( - "Ephemeral IP cannot exist in 'detached' state", - )), - } - match collection.runtime_state.nexus_state { - state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail( - "tried to detach ephemeral IP while instance was changing state" - ), - state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { - Error::internal_error("failed to detach ephemeral IP") - }, - state => Error::invalid_request(&format!("cannot attach ephemeral IP to instance in {state} state")), - } - }, - DetachError::DatabaseError(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - }, - })})?; - - Ok(eip) + .map(|res| res.map(|(ip, _do_saga)| ip)) } /// Delete all non-floating IP addresses associated with the provided instance @@ -806,9 +845,9 @@ impl DataStore { instance_id: Uuid, creating_instance: bool, ) -> UpdateResult<(ExternalIp, bool)> { - let (.., authz_instance, _db_instance) = LookupPath::new(&opctx, self) + let (.., authz_instance) = LookupPath::new(&opctx, self) .instance_id(instance_id) - .fetch_for(authz::Action::Modify) + .lookup_for(authz::Action::Modify) .await?; opctx.authorize(authz::Action::Modify, authz_fip).await?; @@ -848,93 +887,29 @@ impl DataStore { instance_id: Uuid, creating_instance: bool, ) -> UpdateResult<(ExternalIp, bool)> { - use db::schema::external_ip::dsl; - use db::schema::external_ip::table; - use db::schema::instance::dsl as inst_dsl; - use db::schema::instance::table as inst_table; - - let (.., authz_instance, _db_instance) = LookupPath::new(&opctx, self) + let (.., authz_instance) = LookupPath::new(&opctx, self) .instance_id(instance_id) - .fetch_for(authz::Action::Modify) + .lookup_for(authz::Action::Modify) .await?; opctx.authorize(authz::Action::Modify, authz_fip).await?; opctx.authorize(authz::Action::Modify, 
&authz_instance).await?; - let fip_id = authz_fip.id(); - - let safe_states = if creating_instance { - &SAFE_TO_ATTACH_INSTANCE_STATES_CREATING[..] - } else { - &SAFE_TO_ATTACH_INSTANCE_STATES[..] - }; - - let query = Instance::detach_resource( + self.begin_detach_ip( + opctx, + authz_fip.id(), instance_id, - fip_id, - inst_table.into_boxed().filter(inst_dsl::state.eq_any(safe_states)), - table - .into_boxed() - .filter(dsl::state.eq(IpAttachState::Attached)) - .filter(dsl::kind.eq(IpKind::Floating)), - diesel::update(dsl::external_ip).set(( - dsl::time_modified.eq(Utc::now()), - dsl::state.eq(IpAttachState::Detaching), - )), - ); - - let mut do_saga = true; - let eip = query.detach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) + IpKind::Floating, + creating_instance, + ) .await - .or_else(|e: DetachError| Err(match e { - DetachError::CollectionNotFound => { - Error::not_found_by_id( - ResourceType::Instance, - &instance_id, - ) - }, - DetachError::ResourceNotFound => { - Error::not_found_by_id( - ResourceType::FloatingIp, - &fip_id, + .and_then(|v| { + v.ok_or_else(|| { + Error::internal_error( + "floating IP should never return `None` from begin_attach", ) - }, - DetachError::NoUpdate { resource, collection } => { - let parent_match = resource.parent_id == Some(instance_id); - match resource.state { - // Idempotent cases: already detached OR detaching from same instance. - IpAttachState::Detached => { - do_saga = false; - return Ok(resource) - }, - IpAttachState::Detaching if parent_match => return Ok(resource), - IpAttachState::Attached if !parent_match => return Err(Error::invalid_request( - "Floating IP is not attached to the target instance", - )), - // User can reattempt depending on how the current saga unfolds. 
- IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::unavail( - "tried to detach floating IP mid-attach/detach" - )), - IpAttachState::Attached => {}, - } - - match collection.runtime_state.nexus_state { - state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail( - "tried to detach floating IP while instance was changing state" - ), - state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { - Error::internal_error("failed to detach floating IP") - }, - state => Error::invalid_request(&format!("cannot detach floating IP to instance in {state} state")), - } - }, - DetachError::DatabaseError(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - }, - - }))?; - - Ok((eip, do_saga)) + }) + }) } /// Move an external IP from a transitional state (attaching, detaching) diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 59d69f65a3..b90d6836cb 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -11,7 +11,7 @@ use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; use crate::external_api::params; -use nexus_db_model::{IpAttachState, IpKind}; +use nexus_db_model::IpAttachState; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; use serde::Deserialize; @@ -71,18 +71,15 @@ async fn siid_begin_detach_ip( match params.delete_params { params::ExternalIpDelete::Ephemeral => { - let eips = datastore - .instance_lookup_external_ips( + let eip = datastore + .instance_lookup_ephemeral_ip( &opctx, params.authz_instance.id(), ) .await .map_err(ActionError::action_failed)?; - // XXX: cleanup. 
- if let Some(eph_ip) = - eips.iter().find(|e| e.kind == IpKind::Ephemeral) - { + if let Some(eph_ip) = eip { datastore .begin_deallocate_ephemeral_ip( &opctx, @@ -276,7 +273,7 @@ pub(crate) mod test { use diesel::{ ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; - use nexus_db_model::{ExternalIp, Name}; + use nexus_db_model::{ExternalIp, IpKind, Name}; use nexus_db_queries::context::OpContext; use nexus_test_utils::resource_helpers::create_instance; use nexus_test_utils_macros::nexus_test; diff --git a/nexus/tests/integration_tests/external_ips.rs b/nexus/tests/integration_tests/external_ips.rs index e360329cc3..9b98b2efab 100644 --- a/nexus/tests/integration_tests/external_ips.rs +++ b/nexus/tests/integration_tests/external_ips.rs @@ -724,7 +724,7 @@ async fn test_external_ip_attach_detach_fail_if_in_use_by_other( .unwrap(); assert_eq!( error.message, - "Floating IP is not attached to the target instance".to_string() + "floating IP is not attached to the target instance".to_string() ); } From 877de915c3363bb2f178eea2d5193fc5b584c6c7 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 3 Jan 2024 17:14:18 +0000 Subject: [PATCH 34/56] Minor cleanup of main datastore changes. 
--- .../src/db/datastore/external_ip.rs | 34 ++++++++----------- .../db-queries/src/db/queries/external_ip.rs | 3 ++ nexus/src/app/mod.rs | 4 ++- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 4f7dfaee6f..e2e911978d 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -26,6 +26,7 @@ use crate::db::model::Name; use crate::db::pagination::paginated; use crate::db::pool::DbConnection; use crate::db::queries::external_ip::NextExternalIp; +use crate::db::queries::external_ip::MAX_EXTERNAL_IPS_PER_INSTANCE; use crate::db::queries::external_ip::SAFE_TO_ATTACH_INSTANCE_STATES; use crate::db::queries::external_ip::SAFE_TO_ATTACH_INSTANCE_STATES_CREATING; use crate::db::queries::external_ip::SAFE_TRANSIENT_INSTANCE_STATES; @@ -51,9 +52,6 @@ use ref_cast::RefCast; use std::net::IpAddr; use uuid::Uuid; -// FIXME: should be exported from a shared location, original lives in -// nexus/app. -const MAX_EXTERNAL_IPS_PER_INSTANCE: u32 = 32; const MAX_EXTERNAL_IPS_PLUS_SNAT: u32 = MAX_EXTERNAL_IPS_PER_INSTANCE + 1; impl DataStore { @@ -142,7 +140,7 @@ impl DataStore { } let temp_ip = temp_ip?; - let result = self + match self .begin_attach_ip( opctx, temp_ip.id, @@ -150,9 +148,8 @@ impl DataStore { IpKind::Ephemeral, creating_instance, ) - .await; - - match result { + .await + { Err(e) => { self.deallocate_external_ip(opctx, temp_ip.id).await?; Err(e) @@ -313,7 +310,7 @@ impl DataStore { } } // Floating IP: name conflict - DatabaseError(UniqueViolation, ..) => { + DatabaseError(UniqueViolation, ..) 
if name.is_some() => { TransactionError::CustomError(public_error_from_diesel( e, ErrorHandler::Conflict( @@ -486,8 +483,8 @@ impl DataStore { state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { if attached_count >= MAX_EXTERNAL_IPS_PLUS_SNAT as i64 { Error::invalid_request(&format!( - "an instance may not have more than {} external IP addresses", - MAX_EXTERNAL_IPS_PER_INSTANCE, + "an instance may not have more than \ + {MAX_EXTERNAL_IPS_PER_INSTANCE} external IP addresses", )) } else { Error::internal_error(&format!("failed to attach {kind} IP")) @@ -651,7 +648,7 @@ impl DataStore { /// /// To support idempotency, this method will succeed if the instance /// has no ephemeral IP or one is actively being removed. As a result, - /// information on an actual ExternalIp is best-effort. + /// information on an actual `ExternalIp` is best-effort. pub async fn begin_deallocate_ephemeral_ip( &self, opctx: &OpContext, @@ -906,7 +903,7 @@ impl DataStore { .and_then(|v| { v.ok_or_else(|| { Error::internal_error( - "floating IP should never return `None` from begin_attach", + "floating IP should never return `None` from begin_detach", ) }) }) @@ -944,20 +941,17 @@ impl DataStore { .filter(dsl::time_deleted.is_null()) .filter(dsl::state.eq(expected_state)); - // This leaves out SNat for now, double check where it fits in with - // instance destroy. 
let now = Utc::now(); let conn = self.pool_connection_authorized(opctx).await?; match (ip_kind, target_state) { - (IpKind::SNat, _) => { - return Err(Error::internal_error( - "shouldn't need to multistage for SNAT", - )) - } + (IpKind::SNat, _) => return Err(Error::internal_error( + "SNAT should not be removed via `external_ip_complete_op`, \ + use `deallocate_external_ip`", + )), (IpKind::Ephemeral, IpAttachState::Detached) => { part_out .set(( - // dsl::parent_id.eq(Option::::None), + dsl::parent_id.eq(Option::::None), dsl::time_modified.eq(now), dsl::time_deleted.eq(now), dsl::state.eq(target_state), diff --git a/nexus/db-queries/src/db/queries/external_ip.rs b/nexus/db-queries/src/db/queries/external_ip.rs index aee0d7a919..08bfcb2933 100644 --- a/nexus/db-queries/src/db/queries/external_ip.rs +++ b/nexus/db-queries/src/db/queries/external_ip.rs @@ -59,6 +59,9 @@ pub const SAFE_TRANSIENT_INSTANCE_STATES: [DbInstanceState; 5] = [ DbInstanceState(ApiInstanceState::Migrating), ]; +/// The maximum number of disks that can be attached to an instance. +pub const MAX_EXTERNAL_IPS_PER_INSTANCE: u32 = 32; + type FromClause = diesel::internal::table_macro::StaticQueryFragmentInstance; type IpPoolRangeFromClause = FromClause; diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 5af45985db..1905fc9c96 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -86,7 +86,9 @@ pub(crate) const MAX_NICS_PER_INSTANCE: usize = 8; // The value here is arbitrary, but we need *a* limit for the instance // create saga to have a bounded DAG. We might want to only enforce // this during instance create (rather than live attach) in future. 
-pub(crate) const MAX_EXTERNAL_IPS_PER_INSTANCE: usize = 32; +pub(crate) const MAX_EXTERNAL_IPS_PER_INSTANCE: usize = + nexus_db_queries::db::queries::external_ip::MAX_EXTERNAL_IPS_PER_INSTANCE + as usize; pub(crate) const MAX_EPHEMERAL_IPS_PER_INSTANCE: usize = 1; pub const MAX_VCPU_PER_INSTANCE: u16 = 64; From 945db759807b7dbb992fcfa7882e85275519b479 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 3 Jan 2024 17:34:10 +0000 Subject: [PATCH 35/56] final(?) cleanup. --- nexus/src/app/sagas/instance_ip_attach.rs | 7 +++---- nexus/src/app/sagas/instance_ip_detach.rs | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 3ba6a5c2c3..d688da9ba2 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -40,8 +40,7 @@ use uuid::Uuid; // is that the central undo actions (DPD/OPTE state) *must* be best-effort. // This is not bad per-se: instance stop does not itself remove NAT routing // rules. The only reason these should fail is because an instance has stopped, -// at which point there's no good in e.g. adding another entry to a non-existent -// sled-agent regardless. +// or DPD has died. declare_saga_actions! 
{ instance_ip_attach; @@ -407,7 +406,7 @@ pub(crate) mod test { .filter(dsl::parent_id.eq(instance_id)) .filter(dsl::state.ne(IpAttachState::Detached)) .select(ExternalIp::as_select()) - .first_async::(&*conn,) + .first_async::(&*conn) .await .optional() .unwrap() @@ -418,7 +417,7 @@ pub(crate) mod test { .filter(dsl::kind.eq(IpKind::Ephemeral)) .filter(dsl::time_deleted.is_null()) .select(ExternalIp::as_select()) - .first_async::(&*conn,) + .first_async::(&*conn) .await .optional() .unwrap() diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index b90d6836cb..7e5832b35f 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -400,7 +400,7 @@ pub(crate) mod test { .filter(dsl::parent_id.eq(instance_id)) .filter(dsl::state.ne(IpAttachState::Attached)) .select(ExternalIp::as_select()) - .first_async::(&*conn,) + .first_async::(&*conn) .await .optional() .unwrap() @@ -411,7 +411,7 @@ pub(crate) mod test { .filter(dsl::time_deleted.is_null()) .filter(dsl::state.eq(IpAttachState::Detached)) .select(ExternalIp::as_select()) - .first_async::(&*conn,) + .first_async::(&*conn) .await .optional() .unwrap() From ed48f0347e4929103e1c966c2858a6fb593a8670 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 5 Jan 2024 12:48:16 +0000 Subject: [PATCH 36/56] Review feedback: `NameOrId` for all create/attach/detach --- .../src/db/datastore/external_ip.rs | 8 +-- .../db-queries/src/db/queries/external_ip.rs | 24 ++++---- nexus/src/app/instance.rs | 2 +- nexus/src/app/sagas/instance_create.rs | 47 +++++++++++---- nexus/src/app/sagas/instance_delete.rs | 2 +- nexus/src/app/sagas/instance_ip_attach.rs | 60 ++++++++++++------- nexus/src/app/sagas/instance_ip_detach.rs | 46 +++++++------- nexus/src/external_api/http_entrypoints.rs | 2 +- nexus/tests/integration_tests/endpoints.rs | 10 ++-- nexus/tests/integration_tests/external_ips.rs | 48 +++++++-------- 
nexus/tests/integration_tests/instances.rs | 10 ++-- nexus/tests/integration_tests/ip_pools.rs | 4 +- nexus/types/src/external_api/params.rs | 11 ++-- openapi/nexus.json | 23 +++---- 14 files changed, 172 insertions(+), 125 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index e2e911978d..6b53a97765 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -88,7 +88,7 @@ impl DataStore { opctx: &OpContext, ip_id: Uuid, instance_id: Uuid, - pool_name: Option, + pool_id: Option, creating_instance: bool, ) -> CreateResult<(ExternalIp, bool)> { // This is slightly hacky: we need to create an unbound ephemeral IP, and @@ -97,10 +97,10 @@ impl DataStore { // - At most MAX external IPs per instance // Naturally, we now *need* to destroy the ephemeral IP if the newly alloc'd // IP was not attached, including on idempotent success. - let pool = match pool_name { - Some(name) => { + let pool = match pool_id { + Some(id) => { let (.., authz_pool, pool) = LookupPath::new(opctx, &self) - .ip_pool_name(&name) + .ip_pool_id(id) // any authenticated user can CreateChild on an IP pool. 
this is // meant to represent allocating an IP .fetch_for(authz::Action::CreateChild) diff --git a/nexus/db-queries/src/db/queries/external_ip.rs b/nexus/db-queries/src/db/queries/external_ip.rs index 08bfcb2933..3b0fc24181 100644 --- a/nexus/db-queries/src/db/queries/external_ip.rs +++ b/nexus/db-queries/src/db/queries/external_ip.rs @@ -920,7 +920,7 @@ mod tests { name: &str, range: IpRange, is_default: bool, - ) { + ) -> IpPool { let silo_id = self.opctx.authn.silo_required().unwrap().id(); let pool = IpPool::new( &IdentityMetadataCreateParams { @@ -938,13 +938,16 @@ mod tests { .unwrap(); use crate::db::schema::ip_pool::dsl as ip_pool_dsl; - diesel::insert_into(ip_pool_dsl::ip_pool) + let ip_pool = diesel::insert_into(ip_pool_dsl::ip_pool) .values(pool.clone()) - .execute_async(&*conn) + .returning(IpPool::as_returning()) + .get_result_async(&*conn) .await .expect("Failed to create IP Pool"); self.initialize_ip_pool(name, range).await; + + ip_pool } async fn initialize_ip_pool(&self, name: &str, range: IpRange) { @@ -1808,13 +1811,13 @@ mod tests { Ipv4Addr::new(10, 0, 0, 6), )) .unwrap(); - context.create_ip_pool("p1", second_range, /*default*/ false).await; + let p1 = + context.create_ip_pool("p1", second_range, /*default*/ false).await; // Allocating an address on an instance in the second pool should be // respected, even though there are IPs available in the first. 
let instance_id = context.create_instance("test").await; let id = Uuid::new_v4(); - let pool_name = Some(Name("p1".parse().unwrap())); let ip = context .db_datastore @@ -1822,7 +1825,7 @@ mod tests { &context.opctx, id, instance_id, - pool_name, + Some(p1.id()), true, ) .await @@ -1853,10 +1856,11 @@ mod tests { let last_address = Ipv4Addr::new(10, 0, 0, 6); let second_range = IpRange::try_from((first_address, last_address)).unwrap(); - context.create_ip_pool("p1", second_range, /* default */ false).await; + let p1 = context + .create_ip_pool("p1", second_range, /* default */ false) + .await; // Allocate all available addresses in the second pool. - let pool_name = Some(Name("p1".parse().unwrap())); let first_octet = first_address.octets()[3]; let last_octet = last_address.octets()[3]; for octet in first_octet..=last_octet { @@ -1868,7 +1872,7 @@ mod tests { &context.opctx, Uuid::new_v4(), instance_id, - pool_name.clone(), + Some(p1.id()), true, ) .await @@ -1890,7 +1894,7 @@ mod tests { &context.opctx, Uuid::new_v4(), instance_id, - pool_name, + Some(p1.id()), true, ) .await diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 7e6bc51229..107dfa3cfe 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1951,7 +1951,7 @@ impl super::Nexus { self: &Arc, opctx: &OpContext, instance_lookup: &lookup::Instance<'_>, - ext_ip: ¶ms::ExternalIpDelete, + ext_ip: ¶ms::ExternalIpDetach, ) -> UpdateResult> { let (.., authz_project, authz_instance) = instance_lookup.lookup_for(authz::Action::Modify).await?; diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 7028e620b1..f98cdc0355 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -21,7 +21,9 @@ use omicron_common::api::external::Error; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::InstanceState; use omicron_common::api::external::Name; +use 
omicron_common::api::external::NameOrId; use omicron_common::api::internal::shared::SwitchLocation; +use ref_cast::RefCast; use serde::Deserialize; use serde::Serialize; use slog::warn; @@ -623,16 +625,31 @@ async fn sic_allocate_instance_external_ip( // Runtime state should never be able to make 'complete_op' fallible. let ip = match ip_params { // Allocate a new IP address from the target, possibly default, pool - params::ExternalIpCreate::Ephemeral { ref pool_name } => { - let pool_name = - pool_name.as_ref().map(|name| db::model::Name(name.clone())); + params::ExternalIpCreate::Ephemeral { pool } => { + let pool_id = if let Some(name_or_id) = pool { + let (.., authz_pool) = match name_or_id { + NameOrId::Name(name) => LookupPath::new(&opctx, datastore) + .ip_pool_name(db::model::Name::ref_cast(name)), + NameOrId::Id(id) => { + LookupPath::new(&opctx, datastore).ip_pool_id(*id) + } + } + .lookup_for(authz::Action::CreateChild) + .await + .map_err(ActionError::action_failed)?; + + Some(authz_pool.id()) + } else { + None + }; + let ip_id = repeat_saga_params.new_id; datastore .allocate_instance_ephemeral_ip( &opctx, ip_id, instance_id, - pool_name, + pool_id, true, ) .await @@ -640,14 +657,18 @@ async fn sic_allocate_instance_external_ip( .0 } // Set the parent of an existing floating IP to the new instance's ID. 
- params::ExternalIpCreate::Floating { ref floating_ip_name } => { - let floating_ip_name = db::model::Name(floating_ip_name.clone()); - let (.., authz_fip) = LookupPath::new(&opctx, &datastore) - .project_id(saga_params.project_id) - .floating_ip_name(&floating_ip_name) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; + params::ExternalIpCreate::Floating { floating_ip } => { + let (.., authz_fip) = match floating_ip { + NameOrId::Name(name) => LookupPath::new(&opctx, datastore) + .project_id(saga_params.project_id) + .floating_ip_name(db::model::Name::ref_cast(name)), + NameOrId::Id(id) => { + LookupPath::new(&opctx, datastore).floating_ip_id(*id) + } + } + .lookup_for(authz::Action::Modify) + .await + .map_err(ActionError::action_failed)?; datastore .floating_ip_begin_attach(&opctx, &authz_fip, instance_id, true) @@ -1011,7 +1032,7 @@ pub mod test { network_interfaces: params::InstanceNetworkInterfaceAttachment::Default, external_ips: vec![params::ExternalIpCreate::Ephemeral { - pool_name: None, + pool: None, }], disks: vec![params::InstanceDiskAttachment::Attach( params::InstanceDiskAttach { diff --git a/nexus/src/app/sagas/instance_delete.rs b/nexus/src/app/sagas/instance_delete.rs index 8111ee6bd7..e0ad62e654 100644 --- a/nexus/src/app/sagas/instance_delete.rs +++ b/nexus/src/app/sagas/instance_delete.rs @@ -240,7 +240,7 @@ mod test { network_interfaces: params::InstanceNetworkInterfaceAttachment::Default, external_ips: vec![params::ExternalIpCreate::Ephemeral { - pool_name: None, + pool: None, }], disks: vec![params::InstanceDiskAttachment::Attach( params::InstanceDiskAttach { name: DISK_NAME.parse().unwrap() }, diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index d688da9ba2..06854fe520 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -14,7 +14,8 @@ use crate::external_api::params; use 
nexus_db_model::IpAttachState; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; -use omicron_common::api::external::Error; +use omicron_common::api::external::{Error, NameOrId}; +use ref_cast::RefCast; use serde::Deserialize; use serde::Serialize; use steno::ActionError; @@ -89,17 +90,32 @@ async fn siia_begin_attach_ip( &params.serialized_authn, ); - match params.create_params { + match &params.create_params { // Allocate a new IP address from the target, possibly default, pool - params::ExternalIpCreate::Ephemeral { ref pool_name } => { - let pool_name = - pool_name.as_ref().map(|name| db::model::Name(name.clone())); + params::ExternalIpCreate::Ephemeral { pool } => { + let pool_id = if let Some(name_or_id) = pool { + let (.., authz_pool) = match name_or_id { + NameOrId::Name(name) => LookupPath::new(&opctx, datastore) + .ip_pool_name(db::model::Name::ref_cast(name)), + NameOrId::Id(id) => { + LookupPath::new(&opctx, datastore).ip_pool_id(*id) + } + } + .lookup_for(authz::Action::CreateChild) + .await + .map_err(ActionError::action_failed)?; + + Some(authz_pool.id()) + } else { + None + }; + datastore .allocate_instance_ephemeral_ip( &opctx, Uuid::new_v4(), params.authz_instance.id(), - pool_name, + pool_id, false, ) .await @@ -110,14 +126,18 @@ async fn siia_begin_attach_ip( }) } // Set the parent of an existing floating IP to the new instance's ID.
- params::ExternalIpCreate::Floating { ref floating_ip_name } => { - let floating_ip_name = db::model::Name(floating_ip_name.clone()); - let (.., authz_fip) = LookupPath::new(&opctx, &datastore) - .project_id(params.project_id) - .floating_ip_name(&floating_ip_name) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; + params::ExternalIpCreate::Floating { floating_ip } => { + let (.., authz_fip) = match floating_ip { + NameOrId::Name(name) => LookupPath::new(&opctx, datastore) + .project_id(params.project_id) + .floating_ip_name(db::model::Name::ref_cast(name)), + NameOrId::Id(id) => { + LookupPath::new(&opctx, datastore).floating_ip_id(*id) + } + } + .lookup_for(authz::Action::Modify) + .await + .map_err(ActionError::action_failed)?; datastore .floating_ip_begin_attach( @@ -281,13 +301,13 @@ pub(crate) mod test { ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; use dropshot::test_util::ClientTestContext; - use nexus_db_model::{ExternalIp, IpKind, Name}; + use nexus_db_model::{ExternalIp, IpKind}; use nexus_db_queries::context::OpContext; use nexus_test_utils::resource_helpers::{ create_floating_ip, create_instance, create_project, populate_ip_pool, }; use nexus_test_utils_macros::nexus_test; - use omicron_common::api::external::SimpleIdentity; + use omicron_common::api::external::{Name, SimpleIdentity}; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -318,16 +338,16 @@ pub(crate) mod test { ) -> Params { let create_params = if use_floating { params::ExternalIpCreate::Floating { - floating_ip_name: FIP_NAME.parse().unwrap(), + floating_ip: FIP_NAME.parse::().unwrap().into(), } } else { - params::ExternalIpCreate::Ephemeral { pool_name: None } + params::ExternalIpCreate::Ephemeral { pool: None } }; let (.., authz_project, authz_instance) = LookupPath::new(opctx, datastore) - .project_name(&Name(PROJECT_NAME.parse().unwrap())) - .instance_name(&Name(INSTANCE_NAME.parse().unwrap())) + 
.project_name(&db::model::Name(PROJECT_NAME.parse().unwrap())) + .instance_name(&db::model::Name(INSTANCE_NAME.parse().unwrap())) .lookup_for(authz::Action::Modify) .await .unwrap(); diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 7e5832b35f..cff14d6ba8 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -14,6 +14,8 @@ use crate::external_api::params; use nexus_db_model::IpAttachState; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; +use omicron_common::api::external::NameOrId; +use ref_cast::RefCast; use serde::Deserialize; use serde::Serialize; use steno::ActionError; @@ -50,7 +52,7 @@ declare_saga_actions! { #[derive(Debug, Deserialize, Serialize)] pub struct Params { - pub delete_params: params::ExternalIpDelete, + pub delete_params: params::ExternalIpDetach, pub authz_instance: authz::Instance, pub project_id: Uuid, /// Authentication context to use to fetch the instance's current state from @@ -69,8 +71,8 @@ async fn siid_begin_detach_ip( &params.serialized_authn, ); - match params.delete_params { - params::ExternalIpDelete::Ephemeral => { + match &params.delete_params { + params::ExternalIpDetach::Ephemeral => { let eip = datastore .instance_lookup_ephemeral_ip( &opctx, @@ -99,14 +101,18 @@ async fn siid_begin_detach_ip( }) } } - params::ExternalIpDelete::Floating { ref floating_ip_name } => { - let floating_ip_name = db::model::Name(floating_ip_name.clone()); - let (.., authz_fip) = LookupPath::new(&opctx, &datastore) - .project_id(params.project_id) - .floating_ip_name(&floating_ip_name) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; + params::ExternalIpDetach::Floating { floating_ip } => { + let (.., authz_fip) = match floating_ip { + NameOrId::Name(name) => LookupPath::new(&opctx, datastore) + .project_id(params.project_id) + .floating_ip_name(db::model::Name::ref_cast(name)), +
NameOrId::Id(id) => { + LookupPath::new(&opctx, datastore).floating_ip_id(*id) + } + } + .lookup_for(authz::Action::Modify) + .await + .map_err(ActionError::action_failed)?; datastore .floating_ip_begin_detach( @@ -273,11 +279,11 @@ pub(crate) mod test { use diesel::{ ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; - use nexus_db_model::{ExternalIp, IpKind, Name}; + use nexus_db_model::{ExternalIp, IpKind}; use nexus_db_queries::context::OpContext; use nexus_test_utils::resource_helpers::create_instance; use nexus_test_utils_macros::nexus_test; - use omicron_common::api::external::SimpleIdentity; + use omicron_common::api::external::{Name, SimpleIdentity}; use std::sync::Arc; type ControlPlaneTestContext = @@ -293,17 +299,17 @@ pub(crate) mod test { use_floating: bool, ) -> Params { let delete_params = if use_floating { - params::ExternalIpDelete::Floating { - floating_ip_name: FIP_NAME.parse().unwrap(), + params::ExternalIpDetach::Floating { + floating_ip: FIP_NAME.parse::().unwrap().into(), } } else { - params::ExternalIpDelete::Ephemeral + params::ExternalIpDetach::Ephemeral }; let (.., authz_project, authz_instance) = LookupPath::new(opctx, datastore) - .project_name(&Name(PROJECT_NAME.parse().unwrap())) - .instance_name(&Name(INSTANCE_NAME.parse().unwrap())) + .project_name(&db::model::Name(PROJECT_NAME.parse().unwrap())) + .instance_name(&db::model::Name(INSTANCE_NAME.parse().unwrap())) .lookup_for(authz::Action::Modify) .await .unwrap(); @@ -319,8 +325,8 @@ pub(crate) mod test { async fn attach_instance_ips(nexus: &Arc, opctx: &OpContext) { let datastore = &nexus.db_datastore; - let proj_name = Name(PROJECT_NAME.parse().unwrap()); - let inst_name = Name(INSTANCE_NAME.parse().unwrap()); + let proj_name = db::model::Name(PROJECT_NAME.parse().unwrap()); + let inst_name = db::model::Name(INSTANCE_NAME.parse().unwrap()); let lookup = LookupPath::new(opctx, datastore) .project_name(&proj_name) .instance_name(&inst_name); diff --git 
a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 120a2a14c5..6ace340008 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -3775,7 +3775,7 @@ async fn instance_external_ip_detach( rqctx: RequestContext>, path_params: Path, query_params: Query, - ip_to_detach: TypedBody, + ip_to_detach: TypedBody, ) -> Result>, HttpError> { let apictx = rqctx.context(); let handler = async { diff --git a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index 1bf68c8398..554faa01d2 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -426,7 +426,7 @@ pub static DEMO_INSTANCE_CREATE: Lazy = user_data: vec![], network_interfaces: params::InstanceNetworkInterfaceAttachment::Default, external_ips: vec![params::ExternalIpCreate::Ephemeral { - pool_name: Some(DEMO_IP_POOL_NAME.clone()), + pool: Some(DEMO_IP_POOL_NAME.clone().into()), }], disks: vec![], start: true, @@ -733,11 +733,11 @@ pub static DEMO_FLOAT_IP_CREATE: Lazy = pub static DEMO_FLOAT_IP_ATTACH: Lazy = Lazy::new(|| params::ExternalIpCreate::Floating { - floating_ip_name: DEMO_FLOAT_IP_NAME.clone(), + floating_ip: DEMO_FLOAT_IP_NAME.clone().into(), }); -pub static DEMO_FLOAT_IP_DETACH: Lazy = - Lazy::new(|| params::ExternalIpDelete::Floating { - floating_ip_name: DEMO_FLOAT_IP_NAME.clone(), +pub static DEMO_FLOAT_IP_DETACH: Lazy = + Lazy::new(|| params::ExternalIpDetach::Floating { + floating_ip: DEMO_FLOAT_IP_NAME.clone().into(), }); // Identity providers diff --git a/nexus/tests/integration_tests/external_ips.rs b/nexus/tests/integration_tests/external_ips.rs index 9b98b2efab..2e3bb86bb9 100644 --- a/nexus/tests/integration_tests/external_ips.rs +++ b/nexus/tests/integration_tests/external_ips.rs @@ -552,14 +552,14 @@ async fn test_external_ip_live_attach_detach( let eph_resp = external_ip_attach( client, instance_name, - 
&params::ExternalIpCreate::Ephemeral { pool_name: None }, + &params::ExternalIpCreate::Ephemeral { pool: None }, ) .await; let fip_resp = external_ip_attach( client, instance_name, &params::ExternalIpCreate::Floating { - floating_ip_name: fip.identity.name.clone(), + floating_ip: fip.identity.name.clone().into(), }, ) .await; @@ -580,14 +580,14 @@ async fn test_external_ip_live_attach_detach( let eph_resp_2 = external_ip_attach( client, instance_name, - &params::ExternalIpCreate::Ephemeral { pool_name: None }, + &params::ExternalIpCreate::Ephemeral { pool: None }, ) .await; let fip_resp_2 = external_ip_attach( client, instance_name, &params::ExternalIpCreate::Floating { - floating_ip_name: fip.identity.name.clone(), + floating_ip: fip.identity.name.clone().into(), }, ) .await; @@ -606,15 +606,15 @@ async fn test_external_ip_live_attach_detach( let eph_resp = external_ip_detach( client, instance_name, - &params::ExternalIpDelete::Ephemeral, + &params::ExternalIpDetach::Ephemeral, ) .await .unwrap(); let fip_resp = external_ip_detach( client, instance_name, - &params::ExternalIpDelete::Floating { - floating_ip_name: fip.identity.name.clone(), + &params::ExternalIpDetach::Floating { + floating_ip: fip.identity.name.clone().into(), }, ) .await @@ -634,14 +634,14 @@ async fn test_external_ip_live_attach_detach( let eph_resp_2 = external_ip_detach( client, instance_name, - &params::ExternalIpDelete::Ephemeral, + &params::ExternalIpDetach::Ephemeral, ) .await; let fip_resp_2 = external_ip_detach( client, instance_name, - &params::ExternalIpDelete::Floating { - floating_ip_name: fip.identity.name.clone(), + &params::ExternalIpDetach::Floating { + floating_ip: fip.identity.name.clone().into(), }, ) .await; @@ -695,7 +695,7 @@ async fn test_external_ip_attach_detach_fail_if_in_use_by_other( let error: HttpErrorResponseBody = NexusRequest::new( RequestBuilder::new(client, Method::POST, &url) .body(Some(&params::ExternalIpCreate::Floating { - floating_ip_name: fips[1].identity.name.clone(), + floating_ip:
fips[1].identity.name.clone().into(), })) .expect_status(Some(StatusCode::BAD_REQUEST)), ) @@ -711,8 +711,8 @@ async fn test_external_ip_attach_detach_fail_if_in_use_by_other( let url = detach_instance_external_ip_url(INSTANCE_NAMES[0], PROJECT_NAME); let error: HttpErrorResponseBody = NexusRequest::new( RequestBuilder::new(client, Method::POST, &url) - .body(Some(&params::ExternalIpDelete::Floating { - floating_ip_name: fips[1].identity.name.clone(), + .body(Some(&params::ExternalIpDetach::Floating { + floating_ip: fips[1].identity.name.clone().into(), })) .expect_status(Some(StatusCode::BAD_REQUEST)), ) @@ -769,11 +769,7 @@ async fn test_external_ip_attach_fails_after_maximum( let error: HttpErrorResponseBody = NexusRequest::new( RequestBuilder::new(client, Method::POST, &url) .body(Some(&params::ExternalIpCreate::Floating { - floating_ip_name: fip_name_slice - .last() - .unwrap() - .parse() - .unwrap(), + floating_ip: fip_name_slice[32].parse::<Name>().unwrap().into(), })) .expect_status(Some(StatusCode::BAD_REQUEST)), ) @@ -792,9 +788,7 @@ async fn test_external_ip_attach_fails_after_maximum( // Attempt to attach an ephemeral IP should fail.
let error: HttpErrorResponseBody = NexusRequest::new( RequestBuilder::new(client, Method::POST, &url) - .body(Some(&params::ExternalIpCreate::Ephemeral { - pool_name: None, - })) + .body(Some(&params::ExternalIpCreate::Ephemeral { pool: None })) .expect_status(Some(StatusCode::BAD_REQUEST)), ) .authn_as(AuthnMode::PrivilegedUser) @@ -839,7 +833,7 @@ async fn test_external_ip_attach_ephemeral_at_pool_exhaustion( client, INSTANCE_NAMES[0], &params::ExternalIpCreate::Ephemeral { - pool_name: Some(pool_name.clone()), + pool: Some(pool_name.clone().into()), }, ) .await; @@ -850,7 +844,7 @@ async fn test_external_ip_attach_ephemeral_at_pool_exhaustion( let error: HttpErrorResponseBody = NexusRequest::new( RequestBuilder::new(client, Method::POST, &url) .body(Some(&params::ExternalIpCreate::Ephemeral { - pool_name: Some(pool_name.clone()), + pool: Some(pool_name.clone().into()), })) .expect_status(Some(StatusCode::INSUFFICIENT_STORAGE)), ) @@ -871,7 +865,7 @@ async fn test_external_ip_attach_ephemeral_at_pool_exhaustion( client, INSTANCE_NAMES[0], &params::ExternalIpCreate::Ephemeral { - pool_name: Some(pool_name.clone()), + pool: Some(pool_name.clone().into()), }, ) .await; @@ -913,11 +907,11 @@ async fn instance_for_external_ips( let mut fips: Vec<_> = floating_ip_names .iter() .map(|s| params::ExternalIpCreate::Floating { - floating_ip_name: s.parse().unwrap(), + floating_ip: s.parse::<Name>().unwrap().into(), }) .collect(); if use_ephemeral_ip { - fips.push(params::ExternalIpCreate::Ephemeral { pool_name: None }) + fips.push(params::ExternalIpCreate::Ephemeral { pool: None }) } create_instance_with( &client, @@ -953,7 +947,7 @@ async fn external_ip_attach( async fn external_ip_detach( client: &ClientTestContext, instance_name: &str, - eip: &params::ExternalIpDelete, + eip: &params::ExternalIpDetach, ) -> Option<views::ExternalIp> { let url = detach_instance_external_ip_url(instance_name, PROJECT_NAME); NexusRequest::new( diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs
index fd91efb487..48ec7bc222 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -3734,7 +3734,7 @@ async fn test_instance_attach_several_external_ips( // Create several floating IPs for the instance, totalling 8 IPs. let mut external_ip_create = - vec![params::ExternalIpCreate::Ephemeral { pool_name: None }]; + vec![params::ExternalIpCreate::Ephemeral { pool: None }]; let mut fips = vec![]; for i in 1..8 { let name = format!("fip-{i}"); @@ -3742,7 +3742,7 @@ async fn test_instance_attach_several_external_ips( create_floating_ip(&client, &name, PROJECT_NAME, None, None).await, ); external_ip_create.push(params::ExternalIpCreate::Floating { - floating_ip_name: name.parse().unwrap(), + floating_ip: name.parse::<Name>().unwrap().into(), }); } @@ -3812,7 +3812,7 @@ async fn test_instance_allow_only_one_ephemeral_ip( populate_ip_pool(&client, "default", Some(default_pool_range)).await; let ephemeral_create = params::ExternalIpCreate::Ephemeral { - pool_name: Some("default".parse().unwrap()), + pool: Some("default".parse::<Name>().unwrap().into()), }; let error: HttpErrorResponseBody = NexusRequest::new( RequestBuilder::new(client, Method::POST, &get_instances_url()) @@ -3860,7 +3860,7 @@ async fn create_instance_with_pool( &params::InstanceNetworkInterfaceAttachment::Default, vec![], vec![params::ExternalIpCreate::Ephemeral { - pool_name: pool_name.map(|name| name.parse().unwrap()), + pool: pool_name.map(|name| name.parse::<Name>().unwrap().into()), }], true, ) @@ -3955,7 +3955,7 @@ async fn test_instance_create_in_silo(cptestctx: &ControlPlaneTestContext) { user_data: vec![], network_interfaces: params::InstanceNetworkInterfaceAttachment::Default, external_ips: vec![params::ExternalIpCreate::Ephemeral { - pool_name: Some(Name::try_from(String::from("default")).unwrap()), + pool: Some("default".parse::<Name>().unwrap().into()), }], disks: vec![], start: true, diff --git a/nexus/tests/integration_tests/ip_pools.rs
b/nexus/tests/integration_tests/ip_pools.rs index 02aa18bc72..767db942f9 100644 --- a/nexus/tests/integration_tests/ip_pools.rs +++ b/nexus/tests/integration_tests/ip_pools.rs @@ -797,14 +797,14 @@ async fn test_ip_pool_list_usable_by_project( // should be able to access for pool_name in pool_names { let instance_name = format!("{}-{}", INSTANCE_NAME, pool_name); - let pool_name = Some(Name::try_from(pool_name).unwrap()); + let pool = Some(Name::try_from(pool_name).unwrap().into()); create_instance_with( client, PROJECT_NAME, &instance_name, &InstanceNetworkInterfaceAttachment::Default, Vec::::new(), - vec![ExternalIpCreate::Ephemeral { pool_name }], + vec![ExternalIpCreate::Ephemeral { pool }], true, ) .await; diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index ed5b90baf4..53f2bb9906 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -939,19 +939,20 @@ pub enum ExternalIpCreate { /// An IP address providing both inbound and outbound access. The address is /// automatically-assigned from the provided IP Pool, or all available pools /// if not specified. - Ephemeral { pool_name: Option }, + Ephemeral { pool: Option }, /// An IP address providing both inbound and outbound access. The address is - /// an existing Floating IP object assigned to the current project. + /// an existing floating IP object assigned to the current project. /// /// The floating IP must not be in use by another instance or service. - Floating { floating_ip_name: Name }, + Floating { floating_ip: NameOrId }, } +/// Parameters for detaching an external IP from an instance. 
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] #[serde(tag = "type", rename_all = "snake_case")] -pub enum ExternalIpDelete { +pub enum ExternalIpDetach { Ephemeral, - Floating { floating_ip_name: Name }, + Floating { floating_ip: NameOrId }, } /// Create-time parameters for an `Instance` diff --git a/openapi/nexus.json b/openapi/nexus.json index 1bd78eb6ab..2b9cfbc91a 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -1912,7 +1912,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ExternalIpDelete" + "$ref": "#/components/schemas/ExternalIpDetach" } } }, @@ -10902,11 +10902,11 @@ "description": "An IP address providing both inbound and outbound access. The address is automatically-assigned from the provided IP Pool, or all available pools if not specified.", "type": "object", "properties": { - "pool_name": { + "pool": { "nullable": true, "allOf": [ { - "$ref": "#/components/schemas/Name" + "$ref": "#/components/schemas/NameOrId" } ] }, @@ -10922,11 +10922,11 @@ ] }, { - "description": "An IP address providing both inbound and outbound access. The address is an existing Floating IP object assigned to the current project.\n\nThe floating IP must not be in use by another instance or service.", + "description": "An IP address providing both inbound and outbound access. 
The address is an existing floating IP object assigned to the current project.\n\nThe floating IP must not be in use by another instance or service.", "type": "object", "properties": { - "floating_ip_name": { - "$ref": "#/components/schemas/Name" + "floating_ip": { + "$ref": "#/components/schemas/NameOrId" }, "type": { "type": "string", @@ -10936,13 +10936,14 @@ } }, "required": [ - "floating_ip_name", + "floating_ip", "type" ] } ] }, - "ExternalIpDelete": { + "ExternalIpDetach": { + "description": "Parameters for detaching an external IP from an instance.", "oneOf": [ { "type": "object", @@ -10961,8 +10962,8 @@ { "type": "object", "properties": { - "floating_ip_name": { - "$ref": "#/components/schemas/Name" + "floating_ip": { + "$ref": "#/components/schemas/NameOrId" }, "type": { "type": "string", @@ -10972,7 +10973,7 @@ } }, "required": [ - "floating_ip_name", + "floating_ip", "type" ] } From 9680c95187968dc610f541b72a412c3a01c9d4b0 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 5 Jan 2024 12:57:34 +0000 Subject: [PATCH 37/56] Self review: missed comment --- sled-agent/src/sled_agent.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 2373ae0270..493e297f3b 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -980,6 +980,11 @@ impl SledAgent { .map_err(|e| Error::Instance(e)) } + /// Idempotently ensures that an instance's OPTE/port state includes the + /// specified external IP address. + /// + /// This method will return an error when trying to register an ephemeral IP which + /// does not match the current ephemeral IP. pub async fn instance_put_external_ip( &self, instance_id: Uuid, @@ -992,6 +997,8 @@ impl SledAgent { .map_err(|e| Error::Instance(e)) } + /// Idempotently ensures that an instance's OPTE/port state does not include the + /// specified external IP address in either its ephemeral or floating IP set. 
pub async fn instance_delete_external_ip( &self, instance_id: Uuid, From 59f665c32ef4b465c37c95635eadad0a65cde710 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 5 Jan 2024 13:10:49 +0000 Subject: [PATCH 38/56] Review feedback: Explicit fail on double detach Ephemeral Note that this is a 400 Bad Request rather than a 404, as we lack a ResourceType and an ID/Name to work with. --- nexus/src/app/instance.rs | 11 ++++++- nexus/src/external_api/http_entrypoints.rs | 2 +- nexus/tests/integration_tests/external_ips.rs | 29 ++++++++++++------- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 107dfa3cfe..2f762d80c8 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1952,7 +1952,7 @@ impl super::Nexus { opctx: &OpContext, instance_lookup: &lookup::Instance<'_>, ext_ip: ¶ms::ExternalIpDetach, - ) -> UpdateResult> { + ) -> UpdateResult { let (.., authz_project, authz_instance) = instance_lookup.lookup_for(authz::Action::Modify).await?; @@ -1973,6 +1973,15 @@ impl super::Nexus { .lookup_node_output::>("output") .map_err(|e| Error::internal_error(&format!("{:#}", &e))) .internal_context("looking up output from ip detach saga") + .and_then(|eip| { + // Saga idempotency means we'll get Ok(None) on double detach + // of an ephemeral IP. Convert this case to an error here. 
+ eip.ok_or_else(|| { + Error::invalid_request( + "instance does not have an ephemeral IP attached", + ) + }) + }) } } diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 6ace340008..20240e9e7e 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -3776,7 +3776,7 @@ async fn instance_external_ip_detach( path_params: Path, query_params: Query, ip_to_detach: TypedBody, -) -> Result>, HttpError> { +) -> Result, HttpError> { let apictx = rqctx.context(); let handler = async { let opctx = crate::context::op_context_for_external_api(&rqctx).await?; diff --git a/nexus/tests/integration_tests/external_ips.rs b/nexus/tests/integration_tests/external_ips.rs index 2e3bb86bb9..28a0375073 100644 --- a/nexus/tests/integration_tests/external_ips.rs +++ b/nexus/tests/integration_tests/external_ips.rs @@ -629,14 +629,8 @@ async fn test_external_ip_live_attach_detach( assert_eq!(fip.ip, fip_resp.ip); assert_eq!(eph_ip, &eph_resp); - // Check for idempotency: repeat requests should return same values - // for FIP, but in ephemeral case there is no currently known IP so we get None. - let eph_resp_2 = external_ip_detach( - client, - instance_name, - ¶ms::ExternalIpDetach::Ephemeral, - ) - .await; + // Check for idempotency: repeat requests should return same values for FIP, + // but in ephemeral case there is no currently known IP so we return an error. 
let fip_resp_2 = external_ip_detach( client, instance_name, @@ -645,9 +639,24 @@ async fn test_external_ip_live_attach_detach( }, ) .await; - - assert!(eph_resp_2.is_none()); assert_eq!(Some(fip_resp), fip_resp_2); + + let url = detach_instance_external_ip_url(instance_name, PROJECT_NAME); + let error: HttpErrorResponseBody = NexusRequest::new( + RequestBuilder::new(client, Method::POST, &url) + .body(Some(&params::ExternalIpDetach::Ephemeral)) + .expect_status(Some(StatusCode::BAD_REQUEST)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap() + .parsed_body() + .unwrap(); + assert_eq!( + error.message, + "instance does not have an ephemeral IP attached".to_string() + ); } } From 1c0be9d32a0538aa48210326cbd32f7add7b0ea6 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 5 Jan 2024 17:29:43 +0000 Subject: [PATCH 39/56] Review feedback: enhance `views::ExternalIp` Converts `views::ExternalIp` into a flattened enum, allowing us to return simplified IP-only entries for ephemeral addresses and full `FloatingIp` objects where possible from attach/detach/list endpoints for external IPs. This change is compatible with the current console and older invocations which parsed only "ip" and "kind" from the JSON body, which is intentional.
--- nexus/db-model/src/external_ip.rs | 11 +- nexus/db-queries/src/db/pool_connection.rs | 1 + nexus/tests/integration_tests/external_ips.rs | 27 ++--- nexus/tests/integration_tests/instances.rs | 26 ++--- nexus/types/src/external_api/views.rs | 32 ++++-- openapi/nexus.json | 102 ++++++++++++++---- 6 files changed, 146 insertions(+), 53 deletions(-) diff --git a/nexus/db-model/src/external_ip.rs b/nexus/db-model/src/external_ip.rs index 715df30f03..822f9c6744 100644 --- a/nexus/db-model/src/external_ip.rs +++ b/nexus/db-model/src/external_ip.rs @@ -441,8 +441,15 @@ impl TryFrom for views::ExternalIp { "Service IPs should not be exposed in the API", )); } - let kind = ip.kind.try_into()?; - Ok(views::ExternalIp { kind, ip: ip.ip.ip() }) + match ip.kind { + IpKind::Floating => Ok(views::ExternalIp::Floating(ip.try_into()?)), + IpKind::Ephemeral => { + Ok(views::ExternalIp::Ephemeral { ip: ip.ip.ip() }) + } + IpKind::SNat => Err(Error::internal_error( + "SNAT IP addresses should not be exposed in the API", + )), + } } } diff --git a/nexus/db-queries/src/db/pool_connection.rs b/nexus/db-queries/src/db/pool_connection.rs index 6fb951de84..0159982b90 100644 --- a/nexus/db-queries/src/db/pool_connection.rs +++ b/nexus/db-queries/src/db/pool_connection.rs @@ -47,6 +47,7 @@ static CUSTOM_TYPE_KEYS: &'static [&'static str] = &[ "hw_rot_slot", "identity_type", "instance_state", + "ip_attach_state", "ip_kind", "network_interface_kind", "physical_disk_kind", diff --git a/nexus/tests/integration_tests/external_ips.rs b/nexus/tests/integration_tests/external_ips.rs index 28a0375073..86aaba55c8 100644 --- a/nexus/tests/integration_tests/external_ips.rs +++ b/nexus/tests/integration_tests/external_ips.rs @@ -573,8 +573,11 @@ async fn test_external_ip_live_attach_detach( assert_eq!(eip_list.len(), 2); assert!(eip_list.contains(&eph_resp)); - assert!(eip_list.contains(&fip_resp)); - assert_eq!(fip.ip, fip_resp.ip); + assert!(eip_list + .iter() + .any(|v| matches!(v, 
views::ExternalIp::Floating(..)) + && v.ip() == fip_resp.ip())); + assert_eq!(fip.ip, fip_resp.ip()); // Check for idempotency: repeat requests should return same values. let eph_resp_2 = external_ip_attach( @@ -593,7 +596,8 @@ async fn test_external_ip_live_attach_detach( .await; assert_eq!(eph_resp, eph_resp_2); - assert_eq!(fip_resp, fip_resp_2); + assert_eq!(fip_resp.ip(), fip_resp_2.ip()); + assert_eq!(fip_resp.kind(), fip_resp_2.kind()); recorded_ephs.push(eph_resp); } @@ -608,8 +612,7 @@ async fn test_external_ip_live_attach_detach( instance_name, &params::ExternalIpDetach::Ephemeral, ) - .await - .unwrap(); + .await; let fip_resp = external_ip_detach( client, instance_name, @@ -617,8 +620,7 @@ async fn test_external_ip_live_attach_detach( floating_ip: fip.identity.name.clone().into(), }, ) - .await - .unwrap(); + .await; // Verify both are removed, and that their bodies match the known FIP/EIP combo. let eip_list = @@ -626,7 +628,7 @@ async fn test_external_ip_live_attach_detach( .await; assert_eq!(eip_list.len(), 0); - assert_eq!(fip.ip, fip_resp.ip); + assert_eq!(fip.ip, fip_resp.ip()); assert_eq!(eph_ip, &eph_resp); // Check for idempotency: repeat requests should return same values for FIP, @@ -639,7 +641,8 @@ async fn test_external_ip_live_attach_detach( }, ) .await; - assert_eq!(Some(fip_resp), fip_resp_2); + assert_eq!(fip_resp.ip(), fip_resp_2.ip()); + assert_eq!(fip_resp.kind(), fip_resp_2.kind()); let url = detach_instance_external_ip_url(instance_name, PROJECT_NAME); let error: HttpErrorResponseBody = NexusRequest::new( @@ -846,8 +849,8 @@ async fn test_external_ip_attach_ephemeral_at_pool_exhaustion( }, ) .await; - assert_eq!(eph_resp.ip, other_pool_range.first_address()); - assert_eq!(eph_resp.ip, other_pool_range.last_address()); + assert_eq!(eph_resp.ip(), other_pool_range.first_address()); + assert_eq!(eph_resp.ip(), other_pool_range.last_address()); let url = attach_instance_external_ip_url(INSTANCE_NAMES[1], PROJECT_NAME); let error:
HttpErrorResponseBody = NexusRequest::new( @@ -957,7 +960,7 @@ async fn external_ip_detach( client: &ClientTestContext, instance_name: &str, eip: &params::ExternalIpDetach, -) -> Option<views::ExternalIp> { +) -> views::ExternalIp { let url = detach_instance_external_ip_url(instance_name, PROJECT_NAME); NexusRequest::new( RequestBuilder::new(client, Method::POST, &url) diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 48ec7bc222..1db88300cc 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -3653,8 +3653,8 @@ async fn test_instance_ephemeral_ip_from_correct_pool( let ip = fetch_instance_ephemeral_ip(client, "default-pool-inst").await; assert!( - ip.ip >= default_pool_range.first_address() - && ip.ip <= default_pool_range.last_address(), + ip.ip() >= default_pool_range.first_address() + && ip.ip() <= default_pool_range.last_address(), "Expected ephemeral IP to come from default pool" ); @@ -3663,8 +3663,8 @@ async fn test_instance_ephemeral_ip_from_correct_pool( .await; let ip = fetch_instance_ephemeral_ip(client, "other-pool-inst").await; assert!( - ip.ip >= other_pool_range.first_address() - && ip.ip <= other_pool_range.last_address(), + ip.ip() >= other_pool_range.first_address() + && ip.ip() <= other_pool_range.last_address(), "Expected ephemeral IP to come from other pool" ); @@ -3697,8 +3697,8 @@ async fn test_instance_ephemeral_ip_from_correct_pool( .await; let ip = fetch_instance_ephemeral_ip(client, "silo-pool-inst").await; assert!( - ip.ip >= silo_pool_range.first_address() - && ip.ip <= silo_pool_range.last_address(), + ip.ip() >= silo_pool_range.first_address() + && ip.ip() <= silo_pool_range.last_address(), "Expected ephemeral IP to come from the silo default pool" ); @@ -3708,8 +3708,8 @@ async fn test_instance_ephemeral_ip_from_correct_pool( let ip = fetch_instance_ephemeral_ip(client, "other-pool-inst-2").await; assert!( - ip.ip >=
other_pool_range.first_address() - && ip.ip <= other_pool_range.last_address(), + ip.ip() >= other_pool_range.first_address() + && ip.ip() <= other_pool_range.last_address(), "Expected ephemeral IP to come from the other pool" ); } @@ -3767,19 +3767,19 @@ async fn test_instance_attach_several_external_ips( eprintln!("{external_ips:?}"); for (i, eip) in external_ips .iter() - .sorted_unstable_by(|a, b| a.ip.cmp(&b.ip)) + .sorted_unstable_by(|a, b| a.ip().cmp(&b.ip())) .enumerate() { let last_octet = i + if i != external_ips.len() - 1 { - assert_eq!(eip.kind, IpKind::Floating); + assert_eq!(eip.kind(), IpKind::Floating); 1 } else { // SNAT will occupy 1.0.0.8 here, since it it alloc'd before // the ephemeral. - assert_eq!(eip.kind, IpKind::Ephemeral); + assert_eq!(eip.kind(), IpKind::Ephemeral); 2 }; - assert_eq!(eip.ip, Ipv4Addr::new(10, 0, 0, last_octet as u8)); + assert_eq!(eip.ip(), Ipv4Addr::new(10, 0, 0, last_octet as u8)); } // Verify that all floating IPs are bound to their parent instance. 
@@ -3892,7 +3892,7 @@ async fn fetch_instance_ephemeral_ip( fetch_instance_external_ips(client, instance_name, PROJECT_NAME) .await .into_iter() - .find(|v| v.kind == IpKind::Ephemeral) + .find(|v| v.kind() == IpKind::Ephemeral) .unwrap() } diff --git a/nexus/types/src/external_api/views.rs b/nexus/types/src/external_api/views.rs index 1555ab749a..26c41b7465 100644 --- a/nexus/types/src/external_api/views.rs +++ b/nexus/types/src/external_api/views.rs @@ -313,16 +313,34 @@ pub struct IpPoolRange { // INSTANCE EXTERNAL IP ADDRESSES -#[derive(Debug, Clone, Deserialize, Eq, PartialEq, Serialize, JsonSchema)] -#[serde(rename_all = "snake_case")] -pub struct ExternalIp { - pub ip: IpAddr, - pub kind: IpKind, +#[derive(Debug, Clone, Deserialize, PartialEq, Serialize, JsonSchema)] +#[serde(tag = "kind", rename_all = "snake_case")] +pub enum ExternalIp { + Ephemeral { ip: IpAddr }, + Floating(FloatingIp), +} + +impl ExternalIp { + pub fn ip(&self) -> IpAddr { + match self { + Self::Ephemeral { ip } => *ip, + Self::Floating(float) => float.ip, + } + } + + pub fn kind(&self) -> IpKind { + match self { + Self::Ephemeral { .. } => IpKind::Ephemeral, + Self::Floating(_) => IpKind::Floating, + } + } } /// A Floating IP is a well-known IP address which can be attached /// and detached from instances. 
-#[derive(ObjectIdentity, Debug, Clone, Deserialize, Serialize, JsonSchema)] +#[derive( + ObjectIdentity, Debug, PartialEq, Clone, Deserialize, Serialize, JsonSchema, +)] #[serde(rename_all = "snake_case")] pub struct FloatingIp { #[serde(flatten)] @@ -338,7 +356,7 @@ pub struct FloatingIp { impl From for ExternalIp { fn from(value: FloatingIp) -> Self { - ExternalIp { ip: value.ip, kind: IpKind::Floating } + ExternalIp::Floating(value) } } diff --git a/openapi/nexus.json b/openapi/nexus.json index 2b9cfbc91a..45cb5a72da 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -10880,19 +10880,91 @@ ] }, "ExternalIp": { - "type": "object", - "properties": { - "ip": { - "type": "string", - "format": "ip" + "oneOf": [ + { + "type": "object", + "properties": { + "ip": { + "type": "string", + "format": "ip" + }, + "kind": { + "type": "string", + "enum": [ + "ephemeral" + ] + } + }, + "required": [ + "ip", + "kind" + ] }, - "kind": { - "$ref": "#/components/schemas/IpKind" + { + "description": "A Floating IP is a well-known IP address which can be attached and detached from instances.", + "type": "object", + "properties": { + "description": { + "description": "human-readable free-form text about a resource", + "type": "string" + }, + "id": { + "description": "unique, immutable, system-controlled identifier for each resource", + "type": "string", + "format": "uuid" + }, + "instance_id": { + "nullable": true, + "description": "The ID of the instance that this Floating IP is attached to, if it is presently in use.", + "type": "string", + "format": "uuid" + }, + "ip": { + "description": "The IP address held by this resource.", + "type": "string", + "format": "ip" + }, + "kind": { + "type": "string", + "enum": [ + "floating" + ] + }, + "name": { + "description": "unique, mutable, user-controlled identifier for each resource", + "allOf": [ + { + "$ref": "#/components/schemas/Name" + } + ] + }, + "project_id": { + "description": "The project this resource exists within.", + 
"type": "string", + "format": "uuid" + }, + "time_created": { + "description": "timestamp when this resource was created", + "type": "string", + "format": "date-time" + }, + "time_modified": { + "description": "timestamp when this resource was last modified", + "type": "string", + "format": "date-time" + } + }, + "required": [ + "description", + "id", + "ip", + "kind", + "name", + "project_id", + "time_created", + "time_modified" + ] } - }, - "required": [ - "ip", - "kind" ] }, "ExternalIpCreate": { @@ -12373,14 +12445,6 @@ } ] }, - "IpKind": { - "description": "The kind of an external IP address for an instance", - "type": "string", - "enum": [ - "ephemeral", - "floating" - ] - }, "IpNet": { "oneOf": [ { From f26aa6479418f04acd2db96d4586324b3a892780 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 5 Jan 2024 18:19:00 +0000 Subject: [PATCH 40/56] Fix up end-to-end tests with new IP structure. --- end-to-end-tests/src/instance_launch.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/end-to-end-tests/src/instance_launch.rs b/end-to-end-tests/src/instance_launch.rs index b3d1406070..01c801cd58 100644 --- a/end-to-end-tests/src/instance_launch.rs +++ b/end-to-end-tests/src/instance_launch.rs @@ -5,9 +5,9 @@ use anyhow::{ensure, Context as _, Result}; use async_trait::async_trait; use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; use oxide_client::types::{ - ByteCount, DiskCreate, DiskSource, ExternalIpCreate, InstanceCpuCount, - InstanceCreate, InstanceDiskAttachment, InstanceNetworkInterfaceAttachment, - SshKeyCreate, + ByteCount, DiskCreate, DiskSource, ExternalIp, ExternalIpCreate, + InstanceCpuCount, InstanceCreate, InstanceDiskAttachment, + InstanceNetworkInterfaceAttachment, SshKeyCreate, }; use oxide_client::{ClientDisksExt, ClientInstancesExt, ClientSessionExt}; use russh::{ChannelMsg, Disconnect}; @@ -70,7 +70,7 @@ async fn instance_launch() -> Result<()> { name: disk_name.clone(), }], 
network_interfaces: InstanceNetworkInterfaceAttachment::Default, - external_ips: vec![ExternalIpCreate::Ephemeral { pool_name: None }], + external_ips: vec![ExternalIpCreate::Ephemeral { pool: None }], user_data: String::new(), start: true, }) @@ -86,8 +86,10 @@ async fn instance_launch() -> Result<()> { .await? .items .first() - .context("no external IPs")? - .ip; + .context("no external IPs")?; + let ExternalIp::Ephemeral { ip: ip_addr } = ip_addr else { + anyhow::bail!("IP bound to instance was not ephemeral as required.") + }; eprintln!("instance external IP: {}", ip_addr); // poll serial for login prompt, waiting 5 min max From 6184ecc4a032e89d87f20f4d248797ff4288551d Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 5 Jan 2024 18:21:01 +0000 Subject: [PATCH 41/56] Bump up schema version pre-merge --- nexus/db-model/src/schema.rs | 2 +- schema/crdb/{23.0.0 => 24.0.0}/up01.sql | 0 schema/crdb/{23.0.0 => 24.0.0}/up02.sql | 0 schema/crdb/{23.0.0 => 24.0.0}/up03.sql | 0 schema/crdb/{23.0.0 => 24.0.0}/up04.sql | 0 schema/crdb/{23.0.0 => 24.0.0}/up05.sql | 0 schema/crdb/{23.0.0 => 24.0.0}/up06.sql | 0 schema/crdb/{23.0.0 => 24.0.0}/up07.sql | 0 schema/crdb/{23.0.0 => 24.0.0}/up08.sql | 0 schema/crdb/{23.0.0 => 24.0.0}/up09.sql | 0 schema/crdb/dbinit.sql | 2 +- 11 files changed, 2 insertions(+), 2 deletions(-) rename schema/crdb/{23.0.0 => 24.0.0}/up01.sql (100%) rename schema/crdb/{23.0.0 => 24.0.0}/up02.sql (100%) rename schema/crdb/{23.0.0 => 24.0.0}/up03.sql (100%) rename schema/crdb/{23.0.0 => 24.0.0}/up04.sql (100%) rename schema/crdb/{23.0.0 => 24.0.0}/up05.sql (100%) rename schema/crdb/{23.0.0 => 24.0.0}/up06.sql (100%) rename schema/crdb/{23.0.0 => 24.0.0}/up07.sql (100%) rename schema/crdb/{23.0.0 => 24.0.0}/up08.sql (100%) rename schema/crdb/{23.0.0 => 24.0.0}/up09.sql (100%) diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 7a5a3428bc..9482f5cd77 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs 
@@ -13,7 +13,7 @@ use omicron_common::api::external::SemverVersion; /// /// This should be updated whenever the schema is changed. For more details, /// refer to: schema/crdb/README.adoc -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(23, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(24, 0, 0); table! { disk (id) { diff --git a/schema/crdb/23.0.0/up01.sql b/schema/crdb/24.0.0/up01.sql similarity index 100% rename from schema/crdb/23.0.0/up01.sql rename to schema/crdb/24.0.0/up01.sql diff --git a/schema/crdb/23.0.0/up02.sql b/schema/crdb/24.0.0/up02.sql similarity index 100% rename from schema/crdb/23.0.0/up02.sql rename to schema/crdb/24.0.0/up02.sql diff --git a/schema/crdb/23.0.0/up03.sql b/schema/crdb/24.0.0/up03.sql similarity index 100% rename from schema/crdb/23.0.0/up03.sql rename to schema/crdb/24.0.0/up03.sql diff --git a/schema/crdb/23.0.0/up04.sql b/schema/crdb/24.0.0/up04.sql similarity index 100% rename from schema/crdb/23.0.0/up04.sql rename to schema/crdb/24.0.0/up04.sql diff --git a/schema/crdb/23.0.0/up05.sql b/schema/crdb/24.0.0/up05.sql similarity index 100% rename from schema/crdb/23.0.0/up05.sql rename to schema/crdb/24.0.0/up05.sql diff --git a/schema/crdb/23.0.0/up06.sql b/schema/crdb/24.0.0/up06.sql similarity index 100% rename from schema/crdb/23.0.0/up06.sql rename to schema/crdb/24.0.0/up06.sql diff --git a/schema/crdb/23.0.0/up07.sql b/schema/crdb/24.0.0/up07.sql similarity index 100% rename from schema/crdb/23.0.0/up07.sql rename to schema/crdb/24.0.0/up07.sql diff --git a/schema/crdb/23.0.0/up08.sql b/schema/crdb/24.0.0/up08.sql similarity index 100% rename from schema/crdb/23.0.0/up08.sql rename to schema/crdb/24.0.0/up08.sql diff --git a/schema/crdb/23.0.0/up09.sql b/schema/crdb/24.0.0/up09.sql similarity index 100% rename from schema/crdb/23.0.0/up09.sql rename to schema/crdb/24.0.0/up09.sql diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 88ee585624..aad420df0f 100644 --- 
a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3281,7 +3281,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '23.0.0', NULL) + ( TRUE, NOW(), NOW(), '24.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From 73db56074756198edf0062bfda78cab1bc976a80 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 5 Jan 2024 22:37:16 +0000 Subject: [PATCH 42/56] Banish `NameOrId` resolution from datastore/external-ip --- .../src/db/datastore/external_ip.rs | 50 ++++++++++--------- .../db-queries/src/db/queries/external_ip.rs | 17 +++++-- nexus/src/app/external_ip.rs | 21 +++++++- nexus/src/app/sagas/instance_create.rs | 6 +-- nexus/src/app/sagas/instance_ip_attach.rs | 6 +-- 5 files changed, 64 insertions(+), 36 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 677e1f357d..b76ef43292 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -45,7 +45,6 @@ use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; -use omicron_common::api::external::NameOrId; use omicron_common::api::external::ResourceType; use omicron_common::api::external::UpdateResult; use ref_cast::RefCast; @@ -88,7 +87,7 @@ impl DataStore { opctx: &OpContext, ip_id: Uuid, instance_id: Uuid, - pool_id: Option, + pool: Option, creating_instance: bool, ) -> CreateResult<(ExternalIp, bool)> { // This is slightly hacky: we need to create an unbound ephemeral IP, and @@ -97,16 +96,18 @@ impl DataStore { // - At most MAX external IPs per instance // Naturally, we now *need* to destroy the ephemeral IP if the newly alloc'd // IP was not attached, including on idempotent success. 
- let pool = match pool_id { - Some(id) => { - let (.., authz_pool, pool) = LookupPath::new(opctx, &self) - .ip_pool_id(id) + let pool = match pool { + Some(authz_pool) => { + let (.., pool) = LookupPath::new(opctx, &self) + .ip_pool_id(authz_pool.id()) // any authenticated user can CreateChild on an IP pool. this is // meant to represent allocating an IP .fetch_for(authz::Action::CreateChild) .await?; // If this pool is not linked to the current silo, 404 + // As name resolution happens one layer up, we need to use the *original* + // authz Pool. if self.ip_pool_fetch_link(opctx, pool.id()).await.is_err() { return Err(authz_pool.not_found()); } @@ -210,33 +211,34 @@ impl DataStore { opctx: &OpContext, project_id: Uuid, params: params::FloatingIpCreate, + pool: Option, ) -> CreateResult { let ip_id = Uuid::new_v4(); - // TODO: NameOrId resolution should happen a level higher, in the nexus function - let (.., authz_pool, pool) = match params.pool { - Some(NameOrId::Name(name)) => { - LookupPath::new(opctx, self) - .ip_pool_name(&Name(name)) - .fetch_for(authz::Action::Read) - .await? + // This implements the same pattern as in `allocate_instance_ephemeral_ip` to + // check that a chosen pool is valid from within the current silo. + let pool = match pool { + Some(authz_pool) => { + let (.., pool) = LookupPath::new(opctx, &self) + .ip_pool_id(authz_pool.id()) + .fetch_for(authz::Action::CreateChild) + .await?; + + if self.ip_pool_fetch_link(opctx, pool.id()).await.is_err() { + return Err(authz_pool.not_found()); + } + + pool } - Some(NameOrId::Id(id)) => { - LookupPath::new(opctx, self) - .ip_pool_id(id) - .fetch_for(authz::Action::Read) - .await? 
+ // If no name given, use the default logic + None => { + let (.., pool) = self.ip_pools_fetch_default(&opctx).await?; + pool } - None => self.ip_pools_fetch_default(opctx).await?, }; let pool_id = pool.id(); - // If this pool is not linked to the current silo, 404 - if self.ip_pool_fetch_link(opctx, pool_id).await.is_err() { - return Err(authz_pool.not_found()); - } - let data = if let Some(ip) = params.address { IncompleteExternalIp::for_floating_explicit( ip_id, diff --git a/nexus/db-queries/src/db/queries/external_ip.rs b/nexus/db-queries/src/db/queries/external_ip.rs index de94e618fa..8114b9e363 100644 --- a/nexus/db-queries/src/db/queries/external_ip.rs +++ b/nexus/db-queries/src/db/queries/external_ip.rs @@ -863,10 +863,12 @@ impl RunQueryDsl for NextExternalIp {} #[cfg(test)] mod tests { + use crate::authz; use crate::context::OpContext; use crate::db::datastore::DataStore; use crate::db::datastore::SERVICE_IP_POOL_NAME; use crate::db::identity::Resource; + use crate::db::lookup::LookupPath; use crate::db::model::IpKind; use crate::db::model::IpPool; use crate::db::model::IpPoolRange; @@ -923,7 +925,7 @@ mod tests { name: &str, range: IpRange, is_default: bool, - ) -> IpPool { + ) -> authz::IpPool { let pool = IpPool::new(&IdentityMetadataCreateParams { name: String::from(name).parse().unwrap(), description: format!("ip pool {}", name), @@ -948,7 +950,12 @@ mod tests { self.initialize_ip_pool(name, range).await; - pool + LookupPath::new(&self.opctx, &self.db_datastore) + .ip_pool_id(pool.id()) + .lookup_for(authz::Action::Read) + .await + .unwrap() + .0 } async fn initialize_ip_pool(&self, name: &str, range: IpRange) { @@ -1825,7 +1832,7 @@ mod tests { &context.opctx, id, instance_id, - Some(p1.id()), + Some(p1), true, ) .await @@ -1870,7 +1877,7 @@ mod tests { &context.opctx, Uuid::new_v4(), instance_id, - Some(p1.id()), + Some(p1.clone()), true, ) .await @@ -1892,7 +1899,7 @@ mod tests { &context.opctx, Uuid::new_v4(), instance_id, - Some(p1.id()), + 
Some(p1), true, ) .await diff --git a/nexus/src/app/external_ip.rs b/nexus/src/app/external_ip.rs index 7f41b7fd20..3315f4be5a 100644 --- a/nexus/src/app/external_ip.rs +++ b/nexus/src/app/external_ip.rs @@ -20,6 +20,7 @@ use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::NameOrId; +use ref_cast::RefCast; impl super::Nexus { pub(crate) async fn instance_list_external_ips( @@ -105,9 +106,27 @@ impl super::Nexus { let (.., authz_project) = project_lookup.lookup_for(authz::Action::CreateChild).await?; + let pool = match &params.pool { + Some(NameOrId::Name(name)) => Some( + LookupPath::new(opctx, &self.db_datastore) + .ip_pool_name(nexus_db_model::Name::ref_cast(name)) + .lookup_for(authz::Action::Read) + .await? + .0, + ), + Some(NameOrId::Id(id)) => Some( + LookupPath::new(opctx, &self.db_datastore) + .ip_pool_id(*id) + .lookup_for(authz::Action::Read) + .await? + .0, + ), + None => None, + }; + Ok(self .db_datastore - .allocate_floating_ip(opctx, authz_project.id(), params) + .allocate_floating_ip(opctx, authz_project.id(), params, pool) .await?
.try_into() .unwrap()) diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index f5a2e6756b..fd1640149c 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -626,7 +626,7 @@ async fn sic_allocate_instance_external_ip( let ip = match ip_params { // Allocate a new IP address from the target, possibly default, pool params::ExternalIpCreate::Ephemeral { pool } => { - let pool_id = if let Some(name_or_id) = pool { + let pool = if let Some(name_or_id) = pool { let (.., authz_pool) = match name_or_id { NameOrId::Name(name) => LookupPath::new(&opctx, datastore) .ip_pool_name(db::model::Name::ref_cast(name)), @@ -638,7 +638,7 @@ async fn sic_allocate_instance_external_ip( .await .map_err(ActionError::action_failed)?; - Some(authz_pool.id()) + Some(authz_pool) } else { None }; @@ -649,7 +649,7 @@ async fn sic_allocate_instance_external_ip( &opctx, ip_id, instance_id, - pool_id, + pool, true, ) .await diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 7a0c41b9ec..e5829a0cfb 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -93,7 +93,7 @@ async fn siia_begin_attach_ip( match &params.create_params { // Allocate a new IP address from the target, possibly default, pool params::ExternalIpCreate::Ephemeral { pool } => { - let pool_id = if let Some(name_or_id) = pool { + let pool = if let Some(name_or_id) = pool { let (.., authz_pool) = match name_or_id { NameOrId::Name(name) => LookupPath::new(&opctx, datastore) .ip_pool_name(db::model::Name::ref_cast(name)), @@ -105,7 +105,7 @@ async fn siia_begin_attach_ip( .await .map_err(ActionError::action_failed)?; - Some(authz_pool.id()) + Some(authz_pool) } else { None }; @@ -115,7 +115,7 @@ async fn siia_begin_attach_ip( &opctx, Uuid::new_v4(), params.authz_instance.id(), - pool_id, + pool, false, ) .await From
5a916ca949ec048b5741bbe4ead32d0b545ab6c3 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Sat, 6 Jan 2024 12:20:50 +0000 Subject: [PATCH 43/56] Review feedback: use `Nexus::ip_pool_lookup` --- nexus/preprocessed_configs/config.xml | 0 nexus/src/app/external_ip.rs | 13 ++----------- nexus/src/app/sagas/instance_create.rs | 22 ++++++++++------------ nexus/src/app/sagas/instance_ip_attach.rs | 22 ++++++++++------------ 4 files changed, 22 insertions(+), 35 deletions(-) create mode 100644 nexus/preprocessed_configs/config.xml diff --git a/nexus/preprocessed_configs/config.xml b/nexus/preprocessed_configs/config.xml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/nexus/src/app/external_ip.rs b/nexus/src/app/external_ip.rs index 3315f4be5a..eec3b84cda 100644 --- a/nexus/src/app/external_ip.rs +++ b/nexus/src/app/external_ip.rs @@ -20,7 +20,6 @@ use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::NameOrId; -use ref_cast::RefCast; impl super::Nexus { pub(crate) async fn instance_list_external_ips( @@ -107,16 +106,8 @@ impl super::Nexus { project_lookup.lookup_for(authz::Action::CreateChild).await?; let pool = match &params.pool { - Some(NameOrId::Name(name)) => Some( - LookupPath::new(opctx, &self.db_datastore) - .ip_pool_name(nexus_db_model::Name::ref_cast(name)) - .lookup_for(authz::Action::Read) - .await? - .0, - ), - Some(NameOrId::Id(id)) => Some( - LookupPath::new(opctx, &self.db_datastore) - .ip_pool_id(*id) + Some(pool) => Some( + self.ip_pool_lookup(opctx, pool)? .lookup_for(authz::Action::Read) .await?
.0, diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index fd1640149c..3aa491d978 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -627,18 +627,16 @@ async fn sic_allocate_instance_external_ip( // Allocate a new IP address from the target, possibly default, pool params::ExternalIpCreate::Ephemeral { pool } => { let pool = if let Some(name_or_id) = pool { - let (.., authz_pool) = match name_or_id { - NameOrId::Name(name) => LookupPath::new(&opctx, datastore) - .ip_pool_name(db::model::Name::ref_cast(name)), - NameOrId::Id(id) => { - LookupPath::new(&opctx, datastore).ip_pool_id(*id) - } - } - .lookup_for(authz::Action::CreateChild) - .await - .map_err(ActionError::action_failed)?; - - Some(authz_pool) + Some( + osagactx + .nexus() + .ip_pool_lookup(&opctx, name_or_id) + .map_err(ActionError::action_failed)? + .lookup_for(authz::Action::CreateChild) + .await + .map_err(ActionError::action_failed)? + .0, + ) } else { None }; diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index e5829a0cfb..6c21c46ca5 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -94,18 +94,16 @@ async fn siia_begin_attach_ip( // Allocate a new IP address from the target, possibly default, pool params::ExternalIpCreate::Ephemeral { pool } => { let pool = if let Some(name_or_id) = pool { - let (.., authz_pool) = match name_or_id { - NameOrId::Name(name) => LookupPath::new(&opctx, datastore) - .ip_pool_name(db::model::Name::ref_cast(name)), - NameOrId::Id(id) => { - LookupPath::new(&opctx, datastore).ip_pool_id(*id) - } - } - .lookup_for(authz::Action::CreateChild) - .await - .map_err(ActionError::action_failed)?; - - Some(authz_pool) + Some( + osagactx + .nexus() + .ip_pool_lookup(&opctx, name_or_id) + .map_err(ActionError::action_failed)? 
+ .lookup_for(authz::Action::CreateChild) + .await + .map_err(ActionError::action_failed)? + .0, + ) } else { None }; From 725efa40aa769ef07ef69c36624254cc72638b70 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 10 Jan 2024 12:16:51 +0000 Subject: [PATCH 44/56] Review feedback: nits and error messages --- .../src/db/datastore/external_ip.rs | 14 ++++++--- nexus/preprocessed_configs/config.xml | 0 nexus/src/app/sagas/instance_common.rs | 28 ++++++++--------- nexus/src/app/sagas/instance_ip_attach.rs | 27 ++++++++++++++-- nexus/src/app/sagas/instance_ip_detach.rs | 31 +++++++++++++++++-- 5 files changed, 77 insertions(+), 23 deletions(-) delete mode 100644 nexus/preprocessed_configs/config.xml diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index b76ef43292..1c97fa6c68 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -460,7 +460,9 @@ impl DataStore { // a UniqueViolation. IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::unavail(&format!( - "tried to attach {kind} IP mid-attach/detach" + "tried to attach {kind} IP mid-attach/detach: \ + attach will be safe to retry once operation on \ + same IP resource completes" ))), IpAttachState::Detached => {}, @@ -469,7 +471,8 @@ impl DataStore { Err(match &collection.runtime_state.nexus_state { state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail(&format!( - "tried to attach {kind} IP while instance was changing state" + "tried to attach {kind} IP while instance was changing state: \ + attach will be safe to retry once start/stop/migrate completes" )), state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { if attached_count >= MAX_EXTERNAL_IPS_PLUS_SNAT as i64 { @@ -577,14 +580,17 @@ impl DataStore { // User can reattempt depending on how the current saga unfolds. 
IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::unavail(&format!( - "tried to detach {kind} IP mid-attach/detach" + "tried to detach {kind} IP mid-attach/detach: \ + attach will be safe to retry once operation on \ + same IP resource completes" ))), IpAttachState::Attached => {}, } match collection.runtime_state.nexus_state { state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail(&format!( - "tried to detach {kind} IP while instance was changing state" + "tried to attach {kind} IP while instance was changing state: \ + attach will be safe to retry once start/stop/migrate completes" )), state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { Error::internal_error(&format!("failed to detach {kind} IP")) diff --git a/nexus/preprocessed_configs/config.xml b/nexus/preprocessed_configs/config.xml deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index 8118b9df8a..f885088745 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -168,18 +168,17 @@ pub async fn instance_ip_move_state( serialized_authn: &authn::saga::Serialized, from: IpAttachState, to: IpAttachState, + new_ip: &ModifyStateForExternalIp, ) -> Result { let osagactx = sagactx.user_data(); let datastore = osagactx.datastore(); let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - let new_ip = sagactx.lookup::("target_ip")?; - if !new_ip.do_saga { return Ok(true); } - let Some(new_ip) = new_ip.external_ip else { + let Some(new_ip) = new_ip.external_ip.as_ref() else { return Err(ActionError::action_failed(Error::internal_error( "tried to `do_saga` without valid external IP", ))); @@ -266,6 +265,8 @@ pub async fn instance_ip_add_nat( sagactx: &NexusActionContext, serialized_authn: &authn::saga::Serialized, authz_instance: &authz::Instance, + sled_uuid: Option, + target_ip: 
ModifyStateForExternalIp, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); let datastore = osagactx.datastore(); @@ -273,12 +274,10 @@ pub async fn instance_ip_add_nat( crate::context::op_context_for_saga_action(&sagactx, serialized_authn); // No physical sled? Don't push NAT. - let Some(sled_uuid) = sagactx.lookup::>("instance_state")? - else { + let Some(sled_uuid) = sled_uuid else { return Ok(()); }; - let target_ip = sagactx.lookup::("target_ip")?; if !target_ip.do_saga { return Ok(()); } @@ -314,17 +313,18 @@ pub async fn instance_ip_remove_nat( sagactx: &NexusActionContext, serialized_authn: &authn::saga::Serialized, authz_instance: &authz::Instance, + sled_uuid: Option, + target_ip: ModifyStateForExternalIp, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); // No physical sled? Don't push NAT. - let Some(_) = sagactx.lookup::>("instance_state")? else { + if sled_uuid.is_none() { return Ok(()); }; - let target_ip = sagactx.lookup::("target_ip")?; if !target_ip.do_saga { return Ok(()); } @@ -346,16 +346,16 @@ pub async fn instance_ip_remove_nat( pub async fn instance_ip_add_opte( sagactx: &NexusActionContext, authz_instance: &authz::Instance, + sled_uuid: Option, + target_ip: ModifyStateForExternalIp, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); // No physical sled? Don't inform OPTE. - let Some(sled_uuid) = sagactx.lookup::>("instance_state")? - else { + let Some(sled_uuid) = sled_uuid else { return Ok(()); }; - let target_ip = sagactx.lookup::("target_ip")?; if !target_ip.do_saga { return Ok(()); } @@ -396,16 +396,16 @@ pub async fn instance_ip_add_opte( pub async fn instance_ip_remove_opte( sagactx: &NexusActionContext, authz_instance: &authz::Instance, + sled_uuid: Option, + target_ip: ModifyStateForExternalIp, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); // No physical sled? Don't inform OPTE. 
- let Some(sled_uuid) = sagactx.lookup::>("instance_state")? - else { + let Some(sled_uuid) = sled_uuid else { return Ok(()); }; - let target_ip = sagactx.lookup::("target_ip")?; if !target_ip.do_saga { return Ok(()); } diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 6c21c46ca5..7e1e10cb2e 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -160,11 +160,13 @@ async fn siia_begin_attach_ip_undo( let log = sagactx.user_data().log(); warn!(log, "siia_begin_attach_ip_undo: Reverting detached->attaching"); let params = sagactx.saga_params::()?; + let new_ip = sagactx.lookup::("target_ip")?; if !instance_ip_move_state( &sagactx, ¶ms.serialized_authn, IpAttachState::Attaching, IpAttachState::Detached, + &new_ip, ) .await? { @@ -189,10 +191,14 @@ async fn siia_get_instance_state( async fn siia_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { let params = sagactx.saga_params::()?; + let sled_id = sagactx.lookup::>("instance_state")?; + let target_ip = sagactx.lookup::("target_ip")?; instance_ip_add_nat( &sagactx, ¶ms.serialized_authn, ¶ms.authz_instance, + sled_id, + target_ip, ) .await } @@ -202,10 +208,14 @@ async fn siia_nat_undo( ) -> Result<(), anyhow::Error> { let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; + let sled_id = sagactx.lookup::>("instance_state")?; + let target_ip = sagactx.lookup::("target_ip")?; if let Err(e) = instance_ip_remove_nat( &sagactx, ¶ms.serialized_authn, ¶ms.authz_instance, + sled_id, + target_ip, ) .await { @@ -219,7 +229,10 @@ async fn siia_update_opte( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let params = sagactx.saga_params::()?; - instance_ip_add_opte(&sagactx, ¶ms.authz_instance).await + let sled_id = sagactx.lookup::>("instance_state")?; + let target_ip = sagactx.lookup::("target_ip")?; + instance_ip_add_opte(&sagactx, ¶ms.authz_instance, sled_id, target_ip) + .await 
} async fn siia_update_opte_undo( @@ -227,8 +240,15 @@ async fn siia_update_opte_undo( ) -> Result<(), anyhow::Error> { let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; - if let Err(e) = - instance_ip_remove_opte(&sagactx, ¶ms.authz_instance).await + let sled_id = sagactx.lookup::>("instance_state")?; + let target_ip = sagactx.lookup::("target_ip")?; + if let Err(e) = instance_ip_remove_opte( + &sagactx, + ¶ms.authz_instance, + sled_id, + target_ip, + ) + .await { error!(log, "siia_update_opte_undo: failed to notify sled-agent: {e}"); } @@ -247,6 +267,7 @@ async fn siia_complete_attach( ¶ms.serialized_authn, IpAttachState::Attaching, IpAttachState::Attached, + &target_ip, ) .await? { diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index cff14d6ba8..442eba6531 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -137,11 +137,13 @@ async fn siid_begin_detach_ip_undo( let log = sagactx.user_data().log(); warn!(log, "siid_begin_detach_ip_undo: Reverting attached->detaching"); let params = sagactx.saga_params::()?; + let new_ip = sagactx.lookup::("target_ip")?; if !instance_ip_move_state( &sagactx, ¶ms.serialized_authn, IpAttachState::Detaching, IpAttachState::Attached, + &new_ip, ) .await? 
{ @@ -166,10 +168,14 @@ async fn siid_get_instance_state( async fn siid_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { let params = sagactx.saga_params::()?; + let sled_id = sagactx.lookup::>("instance_state")?; + let target_ip = sagactx.lookup::("target_ip")?; instance_ip_remove_nat( &sagactx, ¶ms.serialized_authn, ¶ms.authz_instance, + sled_id, + target_ip, ) .await } @@ -179,10 +185,14 @@ async fn siid_nat_undo( ) -> Result<(), anyhow::Error> { let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; + let sled_id = sagactx.lookup::>("instance_state")?; + let target_ip = sagactx.lookup::("target_ip")?; if let Err(e) = instance_ip_add_nat( &sagactx, ¶ms.serialized_authn, ¶ms.authz_instance, + sled_id, + target_ip, ) .await { @@ -196,7 +206,15 @@ async fn siid_update_opte( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let params = sagactx.saga_params::()?; - instance_ip_remove_opte(&sagactx, ¶ms.authz_instance).await + let sled_id = sagactx.lookup::>("instance_state")?; + let target_ip = sagactx.lookup::("target_ip")?; + instance_ip_remove_opte( + &sagactx, + ¶ms.authz_instance, + sled_id, + target_ip, + ) + .await } async fn siid_update_opte_undo( @@ -204,7 +222,15 @@ async fn siid_update_opte_undo( ) -> Result<(), anyhow::Error> { let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; - if let Err(e) = instance_ip_add_opte(&sagactx, ¶ms.authz_instance).await + let sled_id = sagactx.lookup::>("instance_state")?; + let target_ip = sagactx.lookup::("target_ip")?; + if let Err(e) = instance_ip_add_opte( + &sagactx, + ¶ms.authz_instance, + sled_id, + target_ip, + ) + .await { error!(log, "siid_update_opte_undo: failed to notify sled-agent: {e}"); } @@ -223,6 +249,7 @@ async fn siid_complete_detach( ¶ms.serialized_authn, IpAttachState::Detaching, IpAttachState::Detached, + &target_ip, ) .await? 
{ From f5a50b0fcdd3415789060bdf4114ab717f08ac02 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 10 Jan 2024 12:40:43 +0000 Subject: [PATCH 45/56] Review feedback: correct lockout check with live migration --- .../src/db/datastore/external_ip.rs | 30 +++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 1c97fa6c68..fc838467d4 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -406,7 +406,10 @@ impl DataStore { let query = Instance::attach_resource( instance_id, ip_id, - inst_table.into_boxed().filter(inst_dsl::state.eq_any(safe_states)), + inst_table + .into_boxed() + .filter(inst_dsl::state.eq_any(safe_states)) + .filter(inst_dsl::migration_id.is_null()), table .into_boxed() .filter(dsl::state.eq(IpAttachState::Detached)) @@ -468,11 +471,18 @@ impl DataStore { IpAttachState::Detached => {}, } + if collection.runtime_state.migration_id.is_some() { + return Err(Error::unavail(&format!( + "tried to attach {kind} IP while instance was migrating: \ + detach will be safe to retry once migrate completes" + ))) + } + Err(match &collection.runtime_state.nexus_state { state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail(&format!( "tried to attach {kind} IP while instance was changing state: \ - attach will be safe to retry once start/stop/migrate completes" + attach will be safe to retry once start/stop completes" )), state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { if attached_count >= MAX_EXTERNAL_IPS_PLUS_SNAT as i64 { @@ -532,7 +542,10 @@ impl DataStore { let query = Instance::detach_resource( instance_id, ip_id, - inst_table.into_boxed().filter(inst_dsl::state.eq_any(safe_states)), + inst_table + .into_boxed() + .filter(inst_dsl::state.eq_any(safe_states)) + .filter(inst_dsl::migration_id.is_null()), table .into_boxed() 
.filter(dsl::state.eq(IpAttachState::Attached)) @@ -581,16 +594,23 @@ impl DataStore { IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::unavail(&format!( "tried to detach {kind} IP mid-attach/detach: \ - attach will be safe to retry once operation on \ + detach will be safe to retry once operation on \ same IP resource completes" ))), IpAttachState::Attached => {}, } + if collection.runtime_state.migration_id.is_some() { + return Err(Error::unavail(&format!( + "tried to detach {kind} IP while instance was migrating: \ + detach will be safe to retry once migrate completes" + ))) + } + match collection.runtime_state.nexus_state { state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail(&format!( "tried to attach {kind} IP while instance was changing state: \ - attach will be safe to retry once start/stop/migrate completes" + detach will be safe to retry once start/stop completes" )), state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { Error::internal_error(&format!("failed to detach {kind} IP")) From d1519b181639b5cb321558a7f251c70a6479d4a2 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 10 Jan 2024 12:54:09 +0000 Subject: [PATCH 46/56] Self review: missed some comments --- nexus/src/app/sagas/instance_common.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index f885088745..95f7a1e689 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -261,6 +261,11 @@ pub async fn instance_ip_get_instance_state( Ok(sled_id) } +/// Adds a NAT entry to DPD, routing packets bound for `target_ip` to a +/// target sled. +/// +/// This call is a no-op if `sled_uuid` is `None` or the saga is explicitly +/// set to be inactive in event of double attach/detach (`!target_ip.do_saga`). 
pub async fn instance_ip_add_nat( sagactx: &NexusActionContext, serialized_authn: &authn::saga::Serialized, @@ -309,6 +314,10 @@ pub async fn instance_ip_add_nat( Ok(()) } +/// Remove a single NAT entry from DPD, dropping packets bound for `target_ip`. +/// +/// This call is a no-op if `sled_uuid` is `None` or the saga is explicitly +/// set to be inactive in event of double attach/detach (`!target_ip.do_saga`). pub async fn instance_ip_remove_nat( sagactx: &NexusActionContext, serialized_authn: &authn::saga::Serialized, @@ -343,6 +352,11 @@ pub async fn instance_ip_remove_nat( Ok(()) } +/// Inform OPTE that it should start sending/receiving traffic on a given IP +/// address. +/// +/// This call is a no-op if `sled_uuid` is `None` or the saga is explicitly +/// set to be inactive in event of double attach/detach (`!target_ip.do_saga`). pub async fn instance_ip_add_opte( sagactx: &NexusActionContext, authz_instance: &authz::Instance, @@ -393,6 +407,11 @@ pub async fn instance_ip_add_opte( Ok(()) } +/// Inform OPTE that it should cease sending/receiving traffic on a given IP +/// address. +/// +/// This call is a no-op if `sled_uuid` is `None` or the saga is explicitly +/// set to be inactive in event of double attach/detach (`!target_ip.do_saga`). 
pub async fn instance_ip_remove_opte( sagactx: &NexusActionContext, authz_instance: &authz::Instance, From 705cb6e5a0b8dec12b1d204d7c6cea92b151cdbb Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 10 Jan 2024 12:58:35 +0000 Subject: [PATCH 47/56] Self review: comment expansion --- nexus/src/app/sagas/instance_common.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index 95f7a1e689..ff0c767abb 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -352,8 +352,8 @@ pub async fn instance_ip_remove_nat( Ok(()) } -/// Inform OPTE that it should start sending/receiving traffic on a given IP -/// address. +/// Inform the OPTE port for a running instance that it should start +/// sending/receiving traffic on a given IP address. /// /// This call is a no-op if `sled_uuid` is `None` or the saga is explicitly /// set to be inactive in event of double attach/detach (`!target_ip.do_saga`). @@ -407,8 +407,8 @@ pub async fn instance_ip_add_opte( Ok(()) } -/// Inform OPTE that it should cease sending/receiving traffic on a given IP -/// address. +/// Inform the OPTE port for a running instance that it should cease +/// sending/receiving traffic on a given IP address. /// /// This call is a no-op if `sled_uuid` is `None` or the saga is explicitly /// set to be inactive in event of double attach/detach (`!target_ip.do_saga`). From 774c183052c0019a67d426a26c7b897234e8e288 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 10 Jan 2024 13:00:46 +0000 Subject: [PATCH 48/56] Whitespace... 
--- nexus/src/app/sagas/instance_common.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index ff0c767abb..4afd729bc7 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -352,7 +352,7 @@ pub async fn instance_ip_remove_nat( Ok(()) } -/// Inform the OPTE port for a running instance that it should start +/// Inform the OPTE port for a running instance that it should start /// sending/receiving traffic on a given IP address. /// /// This call is a no-op if `sled_uuid` is `None` or the saga is explicitly @@ -407,7 +407,7 @@ pub async fn instance_ip_add_opte( Ok(()) } -/// Inform the OPTE port for a running instance that it should cease +/// Inform the OPTE port for a running instance that it should cease /// sending/receiving traffic on a given IP address. /// /// This call is a no-op if `sled_uuid` is `None` or the saga is explicitly From f1cd2b1bcfbb201f57e066277c2262d535216335 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 10 Jan 2024 15:01:02 +0000 Subject: [PATCH 49/56] Unduplicate calls to `ensure_nat_entry` --- nexus/src/app/instance_network.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index 8d020a9a3b..d63bbb279e 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -378,6 +378,18 @@ impl super::Nexus { let boundary_switches = self.boundary_switches(&self.opctx_alloc).await?; + for external_ip in ips_of_interest { + // For each external ip, add a nat entry to the database + self.ensure_nat_entry( + external_ip, + sled_address, + &network_interface, + mac_address, + opctx, + ) + .await?; + } + for switch in &boundary_switches { debug!(&self.log, "notifying dendrite of updates"; "instance_id" => %authz_instance.id(), @@ -389,18 +401,6 @@ impl 
super::Nexus { )) })?; - for external_ip in ips_of_interest { - // For each external ip, add a nat entry to the database - self.ensure_nat_entry( - external_ip, - sled_address, - &network_interface, - mac_address, - opctx, - ) - .await?; - } - // Notify dendrite that there are changes for it to reconcile. // In the event of a failure to notify dendrite, we'll log an error // and rely on dendrite's RPW timer to catch it up. From 540de27e10e81274c2701d9baeb18d4138887cdc Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 11 Jan 2024 14:09:03 +0000 Subject: [PATCH 50/56] Review feedback: tougher NAT cleanup when undoing attach We still need to figure out the best possible `unwind` semantic at the end of unwind, then hopefully this is buttoned up. --- nexus/preprocessed_configs/config.xml | 41 ++++++ nexus/src/app/instance_network.rs | 167 +++++++++++++--------- nexus/src/app/sagas/instance_common.rs | 3 +- nexus/src/app/sagas/instance_delete.rs | 2 +- nexus/src/app/sagas/instance_ip_attach.rs | 20 ++- nexus/src/app/sagas/instance_ip_detach.rs | 1 - nexus/src/app/sagas/instance_start.rs | 2 +- 7 files changed, 160 insertions(+), 76 deletions(-) create mode 100644 nexus/preprocessed_configs/config.xml diff --git a/nexus/preprocessed_configs/config.xml b/nexus/preprocessed_configs/config.xml new file mode 100644 index 0000000000..9b13f12aea --- /dev/null +++ b/nexus/preprocessed_configs/config.xml @@ -0,0 +1,41 @@ + + + + + trace + true + + + 8123 + 9000 + 9004 + + ./ + + true + + + + + + + ::/0 + + + default + default + 1 + + + + + + + + + + + \ No newline at end of file diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index d63bbb279e..ced024e7fb 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -7,6 +7,7 @@ use crate::app::sagas::retry_until_known_result; use ipnetwork::IpNetwork; use ipnetwork::Ipv6Network; +use nexus_db_model::ExternalIp; use nexus_db_model::IpAttachState; use 
nexus_db_model::Ipv4NatValues; use nexus_db_model::Vni as DbVni; @@ -373,43 +374,39 @@ impl super::Nexus { let sled_address = Ipv6Net(Ipv6Network::new(*sled_ip_address.ip(), 128).unwrap()); - // Querying boundary switches also requires fleet access and the use of the - // instance allocator context. - let boundary_switches = - self.boundary_switches(&self.opctx_alloc).await?; - - for external_ip in ips_of_interest { + let mut err_and_limit = None; + for (i, external_ip) in ips_of_interest.iter().enumerate() { // For each external ip, add a nat entry to the database - self.ensure_nat_entry( - external_ip, - sled_address, - &network_interface, - mac_address, - opctx, - ) - .await?; + if let Err(e) = self + .ensure_nat_entry( + external_ip, + sled_address, + &network_interface, + mac_address, + opctx, + ) + .await + { + err_and_limit = Some((e, i)); + break; + } } - for switch in &boundary_switches { - debug!(&self.log, "notifying dendrite of updates"; - "instance_id" => %authz_instance.id(), - "switch" => switch.to_string()); - - let dpd_client = self.dpd_clients.get(switch).ok_or_else(|| { - Error::internal_error(&format!( - "unable to find dendrite client for {switch}" - )) - })?; - - // Notify dendrite that there are changes for it to reconcile. - // In the event of a failure to notify dendrite, we'll log an error - // and rely on dendrite's RPW timer to catch it up. - if let Err(e) = dpd_client.ipv4_nat_trigger_update().await { - error!(self.log, "failed to notify dendrite of nat updates"; "error" => ?e); - }; + // In the event of an ongoing cleanup from IP attach, it's possible + // for an entry pointing at an another instance to still exist, which + // will fail to be ensured if we try to attach/start/... before that unwind + // completes. We need to remove the entries we just added because the undo + // won't call into `instance_delete_dpd_config`. 
+ if let Some((e, max)) = err_and_limit { + for external_ip in &ips_of_interest[..max] { + let _ = self + .external_ip_delete_dpd_config(opctx, external_ip) + .await; + } + return Err(e); } - Ok(()) + self.notify_dendrite_nat_state(Some(instance_id), true).await } async fn ensure_nat_entry( @@ -472,7 +469,6 @@ impl super::Nexus { &self, opctx: &OpContext, authz_instance: &authz::Instance, - ip_filter: Option, ) -> Result<(), Error> { let log = &self.log; let instance_id = authz_instance.id(); @@ -485,20 +481,7 @@ impl super::Nexus { .instance_lookup_external_ips(opctx, instance_id) .await?; - let ips_of_interest = if let Some(wanted_id) = ip_filter { - if let Some(ip) = external_ips.iter().find(|v| v.id == wanted_id) { - std::slice::from_ref(ip) - } else { - return Err(Error::internal_error(&format!( - "failed to find external ip address with id: {wanted_id}, saw {external_ips:?}", - ))); - } - } else { - &external_ips[..] - }; - - let mut errors = vec![]; - for entry in ips_of_interest { + for entry in external_ips { // Soft delete the NAT entry match self .db_datastore @@ -522,12 +505,72 @@ impl super::Nexus { }?; } + self.notify_dendrite_nat_state(Some(instance_id), false).await + } + + /// Attempts to delete Dendrite NAT configuration for a single external IP. + /// + /// This function is primarily used to detach an IP which currently belongs + /// to a known instance, or to cleanup after the failed attach of an `ExternalIp` + /// which may no longer have a parent in the DB. Due to the latter case, this + /// function does not lookup and verify that an IP's parent is still a given instance. 
+ pub(crate) async fn external_ip_delete_dpd_config( + &self, + opctx: &OpContext, + external_ip: &ExternalIp, + ) -> Result<(), Error> { + let log = &self.log; + let instance_id = external_ip.parent_id; + + info!(log, "deleting individual NAT entry from dpd configuration"; + "instance_id" => ?instance_id, + "external_ip" => %external_ip.ip); + + // Soft delete the NAT entry + match self + .db_datastore + .ipv4_nat_delete_by_external_ip(&opctx, external_ip) + .await + { + Ok(_) => Ok(()), + Err(err) => match err { + Error::ObjectNotFound { .. } => { + warn!(log, "no matching nat entries to soft delete"); + Ok(()) + } + _ => { + let message = format!( + "failed to delete nat entry due to error: {err:?}" + ); + error!(log, "{}", message); + Err(Error::internal_error(&message)) + } + }, + }?; + + self.notify_dendrite_nat_state(instance_id, false).await + } + + /// Informs all available boundary switches that the set of NAT entries + /// has changed. + /// + /// When `fail_fast` is set, this function will return on any error when + /// acquiring a handle to a DPD client. Otherwise, it will attempt to notify + /// all clients and then finally return the first error. + async fn notify_dendrite_nat_state( + &self, + instance_id: Option, + fail_fast: bool, + ) -> Result<(), Error> { + // Querying boundary switches also requires fleet access and the use of the + // instance allocator context. 
let boundary_switches = self.boundary_switches(&self.opctx_alloc).await?; + let mut errors = vec![]; for switch in &boundary_switches { debug!(&self.log, "notifying dendrite of updates"; - "instance_id" => %authz_instance.id(), + "instance_id" => ?instance_id, "switch" => switch.to_string()); let client_result = self.dpd_clients.get(switch).ok_or_else(|| { @@ -540,7 +583,11 @@ impl super::Nexus { Ok(client) => client, Err(new_error) => { errors.push(new_error); - continue; + if fail_fast { + break; + } else { + continue; + } } }; @@ -576,7 +623,6 @@ impl super::Nexus { .instance_lookup_external_ips(opctx, authz_instance.id()) .await?; - let boundary_switches = self.boundary_switches(opctx).await?; for external_ip in external_ips { match self .db_datastore @@ -603,26 +649,7 @@ impl super::Nexus { }?; } - for switch in &boundary_switches { - debug!(&self.log, "notifying dendrite of updates"; - "instance_id" => %authz_instance.id(), - "switch" => switch.to_string()); - - let dpd_client = self.dpd_clients.get(switch).ok_or_else(|| { - Error::internal_error(&format!( - "unable to find dendrite client for {switch}" - )) - })?; - - // Notify dendrite that there are changes for it to reconcile. - // In the event of a failure to notify dendrite, we'll log an error - // and rely on dendrite's RPW timer to catch it up. 
- if let Err(e) = dpd_client.ipv4_nat_trigger_update().await { - error!(self.log, "failed to notify dendrite of nat updates"; "error" => ?e); - }; - } - - Ok(()) + self.notify_dendrite_nat_state(Some(authz_instance.id()), true).await } /// Given old and new instance runtime states, determines the desired diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index 4afd729bc7..77e02ff201 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -321,7 +321,6 @@ pub async fn instance_ip_add_nat( pub async fn instance_ip_remove_nat( sagactx: &NexusActionContext, serialized_authn: &authn::saga::Serialized, - authz_instance: &authz::Instance, sled_uuid: Option, target_ip: ModifyStateForExternalIp, ) -> Result<(), ActionError> { @@ -345,7 +344,7 @@ pub async fn instance_ip_remove_nat( osagactx .nexus() - .instance_delete_dpd_config(&opctx, authz_instance, Some(target_ip.id)) + .external_ip_delete_dpd_config(&opctx, &target_ip) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/app/sagas/instance_delete.rs b/nexus/src/app/sagas/instance_delete.rs index e90b7983cc..aaf5dcb033 100644 --- a/nexus/src/app/sagas/instance_delete.rs +++ b/nexus/src/app/sagas/instance_delete.rs @@ -134,7 +134,7 @@ async fn sid_delete_nat( osagactx .nexus() - .instance_delete_dpd_config(&opctx, &authz_instance, None) + .instance_delete_dpd_config(&opctx, &authz_instance) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 7e1e10cb2e..ddb86e86a9 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -210,10 +210,28 @@ async fn siia_nat_undo( let params = sagactx.saga_params::()?; let sled_id = sagactx.lookup::>("instance_state")?; let target_ip = sagactx.lookup::("target_ip")?; + + // This requires some explanation in one case, where we can fail because 
an + // instance may have moved running -> stopped -> deleted. + // An instance delete will cause us to unwind and return to this stage *but* + // the ExternalIp will no longer have a useful parent (or even a + // different parent!). + // + // Internally, we delete the NAT entry *without* checking its instance state because + // it may either be `None`, or another instance may have tried to attach. The + // first case is fine, but we need to consider NAT RPW semantics for the second: + // * The NAT entry table will ensure uniqueness on (external IP, low_port, + // high_port) for non-deleted rows. + // * Instance start and IP attach on a running instance will try to insert such + // a row and fail. + // - Failure in either case will not unwind this NAT entry, because it cannot + // *insert* such an entry. + // * Instance create will successfully set parent, since it won't attempt to ensure + // DPD has correct NAT state unless set to `start: true`. + // So it is safe to remove using the old `ExternalIp` here. 
if let Err(e) = instance_ip_remove_nat( &sagactx, ¶ms.serialized_authn, - ¶ms.authz_instance, sled_id, target_ip, ) diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 442eba6531..da6c92077d 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -173,7 +173,6 @@ async fn siid_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { instance_ip_remove_nat( &sagactx, ¶ms.serialized_authn, - ¶ms.authz_instance, sled_id, target_ip, ) diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index 0164a1c214..67d340e192 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -437,7 +437,7 @@ async fn sis_dpd_ensure_undo( osagactx .nexus() - .instance_delete_dpd_config(&opctx, &authz_instance, None) + .instance_delete_dpd_config(&opctx, &authz_instance) .await?; Ok(()) From a7bf68104e8d4b92b582add7174e088ead26217f Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 11 Jan 2024 19:22:24 +0000 Subject: [PATCH 51/56] Review feedback: unwind on concurrent delete This also includes some extra work designed to make NAT RPW rule management a little more robust in case the IP attach sagas leave behind any mess in event of concurrent stop or delete (sorry!). 
--- nexus/db-model/src/ipv4_nat_entry.rs | 3 +- nexus/db-model/src/macaddr.rs | 13 +- .../src/db/datastore/external_ip.rs | 46 ++++- .../src/db/datastore/ipv4_nat_entry.rs | 29 ++- nexus/src/app/instance_network.rs | 184 +++++++++++------- nexus/src/app/sagas/instance_common.rs | 21 +- nexus/src/app/sagas/instance_ip_attach.rs | 65 ++++--- 7 files changed, 243 insertions(+), 118 deletions(-) diff --git a/nexus/db-model/src/ipv4_nat_entry.rs b/nexus/db-model/src/ipv4_nat_entry.rs index 570a46b5e9..b0fa2b8eb9 100644 --- a/nexus/db-model/src/ipv4_nat_entry.rs +++ b/nexus/db-model/src/ipv4_nat_entry.rs @@ -5,6 +5,7 @@ use crate::{schema::ipv4_nat_entry, Ipv4Net, Ipv6Net, SqlU16, Vni}; use chrono::{DateTime, Utc}; use omicron_common::api::external; use schemars::JsonSchema; +use serde::Deserialize; use serde::Serialize; use uuid::Uuid; @@ -21,7 +22,7 @@ pub struct Ipv4NatValues { } /// Database representation of an Ipv4 NAT Entry. -#[derive(Queryable, Debug, Clone, Selectable)] +#[derive(Queryable, Debug, Clone, Selectable, Serialize, Deserialize)] #[diesel(table_name = ipv4_nat_entry)] pub struct Ipv4NatEntry { pub id: Uuid, diff --git a/nexus/db-model/src/macaddr.rs b/nexus/db-model/src/macaddr.rs index dceb8acf48..b3329598bd 100644 --- a/nexus/db-model/src/macaddr.rs +++ b/nexus/db-model/src/macaddr.rs @@ -8,8 +8,19 @@ use diesel::pg::Pg; use diesel::serialize::{self, ToSql}; use diesel::sql_types; use omicron_common::api::external; +use serde::Deserialize; +use serde::Serialize; -#[derive(Clone, Copy, Debug, PartialEq, AsExpression, FromSqlRow)] +#[derive( + Clone, + Copy, + Debug, + PartialEq, + AsExpression, + FromSqlRow, + Serialize, + Deserialize, +)] #[diesel(sql_type = sql_types::BigInt)] pub struct MacAddr(pub external::MacAddr); diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index fc838467d4..9d4d947476 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ 
b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -960,12 +960,13 @@ impl DataStore { let now = Utc::now(); let conn = self.pool_connection_authorized(opctx).await?; - match (ip_kind, target_state) { - (IpKind::SNat, _) => return Err(Error::internal_error( + match (ip_kind, expected_state, target_state) { + (IpKind::SNat, _, _) => return Err(Error::internal_error( "SNAT should not be removed via `external_ip_complete_op`, \ use `deallocate_external_ip`", )), - (IpKind::Ephemeral, IpAttachState::Detached) => { + + (IpKind::Ephemeral, _, IpAttachState::Detached) => { part_out .set(( dsl::parent_id.eq(Option::::None), @@ -976,7 +977,8 @@ impl DataStore { .execute_async(&*conn) .await } - (IpKind::Floating, IpAttachState::Detached) => { + + (IpKind::Floating, _, IpAttachState::Detached) => { part_out .set(( dsl::parent_id.eq(Option::::None), @@ -986,7 +988,41 @@ impl DataStore { .execute_async(&*conn) .await } - (_, IpAttachState::Attached) => { + + // Attaching->Attached gets separate logic because we choose to fail + // and unwind on instance delete. This covers two cases: + // - External IP is deleted. + // - Floating IP is suddenly `detached`. + (_, IpAttachState::Attaching, IpAttachState::Attached) => { + return part_out + .set(( + dsl::time_modified.eq(Utc::now()), + dsl::state.eq(target_state), + )) + .check_if_exists::(ip_id) + .execute_and_check( + &*self.pool_connection_authorized(opctx).await?, + ) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + }) + .and_then(|r| match r.status { + UpdateStatus::Updated => Ok(1), + UpdateStatus::NotUpdatedButExists + if r.found.state == IpAttachState::Detached + || r.found.time_deleted.is_some() => + { + Err(Error::internal_error( + "unwinding due to concurrent instance delete", + )) + } + UpdateStatus::NotUpdatedButExists => Ok(0), + }) + } + + // Unwind from failed detach. 
+ (_, _, IpAttachState::Attached) => { part_out .set(( dsl::time_modified.eq(Utc::now()), diff --git a/nexus/db-queries/src/db/datastore/ipv4_nat_entry.rs b/nexus/db-queries/src/db/datastore/ipv4_nat_entry.rs index a44fed4cdf..655a267fe1 100644 --- a/nexus/db-queries/src/db/datastore/ipv4_nat_entry.rs +++ b/nexus/db-queries/src/db/datastore/ipv4_nat_entry.rs @@ -23,12 +23,14 @@ impl DataStore { &self, opctx: &OpContext, nat_entry: Ipv4NatValues, - ) -> CreateResult<()> { + ) -> CreateResult { use db::schema::ipv4_nat_entry::dsl; use diesel::sql_types; // Look up any NAT entries that already have the exact parameters // we're trying to INSERT. + // We want to return any existing entry, but not to mask the UniqueViolation + // when trying to use an existing IP + port range with a different target. let matching_entry_subquery = dsl::ipv4_nat_entry .filter(dsl::external_address.eq(nat_entry.external_address)) .filter(dsl::first_port.eq(nat_entry.first_port)) @@ -58,7 +60,7 @@ impl DataStore { )) .filter(diesel::dsl::not(diesel::dsl::exists(matching_entry_subquery))); - diesel::insert_into(dsl::ipv4_nat_entry) + let out = diesel::insert_into(dsl::ipv4_nat_entry) .values(new_entry_subquery) .into_columns(( dsl::external_address, @@ -68,11 +70,24 @@ impl DataStore { dsl::vni, dsl::mac, )) - .execute_async(&*self.pool_connection_authorized(opctx).await?) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - - Ok(()) + .returning(Ipv4NatEntry::as_returning()) + .get_result_async(&*self.pool_connection_authorized(opctx).await?) + .await; + + match out { + Ok(o) => Ok(o), + Err(diesel::result::Error::NotFound) => { + // Idempotent ensure. Annoyingly, we can't easily extract + // the existing row as part of the insert query: + // - (SELECT ..) UNION (INSERT INTO .. RETURNING ..) isn't + // allowed by crdb. + // - Can't ON CONFLICT with a partial constraint, so we can't + // do a no-op write and return the row that way either. 
+ // So, we do another lookup. + self.ipv4_nat_find_by_values(opctx, nat_entry).await + } + Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), + } } pub async fn ipv4_nat_delete( diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index ced024e7fb..76454774d9 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -9,6 +9,7 @@ use ipnetwork::IpNetwork; use ipnetwork::Ipv6Network; use nexus_db_model::ExternalIp; use nexus_db_model::IpAttachState; +use nexus_db_model::Ipv4NatEntry; use nexus_db_model::Ipv4NatValues; use nexus_db_model::Vni as DbVni; use nexus_db_queries::authz; @@ -277,6 +278,10 @@ impl super::Nexus { /// Ensures that the Dendrite configuration for the supplied instance is /// up-to-date. /// + /// Returns a list of live NAT RPW table entries from this call. Generally + /// these should only be needed for specific unwind operations, like in + /// the IP attach saga. + /// /// # Parameters /// /// - `opctx`: An operation context that grants read and list-children @@ -298,7 +303,7 @@ impl super::Nexus { instance_id: Uuid, sled_ip_address: &std::net::SocketAddrV6, ip_filter: Option, - ) -> Result<(), Error> { + ) -> Result, Error> { let log = &self.log; info!(log, "looking up instance's primary network interface"; @@ -309,6 +314,9 @@ impl super::Nexus { .lookup_for(authz::Action::ListChildren) .await?; + // XXX: Need to abstract over v6 and v4 entries here. + let mut nat_entries = vec![]; + // All external IPs map to the primary network interface, so find that // interface. 
If there is no such interface, there's no way to route // traffic destined to those IPs, so there's nothing to configure and @@ -324,7 +332,7 @@ impl super::Nexus { None => { info!(log, "Instance has no primary network interface"; "instance_id" => %instance_id); - return Ok(()); + return Ok(nat_entries); } }; @@ -374,10 +382,17 @@ impl super::Nexus { let sled_address = Ipv6Net(Ipv6Network::new(*sled_ip_address.ip(), 128).unwrap()); + // If all of our IPs are attached or are guaranteed to be owned + // by the saga calling this fn, then we need to disregard and + // remove conflicting rows. No other instance/service should be + // using these as its own, and we are dealing with detritus, e.g., + // the case where we have a concurrent stop -> detach followed + // by an attach to another instance, or other ongoing attach saga + // cleanup. let mut err_and_limit = None; for (i, external_ip) in ips_of_interest.iter().enumerate() { // For each external ip, add a nat entry to the database - if let Err(e) = self + if let Ok(id) = self .ensure_nat_entry( external_ip, sled_address, @@ -386,27 +401,55 @@ impl super::Nexus { opctx, ) .await + { + nat_entries.push(id); + continue; + } + + // We seem to be blocked by a bad row -- take it out and retry. + // This will return Ok() for a non-existent row. + if let Err(e) = self + .external_ip_delete_dpd_config_inner(opctx, external_ip) + .await { err_and_limit = Some((e, i)); break; + }; + + match self + .ensure_nat_entry( + external_ip, + sled_address, + &network_interface, + mac_address, + opctx, + ) + .await + { + Ok(id) => nat_entries.push(id), + Err(e) => { + err_and_limit = Some((e, i)); + break; + } } } - // In the event of an ongoing cleanup from IP attach, it's possible - // for an entry pointing at an another instance to still exist, which - // will fail to be ensured if we try to attach/start/... before that unwind - // completes. 
We need to remove the entries we just added because the undo - // won't call into `instance_delete_dpd_config`. + // In the event of an unresolvable failure, we need to remove + // the entries we just added because the undo won't call into + // `instance_delete_dpd_config`. These entries won't stop a + // future caller, but it's better not to pollute switch state. if let Some((e, max)) = err_and_limit { for external_ip in &ips_of_interest[..max] { let _ = self - .external_ip_delete_dpd_config(opctx, external_ip) + .external_ip_delete_dpd_config_inner(opctx, external_ip) .await; } return Err(e); } - self.notify_dendrite_nat_state(Some(instance_id), true).await + self.notify_dendrite_nat_state(Some(instance_id), true).await?; + + Ok(nat_entries) } async fn ensure_nat_entry( @@ -416,7 +459,7 @@ impl super::Nexus { network_interface: &sled_agent_client::types::NetworkInterface, mac_address: macaddr::MacAddr6, opctx: &OpContext, - ) -> Result<(), Error> { + ) -> Result { match target_ip.ip { IpNetwork::V4(v4net) => { let nat_entry = Ipv4NatValues { @@ -429,9 +472,10 @@ impl super::Nexus { omicron_common::api::external::MacAddr(mac_address), ), }; - self.db_datastore + Ok(self + .db_datastore .ensure_ipv4_nat_entry(opctx, nat_entry) - .await?; + .await?) } IpNetwork::V6(_v6net) => { // TODO: implement handling of v6 nat. @@ -439,8 +483,7 @@ impl super::Nexus { internal_message: "ipv6 nat is not yet implemented".into(), }); } - }; - Ok(()) + } } /// Attempts to delete all of the Dendrite NAT configuration for the @@ -482,27 +525,7 @@ impl super::Nexus { .await?; for entry in external_ips { - // Soft delete the NAT entry - match self - .db_datastore - .ipv4_nat_delete_by_external_ip(&opctx, &entry) - .await - { - Ok(_) => Ok(()), - Err(err) => match err { - Error::ObjectNotFound { .. 
} => { - warn!(log, "no matching nat entries to soft delete"); - Ok(()) - } - _ => { - let message = format!( - "failed to delete nat entry due to error: {err:?}" - ); - error!(log, "{}", message); - Err(Error::internal_error(&message)) - } - }, - }?; + self.external_ip_delete_dpd_config_inner(opctx, &entry).await?; } self.notify_dendrite_nat_state(Some(instance_id), false).await @@ -511,9 +534,7 @@ impl super::Nexus { /// Attempts to delete Dendrite NAT configuration for a single external IP. /// /// This function is primarily used to detach an IP which currently belongs - /// to a known instance, or to cleanup after the failed attach of an `ExternalIp` - /// which may no longer have a parent in the DB. Due to the latter case, this - /// function does not lookup and verify that an IP's parent is still a given instance. + /// to a known instance. pub(crate) async fn external_ip_delete_dpd_config( &self, opctx: &OpContext, @@ -526,6 +547,56 @@ impl super::Nexus { "instance_id" => ?instance_id, "external_ip" => %external_ip.ip); + self.external_ip_delete_dpd_config_inner(opctx, external_ip).await?; + + self.notify_dendrite_nat_state(instance_id, false).await + } + + /// Attempts to soft-delete Dendrite NAT configuration for a specific entry + /// via ID. + /// + /// This function is needed to safely cleanup in at least one unwind scenario + /// where a potential second user could need to use the same (IP, portset) pair, + /// e.g. a rapid reattach or a reallocated ephemeral IP. + pub(crate) async fn delete_dpd_config_by_entry( + &self, + opctx: &OpContext, + nat_entry: &Ipv4NatEntry, + ) -> Result<(), Error> { + let log = &self.log; + + info!(log, "deleting individual NAT entry from dpd configuration"; + "id" => ?nat_entry.id, + "version_added" => %nat_entry.external_address.0); + + match self.db_datastore.ipv4_nat_delete(&opctx, nat_entry).await { + Ok(_) => {} + Err(err) => match err { + Error::ObjectNotFound { .. 
} => { + warn!(log, "no matching nat entries to soft delete"); + } + _ => { + let message = format!( + "failed to delete nat entry due to error: {err:?}" + ); + error!(log, "{}", message); + return Err(Error::internal_error(&message)); + } + }, + } + + self.notify_dendrite_nat_state(None, false).await + } + + /// Soft-delete an individual external IP from the NAT RPW, without + /// triggering a Dendrite notification. + async fn external_ip_delete_dpd_config_inner( + &self, + opctx: &OpContext, + external_ip: &ExternalIp, + ) -> Result<(), Error> { + let log = &self.log; + // Soft delete the NAT entry match self .db_datastore @@ -546,9 +617,7 @@ impl super::Nexus { Err(Error::internal_error(&message)) } }, - }?; - - self.notify_dendrite_nat_state(instance_id, false).await + } } /// Informs all available boundary switches that the set of NAT entries @@ -618,36 +687,7 @@ impl super::Nexus { ) -> Result<(), Error> { self.delete_instance_v2p_mappings(opctx, authz_instance.id()).await?; - let external_ips = self - .datastore() - .instance_lookup_external_ips(opctx, authz_instance.id()) - .await?; - - for external_ip in external_ips { - match self - .db_datastore - .ipv4_nat_delete_by_external_ip(&opctx, &external_ip) - .await - { - Ok(_) => Ok(()), - Err(err) => match err { - Error::ObjectNotFound { .. 
} => { - warn!( - self.log, - "no matching nat entries to soft delete" - ); - Ok(()) - } - _ => { - let message = format!( - "failed to delete nat entry due to error: {err:?}" - ); - error!(self.log, "{}", message); - Err(Error::internal_error(&message)) - } - }, - }?; - } + self.instance_delete_dpd_config(opctx, authz_instance).await?; self.notify_dendrite_nat_state(Some(authz_instance.id()), true).await } diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index 77e02ff201..d541edb903 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -9,8 +9,8 @@ use std::net::{IpAddr, Ipv6Addr}; use crate::Nexus; use chrono::Utc; use nexus_db_model::{ - ByteCount, ExternalIp, IpAttachState, SledReservationConstraints, - SledResource, + ByteCount, ExternalIp, IpAttachState, Ipv4NatEntry, + SledReservationConstraints, SledResource, }; use nexus_db_queries::authz; use nexus_db_queries::db::lookup::LookupPath; @@ -272,7 +272,7 @@ pub async fn instance_ip_add_nat( authz_instance: &authz::Instance, sled_uuid: Option, target_ip: ModifyStateForExternalIp, -) -> Result<(), ActionError> { +) -> Result, ActionError> { let osagactx = sagactx.user_data(); let datastore = osagactx.datastore(); let opctx = @@ -280,11 +280,11 @@ pub async fn instance_ip_add_nat( // No physical sled? Don't push NAT. 
let Some(sled_uuid) = sled_uuid else { - return Ok(()); + return Ok(None); }; if !target_ip.do_saga { - return Ok(()); + return Ok(None); } let Some(target_ip) = target_ip.external_ip else { return Err(ActionError::action_failed(Error::internal_error( @@ -309,9 +309,14 @@ pub async fn instance_ip_add_nat( Some(target_ip.id), ) .await - .map_err(ActionError::action_failed)?; - - Ok(()) + .and_then(|v| { + v.into_iter().next().map(Some).ok_or_else(|| { + Error::internal_error( + "NAT RPW failed to return concrete NAT entry", + ) + }) + }) + .map_err(ActionError::action_failed) } /// Remove a single NAT entry from DPD, dropping packets bound for `target_ip`. diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index ddb86e86a9..be7f81368e 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -4,14 +4,13 @@ use super::instance_common::{ instance_ip_add_nat, instance_ip_add_opte, instance_ip_get_instance_state, - instance_ip_move_state, instance_ip_remove_nat, instance_ip_remove_opte, - ModifyStateForExternalIp, + instance_ip_move_state, instance_ip_remove_opte, ModifyStateForExternalIp, }; use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; use crate::external_api::params; -use nexus_db_model::IpAttachState; +use nexus_db_model::{IpAttachState, Ipv4NatEntry}; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; use omicron_common::api::external::{Error, NameOrId}; @@ -34,7 +33,9 @@ use uuid::Uuid; // // Overlap with stop is handled by treating comms failures with // sled-agent as temporary errors and unwinding. For the delete case, we -// allow the attach/detach completion to have a missing record. +// allow the detach completion to have a missing record -- both instance delete +// and detach will leave NAT in the correct state. 
For attach, if we make it +// to completion and an IP is `detached`, we unwind as a precaution. // See `instance_common::instance_ip_get_instance_state` for more info. // // One more consequence of sled state being able to change beneath us @@ -54,7 +55,7 @@ declare_saga_actions! { + siia_get_instance_state } - REGISTER_NAT -> "no_result0" { + REGISTER_NAT -> "nat_entry" { + siia_nat - siia_nat_undo } @@ -189,7 +190,10 @@ async fn siia_get_instance_state( .await } -async fn siia_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { +// XXX: Need to abstract over v4 and v6 NAT entries when the time comes. +async fn siia_nat( + sagactx: NexusActionContext, +) -> Result, ActionError> { let params = sagactx.saga_params::()?; let sled_id = sagactx.lookup::>("instance_state")?; let target_ip = sagactx.lookup::("target_ip")?; @@ -207,9 +211,19 @@ async fn siia_nat_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let log = sagactx.user_data().log(); + let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; - let sled_id = sagactx.lookup::>("instance_state")?; - let target_ip = sagactx.lookup::("target_ip")?; + let nat_entry = sagactx.lookup::>("nat_entry")?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let Some(nat_entry) = nat_entry else { + // Seeing `None` here means that we never pushed DPD state in + // the first instance. Nothing to undo. + return Ok(()); + }; // This requires some explanation in one case, where we can fail because an // instance may have moved running -> stopped -> deleted. @@ -218,24 +232,27 @@ async fn siia_nat_undo( // different parent!). // // Internally, we delete the NAT entry *without* checking its instance state because - // it may either be `None`, or another instance may have tried to attach. The + // it may either be `None`, or another instance may have attached. 
The // first case is fine, but we need to consider NAT RPW semantics for the second: // * The NAT entry table will ensure uniqueness on (external IP, low_port, // high_port) for non-deleted rows. // * Instance start and IP attach on a running instance will try to insert such - // a row and fail. - // - Failure in either case will not unwind this NAT entry, because it cannot - // *insert* such an entry. + // a row, fail, and then delete this row before moving forwards. + // - Until either side deletes the row, we're polluting switch NAT. + // - We can't guarantee quick reuse to remove this rule via attach. + // - This will lead to a *new* NAT entry we need to protect, so we need to be careful + // that we only remove *our* incarnation. This is likelier to be hit + // if an ephemeral IP is deallocated, reallocated, and reused in a short timeframe. // * Instance create will successfully set parent, since it won't attempt to ensure // DPD has correct NAT state unless set to `start: true`. - // So it is safe to remove using the old `ExternalIp` here. - if let Err(e) = instance_ip_remove_nat( - &sagactx, - &params.serialized_authn, - sled_id, - target_ip, - ) - .await + // So it is safe/necessary to remove using the old entry here to target the + // exact row we created. + + if let Err(e) = osagactx + .nexus() + .delete_dpd_config_by_entry(&opctx, &nat_entry) + .await + .map_err(ActionError::action_failed) { error!(log, "siia_nat_undo: failed to notify DPD: {e}"); } @@ -280,6 +297,9 @@ async fn siia_complete_attach( let params = sagactx.saga_params::()?; let target_ip = sagactx.lookup::("target_ip")?; + // There is a clause in `external_ip_complete_op` which specifically + // causes an unwind here if the instance delete saga fires and an IP is either + // detached or deleted. if !instance_ip_move_state( &sagactx, &params.serialized_authn, @@ -289,10 +309,7 @@ async fn siia_complete_attach( ) .await? 
{ - warn!( - log, - "siia_complete_attach: external IP was deleted or call was idempotent" - ) + warn!(log, "siia_complete_attach: call was idempotent") } target_ip From b2883204a5abdec59178a8f2cc2397517de853a6 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 19 Jan 2024 13:04:46 +0000 Subject: [PATCH 52/56] Review feedback: begin work on separate FIP + EIP endpoints --- nexus/src/app/external_ip.rs | 63 ++++++++++ nexus/src/external_api/http_entrypoints.rs | 65 ++++++++++ nexus/tests/output/nexus_tags.txt | 2 + nexus/types/src/external_api/params.rs | 19 ++- nexus/types/src/external_api/views.rs | 17 ++- openapi/nexus.json | 139 ++++++++++++++++++++- 6 files changed, 300 insertions(+), 5 deletions(-) diff --git a/nexus/src/app/external_ip.rs b/nexus/src/app/external_ip.rs index eec3b84cda..45b05fbb0b 100644 --- a/nexus/src/app/external_ip.rs +++ b/nexus/src/app/external_ip.rs @@ -4,6 +4,8 @@ //! External IP addresses for instances +use std::sync::Arc; + use crate::external_api::views::ExternalIp; use crate::external_api::views::FloatingIp; use nexus_db_model::IpAttachState; @@ -13,6 +15,7 @@ use nexus_db_queries::db::lookup; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::model::IpKind; use nexus_types::external_api::params; +use nexus_types::external_api::views; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DeleteResult; @@ -20,6 +23,7 @@ use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::NameOrId; +use omicron_common::api::external::UpdateResult; impl super::Nexus { pub(crate) async fn instance_list_external_ips( @@ -133,4 +137,63 @@ impl super::Nexus { self.db_datastore.floating_ip_delete(opctx, &authz_fip).await } + + pub(crate) async fn floating_ip_attach( + self: &Arc, + opctx: &OpContext, + fip_selector: 
params::FloatingIpSelector, + target: params::FloatingIpAttach, + ) -> UpdateResult { + match target.kind { + params::FloatingIpParentKind::Instance => { + let instance_selector = params::InstanceSelector { + project: fip_selector.project, + instance: target.parent, + }; + let instance = + self.instance_lookup(opctx, instance_selector)?; + let attach_params = &params::ExternalIpCreate::Floating { + floating_ip: fip_selector.floating_ip, + }; + self.instance_attach_external_ip( + opctx, + &instance, + attach_params, + ) + .await + .and_then(FloatingIp::try_from) + } + } + } + + pub(crate) async fn floating_ip_detach( + self: &Arc, + opctx: &OpContext, + ip_lookup: lookup::FloatingIp<'_>, + ) -> UpdateResult { + // XXX: Today, this only happens for instances. + // In future, we will need to separate out by the *type* of + // parent attached to a floating IP. We don't yet store this + // in db for user-facing FIPs (is_service => internal-only + // at this point). + let (.., authz_fip, db_fip) = + ip_lookup.fetch_for(authz::Action::Modify).await?; + + let Some(parent_id) = db_fip.parent_id else { + return Ok(db_fip.into()); + }; + + let instance_selector = params::InstanceSelector { + project: None, + instance: parent_id.into(), + }; + let instance = self.instance_lookup(opctx, instance_selector)?; + let attach_params = &params::ExternalIpDetach::Floating { + floating_ip: authz_fip.id().into(), + }; + + self.instance_detach_external_ip(opctx, &instance, attach_params) + .await + .and_then(FloatingIp::try_from) + } } diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 8324e94f90..d446297519 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -142,6 +142,8 @@ pub(crate) fn external_api() -> NexusApiDescription { api.register(floating_ip_create)?; api.register(floating_ip_view)?; api.register(floating_ip_delete)?; + api.register(floating_ip_attach)?; + 
api.register(floating_ip_detach)?; api.register(disk_list)?; api.register(disk_create)?; @@ -1921,6 +1923,69 @@ async fn floating_ip_view( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } +/// Attach a floating IP to an instance or other resource +#[endpoint { + method = POST, + path = "/v1/floating-ips/{floating_ip}/attach", + tags = ["instances"], +}] +async fn floating_ip_attach( + rqctx: RequestContext>, + path_params: Path, + query_params: Query, + target: TypedBody, +) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + let query = query_params.into_inner(); + let floating_ip_selector = params::FloatingIpSelector { + floating_ip: path.floating_ip, + project: query.project, + }; + let ip = nexus + .floating_ip_attach( + &opctx, + floating_ip_selector, + target.into_inner(), + ) + .await?; + Ok(HttpResponseAccepted(ip)) + }; + apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + +/// Detach a floating IP from an instance or other resource +#[endpoint { + method = POST, + path = "/v1/floating-ips/{floating_ip}/detach", + tags = ["instances"], +}] +async fn floating_ip_detach( + rqctx: RequestContext>, + path_params: Path, + query_params: Query, +) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + let query = query_params.into_inner(); + let floating_ip_selector = params::FloatingIpSelector { + floating_ip: path.floating_ip, + project: query.project, + }; + let fip_lookup = + nexus.floating_ip_lookup(&opctx, floating_ip_selector)?; + let ip = nexus.floating_ip_detach(&opctx, fip_lookup).await?; + Ok(HttpResponseAccepted(ip)) + }; + 
apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + // Disks /// List disks diff --git a/nexus/tests/output/nexus_tags.txt b/nexus/tests/output/nexus_tags.txt index 6d87aaeea2..99809800e5 100644 --- a/nexus/tests/output/nexus_tags.txt +++ b/nexus/tests/output/nexus_tags.txt @@ -35,6 +35,8 @@ image_view GET /v1/images/{image} API operations found with tag "instances" OPERATION ID METHOD URL PATH +floating_ip_attach POST /v1/floating-ips/{floating_ip}/attach +floating_ip_detach POST /v1/floating-ips/{floating_ip}/detach instance_create POST /v1/instances instance_delete DELETE /v1/instances/{instance} instance_disk_attach POST /v1/instances/{instance}/disks/attach diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index 2f1d9cfa17..a44a4afc71 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -71,7 +71,7 @@ path_param!(VpcPath, vpc, "VPC"); path_param!(SubnetPath, subnet, "subnet"); path_param!(RouterPath, router, "router"); path_param!(RoutePath, route, "route"); -path_param!(FloatingIpPath, floating_ip, "Floating IP"); +path_param!(FloatingIpPath, floating_ip, "floating IP"); path_param!(DiskPath, disk, "disk"); path_param!(SnapshotPath, snapshot, "snapshot"); path_param!(ImagePath, image, "image"); @@ -887,6 +887,23 @@ pub struct FloatingIpCreate { pub pool: Option, } +/// The type of resource that a floating IP is attached to +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub enum FloatingIpParentKind { + Instance, +} + +/// Parameters for attaching a floating IP address to another resource +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct FloatingIpAttach { + /// Name or ID of the resource that this resource should be attached to + pub parent: NameOrId, + + /// The type of `parent`'s resource + pub kind: FloatingIpParentKind, +} + // INSTANCES /// Describes an 
attachment of an `InstanceNetworkInterface` to an `Instance`, diff --git a/nexus/types/src/external_api/views.rs b/nexus/types/src/external_api/views.rs index 99db1d8854..2d1c9c7192 100644 --- a/nexus/types/src/external_api/views.rs +++ b/nexus/types/src/external_api/views.rs @@ -12,8 +12,8 @@ use api_identity::ObjectIdentity; use chrono::DateTime; use chrono::Utc; use omicron_common::api::external::{ - ByteCount, Digest, IdentityMetadata, InstanceState, Ipv4Net, Ipv6Net, Name, - ObjectIdentity, RoleName, SemverVersion, SimpleIdentity, + ByteCount, Digest, Error, IdentityMetadata, InstanceState, Ipv4Net, + Ipv6Net, Name, ObjectIdentity, RoleName, SemverVersion, SimpleIdentity, }; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -372,6 +372,19 @@ impl From for ExternalIp { } } +impl TryFrom for FloatingIp { + type Error = Error; + + fn try_from(value: ExternalIp) -> Result { + match value { + ExternalIp::Ephemeral { .. } => Err(Error::internal_error( + "tried to convert an ephemeral IP into a floating IP", + )), + ExternalIp::Floating(v) => Ok(v), + } + } +} + // RACKS /// View of an Rack diff --git a/openapi/nexus.json b/openapi/nexus.json index 9c3dd7e9dc..7cdac37ef8 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -930,7 +930,7 @@ { "in": "path", "name": "floating_ip", - "description": "Name or ID of the Floating IP", + "description": "Name or ID of the floating IP", "required": true, "schema": { "$ref": "#/components/schemas/NameOrId" @@ -974,7 +974,7 @@ { "in": "path", "name": "floating_ip", - "description": "Name or ID of the Floating IP", + "description": "Name or ID of the floating IP", "required": true, "schema": { "$ref": "#/components/schemas/NameOrId" @@ -1002,6 +1002,108 @@ } } }, + "/v1/floating-ips/{floating_ip}/attach": { + "post": { + "tags": [ + "instances" + ], + "summary": "Attach a floating IP to an instance or other resource", + "operationId": "floating_ip_attach", + "parameters": [ + { + "in": "path", + "name": 
"floating_ip", + "description": "Name or ID of the floating IP", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + }, + { + "in": "query", + "name": "project", + "description": "Name or ID of the project", + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FloatingIpAttach" + } + } + }, + "required": true + }, + "responses": { + "202": { + "description": "successfully enqueued operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FloatingIp" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/v1/floating-ips/{floating_ip}/detach": { + "post": { + "tags": [ + "instances" + ], + "summary": "Detach a floating IP from an instance or other resource", + "operationId": "floating_ip_detach", + "parameters": [ + { + "in": "path", + "name": "floating_ip", + "description": "Name or ID of the floating IP", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + }, + { + "in": "query", + "name": "project", + "description": "Name or ID of the project", + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "responses": { + "202": { + "description": "successfully enqueued operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FloatingIp" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/v1/groups": { "get": { "tags": [ @@ -11379,6 +11481,32 @@ "time_modified" ] }, + "FloatingIpAttach": { + "description": "Parameters for attaching a floating IP address to another resource", + "type": "object", + "properties": { + "kind": { + "description": "The type of `parent`'s resource", + "allOf": [ + { + "$ref": 
"#/components/schemas/FloatingIpParentKind" + } + ] + }, + "parent": { + "description": "Name or ID of the resource that this resource should be attached to", + "allOf": [ + { + "$ref": "#/components/schemas/NameOrId" + } + ] + } + }, + "required": [ + "kind", + "parent" + ] + }, "FloatingIpCreate": { "description": "Parameters for creating a new floating IP address for instances.", "type": "object", @@ -11410,6 +11538,13 @@ "name" ] }, + "FloatingIpParentKind": { + "description": "The type of resource that a floating IP is attached to", + "type": "string", + "enum": [ + "instance" + ] + }, "FloatingIpResultsPage": { "description": "A single page of results", "type": "object", From 6c2bdf011b28563a921a206023a6ba0afcfcbe46 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 19 Jan 2024 13:06:38 +0000 Subject: [PATCH 53/56] Move schema in prep for merge --- nexus/db-model/src/schema.rs | 2 +- schema/crdb/{24.0.0 => 25.0.0}/up01.sql | 0 schema/crdb/{24.0.0 => 25.0.0}/up02.sql | 0 schema/crdb/{24.0.0 => 25.0.0}/up03.sql | 0 schema/crdb/{24.0.0 => 25.0.0}/up04.sql | 0 schema/crdb/{24.0.0 => 25.0.0}/up05.sql | 0 schema/crdb/{24.0.0 => 25.0.0}/up06.sql | 0 schema/crdb/{24.0.0 => 25.0.0}/up07.sql | 0 schema/crdb/{24.0.0 => 25.0.0}/up08.sql | 0 schema/crdb/{24.0.0 => 25.0.0}/up09.sql | 0 schema/crdb/dbinit.sql | 2 +- 11 files changed, 2 insertions(+), 2 deletions(-) rename schema/crdb/{24.0.0 => 25.0.0}/up01.sql (100%) rename schema/crdb/{24.0.0 => 25.0.0}/up02.sql (100%) rename schema/crdb/{24.0.0 => 25.0.0}/up03.sql (100%) rename schema/crdb/{24.0.0 => 25.0.0}/up04.sql (100%) rename schema/crdb/{24.0.0 => 25.0.0}/up05.sql (100%) rename schema/crdb/{24.0.0 => 25.0.0}/up06.sql (100%) rename schema/crdb/{24.0.0 => 25.0.0}/up07.sql (100%) rename schema/crdb/{24.0.0 => 25.0.0}/up08.sql (100%) rename schema/crdb/{24.0.0 => 25.0.0}/up09.sql (100%) diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index c4913dea0f..ca511dcd3f 100644 --- 
a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -13,7 +13,7 @@ use omicron_common::api::external::SemverVersion; /// /// This should be updated whenever the schema is changed. For more details, /// refer to: schema/crdb/README.adoc -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(24, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(25, 0, 0); table! { disk (id) { diff --git a/schema/crdb/24.0.0/up01.sql b/schema/crdb/25.0.0/up01.sql similarity index 100% rename from schema/crdb/24.0.0/up01.sql rename to schema/crdb/25.0.0/up01.sql diff --git a/schema/crdb/24.0.0/up02.sql b/schema/crdb/25.0.0/up02.sql similarity index 100% rename from schema/crdb/24.0.0/up02.sql rename to schema/crdb/25.0.0/up02.sql diff --git a/schema/crdb/24.0.0/up03.sql b/schema/crdb/25.0.0/up03.sql similarity index 100% rename from schema/crdb/24.0.0/up03.sql rename to schema/crdb/25.0.0/up03.sql diff --git a/schema/crdb/24.0.0/up04.sql b/schema/crdb/25.0.0/up04.sql similarity index 100% rename from schema/crdb/24.0.0/up04.sql rename to schema/crdb/25.0.0/up04.sql diff --git a/schema/crdb/24.0.0/up05.sql b/schema/crdb/25.0.0/up05.sql similarity index 100% rename from schema/crdb/24.0.0/up05.sql rename to schema/crdb/25.0.0/up05.sql diff --git a/schema/crdb/24.0.0/up06.sql b/schema/crdb/25.0.0/up06.sql similarity index 100% rename from schema/crdb/24.0.0/up06.sql rename to schema/crdb/25.0.0/up06.sql diff --git a/schema/crdb/24.0.0/up07.sql b/schema/crdb/25.0.0/up07.sql similarity index 100% rename from schema/crdb/24.0.0/up07.sql rename to schema/crdb/25.0.0/up07.sql diff --git a/schema/crdb/24.0.0/up08.sql b/schema/crdb/25.0.0/up08.sql similarity index 100% rename from schema/crdb/24.0.0/up08.sql rename to schema/crdb/25.0.0/up08.sql diff --git a/schema/crdb/24.0.0/up09.sql b/schema/crdb/25.0.0/up09.sql similarity index 100% rename from schema/crdb/24.0.0/up09.sql rename to schema/crdb/25.0.0/up09.sql diff --git a/schema/crdb/dbinit.sql 
b/schema/crdb/dbinit.sql index 0abbfcc784..ce101affc7 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3288,7 +3288,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '24.0.0', NULL) + ( TRUE, NOW(), NOW(), '25.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From 42afedafc83672bf5af4588af9f695136e85e7d8 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 19 Jan 2024 14:29:29 +0000 Subject: [PATCH 54/56] Add separate ephemeral IP manipulation endpoint --- nexus/src/external_api/http_entrypoints.rs | 79 +++++++++++++- nexus/tests/output/nexus_tags.txt | 6 +- nexus/types/src/external_api/params.rs | 14 ++- openapi/nexus.json | 116 ++++++++++++++++++++- 4 files changed, 204 insertions(+), 11 deletions(-) diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index d446297519..ca54687d96 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -204,6 +204,8 @@ pub(crate) fn external_api() -> NexusApiDescription { api.register(instance_external_ip_list)?; api.register(instance_external_ip_attach)?; api.register(instance_external_ip_detach)?; + api.register(instance_ephemeral_ip_attach)?; + api.register(instance_ephemeral_ip_detach)?; api.register(vpc_router_list)?; api.register(vpc_router_view)?; @@ -1927,7 +1929,7 @@ async fn floating_ip_view( #[endpoint { method = POST, path = "/v1/floating-ips/{floating_ip}/attach", - tags = ["instances"], + tags = ["floating-ips"], }] async fn floating_ip_attach( rqctx: RequestContext>, @@ -1961,7 +1963,7 @@ async fn floating_ip_attach( #[endpoint { method = POST, path = "/v1/floating-ips/{floating_ip}/detach", - tags = ["instances"], + tags = ["floating-ips"], }] async fn floating_ip_detach( rqctx: RequestContext>, @@ -3966,6 +3968,79 @@ async fn instance_external_ip_detach( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } +/// Allocate 
and attach an ephemeral IP to an instance +#[endpoint { + method = POST, + path = "/v1/instances/{instance}/external-ips/ephemeral", + tags = ["instances"], +}] +async fn instance_ephemeral_ip_attach( + rqctx: RequestContext>, + path_params: Path, + query_params: Query, + ip_to_create: TypedBody, +) -> Result, HttpError> { + let apictx = rqctx.context(); + let handler = async { + let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + let query = query_params.into_inner(); + let instance_selector = params::InstanceSelector { + project: query.project, + instance: path.instance, + }; + let instance_lookup = + nexus.instance_lookup(&opctx, instance_selector)?; + let ip = nexus + .instance_attach_external_ip( + &opctx, + &instance_lookup, + ¶ms::ExternalIpCreate::Ephemeral { + pool: ip_to_create.into_inner().pool + }, + ) + .await?; + Ok(HttpResponseAccepted(ip)) + }; + apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + +/// Detach and deallocate an ephemeral IP from an instance +#[endpoint { + method = DELETE, + path = "/v1/instances/{instance}/external-ips/ephemeral", + tags = ["instances"], +}] +async fn instance_ephemeral_ip_detach( + rqctx: RequestContext>, + path_params: Path, + query_params: Query, +) -> Result { + let apictx = rqctx.context(); + let handler = async { + let opctx = crate::context::op_context_for_external_api(&rqctx).await?; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + let query = query_params.into_inner(); + let instance_selector = params::InstanceSelector { + project: query.project, + instance: path.instance, + }; + let instance_lookup = + nexus.instance_lookup(&opctx, instance_selector)?; + nexus + .instance_detach_external_ip( + &opctx, + &instance_lookup, + ¶ms::ExternalIpDetach::Ephemeral, + ) + .await?; + Ok(HttpResponseDeleted()) + }; + apictx.external_latencies.instrument_dropshot_handler(&rqctx, 
handler).await +} + // Snapshots /// List snapshots diff --git a/nexus/tests/output/nexus_tags.txt b/nexus/tests/output/nexus_tags.txt index 99809800e5..d3614fa939 100644 --- a/nexus/tests/output/nexus_tags.txt +++ b/nexus/tests/output/nexus_tags.txt @@ -12,8 +12,10 @@ disk_view GET /v1/disks/{disk} API operations found with tag "floating-ips" OPERATION ID METHOD URL PATH +floating_ip_attach POST /v1/floating-ips/{floating_ip}/attach floating_ip_create POST /v1/floating-ips floating_ip_delete DELETE /v1/floating-ips/{floating_ip} +floating_ip_detach POST /v1/floating-ips/{floating_ip}/detach floating_ip_list GET /v1/floating-ips floating_ip_view GET /v1/floating-ips/{floating_ip} @@ -35,13 +37,13 @@ image_view GET /v1/images/{image} API operations found with tag "instances" OPERATION ID METHOD URL PATH -floating_ip_attach POST /v1/floating-ips/{floating_ip}/attach -floating_ip_detach POST /v1/floating-ips/{floating_ip}/detach instance_create POST /v1/instances instance_delete DELETE /v1/instances/{instance} instance_disk_attach POST /v1/instances/{instance}/disks/attach instance_disk_detach POST /v1/instances/{instance}/disks/detach instance_disk_list GET /v1/instances/{instance}/disks +instance_ephemeral_ip_attach POST /v1/instances/{instance}/external-ips/ephemeral +instance_ephemeral_ip_detach DELETE /v1/instances/{instance}/external-ips/ephemeral instance_external_ip_attach POST /v1/instances/{instance}/external-ips/attach instance_external_ip_detach POST /v1/instances/{instance}/external-ips/detach instance_external_ip_list GET /v1/instances/{instance}/external-ips diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs index 18cdc499fc..04cdcf8a24 100644 --- a/nexus/types/src/external_api/params.rs +++ b/nexus/types/src/external_api/params.rs @@ -900,7 +900,7 @@ pub enum FloatingIpParentKind { /// Parameters for attaching a floating IP address to another resource #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] 
pub struct FloatingIpAttach { - /// Name or ID of the resource that this resource should be attached to + /// Name or ID of the resource that this IP address should be attached to pub parent: NameOrId, /// The type of `parent`'s resource @@ -971,8 +971,8 @@ pub struct InstanceDiskAttach { #[serde(tag = "type", rename_all = "snake_case")] pub enum ExternalIpCreate { /// An IP address providing both inbound and outbound access. The address is - /// automatically-assigned from the provided IP Pool, or all available pools - /// if not specified. + /// automatically-assigned from the provided IP Pool, or the current silo's + /// default pool if not specified. Ephemeral { pool: Option }, /// An IP address providing both inbound and outbound access. The address is /// an existing floating IP object assigned to the current project. @@ -981,6 +981,14 @@ pub enum ExternalIpCreate { Floating { floating_ip: NameOrId }, } +/// Parameters for creating an ephemeral IP address for an instance. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +#[serde(tag = "type", rename_all = "snake_case")] +pub struct EphemeralIpCreate { + /// Name or ID of the IP pool used to allocate an address + pub pool: Option, +} + /// Parameters for detaching an external IP from an instance. 
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] #[serde(tag = "type", rename_all = "snake_case")] diff --git a/openapi/nexus.json b/openapi/nexus.json index 7cdac37ef8..99e99997b5 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -1005,7 +1005,7 @@ "/v1/floating-ips/{floating_ip}/attach": { "post": { "tags": [ - "instances" + "floating-ips" ], "summary": "Attach a floating IP to an instance or other resource", "operationId": "floating_ip_attach", @@ -1061,7 +1061,7 @@ "/v1/floating-ips/{floating_ip}/detach": { "post": { "tags": [ - "instances" + "floating-ips" ], "summary": "Detach a floating IP from an instance or other resource", "operationId": "floating_ip_detach", @@ -2040,6 +2040,99 @@ } } }, + "/v1/instances/{instance}/external-ips/ephemeral": { + "post": { + "tags": [ + "instances" + ], + "summary": "Allocate and attach an ephemeral IP to an instance", + "operationId": "instance_ephemeral_ip_attach", + "parameters": [ + { + "in": "path", + "name": "instance", + "description": "Name or ID of the instance", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + }, + { + "in": "query", + "name": "project", + "description": "Name or ID of the project", + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EphemeralIpCreate" + } + } + }, + "required": true + }, + "responses": { + "202": { + "description": "successfully enqueued operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ExternalIp" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "tags": [ + "instances" + ], + "summary": "Detach and deallocate an ephemeral IP from an instance", + "operationId": "instance_ephemeral_ip_detach", + "parameters": [ + { + "in": "path", + "name": "instance", + 
"description": "Name or ID of the instance", + "required": true, + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + }, + { + "in": "query", + "name": "project", + "description": "Name or ID of the project", + "schema": { + "$ref": "#/components/schemas/NameOrId" + } + } + ], + "responses": { + "204": { + "description": "successful deletion" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/v1/instances/{instance}/migrate": { "post": { "tags": [ @@ -11151,6 +11244,21 @@ } ] }, + "EphemeralIpCreate": { + "description": "Parameters for creating an ephemeral IP address for an instance.", + "type": "object", + "properties": { + "pool": { + "nullable": true, + "description": "Name or ID of the IP pool used to allocate an address", + "allOf": [ + { + "$ref": "#/components/schemas/NameOrId" + } + ] + } + } + }, "Error": { "description": "Error information from a response.", "type": "object", @@ -11262,7 +11370,7 @@ "description": "Parameters for creating an external IP address for instances.", "oneOf": [ { - "description": "An IP address providing both inbound and outbound access. The address is automatically-assigned from the provided IP Pool, or all available pools if not specified.", + "description": "An IP address providing both inbound and outbound access. 
The address is automatically-assigned from the provided IP Pool, or the current silo's default pool if not specified.", "type": "object", "properties": { "pool": { @@ -11494,7 +11602,7 @@ ] }, "parent": { - "description": "Name or ID of the resource that this resource should be attached to", + "description": "Name or ID of the resource that this IP address should be attached to", "allOf": [ { "$ref": "#/components/schemas/NameOrId" From 5a4614cc67071a71137ed7ce1c3fb2cfa3fe4d66 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 19 Jan 2024 19:05:05 +0000 Subject: [PATCH 55/56] Excise instance/external-ip/attach + detach --- nexus/src/external_api/http_entrypoints.rs | 76 +----- nexus/tests/integration_tests/endpoints.rs | 89 ++++--- nexus/tests/integration_tests/external_ips.rs | 219 +++++++----------- nexus/tests/output/nexus_tags.txt | 2 - openapi/nexus.json | 149 ------------ 5 files changed, 145 insertions(+), 390 deletions(-) diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index ca54687d96..41e1670915 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -202,8 +202,6 @@ pub(crate) fn external_api() -> NexusApiDescription { api.register(instance_network_interface_delete)?; api.register(instance_external_ip_list)?; - api.register(instance_external_ip_attach)?; - api.register(instance_external_ip_detach)?; api.register(instance_ephemeral_ip_attach)?; api.register(instance_ephemeral_ip_detach)?; @@ -3896,78 +3894,6 @@ async fn instance_external_ip_list( apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Attach an external IP to an instance -#[endpoint { - method = POST, - path = "/v1/instances/{instance}/external-ips/attach", - tags = ["instances"], -}] -async fn instance_external_ip_attach( - rqctx: RequestContext>, - path_params: Path, - query_params: Query, - ip_to_detach: TypedBody, -) -> Result, HttpError> { - let 
apictx = rqctx.context(); - let handler = async { - let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - let nexus = &apictx.nexus; - let path = path_params.into_inner(); - let query = query_params.into_inner(); - let instance_selector = params::InstanceSelector { - project: query.project, - instance: path.instance, - }; - let instance_lookup = - nexus.instance_lookup(&opctx, instance_selector)?; - let ip = nexus - .instance_attach_external_ip( - &opctx, - &instance_lookup, - &ip_to_detach.into_inner(), - ) - .await?; - Ok(HttpResponseAccepted(ip)) - }; - apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await -} - -/// Detach an external IP from an instance -#[endpoint { - method = POST, - path = "/v1/instances/{instance}/external-ips/detach", - tags = ["instances"], -}] -async fn instance_external_ip_detach( - rqctx: RequestContext>, - path_params: Path, - query_params: Query, - ip_to_detach: TypedBody, -) -> Result, HttpError> { - let apictx = rqctx.context(); - let handler = async { - let opctx = crate::context::op_context_for_external_api(&rqctx).await?; - let nexus = &apictx.nexus; - let path = path_params.into_inner(); - let query = query_params.into_inner(); - let instance_selector = params::InstanceSelector { - project: query.project, - instance: path.instance, - }; - let instance_lookup = - nexus.instance_lookup(&opctx, instance_selector)?; - let ip = nexus - .instance_detach_external_ip( - &opctx, - &instance_lookup, - &ip_to_detach.into_inner(), - ) - .await?; - Ok(HttpResponseAccepted(ip)) - }; - apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await -} - /// Allocate and attach an ephemeral IP to an instance #[endpoint { method = POST, @@ -3997,7 +3923,7 @@ async fn instance_ephemeral_ip_attach( &opctx, &instance_lookup, ¶ms::ExternalIpCreate::Ephemeral { - pool: ip_to_create.into_inner().pool + pool: ip_to_create.into_inner().pool, }, ) .await?; diff --git 
a/nexus/tests/integration_tests/endpoints.rs b/nexus/tests/integration_tests/endpoints.rs index f6ca63e73b..3c268d06e5 100644 --- a/nexus/tests/integration_tests/endpoints.rs +++ b/nexus/tests/integration_tests/endpoints.rs @@ -388,20 +388,12 @@ pub static DEMO_INSTANCE_DISKS_DETACH_URL: Lazy = Lazy::new(|| { *DEMO_INSTANCE_NAME, *DEMO_PROJECT_SELECTOR ) }); -pub static DEMO_INSTANCE_EXTERNAL_IP_ATTACH_URL: Lazy = - Lazy::new(|| { - format!( - "/v1/instances/{}/external-ips/attach?{}", - *DEMO_INSTANCE_NAME, *DEMO_PROJECT_SELECTOR - ) - }); -pub static DEMO_INSTANCE_EXTERNAL_IP_DETACH_URL: Lazy = - Lazy::new(|| { - format!( - "/v1/instances/{}/external-ips/detach?{}", - *DEMO_INSTANCE_NAME, *DEMO_PROJECT_SELECTOR - ) - }); +pub static DEMO_INSTANCE_EPHEMERAL_IP_URL: Lazy = Lazy::new(|| { + format!( + "/v1/instances/{}/external-ips/ephemeral?{}", + *DEMO_INSTANCE_NAME, *DEMO_PROJECT_SELECTOR + ) +}); pub static DEMO_INSTANCE_NICS_URL: Lazy = Lazy::new(|| { format!( "/v1/network-interfaces?project={}&instance={}", @@ -732,6 +724,19 @@ pub static DEMO_FLOAT_IP_URL: Lazy = Lazy::new(|| { ) }); +pub static DEMO_FLOATING_IP_ATTACH_URL: Lazy = Lazy::new(|| { + format!( + "/v1/floating-ips/{}/attach?{}", + *DEMO_FLOAT_IP_NAME, *DEMO_PROJECT_SELECTOR + ) +}); +pub static DEMO_FLOATING_IP_DETACH_URL: Lazy = Lazy::new(|| { + format!( + "/v1/floating-ips/{}/detach?{}", + *DEMO_FLOAT_IP_NAME, *DEMO_PROJECT_SELECTOR + ) +}); + pub static DEMO_FLOAT_IP_CREATE: Lazy = Lazy::new(|| params::FloatingIpCreate { identity: IdentityMetadataCreateParams { @@ -742,15 +747,13 @@ pub static DEMO_FLOAT_IP_CREATE: Lazy = pool: None, }); -pub static DEMO_FLOAT_IP_ATTACH: Lazy = - Lazy::new(|| params::ExternalIpCreate::Floating { - floating_ip: DEMO_FLOAT_IP_NAME.clone().into(), - }); -pub static DEMO_FLOAT_IP_DETACH: Lazy = - Lazy::new(|| params::ExternalIpDetach::Floating { - floating_ip: DEMO_FLOAT_IP_NAME.clone().into(), +pub static DEMO_FLOAT_IP_ATTACH: Lazy = + Lazy::new(|| 
params::FloatingIpAttach { + kind: params::FloatingIpParentKind::Instance, + parent: DEMO_FLOAT_IP_NAME.clone().into(), }); - +pub static DEMO_EPHEMERAL_IP_ATTACH: Lazy = + Lazy::new(|| params::EphemeralIpCreate { pool: None }); // Identity providers pub const IDENTITY_PROVIDERS_URL: &'static str = "/v1/system/identity-providers?silo=demo-silo"; @@ -1781,21 +1784,15 @@ pub static VERIFY_ENDPOINTS: Lazy> = Lazy::new(|| { }, VerifyEndpoint { - url: &DEMO_INSTANCE_EXTERNAL_IP_ATTACH_URL, + url: &DEMO_INSTANCE_EPHEMERAL_IP_URL, visibility: Visibility::Protected, unprivileged_access: UnprivilegedAccess::None, - allowed_methods: vec![AllowedMethod::Post( - serde_json::to_value(&*DEMO_FLOAT_IP_ATTACH).unwrap() - )], - }, - - VerifyEndpoint { - url: &DEMO_INSTANCE_EXTERNAL_IP_DETACH_URL, - visibility: Visibility::Protected, - unprivileged_access: UnprivilegedAccess::None, - allowed_methods: vec![AllowedMethod::Post( - serde_json::to_value(&*DEMO_FLOAT_IP_DETACH).unwrap() - )], + allowed_methods: vec![ + AllowedMethod::Post( + serde_json::to_value(&*DEMO_EPHEMERAL_IP_ATTACH).unwrap() + ), + AllowedMethod::Delete, + ], }, /* IAM */ @@ -2271,5 +2268,27 @@ pub static VERIFY_ENDPOINTS: Lazy> = Lazy::new(|| { AllowedMethod::Delete, ], }, + + VerifyEndpoint { + url: &DEMO_FLOATING_IP_ATTACH_URL, + visibility: Visibility::Protected, + unprivileged_access: UnprivilegedAccess::None, + allowed_methods: vec![ + AllowedMethod::Post( + serde_json::to_value(&*DEMO_FLOAT_IP_ATTACH).unwrap(), + ), + ], + }, + + VerifyEndpoint { + url: &DEMO_FLOATING_IP_DETACH_URL, + visibility: Visibility::Protected, + unprivileged_access: UnprivilegedAccess::None, + allowed_methods: vec![ + AllowedMethod::Post( + serde_json::to_value(&()).unwrap(), + ), + ], + }, ] }); diff --git a/nexus/tests/integration_tests/external_ips.rs b/nexus/tests/integration_tests/external_ips.rs index 19128d4cac..57f813d505 100644 --- a/nexus/tests/integration_tests/external_ips.rs +++ 
b/nexus/tests/integration_tests/external_ips.rs @@ -56,18 +56,25 @@ pub fn get_floating_ips_url(project_name: &str) -> String { format!("/v1/floating-ips?project={project_name}") } -pub fn attach_instance_external_ip_url( +pub fn instance_ephemeral_ip_url( instance_name: &str, project_name: &str, ) -> String { - format!("/v1/instances/{instance_name}/external-ips/attach?project={project_name}") + format!("/v1/instances/{instance_name}/external-ips/ephemeral?project={project_name}") } -pub fn detach_instance_external_ip_url( - instance_name: &str, +pub fn attach_floating_ip_url( + floating_ip_name: &str, + project_name: &str, +) -> String { + format!("/v1/floating-ips/{floating_ip_name}/attach?project={project_name}") +} + +pub fn detach_floating_ip_url( + floating_ip_name: &str, project_name: &str, ) -> String { - format!("/v1/instances/{instance_name}/external-ips/detach?project={project_name}") + format!("/v1/floating-ips/{floating_ip_name}/detach?project={project_name}") } pub fn get_floating_ip_by_name_url( @@ -448,20 +455,12 @@ async fn test_floating_ip_create_attachment( assert_eq!(fetched_fip.instance_id, Some(instance.identity.id)); // Try to delete the floating IP, which should fail. 
- let error: HttpErrorResponseBody = NexusRequest::new( - RequestBuilder::new( - client, - Method::DELETE, - &get_floating_ip_by_id_url(&fip.identity.id), - ) - .expect_status(Some(StatusCode::BAD_REQUEST)), + let error = object_delete_error( + client, + &get_floating_ip_by_id_url(&fip.identity.id), + StatusCode::BAD_REQUEST, ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .unwrap() - .parsed_body() - .unwrap(); + .await; assert_eq!( error.message, format!("Floating IP cannot be deleted while attached to an instance"), @@ -577,18 +576,11 @@ async fn test_external_ip_live_attach_detach( let mut recorded_ephs = vec![]; for (instance, fip) in instances.iter().zip(&fips) { let instance_name = instance.identity.name.as_str(); - let eph_resp = external_ip_attach( - client, - instance_name, - ¶ms::ExternalIpCreate::Ephemeral { pool: None }, - ) - .await; - let fip_resp = external_ip_attach( + let eph_resp = ephemeral_ip_attach(client, instance_name, None).await; + let fip_resp = floating_ip_attach( client, instance_name, - ¶ms::ExternalIpCreate::Floating { - floating_ip: fip.identity.name.clone().into(), - }, + fip.identity.name.as_str(), ) .await; @@ -604,51 +596,30 @@ async fn test_external_ip_live_attach_detach( assert!(eip_list .iter() .any(|v| matches!(v, views::ExternalIp::Floating(..)) - && v.ip() == fip_resp.ip())); - assert_eq!(fip.ip, fip_resp.ip()); + && v.ip() == fip_resp.ip)); + assert_eq!(fip.ip, fip_resp.ip); // Check for idempotency: repeat requests should return same values. 
- let eph_resp_2 = external_ip_attach( + let eph_resp_2 = ephemeral_ip_attach(client, instance_name, None).await; + let fip_resp_2 = floating_ip_attach( client, instance_name, - ¶ms::ExternalIpCreate::Ephemeral { pool: None }, - ) - .await; - let fip_resp_2 = external_ip_attach( - client, - instance_name, - ¶ms::ExternalIpCreate::Floating { - floating_ip: fip.identity.name.clone().into(), - }, + fip.identity.name.as_str(), ) .await; assert_eq!(eph_resp, eph_resp_2); - assert_eq!(fip_resp.ip(), fip_resp_2.ip()); - assert_eq!(fip_resp.kind(), fip_resp_2.kind()); + assert_eq!(fip_resp.ip, fip_resp_2.ip); recorded_ephs.push(eph_resp); } // Detach a floating IP and ephemeral IP from each instance. - for ((instance, fip), eph_ip) in - instances.iter().zip(&fips).zip(&recorded_ephs) - { + for (instance, fip) in instances.iter().zip(&fips) { let instance_name = instance.identity.name.as_str(); - let eph_resp = external_ip_detach( - client, - instance_name, - ¶ms::ExternalIpDetach::Ephemeral, - ) - .await; - let fip_resp = external_ip_detach( - client, - instance_name, - ¶ms::ExternalIpDetach::Floating { - floating_ip: fip.identity.name.clone().into(), - }, - ) - .await; + ephemeral_ip_detach(client, instance_name).await; + let fip_resp = + floating_ip_detach(client, fip.identity.name.as_str()).await; // Verify both are removed, and that their bodies match the known FIP/EIP combo. let eip_list = @@ -656,34 +627,17 @@ async fn test_external_ip_live_attach_detach( .await; assert_eq!(eip_list.len(), 0); - assert_eq!(fip.ip, fip_resp.ip()); - assert_eq!(eph_ip, &eph_resp); + assert_eq!(fip.ip, fip_resp.ip); // Check for idempotency: repeat requests should return same values for FIP, // but in ephemeral case there is no currently known IP so we return an error. 
- let fip_resp_2 = external_ip_detach( - client, - instance_name, - ¶ms::ExternalIpDetach::Floating { - floating_ip: fip.identity.name.clone().into(), - }, - ) - .await; - assert_eq!(fip_resp.ip(), fip_resp_2.ip()); - assert_eq!(fip_resp.kind(), fip_resp_2.kind()); - - let url = detach_instance_external_ip_url(instance_name, PROJECT_NAME); - let error: HttpErrorResponseBody = NexusRequest::new( - RequestBuilder::new(client, Method::POST, &url) - .body(Some(¶ms::ExternalIpDetach::Ephemeral)) - .expect_status(Some(StatusCode::BAD_REQUEST)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .unwrap() - .parsed_body() - .unwrap(); + let fip_resp_2 = + floating_ip_detach(client, fip.identity.name.as_str()).await; + assert_eq!(fip_resp.ip, fip_resp_2.ip); + + let url = instance_ephemeral_ip_url(instance_name, PROJECT_NAME); + let error = + object_delete_error(client, &url, StatusCode::BAD_REQUEST).await; assert_eq!( error.message, "instance does not have an ephemeral IP attached".to_string() @@ -731,11 +685,13 @@ async fn test_external_ip_attach_detach_fail_if_in_use_by_other( } // Attach in-use FIP to *other* instance should fail. 
- let url = attach_instance_external_ip_url(INSTANCE_NAMES[0], PROJECT_NAME); + let url = + attach_floating_ip_url(fips[1].identity.name.as_str(), PROJECT_NAME); let error: HttpErrorResponseBody = NexusRequest::new( RequestBuilder::new(client, Method::POST, &url) - .body(Some(¶ms::ExternalIpCreate::Floating { - floating_ip: fips[1].identity.name.clone().into(), + .body(Some(¶ms::FloatingIpAttach { + kind: params::FloatingIpParentKind::Instance, + parent: INSTANCE_NAMES[0].parse::().unwrap().into(), })) .expect_status(Some(StatusCode::BAD_REQUEST)), ) @@ -746,26 +702,6 @@ async fn test_external_ip_attach_detach_fail_if_in_use_by_other( .parsed_body() .unwrap(); assert_eq!(error.message, "floating IP cannot be attached to one instance while still attached to another".to_string()); - - // Detach in-use FIP from *other* instance should fail. - let url = detach_instance_external_ip_url(INSTANCE_NAMES[0], PROJECT_NAME); - let error: HttpErrorResponseBody = NexusRequest::new( - RequestBuilder::new(client, Method::POST, &url) - .body(Some(¶ms::ExternalIpDetach::Floating { - floating_ip: fips[1].identity.name.clone().into(), - })) - .expect_status(Some(StatusCode::BAD_REQUEST)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .unwrap() - .parsed_body() - .unwrap(); - assert_eq!( - error.message, - "floating IP is not attached to the target instance".to_string() - ); } #[nexus_test] @@ -805,11 +741,12 @@ async fn test_external_ip_attach_fails_after_maximum( .await; // Attempt to attach the final FIP should fail. 
- let url = attach_instance_external_ip_url(instance_name, PROJECT_NAME); + let url = attach_floating_ip_url(fip_name_slice[32], PROJECT_NAME); let error: HttpErrorResponseBody = NexusRequest::new( RequestBuilder::new(client, Method::POST, &url) - .body(Some(¶ms::ExternalIpCreate::Floating { - floating_ip: fip_name_slice[32].parse::().unwrap().into(), + .body(Some(¶ms::FloatingIpAttach { + kind: params::FloatingIpParentKind::Instance, + parent: instance_name.parse::().unwrap().into(), })) .expect_status(Some(StatusCode::BAD_REQUEST)), ) @@ -826,9 +763,10 @@ async fn test_external_ip_attach_fails_after_maximum( ); // Attempt to attach an ephemeral IP should fail. + let url = instance_ephemeral_ip_url(instance_name, PROJECT_NAME); let error: HttpErrorResponseBody = NexusRequest::new( RequestBuilder::new(client, Method::POST, &url) - .body(Some(¶ms::ExternalIpCreate::Ephemeral { pool: None })) + .body(Some(¶ms::EphemeralIpCreate { pool: None })) .expect_status(Some(StatusCode::BAD_REQUEST)), ) .authn_as(AuthnMode::PrivilegedUser) @@ -871,18 +809,16 @@ async fn test_external_ip_attach_ephemeral_at_pool_exhaustion( // Attach a new EIP from other-pool to both instances. // This should succeed for the first, and fail for the second // due to pool exhaustion. 
- let eph_resp = external_ip_attach( + let eph_resp = ephemeral_ip_attach( client, INSTANCE_NAMES[0], - ¶ms::ExternalIpCreate::Ephemeral { - pool: Some(pool_name.clone().into()), - }, + Some(pool_name.as_str()), ) .await; assert_eq!(eph_resp.ip(), other_pool_range.first_address()); assert_eq!(eph_resp.ip(), other_pool_range.last_address()); - let url = attach_instance_external_ip_url(INSTANCE_NAMES[1], PROJECT_NAME); + let url = instance_ephemeral_ip_url(INSTANCE_NAMES[1], PROJECT_NAME); let error: HttpErrorResponseBody = NexusRequest::new( RequestBuilder::new(client, Method::POST, &url) .body(Some(¶ms::ExternalIpCreate::Ephemeral { @@ -903,12 +839,10 @@ async fn test_external_ip_attach_ephemeral_at_pool_exhaustion( // Idempotent re-add to the first instance should succeed even if // an internal attempt to alloc a new EIP would fail. - let eph_resp_2 = external_ip_attach( + let eph_resp_2 = ephemeral_ip_attach( client, INSTANCE_NAMES[0], - ¶ms::ExternalIpCreate::Ephemeral { - pool: Some(pool_name.clone().into()), - }, + Some(pool_name.as_str()), ) .await; assert_eq!(eph_resp_2, eph_resp); @@ -967,15 +901,17 @@ async fn instance_for_external_ips( .await } -async fn external_ip_attach( +async fn ephemeral_ip_attach( client: &ClientTestContext, instance_name: &str, - eip: ¶ms::ExternalIpCreate, + pool_name: Option<&str>, ) -> views::ExternalIp { - let url = attach_instance_external_ip_url(instance_name, PROJECT_NAME); + let url = instance_ephemeral_ip_url(instance_name, PROJECT_NAME); NexusRequest::new( RequestBuilder::new(client, Method::POST, &url) - .body(Some(eip)) + .body(Some(¶ms::EphemeralIpCreate { + pool: pool_name.map(|v| v.parse::().unwrap().into()), + })) .expect_status(Some(StatusCode::ACCEPTED)), ) .authn_as(AuthnMode::PrivilegedUser) @@ -986,15 +922,40 @@ async fn external_ip_attach( .unwrap() } -async fn external_ip_detach( +async fn ephemeral_ip_detach(client: &ClientTestContext, instance_name: &str) { + let url = 
instance_ephemeral_ip_url(instance_name, PROJECT_NAME); + object_delete(client, &url).await; +} + +async fn floating_ip_attach( client: &ClientTestContext, instance_name: &str, - eip: ¶ms::ExternalIpDetach, -) -> views::ExternalIp { - let url = detach_instance_external_ip_url(instance_name, PROJECT_NAME); + floating_ip_name: &str, +) -> views::FloatingIp { + let url = attach_floating_ip_url(floating_ip_name, PROJECT_NAME); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &url) + .body(Some(¶ms::FloatingIpAttach { + kind: params::FloatingIpParentKind::Instance, + parent: instance_name.parse::().unwrap().into(), + })) + .expect_status(Some(StatusCode::ACCEPTED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap() + .parsed_body() + .unwrap() +} + +async fn floating_ip_detach( + client: &ClientTestContext, + floating_ip_name: &str, +) -> views::FloatingIp { + let url = detach_floating_ip_url(floating_ip_name, PROJECT_NAME); NexusRequest::new( RequestBuilder::new(client, Method::POST, &url) - .body(Some(eip)) .expect_status(Some(StatusCode::ACCEPTED)), ) .authn_as(AuthnMode::PrivilegedUser) diff --git a/nexus/tests/output/nexus_tags.txt b/nexus/tests/output/nexus_tags.txt index d3614fa939..2196d0ecfa 100644 --- a/nexus/tests/output/nexus_tags.txt +++ b/nexus/tests/output/nexus_tags.txt @@ -44,8 +44,6 @@ instance_disk_detach POST /v1/instances/{instance}/disks instance_disk_list GET /v1/instances/{instance}/disks instance_ephemeral_ip_attach POST /v1/instances/{instance}/external-ips/ephemeral instance_ephemeral_ip_detach DELETE /v1/instances/{instance}/external-ips/ephemeral -instance_external_ip_attach POST /v1/instances/{instance}/external-ips/attach -instance_external_ip_detach POST /v1/instances/{instance}/external-ips/detach instance_external_ip_list GET /v1/instances/{instance}/external-ips instance_list GET /v1/instances instance_migrate POST /v1/instances/{instance}/migrate diff --git a/openapi/nexus.json 
b/openapi/nexus.json index 99e99997b5..245224c9ed 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -1928,118 +1928,6 @@ } } }, - "/v1/instances/{instance}/external-ips/attach": { - "post": { - "tags": [ - "instances" - ], - "summary": "Attach an external IP to an instance", - "operationId": "instance_external_ip_attach", - "parameters": [ - { - "in": "path", - "name": "instance", - "description": "Name or ID of the instance", - "required": true, - "schema": { - "$ref": "#/components/schemas/NameOrId" - } - }, - { - "in": "query", - "name": "project", - "description": "Name or ID of the project", - "schema": { - "$ref": "#/components/schemas/NameOrId" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ExternalIpCreate" - } - } - }, - "required": true - }, - "responses": { - "202": { - "description": "successfully enqueued operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ExternalIp" - } - } - } - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, - "/v1/instances/{instance}/external-ips/detach": { - "post": { - "tags": [ - "instances" - ], - "summary": "Detach an external IP from an instance", - "operationId": "instance_external_ip_detach", - "parameters": [ - { - "in": "path", - "name": "instance", - "description": "Name or ID of the instance", - "required": true, - "schema": { - "$ref": "#/components/schemas/NameOrId" - } - }, - { - "in": "query", - "name": "project", - "description": "Name or ID of the project", - "schema": { - "$ref": "#/components/schemas/NameOrId" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ExternalIpDetach" - } - } - }, - "required": true - }, - "responses": { - "202": { - "description": "successfully enqueued operation", - "content": { - "application/json": { - "schema": 
{ - "$ref": "#/components/schemas/ExternalIp" - } - } - } - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, "/v1/instances/{instance}/external-ips/ephemeral": { "post": { "tags": [ @@ -11413,43 +11301,6 @@ } ] }, - "ExternalIpDetach": { - "description": "Parameters for detaching an external IP from an instance.", - "oneOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "ephemeral" - ] - } - }, - "required": [ - "type" - ] - }, - { - "type": "object", - "properties": { - "floating_ip": { - "$ref": "#/components/schemas/NameOrId" - }, - "type": { - "type": "string", - "enum": [ - "floating" - ] - } - }, - "required": [ - "floating_ip", - "type" - ] - } - ] - }, "ExternalIpResultsPage": { "description": "A single page of results", "type": "object", From ee146c27cebdef2ddee22373f01dea919a943fc7 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 19 Jan 2024 20:10:40 +0000 Subject: [PATCH 56/56] Remove autogen'd file --- nexus/preprocessed_configs/config.xml | 41 --------------------------- 1 file changed, 41 deletions(-) delete mode 100644 nexus/preprocessed_configs/config.xml diff --git a/nexus/preprocessed_configs/config.xml b/nexus/preprocessed_configs/config.xml deleted file mode 100644 index 9b13f12aea..0000000000 --- a/nexus/preprocessed_configs/config.xml +++ /dev/null @@ -1,41 +0,0 @@ - - - - - trace - true - - - 8123 - 9000 - 9004 - - ./ - - true - - - - - - - ::/0 - - - default - default - 1 - - - - - - - - - - - \ No newline at end of file