From 19e150ed90e5e2b72d20c0aed484ac18857e9934 Mon Sep 17 00:00:00 2001 From: Diwakar Sharma Date: Mon, 25 Nov 2024 10:47:04 +0000 Subject: [PATCH] fix(volume): don't allow unpublish from non-frontend node Signed-off-by: Diwakar Sharma --- .../agents/src/bin/core/tests/event/mod.rs | 5 ++- .../agents/src/bin/core/tests/pool/mod.rs | 6 ++- .../core/tests/volume/garbage_collection.rs | 6 ++- .../agents/src/bin/core/tests/volume/mod.rs | 22 +++++----- .../agents/src/bin/core/volume/operations.rs | 19 +++++++++ .../csi-driver/src/bin/controller/client.rs | 4 +- .../src/bin/controller/controller.rs | 13 +++--- .../grpc/proto/v1/volume/volume.proto | 2 + .../grpc/src/operations/volume/traits.rs | 16 +++++++- .../rest/openapi-specs/v0_api_spec.yaml | 7 ++++ control-plane/rest/service/src/v0/volumes.rs | 8 +++- control-plane/rest/tests/v0_test.rs | 2 +- .../src/types/v0/transport/volume.rs | 9 ++++- .../csi/controller/controller.feature | 8 +++- .../csi/controller/test_controller.py | 40 +++++++++++++++++++ tests/io-engine/tests/rebuild.rs | 6 ++- utils/pstor-usage/src/volumes.rs | 2 +- 17 files changed, 147 insertions(+), 28 deletions(-) diff --git a/control-plane/agents/src/bin/core/tests/event/mod.rs b/control-plane/agents/src/bin/core/tests/event/mod.rs index 946ac125b..e097ebe03 100644 --- a/control-plane/agents/src/bin/core/tests/event/mod.rs +++ b/control-plane/agents/src/bin/core/tests/event/mod.rs @@ -130,7 +130,10 @@ async fn events() { let vol_client = cluster.grpc_client().volume(); vol_client - .unpublish(&UnpublishVolume::new(&volid, false), None) + .unpublish( + &UnpublishVolume::new(&volid, false, Some(cluster.csi_node(0))), + None, + ) .await .unwrap(); diff --git a/control-plane/agents/src/bin/core/tests/pool/mod.rs b/control-plane/agents/src/bin/core/tests/pool/mod.rs index 649246558..20a95f432 100644 --- a/control-plane/agents/src/bin/core/tests/pool/mod.rs +++ b/control-plane/agents/src/bin/core/tests/pool/mod.rs @@ -884,7 +884,11 @@ async fn disown_unused_replicas() { cluster.composer().pause(&node).await.unwrap(); volumes_api - .del_volume_target(&volume.spec.uuid, Some(false)) + .del_volume_target( + &volume.spec.uuid, + Some(false), + Some(cluster.csi_node(0).as_str()), + ) .await .expect_err("io-engine is down"); cluster.composer().kill(&node).await.unwrap(); diff --git a/control-plane/agents/src/bin/core/tests/volume/garbage_collection.rs b/control-plane/agents/src/bin/core/tests/volume/garbage_collection.rs index 1462f66c0..7be2b25a4 100644 --- a/control-plane/agents/src/bin/core/tests/volume/garbage_collection.rs +++ b/control-plane/agents/src/bin/core/tests/volume/garbage_collection.rs @@ -326,7 +326,11 @@ async fn unused_reconcile(cluster: &Cluster) { cluster.composer().kill(&nexus_node.id).await.unwrap(); // 2. now we force unpublish the volume volumes_api - .del_volume_target(&volume.spec.uuid, Some(true)) + .del_volume_target( + &volume.spec.uuid, + Some(true), + Some(cluster.csi_node(0).as_str()), + ) .await .unwrap(); // 3. 
publish on the previously unused node diff --git a/control-plane/agents/src/bin/core/tests/volume/mod.rs b/control-plane/agents/src/bin/core/tests/volume/mod.rs index bfa12745f..1f28b2a74 100644 --- a/control-plane/agents/src/bin/core/tests/volume/mod.rs +++ b/control-plane/agents/src/bin/core/tests/volume/mod.rs @@ -142,7 +142,7 @@ async fn nexus_persistence_test_iteration( let nexus_uuid = nexus.uuid.clone(); volume_client - .unpublish(&UnpublishVolume::new(&volume_state.uuid, false), None) + .unpublish(&UnpublishVolume::new(&volume_state.uuid, false, None), None) .await .unwrap(); @@ -379,7 +379,7 @@ async fn publishing_test(cluster: &Cluster) { .expect_err("The Volume cannot be published again because it's already published"); volume_client - .unpublish(&UnpublishVolume::new(&volume_state.uuid, false), None) + .unpublish(&UnpublishVolume::new(&volume_state.uuid, false, None), None) .await .unwrap(); @@ -447,7 +447,7 @@ async fn publishing_test(cluster: &Cluster) { .expect_err("The volume is already published"); volume_client - .unpublish(&UnpublishVolume::new(&volume_state.uuid, false), None) + .unpublish(&UnpublishVolume::new(&volume_state.uuid, false, None), None) .await .unwrap(); @@ -491,12 +491,12 @@ async fn publishing_test(cluster: &Cluster) { cluster.composer().kill(target_node.as_str()).await.unwrap(); volume_client - .unpublish(&UnpublishVolume::new(&volume_state.uuid, false), None) + .unpublish(&UnpublishVolume::new(&volume_state.uuid, false, None), None) .await .expect_err("The node is not online..."); volume_client - .unpublish(&UnpublishVolume::new(&volume_state.uuid, true), None) + .unpublish(&UnpublishVolume::new(&volume_state.uuid, true, None), None) .await .expect("With force comes great responsibility..."); @@ -747,7 +747,7 @@ async fn replica_count_test(cluster: &Cluster) { let volume_state = volume.state(); volume_client - .unpublish(&UnpublishVolume::new(&volume_state.uuid, false), None) + .unpublish(&UnpublishVolume::new(&volume_state.uuid, false, None), None) .await .unwrap(); @@ -927,7 +927,7 @@ async fn publish_unpublish(cluster: &Cluster) { // Unpublish the volume2 let _ = volume_client .unpublish( - &UnpublishVolume::new(&VolumeId::try_from(VOLUME_2).unwrap(), false), + &UnpublishVolume::new(&VolumeId::try_from(VOLUME_2).unwrap(), false, None), None, ) .await @@ -936,7 +936,7 @@ async fn publish_unpublish(cluster: &Cluster) { // Unpublish the volume1 let _ = volume_client .unpublish( - &UnpublishVolume::new(&VolumeId::try_from(VOLUME_1).unwrap(), false), + &UnpublishVolume::new(&VolumeId::try_from(VOLUME_1).unwrap(), false, None), None, ) .await @@ -983,14 +983,14 @@ async fn target_distribution(cluster: &Cluster) { // Cleanup let _ = volume_client .unpublish( - &UnpublishVolume::new(&VolumeId::try_from(VOLUME_1).unwrap(), false), + &UnpublishVolume::new(&VolumeId::try_from(VOLUME_1).unwrap(), false, None), None, ) .await .expect("The volume should be unpublished"); let _ = volume_client .unpublish( - &UnpublishVolume::new(&VolumeId::try_from(VOLUME_2).unwrap(), false), + &UnpublishVolume::new(&VolumeId::try_from(VOLUME_2).unwrap(), false, None), None, ) .await @@ -1063,7 +1063,7 @@ async fn offline_node(cluster: &Cluster) { // Unpublish volume2 let _ = volume_client .unpublish( - &UnpublishVolume::new(&VolumeId::try_from(VOLUME_2).unwrap(), false), + &UnpublishVolume::new(&VolumeId::try_from(VOLUME_2).unwrap(), false, None), None, ) .await diff --git a/control-plane/agents/src/bin/core/volume/operations.rs 
b/control-plane/agents/src/bin/core/volume/operations.rs
index e430f5b12..bc4aaa763 100644
--- a/control-plane/agents/src/bin/core/volume/operations.rs
+++ b/control-plane/agents/src/bin/core/volume/operations.rs
@@ -412,6 +412,25 @@ impl ResourcePublishing for OperationGuardArc<VolumeSpec> {
             .await?;
         let volume_target = spec_clone.target().expect("already validated");
 
+        if let Some(unpublishing_node) = request.frontend_host() {
+            if let Some(tgt_cfg) = spec_clone.active_config() {
+                if !tgt_cfg
+                    .frontend()
+                    .nodename_allowed(unpublishing_node.as_str())
+                {
+                    self.validate_update_step(
+                        registry,
+                        Err(SvcError::FrontendNodeNotAllowed {
+                            node: unpublishing_node.to_string(),
+                            vol_id: request.uuid.to_string(),
+                        }),
+                        &spec_clone,
+                    )
+                    .await?;
+                }
+            }
+        }
+
         let result = match specs.nexus_opt(volume_target.nexus()).await? {
             None => Ok(()),
             Some(mut nexus) => {
diff --git a/control-plane/csi-driver/src/bin/controller/client.rs b/control-plane/csi-driver/src/bin/controller/client.rs
index bd14ae132..201864457 100644
--- a/control-plane/csi-driver/src/bin/controller/client.rs
+++ b/control-plane/csi-driver/src/bin/controller/client.rs
@@ -83,6 +83,7 @@ impl From<clients::tower::Error<RestJsonError>> for ApiClientError {
            StatusCode::PRECONDITION_FAILED => Self::PreconditionFailed(detailed),
            StatusCode::BAD_REQUEST => Self::InvalidArgument(detailed),
            StatusCode::NOT_ACCEPTABLE => Self::NotAcceptable(detailed),
+           StatusCode::UNAUTHORIZED => Self::PreconditionFailed(detailed),
            status => Self::GenericOperation(status, detailed),
        }
    }
@@ -365,11 +366,12 @@ impl RestApiClient {
        &self,
        volume_id: &uuid::Uuid,
        force: bool,
+       frontend_host: Option<&str>,
    ) -> Result<(), ApiClientError> {
        Self::delete_idempotent(
            self.rest_client
                .volumes_api()
-               .del_volume_target(volume_id, Some(force))
+               .del_volume_target(volume_id, Some(force), frontend_host)
                .await,
            true,
        )?;
diff --git a/control-plane/csi-driver/src/bin/controller/controller.rs b/control-plane/csi-driver/src/bin/controller/controller.rs
index 1d65887f0..3e51a28d5 100644
--- a/control-plane/csi-driver/src/bin/controller/controller.rs
+++ b/control-plane/csi-driver/src/bin/controller/controller.rs
@@ -605,15 +605,18 @@ impl rpc::csi::controller_server::Controller for CsiControllerSvc {
            return Ok(Response::new(ControllerUnpublishVolumeResponse {}));
        }
 
-       // Do forced volume upublish as Kubernetes already detached the volume.
+       // Do forced volume unpublish as Kubernetes already detached the volume.
        RestApiClient::get_client()
-           .unpublish_volume(&volume_uuid, true)
+           .unpublish_volume(&volume_uuid, true, Some(args.node_id.as_str()))
            .await
-           .map_err(|e| {
-               Status::not_found(format!(
+           .map_err(|e| match e {
+               ApiClientError::PreconditionFailed(_) => {
+                   Status::ok("Ignoring precondition failure on unpublish")
+               }
+               _ => Status::not_found(format!(
                    "Failed to unpublish volume {}, error = {:?}",
                    &args.volume_id, e
-               ))
+               )),
            })?;
 
        debug!("Volume {} successfully unpublished", args.volume_id);
diff --git a/control-plane/grpc/proto/v1/volume/volume.proto b/control-plane/grpc/proto/v1/volume/volume.proto
index b7f2cb9f7..65ee20ea1 100644
--- a/control-plane/grpc/proto/v1/volume/volume.proto
+++ b/control-plane/grpc/proto/v1/volume/volume.proto
@@ -323,6 +323,8 @@ message UnpublishVolumeRequest {
  // the nexus. Note: this option should be used only when we know the node will not become
  // accessible again and it is safe to do so.
  bool force = 2;
+  // Frontend host requesting the volume unpublish.
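+  // When set, the control-plane rejects the unpublish with FrontendNodeNotAllowed
+  // if this host is not an allowed frontend node of the volume's target.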
+  optional string frontend_host = 3;
 }
 
 // Share Volume request
diff --git a/control-plane/grpc/src/operations/volume/traits.rs b/control-plane/grpc/src/operations/volume/traits.rs
index 7e9d1a5ce..7ea674206 100644
--- a/control-plane/grpc/src/operations/volume/traits.rs
+++ b/control-plane/grpc/src/operations/volume/traits.rs
@@ -1618,6 +1618,8 @@ pub trait UnpublishVolumeInfo: Send + Sync + std::fmt::Debug {
    fn uuid(&self) -> VolumeId;
    /// Force unpublish
    fn force(&self) -> bool;
+   /// Frontend node requesting unpublish.
+   fn frontend_host(&self) -> Option<NodeId>;
 }
 
 impl UnpublishVolumeInfo for UnpublishVolume {
@@ -1628,6 +1630,10 @@ impl UnpublishVolumeInfo for UnpublishVolume {
    fn force(&self) -> bool {
        self.force()
    }
+
+   fn frontend_host(&self) -> Option<NodeId> {
+       self.frontend_host().clone()
+   }
 }
 
 /// Intermediate structure that validates the conversion to UnpublishVolumeRequest type.
@@ -1644,6 +1650,13 @@ impl UnpublishVolumeInfo for ValidatedUnpublishVolumeRequest {
    fn force(&self) -> bool {
        self.inner.force
    }
+
+   fn frontend_host(&self) -> Option<NodeId> {
+       self.inner
+           .frontend_host
+           .clone()
+           .map(|target_node| target_node.into())
+   }
 }
 
 impl ValidateRequestTypes for UnpublishVolumeRequest {
@@ -1658,7 +1671,7 @@ impl ValidateRequestTypes for UnpublishVolumeRequest {
 
 impl From<&dyn UnpublishVolumeInfo> for UnpublishVolume {
    fn from(data: &dyn UnpublishVolumeInfo) -> Self {
-       UnpublishVolume::new(&data.uuid(), data.force())
+       UnpublishVolume::new(&data.uuid(), data.force(), data.frontend_host())
    }
 }
 
@@ -1667,6 +1680,7 @@ impl From<&dyn UnpublishVolumeInfo> for UnpublishVolumeRequest {
        Self {
            uuid: Some(data.uuid().to_string()),
            force: data.force(),
+           frontend_host: data.frontend_host().map(|n| n.into()),
        }
    }
 }
diff --git a/control-plane/rest/openapi-specs/v0_api_spec.yaml b/control-plane/rest/openapi-specs/v0_api_spec.yaml
index a0a516470..0ffa0a7d4 100644
--- a/control-plane/rest/openapi-specs/v0_api_spec.yaml
+++ b/control-plane/rest/openapi-specs/v0_api_spec.yaml
@@ -1710,6 +1710,13 @@ paths:
          schema:
            type: boolean
            default: false
+        - in: query
+          name: frontend_host
+          description: |-
+            Frontend host requesting the volume unpublish.
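+            If it does not match an allowed frontend node of the volume's target,
+            the unpublish request is rejected and the target is left intact.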
+          required: false
+          schema:
+            type: string
      responses:
        '200':
          description: OK
diff --git a/control-plane/rest/service/src/v0/volumes.rs b/control-plane/rest/service/src/v0/volumes.rs
index c2ac421d5..d317b6b48 100644
--- a/control-plane/rest/service/src/v0/volumes.rs
+++ b/control-plane/rest/service/src/v0/volumes.rs
@@ -58,11 +58,15 @@ impl apis::actix_server::Volumes for RestApi {
 
    async fn del_volume_target(
        Path(volume_id): Path<uuid::Uuid>,
-       Query(force): Query<Option<bool>>,
+       Query((force, frontend_host)): Query<(Option<bool>, Option<String>)>,
    ) -> Result<models::Volume, RestError<RestJsonError>> {
        let volume = client()
            .unpublish(
-               &UnpublishVolume::new(&volume_id.into(), force.unwrap_or(false)),
+               &UnpublishVolume::new(
+                   &volume_id.into(),
+                   force.unwrap_or(false),
+                   frontend_host.map(|n| n.into()),
+               ),
                None,
            )
            .await?;
diff --git a/control-plane/rest/tests/v0_test.rs b/control-plane/rest/tests/v0_test.rs
index ea68691d0..20bafb268 100644
--- a/control-plane/rest/tests/v0_test.rs
+++ b/control-plane/rest/tests/v0_test.rs
@@ -345,7 +345,7 @@ async fn client_test(cluster: &Cluster, auth: &bool) {
 
    let volume = client
        .volumes_api()
-       .del_volume_target(&volume_state.uuid, None)
+       .del_volume_target(&volume_state.uuid, None, Some(cluster.csi_node(0).as_str()))
        .await
        .unwrap();
    tracing::info!("Volume: {:#?}", volume);
diff --git a/control-plane/stor-port/src/types/v0/transport/volume.rs b/control-plane/stor-port/src/types/v0/transport/volume.rs
index 24dc2688d..1d057cdf5 100644
--- a/control-plane/stor-port/src/types/v0/transport/volume.rs
+++ b/control-plane/stor-port/src/types/v0/transport/volume.rs
@@ -648,19 +648,26 @@ pub struct UnpublishVolume {
    /// the nexus. Note: this option should be used only when we know the node will not become
    /// accessible again and it is safe to do so.
    force: bool,
+   /// The frontend node that was connected to the volume and is now unpublishing it.
+   frontend_host: Option<NodeId>,
 }
 impl UnpublishVolume {
    /// Create a new `UnpublishVolume` for the given uuid.
-   pub fn new(uuid: &VolumeId, force: bool) -> Self {
+   pub fn new(uuid: &VolumeId, force: bool, frontend_host: Option<NodeId>) -> Self {
        Self {
            uuid: uuid.clone(),
            force,
+           frontend_host,
        }
    }
    /// It's a force `Self`.
    pub fn force(&self) -> bool {
        self.force
    }
+   /// The frontend host requesting the volume unpublish.
+   pub fn frontend_host(&self) -> Option<NodeId> {
+       self.frontend_host.clone()
+   }
 }
 
 /// Share Volume request.
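Taken together, the transport change above and the core-agent check in volume/operations.rs give callers the following picture. A minimal sketch, assuming a grpc volume client and a published volume as in the core-agent tests earlier in this patch (`frontend_node` and `other_node` are placeholder NodeId values):

    // Unpublish requested by the volume's own frontend node: accepted.
    volume_client
        .unpublish(&UnpublishVolume::new(&volume_id, false, Some(frontend_node)), None)
        .await?;

    // Unpublish requested by any other node: the core agent fails the update step
    // with SvcError::FrontendNodeNotAllowed and the target is left untouched.
    volume_client
        .unpublish(&UnpublishVolume::new(&volume_id, false, Some(other_node)), None)
        .await
        .expect_err("not an allowed frontend node");

Passing None for frontend_host skips the check entirely, which is why the core-agent tests above can keep unpublishing without naming a node.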
diff --git a/tests/bdd/features/csi/controller/controller.feature b/tests/bdd/features/csi/controller/controller.feature index b11b79d18..01e7c96e7 100644 --- a/tests/bdd/features/csi/controller/controller.feature +++ b/tests/bdd/features/csi/controller/controller.feature @@ -112,4 +112,10 @@ Scenario: list existing volumes with pagination max entries set to 0 Given 2 existing volumes When a ListVolumesRequest is sent to CSI controller with max_entries set to 0 Then all volumes should be returned - And the next token should be empty \ No newline at end of file + And the next token should be empty + +Scenario: unpublish volume from non-frontend node + Given a 2 replica volume published on a node + When the ControllerUnpublishVolume request arrives from a non-frontend node + Then nvmf target which exposes the volume should not be destroyed + And volume should remain as published \ No newline at end of file diff --git a/tests/bdd/features/csi/controller/test_controller.py b/tests/bdd/features/csi/controller/test_controller.py index a7befa22e..2385ca349 100644 --- a/tests/bdd/features/csi/controller/test_controller.py +++ b/tests/bdd/features/csi/controller/test_controller.py @@ -158,6 +158,11 @@ def test_unpublish_volume(setup): """unpublish volume""" +@scenario("controller.feature", "unpublish volume from non-frontend node") +def test_unpublish_volume_from_nonfrontend_node(setup): + """unpublish volume from non-frontend node.""" + + @scenario("controller.feature", "unpublish volume idempotency") def test_unpublish_volume_idempotency(setup): """unpublish volume idempotency""" @@ -202,6 +207,18 @@ def two_existing_volumes(_create_2_volumes_1_replica): return _create_2_volumes_1_replica +@given("a 2 replica volume published on a node") +def populate_published_2_replica_volume(_create_2_replica_nvmf_volume): + do_publish_volume(VOLUME4_UUID, NODE1) + # Make sure volume is published. 
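+    # Publishing on NODE1 records NODE1 as the volume's frontend node; the
+    # non-frontend unpublish scenario below is validated against that.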
+    volume = ApiClient.volumes_api().get_volume(VOLUME4_UUID)
+    print("Volume upon initial publish:\n", volume)
+    assert (
+        str(volume.spec.target.protocol) == "nvmf"
+    ), "Protocol mismatch for published volume"
+    return volume
+
+
 @given("a non-existing volume")
 def a_non_existing_volume():
     with pytest.raises(NotFoundException) as e:
@@ -575,6 +592,12 @@ def get_controller_capabilities(csi_instance):
     )
 
 
+@when("the ControllerUnpublishVolume request arrives from a non-frontend node")
+def unpublish_from_non_frontend_node():
+    """the ControllerUnpublishVolume request arrives from a non-frontend node."""
+    do_unpublish_volume(VOLUME4_UUID, NODE2)
+
+
 @then("CSI controller should report all its capabilities")
 def check_get_controller_capabilities(get_caps_request):
     all_capabilities = [
@@ -934,6 +957,17 @@ def check_identical_volume_creation(create_the_same_volume):
     )
 
 
+@then("nvmf target which exposes the volume should not be destroyed")
+def volume_target_should_not_be_destroyed():
+    """nvmf target which exposes the volume should not be destroyed."""
+    vol = ApiClient.volumes_api().get_volume(VOLUME4_UUID)
+    print("Volume during verification:\n", vol)
+    assert str(vol.spec.target.protocol) == "nvmf", "Volume protocol mismatch"
+    assert vol.state.target["device_uri"].startswith(
+        ("nvmf://", "nvmf+tcp://", "nvmf+rdma+tcp://")
+    ), "Volume share URI mismatch"
+
+
 @then("only a single volume should be returned")
 def only_a_single_volume_should_be_returned(paginated_volumes_list):
     """only a single volume should be returned."""
@@ -999,6 +1033,12 @@ def check_unpublish_not_existing_volume(unpublish_not_existing_volume):
     ), "Volume unpublishing succeeded with unexpected response"
 
 
+@then("volume should remain as published")
+def volume_remains_published():
+    """volume should remain as published."""
+    pass
+
+
 @then("volume should report itself as published")
 def check_volume_status_published():
     vol = ApiClient.volumes_api().get_volume(VOLUME1_UUID)
diff --git a/tests/io-engine/tests/rebuild.rs b/tests/io-engine/tests/rebuild.rs
index e1c05b789..40ebb0eb2 100644
--- a/tests/io-engine/tests/rebuild.rs
+++ b/tests/io-engine/tests/rebuild.rs
@@ -974,7 +974,11 @@ async fn destroy_rebuilding_nexus() {
    assert!(child.uri.starts_with("nvmf"), "uri: {}", child.uri);
 
    volumes_api
-       .del_volume_target(&volume_1.state.uuid, None)
+       .del_volume_target(
+           &volume_1.state.uuid,
+           None,
+           Some(cluster.csi_node(0).as_str()),
+       )
        .await
        .unwrap();
diff --git a/utils/pstor-usage/src/volumes.rs b/utils/pstor-usage/src/volumes.rs
index f0f7e8f0a..d436f72d2 100644
--- a/utils/pstor-usage/src/volumes.rs
+++ b/utils/pstor-usage/src/volumes.rs
@@ -98,7 +98,7 @@ impl ResourceUpdates for Vec<Volume> {
 
        client
            .volumes_api()
-           .del_volume_target(&volume.spec.uuid, Some(true))
+           .del_volume_target(&volume.spec.uuid, Some(true), Some(node_id.as_str()))
            .await?;
    }
    Ok(())
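A closing note on the CSI-side behavior in this patch: the REST client now maps HTTP 401 (presumably what the core agent returns for FrontendNodeNotAllowed, given the new mapping above) onto ApiClientError::PreconditionFailed, and ControllerUnpublishVolume deliberately swallows that case so the CO does not keep retrying an unpublish that the control-plane refused on purpose. Below is a minimal sketch of that decision as an illustrative, hypothetical helper; the patch itself encodes the same branch inline via map_err:

    use tonic::Status;

    // Illustrative only: mirrors the decision in ControllerUnpublishVolume above.
    fn unpublish_status(result: Result<(), ApiClientError>, volume_id: &str) -> Result<(), Status> {
        match result {
            Ok(()) => Ok(()),
            // A non-frontend node asked for the unpublish: the target must stay up
            // for the real frontend, so this is reported as success, not an error.
            Err(ApiClientError::PreconditionFailed(_)) => Ok(()),
            Err(error) => Err(Status::not_found(format!(
                "Failed to unpublish volume {volume_id}, error = {error:?}"
            ))),
        }
    }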