Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(volume): don't allow unpublish from non-frontend node #889

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion control-plane/agents/src/bin/core/tests/event/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,10 @@ async fn events() {
let vol_client = cluster.grpc_client().volume();

vol_client
.unpublish(&UnpublishVolume::new(&volid, false), None)
.unpublish(
&UnpublishVolume::new(&volid, false, Some(cluster.csi_node(0))),
None,
)
.await
.unwrap();

Expand Down
6 changes: 5 additions & 1 deletion control-plane/agents/src/bin/core/tests/pool/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -884,7 +884,11 @@ async fn disown_unused_replicas() {

cluster.composer().pause(&node).await.unwrap();
volumes_api
.del_volume_target(&volume.spec.uuid, Some(false))
.del_volume_target(
&volume.spec.uuid,
Some(false),
Some(cluster.csi_node(0).as_str()),
)
.await
.expect_err("io-engine is down");
cluster.composer().kill(&node).await.unwrap();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,11 @@ async fn unused_reconcile(cluster: &Cluster) {
cluster.composer().kill(&nexus_node.id).await.unwrap();
// 2. now we force unpublish the volume
volumes_api
.del_volume_target(&volume.spec.uuid, Some(true))
.del_volume_target(
&volume.spec.uuid,
Some(true),
Some(cluster.csi_node(0).as_str()),
)
.await
.unwrap();
// 3. publish on the previously unused node
Expand Down
22 changes: 11 additions & 11 deletions control-plane/agents/src/bin/core/tests/volume/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ async fn nexus_persistence_test_iteration(
let nexus_uuid = nexus.uuid.clone();

volume_client
.unpublish(&UnpublishVolume::new(&volume_state.uuid, false), None)
.unpublish(&UnpublishVolume::new(&volume_state.uuid, false, None), None)
.await
.unwrap();

Expand Down Expand Up @@ -379,7 +379,7 @@ async fn publishing_test(cluster: &Cluster) {
.expect_err("The Volume cannot be published again because it's already published");

volume_client
.unpublish(&UnpublishVolume::new(&volume_state.uuid, false), None)
.unpublish(&UnpublishVolume::new(&volume_state.uuid, false, None), None)
.await
.unwrap();

Expand Down Expand Up @@ -447,7 +447,7 @@ async fn publishing_test(cluster: &Cluster) {
.expect_err("The volume is already published");

volume_client
.unpublish(&UnpublishVolume::new(&volume_state.uuid, false), None)
.unpublish(&UnpublishVolume::new(&volume_state.uuid, false, None), None)
.await
.unwrap();

Expand Down Expand Up @@ -491,12 +491,12 @@ async fn publishing_test(cluster: &Cluster) {
cluster.composer().kill(target_node.as_str()).await.unwrap();

volume_client
.unpublish(&UnpublishVolume::new(&volume_state.uuid, false), None)
.unpublish(&UnpublishVolume::new(&volume_state.uuid, false, None), None)
.await
.expect_err("The node is not online...");

volume_client
.unpublish(&UnpublishVolume::new(&volume_state.uuid, true), None)
.unpublish(&UnpublishVolume::new(&volume_state.uuid, true, None), None)
.await
.expect("With force comes great responsibility...");

Expand Down Expand Up @@ -747,7 +747,7 @@ async fn replica_count_test(cluster: &Cluster) {

let volume_state = volume.state();
volume_client
.unpublish(&UnpublishVolume::new(&volume_state.uuid, false), None)
.unpublish(&UnpublishVolume::new(&volume_state.uuid, false, None), None)
.await
.unwrap();

Expand Down Expand Up @@ -927,7 +927,7 @@ async fn publish_unpublish(cluster: &Cluster) {
// Unpublish the volume2
let _ = volume_client
.unpublish(
&UnpublishVolume::new(&VolumeId::try_from(VOLUME_2).unwrap(), false),
&UnpublishVolume::new(&VolumeId::try_from(VOLUME_2).unwrap(), false, None),
None,
)
.await
Expand All @@ -936,7 +936,7 @@ async fn publish_unpublish(cluster: &Cluster) {
// Unpublish the volume1
let _ = volume_client
.unpublish(
&UnpublishVolume::new(&VolumeId::try_from(VOLUME_1).unwrap(), false),
&UnpublishVolume::new(&VolumeId::try_from(VOLUME_1).unwrap(), false, None),
None,
)
.await
Expand Down Expand Up @@ -983,14 +983,14 @@ async fn target_distribution(cluster: &Cluster) {
// Cleanup
let _ = volume_client
.unpublish(
&UnpublishVolume::new(&VolumeId::try_from(VOLUME_1).unwrap(), false),
&UnpublishVolume::new(&VolumeId::try_from(VOLUME_1).unwrap(), false, None),
None,
)
.await
.expect("The volume should be unpublished");
let _ = volume_client
.unpublish(
&UnpublishVolume::new(&VolumeId::try_from(VOLUME_2).unwrap(), false),
&UnpublishVolume::new(&VolumeId::try_from(VOLUME_2).unwrap(), false, None),
None,
)
.await
Expand Down Expand Up @@ -1063,7 +1063,7 @@ async fn offline_node(cluster: &Cluster) {
// Unpublish volume2
let _ = volume_client
.unpublish(
&UnpublishVolume::new(&VolumeId::try_from(VOLUME_2).unwrap(), false),
&UnpublishVolume::new(&VolumeId::try_from(VOLUME_2).unwrap(), false, None),
None,
)
.await
Expand Down
19 changes: 19 additions & 0 deletions control-plane/agents/src/bin/core/volume/operations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,25 @@ impl ResourcePublishing for OperationGuardArc<VolumeSpec> {
.await?;

let volume_target = spec_clone.target().expect("already validated");
if let Some(unpublishing_node) = request.frontend_host() {
if let Some(tgt_cfg) = spec_clone.active_config() {
if !tgt_cfg
.frontend()
.nodename_allowed(unpublishing_node.as_str())
{
self.validate_update_step(
registry,
Err(SvcError::FrontendNodeNotAllowed {
node: unpublishing_node.to_string(),
vol_id: request.uuid.to_string(),
}),
&spec_clone,
)
.await?;
}
}
}

let result = match specs.nexus_opt(volume_target.nexus()).await? {
None => Ok(()),
Some(mut nexus) => {
Expand Down
4 changes: 3 additions & 1 deletion control-plane/csi-driver/src/bin/controller/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ impl From<clients::tower::Error<RestJsonError>> for ApiClientError {
StatusCode::PRECONDITION_FAILED => Self::PreconditionFailed(detailed),
StatusCode::BAD_REQUEST => Self::InvalidArgument(detailed),
StatusCode::NOT_ACCEPTABLE => Self::NotAcceptable(detailed),
StatusCode::UNAUTHORIZED => Self::PreconditionFailed(detailed),
status => Self::GenericOperation(status, detailed),
}
}
Expand Down Expand Up @@ -365,11 +366,12 @@ impl RestApiClient {
&self,
volume_id: &uuid::Uuid,
force: bool,
frontend_host: Option<&str>,
) -> Result<(), ApiClientError> {
Self::delete_idempotent(
self.rest_client
.volumes_api()
.del_volume_target(volume_id, Some(force))
.del_volume_target(volume_id, Some(force), frontend_host)
.await,
true,
)?;
Expand Down
13 changes: 8 additions & 5 deletions control-plane/csi-driver/src/bin/controller/controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -605,15 +605,18 @@ impl rpc::csi::controller_server::Controller for CsiControllerSvc {
return Ok(Response::new(ControllerUnpublishVolumeResponse {}));
}

// Do forced volume upublish as Kubernetes already detached the volume.
// Do forced volume unpublish as Kubernetes already detached the volume.
RestApiClient::get_client()
.unpublish_volume(&volume_uuid, true)
.unpublish_volume(&volume_uuid, true, Some(args.node_id.as_str()))
.await
.map_err(|e| {
Status::not_found(format!(
.map_err(|e| match e {
ApiClientError::PreconditionFailed(_) => {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this feels somewhat wrong, what if target was not deleted because of some other precondition error.
Maybe we need a different error here that is clearer? Something more similar to NOT_FOUND perhaps? Given that the target is not available for this node.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about NotAcceptable? I intended to do that earlier

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that seems good. wdyt @Abhinandan-Purkait ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep. Although there is one more mapping of not acceptable but looking at other options this sounds reasonable.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can also use the error body for additional info if we need to do additional mapping

Status::ok("Ignoring precondition failure on unpublish")
}
_ => Status::not_found(format!(
"Failed to unpublish volume {}, error = {:?}",
&args.volume_id, e
))
)),
})?;

debug!("Volume {} successfully unpublished", args.volume_id);
Expand Down
2 changes: 2 additions & 0 deletions control-plane/grpc/proto/v1/volume/volume.proto
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,8 @@ message UnpublishVolumeRequest {
// the nexus. Note: this option should be used only when we know the node will not become
// accessible again and it is safe to do so.
bool force = 2;
// frontend host requesting for volume unpublish.
optional string frontend_host = 3;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In-keeping with publish

Suggested change
optional string frontend_host = 3;
repeated string frontend_nodes = 3;

}

// Share Volume request
Expand Down
16 changes: 15 additions & 1 deletion control-plane/grpc/src/operations/volume/traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1618,6 +1618,8 @@ pub trait UnpublishVolumeInfo: Send + Sync + std::fmt::Debug {
fn uuid(&self) -> VolumeId;
/// Force unpublish
fn force(&self) -> bool;
/// Frontend node requesting unpublish.
fn frontend_host(&self) -> Option<NodeId>;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
fn frontend_host(&self) -> Option<NodeId>;
fn frontend_nodes(&self) -> Vec<NodeId>;

}

impl UnpublishVolumeInfo for UnpublishVolume {
Expand All @@ -1628,6 +1630,10 @@ impl UnpublishVolumeInfo for UnpublishVolume {
fn force(&self) -> bool {
self.force()
}

fn frontend_host(&self) -> Option<NodeId> {
self.frontend_host().clone()
}
}

/// Intermediate structure that validates the conversion to UnpublishVolumeRequest type.
Expand All @@ -1644,6 +1650,13 @@ impl UnpublishVolumeInfo for ValidatedUnpublishVolumeRequest {
fn force(&self) -> bool {
self.inner.force
}

fn frontend_host(&self) -> Option<NodeId> {
self.inner
.frontend_host
.clone()
.map(|target_node| target_node.into())
}
}

impl ValidateRequestTypes for UnpublishVolumeRequest {
Expand All @@ -1658,7 +1671,7 @@ impl ValidateRequestTypes for UnpublishVolumeRequest {

impl From<&dyn UnpublishVolumeInfo> for UnpublishVolume {
fn from(data: &dyn UnpublishVolumeInfo) -> Self {
UnpublishVolume::new(&data.uuid(), data.force())
UnpublishVolume::new(&data.uuid(), data.force(), data.frontend_host())
}
}

Expand All @@ -1667,6 +1680,7 @@ impl From<&dyn UnpublishVolumeInfo> for UnpublishVolumeRequest {
Self {
uuid: Some(data.uuid().to_string()),
force: data.force(),
frontend_host: data.frontend_host().map(|n| n.into()),
}
}
}
Expand Down
7 changes: 7 additions & 0 deletions control-plane/rest/openapi-specs/v0_api_spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1710,6 +1710,13 @@ paths:
schema:
type: boolean
default: false
- in: query
name: frontend_host
description: |-
Frontend host requesting the volume unpublish.
required: false
schema:
type: string
responses:
'200':
description: OK
Expand Down
8 changes: 6 additions & 2 deletions control-plane/rest/service/src/v0/volumes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,15 @@ impl apis::actix_server::Volumes for RestApi {

async fn del_volume_target(
Path(volume_id): Path<Uuid>,
Query(force): Query<Option<bool>>,
Query((force, frontend_host)): Query<(Option<bool>, Option<String>)>,
) -> Result<models::Volume, RestError<RestJsonError>> {
let volume = client()
.unpublish(
&UnpublishVolume::new(&volume_id.into(), force.unwrap_or(false)),
&UnpublishVolume::new(
&volume_id.into(),
force.unwrap_or(false),
frontend_host.map(|n| n.into()),
),
None,
)
.await?;
Expand Down
2 changes: 1 addition & 1 deletion control-plane/rest/tests/v0_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ async fn client_test(cluster: &Cluster, auth: &bool) {

let volume = client
.volumes_api()
.del_volume_target(&volume_state.uuid, None)
.del_volume_target(&volume_state.uuid, None, Some(cluster.csi_node(0).as_str()))
.await
.unwrap();
tracing::info!("Volume: {:#?}", volume);
Expand Down
9 changes: 8 additions & 1 deletion control-plane/stor-port/src/types/v0/transport/volume.rs
Original file line number Diff line number Diff line change
Expand Up @@ -648,19 +648,26 @@ pub struct UnpublishVolume {
/// the nexus. Note: this option should be used only when we know the node will not become
/// accessible again and it is safe to do so.
force: bool,
/// The frontend node that was connected to volume and is unpublishing now.
frontend_host: Option<NodeId>,
}
impl UnpublishVolume {
/// Create a new `UnpublishVolume` for the given uuid.
pub fn new(uuid: &VolumeId, force: bool) -> Self {
pub fn new(uuid: &VolumeId, force: bool, frontend_host: Option<NodeId>) -> Self {
Self {
uuid: uuid.clone(),
force,
frontend_host,
}
}
/// It's a force `Self`.
pub fn force(&self) -> bool {
self.force
}
/// frontend host requesting unpublish of volume.
pub fn frontend_host(&self) -> Option<NodeId> {
self.frontend_host.clone()
}
}

/// Share Volume request.
Expand Down
8 changes: 7 additions & 1 deletion tests/bdd/features/csi/controller/controller.feature
Original file line number Diff line number Diff line change
Expand Up @@ -112,4 +112,10 @@ Scenario: list existing volumes with pagination max entries set to 0
Given 2 existing volumes
When a ListVolumesRequest is sent to CSI controller with max_entries set to 0
Then all volumes should be returned
And the next token should be empty
And the next token should be empty

Scenario: unpublish volume from non-frontend node
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should cover the returned errors when the nodes don't match

Given a 2 replica volume published on a node
When the ControllerUnpublishVolume request arrives from a non-frontend node
Then nvmf target which exposes the volume should not be destroyed
And volume should remain as published
Loading