chore(bors): merge pull request #887
887: Fix regression for pool creation timeout retry r=tiagolobocastro a=tiagolobocastro

    test: use tmp in project workspace

    Use a tmp folder from the workspace, allowing us to clean up things like
    LVM volumes much more easily, as we can simply purge it.

    Signed-off-by: Tiago Castro <tiagolobocastro@gmail.com>

---

    test(pool): create on very large or very slow disks

    Uses LVM Lvols as backend devices for the pool.
    We suspend these before pool creation, allowing us to simulate slow
    pool creation.
    This test ensures that the pool creation completes on its own, and also
    that a client can complete it by calling create again.

    Signed-off-by: Tiago Castro <tiagolobocastro@gmail.com>
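
    The suspend trick above is what makes the "slow disk" reproducible. As a rough
    sketch of the mechanism (illustrative only, not the deployer_cluster::lvm helper
    used by the test further down, whose implementation is not shown in this diff):
    an LVM logical volume is a device-mapper device, and suspending it with dmsetup
    makes all I/O to it block until the device is resumed.

    use std::process::Command;

    // Suspend a device-mapper device so that I/O to it blocks (simulates a hung
    // or extremely slow disk). `dm_name` is the dm name, e.g. "slow--pool-lvol0"
    // for LV "lvol0" in VG "slow-pool" (hyphens in LVM names are doubled).
    fn dm_suspend(dm_name: &str) -> std::io::Result<std::process::ExitStatus> {
        Command::new("dmsetup").args(["suspend", dm_name]).status()
    }

    // Resume the device, letting the blocked I/O (and the pool creation) finish.
    fn dm_resume(dm_name: &str) -> std::io::Result<std::process::ExitStatus> {
        Command::new("dmsetup").args(["resume", dm_name]).status()
    }

    fn main() -> std::io::Result<()> {
        dm_suspend("slow--pool-lvol0")?;
        // ... issue the pool create here; it will hang until the device resumes ...
        dm_resume("slow--pool-lvol0")?;
        Ok(())
    }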

---

    fix: allow pool creation to complete asynchronously

    When the initial create gRPC call times out, the data-plane may still be creating
    the pool in the background, which can happen for very large pools.
    Rather than assume failure, we allow the creation to complete in the background,
    for up to a large arbitrary amount of time. If the pool creation completes within
    that window, we retry the creation flow.
    The reason we don't simply use very large timeouts is that the gRPC operations
    are currently sequential, mostly for historical reasons.
    Now that the data-plane allows concurrent calls, we should also allow this
    on the control-plane.
    TODO: allow concurrent node operations

    Signed-off-by: Tiago Castro <tiagolobocastro@gmail.com>
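
    In outline, the policy this introduces (the full change is in
    operations_helper.rs further down) keeps a pool spec in `Creating` and keeps
    retrying while the elapsed time since the recorded creation timestamp stays
    within a configurable window (15 minutes by default), and only then gives up
    and lets the pool be garbage collected. A minimal sketch of that decision,
    reusing the names from the diff (`creat_tsc`, `pool_async_creat_tmo`); this is
    an illustration, not the exact control-plane code:

    use std::time::{Duration, SystemTime};

    /// Simplified stand-in for the retry gate: `true` means "leave the pool as
    /// `Creating` and let the reconciler retry", `false` means "give up and let
    /// the pool be garbage collected".
    fn keep_retrying(creat_tsc: Option<SystemTime>, pool_async_creat_tmo: Duration) -> bool {
        match creat_tsc.and_then(|t| t.elapsed().ok()) {
            // Still within the allowed window: the io-engine may yet finish the create.
            Some(elapsed) => elapsed <= pool_async_creat_tmo,
            // No creation timestamp recorded: treat the create as failed.
            None => false,
        }
    }

    fn main() {
        let five_minutes_ago = SystemTime::now() - Duration::from_secs(5 * 60);
        assert!(keep_retrying(Some(five_minutes_ago), Duration::from_secs(15 * 60)));
        assert!(!keep_retrying(None, Duration::from_secs(15 * 60)));
    }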

---

    fix: check for correct not found error code

    A previous fix ended up not working correctly because it was merged
    incorrectly, somehow!

    Signed-off-by: Tiago Castro <tiagolobocastro@gmail.com>

---

    chore: update terraform node prep

    Pull the Release key from a recent k8s version since the old keys are no
    longer valid.
    This will have to be updated from time to time.

Co-authored-by: Tiago Castro <tiagolobocastro@gmail.com>
mayastor-bors and tiagolobocastro committed Nov 26, 2024
1 parent 522139b commit 867e7c0
Showing 48 changed files with 495 additions and 382 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -24,3 +24,4 @@ __pycache__
/.rust-toolchain
/rpc/mayastor-api
/local-fio-0-verify.state
/.tmp
@@ -53,6 +53,7 @@ impl TaskPoller for PoolReconciler {
results.push(Self::squash_results(vec![
pool.garbage_collect(context).await,
pool.recreate_state(context).await,
retry_create_pool_reconciler(&mut pool, context).await,
]))
}
capacity::remove_larger_replicas(context.registry()).await;
@@ -214,3 +215,12 @@ async fn deleting_pool_spec_reconciler(
))
.await
}

#[tracing::instrument(skip(pool, context), level = "trace", fields(pool.id = %pool.id(), request.reconcile = true))]
async fn retry_create_pool_reconciler(
pool: &mut OperationGuardArc<PoolSpec>,
context: &PollContext,
) -> PollResult {
pool.retry_creating(context.registry()).await?;
Ok(PollerState::Idle)
}
11 changes: 11 additions & 0 deletions control-plane/agents/src/bin/core/controller/registry.rs
@@ -86,6 +86,11 @@ pub(crate) struct RegistryInner<S: Store> {
/// The duration for which the reconciler waits for the replica to
/// be healthy again before attempting to online the faulted child.
faulted_child_wait_period: Option<std::time::Duration>,
/// When the pool creation gRPC times out, the actual call in the io-engine
/// may still progress.
/// We wait up to this period before considering the operation a failure and
/// GC'ing the pool.
pool_async_creat_tmo: std::time::Duration,
/// Disable partial rebuild for volume targets.
disable_partial_rebuild: bool,
/// Disable nvmf target access control gates.
@@ -122,6 +127,7 @@ impl Registry {
reconcile_period: std::time::Duration,
reconcile_idle_period: std::time::Duration,
faulted_child_wait_period: Option<std::time::Duration>,
pool_async_creat_tmo: std::time::Duration,
disable_partial_rebuild: bool,
disable_target_acc: bool,
max_rebuilds: Option<NumRebuilds>,
@@ -165,6 +171,7 @@ impl Registry {
reconcile_period,
reconcile_idle_period,
faulted_child_wait_period,
pool_async_creat_tmo,
disable_partial_rebuild,
disable_target_acc,
reconciler: ReconcilerControl::new(),
@@ -298,6 +305,10 @@ impl Registry {
pub(crate) fn faulted_child_wait_period(&self) -> Option<std::time::Duration> {
self.faulted_child_wait_period
}
/// Allow for this given time before assuming failure and allowing the pool to get deleted.
pub(crate) fn pool_async_creat_tmo(&self) -> std::time::Duration {
self.pool_async_creat_tmo
}
/// The maximum number of concurrent create volume requests.
pub(crate) fn create_volume_limit(&self) -> usize {
self.create_volume_limit
@@ -54,9 +54,9 @@ enum SpecError {
}

/// What to do when creation fails.
#[derive(Debug)]
pub(crate) enum OnCreateFail {
/// Leave object as `Creating`, could allow for frontend retries.
#[allow(unused)]
LeaveAsIs,
/// When frontend retries don't make sense, set it to deleting so we can clean-up.
SetDeleting,
@@ -71,7 +71,28 @@ impl OnCreateFail {
pub(crate) fn eeinval_delete<O>(result: &Result<O, SvcError>) -> Self {
match result {
Err(error) if error.tonic_code() == tonic::Code::InvalidArgument => Self::Delete,
Err(error) if error.tonic_code() == tonic::Code::NotFound => Self::Delete,
_ => Self::SetDeleting,
}
}
/// Map errors into `Self` for pool creation requests, specifically.
pub(crate) fn on_pool_create_err<O>(result: &Result<O, SvcError>) -> Self {
let Err(ref error) = result else {
// nonsensical but that's how the api is today...
return Self::SetDeleting;
};
match error.tonic_code() {
// 1. the disk is open by another pool or bdev
// 2. the disk contains a pool with another name
tonic::Code::InvalidArgument => Self::Delete,
// 1. the pool disk is not available (ie not found or broken somehow)
tonic::Code::NotFound => Self::Delete,
// In this case, it's the pool operator's job to attempt re-creation of the pool.
// 1. pre-2.6 dataplane, contention on the pool service
// 2. pool disk is very slow or extremely large
// 3. dataplane core is shared with other processes
// TODO: use higher timeout on larger pool sizes or potentially make this
// an async operation.
tonic::Code::Cancelled => Self::LeaveAsIs,
_ => Self::SetDeleting,
}
}
8 changes: 8 additions & 0 deletions control-plane/agents/src/bin/core/main.rs
@@ -47,6 +47,13 @@ pub(crate) struct CliArgs {
#[clap(long)]
pub(crate) faulted_child_wait_period: Option<humantime::Duration>,

/// When the pool creation gRPC times out, the actual call in the io-engine
/// may still progress.
/// We wait up to this period before considering the operation a failure and
/// GC'ing the pool.
#[clap(long, default_value = "15m")]
pub(crate) pool_async_creat_tmo: humantime::Duration,

/// Disable partial rebuild for volume targets.
#[clap(long, env = "DISABLE_PARTIAL_REBUILD")]
pub(crate) disable_partial_rebuild: bool,
@@ -194,6 +201,7 @@ async fn server(cli_args: CliArgs) -> anyhow::Result<()> {
cli_args.reconcile_period.into(),
cli_args.reconcile_idle_period.into(),
cli_args.faulted_child_wait_period.map(|t| t.into()),
cli_args.pool_async_creat_tmo.into(),
cli_args.disable_partial_rebuild,
cli_args.disable_target_acc,
cli_args.max_rebuilds,
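An aside, not part of the diff: the "15m" default above is parsed by the humantime crate backing the clap argument and then converted with `.into()`, as `cli_args.pool_async_creat_tmo.into()` does in the hunk above. A small sketch of that round trip (assumes the humantime crate is available as a dependency):

fn main() {
    // "15m" -> humantime::Duration -> std::time::Duration, mirroring the CLI flag.
    let tmo: humantime::Duration = "15m".parse().expect("valid humantime literal");
    let tmo: std::time::Duration = tmo.into();
    assert_eq!(tmo, std::time::Duration::from_secs(15 * 60));
}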
73 changes: 68 additions & 5 deletions control-plane/agents/src/bin/core/pool/operations_helper.rs
@@ -1,14 +1,77 @@
use crate::controller::{
registry::Registry,
resources::{operations::ResourceLifecycle, OperationGuardArc},
use crate::{
controller::{
io_engine::PoolApi,
registry::Registry,
resources::{
operations::ResourceLifecycle,
operations_helper::{GuardedOperationsHelper, OnCreateFail, SpecOperationsHelper},
OperationGuardArc,
},
},
node::wrapper::GetterOps,
};
use agents::{errors, errors::SvcError};
use snafu::OptionExt;
use std::ops::Deref;
use stor_port::types::v0::{
store::replica::{PoolRef, ReplicaSpec},
transport::{DestroyReplica, NodeId, ReplicaOwners},
store::{
pool::PoolSpec,
replica::{PoolRef, ReplicaSpec},
},
transport::{CreatePool, DestroyReplica, NodeId, ReplicaOwners},
};

impl OperationGuardArc<PoolSpec> {
/// Retries the creation of the pool which is being done in the background by the io-engine.
/// This may happen if the pool create gRPC times out, for very large pools.
/// We could increase the timeout but as things stand today that would block all gRPC
/// access to the node.
/// TODO: Since the data-plane now allows concurrent gRPC calls, we should also modify the
/// control-plane to allow this, which would allow us to set large timeouts for some gRPCs.
pub(crate) async fn retry_creating(&mut self, registry: &Registry) -> Result<(), SvcError> {
let request = {
let spec = self.lock();
if on_create_fail(&spec, registry).is_some() {
return Ok(());
}
CreatePool::from(spec.deref())
};

let node = registry.node_wrapper(&request.node).await?;
if node.pool(&request.id).await.is_none() {
return Ok(());
}

let _ = self.start_create(registry, &request).await?;
let result = node.create_pool(&request).await;
let _state = self
.complete_create(result, registry, OnCreateFail::LeaveAsIs)
.await?;

Ok(())
}

/// Get the `OnCreateFail` policy.
/// For more information see [`Self::retry_creating`].
pub(crate) fn on_create_fail(&self, registry: &Registry) -> OnCreateFail {
let spec = self.lock();
on_create_fail(&spec, registry).unwrap_or(OnCreateFail::LeaveAsIs)
}
}

fn on_create_fail(pool: &PoolSpec, registry: &Registry) -> Option<OnCreateFail> {
if !pool.status().creating() {
return Some(OnCreateFail::LeaveAsIs);
}
let Some(last_mod_elapsed) = pool.creat_tsc.and_then(|t| t.elapsed().ok()) else {
return Some(OnCreateFail::SetDeleting);
};
if last_mod_elapsed > registry.pool_async_creat_tmo() {
return Some(OnCreateFail::SetDeleting);
}
None
}

impl OperationGuardArc<ReplicaSpec> {
/// Destroy the replica from its volume
pub(crate) async fn destroy_volume_replica(
16 changes: 6 additions & 10 deletions control-plane/agents/src/bin/core/pool/pool_operations.rs
@@ -65,8 +65,7 @@ impl ResourceLifecycle for OperationGuardArc<PoolSpec> {
let _ = pool.start_create(registry, request).await?;

let result = node.create_pool(request).await;
let on_fail = OnCreateFail::eeinval_delete(&result);

let on_fail = OnCreateFail::on_pool_create_err(&result);
let state = pool.complete_create(result, registry, on_fail).await?;
let spec = pool.lock().clone();
Ok(Pool::new(spec, Some(CtrlPoolState::new(state))))
@@ -93,14 +92,11 @@ impl ResourceLifecycle for OperationGuardArc<PoolSpec> {
Err(SvcError::PoolNotFound { .. }) => {
match node.import_pool(&self.as_ref().into()).await {
Ok(_) => node.destroy_pool(request).await,
Err(error)
if allow_not_found
&& error.tonic_code() == tonic::Code::InvalidArgument =>
{
Ok(())
}
Err(error) if error.tonic_code() == tonic::Code::InvalidArgument => Ok(()),
Err(error) => Err(error),
Err(error) => match error.tonic_code() {
tonic::Code::NotFound if allow_not_found => Ok(()),
tonic::Code::InvalidArgument => Ok(()),
_other => Err(error),
},
}
}
Err(error) => Err(error),
4 changes: 3 additions & 1 deletion control-plane/agents/src/bin/core/pool/specs.rs
@@ -425,7 +425,9 @@ impl ResourceSpecsLocked {
let pools = self.pools_rsc();
for pool in pools {
if let Ok(mut guard) = pool.operation_guard() {
if !guard.handle_incomplete_ops(registry).await {
let on_fail = guard.on_create_fail(registry);

if !guard.handle_incomplete_ops_ext(registry, on_fail).await {
// Not all pending operations could be handled.
pending_ops = true;
}
3 changes: 2 additions & 1 deletion control-plane/agents/src/bin/core/tests/controller/mod.rs
@@ -9,7 +9,7 @@ use stor_port::{
};

use serde_json::Value;
use std::str::FromStr;
use std::{str::FromStr, time::Duration};
use uuid::Uuid;

/// Test that the content of the registry is correctly loaded from the persistent store on start up.
@@ -227,6 +227,7 @@ async fn etcd_pagination() {
.with_rest(false)
.with_jaeger(false)
.with_store_lease_ttl(lease_ttl)
.with_req_timeouts(Duration::from_millis(200), Duration::from_millis(200))
.build()
.await
.unwrap();
1 change: 1 addition & 0 deletions control-plane/agents/src/bin/core/tests/deserializer.rs
@@ -151,6 +151,7 @@ fn test_deserialization_v1_to_v2() {
},
sequencer: Default::default(),
operation: None,
creat_tsc: None,
}),
},
TestEntry {
86 changes: 85 additions & 1 deletion control-plane/agents/src/bin/core/tests/pool/mod.rs
@@ -21,7 +21,10 @@ use stor_port::{
VolumePolicy,
},
},
store::replica::{ReplicaSpec, ReplicaSpecKey},
store::{
pool::PoolLabel,
replica::{ReplicaSpec, ReplicaSpecKey},
},
transport::{
CreatePool, CreateReplica, DestroyPool, DestroyReplica, Filter, GetSpecs, NexusId,
NodeId, Protocol, Replica, ReplicaId, ReplicaName, ReplicaOwners, ReplicaShareProtocol,
@@ -1035,3 +1038,84 @@ async fn destroy_after_restart() {

assert_eq!(pool.state().unwrap().id, create.id);
}

#[tokio::test]
async fn slow_create() {
const POOL_SIZE_BYTES: u64 = 128 * 1024 * 1024;

let vg = deployer_cluster::lvm::VolGroup::new("slow-pool", POOL_SIZE_BYTES).unwrap();
let lvol = vg.create_lvol("lvol0", POOL_SIZE_BYTES / 2).unwrap();
lvol.suspend().unwrap();
{
let cluster = ClusterBuilder::builder()
.with_io_engines(1)
.with_reconcile_period(Duration::from_millis(250), Duration::from_millis(250))
.with_cache_period("200ms")
.with_options(|o| o.with_io_engine_devices(vec![lvol.path()]))
.with_req_timeouts(Duration::from_millis(500), Duration::from_millis(500))
.compose_build(|b| b.with_clean(true))
.await
.unwrap();

let client = cluster.grpc_client();

let create = CreatePool {
node: cluster.node(0),
id: "bob".into(),
disks: vec![lvol.path().into()],
labels: Some(PoolLabel::from([("a".into(), "b".into())])),
};

let error = client
.pool()
.create(&create, None)
.await
.expect_err("device suspended");
assert_eq!(error.kind, ReplyErrorKind::Cancelled);

lvol.resume().unwrap();

let start = std::time::Instant::now();
let timeout = Duration::from_secs(30);
loop {
if std::time::Instant::now() > (start + timeout) {
panic!("Timeout waiting for the pool");
}
tokio::time::sleep(Duration::from_millis(100)).await;

let pools = client
.pool()
.get(Filter::Pool(create.id.clone()), None)
.await
.unwrap();

let Some(pool) = pools.0.first() else {
continue;
};
let Some(pool_spec) = pool.spec() else {
continue;
};
if !pool_spec.status.created() {
continue;
}
break;
}
let destroy = DestroyPool::from(create.clone());
client.pool().destroy(&destroy, None).await.unwrap();

// Now we try to recreate using an API call, rather than using the reconciler
lvol.suspend().unwrap();

let error = client
.pool()
.create(&create, None)
.await
.expect_err("device suspended");
assert_eq!(error.kind, ReplyErrorKind::Cancelled);

lvol.resume().unwrap();

let pool = client.pool().create(&create, None).await.unwrap();
assert!(pool.spec().unwrap().status.created());
}
}
@@ -8,7 +8,7 @@ struct DeviceDisconnect(nvmeadm::NvmeTarget);
impl Drop for DeviceDisconnect {
fn drop(&mut self) {
if self.0.disconnect().is_err() {
std::process::Command::new("sudo")
std::process::Command::new(env!("SUDO"))
.args(["nvme", "disconnect-all"])
.status()
.unwrap();
2 changes: 1 addition & 1 deletion control-plane/agents/src/bin/core/tests/volume/capacity.rs
@@ -89,7 +89,7 @@ struct DeviceDisconnect(nvmeadm::NvmeTarget);
impl Drop for DeviceDisconnect {
fn drop(&mut self) {
if self.0.disconnect().is_err() {
std::process::Command::new("sudo")
std::process::Command::new(env!("SUDO"))
.args(["nvme", "disconnect-all"])
.status()
.unwrap();
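A short aside on the env!("SUDO") change above (an illustration, not code from this commit): env! resolves the variable at compile time, so the test binary bakes in the sudo path provided by the build environment and fails to build if SUDO is missing. A runtime lookup would instead look like this:

// Runtime lookup of the sudo path, falling back to plain "sudo" when unset;
// the tests above instead fail at build time if SUDO is not provided.
fn sudo_path() -> String {
    std::env::var("SUDO").unwrap_or_else(|_| "sudo".to_string())
}

fn main() {
    println!("would invoke: {} nvme disconnect-all", sudo_path());
}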