diff --git a/iml-agent/src/action_plugins/lustre/snapshot.rs b/iml-agent/src/action_plugins/lustre/snapshot.rs
index f3b7f92d62..daf029fe9e 100644
--- a/iml-agent/src/action_plugins/lustre/snapshot.rs
+++ b/iml-agent/src/action_plugins/lustre/snapshot.rs
@@ -2,7 +2,10 @@
 // Use of this source code is governed by a MIT-style
 // license that can be found in the LICENSE file.
 
-use crate::{agent_error::ImlAgentError, lustre::lctl};
+use crate::{
+    agent_error::ImlAgentError,
+    lustre::{lctl, lctl_retry},
+};
 use combine::{stream::easy, EasyParser};
 use iml_wire_types::snapshot::{Create, Destroy, List, Mount, Snapshot, Unmount};
 
@@ -14,7 +17,7 @@ pub async fn list(l: List) -> Result<Vec<Snapshot>, ImlAgentError> {
         args.push("--name");
         args.push(name);
     }
-    let stdout = lctl(args).await?;
+    let stdout = lctl_retry(args).await?;
     let stdout = stdout.trim();
 
     if stdout.is_empty() {
@@ -53,7 +56,7 @@ pub async fn create(c: Create) -> Result<(), ImlAgentError> {
         args.push(cmnt);
     }
 
-    lctl(args).await.map(drop)
+    lctl_retry(args).await.map(drop)
 }
 
 pub async fn destroy(d: Destroy) -> Result<(), ImlAgentError> {
@@ -61,7 +64,7 @@ pub async fn destroy(d: Destroy) -> Result<(), ImlAgentError> {
     if d.force {
         args.push("--force");
     }
-    lctl(args).await.map(drop)
+    lctl_retry(args).await.map(drop)
 }
 
 pub async fn mount(m: Mount) -> Result<(), ImlAgentError> {
diff --git a/iml-agent/src/daemon_plugins/snapshot.rs b/iml-agent/src/daemon_plugins/snapshot.rs
index 878130cebc..c46779c161 100644
--- a/iml-agent/src/daemon_plugins/snapshot.rs
+++ b/iml-agent/src/daemon_plugins/snapshot.rs
@@ -20,6 +20,8 @@ use futures::{
     lock::Mutex,
     Future, FutureExt,
 };
+use iml_cmd::CmdError;
+use iml_tracing::tracing;
 use iml_wire_types::snapshot::{List, Snapshot};
 use std::collections::BTreeSet;
 use std::{pin::Pin, sync::Arc, time::Duration};
@@ -50,7 +52,7 @@ async fn list() -> Result<Vec<Snapshot>, ()> {
     let fss: Vec<String> = lctl(vec!["get_param", "-N", "mgs.MGS.live.*"])
         .await
         .map_err(|e| {
-            // XXX debug because of false positives
+            // XXX debug because of false positives. But this is still a failure.
             tracing::debug!("listing filesystems failed: {}", e);
         })
         .map(|o| {
@@ -87,14 +89,33 @@ async fn list() -> Result<Vec<Snapshot>, ()> {
     let really_failed_fss = errs
         .into_iter()
         .map(|x| x.unwrap_err())
-        .filter(|x| !snapshot_fsnames.contains(&x.0))
+        .filter(|x| {
+            tracing::debug!("listing for {} failed: {:?}", x.0, x.1);
+
+            // A failed listing on a filesystem that is itself a snapshot is not counted.
+            if snapshot_fsnames.contains(&x.0) {
+                tracing::debug!("{} is a snapshot FS", x.0);
+                return false;
+            }
+
+            match &x.1 {
+                ImlAgentError::CmdError(CmdError::Output(o)) => {
+                    // XXX lctl returns 1 no matter what, so have to read its output:
+                    // a "Miss MDT0 in the config file" message is not counted as a failure.
+                    let stderr = String::from_utf8_lossy(&o.stderr);
+                    !stderr.contains("Miss MDT0 in the config file")
+                }
+                _ => true,
+            }
+        })
         .collect::<Vec<_>>();
 
-    if !really_failed_fss.is_empty() {
-        // XXX debug because of false positives
-        tracing::debug!("listing failed: {:?}", really_failed_fss);
+    if really_failed_fss.is_empty() {
+        Ok(snaps)
+    } else {
+        tracing::error!("listing failed: {:?}", really_failed_fss);
+        Err(())
     }
-    Ok(snaps)
 }
 
 #[async_trait]
diff --git a/iml-agent/src/lustre.rs b/iml-agent/src/lustre.rs
index d896b27181..3a9f12bc99 100644
--- a/iml-agent/src/lustre.rs
+++ b/iml-agent/src/lustre.rs
@@ -4,13 +4,52 @@
 
 use crate::agent_error::ImlAgentError;
 use futures::TryFutureExt;
-use iml_cmd::{CheckedCommandExt, Command};
+use iml_cmd::{CheckedCommandExt, CmdError, Command};
 use liblustreapi::LlapiFid;
-use std::ffi::OsStr;
-use tokio::task::spawn_blocking;
+use std::{ffi::OsStr, process::Output, time::Duration};
+use tokio::{task::spawn_blocking, time::delay_for};
 
-/// Runs lctl with given arguments
+/// Runs lctl with the given arguments, retrying while the failure looks transient.
+///
+/// Whenever lctl fails with "Resource temporarily unavailable" on stderr, the call is
+/// repeated after a 3 second pause; the number of attempts is not bounded.
+pub async fn lctl_retry<I, S>(args: I) -> Result<String, ImlAgentError>
+where
+    I: IntoIterator<Item = S> + Clone,
+    S: AsRef<OsStr>,
+{
+    loop {
+        let r = invoke_lctl(args.clone()).await;
+
+        if let Err(CmdError::Output(o)) = &r {
+            let stderr = String::from_utf8_lossy(&o.stderr);
+            if stderr.contains("Resource temporarily unavailable") {
+                const DUR: Duration = Duration::from_secs(3);
+                tracing::debug!("{}, waiting {:?} ...", stderr.trim(), DUR);
+                delay_for(DUR).await;
+                continue;
+            }
+        }
+        break r
+            .map_err(|e| e.into())
+            .map(|o| String::from_utf8_lossy(&o.stdout).to_string());
+    }
+}
+
+/// Runs lctl once with the given arguments and returns its stdout (lossily decoded as UTF-8)
 pub async fn lctl<I, S>(args: I) -> Result<String, ImlAgentError>
+where
+    I: IntoIterator<Item = S>,
+    S: AsRef<OsStr>,
+{
+    invoke_lctl(args)
+        .await
+        .map_err(|e| e.into())
+        .map(|o| String::from_utf8_lossy(&o.stdout).to_string())
+}
+
+/// Spawns /usr/sbin/lctl with the given arguments and hands back the unconverted result
+async fn invoke_lctl<I, S>(args: I) -> Result<Output, CmdError>
 where
     I: IntoIterator<Item = S>,
     S: AsRef<OsStr>,
@@ -18,9 +57,7 @@ where
     Command::new("/usr/sbin/lctl")
         .args(args)
         .checked_output()
-        .err_into()
         .await
-        .map(|o| String::from_utf8_lossy(&o.stdout).to_string())
 }
 
 /// Returns LlapiFid for a given device or mount path