whamcloud · ip1981 · Sep 29, 2020 · jgrund · Oct 20, 2020 · ip1981
diff --git a/iml-agent/src/action_plugins/lustre/snapshot.rs b/iml-agent/src/action_plugins/lustre/snapshot.rs
@@ -2,7 +2,10 @@
 // Use of this source code is governed by a MIT-style
 // license that can be found in the LICENSE file.
 
-use crate::{agent_error::ImlAgentError, lustre::lctl};
+use crate::{
+ agent_error::ImlAgentError,
+ lustre::{lctl, lctl_retry},
+};
 use combine::{stream::easy, EasyParser};
 use iml_wire_types::snapshot::{Create, Destroy, List, Mount, Snapshot, Unmount};
 
@@ -14,7 +17,7 @@ pub async fn list(l: List) -> Result<Vec<Snapshot>, ImlAgentError> {
  args.push("--name");
  args.push(name);
  }
- let stdout = lctl(args).await?;
+ let stdout = lctl_retry(args).await?;
  let stdout = stdout.trim();
 
  if stdout.is_empty() {
@@ -53,15 +56,15 @@ pub async fn create(c: Create) -> Result<(), ImlAgentError> {
  args.push(cmnt);
  }
 
- lctl(args).await.map(drop)
+ lctl_retry(args).await.map(drop)
 }
 
 pub async fn destroy(d: Destroy) -> Result<(), ImlAgentError> {
  let mut args = vec!["snapshot_destroy", "--fsname", &d.fsname, "--name", &d.name];
  if d.force {
  args.push("--force");
  }
- lctl(args).await.map(drop)
+ lctl_retry(args).await.map(drop)
 }
 
 pub async fn mount(m: Mount) -> Result<(), ImlAgentError> {

diff --git a/iml-agent/src/daemon_plugins/snapshot.rs b/iml-agent/src/daemon_plugins/snapshot.rs
@@ -20,6 +20,8 @@ use futures::{
  lock::Mutex,
  Future, FutureExt,
 };
+use iml_cmd::CmdError;
+use iml_tracing::tracing;
 use iml_wire_types::snapshot::{List, Snapshot};
 use std::collections::BTreeSet;
 use std::{pin::Pin, sync::Arc, time::Duration};
@@ -50,7 +52,7 @@ async fn list() -> Result<Vec<Snapshot>, ()> {
  let fss: Vec<String> = lctl(vec!["get_param", "-N", "mgs.MGS.live.*"])
  .await
  .map_err(|e| {
- // XXX debug because of false positives
+ // XXX logged at debug level because of false positives, but it is still reported as a failure.
  tracing::debug!("listing filesystems failed: {}", e);
  })
  .map(|o| {
@@ -87,14 +89,31 @@ async fn list() -> Result<Vec<Snapshot>, ()> {
  let really_failed_fss = errs
  .into_iter()
  .map(|x| x.unwrap_err())
- .filter(|x| !snapshot_fsnames.contains(&x.0))
+ .filter(|x| {
+ tracing::debug!("listing for {} failed: {:?}", x.0, x.1);
+
+ if snapshot_fsnames.contains(&x.0) {
+ tracing::debug!("{} is a snapshot FS", x.0);
+ return false;
+ }
+
+ match &x.1 {
+ ImlAgentError::CmdError(CmdError::Output(o)) => {
+ // XXX lctl exits with 1 whatever the failure is, so we have to inspect its stderr:
+ let stderr = String::from_utf8_lossy(&o.stderr);
+ stderr.find("Miss MDT0 in the config file").is_none()
+ }
+ _ => true,
+ }
+ })
  .collect::<Vec<_>>();
 
- if !really_failed_fss.is_empty() {
- // XXX debug because of false positives
- tracing::debug!("listing failed: {:?}", really_failed_fss);
+ if really_failed_fss.is_empty() {
+ Ok(snaps)
+ } else {
+ tracing::error!("listing failed: {:?}", really_failed_fss);
+ Err(())
  }
- Ok(snaps)
 }
 
 #[async_trait]

diff --git a/iml-agent/src/lustre.rs b/iml-agent/src/lustre.rs
@@ -4,23 +4,56 @@
 
 use crate::agent_error::ImlAgentError;
 use futures::TryFutureExt;
-use iml_cmd::{CheckedCommandExt, Command};
+use iml_cmd::{CheckedCommandExt, CmdError, Command};
 use liblustreapi::LlapiFid;
-use std::ffi::OsStr;
-use tokio::task::spawn_blocking;
+use std::{ffi::OsStr, process::Output, time::Duration};
+use tokio::{task::spawn_blocking, time::delay_for};
 
-/// Runs lctl with given arguments
+/// Insistently executes lctl with the given arguments: retries every 3 seconds, without limit, while stderr reports "Resource temporarily unavailable"
+pub async fn lctl_retry<I, S>(args: I) -> Result<String, ImlAgentError>
+where
+ I: IntoIterator<Item = S> + Clone,
+ S: AsRef<OsStr>,
+{
+ loop {
+ let r = invoke_lctl(args.clone()).await;
+
+ if let Err(CmdError::Output(o)) = &r {
+ let stderr = String::from_utf8_lossy(&o.stderr);
+ if stderr.find("Resource temporarily unavailable").is_some() {
+ const DUR: Duration = Duration::from_secs(3);
+ tracing::debug!("{}, waiting {:?} ...", stderr.trim(), DUR);
+ delay_for(DUR).await;
+ continue;
+ }
+ }
+ break r
+ .map_err(|e| e.into())
+ .map(|o| String::from_utf8_lossy(&o.stdout).to_string());
+ }
+}
+
+/// Executes lctl once with the given arguments and returns its stdout (lossily decoded as UTF-8)
 pub async fn lctl<I, S>(args: I) -> Result<String, ImlAgentError>
+where
+ I: IntoIterator<Item = S>,
+ S: AsRef<OsStr>,
+{
+ invoke_lctl(args)
+ .await
+ .map_err(|e| e.into())
+ .map(|o| String::from_utf8_lossy(&o.stdout).to_string())
+}
+
+async fn invoke_lctl<I, S>(args: I) -> Result<Output, CmdError>
 where
  I: IntoIterator<Item = S>,
  S: AsRef<OsStr>,
 {
  Command::new("/usr/sbin/lctl")
  .args(args)
  .checked_output()
- .err_into()
  .await
- .map(|o| String::from_utf8_lossy(&o.stdout).to_string())
 }
 
 /// Returns LlapiFid for a given device or mount path