feat(csi/node/timeout): add nvme-io-engine timeout and parse humantime
Adds a new parameter, "--nvme-io-timeout", which sets the io_timeout per NVMe block device.
TODO: check whether this is enough to avoid setting the global timeout.
Also parse "--nvme-core-io-timeout" as humantime.

Signed-off-by: Tiago Castro <tiagolobocastro@gmail.com>
tiagolobocastro committed Dec 6, 2023
1 parent 02e666b commit d6f3380
Showing 5 changed files with 96 additions and 13 deletions.
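
A minimal, illustrative sketch (not part of this commit) of the timeout parsing this change introduces: values are first tried as humantime (e.g. "30s", "2m") and, for "--nvme-core-io-timeout" only, fall back to a plain integer number of seconds. It assumes the humantime and anyhow crates, which the diff below already uses; the helper name parse_timeout_secs is hypothetical.

use std::str::FromStr;

/// Hypothetical helper mirroring the --nvme-core-io-timeout handling in main_.rs below:
/// accept humantime ("30s", "2m") or a bare integer interpreted as seconds.
fn parse_timeout_secs(value: &str) -> anyhow::Result<u32> {
    match humantime::Duration::from_str(value) {
        // humantime accepts suffixed values such as "30s" or "2m".
        Ok(human) => Ok(human.as_secs() as u32),
        // Fall back to a bare integer number of seconds.
        Err(_) => value
            .parse::<u32>()
            .map_err(|error| anyhow::anyhow!("expected humantime or integer seconds: {error}")),
    }
}

fn main() -> anyhow::Result<()> {
    assert_eq!(parse_timeout_secs("30s")?, 30);
    assert_eq!(parse_timeout_secs("2m")?, 120);
    assert_eq!(parse_timeout_secs("30")?, 30);
    Ok(())
}

Note that the new "--nvme-io-timeout" flag itself accepts only humantime (it is validated with humantime::Duration::from_str and rejected otherwise); the integer fallback is kept only for the pre-existing "--nvme-core-io-timeout" flag.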
34 changes: 33 additions & 1 deletion control-plane/csi-driver/src/bin/node/config.rs
@@ -18,6 +18,10 @@ pub fn nvme_keep_alive_tmo() -> String {
pub fn nvme_ctrl_loss_tmo() -> String {
Parameters::NvmeCtrlLossTmo.as_ref().to_kebab_case()
}
/// Command line arg name for `Parameters::NvmeIoTimeout`.
pub fn nvme_io_tmo() -> String {
Parameters::NvmeIoTimeout.as_ref().to_kebab_case()
}

/// Global configuration parameters.
#[derive(Debug, Default)]
@@ -42,17 +46,21 @@ pub(crate) struct NvmeConfig {
/// Default value for `ctrl_loss_tmo` when not specified via the volume parameters (sc).
ctrl_loss_tmo: Option<u32>,
keep_alive_tmo: Option<u32>,
/// Default value for `io_tmo` when not specified via the volume parameters (sc).
io_tmo: Option<humantime::Duration>,
}
impl NvmeConfig {
fn new(
nr_io_queues: Option<u32>,
ctrl_loss_tmo: Option<u32>,
keep_alive_tmo: Option<u32>,
io_tmo: Option<humantime::Duration>,
) -> Self {
Self {
nr_io_queues,
ctrl_loss_tmo,
keep_alive_tmo,
io_tmo,
}
}
/// Number of IO Queues.
@@ -68,6 +76,10 @@ impl NvmeConfig {
pub(crate) fn keep_alive_tmo(&self) -> Option<u32> {
self.keep_alive_tmo
}
/// The io timeout.
pub(crate) fn io_tmo(&self) -> Option<humantime::Duration> {
self.io_tmo
}
}

/// Get a mutex guard over the `Config`.
@@ -112,7 +124,22 @@ impl TryFrom<NvmeArgValues> for NvmeConfig {
error
)
})?;
Ok(Self::new(nvme_nr_ioq, ctrl_loss_tmo, keep_alive_tmo))
let nvme_io_tmo = Parameters::nvme_io_timeout(
src.0.get(Parameters::NvmeIoTimeout.as_ref()),
)
.map_err(|error| {
anyhow::anyhow!(
"Invalid value for {}, error = {}",
Parameters::NvmeIoTimeout.as_ref(),
error
)
})?;
Ok(Self::new(
nvme_nr_ioq,
ctrl_loss_tmo,
keep_alive_tmo,
nvme_io_tmo,
))
}
}
/// Nvme Arguments taken from the CSI volume calls (storage class parameters).
@@ -155,6 +182,11 @@ impl TryFrom<&ArgMatches> for NvmeArgValues {
map.0
.insert(Parameters::NvmeKeepAliveTmo.to_string(), value.to_string());
}

if let Some(value) = matches.get_one::<String>(&nvme_io_tmo()) {
map.0
.insert(Parameters::NvmeIoTimeout.to_string(), value.to_string());
}
Ok(map)
}
}
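
A small sketch (illustrative only, not part of this commit) of how the nvme_io_tmo() helper above derives the command line flag name from the parameter key. It assumes the strum and heck crates provide the AsRefStr derive and to_kebab_case conversion used here, which is how the rest of the file appears to work.

use heck::ToKebabCase;
use strum_macros::AsRefStr;

/// Reduced copy of the Parameters enum, just to show the name mapping.
#[derive(AsRefStr)]
#[strum(serialize_all = "camelCase")]
enum Parameters {
    NvmeIoTimeout,
}

fn main() {
    // Key as it appears in the volume context / storage class parameters...
    assert_eq!(Parameters::NvmeIoTimeout.as_ref(), "nvmeIoTimeout");
    // ...and the kebab-case flag name returned by nvme_io_tmo().
    assert_eq!(Parameters::NvmeIoTimeout.as_ref().to_kebab_case(), "nvme-io-timeout");
}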
14 changes: 7 additions & 7 deletions control-plane/csi-driver/src/bin/node/dev/nvmf.rs
@@ -34,7 +34,7 @@ pub(super) struct NvmfAttach {
port: u16,
uuid: Uuid,
nqn: String,
io_timeout: Option<u32>,
io_tmo: Option<u32>,
nr_io_queues: Option<u32>,
ctrl_loss_tmo: Option<u32>,
keep_alive_tmo: Option<u32>,
@@ -49,6 +49,7 @@ impl NvmfAttach {
uuid: Uuid,
nqn: String,
nr_io_queues: Option<u32>,
io_tmo: Option<humantime::Duration>,
ctrl_loss_tmo: Option<u32>,
keep_alive_tmo: Option<u32>,
hostnqn: Option<String>,
@@ -58,7 +59,7 @@ impl NvmfAttach {
port,
uuid,
nqn,
io_timeout: None,
io_tmo: io_tmo.map(|io_tmo| io_tmo.as_secs().try_into().unwrap_or(u32::MAX)),
nr_io_queues,
ctrl_loss_tmo,
keep_alive_tmo,
@@ -103,6 +104,7 @@ impl TryFrom<&Url> for NvmfAttach {
let nr_io_queues = config().nvme().nr_io_queues();
let ctrl_loss_tmo = config().nvme().ctrl_loss_tmo();
let keep_alive_tmo = config().nvme().keep_alive_tmo();
let io_tmo = config().nvme().io_tmo();

let hash_query: HashMap<_, _> = url.query_pairs().collect();
let hostnqn = hash_query.get("hostnqn").map(ToString::to_string);
@@ -113,6 +115,7 @@ impl TryFrom<&Url> for NvmfAttach {
uuid,
segments[0].to_string(),
nr_io_queues,
io_tmo,
ctrl_loss_tmo,
keep_alive_tmo,
hostnqn,
@@ -129,9 +132,6 @@ impl Attach for NvmfAttach {
let publish_context = PublishParams::try_from(context)
.map_err(|error| DeviceError::new(&error.to_string()))?;

if let Some(val) = publish_context.io_timeout() {
self.io_timeout = Some(*val);
}
if let Some(val) = publish_context.ctrl_loss_tmo() {
self.ctrl_loss_tmo = Some(*val);
}
@@ -158,7 +158,7 @@ impl Attach for NvmfAttach {
Err(NvmeError::SubsystemNotFound { .. }) => {
// The default reconnect delay in linux kernel is set to 10s. Use the
// same default value unless the timeout is less or equal to 10.
let reconnect_delay = match self.io_timeout {
let reconnect_delay = match self.io_tmo {
Some(io_timeout) => {
if io_timeout <= 10 {
Some(1)
Expand Down Expand Up @@ -199,7 +199,7 @@ impl Attach for NvmfAttach {
}

async fn fixup(&self) -> Result<(), DeviceError> {
let Some(io_timeout) = self.io_timeout else {
let Some(io_timeout) = self.io_tmo else {
return Ok(());
};

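
A brief sketch (illustrative only, not part of this commit) isolating the io_tmo conversion used in NvmfAttach::new above: the humantime duration is reduced to whole seconds and saturates at u32::MAX rather than panicking on overflow.

fn io_tmo_secs(io_tmo: Option<humantime::Duration>) -> Option<u32> {
    // Whole seconds, clamped to u32::MAX on overflow.
    io_tmo.map(|tmo| tmo.as_secs().try_into().unwrap_or(u32::MAX))
}

fn main() {
    let tmo: humantime::Duration = "33s".parse().expect("valid humantime");
    assert_eq!(io_tmo_secs(Some(tmo)), Some(33));
    assert_eq!(io_tmo_secs(None), None);
}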
29 changes: 24 additions & 5 deletions control-plane/csi-driver/src/bin/node/main_.rs
@@ -26,6 +26,7 @@ use std::{
future::Future,
io::ErrorKind,
pin::Pin,
str::FromStr,
sync::Arc,
task::{Context, Poll},
};
@@ -124,7 +125,13 @@ pub(super) async fn main() -> anyhow::Result<()> {
.long("nvme-core-io-timeout")
.value_name("TIMEOUT")
.required(false)
.help("Sets the global nvme_core module io_timeout, in seconds")
.help("Sets the global nvme_core module io_timeout, in seconds or humantime")
)
.arg(
Arg::new("nvme-io-timeout")
.long("nvme-io-timeout")
.required(false)
.help("Sets io_timeout for nvme block devices")
)
.arg(
Arg::new(crate::config::nvme_nr_io_queues())
@@ -162,6 +169,7 @@ pub(super) async fn main() -> anyhow::Result<()> {
.get_matches();

utils::print_package_info!();
println!("{:?}", env::args().collect::<Vec<String>>());

let endpoint = matches.get_one::<String>("grpc-endpoint").unwrap();
let csi_socket = matches
@@ -183,10 +191,21 @@ pub(super) async fn main() -> anyhow::Result<()> {
check_ana_and_label_node(matches.get_one::<String>("node-name").expect("required")).await?;
}

if let Some(nvme_io_timeout_secs) = matches.get_one::<String>("nvme-core-io-timeout") {
let io_timeout_secs: u32 = nvme_io_timeout_secs.parse().expect(
"nvme_core io_timeout should be an integer number, representing the timeout in seconds",
);
if let Some(nvme_io_timeout) = matches.get_one::<String>("nvme-io-timeout") {
let _ = humantime::Duration::from_str(nvme_io_timeout)
.map_err(|error| anyhow::format_err!("Failed to parse 'nvme-io-timeout': {error}"))?;
};
if let Some(nvme_io_timeout) = matches.get_one::<String>("nvme-core-io-timeout") {
let io_timeout_secs = match humantime::Duration::from_str(nvme_io_timeout) {
Ok(human_time) => {
human_time.as_secs() as u32
}
Err(_) => {
nvme_io_timeout.parse().expect(
"nvme_core io_timeout should be in humantime or an integer number, representing the timeout in seconds",
)
}
};

if let Err(error) = crate::dev::nvmf::set_nvmecore_iotimeout(io_timeout_secs) {
anyhow::bail!("Failed to set nvme_core io_timeout: {}", error);
27 changes: 27 additions & 0 deletions control-plane/csi-driver/src/context.rs
@@ -27,7 +27,11 @@ pub fn parse_protocol(proto: Option<&String>) -> Result<VolumeShareProtocol, ton
#[derive(AsRefStr, EnumString, Display)]
#[strum(serialize_all = "camelCase")]
pub enum Parameters {
/// This value is now considered deprecated.
IoTimeout,
/// This value is used globally and not on a per volume context.
/// todo: split Parameters into 2 separate enums?
NvmeIoTimeout,
NvmeNrIoQueues,
NvmeCtrlLossTmo,
NvmeKeepAliveTmo,
@@ -50,6 +54,14 @@ pub enum Parameters {
FsId,
}
impl Parameters {
fn parse_human_time(
value: Option<&String>,
) -> Result<Option<humantime::Duration>, humantime::DurationError> {
Ok(match value {
Some(value) => humantime::Duration::from_str(value).map(Some)?,
None => None,
})
}
fn parse_u32(value: Option<&String>) -> Result<Option<u32>, ParseIntError> {
Ok(match value {
Some(value) => value.parse::<u32>().map(Some)?,
@@ -84,6 +96,12 @@ impl Parameters {
pub fn io_timeout(value: Option<&String>) -> Result<Option<u32>, ParseIntError> {
Self::parse_u32(value)
}
/// Parse the value for `Self::NvmeIoTimeout`.
pub fn nvme_io_timeout(
value: Option<&String>,
) -> Result<Option<humantime::Duration>, humantime::DurationError> {
Self::parse_human_time(value)
}
/// Parse the value for `Self::StsAffinityGroup`
pub fn sts_affinity_group(value: Option<&String>) -> Result<Option<bool>, ParseBoolError> {
Self::parse_bool(value)
@@ -105,6 +123,7 @@ impl Parameters {
#[derive(Debug)]
pub struct PublishParams {
io_timeout: Option<u32>,
nvme_io_timeout: Option<humantime::Duration>,
ctrl_loss_tmo: Option<u32>,
keep_alive_tmo: Option<u32>,
fs_type: Option<FileSystem>,
@@ -115,6 +134,10 @@ impl PublishParams {
pub fn io_timeout(&self) -> &Option<u32> {
&self.io_timeout
}
/// Get the `Parameters::NvmeIoTimeout` value.
pub fn nvme_io_timeout(&self) -> &Option<humantime::Duration> {
&self.nvme_io_timeout
}
/// Get the `Parameters::NvmeCtrlLossTmo` value.
pub fn ctrl_loss_tmo(&self) -> &Option<u32> {
&self.ctrl_loss_tmo
@@ -166,6 +189,9 @@ impl TryFrom<&HashMap<String, String>> for PublishParams {

let io_timeout = Parameters::io_timeout(args.get(Parameters::IoTimeout.as_ref()))
.map_err(|_| tonic::Status::invalid_argument("Invalid I/O timeout"))?;
let nvme_io_timeout =
Parameters::nvme_io_timeout(args.get(Parameters::NvmeIoTimeout.as_ref()))
.map_err(|_| tonic::Status::invalid_argument("Invalid I/O timeout"))?;
let ctrl_loss_tmo =
Parameters::ctrl_loss_tmo(args.get(Parameters::NvmeCtrlLossTmo.as_ref()))
.map_err(|_| tonic::Status::invalid_argument("Invalid ctrl_loss_tmo"))?;
@@ -177,6 +203,7 @@ impl TryFrom<&HashMap<String, String>> for PublishParams {

Ok(Self {
io_timeout,
nvme_io_timeout,
ctrl_loss_tmo,
keep_alive_tmo,
fs_type,
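
A sketch (illustrative only, not part of this commit) of the new volume-context path, mirroring Parameters::nvme_io_timeout and parse_human_time above: the nvmeIoTimeout key, if present, must be valid humantime.

use std::{collections::HashMap, str::FromStr};

/// Standalone equivalent of Parameters::nvme_io_timeout for a raw context map.
fn nvme_io_timeout(
    value: Option<&String>,
) -> Result<Option<humantime::Duration>, humantime::DurationError> {
    // None stays None; Some(value) must parse as humantime.
    value.map(|value| humantime::Duration::from_str(value)).transpose()
}

fn main() {
    let mut context = HashMap::new();
    context.insert("nvmeIoTimeout".to_string(), "33s".to_string());
    let tmo = nvme_io_timeout(context.get("nvmeIoTimeout")).expect("valid humantime");
    assert_eq!(tmo.map(|tmo| tmo.as_secs()), Some(33));
}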
5 changes: 5 additions & 0 deletions tests/bdd/features/csi/node/test_parameters.py
@@ -63,6 +63,10 @@ def the_nvme_device_should_report_total_queues(total):
# max io queues is cpu_count
# admin q is 1
assert queue_count == min(total, os.cpu_count() + 1)
file = f"/sys/block/nvme2c2n1/queue/io_timeout"
io_timoout = int(subprocess.run(["sudo", "cat", file], capture_output=True).stdout)
print(f"io_timeout: {io_timoout}")
assert io_timoout == 33000


@pytest.fixture
@@ -137,6 +141,7 @@ def monitor(proc, result):
"--csi-socket=/var/tmp/csi-node.sock",
"--grpc-endpoint=0.0.0.0:50050",
"--node-name=msn-test",
"--nvme-io-timeout=33s",
f"--nvme-nr-io-queues={io_queues}",
"-v",
],
