Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix OOM slice removal race #2353

Merged
merged 1 commit into from
Aug 2, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 54 additions & 5 deletions conmon-rs/server/src/oom_watcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,13 +224,35 @@ impl OOMWatcher {
if let Err(e) = tx.try_send(OOMEvent{ oom: false }) {
error!("try_send failed: {:#}", e);
};

debug!("Last resort check for OOM");
match Self::check_for_oom(&memory_events_file_path, last_counter).await {
Ok((counter, is_oom)) => {
if !is_oom {
debug!("No OOM found in event");
break;
}
debug!("Found OOM event count {counter}");
if let Err(e) = Self::write_oom_files(exit_paths).await {
error!("Writing OOM files failed: {:#}", e)
}
if let Err(e) = tx.try_send(OOMEvent{ oom: true }) {
error!("Try send failed: {:#}", e)
};
}
// It is still possible to miss an OOM event here if the memory events file
// got removed between the notify event below and the token cancellation.
Err(e) => debug!("Checking for last resort OOM failed: {:#}", e),
}

break;
}
Some(res) = rx.recv() => {
match res {
Ok(event) => {
debug!("Got event OOM file event: {:?}", event);
debug!("Got OOM file event: {:?}", event);
if event.kind.is_remove() {
debug!("Got remove event");
if let Err(e) = tx.try_send(OOMEvent{ oom: false }) {
error!("try_send failed: {:#}", e);
};
Expand All @@ -239,9 +261,10 @@ impl OOMWatcher {
match Self::check_for_oom(&memory_events_file_path, last_counter).await {
Ok((counter, is_oom)) => {
if !is_oom {
debug!("No OOM found in event");
continue;
}
debug!(counter, "Found OOM event");
debug!("Found OOM event count {}", counter);
if let Err(e) = Self::write_oom_files(exit_paths).await {
error!("Writing OOM files failed: {:#}", e);
}
Expand All @@ -252,8 +275,19 @@ impl OOMWatcher {
};
}
Err(e) => {
error!("Checking for OOM failed: {}", e);
match tx.try_send(OOMEvent{ oom: false }) {
error!("Checking for OOM failed: {:#}", e);

let mut oom = false;
if Self::file_not_found(&memory_events_file_path).await {
debug!("Assuming memory slice removal race, still reporting one OOM event");
if let Err(e) = Self::write_oom_files(exit_paths).await {
error!("Writing OOM files failed: {:#}", e);
}
last_counter = 1;
oom = true;
}

match tx.try_send(OOMEvent{ oom }) {
Ok(_) => break,
Err(e) => error!("try_send failed: {:#}", e)
};
Expand Down Expand Up @@ -281,11 +315,24 @@ impl OOMWatcher {
Ok(())
}

/// Checks if a file does not exist on disk.
async fn file_not_found(f: impl AsRef<Path>) -> bool {
// TODO: use is_err_and if we can use Rust 1.70.0:
// https://doc.rust-lang.org/std/result/enum.Result.html#method.is_err_and
match tokio::fs::metadata(f).await {
Err(e) => e.kind() == ErrorKind::NotFound,
_ => false,
}
}

async fn check_for_oom(
memory_events_file_path: &Path,
last_counter: u64,
) -> Result<(u64, bool)> {
debug!("Checking for possible OOM");
debug!(
"Checking for possible OOM in {}",
memory_events_file_path.display()
);
let mut new_counter: u64 = 0;
let mut found_oom = false;
let fp = File::open(memory_events_file_path).await.context(format!(
Expand All @@ -295,11 +342,13 @@ impl OOMWatcher {
let reader = BufReader::new(fp);
let mut lines = reader.lines();
while let Some(line) = lines.next_line().await.context("get next line")? {
trace!(line);
if let Some(counter) = line.strip_prefix("oom ").or(line.strip_prefix("oom_kill ")) {
let counter = counter
.to_string()
.parse::<u64>()
.context("parse u64 counter")?;
debug!("New oom counter: {counter}, last counter: {last_counter}",);
if counter != last_counter {
debug!("Updating OOM counter to {counter}");
new_counter = counter;
Expand Down
Loading