Skip to content

Commit d7b9fb2

Browse files
committed
Auto merge of #590 - Mark-Simulacrum:improve-errs, r=Mark-Simulacrum
Workaround docker is not running bug Currently, if an individual agent reports an error during execution (e.g., docker is not running, or one of its worker threads ended with an error), that job will be marked as failed in its entirety. Particularly as we currently have a transient agent (crater-gcp-1), which is sometimes down due to being a spot instance, this means that it can be hard to complete a Crater job if the GCP-1 instance is killing jobs midway through. It should be noted that in theory these errors shouldn't happen in the first place. In practice it looks like "docker is not running" is the primary cause of failure -- which is relatively hard to investigate; logs for the relevant time period appear absent. This PR restructures the code which detects docker absence to instead spin until docker *is* up. It looks relatively more difficult to re-organize the crater code to deal well with worker failure, likely by re-assigning the jobs to a live worker, though that is likely a better long-term solution.
2 parents 9503083 + 139d094 commit d7b9fb2

File tree

12 files changed

+55
-42
lines changed

12 files changed

+55
-42
lines changed

.github/workflows/bors.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ jobs:
1212
steps:
1313
- uses: actions/checkout@v2
1414

15-
- name: Install Rust stable
16-
run: rustup update --no-self-update stable && rustup default stable
15+
- name: Install Rust nightly
16+
run: rustup update nightly && rustup default nightly && rustup component add rustfmt clippy
1717

1818
- name: Check the code formatting with rustfmt
1919
run: cargo fmt --all -- --check
@@ -31,7 +31,7 @@ jobs:
3131
strategy:
3232
matrix:
3333
os: [ubuntu-latest, windows-latest]
34-
channel: [stable, beta, nightly]
34+
channel: [nightly]
3535
runs-on: ${{ matrix.os }}
3636
steps:
3737
- uses: actions/checkout@v2
@@ -61,8 +61,8 @@ jobs:
6161
steps:
6262
- uses: actions/checkout@v2
6363

64-
- name: Install Rust stable
65-
run: rustup update --no-self-update stable && rustup default stable
64+
- name: Install Rust nightly
65+
run: rustup update --no-self-update nightly && rustup default nightly
6666

6767
- name: Run minicrater
6868
shell: bash

.github/workflows/pr.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ jobs:
88
steps:
99
- uses: actions/checkout@v2
1010

11-
- name: Install Rust Stable
12-
run: rustup update stable && rustup default stable
11+
- name: Install Rust nightly
12+
run: rustup update nightly && rustup default nightly && rustup component add rustfmt clippy
1313

1414
- name: Check the code formatting with rustfmt
1515
run: cargo fmt --all -- --check
@@ -28,8 +28,8 @@ jobs:
2828
steps:
2929
- uses: actions/checkout@v2
3030

31-
- name: Install Rust Stable
32-
run: rustup update stable && rustup default stable
31+
- name: Install Rust nightly
32+
run: rustup update nightly && rustup default nightly
3333

3434
- name: Build Crater
3535
run: cargo build

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
1919
# Install the currently pinned toolchain with rustup
2020
RUN curl https://static.rust-lang.org/rustup/dist/x86_64-unknown-linux-gnu/rustup-init >/tmp/rustup-init && \
2121
chmod +x /tmp/rustup-init && \
22-
/tmp/rustup-init -y --no-modify-path --default-toolchain stable --profile minimal
22+
/tmp/rustup-init -y --no-modify-path --default-toolchain nightly --profile minimal
2323
ENV PATH=/root/.cargo/bin:$PATH
2424

2525
# Build the dependencies in a separate step to avoid rebuilding all of them

rust-toolchain

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
stable
1+
nightly

src/crates/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ impl TryFrom<&'_ PackageId> for Crate {
9494
[_, _, "path", path] => Ok(Crate::Path(path.to_string())),
9595
[_, _, "git", repo] => {
9696
if repo.starts_with("https://github.com") {
97-
Ok(Crate::GitHub(repo.replace("#", "/").parse()?))
97+
Ok(Crate::GitHub(repo.replace('#', "/").parse()?))
9898
} else {
9999
let mut parts = repo.split('#').rev().collect::<Vec<_>>();
100100
let url = parts.pop();

src/db/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ impl CustomizeConnection<Connection, ::rusqlite::Error> for ConnectionCustomizer
3030
pub struct Database {
3131
pool: Pool<SqliteConnectionManager>,
3232
// The tempfile is stored here to drop it after all the connections are closed
33-
tempfile: Option<Arc<NamedTempFile>>,
33+
_tempfile: Option<Arc<NamedTempFile>>,
3434
}
3535

3636
impl Database {
@@ -76,7 +76,7 @@ impl Database {
7676

7777
Ok(Database {
7878
pool,
79-
tempfile: tempfile.map(Arc::new),
79+
_tempfile: tempfile.map(Arc::new),
8080
})
8181
}
8282

src/report/markdown.rs

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ fn write_crate(
6666
let prefix = if is_child { " * " } else { "* " };
6767
let status_warning = krate
6868
.status
69-
.map(|status| format!(" ({})", status.to_string()))
69+
.map(|status| format!(" ({})", status))
7070
.unwrap_or_default();
7171

7272
if let ReportConfig::Complete(toolchain) = comparison.report_config() {
@@ -82,7 +82,7 @@ fn write_crate(
8282
krate.name,
8383
status_warning,
8484
krate.url,
85-
comparison.to_string(),
85+
comparison,
8686
conj,
8787
runs[run],
8888
runs[1],
@@ -92,13 +92,7 @@ fn write_crate(
9292
writeln!(
9393
&mut rendered,
9494
"{}[{}{}]({}) {} [start]({}/log.txt) | [end]({}/log.txt)",
95-
prefix,
96-
krate.name,
97-
status_warning,
98-
krate.url,
99-
comparison.to_string(),
100-
runs[1],
101-
runs[3]
95+
prefix, krate.name, status_warning, krate.url, comparison, runs[1], runs[3]
10296
)?;
10397
};
10498

@@ -112,7 +106,7 @@ fn render_markdown(context: &ResultsContext) -> Fallible<String> {
112106
writeln!(&mut rendered, "# Crater report for {}\n\n", context.ex.name)?;
113107

114108
for (comparison, results) in context.categories.iter() {
115-
writeln!(&mut rendered, "\n### {}", comparison.to_string())?;
109+
writeln!(&mut rendered, "\n### {}", comparison)?;
116110
match results {
117111
ReportCratesMD::Plain(crates) => {
118112
for krate in crates {

src/report/mod.rs

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ pub fn generate_report<DB: ReadResults>(
235235
log: crate_to_path_fragment(tc, krate, SanitizationContext::Url)
236236
.to_str()
237237
.unwrap()
238-
.replace(r"\", "/"), // Normalize paths in reports generated on Windows
238+
.replace('\'', "/"), // Normalize paths in reports generated on Windows
239239
})
240240
});
241241
// Convert errors to Nones
@@ -290,7 +290,7 @@ fn write_logs<DB: ReadResults, W: ReportWriter>(
290290
let content = db
291291
.load_log(ex, tc, krate)
292292
.and_then(|c| c.ok_or_else(|| err_msg("missing logs")))
293-
.with_context(|_| format!("failed to read log of {} on {}", krate, tc.to_string()));
293+
.with_context(|_| format!("failed to read log of {} on {}", krate, tc));
294294
let content = match content {
295295
Ok(c) => c,
296296
Err(e) => {
@@ -991,12 +991,12 @@ mod tests {
991991
TestResult::BuildFail(FailureReason::Unknown)
992992
);
993993
assert_eq!(
994-
(&gh_result.runs[0]).as_ref().unwrap().log.as_str(),
995-
"stable/gh/brson.hello-rs"
994+
Path::new((&gh_result.runs[0]).as_ref().unwrap().log.as_str()),
995+
Path::new("stable/gh/brson.hello-rs")
996996
);
997997
assert_eq!(
998-
(&gh_result.runs[1]).as_ref().unwrap().log.as_str(),
999-
"beta/gh/brson.hello-rs"
998+
Path::new((&gh_result.runs[1]).as_ref().unwrap().log.as_str()),
999+
Path::new("beta/gh/brson.hello-rs")
10001000
);
10011001

10021002
assert_eq!(reg_result.name.as_str(), "syn-1.0.0");
@@ -1015,11 +1015,11 @@ mod tests {
10151015
);
10161016
assert_eq!(
10171017
(&reg_result.runs[0]).as_ref().unwrap().log.as_str(),
1018-
"stable/reg/syn-1.0.0"
1018+
Path::new("stable/reg/syn-1.0.0")
10191019
);
10201020
assert_eq!(
1021-
(&reg_result.runs[1]).as_ref().unwrap().log.as_str(),
1022-
"beta/reg/syn-1.0.0"
1021+
Path::new((&reg_result.runs[1]).as_ref().unwrap().log.as_str()),
1022+
Path::new("beta/reg/syn-1.0.0")
10231023
);
10241024

10251025
assert_eq!(

src/runner/mod.rs

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,29 @@ pub fn run_ex<DB: WriteResults + Sync>(
5656
threads_count: usize,
5757
config: &Config,
5858
) -> Fallible<()> {
59-
if !rustwide::cmd::docker_running(workspace) {
60-
return Err(err_msg("docker is not running"));
59+
// Attempt to spin indefinitely until docker is up. Ideally, we would
60+
// decomission this agent until docker is up, instead of leaving the
61+
// assigned crates to 'hang' until we get our act together. In practice, we
62+
// expect workers to be around most of the time (just sometimes being
63+
// restarted etc.) and so the assigned crates shouldn't hang for long.
64+
//
65+
// If we return an Err(...) from this function, then currently that is
66+
// treated as a hard failure of the underlying experiment, but this error
67+
// has nothing to do with the experiment, so shouldn't be reported as such.
68+
//
69+
// In the future we'll want to *alert* on this error so that a human can
70+
// investigate, but the hope is that in practice docker is just being slow
71+
// or similar and this will fix itself, which currently makes the most sense
72+
// given low human resources. Additionally, it'll be indirectly alerted
73+
// through the worker being "down" according to our progress metrics, since
74+
// jobs won't be completed.
75+
let mut i = 0;
76+
while !rustwide::cmd::docker_running(workspace) {
77+
log::error!(
78+
"docker is not currently up, waiting for it to start (tried {} times)",
79+
i
80+
);
81+
i += 1;
6182
}
6283

6384
info!("computing the tasks graph...");

src/runner/tasks.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ impl fmt::Debug for TaskStep {
7676

7777
write!(f, "{}", name)?;
7878
if let Some(tc) = tc {
79-
write!(f, " {}", tc.to_string())?;
79+
write!(f, " {}", tc)?;
8080
}
8181
if quiet {
8282
write!(f, " (quiet)")?;

src/runner/test.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@ fn failure_reason(err: &Error) -> FailureReason {
1717
for cause in err.iter_chain() {
1818
if let Some(&CommandError::SandboxOOM) = cause.downcast_ctx() {
1919
return FailureReason::OOM;
20-
} else if let Some(&CommandError::NoOutputFor(_)) = cause.downcast_ctx() {
21-
return FailureReason::Timeout;
22-
} else if let Some(&CommandError::Timeout(_)) = cause.downcast_ctx() {
20+
} else if let Some(&CommandError::NoOutputFor(_) | &CommandError::Timeout(_)) =
21+
cause.downcast_ctx()
22+
{
2323
return FailureReason::Timeout;
2424
} else if let Some(reason) = cause.downcast_ctx::<FailureReason>() {
2525
return reason.clone();

src/server/routes/agent.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -244,9 +244,7 @@ fn handle_results(resp: Fallible<Response<Body>>) -> Response<Body> {
244244
fn handle_errors(err: Rejection) -> Result<Response<Body>, Rejection> {
245245
let error = if let Some(compat) = err.find_cause::<Compat<HttpError>>() {
246246
Some(*compat.get_ref())
247-
} else if let StatusCode::NOT_FOUND = err.status() {
248-
Some(HttpError::NotFound)
249-
} else if let StatusCode::METHOD_NOT_ALLOWED = err.status() {
247+
} else if let StatusCode::NOT_FOUND | StatusCode::METHOD_NOT_ALLOWED = err.status() {
250248
Some(HttpError::NotFound)
251249
} else {
252250
None

0 commit comments

Comments
 (0)