roachtest: no longer show other errors if a test times out

When a test times out, it often leads to other failures (i.e. context cancelled). However, these are often just noise and distract from the root cause of the failures, the timeout. This change makes it so when a timeout failure is added, it suppresses future errors from being shown in github posts and the test failure message. They are still logged in their respective failure.log files. Epic: none Release note: none Fixes: #114565
cockroachdb · Nov 29, 2023 · 74e7332 · 74e7332
1 parent 11ad924
commit 74e7332
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 3 deletions.
diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go
@@ -106,6 +106,13 @@ type testImpl struct {
 		// referencing 0+ errors. failure captures all the errors
 		failures []failure
 
+		// failuresSuppressed indicates if further failures should be added to mu.failures.
+		failuresSuppressed bool
+
+		// numFailures is the number of failures that have been added via addFailures.
+		// This can deviate from len(failures) if failures have been suppressed.
+		numFailures int
+
 		// status is a map from goroutine id to status set by that goroutine. A
 		// special goroutine is indicated by runnerID; that one provides the test's
 		// "main status".
@@ -393,13 +400,16 @@ func (t *testImpl) addFailure(depth int, format string, args ...interface{}) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
 
-	t.mu.failures = append(t.mu.failures, reportFailure)
+	if !t.mu.failuresSuppressed {
+		t.mu.failures = append(t.mu.failures, reportFailure)
+	}
 
 	var b strings.Builder
 	formatFailure(&b, reportFailure)
 	msg := b.String()
 
-	failureNum := len(t.mu.failures)
+	t.mu.numFailures++
+	failureNum := t.mu.numFailures
 	failureLog := fmt.Sprintf("failure_%d", failureNum)
 	t.L().Printf("test failure #%d: full stack retained in %s.log: %s", failureNum, failureLog, msg)
 	// Also dump the verbose error (incl. all stack traces) to a log file, in case
@@ -425,6 +435,16 @@ func (t *testImpl) addFailure(depth int, format string, args ...interface{}) {
 	t.mu.output = append(t.mu.output, '\n')
 }
 
+// suppressFailures will stop future failures from being surfaced to github posting
+// or the test logger. It will not stop those failures from being logged in their
+// own failure.log files. Used if we are confident on the root cause of a failure and
+// want to reduce noise of other failures, i.e. timeouts.
+func (t *testImpl) suppressFailures() {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.mu.failuresSuppressed = true
+}
+
 // We take the "squashed" error that contains information of all the errors for each failure.
 func formatFailure(b *strings.Builder, reportFailures ...failure) {
 	for i, failure := range reportFailures {
@@ -450,7 +470,7 @@ func (t *testImpl) Failed() bool {
 }
 
 func (t *testImpl) failedRLocked() bool {
-	return len(t.mu.failures) > 0
+	return t.mu.numFailures > 0
 }
 
 func (t *testImpl) firstFailure() failure {

diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
@@ -1144,6 +1144,9 @@ func (r *testRunner) runTest(
 		// NB: We're adding the timeout failure intentionally without cancelling the context
 		// to capture as much state as possible during artifact collection.
 		t.addFailure(0, "test timed out (%s)", timeout)
+		// We suppress other failures from being surfaced to the top as the timeout is always going
+		// to be the main error and subsequent errors (i.e. context cancelled) add noise.
+		t.suppressFailures()
 		timedOut = true
 	}