cockroachdb · DarrylWong · Dec 4, 2023 · Nov 29, 2023
@@ -106,6 +106,13 @@ type testImpl struct {
 		// referencing 0+ errors. failure captures all the errors
 		failures []failure
 
+		// failuresSuppressed indicates whether further failures should be withheld from mu.failures.
+		failuresSuppressed bool
+
+		// numFailures is the number of failures that have been reported via addFailure.
+		// It can exceed len(failures) once failures are suppressed, as suppressed failures are still counted.
+		numFailures int
+
 		// status is a map from goroutine id to status set by that goroutine. A
 		// special goroutine is indicated by runnerID; that one provides the test's
 		// "main status".
@@ -393,13 +400,16 @@ func (t *testImpl) addFailure(depth int, format string, args ...interface{}) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
 
-	t.mu.failures = append(t.mu.failures, reportFailure)
+	if !t.mu.failuresSuppressed {
+		t.mu.failures = append(t.mu.failures, reportFailure)
+	}
 
 	var b strings.Builder
 	formatFailure(&b, reportFailure)
 	msg := b.String()
 
-	failureNum := len(t.mu.failures)
+	t.mu.numFailures++
+	failureNum := t.mu.numFailures
 	failureLog := fmt.Sprintf("failure_%d", failureNum)
 	t.L().Printf("test failure #%d: full stack retained in %s.log: %s", failureNum, failureLog, msg)
 	// Also dump the verbose error (incl. all stack traces) to a log file, in case
@@ -425,6 +435,16 @@ func (t *testImpl) addFailure(depth int, format string, args ...interface{}) {
 	t.mu.output = append(t.mu.output, '\n')
 }
 
+// suppressFailures stops future failures from being surfaced to GitHub posting
+// or the test logger. It does not stop those failures from being logged in their
+// own failure_N.log files. Use it when we are confident about the root cause of a
+// failure and want to reduce noise from other failures, e.g. after a timeout.
+func (t *testImpl) suppressFailures() {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.mu.failuresSuppressed = true
+}
+
 // We take the "squashed" error that contains information of all the errors for each failure.
 func formatFailure(b *strings.Builder, reportFailures ...failure) {
 	for i, failure := range reportFailures {
@@ -450,7 +470,7 @@ func (t *testImpl) Failed() bool {
 }
 
 func (t *testImpl) failedRLocked() bool {
-	return len(t.mu.failures) > 0
+	return t.mu.numFailures > 0
 }
 
 func (t *testImpl) firstFailure() failure {

@@ -1139,6 +1139,9 @@ func (r *testRunner) runTest(
 		// NB: We're adding the timeout failure intentionally without cancelling the context
 		// to capture as much state as possible during artifact collection.
 		t.addFailure(0, "test timed out (%s)", timeout)
+		// We suppress subsequent failures from being surfaced to the top, as the timeout is always
+		// going to be the main error and later errors (e.g. context cancelled) only add noise.
+		t.suppressFailures()
 		timedOut = true
 	}