Skip to content

Commit

Permalink
roachtest: no longer show other errors if a test times out
Browse files Browse the repository at this point in the history
When a test times out, it often leads to other failures
(i.e. context cancelled). However, these are often just
noise and distract from the root cause of the failures,
the timeout.

This change makes it so when a timeout failure is added,
it suppresses future errors from being shown in github posts
and the test failure message. They are still logged in their
respective failure.log files.

Epic: none
Release note: none
Fixes: #114565
  • Loading branch information
DarrylWong committed Nov 29, 2023
1 parent 11ad924 commit 74e7332
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 3 deletions.
26 changes: 23 additions & 3 deletions pkg/cmd/roachtest/test_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,13 @@ type testImpl struct {
// referencing 0+ errors. failure captures all the errors
failures []failure

// failuresSuppressed indicates if further failures should be added to mu.failures.
failuresSuppressed bool

// numFailures is the number of failures that have been added via addFailures.
// This can deviate from len(failures) if failures have been suppressed.
numFailures int

// status is a map from goroutine id to status set by that goroutine. A
// special goroutine is indicated by runnerID; that one provides the test's
// "main status".
Expand Down Expand Up @@ -393,13 +400,16 @@ func (t *testImpl) addFailure(depth int, format string, args ...interface{}) {
t.mu.Lock()
defer t.mu.Unlock()

t.mu.failures = append(t.mu.failures, reportFailure)
if !t.mu.failuresSuppressed {
t.mu.failures = append(t.mu.failures, reportFailure)
}

var b strings.Builder
formatFailure(&b, reportFailure)
msg := b.String()

failureNum := len(t.mu.failures)
t.mu.numFailures++
failureNum := t.mu.numFailures
failureLog := fmt.Sprintf("failure_%d", failureNum)
t.L().Printf("test failure #%d: full stack retained in %s.log: %s", failureNum, failureLog, msg)
// Also dump the verbose error (incl. all stack traces) to a log file, in case
Expand All @@ -425,6 +435,16 @@ func (t *testImpl) addFailure(depth int, format string, args ...interface{}) {
t.mu.output = append(t.mu.output, '\n')
}

// suppressFailures will stop future failures from being surfaced to github posting
// or the test logger. It will not stop those failures from being logged in their
// own failure.log files. Used if we are confident on the root cause of a failure and
// want to reduce noise of other failures, i.e. timeouts.
func (t *testImpl) suppressFailures() {
t.mu.Lock()
defer t.mu.Unlock()
t.mu.failuresSuppressed = true
}

// We take the "squashed" error that contains information of all the errors for each failure.
func formatFailure(b *strings.Builder, reportFailures ...failure) {
for i, failure := range reportFailures {
Expand All @@ -450,7 +470,7 @@ func (t *testImpl) Failed() bool {
}

func (t *testImpl) failedRLocked() bool {
return len(t.mu.failures) > 0
return t.mu.numFailures > 0
}

func (t *testImpl) firstFailure() failure {
Expand Down
3 changes: 3 additions & 0 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1144,6 +1144,9 @@ func (r *testRunner) runTest(
// NB: We're adding the timeout failure intentionally without cancelling the context
// to capture as much state as possible during artifact collection.
t.addFailure(0, "test timed out (%s)", timeout)
// We suppress other failures from being surfaced to the top as the timeout is always going
// to be the main error and subsequent errors (i.e. context cancelled) add noise.
t.suppressFailures()
timedOut = true
}

Expand Down

0 comments on commit 74e7332

Please sign in to comment.