backport-2.1: assorted testing-related backports #30503

Merged
2 changes: 1 addition & 1 deletion build/teamcity-local-roachtest.sh
@@ -28,5 +28,5 @@ run build/builder.sh ./bin/roachtest run '(acceptance|kv/splits)' \
--cockroach "cockroach" \
--workload "bin/workload" \
--artifacts artifacts \
--teamcity
--teamcity 2>&1 | tee artifacts/roachtest.log
tc_end_block "Run local roachtests"
23 changes: 14 additions & 9 deletions pkg/cmd/internal/issues/issues.go
@@ -50,11 +50,12 @@ var (
stacktraceRE = regexp.MustCompile(`(?m:^goroutine\s\d+)`)
)

// Based on the following observed API response:
// Based on the following observed API response the maximum here is 1<<16-1
// (but we stay way below that as nobody likes to scroll for pages and pages).
//
// 422 Validation Failed [{Resource:Issue Field:body Code:custom Message:body
// is too long (maximum is 65536 characters)}]
const githubIssueBodyMaximumLength = 1<<16 - 1
const githubIssueBodyMaximumLength = 5000

// trimIssueRequestBody trims message such that the total size of an issue body
// is less than githubIssueBodyMaximumLength. usedCharacters specifies the
@@ -273,27 +274,31 @@ make stress TESTS=%[5]s PKG=%[4]s TESTTIMEOUT=5m STRESSFLAGS='-stderr=false -max
Failed test: %[3]s`
const messageTemplate = "\n\n```\n%s\n```"

newIssueRequest := func(packageName, testName, message, assignee string) *github.IssueRequest {
body := func(packageName, testName, message string) string {
body := fmt.Sprintf(bodyTemplate, p.sha, p.parameters(), p.teamcityURL(), packageName, testName) + messageTemplate
// We insert a raw "%s" above so we can figure out the length of the
// body so far, without the actual error text. We need this length so we
// can calculate the maximum amount of error text we can include in the
// issue without exceeding GitHub's limit. We replace that %s in the
// following Sprintf.
body = fmt.Sprintf(body, trimIssueRequestBody(message, len(body)))
return fmt.Sprintf(body, trimIssueRequestBody(message, len(body)))
}

newIssueRequest := func(packageName, testName, message, assignee string) *github.IssueRequest {
b := body(packageName, testName, message)

return &github.IssueRequest{
Title: &title,
Body: &body,
Body: &b,
Labels: &issueLabels,
Assignee: &assignee,
Milestone: p.milestone,
}
}

newIssueComment := func(packageName, testName string) *github.IssueComment {
body := fmt.Sprintf(bodyTemplate, p.sha, p.parameters(), p.teamcityURL(), packageName, testName)
return &github.IssueComment{Body: &body}
newIssueComment := func(packageName, testName, message string) *github.IssueComment {
b := body(packageName, testName, message)
return &github.IssueComment{Body: &b}
}

assignee, err := getAssignee(ctx, authorEmail, p.listCommits)
@@ -331,7 +336,7 @@ Failed test: %[3]s`
github.Stringify(issueRequest))
}
} else {
comment := newIssueComment(packageName, testName)
comment := newIssueComment(packageName, testName, message)
if _, _, err := p.createComment(
ctx, githubUser, githubRepo, *foundIssue, comment); err != nil {
return errors.Wrapf(err, "failed to update issue #%d with %s",
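
To make the length arithmetic in the comment above concrete, here is a rough standalone sketch of the same trick, assuming a hypothetical trimToFit helper in place of trimIssueRequestBody and the 5000-character cap from the diff; none of these names are the PR's actual API:

package main

import "fmt"

// Assumed limit, mirroring githubIssueBodyMaximumLength in the diff above.
const maxBodyLen = 5000

// trimToFit is a hypothetical stand-in for trimIssueRequestBody: it cuts the
// error text down so that the finished body stays under maxBodyLen.
func trimToFit(message string, used int) string {
	budget := maxBodyLen - used
	if budget <= 0 {
		return ""
	}
	if len(message) > budget {
		// Keep the tail of the output; the failure is usually near the end.
		return message[len(message)-budget:]
	}
	return message
}

func main() {
	// The trailing "%s" is appended after the first Sprintf, so len(body) is
	// the size of everything except the error text itself.
	body := fmt.Sprintf("SHA: %s\nFailed test: %s", "abc123", "TestFoo") + "\n\n```\n%s\n```"
	body = fmt.Sprintf(body, trimToFit("...long test output...", len(body)))
	fmt.Println(body)
}

Formatting the template with the literal %s still in place lets the caller measure everything except the error text before deciding how much of it fits.
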
5 changes: 3 additions & 2 deletions pkg/cmd/roachtest/acceptance.go
@@ -25,8 +25,9 @@ func registerAcceptance(r *registry) {
// local mode the acceptance tests should be configured to run within a
// minute or so as these tests are run on every merge to master.
spec := testSpec{
Name: "acceptance",
Nodes: nodes(4),
Name: "acceptance",
Nodes: nodes(4),
Stable: true, // DO NOT COPY to new tests
}

testCases := []struct {
85 changes: 68 additions & 17 deletions pkg/cmd/roachtest/clearrange.go
@@ -23,6 +23,8 @@ import (
)

func registerClearRange(r *registry) {
const aggressiveConsistencyChecks = true

r.Add(testSpec{
Name: `clearrange`,
MinVersion: `v2.1.0`,
@@ -52,7 +54,13 @@ func registerClearRange(r *registry) {
}

c.Put(ctx, cockroach, "./cockroach")
c.Start(ctx)
if aggressiveConsistencyChecks {
// Run with an env var that runs a synchronous consistency check after each rebalance and merge.
// This slows down merges, so it might hide some races.
c.Start(ctx, startArgs("--env=COCKROACH_CONSISTENCY_AGGRESSIVE=true"))
} else {
c.Start(ctx)
}

// Also restore a much smaller table. We'll use it to run queries against
// the cluster after having dropped the large table above, verifying that
@@ -68,6 +76,39 @@ func registerClearRange(r *registry) {

t.Status()

// Set up a convenience function that we can call to learn the number of
// ranges for the bank.bank table (even after it's been dropped).
numBankRanges := func() func() int {
conn := c.Conn(ctx, 1)
defer conn.Close()

var startHex string
// NB: set this to false to save yourself some time during development. Selecting
// from crdb_internal.ranges is very slow because it contacts all of the leaseholders.
// You may actually want to run a version of cockroach that doesn't do that because
// it'll still slow you down every time the method returned below is called.
if true {
if err := conn.QueryRow(
`SELECT to_hex(start_key) FROM crdb_internal.ranges WHERE "database" = 'bank' AND "table" = 'bank' ORDER BY start_key ASC LIMIT 1`,
).Scan(&startHex); err != nil {
t.Fatal(err)
}
} else {
startHex = "bd" // extremely likely to be the right thing (b'\275').
}
return func() int {
conn := c.Conn(ctx, 1)
defer conn.Close()
var n int
if err := conn.QueryRow(
`SELECT count(*) FROM crdb_internal.ranges WHERE substr(to_hex(start_key), 1, length($1::string)) = $1`, startHex,
).Scan(&n); err != nil {
t.Fatal(err)
}
return n
}
}()

m := newMonitor(ctx, c)
m.Go(func(ctx context.Context) error {
conn := c.Conn(ctx, 1)
@@ -77,6 +118,11 @@ func registerClearRange(r *registry) {
return err
}

// Merge as fast as possible to put maximum stress on the system.
if _, err := conn.ExecContext(ctx, `SET CLUSTER SETTING kv.range_merge.queue_interval = '0s'`); err != nil {
return err
}

t.WorkerStatus("dropping table")
defer t.WorkerStatus()

@@ -86,41 +132,46 @@ func registerClearRange(r *registry) {
return err
}

t.WorkerStatus("computing number of ranges")
initialBankRanges := numBankRanges()

t.WorkerStatus("dropping bank table")
if _, err := conn.ExecContext(ctx, `DROP TABLE bank.bank`); err != nil {
return err
}

// Spend a few minutes reading data with a timeout to make sure the
// Spend some time reading data with a timeout to make sure the
// DROP above didn't brick the cluster. At the time of writing,
// clearing all of the table data takes ~6min. We run for 2.5x that
// time to verify that nothing has gone wonky on the cluster.
//
// Don't lower this number, or the test may pass erroneously.
const minutes = 45
t.WorkerStatus("repeatedly running count(*) on small table")
for i := 0; i < minutes; i++ {
after := time.After(time.Minute)
// clearing all of the table data takes ~6min, so we want to run
// for at least a multiple of that duration.
const minDuration = 45 * time.Minute
deadline := timeutil.Now().Add(minDuration)
curBankRanges := numBankRanges()
t.WorkerStatus("waiting for ~", curBankRanges, " merges to complete (and for at least ", minDuration, " to pass)")
for timeutil.Now().Before(deadline) || curBankRanges > 1 {
after := time.After(5 * time.Minute)
curBankRanges = numBankRanges() // this call takes minutes, unfortunately
t.WorkerProgress(1 - float64(curBankRanges)/float64(initialBankRanges))

var count int
// NB: context cancellation in QueryRowContext does not work as expected.
// See #25435.
if _, err := conn.ExecContext(ctx, `SET statement_timeout = '10s'`); err != nil {
if _, err := conn.ExecContext(ctx, `SET statement_timeout = '5s'`); err != nil {
return err
}
// If we can't aggregate over 80kb in 10s, the database is far from usable.
start := timeutil.Now()
// If we can't aggregate over 80kb in 5s, the database is far from usable.
if err := conn.QueryRowContext(ctx, `SELECT count(*) FROM tinybank.bank`).Scan(&count); err != nil {
return err
}
c.l.Printf("read %d rows in %0.1fs\n", count, timeutil.Since(start).Seconds())
t.WorkerProgress(float64(i+1) / float64(minutes))

select {
case <-after:
case <-ctx.Done():
return ctx.Err()
}
}
// TODO(benesch): verify that every last range in the table has been
// merged away. For now, just exercising the merge code is a good start.
// TODO(tschottdorf): verify that disk space usage drops below to <some small amount>, but that
// may not actually happen (see https://github.com/cockroachdb/cockroach/issues/29290).
return nil
})
m.Wait()
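
The polling loop above bounds each count(*) with a server-side statement_timeout because, as the comment notes, context cancellation in QueryRowContext is unreliable (#25435). A minimal standalone sketch of that pattern, assuming a local CockroachDB node and lib/pq as the driver (neither assumption comes from the diff):

package main

import (
	"context"
	"database/sql"
	"log"

	_ "github.com/lib/pq"
)

func main() {
	db, err := sql.Open("postgres", "postgresql://root@localhost:26257/tinybank?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	ctx := context.Background()
	// Pin a single connection so the SET below applies to the same session
	// that runs the query.
	conn, err := db.Conn(ctx)
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()

	// The server aborts the statement after 5s on its own, so we don't have
	// to rely on context cancellation inside QueryRowContext.
	if _, err := conn.ExecContext(ctx, `SET statement_timeout = '5s'`); err != nil {
		log.Fatal(err)
	}
	var count int
	if err := conn.QueryRowContext(ctx, `SELECT count(*) FROM bank`).Scan(&count); err != nil {
		log.Fatal(err) // fails here if the timeout fires
	}
	log.Printf("read %d rows", count)
}
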
4 changes: 2 additions & 2 deletions pkg/cmd/roachtest/drop.go
@@ -62,11 +62,11 @@ func registerDrop(r *registry) {
stmt = fmt.Sprintf(stmtStr, "", "=")
}
t.WorkerStatus(stmt)
_, err := db.ExecContext(ctx, stmt)
_, err := db.ExecContext(ctx, stmt, args...)
if err != nil && maybeExperimental && strings.Contains(err.Error(), "syntax error") {
stmt = fmt.Sprintf(stmtStr, "EXPERIMENTAL", "")
t.WorkerStatus(stmt)
_, err = db.ExecContext(ctx, stmt)
_, err = db.ExecContext(ctx, stmt, args...)
}
if err != nil {
t.Fatal(err)
54 changes: 26 additions & 28 deletions pkg/cmd/roachtest/rapid_restart.go
@@ -61,38 +61,36 @@ func runRapidRestart(ctx context.Context, t *test, c *cluster) {
}

waitTime := time.Duration(rand.Int63n(int64(time.Second)))
if !c.isLocal() {
// TODO(peter): This is hacky: the signal might be sent before the
// cockroach process starts, which is especially true on remote
// clusters. Perhaps combine this with a monitor so that we can detect
// as soon as the process starts before killing it. Or a custom kill
// script which loops looking for a cockroach process and kills it as
// soon as it appears. Using --pid_file or --background isn't quite
// right as we want to be able to kill the process before it is ready.
waitTime += time.Second
}
time.Sleep(waitTime)

sig := [2]string{"2", "9"}[rand.Intn(2)]
c.Stop(ctx, nodes, stopArgs("--sig="+sig))
select {
case <-ctx.Done():
return
case err := <-exitCh:
cause := errors.Cause(err)
if exitErr, ok := cause.(*exec.ExitError); ok {
switch status := sysutil.ExitStatus(exitErr); status {
case -1:
// Received SIGINT before setting up our own signal handlers or
// SIGKILL.
case 1:
// Exit code from a SIGINT received by our signal handlers.
default:
t.Fatalf("unexpected exit status %d", status)
}
} else {
t.Fatalf("unexpected exit err: %v", err)

var err error
for err == nil {
c.Stop(ctx, nodes, stopArgs("--sig="+sig))
select {
case <-ctx.Done():
return
case err = <-exitCh:
case <-time.After(10 * time.Second):
// We likely ended up killing before the process spawned.
// Loop around.
c.l.Printf("no exit status yet, killing again")
}
}
cause := errors.Cause(err)
if exitErr, ok := cause.(*exec.ExitError); ok {
switch status := sysutil.ExitStatus(exitErr); status {
case -1:
// Received SIGINT before setting up our own signal handlers or
// SIGKILL.
case 1:
// Exit code from a SIGINT received by our signal handlers.
default:
t.Fatalf("unexpected exit status %d", status)
}
} else {
t.Fatalf("unexpected exit err: %v", err)
}
}

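
For reference, the exit-status classification in the loop above can be reproduced with only the standard library; this sketch uses (*exec.ExitError).ExitCode rather than CockroachDB's sysutil.ExitStatus, and the sleep child process is purely illustrative:

package main

import (
	"log"
	"os/exec"
	"syscall"
	"time"
)

func main() {
	cmd := exec.Command("sleep", "60")
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}
	// Give the process a moment to start, then kill it, much as the test does.
	time.Sleep(100 * time.Millisecond)
	_ = cmd.Process.Signal(syscall.SIGKILL)

	err := cmd.Wait()
	if exitErr, ok := err.(*exec.ExitError); ok {
		switch code := exitErr.ExitCode(); code {
		case -1:
			// Terminated by a signal (SIGKILL, or SIGINT before handlers exist).
			log.Println("killed by signal, as expected")
		case 1:
			// A signal handler caught SIGINT and exited cleanly with code 1.
			log.Println("clean exit after SIGINT")
		default:
			log.Fatalf("unexpected exit status %d", code)
		}
	} else if err != nil {
		log.Fatalf("unexpected exit err: %v", err)
	}
}
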
10 changes: 7 additions & 3 deletions pkg/cmd/roachtest/test.go
@@ -682,8 +682,8 @@ func (r *registry) run(
}
if err := issues.Post(
context.Background(),
fmt.Sprintf("roachtest: %s failed on %s", t.Name(), branch),
"roachtest", t.Name(), string(output), authorEmail,
fmt.Sprintf("roachtest: %s failed", t.Name()),
"roachtest", t.Name(), "The test failed on "+branch+":\n"+string(output), authorEmail,
); err != nil {
fmt.Fprintf(r.out, "failed to post issue: %s\n", err)
}
@@ -776,13 +776,17 @@

select {
case <-time.After(timeout):
t.printf("test timed out (%s)", timeout)
t.printf("test timed out (%s)\n", timeout)
if c != nil {
c.FetchLogs(ctx)
// NB: c.destroyed is nil for cloned clusters (i.e. in subtests).
if !debugEnabled && c.destroyed != nil {
c.Destroy(ctx)
}
if local {
t.printf("waiting for test to tear down since cluster is local\n")
<-done
}
}
case <-done:
}
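
The new local-cluster branch above waits for the test goroutine to unwind before anything is torn down. A simplified sketch of that select pattern, with illustrative names rather than roachtest's actual API:

package main

import (
	"fmt"
	"time"
)

// runWithTimeout runs body in a goroutine and stops waiting after timeout.
// For a local cluster it still blocks until body returns, mirroring the
// behavior added in the diff above.
func runWithTimeout(body func(), timeout time.Duration, local bool) {
	done := make(chan struct{})
	go func() {
		defer close(done)
		body()
	}()

	select {
	case <-time.After(timeout):
		fmt.Printf("test timed out (%s)\n", timeout)
		if local {
			fmt.Println("waiting for test to tear down since cluster is local")
			<-done
		}
	case <-done:
	}
}

func main() {
	runWithTimeout(func() { time.Sleep(2 * time.Second) }, time.Second, true)
}
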
14 changes: 2 additions & 12 deletions pkg/cmd/roachtest/upgrade.go
@@ -48,17 +48,7 @@ func registerUpgrade(r *registry) {
c.Put(ctx, b, "./cockroach", c.Range(1, nodes))
// Force disable encryption.
// TODO(mberhault): allow it once oldVersion >= 2.1.
start := func() {
c.Start(ctx, c.Range(1, nodes), startArgsDontEncrypt)
}
start()
time.Sleep(5 * time.Second)

// TODO(tschottdorf): this is a hack similar to the one in the mixed version
// test. Remove it when we have a 2.0.x binary that has #27639 fixed.
c.Stop(ctx, c.Range(1, nodes))
start()
time.Sleep(5 * time.Second)
c.Start(ctx, c.Range(1, nodes), startArgsDontEncrypt)

const stageDuration = 30 * time.Second
const timeUntilStoreDead = 90 * time.Second
@@ -266,7 +256,7 @@
}
}

const oldVersion = "v2.0.0"
const oldVersion = "v2.0.5"
for _, n := range []int{5} {
r.Add(testSpec{
Name: fmt.Sprintf("upgrade/oldVersion=%s/nodes=%d", oldVersion, n),
2 changes: 1 addition & 1 deletion pkg/storage/client_raft_test.go
@@ -3193,7 +3193,7 @@ func TestReplicateRemovedNodeDisruptiveElection(t *testing.T) {
default:
t.Fatalf("unexpected error type %T: %s", pErr.GetDetail(), pErr)
}
case <-time.After(5 * time.Second):
case <-time.After(45 * time.Second):
t.Fatal("did not get expected error")
}
