diff --git a/build/teamcity-local-roachtest.sh b/build/teamcity-local-roachtest.sh index 5a15c8339985..60e224df5d04 100755 --- a/build/teamcity-local-roachtest.sh +++ b/build/teamcity-local-roachtest.sh @@ -28,5 +28,5 @@ run build/builder.sh ./bin/roachtest run '(acceptance|kv/splits)' \ --cockroach "cockroach" \ --workload "bin/workload" \ --artifacts artifacts \ - --teamcity + --teamcity 2>&1 | tee artifacts/roachtest.log tc_end_block "Run local roachtests" diff --git a/pkg/cmd/internal/issues/issues.go b/pkg/cmd/internal/issues/issues.go index ec4510b7b63c..eda29fff7b16 100644 --- a/pkg/cmd/internal/issues/issues.go +++ b/pkg/cmd/internal/issues/issues.go @@ -50,11 +50,12 @@ var ( stacktraceRE = regexp.MustCompile(`(?m:^goroutine\s\d+)`) ) -// Based on the following observed API response: +// Based on the following observed API response the maximum here is 1<<16-1 +// (but we stay way below that as nobody likes to scroll for pages and pages). // // 422 Validation Failed [{Resource:Issue Field:body Code:custom Message:body // is too long (maximum is 65536 characters)}] -const githubIssueBodyMaximumLength = 1<<16 - 1 +const githubIssueBodyMaximumLength = 5000 // trimIssueRequestBody trims message such that the total size of an issue body // is less than githubIssueBodyMaximumLength. usedCharacters specifies the @@ -273,27 +274,31 @@ make stress TESTS=%[5]s PKG=%[4]s TESTTIMEOUT=5m STRESSFLAGS='-stderr=false -max Failed test: %[3]s` const messageTemplate = "\n\n```\n%s\n```" - newIssueRequest := func(packageName, testName, message, assignee string) *github.IssueRequest { + body := func(packageName, testName, message string) string { body := fmt.Sprintf(bodyTemplate, p.sha, p.parameters(), p.teamcityURL(), packageName, testName) + messageTemplate // We insert a raw "%s" above so we can figure out the length of the // body so far, without the actual error text. We need this length so we // can calculate the maximum amount of error text we can include in the // issue without exceeding GitHub's limit. We replace that %s in the // following Sprintf. 
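// As a rough illustration of the arithmetic (a sketch only; the actual helper
// may keep a different part of the message), the remaining budget after the
// template is what the error text gets trimmed to:
//
//   budget := githubIssueBodyMaximumLength - usedCharacters
//   if budget < 0 {
//       budget = 0
//   }
//   if len(message) > budget {
//       message = message[len(message)-budget:] // assumption: keep the tail, where the failure usually shows
//   }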
- body = fmt.Sprintf(body, trimIssueRequestBody(message, len(body))) + return fmt.Sprintf(body, trimIssueRequestBody(message, len(body))) + } + + newIssueRequest := func(packageName, testName, message, assignee string) *github.IssueRequest { + b := body(packageName, testName, message) return &github.IssueRequest{ Title: &title, - Body: &body, + Body: &b, Labels: &issueLabels, Assignee: &assignee, Milestone: p.milestone, } } - newIssueComment := func(packageName, testName string) *github.IssueComment { - body := fmt.Sprintf(bodyTemplate, p.sha, p.parameters(), p.teamcityURL(), packageName, testName) - return &github.IssueComment{Body: &body} + newIssueComment := func(packageName, testName, message string) *github.IssueComment { + b := body(packageName, testName, message) + return &github.IssueComment{Body: &b} } assignee, err := getAssignee(ctx, authorEmail, p.listCommits) @@ -331,7 +336,7 @@ Failed test: %[3]s` github.Stringify(issueRequest)) } } else { - comment := newIssueComment(packageName, testName) + comment := newIssueComment(packageName, testName, message) if _, _, err := p.createComment( ctx, githubUser, githubRepo, *foundIssue, comment); err != nil { return errors.Wrapf(err, "failed to update issue #%d with %s", diff --git a/pkg/cmd/roachtest/acceptance.go b/pkg/cmd/roachtest/acceptance.go index 7dd4a21adddb..8ebff00081c9 100644 --- a/pkg/cmd/roachtest/acceptance.go +++ b/pkg/cmd/roachtest/acceptance.go @@ -25,8 +25,9 @@ func registerAcceptance(r *registry) { // local mode the acceptance tests should be configured to run within a // minute or so as these tests are run on every merge to master. spec := testSpec{ - Name: "acceptance", - Nodes: nodes(4), + Name: "acceptance", + Nodes: nodes(4), + Stable: true, // DO NOT COPY to new tests } testCases := []struct { diff --git a/pkg/cmd/roachtest/clearrange.go b/pkg/cmd/roachtest/clearrange.go index af143a66f0c4..f83b1b776dc6 100644 --- a/pkg/cmd/roachtest/clearrange.go +++ b/pkg/cmd/roachtest/clearrange.go @@ -23,6 +23,8 @@ import ( ) func registerClearRange(r *registry) { + const aggressiveConsistencyChecks = true + r.Add(testSpec{ Name: `clearrange`, MinVersion: `v2.1.0`, @@ -52,7 +54,13 @@ func registerClearRange(r *registry) { } c.Put(ctx, cockroach, "./cockroach") - c.Start(ctx) + if aggressiveConsistencyChecks { + // Run with an env var that runs a synchronous consistency check after each rebalance and merge. + // This slows down merges, so it might hide some races. + c.Start(ctx, startArgs("--env=COCKROACH_CONSISTENCY_AGGRESSIVE=true")) + } else { + c.Start(ctx) + } // Also restore a much smaller table. We'll use it to run queries against // the cluster after having dropped the large table above, verifying that @@ -68,6 +76,39 @@ func registerClearRange(r *registry) { t.Status() + // Set up a convenience function that we can call to learn the number of + // ranges for the bank.bank table (even after it's been dropped). + numBankRanges := func() func() int { + conn := c.Conn(ctx, 1) + defer conn.Close() + + var startHex string + // NB: set this to false to save yourself some time during development. Selecting + // from crdb_internal.ranges is very slow because it contacts all of the leaseholders. + // You may actually want to run a version of cockroach that doesn't do that because + // it'll still slow you down every time the method returned below is called. 
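// Should the fast path below ever be preferred, the hardcoded fallback can be
// sanity-checked once against the actual key layout; a minimal sketch, assuming
// the keys helper and the bank table's descriptor ID are at hand:
//
//   prefix := keys.MakeTablePrefix(uint32(tableID)) // tableID via crdb_internal.tables
//   fmt.Println(hex.EncodeToString(prefix))         // expected to print "bd" for this table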
+ if true { + if err := conn.QueryRow( + `SELECT to_hex(start_key) FROM crdb_internal.ranges WHERE "database" = 'bank' AND "table" = 'bank' ORDER BY start_key ASC LIMIT 1`, + ).Scan(&startHex); err != nil { + t.Fatal(err) + } + } else { + startHex = "bd" // extremely likely to be the right thing (b'\275'). + } + return func() int { + conn := c.Conn(ctx, 1) + defer conn.Close() + var n int + if err := conn.QueryRow( + `SELECT count(*) FROM crdb_internal.ranges WHERE substr(to_hex(start_key), 1, length($1::string)) = $1`, startHex, + ).Scan(&n); err != nil { + t.Fatal(err) + } + return n + } + }() + m := newMonitor(ctx, c) m.Go(func(ctx context.Context) error { conn := c.Conn(ctx, 1) @@ -77,6 +118,11 @@ func registerClearRange(r *registry) { return err } + // Merge as fast as possible to put maximum stress on the system. + if _, err := conn.ExecContext(ctx, `SET CLUSTER SETTING kv.range_merge.queue_interval = '0s'`); err != nil { + return err + } + t.WorkerStatus("dropping table") defer t.WorkerStatus() @@ -86,41 +132,46 @@ func registerClearRange(r *registry) { return err } + t.WorkerStatus("computing number of ranges") + initialBankRanges := numBankRanges() + + t.WorkerStatus("dropping bank table") if _, err := conn.ExecContext(ctx, `DROP TABLE bank.bank`); err != nil { return err } - // Spend a few minutes reading data with a timeout to make sure the + // Spend some time reading data with a timeout to make sure the // DROP above didn't brick the cluster. At the time of writing, - // clearing all of the table data takes ~6min. We run for 2.5x that - // time to verify that nothing has gone wonky on the cluster. - // - // Don't lower this number, or the test may pass erroneously. - const minutes = 45 - t.WorkerStatus("repeatedly running count(*) on small table") - for i := 0; i < minutes; i++ { - after := time.After(time.Minute) + // clearing all of the table data takes ~6min, so we want to run + // for at least a multiple of that duration. + const minDuration = 45 * time.Minute + deadline := timeutil.Now().Add(minDuration) + curBankRanges := numBankRanges() + t.WorkerStatus("waiting for ~", curBankRanges, " merges to complete (and for at least ", minDuration, " to pass)") + for timeutil.Now().Before(deadline) || curBankRanges > 1 { + after := time.After(5 * time.Minute) + curBankRanges = numBankRanges() // this call takes minutes, unfortunately + t.WorkerProgress(1 - float64(curBankRanges)/float64(initialBankRanges)) + var count int // NB: context cancellation in QueryRowContext does not work as expected. // See #25435. - if _, err := conn.ExecContext(ctx, `SET statement_timeout = '10s'`); err != nil { + if _, err := conn.ExecContext(ctx, `SET statement_timeout = '5s'`); err != nil { return err } - // If we can't aggregate over 80kb in 10s, the database is far from usable. - start := timeutil.Now() + // If we can't aggregate over 80kb in 5s, the database is far from usable. if err := conn.QueryRowContext(ctx, `SELECT count(*) FROM tinybank.bank`).Scan(&count); err != nil { return err } - c.l.Printf("read %d rows in %0.1fs\n", count, timeutil.Since(start).Seconds()) - t.WorkerProgress(float64(i+1) / float64(minutes)) + select { case <-after: case <-ctx.Done(): return ctx.Err() } } - // TODO(benesch): verify that every last range in the table has been - // merged away. For now, just exercising the merge code is a good start. + // TODO(tschottdorf): verify that disk space usage drops below to , but that + // may not actually happen (see https://github.com/cockroachdb/cockroach/issues/29290). 
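// NB: the loop above keeps running while we are still before the deadline *or*
// while more than one bank range remains, i.e. it only exits once the minimum
// duration has elapsed and the dropped table has been merged down to a single
// range.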
return nil }) m.Wait() diff --git a/pkg/cmd/roachtest/drop.go b/pkg/cmd/roachtest/drop.go index 7515131c7565..d0f59be58325 100644 --- a/pkg/cmd/roachtest/drop.go +++ b/pkg/cmd/roachtest/drop.go @@ -62,11 +62,11 @@ func registerDrop(r *registry) { stmt = fmt.Sprintf(stmtStr, "", "=") } t.WorkerStatus(stmt) - _, err := db.ExecContext(ctx, stmt) + _, err := db.ExecContext(ctx, stmt, args...) if err != nil && maybeExperimental && strings.Contains(err.Error(), "syntax error") { stmt = fmt.Sprintf(stmtStr, "EXPERIMENTAL", "") t.WorkerStatus(stmt) - _, err = db.ExecContext(ctx, stmt) + _, err = db.ExecContext(ctx, stmt, args...) } if err != nil { t.Fatal(err) diff --git a/pkg/cmd/roachtest/rapid_restart.go b/pkg/cmd/roachtest/rapid_restart.go index b8f02167b7b3..299809d62579 100644 --- a/pkg/cmd/roachtest/rapid_restart.go +++ b/pkg/cmd/roachtest/rapid_restart.go @@ -61,38 +61,36 @@ func runRapidRestart(ctx context.Context, t *test, c *cluster) { } waitTime := time.Duration(rand.Int63n(int64(time.Second))) - if !c.isLocal() { - // TODO(peter): This is hacky: the signal might be sent before the - // cockroach process starts, which is especially true on remote - // clusters. Perhaps combine this with a monitor so that we can detect - // as soon as the process starts before killing it. Or a custom kill - // script which loops looking for a cockroach process and kills it as - // soon as it appears. Using --pid_file or --background isn't quite - // right as we want to be able to kill the process before it is ready. - waitTime += time.Second - } time.Sleep(waitTime) sig := [2]string{"2", "9"}[rand.Intn(2)] - c.Stop(ctx, nodes, stopArgs("--sig="+sig)) - select { - case <-ctx.Done(): - return - case err := <-exitCh: - cause := errors.Cause(err) - if exitErr, ok := cause.(*exec.ExitError); ok { - switch status := sysutil.ExitStatus(exitErr); status { - case -1: - // Received SIGINT before setting up our own signal handlers or - // SIGKILL. - case 1: - // Exit code from a SIGINT received by our signal handlers. - default: - t.Fatalf("unexpected exit status %d", status) - } - } else { - t.Fatalf("unexpected exit err: %v", err) + + var err error + for err == nil { + c.Stop(ctx, nodes, stopArgs("--sig="+sig)) + select { + case <-ctx.Done(): + return + case err = <-exitCh: + case <-time.After(10 * time.Second): + // We likely ended up killing before the process spawned. + // Loop around. + c.l.Printf("no exit status yet, killing again") + } + } + cause := errors.Cause(err) + if exitErr, ok := cause.(*exec.ExitError); ok { + switch status := sysutil.ExitStatus(exitErr); status { + case -1: + // Received SIGINT before setting up our own signal handlers or + // SIGKILL. + case 1: + // Exit code from a SIGINT received by our signal handlers. 
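// (For context: an exit status of -1 is the conventional value reported when the
// process was terminated by a signal it did not catch, which is why both the
// "killed before our handlers were installed" case and SIGKILL land here; this
// assumes sysutil.ExitStatus mirrors syscall.WaitStatus.ExitStatus.)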
+ default: + t.Fatalf("unexpected exit status %d", status) } + } else { + t.Fatalf("unexpected exit err: %v", err) } } diff --git a/pkg/cmd/roachtest/test.go b/pkg/cmd/roachtest/test.go index 4029f9023eb5..fc964e62ad64 100644 --- a/pkg/cmd/roachtest/test.go +++ b/pkg/cmd/roachtest/test.go @@ -682,8 +682,8 @@ func (r *registry) run( } if err := issues.Post( context.Background(), - fmt.Sprintf("roachtest: %s failed on %s", t.Name(), branch), - "roachtest", t.Name(), string(output), authorEmail, + fmt.Sprintf("roachtest: %s failed", t.Name()), + "roachtest", t.Name(), "The test failed on "+branch+":\n"+string(output), authorEmail, ); err != nil { fmt.Fprintf(r.out, "failed to post issue: %s\n", err) } @@ -776,13 +776,17 @@ func (r *registry) run( select { case <-time.After(timeout): - t.printf("test timed out (%s)", timeout) + t.printf("test timed out (%s)\n", timeout) if c != nil { c.FetchLogs(ctx) // NB: c.destroyed is nil for cloned clusters (i.e. in subtests). if !debugEnabled && c.destroyed != nil { c.Destroy(ctx) } + if local { + t.printf("waiting for test to tear down since cluster is local\n") + <-done + } } case <-done: } diff --git a/pkg/cmd/roachtest/upgrade.go b/pkg/cmd/roachtest/upgrade.go index 702afaae7a6c..d41da59115b7 100644 --- a/pkg/cmd/roachtest/upgrade.go +++ b/pkg/cmd/roachtest/upgrade.go @@ -48,17 +48,7 @@ func registerUpgrade(r *registry) { c.Put(ctx, b, "./cockroach", c.Range(1, nodes)) // Force disable encryption. // TODO(mberhault): allow it once oldVersion >= 2.1. - start := func() { - c.Start(ctx, c.Range(1, nodes), startArgsDontEncrypt) - } - start() - time.Sleep(5 * time.Second) - - // TODO(tschottdorf): this is a hack similar to the one in the mixed version - // test. Remove it when we have a 2.0.x binary that has #27639 fixed. 
- c.Stop(ctx, c.Range(1, nodes)) - start() - time.Sleep(5 * time.Second) + c.Start(ctx, c.Range(1, nodes), startArgsDontEncrypt) const stageDuration = 30 * time.Second const timeUntilStoreDead = 90 * time.Second @@ -266,7 +256,7 @@ func registerUpgrade(r *registry) { } } - const oldVersion = "v2.0.0" + const oldVersion = "v2.0.5" for _, n := range []int{5} { r.Add(testSpec{ Name: fmt.Sprintf("upgrade/oldVersion=%s/nodes=%d", oldVersion, n), diff --git a/pkg/storage/client_raft_test.go b/pkg/storage/client_raft_test.go index 2890198bcac6..54a249b03c17 100644 --- a/pkg/storage/client_raft_test.go +++ b/pkg/storage/client_raft_test.go @@ -3193,7 +3193,7 @@ func TestReplicateRemovedNodeDisruptiveElection(t *testing.T) { default: t.Fatalf("unexpected error type %T: %s", pErr.GetDetail(), pErr) } - case <-time.After(5 * time.Second): + case <-time.After(45 * time.Second): t.Fatal("did not get expected error") } diff --git a/pkg/storage/closed_timestamp_test.go b/pkg/storage/closed_timestamp_test.go index dca2215f4f34..44b2ccca5c95 100644 --- a/pkg/storage/closed_timestamp_test.go +++ b/pkg/storage/closed_timestamp_test.go @@ -16,6 +16,7 @@ package storage_test import ( "context" + "fmt" "testing" "time" @@ -28,6 +29,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/storage/engine/enginepb" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/timeutil" @@ -35,7 +37,13 @@ import ( func TestClosedTimestampCanServe(t *testing.T) { defer leaktest.AfterTest(t)() - t.Skip("https://github.com/cockroachdb/cockroach/issues/28607") + + if util.RaceEnabled { + // Limiting how long transactions can run does not work + // well with race unless we're extremely lenient, which + // drives up the test duration. + t.Skip("skipping under race") + } ctx := context.Background() const numNodes = 3 @@ -47,13 +55,16 @@ func TestClosedTimestampCanServe(t *testing.T) { // Every 0.1s=100ms, try close out a timestamp ~300ms in the past. // We don't want to be more aggressive than that since it's also // a limit on how long transactions can run. 
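// With the values below that works out to one close attempt roughly every
// closeFraction * targetDuration = 0.3 * 300ms = 90ms, with each closed
// timestamp trailing real time by about targetDuration (assuming the close
// fraction is interpreted as a fraction of the target duration).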
- if _, err := db0.Exec(` -SET CLUSTER SETTING kv.closed_timestamp.target_duration = '300ms'; -SET CLUSTER SETTING kv.closed_timestamp.close_fraction = 0.1/0.3; + targetDuration := 300 * time.Millisecond + closeFraction := 0.3 + + if _, err := db0.Exec(fmt.Sprintf(` +SET CLUSTER SETTING kv.closed_timestamp.target_duration = '%s'; +SET CLUSTER SETTING kv.closed_timestamp.close_fraction = %.3f; SET CLUSTER SETTING kv.closed_timestamp.follower_reads_enabled = true; CREATE DATABASE cttest; CREATE TABLE cttest.kv (id INT PRIMARY KEY, value STRING); -`); err != nil { +`, targetDuration, closeFraction)); err != nil { t.Fatal(err) } diff --git a/pkg/storage/consistency_queue.go b/pkg/storage/consistency_queue.go index bbcf04ab4430..d3884551b2da 100644 --- a/pkg/storage/consistency_queue.go +++ b/pkg/storage/consistency_queue.go @@ -22,6 +22,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/gossip" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings" + "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/grpcutil" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/log" @@ -33,6 +34,8 @@ var consistencyCheckInterval = settings.RegisterNonNegativeDurationSetting( 24*time.Hour, ) +var testingAggressiveConsistencyChecks = envutil.EnvOrDefaultBool("COCKROACH_CONSISTENCY_AGGRESSIVE", false) + type consistencyQueue struct { *baseQueue interval func() time.Duration diff --git a/pkg/storage/gossip_test.go b/pkg/storage/gossip_test.go index 318cf5aadf0b..6e7487bd8754 100644 --- a/pkg/storage/gossip_test.go +++ b/pkg/storage/gossip_test.go @@ -41,15 +41,22 @@ func TestGossipFirstRange(t *testing.T) { }) defer tc.Stopper().Stop(context.TODO()) - errors := make(chan error) + errors := make(chan error, 1) descs := make(chan *roachpb.RangeDescriptor) unregister := tc.Servers[0].Gossip().RegisterCallback(gossip.KeyFirstRangeDescriptor, func(_ string, content roachpb.Value) { var desc roachpb.RangeDescriptor if err := content.GetProto(&desc); err != nil { - errors <- err + select { + case errors <- err: + default: + } } else { - descs <- &desc + select { + case descs <- &desc: + case <-time.After(45 * time.Second): + t.Logf("had to drop descriptor %+v", desc) + } } }, // Redundant callbacks are required by this test. diff --git a/pkg/storage/merge_queue.go b/pkg/storage/merge_queue.go index 1b06fd995034..7001d1da0ae5 100644 --- a/pkg/storage/merge_queue.go +++ b/pkg/storage/merge_queue.go @@ -269,6 +269,11 @@ func (mq *mergeQueue) process( // as purgatory-worthy. 
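// (For context: "purgatory" is the queue's holding area for replicas whose
// processing failed with an error that is expected to clear up on its own;
// such replicas are retried later rather than being counted as hard failures.)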
return rangeMergePurgatoryError{err} } + if testingAggressiveConsistencyChecks { + if err := mq.store.consistencyQueue.process(ctx, lhsRepl, sysCfg); err != nil { + log.Warning(ctx, err) + } + } return nil } diff --git a/pkg/storage/replica_test.go b/pkg/storage/replica_test.go index 1fdcd759b9b6..947316f27a37 100644 --- a/pkg/storage/replica_test.go +++ b/pkg/storage/replica_test.go @@ -7861,7 +7861,10 @@ func TestReplicaIDChangePending(t *testing.T) { repl.mu.Lock() repl.mu.submitProposalFn = func(p *ProposalData) error { if p.Request.Timestamp == magicTS { - commandProposed <- struct{}{} + select { + case commandProposed <- struct{}{}: + default: + } } return nil } diff --git a/pkg/storage/replicate_queue.go b/pkg/storage/replicate_queue.go index ca4df3dd537f..659859a4150e 100644 --- a/pkg/storage/replicate_queue.go +++ b/pkg/storage/replicate_queue.go @@ -265,6 +265,11 @@ func (rq *replicateQueue) process( // Enqueue this replica again to see if there are more changes to be made. rq.MaybeAdd(repl, rq.store.Clock().Now()) } + if testingAggressiveConsistencyChecks { + if err := rq.store.consistencyQueue.process(ctx, repl, sysCfg); err != nil { + log.Warning(ctx, err) + } + } return nil } return errors.Errorf("failed to replicate after %d retries", retryOpts.MaxRetries) diff --git a/pkg/testutils/testcluster/testcluster.go b/pkg/testutils/testcluster/testcluster.go index 030301ae04c1..e217b06804c1 100644 --- a/pkg/testutils/testcluster/testcluster.go +++ b/pkg/testutils/testcluster/testcluster.go @@ -348,37 +348,43 @@ func (tc *TestCluster) AddReplicas( startKey roachpb.Key, targets ...roachpb.ReplicationTarget, ) (roachpb.RangeDescriptor, error) { rKey := keys.MustAddr(startKey) - rangeDesc, err := tc.changeReplicas( - roachpb.ADD_REPLICA, rKey, targets..., - ) - if err != nil { - return roachpb.RangeDescriptor{}, err - } + errRetry := errors.Errorf("target not found") + for { + rangeDesc, err := tc.changeReplicas( + roachpb.ADD_REPLICA, rKey, targets..., + ) + if err != nil { + return roachpb.RangeDescriptor{}, err + } - // Wait for the replication to complete on all destination nodes. - if err := retry.ForDuration(time.Second*5, func() error { - for _, target := range targets { - // Use LookupReplica(keys) instead of GetRange(rangeID) to ensure that the - // snapshot has been transferred and the descriptor initialized. - store, err := tc.findMemberStore(target.StoreID) - if err != nil { - log.Errorf(context.TODO(), "unexpected error: %s", err) - return err - } - repl := store.LookupReplica(rKey) - if repl == nil { - return errors.Errorf("range not found on store %d", target) - } - desc := repl.Desc() - if _, ok := desc.GetReplicaDescriptor(target.StoreID); !ok { - return errors.Errorf("target store %d not yet in range descriptor %v", target.StoreID, desc) + // Wait for the replication to complete on all destination nodes. + if err := retry.ForDuration(time.Second*25, func() error { + for _, target := range targets { + // Use LookupReplica(keys) instead of GetRange(rangeID) to ensure that the + // snapshot has been transferred and the descriptor initialized. 
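// The errRetry sentinel wrapped below is what lets the enclosing loop use
// errors.Cause to distinguish "the target was downreplicated again, restart
// the whole AddReplicas attempt" from errors that should fail the call outright.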
+ store, err := tc.findMemberStore(target.StoreID) + if err != nil { + log.Errorf(context.TODO(), "unexpected error: %s", err) + return err + } + repl := store.LookupReplica(rKey) + if repl == nil { + return errors.Wrapf(errRetry, "for target %s", target) + } + desc := repl.Desc() + if _, ok := desc.GetReplicaDescriptor(target.StoreID); !ok { + return errors.Errorf("target store %d not yet in range descriptor %v", target.StoreID, desc) + } } + return nil + }); errors.Cause(err) == errRetry { + log.Warningf(context.Background(), "target was likely downreplicated again; retrying after %s", err) + continue + } else if err != nil { + return roachpb.RangeDescriptor{}, err } - return nil - }); err != nil { - return roachpb.RangeDescriptor{}, err + return rangeDesc, nil } - return rangeDesc, nil } // RemoveReplicas is part of the TestServerInterface.