From ef1f5b05bebe6fde4adcfb277fb9c71e13192e4b Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 6 Sep 2018 15:19:57 +0200 Subject: [PATCH 01/16] storage: add env var for aggressive consistency checks We saw a consistency failure in #29252 that would've been much more useful had it occurred close to the time around which the inconsistency must have been introduced. Instead of leaving it to chance, add a switch that runs aggressive checks in (roach) tests that want them such as the clearrange test. Release note: None --- pkg/storage/consistency_queue.go | 3 +++ pkg/storage/merge_queue.go | 5 +++++ pkg/storage/replicate_queue.go | 5 +++++ 3 files changed, 13 insertions(+) diff --git a/pkg/storage/consistency_queue.go b/pkg/storage/consistency_queue.go index bbcf04ab4430..d3884551b2da 100644 --- a/pkg/storage/consistency_queue.go +++ b/pkg/storage/consistency_queue.go @@ -22,6 +22,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/gossip" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings" + "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/grpcutil" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/log" @@ -33,6 +34,8 @@ var consistencyCheckInterval = settings.RegisterNonNegativeDurationSetting( 24*time.Hour, ) +var testingAggressiveConsistencyChecks = envutil.EnvOrDefaultBool("COCKROACH_CONSISTENCY_AGGRESSIVE", false) + type consistencyQueue struct { *baseQueue interval func() time.Duration diff --git a/pkg/storage/merge_queue.go b/pkg/storage/merge_queue.go index 1b06fd995034..7001d1da0ae5 100644 --- a/pkg/storage/merge_queue.go +++ b/pkg/storage/merge_queue.go @@ -269,6 +269,11 @@ func (mq *mergeQueue) process( // as purgatory-worthy. return rangeMergePurgatoryError{err} } + if testingAggressiveConsistencyChecks { + if err := mq.store.consistencyQueue.process(ctx, lhsRepl, sysCfg); err != nil { + log.Warning(ctx, err) + } + } return nil } diff --git a/pkg/storage/replicate_queue.go b/pkg/storage/replicate_queue.go index ca4df3dd537f..659859a4150e 100644 --- a/pkg/storage/replicate_queue.go +++ b/pkg/storage/replicate_queue.go @@ -265,6 +265,11 @@ func (rq *replicateQueue) process( // Enqueue this replica again to see if there are more changes to be made. rq.MaybeAdd(repl, rq.store.Clock().Now()) } + if testingAggressiveConsistencyChecks { + if err := rq.store.consistencyQueue.process(ctx, repl, sysCfg); err != nil { + log.Warning(ctx, err) + } + } return nil } return errors.Errorf("failed to replicate after %d retries", retryOpts.MaxRetries) From d7e8c8b971a7061653f4790d8554c8d942f6bc4f Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 6 Sep 2018 17:53:01 +0200 Subject: [PATCH 02/16] roachtest: better merge testing in clearrange Unfortunately, the method to determine the range count is quite slow since crdb_internal.ranges internally sends an RPC for each range to determine the leaseholder. Anecdotally, I've seen ~25% of the merges completed after less than 15 minutes. I know that it's slowing down over time, but @benesch will fix that. Also throws in aggressive consistency checks so that when something goes out of sync, we find out right there. 
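As a rough illustration (not the exact test code; the helper name is made up and it assumes the standard context and database/sql packages plus a hex-encoded start key captured before the DROP), the per-poll range count boils down to a query like the one in the diff below:

    // Count the ranges whose start key still carries the dropped table's
    // prefix. Each call is slow because crdb_internal.ranges resolves the
    // leaseholder for every range via an RPC.
    func numRangesWithPrefix(ctx context.Context, db *sql.DB, startHex string) (int, error) {
        var n int
        err := db.QueryRowContext(ctx,
            `SELECT count(*) FROM crdb_internal.ranges
             WHERE substr(to_hex(start_key), 1, length($1::string)) = $1`, startHex,
        ).Scan(&n)
        return n, err
    }

The test calls this in a loop and reports progress as the count approaches one.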
Release note: None --- pkg/cmd/roachtest/clearrange.go | 85 ++++++++++++++++++++++++++------- 1 file changed, 68 insertions(+), 17 deletions(-) diff --git a/pkg/cmd/roachtest/clearrange.go b/pkg/cmd/roachtest/clearrange.go index af143a66f0c4..f83b1b776dc6 100644 --- a/pkg/cmd/roachtest/clearrange.go +++ b/pkg/cmd/roachtest/clearrange.go @@ -23,6 +23,8 @@ import ( ) func registerClearRange(r *registry) { + const aggressiveConsistencyChecks = true + r.Add(testSpec{ Name: `clearrange`, MinVersion: `v2.1.0`, @@ -52,7 +54,13 @@ func registerClearRange(r *registry) { } c.Put(ctx, cockroach, "./cockroach") - c.Start(ctx) + if aggressiveConsistencyChecks { + // Run with an env var that runs a synchronous consistency check after each rebalance and merge. + // This slows down merges, so it might hide some races. + c.Start(ctx, startArgs("--env=COCKROACH_CONSISTENCY_AGGRESSIVE=true")) + } else { + c.Start(ctx) + } // Also restore a much smaller table. We'll use it to run queries against // the cluster after having dropped the large table above, verifying that @@ -68,6 +76,39 @@ func registerClearRange(r *registry) { t.Status() + // Set up a convenience function that we can call to learn the number of + // ranges for the bank.bank table (even after it's been dropped). + numBankRanges := func() func() int { + conn := c.Conn(ctx, 1) + defer conn.Close() + + var startHex string + // NB: set this to false to save yourself some time during development. Selecting + // from crdb_internal.ranges is very slow because it contacts all of the leaseholders. + // You may actually want to run a version of cockroach that doesn't do that because + // it'll still slow you down every time the method returned below is called. + if true { + if err := conn.QueryRow( + `SELECT to_hex(start_key) FROM crdb_internal.ranges WHERE "database" = 'bank' AND "table" = 'bank' ORDER BY start_key ASC LIMIT 1`, + ).Scan(&startHex); err != nil { + t.Fatal(err) + } + } else { + startHex = "bd" // extremely likely to be the right thing (b'\275'). + } + return func() int { + conn := c.Conn(ctx, 1) + defer conn.Close() + var n int + if err := conn.QueryRow( + `SELECT count(*) FROM crdb_internal.ranges WHERE substr(to_hex(start_key), 1, length($1::string)) = $1`, startHex, + ).Scan(&n); err != nil { + t.Fatal(err) + } + return n + } + }() + m := newMonitor(ctx, c) m.Go(func(ctx context.Context) error { conn := c.Conn(ctx, 1) @@ -77,6 +118,11 @@ func registerClearRange(r *registry) { return err } + // Merge as fast as possible to put maximum stress on the system. + if _, err := conn.ExecContext(ctx, `SET CLUSTER SETTING kv.range_merge.queue_interval = '0s'`); err != nil { + return err + } + t.WorkerStatus("dropping table") defer t.WorkerStatus() @@ -86,41 +132,46 @@ func registerClearRange(r *registry) { return err } + t.WorkerStatus("computing number of ranges") + initialBankRanges := numBankRanges() + + t.WorkerStatus("dropping bank table") if _, err := conn.ExecContext(ctx, `DROP TABLE bank.bank`); err != nil { return err } - // Spend a few minutes reading data with a timeout to make sure the + // Spend some time reading data with a timeout to make sure the // DROP above didn't brick the cluster. At the time of writing, - // clearing all of the table data takes ~6min. We run for 2.5x that - // time to verify that nothing has gone wonky on the cluster. - // - // Don't lower this number, or the test may pass erroneously. 
- const minutes = 45 - t.WorkerStatus("repeatedly running count(*) on small table") - for i := 0; i < minutes; i++ { - after := time.After(time.Minute) + // clearing all of the table data takes ~6min, so we want to run + // for at least a multiple of that duration. + const minDuration = 45 * time.Minute + deadline := timeutil.Now().Add(minDuration) + curBankRanges := numBankRanges() + t.WorkerStatus("waiting for ~", curBankRanges, " merges to complete (and for at least ", minDuration, " to pass)") + for timeutil.Now().Before(deadline) || curBankRanges > 1 { + after := time.After(5 * time.Minute) + curBankRanges = numBankRanges() // this call takes minutes, unfortunately + t.WorkerProgress(1 - float64(curBankRanges)/float64(initialBankRanges)) + var count int // NB: context cancellation in QueryRowContext does not work as expected. // See #25435. - if _, err := conn.ExecContext(ctx, `SET statement_timeout = '10s'`); err != nil { + if _, err := conn.ExecContext(ctx, `SET statement_timeout = '5s'`); err != nil { return err } - // If we can't aggregate over 80kb in 10s, the database is far from usable. - start := timeutil.Now() + // If we can't aggregate over 80kb in 5s, the database is far from usable. if err := conn.QueryRowContext(ctx, `SELECT count(*) FROM tinybank.bank`).Scan(&count); err != nil { return err } - c.l.Printf("read %d rows in %0.1fs\n", count, timeutil.Since(start).Seconds()) - t.WorkerProgress(float64(i+1) / float64(minutes)) + select { case <-after: case <-ctx.Done(): return ctx.Err() } } - // TODO(benesch): verify that every last range in the table has been - // merged away. For now, just exercising the merge code is a good start. + // TODO(tschottdorf): verify that disk space usage drops below to , but that + // may not actually happen (see https://github.com/cockroachdb/cockroach/issues/29290). return nil }) m.Wait() From 09c5df7383af9bf5bcfb9e2ec4348bfc96ec09b7 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Wed, 19 Sep 2018 19:50:19 +0200 Subject: [PATCH 03/16] issues: truncate more eagerly Nobody likes scrolling for minutes at a time. Release note: None --- pkg/cmd/internal/issues/issues.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/cmd/internal/issues/issues.go b/pkg/cmd/internal/issues/issues.go index ec4510b7b63c..30dee21a43b4 100644 --- a/pkg/cmd/internal/issues/issues.go +++ b/pkg/cmd/internal/issues/issues.go @@ -50,11 +50,12 @@ var ( stacktraceRE = regexp.MustCompile(`(?m:^goroutine\s\d+)`) ) -// Based on the following observed API response: +// Based on the following observed API response the maximum here is 1<<16-1 +// (but we stay way below that as nobody likes to scroll for pages and pages). // // 422 Validation Failed [{Resource:Issue Field:body Code:custom Message:body // is too long (maximum is 65536 characters)}] -const githubIssueBodyMaximumLength = 1<<16 - 1 +const githubIssueBodyMaximumLength = 5000 // trimIssueRequestBody trims message such that the total size of an issue body // is less than githubIssueBodyMaximumLength. usedCharacters specifies the From b7b7a947a5805ffa3a250fa757abe3db5a7f592d Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Wed, 19 Sep 2018 19:51:11 +0200 Subject: [PATCH 04/16] issues: use same message for new and existing issues Previously, the message for existing issues didn't contain the output. This was annoying as you care about the most recent failures first and foremost, typically. 
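Schematically (simplified from the diff below, with error handling and the remaining request fields elided), both paths now render the same body:

    // Build one body string -- including the trimmed test output -- and
    // reuse it for the new-issue request and for the comment posted on an
    // already-open issue.
    b := body(packageName, testName, message)
    issueRequest := &github.IssueRequest{Title: &title, Body: &b}
    issueComment := &github.IssueComment{Body: &b}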
Release note: None --- pkg/cmd/internal/issues/issues.go | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pkg/cmd/internal/issues/issues.go b/pkg/cmd/internal/issues/issues.go index 30dee21a43b4..eda29fff7b16 100644 --- a/pkg/cmd/internal/issues/issues.go +++ b/pkg/cmd/internal/issues/issues.go @@ -274,27 +274,31 @@ make stress TESTS=%[5]s PKG=%[4]s TESTTIMEOUT=5m STRESSFLAGS='-stderr=false -max Failed test: %[3]s` const messageTemplate = "\n\n```\n%s\n```" - newIssueRequest := func(packageName, testName, message, assignee string) *github.IssueRequest { + body := func(packageName, testName, message string) string { body := fmt.Sprintf(bodyTemplate, p.sha, p.parameters(), p.teamcityURL(), packageName, testName) + messageTemplate // We insert a raw "%s" above so we can figure out the length of the // body so far, without the actual error text. We need this length so we // can calculate the maximum amount of error text we can include in the // issue without exceeding GitHub's limit. We replace that %s in the // following Sprintf. - body = fmt.Sprintf(body, trimIssueRequestBody(message, len(body))) + return fmt.Sprintf(body, trimIssueRequestBody(message, len(body))) + } + + newIssueRequest := func(packageName, testName, message, assignee string) *github.IssueRequest { + b := body(packageName, testName, message) return &github.IssueRequest{ Title: &title, - Body: &body, + Body: &b, Labels: &issueLabels, Assignee: &assignee, Milestone: p.milestone, } } - newIssueComment := func(packageName, testName string) *github.IssueComment { - body := fmt.Sprintf(bodyTemplate, p.sha, p.parameters(), p.teamcityURL(), packageName, testName) - return &github.IssueComment{Body: &body} + newIssueComment := func(packageName, testName, message string) *github.IssueComment { + b := body(packageName, testName, message) + return &github.IssueComment{Body: &b} } assignee, err := getAssignee(ctx, authorEmail, p.listCommits) @@ -332,7 +336,7 @@ Failed test: %[3]s` github.Stringify(issueRequest)) } } else { - comment := newIssueComment(packageName, testName) + comment := newIssueComment(packageName, testName, message) if _, _, err := p.createComment( ctx, githubUser, githubRepo, *foundIssue, comment); err != nil { return errors.Wrapf(err, "failed to update issue #%d with %s", From 9d7c3d99ef65742afed863d0d2379d38e70b673c Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Wed, 19 Sep 2018 19:39:05 +0200 Subject: [PATCH 05/16] roachtest: don't disambiguate issue title by branch This tends to create duplicate issues for a similar root cause. I think it's better to collect failures from different branches in an umbrella issue, even though it implies that we need to be more careful about closing them. 
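Concretely, the branch moves from the issue title into the body; roughly (cf. the two-line change below):

    // Keep the issue title stable across branches so failures land on one issue.
    title := fmt.Sprintf("roachtest: %s failed", t.Name())
    body := "The test failed on " + branch + ":\n" + string(output)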
Release note: None --- pkg/cmd/roachtest/test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/cmd/roachtest/test.go b/pkg/cmd/roachtest/test.go index 4029f9023eb5..f66e67727b26 100644 --- a/pkg/cmd/roachtest/test.go +++ b/pkg/cmd/roachtest/test.go @@ -682,8 +682,8 @@ func (r *registry) run( } if err := issues.Post( context.Background(), - fmt.Sprintf("roachtest: %s failed on %s", t.Name(), branch), - "roachtest", t.Name(), string(output), authorEmail, + fmt.Sprintf("roachtest: %s failed", t.Name()), + "roachtest", t.Name(), "The test failed on "+branch+":\n"+string(output), authorEmail, ); err != nil { fmt.Fprintf(r.out, "failed to post issue: %s\n", err) } From cf35b277a17a0325bca9b1dab62f393b1fd67b86 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 20 Sep 2018 16:21:21 +0200 Subject: [PATCH 06/16] roachtest: mark acceptance as stable all of its subtests are already stable, but in running a test locally I noticed that the top-level test was marked as passing as unstable. I'm not sure, but this might mean that the top-level test would actually not fail? Either way, better to mark it as stable explicitly. We should also spend some thought on how diverging notions of Stable in sub vs top level test are treated, not sure that this is well-defined. Release note: None --- pkg/cmd/roachtest/acceptance.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/cmd/roachtest/acceptance.go b/pkg/cmd/roachtest/acceptance.go index 7dd4a21adddb..8ebff00081c9 100644 --- a/pkg/cmd/roachtest/acceptance.go +++ b/pkg/cmd/roachtest/acceptance.go @@ -25,8 +25,9 @@ func registerAcceptance(r *registry) { // local mode the acceptance tests should be configured to run within a // minute or so as these tests are run on every merge to master. spec := testSpec{ - Name: "acceptance", - Nodes: nodes(4), + Name: "acceptance", + Nodes: nodes(4), + Stable: true, // DO NOT COPY to new tests } testCases := []struct { From 9670204dd24b96b6ab66f35aaabd0e61806f2549 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 20 Sep 2018 10:42:25 +0200 Subject: [PATCH 07/16] roachtest: remove now-unnecessary hack Closes #27717. Release note: None --- pkg/cmd/roachtest/upgrade.go | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/pkg/cmd/roachtest/upgrade.go b/pkg/cmd/roachtest/upgrade.go index 702afaae7a6c..d41da59115b7 100644 --- a/pkg/cmd/roachtest/upgrade.go +++ b/pkg/cmd/roachtest/upgrade.go @@ -48,17 +48,7 @@ func registerUpgrade(r *registry) { c.Put(ctx, b, "./cockroach", c.Range(1, nodes)) // Force disable encryption. // TODO(mberhault): allow it once oldVersion >= 2.1. - start := func() { - c.Start(ctx, c.Range(1, nodes), startArgsDontEncrypt) - } - start() - time.Sleep(5 * time.Second) - - // TODO(tschottdorf): this is a hack similar to the one in the mixed version - // test. Remove it when we have a 2.0.x binary that has #27639 fixed. 
- c.Stop(ctx, c.Range(1, nodes)) - start() - time.Sleep(5 * time.Second) + c.Start(ctx, c.Range(1, nodes), startArgsDontEncrypt) const stageDuration = 30 * time.Second const timeUntilStoreDead = 90 * time.Second @@ -266,7 +256,7 @@ func registerUpgrade(r *registry) { } } - const oldVersion = "v2.0.0" + const oldVersion = "v2.0.5" for _, n := range []int{5} { r.Add(testSpec{ Name: fmt.Sprintf("upgrade/oldVersion=%s/nodes=%d", oldVersion, n), From 15da15662ffebd81e6a1b5312c256a12ad8ea012 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 20 Sep 2018 10:51:26 +0200 Subject: [PATCH 08/16] storage: give TestReplicateRemovedNodeDisruptiveElection more time Perhaps: Fixes #27253. Release note: None --- pkg/storage/client_raft_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/storage/client_raft_test.go b/pkg/storage/client_raft_test.go index 2890198bcac6..54a249b03c17 100644 --- a/pkg/storage/client_raft_test.go +++ b/pkg/storage/client_raft_test.go @@ -3193,7 +3193,7 @@ func TestReplicateRemovedNodeDisruptiveElection(t *testing.T) { default: t.Fatalf("unexpected error type %T: %s", pErr.GetDetail(), pErr) } - case <-time.After(5 * time.Second): + case <-time.After(45 * time.Second): t.Fatal("did not get expected error") } From b2b17a1b323f787cdc16d893d54e315aca2dd8d5 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 20 Sep 2018 11:11:03 +0200 Subject: [PATCH 09/16] storage: de-flake TestReplicaIDChangePending setReplicaID refreshes the proposal and was thus synchronously writing to the commandProposed chan. This channel could have filled up due to an earlier reproposal already, deadlocking the test. Fixes #28132. Release note: None --- pkg/storage/replica_test.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/storage/replica_test.go b/pkg/storage/replica_test.go index 1fdcd759b9b6..947316f27a37 100644 --- a/pkg/storage/replica_test.go +++ b/pkg/storage/replica_test.go @@ -7861,7 +7861,10 @@ func TestReplicaIDChangePending(t *testing.T) { repl.mu.Lock() repl.mu.submitProposalFn = func(p *ProposalData) error { if p.Request.Timestamp == magicTS { - commandProposed <- struct{}{} + select { + case commandProposed <- struct{}{}: + default: + } } return nil } From f68fa47b68cfbb118b25849d3b73dfcf374740c7 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 20 Sep 2018 12:30:41 +0200 Subject: [PATCH 10/16] testcluster: improve AddReplicas check AddReplicas was verifying that a replica had indeed been added, but there's no guarantee that the replicate queue wouldn't have removed it in the meantime. Attempt to work around this somewhat. The real solution is not to provide that guarantee, but some tests likely rely on it (and the failure is extremely rare, i.e. the new for loop basically never runs). Observed in #28368. 
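The workaround is a sentinel-error retry around the add-and-verify step; schematically (simplified from the diff below, where addAndVerify stands in for changeReplicas plus the retry.ForDuration verification loop):

    errRetry := errors.Errorf("target not found")
    for {
        desc, err := addAndVerify() // returns errRetry (wrapped) if the new replica vanished again
        if errors.Cause(err) == errRetry {
            // The replicate queue likely removed the replica again; try once more.
            continue
        } else if err != nil {
            return roachpb.RangeDescriptor{}, err
        }
        return desc, nil
    }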
Release note: None --- pkg/testutils/testcluster/testcluster.go | 60 +++++++++++++----------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/pkg/testutils/testcluster/testcluster.go b/pkg/testutils/testcluster/testcluster.go index 030301ae04c1..e217b06804c1 100644 --- a/pkg/testutils/testcluster/testcluster.go +++ b/pkg/testutils/testcluster/testcluster.go @@ -348,37 +348,43 @@ func (tc *TestCluster) AddReplicas( startKey roachpb.Key, targets ...roachpb.ReplicationTarget, ) (roachpb.RangeDescriptor, error) { rKey := keys.MustAddr(startKey) - rangeDesc, err := tc.changeReplicas( - roachpb.ADD_REPLICA, rKey, targets..., - ) - if err != nil { - return roachpb.RangeDescriptor{}, err - } + errRetry := errors.Errorf("target not found") + for { + rangeDesc, err := tc.changeReplicas( + roachpb.ADD_REPLICA, rKey, targets..., + ) + if err != nil { + return roachpb.RangeDescriptor{}, err + } - // Wait for the replication to complete on all destination nodes. - if err := retry.ForDuration(time.Second*5, func() error { - for _, target := range targets { - // Use LookupReplica(keys) instead of GetRange(rangeID) to ensure that the - // snapshot has been transferred and the descriptor initialized. - store, err := tc.findMemberStore(target.StoreID) - if err != nil { - log.Errorf(context.TODO(), "unexpected error: %s", err) - return err - } - repl := store.LookupReplica(rKey) - if repl == nil { - return errors.Errorf("range not found on store %d", target) - } - desc := repl.Desc() - if _, ok := desc.GetReplicaDescriptor(target.StoreID); !ok { - return errors.Errorf("target store %d not yet in range descriptor %v", target.StoreID, desc) + // Wait for the replication to complete on all destination nodes. + if err := retry.ForDuration(time.Second*25, func() error { + for _, target := range targets { + // Use LookupReplica(keys) instead of GetRange(rangeID) to ensure that the + // snapshot has been transferred and the descriptor initialized. + store, err := tc.findMemberStore(target.StoreID) + if err != nil { + log.Errorf(context.TODO(), "unexpected error: %s", err) + return err + } + repl := store.LookupReplica(rKey) + if repl == nil { + return errors.Wrapf(errRetry, "for target %s", target) + } + desc := repl.Desc() + if _, ok := desc.GetReplicaDescriptor(target.StoreID); !ok { + return errors.Errorf("target store %d not yet in range descriptor %v", target.StoreID, desc) + } } + return nil + }); errors.Cause(err) == errRetry { + log.Warningf(context.Background(), "target was likely downreplicated again; retrying after %s", err) + continue + } else if err != nil { + return roachpb.RangeDescriptor{}, err } - return nil - }); err != nil { - return roachpb.RangeDescriptor{}, err + return rangeDesc, nil } - return rangeDesc, nil } // RemoveReplicas is part of the TestServerInterface. From 7fb33e42de3d60ff087b5e64c6f608a893a04053 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 20 Sep 2018 12:50:06 +0200 Subject: [PATCH 11/16] storage: unskip TestClosedTimestampCanServe for non-race Fixes #28607. 
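The only remaining opt-out is under the race detector, where the tight closed-timestamp target duration would force either flakiness or a much longer-running test; the gate (as in the diff below) is simply:

    if util.RaceEnabled {
        // Limiting how long transactions can run does not work well with
        // race unless we're extremely lenient, which drives up the test duration.
        t.Skip("skipping under race")
    }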
Release note: None --- pkg/storage/closed_timestamp_test.go | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/pkg/storage/closed_timestamp_test.go b/pkg/storage/closed_timestamp_test.go index dca2215f4f34..44b2ccca5c95 100644 --- a/pkg/storage/closed_timestamp_test.go +++ b/pkg/storage/closed_timestamp_test.go @@ -16,6 +16,7 @@ package storage_test import ( "context" + "fmt" "testing" "time" @@ -28,6 +29,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/storage/engine/enginepb" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/timeutil" @@ -35,7 +37,13 @@ import ( func TestClosedTimestampCanServe(t *testing.T) { defer leaktest.AfterTest(t)() - t.Skip("https://github.com/cockroachdb/cockroach/issues/28607") + + if util.RaceEnabled { + // Limiting how long transactions can run does not work + // well with race unless we're extremely lenient, which + // drives up the test duration. + t.Skip("skipping under race") + } ctx := context.Background() const numNodes = 3 @@ -47,13 +55,16 @@ func TestClosedTimestampCanServe(t *testing.T) { // Every 0.1s=100ms, try close out a timestamp ~300ms in the past. // We don't want to be more aggressive than that since it's also // a limit on how long transactions can run. - if _, err := db0.Exec(` -SET CLUSTER SETTING kv.closed_timestamp.target_duration = '300ms'; -SET CLUSTER SETTING kv.closed_timestamp.close_fraction = 0.1/0.3; + targetDuration := 300 * time.Millisecond + closeFraction := 0.3 + + if _, err := db0.Exec(fmt.Sprintf(` +SET CLUSTER SETTING kv.closed_timestamp.target_duration = '%s'; +SET CLUSTER SETTING kv.closed_timestamp.close_fraction = %.3f; SET CLUSTER SETTING kv.closed_timestamp.follower_reads_enabled = true; CREATE DATABASE cttest; CREATE TABLE cttest.kv (id INT PRIMARY KEY, value STRING); -`); err != nil { +`, targetDuration, closeFraction)); err != nil { t.Fatal(err) } From d2bcfb47add273efc5c0b87860591a0091655465 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Tue, 18 Sep 2018 18:30:16 +0200 Subject: [PATCH 12/16] roachtest: pass missing parameter Fixes #29665. Release note: None --- pkg/cmd/roachtest/drop.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/cmd/roachtest/drop.go b/pkg/cmd/roachtest/drop.go index 7515131c7565..d0f59be58325 100644 --- a/pkg/cmd/roachtest/drop.go +++ b/pkg/cmd/roachtest/drop.go @@ -62,11 +62,11 @@ func registerDrop(r *registry) { stmt = fmt.Sprintf(stmtStr, "", "=") } t.WorkerStatus(stmt) - _, err := db.ExecContext(ctx, stmt) + _, err := db.ExecContext(ctx, stmt, args...) if err != nil && maybeExperimental && strings.Contains(err.Error(), "syntax error") { stmt = fmt.Sprintf(stmtStr, "EXPERIMENTAL", "") t.WorkerStatus(stmt) - _, err = db.ExecContext(ctx, stmt) + _, err = db.ExecContext(ctx, stmt, args...) } if err != nil { t.Fatal(err) From e5a8b38fcd45ca937398fa642a9cc3a9d8dcb03d Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 20 Sep 2018 21:13:12 +0200 Subject: [PATCH 13/16] storage: de-hang TestGossipFirstRange The test could deadlock if the callback fired during shutdown due to a send on an unbuffered channel. Haven't been able to get this to fail after this commit. Fixes #29982. 
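The shape of the fix is the usual one for callbacks that may fire during shutdown: buffer the channel and make the send non-blocking (or bounded), e.g.:

    errCh := make(chan error, 1) // room for one error even if nobody is listening yet
    select {
    case errCh <- err:
    default:
        // An error is already pending; dropping this one is fine for the test.
    }

The descriptor channel gets the same treatment with a generous timeout instead of a default case, so a late callback logs and gives up rather than blocking shutdown forever.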
Release note: None --- pkg/storage/gossip_test.go | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pkg/storage/gossip_test.go b/pkg/storage/gossip_test.go index 318cf5aadf0b..6e7487bd8754 100644 --- a/pkg/storage/gossip_test.go +++ b/pkg/storage/gossip_test.go @@ -41,15 +41,22 @@ func TestGossipFirstRange(t *testing.T) { }) defer tc.Stopper().Stop(context.TODO()) - errors := make(chan error) + errors := make(chan error, 1) descs := make(chan *roachpb.RangeDescriptor) unregister := tc.Servers[0].Gossip().RegisterCallback(gossip.KeyFirstRangeDescriptor, func(_ string, content roachpb.Value) { var desc roachpb.RangeDescriptor if err := content.GetProto(&desc); err != nil { - errors <- err + select { + case errors <- err: + default: + } } else { - descs <- &desc + select { + case descs <- &desc: + case <-time.After(45 * time.Second): + t.Logf("had to drop descriptor %+v", desc) + } } }, // Redundant callbacks are required by this test. From 805fb113642bfd000e1fce06eb205db882d77116 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Fri, 21 Sep 2018 11:28:40 +0200 Subject: [PATCH 14/16] roachtest: squirrel away pre-teamcity logs The teamcity output is so much less pleasant to read. Better to just open the artifact sometimes. Release note: None --- build/teamcity-local-roachtest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/teamcity-local-roachtest.sh b/build/teamcity-local-roachtest.sh index 5a15c8339985..60e224df5d04 100755 --- a/build/teamcity-local-roachtest.sh +++ b/build/teamcity-local-roachtest.sh @@ -28,5 +28,5 @@ run build/builder.sh ./bin/roachtest run '(acceptance|kv/splits)' \ --cockroach "cockroach" \ --workload "bin/workload" \ --artifacts artifacts \ - --teamcity + --teamcity 2>&1 | tee artifacts/roachtest.log tc_end_block "Run local roachtests" From f581d78f317589f5f3cc5b723cb5c9d389ef1d9f Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Fri, 21 Sep 2018 11:40:42 +0200 Subject: [PATCH 15/16] roachtest: de-flake acceptance/rapid-restart The kill signal was sometimes a noop (when issued before the process to be killed started). Prior to this patch, that would leave the test stuck. Fixes #30475. Release note: None --- pkg/cmd/roachtest/rapid_restart.go | 54 ++++++++++++++---------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/pkg/cmd/roachtest/rapid_restart.go b/pkg/cmd/roachtest/rapid_restart.go index b8f02167b7b3..299809d62579 100644 --- a/pkg/cmd/roachtest/rapid_restart.go +++ b/pkg/cmd/roachtest/rapid_restart.go @@ -61,38 +61,36 @@ func runRapidRestart(ctx context.Context, t *test, c *cluster) { } waitTime := time.Duration(rand.Int63n(int64(time.Second))) - if !c.isLocal() { - // TODO(peter): This is hacky: the signal might be sent before the - // cockroach process starts, which is especially true on remote - // clusters. Perhaps combine this with a monitor so that we can detect - // as soon as the process starts before killing it. Or a custom kill - // script which loops looking for a cockroach process and kills it as - // soon as it appears. Using --pid_file or --background isn't quite - // right as we want to be able to kill the process before it is ready. 
- waitTime += time.Second - } time.Sleep(waitTime) sig := [2]string{"2", "9"}[rand.Intn(2)] - c.Stop(ctx, nodes, stopArgs("--sig="+sig)) - select { - case <-ctx.Done(): - return - case err := <-exitCh: - cause := errors.Cause(err) - if exitErr, ok := cause.(*exec.ExitError); ok { - switch status := sysutil.ExitStatus(exitErr); status { - case -1: - // Received SIGINT before setting up our own signal handlers or - // SIGKILL. - case 1: - // Exit code from a SIGINT received by our signal handlers. - default: - t.Fatalf("unexpected exit status %d", status) - } - } else { - t.Fatalf("unexpected exit err: %v", err) + + var err error + for err == nil { + c.Stop(ctx, nodes, stopArgs("--sig="+sig)) + select { + case <-ctx.Done(): + return + case err = <-exitCh: + case <-time.After(10 * time.Second): + // We likely ended up killing before the process spawned. + // Loop around. + c.l.Printf("no exit status yet, killing again") + } + } + cause := errors.Cause(err) + if exitErr, ok := cause.(*exec.ExitError); ok { + switch status := sysutil.ExitStatus(exitErr); status { + case -1: + // Received SIGINT before setting up our own signal handlers or + // SIGKILL. + case 1: + // Exit code from a SIGINT received by our signal handlers. + default: + t.Fatalf("unexpected exit status %d", status) } + } else { + t.Fatalf("unexpected exit err: %v", err) } } From 7f75981eb8390336be8ec85ec2885a5c0f2f03ed Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Fri, 21 Sep 2018 12:20:55 +0200 Subject: [PATCH 16/16] roachtest: wait for teardown if local test times out I suspect not doing so previously can cause issues like #30397. Local roachtests are simply not set up to run concurrently, but once a test times out we're doing that (though we attempt to teardown the cluster, tests can apparently still "do stuff" with it, for example the rapid restarts test was running one-off invocations of CockroachDB via Exec; also note the comment near the added lines suggesting that in local clusters we may not actually be destroying them). We can revisit this when we allow multiple local clusters in parallel, as we can then use a different one. Optimistically: Fixes #30397. Release note: None --- pkg/cmd/roachtest/test.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/cmd/roachtest/test.go b/pkg/cmd/roachtest/test.go index f66e67727b26..fc964e62ad64 100644 --- a/pkg/cmd/roachtest/test.go +++ b/pkg/cmd/roachtest/test.go @@ -776,13 +776,17 @@ func (r *registry) run( select { case <-time.After(timeout): - t.printf("test timed out (%s)", timeout) + t.printf("test timed out (%s)\n", timeout) if c != nil { c.FetchLogs(ctx) // NB: c.destroyed is nil for cloned clusters (i.e. in subtests). if !debugEnabled && c.destroyed != nil { c.Destroy(ctx) } + if local { + t.printf("waiting for test to tear down since cluster is local\n") + <-done + } } case <-done: }