cherry-pick-1.1: storage: add permitLargeSnapshots flag to replica #20906

Merged
6 changes: 1 addition & 5 deletions pkg/storage/client_raft_test.go
@@ -3635,13 +3635,9 @@ func TestInitRaftGroupOnRequest(t *testing.T) {
t.Fatal("replica should not be nil for RHS range")
}

// TODO(spencer): Raft messages seem to turn up
// occasionally on restart, which initialize the replica, so
// this is not a test failure. Not sure how to work around this
// problem.
// Verify the raft group isn't initialized yet.
if repl.IsRaftGroupInitialized() {
log.Errorf(context.TODO(), "expected raft group to be uninitialized")
t.Fatal("expected raft group to be uninitialized")
}

// Send an increment and verify that initializes the Raft group.
170 changes: 167 additions & 3 deletions pkg/storage/client_split_test.go
@@ -34,6 +34,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/internal/client"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server"
"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/storage/engine"
@@ -789,7 +790,7 @@ func TestStoreRangeSplitStatsWithMerges(t *testing.T) {
// fillRange writes keys with the given prefix and associated values
// until bytes bytes have been written or the given range has split.
func fillRange(
store *storage.Store, rangeID roachpb.RangeID, prefix roachpb.Key, bytes int64, t *testing.T,
t *testing.T, store *storage.Store, rangeID roachpb.RangeID, prefix roachpb.Key, bytes int64,
) {
src := rand.New(rand.NewSource(0))
for {
@@ -802,7 +803,7 @@ func fillRange(
return
}
key := append(append([]byte(nil), prefix...), randutil.RandBytes(src, 100)...)
key = keys.MakeFamilyKey(key, 0)
key = keys.MakeFamilyKey(key, src.Uint32())
val := randutil.RandBytes(src, int(src.Int31n(1<<8)))
pArgs := putArgs(key, val)
_, pErr := client.SendWrappedWith(context.Background(), store, roachpb.Header{
@@ -861,7 +862,7 @@ func TestStoreZoneUpdateAndRangeSplit(t *testing.T) {
}

// Look in the range after prefix we're writing to.
fillRange(store, repl.RangeID, tableBoundary, maxBytes, t)
fillRange(t, store, repl.RangeID, tableBoundary, maxBytes)
}

// Verify that the range is in fact split.
@@ -912,6 +913,169 @@ func TestStoreRangeSplitWithMaxBytesUpdate(t *testing.T) {
})
}

// TestStoreRangeSplitAfterLargeSnapshot tests a scenario where a range has
// grown too large to send a snapshot to a follower, but is unable to split
// because it cannot achieve quorum. The leader of the range should adapt to
// this, eventually permitting the large snapshot so that the range can recover
// and then split successfully.
func TestStoreRangeSplitAfterLargeSnapshot(t *testing.T) {
defer leaktest.AfterTest(t)()

// Set maxBytes to something small so we can exceed the maximum snapshot
// size without adding 2x64MB of data.
const maxBytes = 1 << 16
defer config.TestingSetDefaultZoneConfig(config.ZoneConfig{
RangeMaxBytes: maxBytes,
})()

// Create a three node cluster.
sc := storage.TestStoreConfig(nil)
sc.RaftElectionTimeoutTicks = 1000000
mtc := &multiTestContext{storeConfig: &sc}
defer mtc.Stop()
mtc.Start(t, 3)
store0 := mtc.stores[0]
forAllLiveStores := func(f func(*storage.Store)) {
for _, store := range mtc.stores {
if store != nil {
f(store)
}
}
}

// The behindNode falls behind far enough to require a snapshot.
const behindNode = 1
// The crashingNode crashes after its single range becomes too large to
// snapshot.
const crashingNode = 2

// Wait for initial splits.
t.Log("waiting for initial splits")
forAllLiveStores(func(store *storage.Store) {
store.SetRaftSnapshotQueueActive(true)
store.SetSplitQueueActive(true)
store.ForceSplitScanAndProcess()
})
if err := server.WaitForInitialSplits(store0.DB()); err != nil {
t.Fatal(err)
}

// Then do a write; we'll use this to determine when the dust has settled.
t.Log("performing first write")
keyPrefix := append(keys.UserTableDataMin, []byte("key")...)
repl := store0.LookupReplica(roachpb.RKey(keyPrefix), nil)
rangeID := repl.RangeID
header := roachpb.Header{RangeID: rangeID}
incArgs := incrementArgs(keyPrefix, 1)
if _, pErr := client.SendWrappedWith(context.Background(), store0, header, incArgs); pErr != nil {
t.Fatal(pErr)
}

// Replicate the range we'll play with to the other nodes.
t.Log("replicating range")
mtc.replicateRange(rangeID, behindNode, crashingNode)
mtc.waitForValues(keyPrefix, []int64{1, 1, 1})

// Fill the range without allowing splits so that it will try to split once
// the splitQueue is re-enabled. Fill it past the snapshot size limit
// enforced in Replica.GetSnapshot. We do this before stopping behindNode so
// that the quotaPool does not throttle progress.
t.Log("filling range")
forAllLiveStores(func(store *storage.Store) {
store.SetSplitQueueActive(false)
})
fillRange(t, store0, rangeID, keyPrefix, 2*maxBytes+1)

// Turn off replica scanner and snapshot queue. We'll control queues
// directly from now on.
forAllLiveStores(func(store *storage.Store) {
store.SetReplicaScannerActive(false)
store.SetRaftSnapshotQueueActive(false)
})

// Stop behindNode so it falls behind and will require a snapshot.
t.Log("letting one follower fall behind")
mtc.stopStore(behindNode)

// Let behindNode fall behind.
if _, pErr := client.SendWrappedWith(context.Background(), store0, header, incArgs); pErr != nil {
t.Fatal(pErr)
}
mtc.waitForValues(keyPrefix, []int64{2, 1, 2})

// Truncate the replica's log. This ensures that the only way behindNode can
// recover is through a snapshot.
index, err := repl.GetLastIndex()
if err != nil {
t.Fatal(err)
}
truncArgs := truncateLogArgs(index+1, rangeID)
truncArgs.Key = repl.Desc().StartKey.AsRawKey()
if _, pErr := client.SendWrappedWith(context.Background(), store0, header, truncArgs); pErr != nil {
t.Fatal(pErr)
}

// The range can still make forward progress.
if _, pErr := client.SendWrappedWith(context.Background(), store0, header, incArgs); pErr != nil {
t.Fatal(pErr)
}
mtc.waitForValues(keyPrefix, []int64{3, 1, 3})

// Determine the range count.
prevRangeCount := store0.ReplicaCount()

// Stop crashingNode so that we lose quorum and can no longer split.
// Bring behindNode back up.
t.Log("killing the other follower")
mtc.stopStore(crashingNode)
mtc.restartStore(behindNode)

// Reactivate the split queues and reduce their process timeout so that split
// attempts time out faster when quorum cannot be reached. Force a split, which
// should fail because it cannot achieve quorum. This in turn should set the
// permitLargeSnapshot flag.
t.Log("attempting a split without quorum; this should fail")
forAllLiveStores(func(store *storage.Store) {
store.SetSplitQueueProcessTimeout(1 * time.Second)
store.SetSplitQueueActive(true)
store.ForceSplitScanAndProcess()
})
testutils.SucceedsSoon(t, func() error {
if !repl.PermittingLargeSnapshots() {
return errors.Errorf("replica not permitting large snapshots")
}
return nil
})

// Now that the permitLargeSnapshot flag is set, we should see
// the range recover after behindNode is sent a snapshot.
t.Log("waiting for large snapshot to succeed")
forAllLiveStores(func(store *storage.Store) {
store.SetRaftSnapshotQueueActive(true)
store.ForceRaftSnapshotQueueProcess()
})
mtc.waitForValues(keyPrefix, []int64{3, 3, 3})

// Once the range has a majority of up-to-date nodes, it should be
// able to split. We first increment the manual clock to make sure
// any dangling intents left by previous splits expire.
t.Log("waiting for split to succeed")
mtc.manualClock.Increment(2*base.DefaultHeartbeatInterval.Nanoseconds() + 1)
forAllLiveStores(func(store *storage.Store) {
store.ForceSplitScanAndProcess()
})
testutils.SucceedsSoon(t, func() error {
if store0.ReplicaCount() < prevRangeCount+1 {
return errors.Errorf("expected new range created by split")
}
return nil
})

// Per the contract on multiTestContext.stopStore, we need to restart the
// stopped store before calling multiTestContext.Stop.
mtc.restartStore(crashingNode)
}

// TestStoreRangeSystemSplits verifies that splits are based on the contents of
// the SystemConfig span.
func TestStoreRangeSystemSplits(t *testing.T) {
6 changes: 6 additions & 0 deletions pkg/storage/client_test.go
@@ -874,6 +874,11 @@ func (m *multiTestContext) stopStore(i int) {

m.mu.Lock()
m.stoppers[i] = nil
// Break the transport breaker for this node so that messages sent between a
// store stopping and that store restarting will never remain in-flight in
// the transport and end up reaching the store. This has been the cause of
// flakiness in the past.
m.transport.GetCircuitBreaker(m.idents[i].NodeID).Break()
m.senders[i].RemoveStore(m.stores[i])
m.stores[i] = nil
m.mu.Unlock()
@@ -905,6 +910,7 @@ func (m *multiTestContext) restartStore(i int) {
m.t.Fatal(err)
}
m.senders[i].AddStore(store)
m.transport.GetCircuitBreaker(m.idents[i].NodeID).Reset()
m.mu.Unlock()
cfg.NodeLiveness.StartHeartbeat(ctx, stopper, func(ctx context.Context) {
now := m.clock.Now()
14 changes: 14 additions & 0 deletions pkg/storage/helpers_test.go
@@ -176,6 +176,12 @@ func (s *Store) SetReplicaScannerActive(active bool) {
s.setScannerActive(active)
}

// SetSplitQueueProcessTimeout sets the timeout for processing a replica in the
// split queue.
func (s *Store) SetSplitQueueProcessTimeout(dur time.Duration) {
s.splitQueue.SetProcessTimeout(dur)
}

// GetOrCreateReplica passes through to its lowercase sibling.
func (s *Store) GetOrCreateReplica(
ctx context.Context,
@@ -347,6 +353,14 @@ func (r *Replica) GetTimestampCacheLowWater() hlc.Timestamp {
return t
}

// PermittingLargeSnapshots returns whether the replica is permitting large
// snapshots.
func (r *Replica) PermittingLargeSnapshots() bool {
r.mu.RLock()
defer r.mu.RUnlock()
return r.mu.permitLargeSnapshots
}

// GetRaftLogSize returns the raft log size.
func (r *Replica) GetRaftLogSize() int64 {
r.mu.Lock()
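
The PermittingLargeSnapshots accessor added above exposes the new r.mu.permitLargeSnapshots field to tests. The replica-side code that sets and consults the flag is not part of this excerpt; the following is only a rough sketch of the behavior the tests expect, and every name except permitLargeSnapshots is an assumption rather than this PR's actual code.

// Sketch only (not part of this diff): a snapshot size check that honors the
// permitLargeSnapshots flag. snapshotTooLarge and its parameters are
// illustrative names.
func (r *Replica) snapshotTooLarge(snapSize, rangeMaxBytes int64) bool {
	r.mu.RLock()
	defer r.mu.RUnlock()
	if r.mu.permitLargeSnapshots {
		// A split that could not achieve quorum set this flag; allow the
		// oversized snapshot so the follower can catch up and the range can
		// subsequently split.
		return false
	}
	// The test fills the range past 2*maxBytes before expecting the limit to
	// be hit, suggesting a threshold of roughly twice the zone's RangeMaxBytes.
	return snapSize > 2*rangeMaxBytes
}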
16 changes: 10 additions & 6 deletions pkg/storage/id_alloc.go
@@ -81,15 +81,19 @@ func newIDAllocator(
}

// Allocate allocates a new ID from the global KV DB.
func (ia *idAllocator) Allocate() (uint32, error) {
func (ia *idAllocator) Allocate(ctx context.Context) (uint32, error) {
ia.once.Do(ia.start)

id := <-ia.ids
// when the channel is closed, the zero value is returned.
if id == 0 {
return id, errors.Errorf("could not allocate ID; system is draining")
select {
case id := <-ia.ids:
// when the channel is closed, the zero value is returned.
if id == 0 {
return id, errors.Errorf("could not allocate ID; system is draining")
}
return id, nil
case <-ctx.Done():
return 0, ctx.Err()
}
return id, nil
}

func (ia *idAllocator) start() {
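
With the context parameter, Allocate can give up instead of blocking forever on the ID channel. A minimal in-package usage sketch, assuming the standard context and time packages are imported; the helper itself is hypothetical and not part of this PR.

// allocateWithTimeout is a hypothetical helper showing the new API: it
// returns ctx.Err() if no ID becomes available within the given duration,
// e.g. because the underlying KV increment is unavailable.
func allocateWithTimeout(ia *idAllocator, timeout time.Duration) (uint32, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	return ia.Allocate(ctx)
}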
24 changes: 17 additions & 7 deletions pkg/storage/id_alloc_test.go
@@ -56,7 +56,7 @@ func TestIDAllocator(t *testing.T) {
for i := 0; i < maxI; i++ {
go func() {
for j := 0; j < maxJ; j++ {
id, err := idAlloc.Allocate()
id, err := idAlloc.Allocate(context.Background())
errChan <- err
allocd <- id
}
@@ -111,7 +111,7 @@ func TestIDAllocatorNegativeValue(t *testing.T) {
if err != nil {
t.Errorf("failed to create IDAllocator: %v", err)
}
value, err := idAlloc.Allocate()
value, err := idAlloc.Allocate(context.Background())
if err != nil {
t.Fatal(err)
}
@@ -158,7 +158,7 @@ func TestAllocateErrorAndRecovery(t *testing.T) {
t.Errorf("failed to create IDAllocator: %v", err)
}

firstID, err := idAlloc.Allocate()
firstID, err := idAlloc.Allocate(context.Background())
if err != nil {
t.Fatal(err)
}
@@ -172,7 +172,7 @@
// Should be able to get the allocated IDs, and there will be one
// background allocateBlock to get ID continuously.
for i := 0; i < 8; i++ {
id, err := idAlloc.Allocate()
id, err := idAlloc.Allocate(context.Background())
if err != nil {
t.Fatal(err)
}
@@ -194,7 +194,7 @@ func TestAllocateErrorAndRecovery(t *testing.T) {
errChan <- nil
}

id, err := idAlloc.Allocate()
id, err := idAlloc.Allocate(context.Background())
errChan <- err
allocd <- id
}()
@@ -207,6 +207,16 @@ func TestAllocateErrorAndRecovery(t *testing.T) {
}
}

// Attempt a few allocations with a context timeout while allocations are
// blocked. All attempts should hit a context deadline exceeded error.
ctx, _ := context.WithTimeout(context.Background(), 10*time.Millisecond)
for i := 0; i < routines; i++ {
id, err := idAlloc.Allocate(ctx)
if id != 0 || err != context.DeadlineExceeded {
t.Errorf("expected context cancellation, found id=%d, err=%v", id, err)
}
}

// Make the IDAllocator valid again.
idAlloc.idKey.Store(keys.RangeIDGenerator)
// Check if the blocked allocations return expected ID.
@@ -226,7 +236,7 @@ func TestAllocateErrorAndRecovery(t *testing.T) {

// Check if the following allocations return expected ID.
for i := 0; i < routines; i++ {
id, err := idAlloc.Allocate()
id, err := idAlloc.Allocate(context.Background())
if err != nil {
t.Fatal(err)
}
@@ -254,7 +264,7 @@ func TestAllocateWithStopper(t *testing.T) {
return idAlloc
}()

if _, err := idAlloc.Allocate(); !testutils.IsError(err, "system is draining") {
if _, err := idAlloc.Allocate(context.Background()); !testutils.IsError(err, "system is draining") {
t.Errorf("unexpected error: %v", err)
}
}
7 changes: 7 additions & 0 deletions pkg/storage/queue.go
@@ -302,6 +302,13 @@ func (bq *baseQueue) Disabled() bool {
return bq.mu.disabled
}

// SetProcessTimeout sets the timeout for processing a replica.
func (bq *baseQueue) SetProcessTimeout(dur time.Duration) {
bq.processMu.Lock()
bq.processTimeout = dur
bq.processMu.Unlock()
}

// Start launches a goroutine to process entries in the queue. The
// provided stopper is used to finish processing.
func (bq *baseQueue) Start(clock *hlc.Clock, stopper *stop.Stopper) {
Expand Down
Loading