Skip to content

Commit

Permalink
Merge #31988
Browse files Browse the repository at this point in the history
31988: storage: fix GC of subsumed replicas r=bdarnell a=benesch

When a range is subsumed, there is a chance that it leaves behind
replicas that are marked as destroyed with reason "merge pending." The
replica GC queue was previously misinterpreting this to mean that the
range had already been GC'd and thus the range would never get GC'd,
resulting in a proliferation of intersecting snapshot errors.

Additionally take the opportunity to teach the replica GC queue to
proactively queue a replica's left neighbor when that left neighbor is
blocking the replica from being GC'd.

Release note: None

Co-authored-by: Nikhil Benesch <nikhil.benesch@gmail.com>
  • Loading branch information
craig[bot] and benesch committed Oct 30, 2018
2 parents 4fe5133 + 3cc9d57 commit 636ed7d
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 5 deletions.
74 changes: 70 additions & 4 deletions pkg/storage/client_merge_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,6 @@ import (
"testing"
"time"

"github.com/gogo/protobuf/proto"
"github.com/pkg/errors"

"github.com/cockroachdb/cockroach/pkg/config"
"github.com/cockroachdb/cockroach/pkg/gossip"
"github.com/cockroachdb/cockroach/pkg/internal/client"
Expand All @@ -54,6 +51,8 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/gogo/protobuf/proto"
"github.com/pkg/errors"
)

func adminMergeArgs(key roachpb.Key) *roachpb.AdminMergeRequest {
Expand Down Expand Up @@ -1920,7 +1919,7 @@ func TestStoreRangeMergeAbandonedFollowers(t *testing.T) {
}
}

// Verify that the abandoned ranges on store2 can only GC'd from left to
// Verify that the abandoned ranges on store2 can only be GC'd from left to
// right.
if err := store2.ManualReplicaGC(repls[2]); err != nil {
t.Fatal(err)
Expand Down Expand Up @@ -1962,6 +1961,73 @@ func TestStoreRangeMergeAbandonedFollowers(t *testing.T) {
}
}

// TestStoreRangeMergeAbandonedFollowersAutomaticallyGarbageCollected verifies
// that the replica GC queue will clean up an abandoned RHS replica whose
// destroyStatus is destroyReasonMergePending. The RHS replica ends up in this
// state when its merge watcher goroutine notices that the merge committed, and
// thus marks it as destroyed with reason destroyReasonMergePending, but the
// corresponding LHS is rebalanced off the store before it can apply the merge
// trigger. The replica GC queue would previously refuse to GC the abandoned
// RHS, as it interpreted destroyReasonMergePending to mean that the RHS replica
// had already been garbage collected.
func TestStoreRangeMergeAbandonedFollowersAutomaticallyGarbageCollected(t *testing.T) {
defer leaktest.AfterTest(t)()

t.Skip("this test occasionally flakes if the RHS quiesces")

ctx := context.Background()
storeCfg := storage.TestStoreConfig(nil)
storeCfg.TestingKnobs.DisableReplicateQueue = true
mtc := &multiTestContext{storeConfig: &storeCfg}
mtc.Start(t, 3)
defer mtc.Stop()
store0, store2 := mtc.Store(0), mtc.Store(2)

mtc.replicateRange(roachpb.RangeID(1), 1, 2)
lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0)
if err != nil {
t.Fatal(err)
}

// Make store2 the leaseholder for the RHS.
mtc.transferLease(ctx, rhsDesc.RangeID, 0, 2)

// Start dropping all Raft traffic to the LHS replica on store2 so that it
// won't be aware that there is a merge in progress.
mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{
rangeID: lhsDesc.RangeID,
RaftMessageHandler: store2,
})

// Perform the merge. The LHS replica on store2 whon't hear about this merge
// and thus won't subsume its RHS replica. The RHS replica's merge watcher
// goroutine will, however, notice the merge and mark the RHS replica as
// destroyed with reason destroyReasonMergePending.
args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
_, pErr := client.SendWrapped(ctx, store0.TestSender(), args)
if pErr != nil {
t.Fatal(pErr)
}

// Remove the merged range from store2. Its replicas of both the LHS and RHS
// are now eligible for GC.
mtc.unreplicateRange(lhsDesc.RangeID, 2)

// Note that we purposely do not call store.ManualReplicaGC here, as that
// calls replicaGCQueue.process directly, bypassing the logic in
// baseQueue.MaybeAdd and baseQueue.Add. We specifically want to test that
// queuing logic, which has been broken in the past.
testutils.SucceedsSoon(t, func() error {
if _, err := store2.GetReplica(lhsDesc.RangeID); err == nil {
return errors.New("lhs not destroyed")
}
if _, err := store2.GetReplica(rhsDesc.RangeID); err == nil {
return errors.New("rhs not destroyed")
}
return nil
})
}

func TestStoreRangeMergeDeadFollowerBeforeTxn(t *testing.T) {
defer leaktest.AfterTest(t)()

Expand Down
2 changes: 1 addition & 1 deletion pkg/storage/queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,7 @@ func (bq *baseQueue) processReplica(queueCtx context.Context, repl *Replica) err
}

if reason, err := repl.IsDestroyed(); err != nil {
if !bq.queueConfig.processDestroyedReplicas || reason != destroyReasonRemovalPending {
if !bq.queueConfig.processDestroyedReplicas || reason == destroyReasonRemoved {
if log.V(3) {
log.Infof(queueCtx, "replica destroyed (%s); skipping", err)
}
Expand Down
5 changes: 5 additions & 0 deletions pkg/storage/replica_gc_queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,11 @@ func (rgcq *replicaGCQueue) process(
if leftReplyDesc := rs[0]; !leftDesc.Equal(leftReplyDesc) {
log.VEventf(ctx, 1, "left neighbor %s not up-to-date with meta descriptor %s; cannot safely GC range yet",
leftDesc, leftReplyDesc)
// Chances are that the left replica needs to be GC'd. Since we don't
// have definitive proof, queue it with a low priority.
if _, err := rgcq.Add(leftRepl, replicaGCPriorityDefault); err != nil {
log.Errorf(ctx, "unable to add %s to replica GC queue: %s", leftRepl, err)
}
return nil
}
}
Expand Down

0 comments on commit 636ed7d

Please sign in to comment.