storage: delay split that would result in more snapshots
When a Range has followers that aren't replicating properly, splitting
that range results in a right-hand side with followers in a similar
state. Certain workloads (restore/import/presplit) can run large numbers
of splits against a given range, and this can result in a large number
of Raft snapshots that back up the Raft snapshot queue.

Ideally we'd never have any ranges that require a snapshot, but over
the last few weeks it has become clear that this is very difficult to
achieve since the knowledge required to decide whether a snapshot
can efficiently be prevented is distributed across multiple nodes
that don't share the necessary information.

This is a bit of a nuclear option to address what is likely the last
big culprit behind the large numbers of Raft snapshots in #31409.

With this change, we should expect to see Raft snapshots regularly when
a split/scatter phase of an import/restore is active, but never in
large volumes at once.

Release note: None
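
In rough terms, the change amounts to a bounded wait in front of the split:
check follower progress once per Raft tick and proceed as soon as every
follower is replicating normally, or once a maximum number of ticks has
elapsed. Below is a minimal standalone sketch of that idea; splitDelayConfig,
delaySplit, and followerCaughtUp are hypothetical names for illustration
only, while the actual implementation is (*Replica).maybeDelaySplitToAvoidSnapshot
in the diff that follows.

package main

import (
	"context"
	"fmt"
	"time"
)

// splitDelayConfig bounds how long a split may be held up.
type splitDelayConfig struct {
	tickInterval time.Duration // pause between progress checks
	maxTicks     int           // upper bound on the number of checks
}

// delaySplit polls followerCaughtUp until it reports true or until maxTicks
// checks have elapsed. It returns a note describing the delay, mirroring the
// "extra" string appended to the split's log message in the real change.
func delaySplit(ctx context.Context, cfg splitDelayConfig, followerCaughtUp func() bool) string {
	start := time.Now()
	for i := 0; i < cfg.maxTicks; i++ {
		if followerCaughtUp() {
			break
		}
		select {
		case <-time.After(cfg.tickInterval):
		case <-ctx.Done():
			return ""
		}
	}
	if elapsed := time.Since(start); elapsed >= cfg.tickInterval {
		return fmt.Sprintf("; delayed split for %s to avoid Raft snapshot", elapsed)
	}
	return ""
}

func main() {
	// Pretend the slow follower catches up roughly 300ms from now.
	caughtUpAt := time.Now().Add(300 * time.Millisecond)
	cfg := splitDelayConfig{tickInterval: 100 * time.Millisecond, maxTicks: 10}
	extra := delaySplit(context.Background(), cfg, func() bool {
		return time.Now().After(caughtUpAt)
	})
	fmt.Println("initiating a split of this range" + extra)
}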
tbg committed Nov 26, 2018
1 parent 5f865f3 commit 21da18d
Showing 1 changed file with 69 additions and 2 deletions.
71 changes: 69 additions & 2 deletions pkg/storage/replica_command.go
@@ -39,6 +39,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/pkg/errors"
"go.etcd.io/etcd/raft"
"go.etcd.io/etcd/raft/raftpb"
@@ -222,6 +223,71 @@ func splitSnapshotWarningStr(rangeID roachpb.RangeID, status *raft.Status) string {
return s
}

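// maybeDelaySplitToAvoidSnapshot waits for up to a bounded number of Raft
// ticks (5 + RaftPostSplitSuppressSnapshotTicks) while any recently active
// follower of this range is not replicating normally (not in
// ProgressStateReplicate), since splitting in that state tends to leave the
// right-hand side in need of a Raft snapshot. It returns a string (possibly
// empty) to be appended to the split's log message.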
func (r *Replica) maybeDelaySplitToAvoidSnapshot(ctx context.Context) string {
maxDelaySplitToAvoidSnapshotTicks := 5 + r.store.cfg.RaftPostSplitSuppressSnapshotTicks

var extra string

tPreWait := timeutil.Now()
tPostWait := tPreWait
for ticks := 0; ticks < maxDelaySplitToAvoidSnapshotTicks; ticks++ {
if ticks > 0 {
tPostWait = time.Time{}
}

r.mu.RLock()
raftStatus := r.raftStatusRLocked()
if raftStatus != nil {
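// Refresh each follower's RecentActive flag from the replica's activity
// bookkeeping so that the progress check below sees up-to-date liveness.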
updateRaftProgressFromActivity(
ctx, raftStatus.Progress, r.descRLocked().Replicas, r.mu.lastUpdateTimes, timeutil.Now(),
)
}
r.mu.RUnlock()

if raftStatus == nil {
// Don't delay followers artificially. This case is hit rarely
// enough to not matter.
break
}

done := true
for replicaID, pr := range raftStatus.Progress {
if replicaID == raftStatus.Lead {
// TODO(tschottdorf): remove this once we have picked up
// https://github.com/etcd-io/etcd/pull/10279
continue
}

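// Don't hold up the split on followers that haven't been active
// recently; they may simply be down.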
if !pr.RecentActive {
continue
}

if pr.State != raft.ProgressStateReplicate {
if ticks == 0 {
extra += fmt.Sprintf("delaying split; replica r%d/%d not caught up: %+v", r.RangeID, replicaID, pr)
}
done = false
}
}
if done {
break
}
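// Wait out one Raft tick before re-checking follower progress. If the
// caller's context is canceled in the meantime, give up without
// reporting a delay.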
select {
case <-time.After(r.store.cfg.RaftTickInterval):
case <-ctx.Done():
return ""
}
}
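// tPostWait was zeroed whenever we actually waited out a tick; stamp it
// now so that any delay can be reported below.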
if tPostWait == (time.Time{}) {
tPostWait = timeutil.Now()
}

if elapsed := tPostWait.Sub(tPreWait); elapsed != 0 {
extra += fmt.Sprintf("; delayed split for %s to avoid Raft snapshot", elapsed)
}
return extra
}

// adminSplitWithDescriptor divides the range into two ranges, using
// either args.SplitKey (if provided) or an internally computed key that aims
// to roughly equipartition the range by size. The split is done inside of a
@@ -320,10 +386,11 @@ func (r *Replica) adminSplitWithDescriptor(
}
leftDesc.EndKey = splitKey

extra := splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
extra := r.maybeDelaySplitToAvoidSnapshot(ctx)
extra += splitSnapshotWarningStr(r.RangeID, r.RaftStatus())

log.Infof(ctx, "initiating a split of this range at key %s [r%d]%s",
splitKey, rightDesc.RangeID, extra)

if err := r.store.DB().Txn(ctx, func(ctx context.Context, txn *client.Txn) error {
log.Event(ctx, "split closure begins")
