-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #33015 from tbg/backport2.1-32594
backport-2.1: storage: delay manual splits that would result in more snapshots
- Loading branch information
Showing
6 changed files
with
354 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
// Copyright 2018 The Cockroach Authors. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | ||
// implied. See the License for the specific language governing | ||
// permissions and limitations under the License. | ||
|
||
package storage | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"time" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/roachpb" | ||
"github.com/cockroachdb/cockroach/pkg/util/timeutil" | ||
"go.etcd.io/etcd/raft" | ||
) | ||
|
||
type splitDelayHelperI interface { | ||
RaftStatus(context.Context) (roachpb.RangeID, *raft.Status) | ||
ProposeEmptyCommand(ctx context.Context) | ||
NumAttempts() int | ||
Sleep(context.Context) time.Duration | ||
} | ||
|
||
type splitDelayHelper Replica | ||
|
||
func (sdh *splitDelayHelper) RaftStatus(ctx context.Context) (roachpb.RangeID, *raft.Status) { | ||
r := (*Replica)(sdh) | ||
r.mu.RLock() | ||
raftStatus := r.raftStatusRLocked() | ||
if raftStatus != nil { | ||
updateRaftProgressFromActivity( | ||
ctx, raftStatus.Progress, r.descRLocked().Replicas, r.mu.lastUpdateTimes, timeutil.Now(), | ||
) | ||
} | ||
r.mu.RUnlock() | ||
return r.RangeID, raftStatus | ||
} | ||
|
||
func (sdh *splitDelayHelper) ProposeEmptyCommand(ctx context.Context) { | ||
r := (*Replica)(sdh) | ||
r.raftMu.Lock() | ||
_ = r.withRaftGroup(true /* campaignOnWake */, func(rawNode *raft.RawNode) (bool, error) { | ||
// NB: intentionally ignore the error (which can be ErrProposalDropped | ||
// when there's an SST inflight). | ||
_ = rawNode.Propose(encodeRaftCommandV1(makeIDKey(), nil)) | ||
// NB: we need to unquiesce as the group might be quiesced. | ||
return true /* unquiesceAndWakeLeader */, nil | ||
}) | ||
r.raftMu.Unlock() | ||
} | ||
|
||
func (sdh *splitDelayHelper) NumAttempts() int { | ||
return (*Replica)(sdh).store.cfg.RaftDelaySplitToSuppressSnapshotTicks | ||
} | ||
|
||
func (sdh *splitDelayHelper) Sleep(ctx context.Context) time.Duration { | ||
tBegin := timeutil.Now() | ||
|
||
r := (*Replica)(sdh) | ||
select { | ||
case <-time.After(r.store.cfg.RaftTickInterval): | ||
case <-ctx.Done(): | ||
} | ||
|
||
return timeutil.Since(tBegin) | ||
} | ||
|
||
func maybeDelaySplitToAvoidSnapshot(ctx context.Context, sdh splitDelayHelperI) string { | ||
// We have an "optimization" to avoid Raft snapshots by dropping some | ||
// outgoing MsgAppResp (see the _ assignment below) which takes effect for | ||
// RaftPostSplitSuppressSnapshotTicks ticks after an uninitialized replica | ||
// is created. This check can err, in which case the snapshot will be | ||
// delayed for that many ticks, and so we want to delay by at least as much | ||
// plus a bit of padding to give a snapshot a chance to catch the follower | ||
// up. If we run out of time, we'll resume the split no matter what. | ||
// _ = (*Replica)(nil).maybeDropMsgAppResp // guru assignment (not backported) | ||
maxDelaySplitToAvoidSnapshotTicks := sdh.NumAttempts() | ||
|
||
var slept time.Duration | ||
var extra string | ||
var succeeded bool | ||
for ticks := 0; ticks < maxDelaySplitToAvoidSnapshotTicks; ticks++ { | ||
succeeded = false | ||
extra = "" | ||
rangeID, raftStatus := sdh.RaftStatus(ctx) | ||
|
||
if raftStatus == nil { | ||
// Don't delay on followers (we don't know when to stop). This case | ||
// is hit rarely enough to not matter. | ||
extra += "; not Raft leader" | ||
succeeded = true | ||
break | ||
} | ||
|
||
done := true | ||
for replicaID, pr := range raftStatus.Progress { | ||
if replicaID == raftStatus.Lead { | ||
// TODO(tschottdorf): remove this once we have picked up | ||
// https://github.com/etcd-io/etcd/pull/10279 | ||
continue | ||
} | ||
|
||
if pr.State != raft.ProgressStateReplicate { | ||
if !pr.RecentActive { | ||
if ticks == 0 { | ||
// Having set done = false, we make sure we're not exiting early. | ||
// This is important because we sometimes need that Raft proposal | ||
// below to make the followers active as there's no chatter on an | ||
// idle range. (Note that there's a theoretical race in which the | ||
// follower becomes inactive again during the sleep, but the | ||
// inactivity interval is much larger than a tick). | ||
// | ||
// Don't do this more than once though: if a follower is down, | ||
// we don't want to delay splits for it. | ||
done = false | ||
} | ||
extra += fmt.Sprintf("; r%d/%d inactive", rangeID, replicaID) | ||
continue | ||
} | ||
done = false | ||
extra += fmt.Sprintf("; replica r%d/%d not caught up: %+v", rangeID, replicaID, &pr) | ||
} | ||
} | ||
if done { | ||
succeeded = true | ||
break | ||
} | ||
// Propose an empty command which works around a Raft bug that can | ||
// leave a follower in ProgressStateProbe even though it has caught | ||
// up. | ||
sdh.ProposeEmptyCommand(ctx) | ||
slept += sdh.Sleep(ctx) | ||
|
||
if ctx.Err() != nil { | ||
return "" | ||
} | ||
} | ||
|
||
if slept != 0 { | ||
extra += fmt.Sprintf("; delayed split for %.1fs to avoid Raft snapshot", slept.Seconds()) | ||
if !succeeded { | ||
extra += " (without success)" | ||
} | ||
} | ||
|
||
return extra | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
// Copyright 2018 The Cockroach Authors. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | ||
// implied. See the License for the specific language governing | ||
// permissions and limitations under the License. | ||
|
||
package storage | ||
|
||
import ( | ||
"context" | ||
"testing" | ||
"time" | ||
|
||
"github.com/stretchr/testify/assert" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/roachpb" | ||
"go.etcd.io/etcd/raft" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/util/leaktest" | ||
) | ||
|
||
type testSplitDelayHelper struct { | ||
numAttempts int | ||
|
||
rangeID roachpb.RangeID | ||
raftStatus *raft.Status | ||
sleep func() | ||
|
||
slept, emptyProposed int | ||
} | ||
|
||
func (h *testSplitDelayHelper) RaftStatus(context.Context) (roachpb.RangeID, *raft.Status) { | ||
return h.rangeID, h.raftStatus | ||
} | ||
func (h *testSplitDelayHelper) ProposeEmptyCommand(ctx context.Context) { | ||
h.emptyProposed++ | ||
} | ||
func (h *testSplitDelayHelper) NumAttempts() int { | ||
return h.numAttempts | ||
} | ||
func (h *testSplitDelayHelper) Sleep(context.Context) time.Duration { | ||
if h.sleep != nil { | ||
h.sleep() | ||
} | ||
h.slept++ | ||
return time.Second | ||
} | ||
|
||
var _ splitDelayHelperI = (*testSplitDelayHelper)(nil) | ||
|
||
func TestSplitDelayToAvoidSnapshot(t *testing.T) { | ||
defer leaktest.AfterTest(t)() | ||
|
||
ctx := context.Background() | ||
|
||
t.Run("disabled", func(t *testing.T) { | ||
// Should immediately bail out if told to run zero attempts. | ||
h := &testSplitDelayHelper{ | ||
numAttempts: 0, | ||
rangeID: 1, | ||
raftStatus: nil, | ||
} | ||
s := maybeDelaySplitToAvoidSnapshot(ctx, h) | ||
assert.Equal(t, "", s) | ||
assert.Equal(t, 0, h.slept) | ||
}) | ||
|
||
t.Run("follower", func(t *testing.T) { | ||
// Should immediately bail out if run on non-leader. | ||
h := &testSplitDelayHelper{ | ||
numAttempts: 5, | ||
rangeID: 1, | ||
raftStatus: nil, | ||
} | ||
s := maybeDelaySplitToAvoidSnapshot(ctx, h) | ||
assert.Equal(t, "; not Raft leader", s) | ||
assert.Equal(t, 0, h.slept) | ||
}) | ||
|
||
t.Run("inactive", func(t *testing.T) { | ||
h := &testSplitDelayHelper{ | ||
numAttempts: 5, | ||
rangeID: 1, | ||
raftStatus: &raft.Status{ | ||
Progress: map[uint64]raft.Progress{ | ||
2: {State: raft.ProgressStateProbe}, | ||
}, | ||
}, | ||
} | ||
s := maybeDelaySplitToAvoidSnapshot(ctx, h) | ||
// We try to wake up the follower once, but then give up on it. | ||
assert.Equal(t, "; r1/2 inactive; delayed split for 1.0s to avoid Raft snapshot", s) | ||
assert.Equal(t, 1, h.slept) | ||
assert.Equal(t, 1, h.emptyProposed) | ||
}) | ||
|
||
for _, state := range []raft.ProgressStateType{raft.ProgressStateProbe, raft.ProgressStateSnapshot} { | ||
t.Run(state.String(), func(t *testing.T) { | ||
h := &testSplitDelayHelper{ | ||
numAttempts: 5, | ||
rangeID: 1, | ||
raftStatus: &raft.Status{ | ||
Progress: map[uint64]raft.Progress{ | ||
2: {State: state, RecentActive: true, Paused: true /* unifies string output below */}, | ||
// Healthy follower just for kicks. | ||
3: {State: raft.ProgressStateReplicate}, | ||
}, | ||
}, | ||
} | ||
s := maybeDelaySplitToAvoidSnapshot(ctx, h) | ||
assert.Equal(t, "; replica r1/2 not caught up: next = 0, match = 0, state = "+ | ||
state.String()+ | ||
", waiting = true, pendingSnapshot = 0; delayed split for 5.0s to avoid Raft snapshot (without success)", s) | ||
assert.Equal(t, 5, h.slept) | ||
assert.Equal(t, 5, h.emptyProposed) | ||
}) | ||
} | ||
|
||
t.Run("immediately-replicating", func(t *testing.T) { | ||
h := &testSplitDelayHelper{ | ||
numAttempts: 5, | ||
rangeID: 1, | ||
raftStatus: &raft.Status{ | ||
Progress: map[uint64]raft.Progress{ | ||
2: {State: raft.ProgressStateReplicate}, // intentionally not recently active | ||
}, | ||
}, | ||
} | ||
s := maybeDelaySplitToAvoidSnapshot(ctx, h) | ||
assert.Equal(t, "", s) | ||
assert.Equal(t, 0, h.slept) | ||
assert.Equal(t, 0, h.emptyProposed) | ||
}) | ||
|
||
t.Run("becomes-replicating", func(t *testing.T) { | ||
h := &testSplitDelayHelper{ | ||
numAttempts: 5, | ||
rangeID: 1, | ||
raftStatus: &raft.Status{ | ||
Progress: map[uint64]raft.Progress{ | ||
2: {State: raft.ProgressStateProbe, RecentActive: true}, | ||
}, | ||
}, | ||
} | ||
// The fourth attempt will see the follower catch up. | ||
h.sleep = func() { | ||
if h.slept == 2 { | ||
pr := h.raftStatus.Progress[2] | ||
pr.State = raft.ProgressStateReplicate | ||
h.raftStatus.Progress[2] = pr | ||
} | ||
} | ||
s := maybeDelaySplitToAvoidSnapshot(ctx, h) | ||
assert.Equal(t, "; delayed split for 3.0s to avoid Raft snapshot", s) | ||
assert.Equal(t, 3, h.slept) | ||
assert.Equal(t, 3, h.emptyProposed) | ||
}) | ||
} |
Oops, something went wrong.