diff --git a/pkg/kv/kvserver/batcheval/cmd_lease.go b/pkg/kv/kvserver/batcheval/cmd_lease.go index 13b52d8cf4ca..d03ceabce4c4 100644 --- a/pkg/kv/kvserver/batcheval/cmd_lease.go +++ b/pkg/kv/kvserver/batcheval/cmd_lease.go @@ -33,14 +33,24 @@ func newFailedLeaseTrigger(isTransfer bool) result.Result { return trigger } -func checkCanReceiveLease(newLease *roachpb.Lease, rec EvalContext) error { - repDesc, ok := rec.Desc().GetReplicaDescriptorByID(newLease.Replica.ReplicaID) +// CheckCanReceiveLease checks whether `wouldbeLeaseholder` can receive a lease. +// Returns an error if the respective replica is not eligible. +// +// An error is also returned is the replica is not part of `rngDesc`. +// +// For now, don't allow replicas of type LEARNER to be leaseholders. There's +// no reason this wouldn't work in principle, but it seems inadvisable. In +// particular, learners can't become raft leaders, so we wouldn't be able to +// co-locate the leaseholder + raft leader, which is going to affect tail +// latencies. Additionally, as of the time of writing, learner replicas are +// only used for a short time in replica addition, so it's not worth working +// out the edge cases. +func CheckCanReceiveLease( + wouldbeLeaseholder roachpb.ReplicaDescriptor, rngDesc *roachpb.RangeDescriptor, +) error { + repDesc, ok := rngDesc.GetReplicaDescriptorByID(wouldbeLeaseholder.ReplicaID) if !ok { - if newLease.Replica.StoreID == rec.StoreID() { - return errors.AssertionFailedf( - `could not find replica for store %s in %s`, rec.StoreID(), rec.Desc()) - } - return errors.Errorf(`replica %s not found in %s`, newLease.Replica, rec.Desc()) + return errors.Errorf(`replica %s not found in %s`, wouldbeLeaseholder, rngDesc) } else if t := repDesc.GetType(); t != roachpb.VOTER_FULL { // NB: there's no harm in transferring the lease to a VOTER_INCOMING, // but we disallow it anyway. On the other hand, transferring to diff --git a/pkg/kv/kvserver/batcheval/cmd_lease_request.go b/pkg/kv/kvserver/batcheval/cmd_lease_request.go index 0db5c632b0cb..c41c09802642 100644 --- a/pkg/kv/kvserver/batcheval/cmd_lease_request.go +++ b/pkg/kv/kvserver/batcheval/cmd_lease_request.go @@ -56,18 +56,9 @@ func RequestLease( Requested: args.Lease, } - // For now, don't allow replicas of type LEARNER to be leaseholders. There's - // no reason this wouldn't work in principle, but it seems inadvisable. In - // particular, learners can't become raft leaders, so we wouldn't be able to - // co-locate the leaseholder + raft leader, which is going to affect tail - // latencies. Additionally, as of the time of writing, learner replicas are - // only used for a short time in replica addition, so it's not worth working - // out the edge cases. If we decide to start using long-lived learners at some - // point, that math may change. - // // If this check is removed at some point, the filtering of learners on the // sending side would have to be removed as well. - if err := checkCanReceiveLease(&args.Lease, cArgs.EvalCtx); err != nil { + if err := CheckCanReceiveLease(args.Lease.Replica, cArgs.EvalCtx.Desc()); err != nil { rErr.Message = err.Error() return newFailedLeaseTrigger(false /* isTransfer */), rErr } diff --git a/pkg/kv/kvserver/batcheval/cmd_lease_test.go b/pkg/kv/kvserver/batcheval/cmd_lease_test.go index ec707f9d4f8e..e90af83e7ddd 100644 --- a/pkg/kv/kvserver/batcheval/cmd_lease_test.go +++ b/pkg/kv/kvserver/batcheval/cmd_lease_test.go @@ -156,3 +156,38 @@ func TestLeaseCommandLearnerReplica(t *testing.T) { `replica (n2,s2):2LEARNER of type LEARNER cannot hold lease` require.EqualError(t, err, expForLearner) } + +func TestCheckCanReceiveLease(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + for _, tc := range []struct { + leaseholderType roachpb.ReplicaType + eligible bool + }{ + {leaseholderType: roachpb.VOTER_FULL, eligible: true}, + {leaseholderType: roachpb.VOTER_INCOMING, eligible: false}, + {leaseholderType: roachpb.VOTER_OUTGOING, eligible: false}, + {leaseholderType: roachpb.VOTER_DEMOTING, eligible: false}, + {leaseholderType: roachpb.LEARNER, eligible: false}, + {leaseholderType: roachpb.NON_VOTER, eligible: false}, + } { + t.Run(tc.leaseholderType.String(), func(t *testing.T) { + repDesc := roachpb.ReplicaDescriptor{ + ReplicaID: 1, + Type: &tc.leaseholderType, + } + rngDesc := roachpb.RangeDescriptor{ + InternalReplicas: []roachpb.ReplicaDescriptor{repDesc}, + } + err := CheckCanReceiveLease(rngDesc.InternalReplicas[0], &rngDesc) + require.Equal(t, tc.eligible, err == nil, "err: %v", err) + }) + } + + t.Run("replica not in range desc", func(t *testing.T) { + repDesc := roachpb.ReplicaDescriptor{ReplicaID: 1} + rngDesc := roachpb.RangeDescriptor{} + require.Regexp(t, "replica.*not found", CheckCanReceiveLease(repDesc, &rngDesc)) + }) +} diff --git a/pkg/kv/kvserver/batcheval/cmd_lease_transfer.go b/pkg/kv/kvserver/batcheval/cmd_lease_transfer.go index 5f45599af60c..33f9c6094c61 100644 --- a/pkg/kv/kvserver/batcheval/cmd_lease_transfer.go +++ b/pkg/kv/kvserver/batcheval/cmd_lease_transfer.go @@ -49,18 +49,9 @@ func TransferLease( // a newFailedLeaseTrigger() to satisfy stats. args := cArgs.Args.(*roachpb.TransferLeaseRequest) - // For now, don't allow replicas of type LEARNER to be leaseholders. There's - // no reason this wouldn't work in principle, but it seems inadvisable. In - // particular, learners can't become raft leaders, so we wouldn't be able to - // co-locate the leaseholder + raft leader, which is going to affect tail - // latencies. Additionally, as of the time of writing, learner replicas are - // only used for a short time in replica addition, so it's not worth working - // out the edge cases. If we decide to start using long-lived learners at some - // point, that math may change. - // // If this check is removed at some point, the filtering of learners on the // sending side would have to be removed as well. - if err := checkCanReceiveLease(&args.Lease, cArgs.EvalCtx); err != nil { + if err := CheckCanReceiveLease(args.Lease.Replica, cArgs.EvalCtx.Desc()); err != nil { return newFailedLeaseTrigger(true /* isTransfer */), err } diff --git a/pkg/kv/kvserver/replica_proposal_buf.go b/pkg/kv/kvserver/replica_proposal_buf.go index 041163e272d3..c7488f385a66 100644 --- a/pkg/kv/kvserver/replica_proposal_buf.go +++ b/pkg/kv/kvserver/replica_proposal_buf.go @@ -15,6 +15,7 @@ import ( "sync" "sync/atomic" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/protoutil" @@ -154,6 +155,21 @@ type propBuf struct { } } +type rangeLeaderInfo struct { + // leaderKnown is set if the local Raft machinery knows who the leader is. If + // not set, all other fields are empty. + leaderKnown bool + + // leader represents the Raft group's leader. Not set if leaderKnown is not + // set. + leader roachpb.ReplicaID + // iAmTheLeader is set if the local replica is the leader. + iAmTheLeader bool + // leaderEligibleForLease is set if the leader is known and its type of + // replica allows it to acquire a lease. + leaderEligibleForLease bool +} + // A proposer is an object that uses a propBuf to coordinate Raft proposals. type proposer interface { locker() sync.Locker @@ -166,6 +182,7 @@ type proposer interface { // The following require the proposer to hold an exclusive lock. withGroupLocked(func(proposerRaft) error) error registerProposalLocked(*ProposalData) + leaderStatusRLocked(raftGroup proposerRaft) rangeLeaderInfo // rejectProposalWithRedirectLocked rejects a proposal and redirects the // proposer to try it on another node. This is used to sometimes reject lease // acquisitions when another replica is the leader; the intended consequence @@ -430,19 +447,14 @@ func (b *propBuf) FlushLockedWithRaftGroup( // Figure out leadership info. We'll use it to conditionally drop some // requests. - var leaderKnown, iAmTheLeader bool - var leader roachpb.ReplicaID + var leaderInfo rangeLeaderInfo if raftGroup != nil { - status := raftGroup.BasicStatus() - iAmTheLeader = status.RaftState == raft.StateLeader - leaderKnown = status.Lead != raft.None - if leaderKnown { - leader = roachpb.ReplicaID(status.Lead) - if !iAmTheLeader && leader == b.p.replicaID() { - log.Fatalf(ctx, - "inconsistent Raft state: state %s while the current replica is also the lead: %d", - status.RaftState, leader) - } + leaderInfo = b.p.leaderStatusRLocked(raftGroup) + // Sanity check. + if leaderInfo.leaderKnown && leaderInfo.leader == b.p.replicaID() && !leaderInfo.iAmTheLeader { + log.Fatalf(ctx, + "inconsistent Raft state: state %s while the current replica is also the lead: %d", + raftGroup.BasicStatus().RaftState, leaderInfo.leader) } } @@ -484,14 +496,24 @@ func (b *propBuf) FlushLockedWithRaftGroup( // ErrProposalDropped. We'll eventually re-propose it once a leader is // known, at which point it will either go through or be rejected based on // whether or not it is this replica that became the leader. - if !iAmTheLeader && p.Request.IsLeaseRequest() { - if leaderKnown && !b.testing.allowLeaseProposalWhenNotLeader { + // + // A special case is when the leader is known, but is ineligible to get the + // lease. In that case, we have no choice but to continue with the proposal. + if !leaderInfo.iAmTheLeader && p.Request.IsLeaseRequest() { + leaderKnownAndEligible := leaderInfo.leaderKnown && leaderInfo.leaderEligibleForLease + if leaderKnownAndEligible && !b.testing.allowLeaseProposalWhenNotLeader { log.VEventf(ctx, 2, "not proposing lease acquisition because we're not the leader; replica %d is", - leader) - b.p.rejectProposalWithRedirectLocked(ctx, p, leader) + leaderInfo.leader) + b.p.rejectProposalWithRedirectLocked(ctx, p, leaderInfo.leader) continue } - // If the leader is not known, continue with the proposal as explained above. + // If the leader is not known, or if it is known but it's ineligible for + // the lease, continue with the proposal as explained above. + if !leaderInfo.leaderKnown { + log.VEventf(ctx, 2, "proposing lease acquisition even though we're not the leader; the leader is unknown") + } else { + log.VEventf(ctx, 2, "proposing lease acquisition even though we're not the leader; the leader is ineligible") + } } // Raft processing bookkeeping. @@ -721,6 +743,38 @@ func (rp *replicaProposer) registerProposalLocked(p *ProposalData) { rp.mu.proposals[p.idKey] = p } +func (rp *replicaProposer) leaderStatusRLocked(raftGroup proposerRaft) rangeLeaderInfo { + r := (*Replica)(rp) + + status := raftGroup.BasicStatus() + iAmTheLeader := status.RaftState == raft.StateLeader + leader := status.Lead + leaderKnown := leader != raft.None + var leaderEligibleForLease bool + rangeDesc := r.descRLocked() + if leaderKnown { + // Figure out if the leader is eligible for getting a lease. + leaderRep, ok := rangeDesc.GetReplicaDescriptorByID(roachpb.ReplicaID(leader)) + if !ok { + // There is a leader, but it's not part of our descriptor. The descriptor + // must be stale, so we are behind in applying the log. We don't want the + // lease ourselves (as we're behind), so let's assume that the leader is + // eligible. If it proves that it isn't, we might be asked to get the + // lease again, and by then hopefully we will have caught up. + leaderEligibleForLease = true + } else { + err := batcheval.CheckCanReceiveLease(leaderRep, rangeDesc) + leaderEligibleForLease = err == nil + } + } + return rangeLeaderInfo{ + leaderKnown: leaderKnown, + leader: roachpb.ReplicaID(leader), + iAmTheLeader: iAmTheLeader, + leaderEligibleForLease: leaderEligibleForLease, + } +} + // rejectProposalWithRedirectLocked is part of the proposer interface. func (rp *replicaProposer) rejectProposalWithRedirectLocked( ctx context.Context, prop *ProposalData, redirectTo roachpb.ReplicaID, @@ -728,11 +782,11 @@ func (rp *replicaProposer) rejectProposalWithRedirectLocked( r := (*Replica)(rp) rangeDesc := r.descRLocked() storeID := r.store.StoreID() - leaderRep, _ /* ok */ := rangeDesc.GetReplicaDescriptorByID(redirectTo) + redirectRep, _ /* ok */ := rangeDesc.GetReplicaDescriptorByID(redirectTo) speculativeLease := &roachpb.Lease{ - Replica: leaderRep, + Replica: redirectRep, } - log.VEventf(ctx, 2, "redirecting proposal to node %s; request: %s", leaderRep.NodeID, prop.Request) + log.VEventf(ctx, 2, "redirecting proposal to node %s; request: %s", redirectRep.NodeID, prop.Request) r.cleanupFailedProposalLocked(prop) prop.finishApplication(ctx, proposalResult{ Err: roachpb.NewError(newNotLeaseHolderError( diff --git a/pkg/kv/kvserver/replica_proposal_buf_test.go b/pkg/kv/kvserver/replica_proposal_buf_test.go index 534a7dcffb00..6447859c000d 100644 --- a/pkg/kv/kvserver/replica_proposal_buf_test.go +++ b/pkg/kv/kvserver/replica_proposal_buf_test.go @@ -17,6 +17,7 @@ import ( "testing" "time" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/util/leaktest" @@ -44,6 +45,15 @@ type testProposer struct { // If not nil, this is called by RejectProposalWithRedirectLocked(). If nil, // RejectProposalWithRedirectLocked() panics. onRejectProposalWithRedirectLocked func(prop *ProposalData, redirectTo roachpb.ReplicaID) + + // leaderReplicaInDescriptor is set if the leader (as indicated by raftGroup) + // is known, and that leader is part of the range's descriptor (as seen by the + // current replica). This can be used to simulate the local replica being so + // far behind that it doesn't have an up to date descriptor. + leaderReplicaInDescriptor bool + // If leaderReplicaInDescriptor is set, this specifies what type of replica it + // is. Some types of replicas are not eligible to get a lease. + leaderReplicaType roachpb.ReplicaType } type testProposerRaft struct { @@ -99,6 +109,39 @@ func (t *testProposer) registerProposalLocked(p *ProposalData) { t.registered++ } +func (t *testProposer) leaderStatusRLocked(raftGroup proposerRaft) rangeLeaderInfo { + leaderKnown := raftGroup.BasicStatus().Lead != raft.None + var leaderRep roachpb.ReplicaID + var iAmTheLeader, leaderEligibleForLease bool + if leaderKnown { + leaderRep = roachpb.ReplicaID(raftGroup.BasicStatus().Lead) + iAmTheLeader = leaderRep == t.replicaID() + repDesc := roachpb.ReplicaDescriptor{ + ReplicaID: leaderRep, + Type: &t.leaderReplicaType, + } + + if t.leaderReplicaInDescriptor { + // Fill in a RangeDescriptor just enough for the CheckCanReceiveLease() + // call. + rngDesc := roachpb.RangeDescriptor{ + InternalReplicas: []roachpb.ReplicaDescriptor{repDesc}, + } + err := batcheval.CheckCanReceiveLease(repDesc, &rngDesc) + leaderEligibleForLease = err == nil + } else { + // This matches replicaProposed.leaderStatusRLocked(). + leaderEligibleForLease = true + } + } + return rangeLeaderInfo{ + leaderKnown: leaderKnown, + leader: leaderRep, + iAmTheLeader: iAmTheLeader, + leaderEligibleForLease: leaderEligibleForLease, + } +} + func (t *testProposer) rejectProposalWithRedirectLocked( ctx context.Context, prop *ProposalData, redirectTo roachpb.ReplicaID, ) { @@ -391,9 +434,16 @@ func TestProposalBufferRejectLeaseAcqOnFollower(t *testing.T) { // Each subtest will try to propose a lease acquisition in a different Raft // scenario. Some proposals should be allowed, some should be rejected. for _, tc := range []struct { - name string - state raft.StateType - leader uint64 + name string + state raft.StateType + // raft.None means there's no leader, or the leader is unknown. + leader uint64 + // Empty means VOTER_FULL. + leaderRepType roachpb.ReplicaType + // Set to simulate situations where the local replica is so behind that the + // leader is not even part of the range descriptor. + leaderNotInRngDesc bool + expRejection bool }{ { @@ -404,7 +454,7 @@ func TestProposalBufferRejectLeaseAcqOnFollower(t *testing.T) { expRejection: false, }, { - name: "follower known leader", + name: "follower, known eligible leader", state: raft.StateFollower, // Someone else is leader. leader: self + 1, @@ -412,7 +462,28 @@ func TestProposalBufferRejectLeaseAcqOnFollower(t *testing.T) { expRejection: true, }, { - name: "follower unknown leader", + name: "follower, known ineligible leader", + state: raft.StateFollower, + // Someone else is leader. + leader: self + 1, + // The leader type makes it ineligible to get the lease. Thus, the local + // proposal will not be rejected. + leaderRepType: roachpb.VOTER_DEMOTING, + expRejection: false, + }, + { + // Here we simulate the leader being known by Raft, but the local replica + // is so far behind that it doesn't contain the leader replica. + name: "follower, known leader not in range descriptor", + state: raft.StateFollower, + // Someone else is leader. + leader: self + 1, + leaderNotInRngDesc: true, + // We assume that the leader is eligible, and redirect. + expRejection: true, + }, + { + name: "follower, unknown leader", state: raft.StateFollower, // Unknown leader. leader: raft.None, @@ -448,8 +519,13 @@ func TestProposalBufferRejectLeaseAcqOnFollower(t *testing.T) { Lead: tc.leader, }, } - r := testProposerRaft{status: raftStatus} + r := testProposerRaft{ + status: raftStatus, + } p.raftGroup = r + p.leaderReplicaInDescriptor = !tc.leaderNotInRngDesc + p.leaderReplicaType = tc.leaderRepType + var b propBuf b.Init(&p)