diff --git a/pkg/kv/kvserver/replica_application_result.go b/pkg/kv/kvserver/replica_application_result.go
index a0ad9beca3f4..d8b97b555244 100644
--- a/pkg/kv/kvserver/replica_application_result.go
+++ b/pkg/kv/kvserver/replica_application_result.go
@@ -115,6 +115,11 @@ func (r *Replica) prepareLocalResult(ctx context.Context, cmd *replicatedCmd) {
 			// that would be too risky to change at the time of writing.
 			//
 			pErr = nil
+			// If we failed to apply at the right lease index, try again with a
+			// new one. This is important for pipelined writes, since they don't
+			// have a client watching to retry, so a failure to eventually apply
+			// the proposal would be a user-visible error.
+			//
 			// If the command associated with this rejected raft entry already applied
 			// then we don't want to repropose it. Doing so could lead to duplicate
 			// application of the same proposal. (We can see hit this case if an application
@@ -131,11 +136,33 @@ func (r *Replica) prepareLocalResult(ctx context.Context, cmd *replicatedCmd) {
 			// multiple lease indexes, so don't repropose it again. This ensures that at
 			// any time, there is only up to a single lease index that has a chance of
 			// succeeding in the Raft log for a given command.
+			//
+			// Taking a magnifying glass to the last paragraph, we see that the
+			// situation is much more subtle. As tryReproposeWithNewLeaseIndex gets
+			// to the stage where it calls `(*Replica).propose` (i.e. it passed the
+			// closed timestamp checks), it *resets* `cmd.proposal.MaxLeaseIndex` to
+			// zero. This means that the proposal immediately supersedes any other
+			// copy presently in the log, including for the remainder of application
+			// of the current log entry (since the current entry's LAI is certainly
+			// not equal to zero). However, the proposal buffer adds another layer
+			// of possibility on top of this. Typically, the buffer does not flush
+			// immediately: this only happens at the beginning of the *next* raft
+			// handling cycle, i.e. the proposal will not re-enter the proposals map
+			// while the current batches of commands (which, recall, may contain an
+			// arbitrary number of copies of the current command, with various LAIs,
+			// all but at most one of which are too small) are applied. *But* the
+			// proposal buffer has a capacity, and if it does fill up in
+			// `(*Replica).propose` it will synchronously flush, meaning that a new
+			// LAI will be assigned to `cmd.proposal.MaxLeaseIndex` AND the command
+			// will have re-entered the proposals map. So even if we remove the
+			// "zeroing" of the MaxLeaseIndex prior to proposing, we still have to
+			// contend with the fact that once `tryReproposeWithNewLeaseIndex`
+			// returns, the command may or may not be back in the map. (At least it
+			// is assigned a new LAI if and only if it is back in the map!)
+			//
+			// These many possible worlds are a major source of complexity, a
+			// reduction of which is postponed.
 			if !cmd.proposal.applied && !cmd.proposal.Supersedes(cmd.Cmd.MaxLeaseIndex) {
-				// If we failed to apply at the right lease index, try again with a
-				// new one. This is important for pipelined writes, since they don't
-				// have a client watching to retry, so a failure to eventually apply
-				// the proposal would be a user-visible error.
 				pErr = tryReproposeWithNewLeaseIndex(ctx, cmd, (*replicaReproposer)(r))
 			}
 
@@ -164,6 +191,21 @@ func (r *Replica) prepareLocalResult(ctx context.Context, cmd *replicatedCmd) {
 				log.Infof(ctx, "failed to repropose %s at idx %d with new lease index: %s",
 					cmd.ID, cmd.Index(), pErr)
 				cmd.response.Err = pErr
+				// This assertion can misfire if we artificially inject invalid LAIs
+				// during proposal; see the explanatory comment above. If, in the
+				// current app batch, we had two identical copies of an entry (which
+				// maps to a local proposal), and the LAI was stale, then both entries
+				// would be local. The first could succeed in reproposing, which, if the
+				// propBuf was full, would immediately re-insert it into the proposals
+				// map. Normally, when we then apply the second entry, it would be
+				// superseded and not hit the assertion. But, if we injected a stale LAI
+				// during this last reproposal, we could "accidentally" assign the same
+				// LAI again. The second entry would then try to repropose again, which
+				// is fine, but it could bump into the closed timestamp, get an error,
+				// enter this branch, and then trip the assertion.
+				//
+				// For proposed simplifications, see:
+				// https://github.com/cockroachdb/cockroach/issues/97633
 				r.mu.RLock()
 				_, inMap := r.mu.proposals[cmd.ID]
 				r.mu.RUnlock()
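
To make the supersession rule that the new comments lean on easier to follow, here is a minimal Go sketch. It is illustrative only and not CockroachDB code: `proposal`, `supersedes`, and `shouldRepropose` are invented stand-ins for the in-memory proposal, `Supersedes`, and the guard in the second hunk, and the inequality check is an assumption based on the comment that a MaxLeaseIndex reset to zero supersedes every copy currently in the log.

package main

import "fmt"

// proposal stands in for the in-memory proposal tracked in the proposals map.
// MaxLeaseIndex is the LAI of the most recent (re)proposal, or zero if it was
// just reset prior to reproposing.
type proposal struct {
	applied       bool
	MaxLeaseIndex uint64
}

// supersedes mimics `cmd.proposal.Supersedes(cmd.Cmd.MaxLeaseIndex)` as the
// comments describe it: the log entry's copy of the command (at entryLAI) is
// superseded if the in-memory proposal has since been assigned a different LAI.
func (p *proposal) supersedes(entryLAI uint64) bool {
	return p.MaxLeaseIndex != entryLAI
}

// shouldRepropose mirrors the guard in the second hunk: only a proposal that
// has not already applied and is not superseded may be retried at a new LAI.
func shouldRepropose(p *proposal, entryLAI uint64) bool {
	return !p.applied && !p.supersedes(entryLAI)
}

func main() {
	p := &proposal{MaxLeaseIndex: 7}

	// The copy at LAI 7 is the authoritative one: a rejection of it for a
	// stale lease index may be retried with a new LAI.
	fmt.Println(shouldRepropose(p, 7)) // true

	// Resetting MaxLeaseIndex to zero (as the comments say happens just before
	// reproposing) supersedes every copy in the log, since no entry has LAI 0.
	p.MaxLeaseIndex = 0
	fmt.Println(shouldRepropose(p, 7)) // false

	// A synchronous proposal buffer flush that assigns a fresh LAI (say 9)
	// moves the authoritative copy: LAI 7 stays superseded, LAI 9 does not.
	p.MaxLeaseIndex = 9
	fmt.Println(shouldRepropose(p, 7), shouldRepropose(p, 9)) // false true
}

The toy example only illustrates the ambiguity the comments call out: depending on whether the proposal buffer flushed synchronously, the command is left either at MaxLeaseIndex zero (superseding everything, not in the proposals map) or at a fresh LAI (back in the map), and only the log entry matching that value still has a chance of applying.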