Merge #57136
57136: kv: consider intent in uncertainty interval to be uncertain, not a write-read conflict r=sumeerbhola a=nvanbenschoten

This PR is a partial revert of #40600 and #46830. It solves the same problem that those PRs were solving, but in a different way.

Those two PRs handled the case where a reading transaction observes an intent in its uncertainty interval. Before those fixes, we were not considering intents in a scan's uncertainty interval to be uncertain. This had the potential to cause stale reads, because an unresolved intent says nothing about whether its transaction has already committed or whether that transaction is a causal ancestor of the scan.
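
To make the uncertainty check concrete, here is a minimal sketch of the decision a scan makes for a committed value it encounters; the `Timestamp` type, the `checkCommittedValue` helper, and the return strings are simplified stand-ins rather than CockroachDB's actual hlc/MVCC code. The question those PRs addressed is what to do when the value found in the uncertain window is still an unresolved intent.

```go
// Sketch only: simplified stand-ins, not the real hlc/MVCC types.
package main

import "fmt"

type Timestamp struct{ WallTime int64 }

func (t Timestamp) LessEq(o Timestamp) bool { return t.WallTime <= o.WallTime }

// checkCommittedValue classifies a committed value at valueTS for a scan
// reading at readTS with an uncertainty interval extending to maxTS.
func checkCommittedValue(readTS, maxTS, valueTS Timestamp) string {
	switch {
	case valueTS.LessEq(readTS):
		return "visible: at or below the read timestamp"
	case valueTS.LessEq(maxTS):
		return "uncertain: may be a causal ancestor's write; refresh or restart"
	default:
		return "ignored: above the uncertainty interval"
	}
}

func main() {
	readTS, maxTS := Timestamp{WallTime: 10}, Timestamp{WallTime: 15}
	fmt.Println(checkCommittedValue(readTS, maxTS, Timestamp{WallTime: 12})) // uncertain
}
```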

The approach the first PR took was to throw a WriteIntentError on intents in a scan's uncertainty interval. Effectively, it made scans consider intents in their uncertainty interval to be write-read conflicts. This had the benefit that the scan would push the intent and eventually resolve the conflict, either by aborting the intent, pushing it out of the read's uncertainty interval, or waiting for it to commit. In this last case (which is by far the most common), the scan would then throw an uncertainty error, because it now had a committed value in its uncertainty interval.

The second PR introduced some awkward code to deal with the fallout from this decision. Once we started trying to keep the lock table in sync with the MVCC state, we needed to use latches to ensure proper synchronization between any operations that could conflict because such conflicts would influence the state of the lock table. This meant that we needed to start holding latches across a read's uncertainty interval, because intent writes in this interval were considered write-read conflicts. This led to some moderately gross code and always felt a little wrong.

Now that we are complicating the logic around uncertainty intervals even further, this becomes even more of a burden. This motivates a reworking of these interactions. This commit replaces the logic that considers intents in a transaction's uncertainty interval to be write-read conflicts with logic that considers such intents to be... uncertain. Doing so means that a transaction will immediately refresh/restart above the uncertain timestamp and will only then begin conflicting with the intent.
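
As a hedged sketch of the behavioral change (hypothetical helper names and simplified types; the real logic lives in the MVCC and concurrency code touched below): the intent case moves from the write-read-conflict branch to the uncertainty branch.

```go
// Sketch only: illustrates how an intent found in a read's uncertainty
// interval is classified before and after this change. Names and types are
// simplified stand-ins, not the actual storage/concurrency code.
package main

import "fmt"

type Timestamp struct{ WallTime int64 }

func (t Timestamp) LessEq(o Timestamp) bool { return t.WallTime <= o.WallTime }

const (
	writeIntentConflict = "write-read conflict: push the intent's transaction and wait"
	uncertainRestart    = "uncertain: refresh/restart above the intent's timestamp"
	ignored             = "ignored: above the uncertainty interval"
)

// oldClassifyIntent reflects the approach from #40600: an intent anywhere at
// or below MaxTimestamp was treated as a write-read conflict.
func oldClassifyIntent(readTS, maxTS, intentTS Timestamp) string {
	if intentTS.LessEq(maxTS) {
		return writeIntentConflict
	}
	return ignored
}

// newClassifyIntent reflects this change: only intents at or below the read
// timestamp are conflicts; intents in the uncertainty window are uncertain.
func newClassifyIntent(readTS, maxTS, intentTS Timestamp) string {
	switch {
	case intentTS.LessEq(readTS):
		return writeIntentConflict
	case intentTS.LessEq(maxTS):
		return uncertainRestart
	default:
		return ignored
	}
}

func main() {
	readTS, maxTS, intentTS := Timestamp{WallTime: 10}, Timestamp{WallTime: 15}, Timestamp{WallTime: 12}
	fmt.Println(oldClassifyIntent(readTS, maxTS, intentTS)) // write-read conflict
	fmt.Println(newClassifyIntent(readTS, maxTS, intentTS)) // uncertain
}
```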

This has a number of nice benefits:
1. it keeps all considerations of read uncertainty intervals down in MVCC iteration logic. The lock table no longer needs to be aware of it. This is a big win now and an even bigger win once we introduce synthetic timestamps.
2. readers that are almost certainly bound to hit an uncertainty error and need to restart will now do so sooner. In rare cases, this may result in wasted work. In the vast majority of cases, this will allow the reader to be more responsive to the commit of its conflict.
3. uncertainty errors will only be thrown for locks in the uncertainty interval of a read that are protecting a provisional write (intents). Before, any form of a lock in a read's uncertainty interval would be considered a write-read conflict, which was pessimistic and not needed for correctness.

   In a future with a fully segregated lock table, the change in semantic meaning here becomes even clearer. Instead of detecting the lock associated with an intent in a read's uncertainty interval and declaring a write-read conflict, the read will instead pass through the lock table untouched, detect the provisional value associated with the intent, and declare uncertainty. This seems closer to what we're actually trying to say about these interactions.

Before making this change, I intend to validate the hypothesis that it will not affect performance (or may even slightly improve performance) by running it on the YCSB benchmark suite.

Co-authored-by: Nathan VanBenschoten <nvanbenschoten@gmail.com>
craig[bot] and nvanbenschoten committed Jan 16, 2021
2 parents 422560b + 0850deb commit cd54fb7
Showing 11 changed files with 134 additions and 315 deletions.
43 changes: 12 additions & 31 deletions pkg/kv/kvserver/batcheval/declare.go
@@ -50,39 +50,20 @@ func DefaultDeclareIsolatedKeys(
access = spanset.SpanReadOnly
if header.Txn != nil {
// For transactional reads, acquire read latches all the way up to
// the transaction's MaxTimestamp, because reads may observe locks
// the transaction's MaxTimestamp, because reads may observe writes
// all the way up to this timestamp.
//
// TODO(nvanbenschoten): this parallels similar logic in
// concurrency.Request.readConflictTimestamp, which indicates that
// there is almost certainly a better way to structure this. There
// are actually two issues here that lead to this duplication:
//
// 1. latch spans and lock spans are declared separately. While these
// concepts are not identical, it appears that lock spans are always
// a subset of latch spans, which means that we can probably unify
// the concepts more closely than we have thus far. This would
// probably also have positive performance implications, as the
// duplication mandates extra memory allocations.
//
// 2. latch spans can each be assigned unique MVCC timestamps but lock
// spans inherit the timestamp of their request's transaction (see
// lockTable and concurrency.Request.{read,write}ConflictTimestamp).
// This difference is strange and confusing. It's not clear that the
// generality of latches each being declared at their own timestamp
// is useful. There may be an emergent pattern that arises here when
// we unify latch and lock spans (see part 1) where latches that are
// in the lock span subset can inherit their request's transaction's
// timestamp and latches that are not are non-MVCC latches.
//
// Note that addressing these issues does not necessarily need to
// lead to the timestamp that MVCC spans are interpretted at being
// the same for the purposes of the latch manager and lock-table.
// For instance, once the lock-table is segregated and all logic
// relating to "lock discovery" is removed, we no longer need to
// acquire read latches up to a txn's max timestamp, just to its
// read timestamp. However, we will still need to search the
// lock-table up to a txn's max timestamp.
// It is critical that reads declare latches up through their
// uncertainty interval so that they are properly synchronized with
// earlier writes that may have a happened-before relationship with
// the read. These writes could not have completed and returned to
// the client until they were durable in the Range's Raft log.
// However, they may not have been applied to the replica's state
// machine by the time the write was acknowledged, because Raft
// entry application occurs asynchronously with respect to the
// writer (see AckCommittedEntriesBeforeApplication). Latching is
// the only mechanism that ensures that any observers of the write
// wait for the write apply before reading.
timestamp.Forward(header.Txn.MaxTimestamp)
}
}
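
As an aside on the new comment above: a minimal, self-contained sketch of the rule it describes (simplified Timestamp and txn types and a hypothetical readLatchTimestamp helper; the real logic is the timestamp.Forward(header.Txn.MaxTimestamp) call in the diff, feeding the declared read-latch span).

```go
// Sketch only: how a transactional read's latch timestamp is chosen, so that
// the read latch covers the whole uncertainty interval. Simplified types;
// not the real batcheval/spanset APIs.
package main

import "fmt"

type Timestamp struct{ WallTime int64 }

func (t *Timestamp) Forward(o Timestamp) {
	if t.WallTime < o.WallTime {
		t.WallTime = o.WallTime
	}
}

type txn struct {
	ReadTimestamp Timestamp
	MaxTimestamp  Timestamp // top of the uncertainty interval
}

// readLatchTimestamp picks the timestamp at which a read declares its read
// latches: the request timestamp, forwarded to MaxTimestamp for transactional
// reads so the read waits for earlier, acknowledged-but-unapplied writes in
// its uncertainty interval.
func readLatchTimestamp(requestTS Timestamp, t *txn) Timestamp {
	ts := requestTS
	if t != nil {
		ts.Forward(t.MaxTimestamp)
	}
	return ts
}

func main() {
	t := &txn{ReadTimestamp: Timestamp{WallTime: 10}, MaxTimestamp: Timestamp{WallTime: 15}}
	fmt.Println(readLatchTimestamp(t.ReadTimestamp, t)) // {15}
}
```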
25 changes: 0 additions & 25 deletions pkg/kv/kvserver/concurrency/concurrency_manager.go
@@ -408,31 +408,6 @@ func (r *Request) txnMeta() *enginepb.TxnMeta {
return &r.Txn.TxnMeta
}

// readConflictTimestamp returns the maximum timestamp at which the request
// conflicts with locks acquired by other transaction. The request must wait
// for all locks acquired by other transactions at or below this timestamp
// to be released. All locks acquired by other transactions above this
// timestamp are ignored.
func (r *Request) readConflictTimestamp() hlc.Timestamp {
ts := r.Timestamp
if r.Txn != nil {
ts = r.Txn.ReadTimestamp
ts.Forward(r.Txn.MaxTimestamp)
}
return ts
}

// writeConflictTimestamp returns the minimum timestamp at which the request
// acquires locks when performing mutations. All writes performed by the
// requests must take place at or above this timestamp.
func (r *Request) writeConflictTimestamp() hlc.Timestamp {
ts := r.Timestamp
if r.Txn != nil {
ts = r.Txn.WriteTimestamp
}
return ts
}

func (r *Request) isSingle(m roachpb.Method) bool {
if len(r.Requests) != 1 {
return false
19 changes: 8 additions & 11 deletions pkg/kv/kvserver/concurrency/lock_table.go
@@ -93,7 +93,6 @@ type waitingState struct {
// TODO(sbhola):
// - metrics about lockTable state to export to observability debug pages:
// number of locks, number of waiting requests, wait time?, ...
// - test cases where guard.readTS != guard.writeTS.

// The btree for a particular SpanScope.
type treeMu struct {
@@ -270,10 +269,9 @@ type lockTableGuardImpl struct {
lt *lockTableImpl

// Information about this request.
txn *enginepb.TxnMeta
spans *spanset.SpanSet
readTS hlc.Timestamp
writeTS hlc.Timestamp
txn *enginepb.TxnMeta
ts hlc.Timestamp
spans *spanset.SpanSet

// Snapshots of the trees for which this request has some spans. Note that
// the lockStates in these snapshots may have been removed from
@@ -871,7 +869,7 @@ func (l *lockState) Format(buf *strings.Builder, finalizedTxnCache *txnCache) {
txn, ts := l.getLockHolder()
if txn == nil {
fmt.Fprintf(buf, " res: req: %d, ", l.reservation.seqNum)
writeResInfo(buf, l.reservation.txn, l.reservation.writeTS)
writeResInfo(buf, l.reservation.txn, l.reservation.ts)
} else {
writeHolderInfo(buf, txn, ts)
}
@@ -1211,7 +1209,7 @@ func (l *lockState) tryActiveWait(
return false, false
}
// Locked by some other txn.
if g.readTS.Less(lockHolderTS) {
if g.ts.Less(lockHolderTS) {
return false, false
}
g.mu.Lock()
@@ -1534,7 +1532,7 @@ func (l *lockState) discoveredLock(
// the lock table. If not then it shouldn't have discovered the lock in
// the first place. Bugs here would cause infinite loops where the same
// lock is repeatedly re-discovered.
if g.readTS.Less(ts) {
if g.ts.Less(ts) {
return errors.AssertionFailedf("discovered non-conflicting lock")
}

@@ -1764,7 +1762,7 @@ func (l *lockState) increasedLockTs(newTs hlc.Timestamp) {
g := e.Value.(*lockTableGuardImpl)
curr := e
e = e.Next()
if g.readTS.Less(newTs) {
if g.ts.Less(newTs) {
// Stop waiting.
l.waitingReaders.Remove(curr)
if g == l.distinguishedWaiter {
@@ -1971,9 +1969,8 @@ func (t *lockTableImpl) ScanAndEnqueue(req Request, guard lockTableGuard) lockTa
g.seqNum = atomic.AddUint64(&t.seqNum, 1)
g.lt = t
g.txn = req.txnMeta()
g.ts = req.Timestamp
g.spans = req.LockSpans
g.readTS = req.readConflictTimestamp()
g.writeTS = req.writeConflictTimestamp()
g.sa = spanset.NumSpanAccess - 1
g.index = -1
} else {
8 changes: 7 additions & 1 deletion pkg/kv/kvserver/concurrency/lock_table_waiter.go
@@ -546,7 +546,7 @@ func (w *lockTableWaiterImpl) pushRequestTxn(

func (w *lockTableWaiterImpl) pushHeader(req Request) roachpb.Header {
h := roachpb.Header{
Timestamp: req.readConflictTimestamp(),
Timestamp: req.Timestamp,
UserPriority: req.Priority,
}
if req.Txn != nil {
@@ -555,6 +555,12 @@ func (w *lockTableWaiterImpl) pushHeader(req Request) roachpb.Header {
// could race). Since the subsequent execution of the original request
// might mutate the transaction, make a copy here. See #9130.
h.Txn = req.Txn.Clone()
// We must push at least to req.Timestamp, but for transactional
// requests we actually want to go all the way up to the top of the
// transaction's uncertainty interval. This allows us to not have to
// restart for uncertainty if the push succeeds and we come back and
// read.
h.Timestamp.Forward(req.Txn.MaxTimestamp)
}
return h
}
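
To illustrate the comment above, a sketch with simplified types, assuming (as the comment does) that a successful timestamp push leaves the conflicting write just above the pushed-to timestamp: pushing only to the read timestamp would leave the pushee inside the reader's uncertainty interval, forcing a restart anyway, while pushing to MaxTimestamp lets the reader proceed.

```go
// Sketch only: simplified stand-ins, not the real roachpb/hlc types. Shows
// the difference between pushing a conflicting transaction to the read
// timestamp versus to the top of the uncertainty interval.
package main

import "fmt"

type Timestamp struct{ WallTime int64 }

func (t Timestamp) LessEq(o Timestamp) bool { return t.WallTime <= o.WallTime }

// classify reports how a read at readTS with uncertainty interval
// (readTS, maxTS] treats a committed value at valueTS.
func classify(readTS, maxTS, valueTS Timestamp) string {
	switch {
	case valueTS.LessEq(readTS):
		return "visible"
	case valueTS.LessEq(maxTS):
		return "uncertain: restart"
	default:
		return "ignored: read proceeds"
	}
}

func main() {
	readTS, maxTS := Timestamp{WallTime: 10}, Timestamp{WallTime: 15}

	// Assume a successful timestamp push leaves the pushee's write just above
	// the timestamp it was pushed to.
	pushedToReadTS := Timestamp{WallTime: 11} // pushed only above readTS
	pushedToMaxTS := Timestamp{WallTime: 16}  // pushed above maxTS, as pushHeader arranges

	fmt.Println(classify(readTS, maxTS, pushedToReadTS)) // uncertain: restart
	fmt.Println(classify(readTS, maxTS, pushedToMaxTS))  // ignored: read proceeds
}
```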

