From aaaebc5199fc758cf35e16b740bd7b46ea9ee5e9 Mon Sep 17 00:00:00 2001
From: Jackson Owens <jackson@cockroachlabs.com>
Date: Wed, 15 Nov 2023 18:31:03 -0500
Subject: [PATCH] db: add CanDeterministicallySingleDelete

Add a new specialized CanDeterministicallySingleDelete function that allows a
user to inspect hidden internal keys within the current user key to determine
whether the behavior of a SingleDelete of the current Key would be
deterministic, assuming no writes are made to the key between Iterator creation
and commit of the SingleDelete. The CanDeterministicallySingleDelete func may
only be called on an Iterator oriented in the forward direction and does not
change the external Iterator position.

During intent resolution, CockroachDB will write a SINGLEDEL tombstone to
delete the intent if the intent's value indicates that it must have only been
written once. Subtle logic that cuts across the stack makes this optimization
possible, and one possible explanation for a recent instance of replica
divergence (cockroachdb/cockroach#114421) is a bug in this logic. We anticipate
using CanDeterministicallySingleDelete within CockroachDB's intent resolution
to validate this logic at runtime.

There is some potential for this function to drive the decision to use
SingleDelete when creating batches for local Engine application only.

Informs cockroachdb/cockroach#114492.
---
 data_test.go                          |  23 +-
 iterator.go                           | 147 ++++++++++
 metamorphic/config.go                 |   2 +
 metamorphic/generator.go              |   8 +
 metamorphic/key_manager.go            |   1 +
 metamorphic/ops.go                    |  30 +++
 metamorphic/parser.go                 |   5 +
 testdata/iter_histories/internal_next | 370 ++++++++++++++++++++++++++
 8 files changed, 585 insertions(+), 1 deletion(-)
 create mode 100644 testdata/iter_histories/internal_next
diff --git a/data_test.go b/data_test.go
index 15735f59d4f..ec8f5bda909 100644
--- a/data_test.go
+++ b/data_test.go
@@ -132,6 +132,27 @@ func runIterCmd(d *datadriven.TestData, iter *Iterator, closeIter bool) string {
 			}
 			validityState = iter.NextWithLimit([]byte(parts[1]))
 			printValidityState = true
+		case "internal-next":
+			validity, keyKind := iter.internalNext()
+			switch validity {
+			case internalNextError:
+				fmt.Fprintf(&b, "err: %s\n", iter.Error())
+			case internalNextExhausted:
+				fmt.Fprint(&b, ".\n")
+			case internalNextValid:
+				fmt.Fprintf(&b, "%s\n", keyKind)
+			default:
+				panic("unreachable")
+			}
+			continue
+		case "can-deterministically-single-delete":
+			ok, err := CanDeterministicallySingleDelete(iter)
+			if err != nil {
+				fmt.Fprintf(&b, "err: %s\n", err)
+			} else {
+				fmt.Fprintf(&b, "%t\n", ok)
+			}
+			continue
 		case "prev-limit":
 			if len(parts) != 2 {
 				return "prev-limit <limit>\n"
@@ -402,7 +423,7 @@ func runBatchDefineCmd(d *datadriven.TestData, b *Batch) error {
 			err = b.Delete([]byte(parts[1]), nil)
 		case "del-sized":
 			if len(parts) != 3 {
-				return errors.Errorf("%s expects 1 argument", parts[0])
+				return errors.Errorf("%s expects 2 arguments", parts[0])
 			}
 			var valSize uint64
 			valSize, err = strconv.ParseUint(parts[2], 10, 32)
diff --git a/iterator.go b/iterator.go
index 831e31512ef..45fd8de87c5 100644
--- a/iterator.go
+++ b/iterator.go
@@ -2843,3 +2843,150 @@ func (stats *IteratorStats) SafeFormat(s redact.SafePrinter, verb rune) {
 			stats.RangeKeyStats.SkippedPoints)
 	}
 }
+
+// CanDeterministicallySingleDelete takes a valid iterator and examines internal
+// state to determine if a SingleDelete deleting Iterator.Key() would
+// deterministically delete the key. CanDeterministicallySingleDelete requires
+// the iterator to be oriented in the forward direction (eg, the last
+// positioning operation must've been a First, a Seek[Prefix]GE, or a
+// Next[Prefix][WithLimit]).
+//
+// This function does not change the external position of the iterator, and all
+// positioning methods should behave the same as if it was never called. This
+// function will only return a meaningful result the first time it's invoked at
+// an iterator position. This function invalidates the iterator Value's memory,
+// and the caller must not rely on the memory safety of the previous Iterator
+// position.
+//
+// If CanDeterministicallySingleDelete returns true AND the key at the iterator
+// position is not modified between the creation of the Iterator and the commit
+// of a batch containing a SingleDelete over the key, then the caller can be
+// assured that SingleDelete is equivalent to Delete on the local engine, but it
+// may not be true on another engine that received the same writes and with
+// logically equivalent state since this engine may have collapsed multiple SETs
+// into one.
+func CanDeterministicallySingleDelete(it *Iterator) (bool, error) {
+	validity, kind := it.internalNext()
+	for validity == internalNextValid {
+		switch kind {
+		case InternalKeyKindDelete, InternalKeyKindDeleteSized, InternalKeyKindSingleDelete:
+			// A DEL or DELSIZED tombstone is okay. An internal key
+			// sequence like SINGLEDEL; SET; DEL; SET can be handled
+			// deterministically. If there are SETs further down, we
+			// don't care about them.
+			//
+			// A SingleDelete is also okay as long as when that SingleDelete was
+			// written, it was written deterministically (eg, with its own
+			// CanDeterministicallySingleDelete check).
+			return true, nil
+		case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge:
+			// We encountered a shadowed SET, SETWITHDEL, MERGE. A SINGLEDEL
+			// that deleted the KV at the original iterator position could
+			// result in this key becoming visible.
+			return false, nil
+		case InternalKeyKindRangeDelete:
+			// RangeDeletes are handled by the merging iterator and should never
+			// be observed by the top-level Iterator.
+			panic(errors.AssertionFailedf("pebble: unexpected range delete"))
+		case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
+			// Range keys are interleaved at the maximal sequence number and
+			// should never be observed within a user key.
+			panic(errors.AssertionFailedf("pebble: unexpected range key"))
+		default:
+			panic(errors.AssertionFailedf("pebble: unexpected key kind: %s", errors.Safe(kind)))
+		}
+	}
+	if validity == internalNextError {
+		return false, it.Error()
+	}
+	return true, nil
+}
+
+// internalNextValidity enumerates the potential outcomes of a call to
+// internalNext.
+type internalNextValidity int8
+
+const (
+	// internalNextError is returned by internalNext when an error occurred and
+	// the caller is responsible for checking iter.Error().
+	internalNextError internalNextValidity = iota
+	// internalNextExhausted is returned by internalNext when the next internal
+	// key is an internal key with a different user key than Iterator.Key().
+	internalNextExhausted
+	// internalNextValid is returned by internalNext when the internal next
+	// found a shadowed internal key with a user key equal to Iterator.Key().
+	internalNextValid
+)
+
+// internalNext advances internal Iterator state forward to expose the
+// InternalKeyKind of the next internal key with a user key equal to Key().
+//
+// internalNext is a highly specialized operation and is unlikely to be
+// generally useful. See Iterator.Next for how to reposition the iterator to the
+// next key. internalNext requires the Iterator to be at a valid position in the
+// forward direction (the last positioning operation must've been a First, a
+// Seek[Prefix]GE, or a Next[Prefix][WithLimit] and Valid() must return true).
+//
+// internalNext, unlike all other Iterator methods, exposes internal LSM state.
+// internalNext advances the Iterator's internal iterator to the next shadowed
+// key with a user key equal to Key(). When a key is overwritten or deleted, its
+// removal from the LSM occurs lazily as a part of compactions. internalNext
+// allows the caller to see whether an obsolete internal key exists with the
+// current Key(), and what it's key kind is. Note that the existence of an
+// internal key is nondeterministic and dependent on internal LSM state. These
+// semantics are unlikely to be applicable to almost all use cases.
+//
+// If internalNext finds a key that shares the same user key as Key(), it
+// returns internalNextValid and the internal key's kind. If internalNext
+// encounters an error, it returns internalNextError and the caller is expected
+// to call Iterator.Error() to retrieve it. In all other circumstances,
+// internalNext returns internalNextExhausted, indicating that there are no more
+// additional internal keys with the user key Key().
+//
+// internalNext does not change the external position of the iterator, and a
+// Next operation should behave the same as if internalNext was never called.
+// internalNext does invalidate the iterator Value's memory, and the caller must
+// not rely on the memory safety of the previous Iterator position.
+func (i *Iterator) internalNext() (internalNextValidity, base.InternalKeyKind) {
+	i.stats.ForwardStepCount[InterfaceCall]++
+	if i.err != nil {
+		return internalNextError, base.InternalKeyKindInvalid
+	} else if i.iterValidityState != IterValid {
+		return internalNextExhausted, base.InternalKeyKindInvalid
+	}
+	i.lastPositioningOp = unknownLastPositionOp
+
+	switch i.pos {
+	case iterPosCurForward:
+		i.iterKey, i.iterValue = i.iter.Next()
+		if i.iterKey == nil {
+			// We check i.iter.Error() here and return an internalNextError enum
+			// variant so that the caller does not need to check i.iter.Error()
+			// in the common case that the next internal key has a new user key.
+			if i.err = i.iter.Error(); i.err != nil {
+				return internalNextError, base.InternalKeyKindInvalid
+			}
+			i.pos = iterPosNext
+			return internalNextExhausted, base.InternalKeyKindInvalid
+		} else if i.comparer.Equal(i.iterKey.UserKey, i.key) {
+			return internalNextValid, i.iterKey.Kind()
+		}
+		i.pos = iterPosNext
+		return internalNextExhausted, base.InternalKeyKindInvalid
+	case iterPosCurReverse, iterPosCurReversePaused, iterPosPrev:
+		i.err = errors.New("switching from reverse to forward via internalNext is prohibited")
+		i.iterValidityState = IterExhausted
+		return internalNextError, base.InternalKeyKindInvalid
+	case iterPosNext, iterPosCurForwardPaused:
+		// The previous method already moved onto the next user key. This is
+		// only possible if
+		//   - the last positioning method was a call to internalNext, and we
+		//     advanced to a new user key.
+		//   - the previous non-internalNext iterator operation encountered a
+		//     range key or merge, forcing an internal Next that found a new
+		//     user key that's not equal to i.Iterator.Key().
+		return internalNextExhausted, base.InternalKeyKindInvalid
+	default:
+		panic("unreachable")
+	}
+}
diff --git a/metamorphic/config.go b/metamorphic/config.go
index f86c651269f..891c56568ab 100644
--- a/metamorphic/config.go
+++ b/metamorphic/config.go
@@ -23,6 +23,7 @@ const (
 	iterNext
 	iterNextWithLimit
 	iterNextPrefix
+	iterCanSingleDelete
 	iterPrev
 	iterPrevWithLimit
 	iterSeekGE
@@ -131,6 +132,7 @@ func defaultConfig() config {
 			iterNext:                    100,
 			iterNextWithLimit:           20,
 			iterNextPrefix:              20,
+			iterCanSingleDelete:         20,
 			iterPrev:                    100,
 			iterPrevWithLimit:           20,
 			iterSeekGE:                  100,
diff --git a/metamorphic/generator.go b/metamorphic/generator.go
index 1ec5545561c..8e7e6d45b36 100644
--- a/metamorphic/generator.go
+++ b/metamorphic/generator.go
@@ -149,6 +149,7 @@ func generate(rng *rand.Rand, count uint64, cfg config, km *keyManager) []op {
 		iterNext:                    g.randIter(g.iterNext),
 		iterNextWithLimit:           g.randIter(g.iterNextWithLimit),
 		iterNextPrefix:              g.randIter(g.iterNextPrefix),
+		iterCanSingleDelete:         g.randIter(g.iterCanSingleDelete),
 		iterPrev:                    g.randIter(g.iterPrev),
 		iterPrevWithLimit:           g.randIter(g.iterPrevWithLimit),
 		iterSeekGE:                  g.randIter(g.iterSeekGE),
@@ -1121,6 +1122,13 @@ func (g *generator) iterNextPrefix(iterID objID) {
 	})
 }
 
+func (g *generator) iterCanSingleDelete(iterID objID) {
+	g.add(&iterCanSingleDelOp{
+		iterID:          iterID,
+		derivedReaderID: g.iterReaderID[iterID],
+	})
+}
+
 func (g *generator) iterPrevWithLimit(iterID objID) {
 	g.add(&iterPrevOp{
 		iterID:          iterID,
diff --git a/metamorphic/key_manager.go b/metamorphic/key_manager.go
index 96975032f4e..5943aa30420 100644
--- a/metamorphic/key_manager.go
+++ b/metamorphic/key_manager.go
@@ -522,6 +522,7 @@ func opWrittenKeys(untypedOp op) [][]byte {
 	case *iterLastOp:
 	case *iterNextOp:
 	case *iterNextPrefixOp:
+	case *iterCanSingleDelOp:
 	case *iterPrevOp:
 	case *iterSeekGEOp:
 	case *iterSeekLTOp:
diff --git a/metamorphic/ops.go b/metamorphic/ops.go
index bc194e6da85..1a671fe0135 100644
--- a/metamorphic/ops.go
+++ b/metamorphic/ops.go
@@ -1280,6 +1280,36 @@ func (o *iterNextPrefixOp) String() string       { return fmt.Sprintf("%s.NextPr
 func (o *iterNextPrefixOp) receiver() objID      { return o.iterID }
 func (o *iterNextPrefixOp) syncObjs() objIDSlice { return onlyBatchIDs(o.derivedReaderID) }
 
+// iterCanSingleDelOp models a call to CanDeterministicallySingleDelete with an
+// Iterator.
+type iterCanSingleDelOp struct {
+	iterID objID
+
+	derivedReaderID objID
+}
+
+func (o *iterCanSingleDelOp) run(t *test, h historyRecorder) {
+	// TODO(jackson): When we perform error injection, we'll need to rethink
+	// this.
+	_, err := pebble.CanDeterministicallySingleDelete(t.getIter(o.iterID).iter)
+	// The return value of CanDeterministicallySingleDelete is dependent on
+	// internal LSM state and non-deterministic, so we don't record it.
+	// Including the operation within the metamorphic test at all helps ensure
+	// that it does not change the result of any other Iterator operation that
+	// should be deterministic, regardless of its own outcome.
+	//
+	// We still record the value of the error because it's deterministic, at
+	// least for now. The possible error cases are:
+	//  - The iterator was already in an error state when the operation ran.
+	//  - The operation is deterministically invalid (like using an InternalNext
+	//    to change directions.)
+	h.Recordf("%s // %v", o, err)
+}
+
+func (o *iterCanSingleDelOp) String() string       { return fmt.Sprintf("%s.InternalNext()", o.iterID) }
+func (o *iterCanSingleDelOp) receiver() objID      { return o.iterID }
+func (o *iterCanSingleDelOp) syncObjs() objIDSlice { return onlyBatchIDs(o.derivedReaderID) }
+
 // iterPrevOp models an Iterator.Prev[WithLimit] operation.
 type iterPrevOp struct {
 	iterID objID
diff --git a/metamorphic/parser.go b/metamorphic/parser.go
index 75aa040150d..81b14c3e731 100644
--- a/metamorphic/parser.go
+++ b/metamorphic/parser.go
@@ -89,6 +89,8 @@ func opArgs(op op) (receiverID *objID, targetID *objID, args []interface{}) {
 		return &t.iterID, nil, []interface{}{&t.limit}
 	case *iterNextPrefixOp:
 		return &t.iterID, nil, nil
+	case *iterCanSingleDelOp:
+		return &t.iterID, nil, []interface{}{}
 	case *iterPrevOp:
 		return &t.iterID, nil, []interface{}{&t.limit}
 	case *iterSeekLTOp:
@@ -139,6 +141,7 @@ var methods = map[string]*methodInfo{
 	"NewSnapshot":               makeMethod(newSnapshotOp{}, dbTag),
 	"Next":                      makeMethod(iterNextOp{}, iterTag),
 	"NextPrefix":                makeMethod(iterNextPrefixOp{}, iterTag),
+	"InternalNext":              makeMethod(iterCanSingleDelOp{}, iterTag),
 	"Prev":                      makeMethod(iterPrevOp{}, iterTag),
 	"RangeKeyDelete":            makeMethod(rangeKeyDeleteOp{}, dbTag, batchTag),
 	"RangeKeySet":               makeMethod(rangeKeySetOp{}, dbTag, batchTag),
@@ -536,6 +539,8 @@ func computeDerivedFields(ops []op) {
 			v.derivedReaderID = iterToReader[v.iterID]
 		case *iterNextPrefixOp:
 			v.derivedReaderID = iterToReader[v.iterID]
+		case *iterCanSingleDelOp:
+			v.derivedReaderID = iterToReader[v.iterID]
 		case *iterPrevOp:
 			v.derivedReaderID = iterToReader[v.iterID]
 		case *newBatchOp:
diff --git a/testdata/iter_histories/internal_next b/testdata/iter_histories/internal_next
new file mode 100644
index 00000000000..614f1e15d45
--- /dev/null
+++ b/testdata/iter_histories/internal_next
@@ -0,0 +1,370 @@
+reset
+----
+
+# For all prefixes a-z, write 3 keys at timestamps @1, @10, @100.
+# This populates a total of 26 * 3 = 78 keys.
+
+populate keylen=1 timestamps=(1, 10, 100)
+----
+wrote 78 keys
+
+combined-iter
+first
+next-prefix
+internal-next
+internal-next
+next
+next-prefix
+internal-next
+internal-next
+next
+internal-next
+next
+internal-next
+----
+a@100: (a@100, .)
+b@100: (b@100, .)
+.
+.
+b@10: (b@10, .)
+c@100: (c@100, .)
+.
+.
+c@10: (c@10, .)
+.
+c@1: (c@1, .)
+.
+
+combined-iter
+first
+next-prefix
+can-deterministically-single-delete
+next
+next-prefix
+can-deterministically-single-delete
+next
+can-deterministically-single-delete
+next
+can-deterministically-single-delete
+----
+a@100: (a@100, .)
+b@100: (b@100, .)
+true
+b@10: (b@10, .)
+c@100: (c@100, .)
+true
+c@10: (c@10, .)
+true
+c@1: (c@1, .)
+true
+
+# The start boundaries of range keys are interleaved and can cause the internal
+# iterator to be advanced ahead to look for a point at the same user key. This
+# is one of the few situations in which InternalNext may find the iterator is
+# already positioned at iterPosNext. Test this scenario.
+
+batch commit
+range-key-set a b @1 foo
+range-key-set bb c @2 bar
+----
+committed 2 keys
+
+combined-iter
+first
+internal-next
+next
+internal-next
+seek-ge b@10
+internal-next
+next
+internal-next
+internal-next
+next
+----
+a: (., [a-b) @1=foo UPDATED)
+.
+a@100: (a@100, [a-b) @1=foo)
+.
+b@10: (b@10, . UPDATED)
+.
+b@1: (b@1, .)
+.
+.
+bb: (., [bb-c) @2=bar UPDATED)
+
+combined-iter
+first
+can-deterministically-single-delete
+next
+can-deterministically-single-delete
+seek-ge b@10
+can-deterministically-single-delete
+next
+can-deterministically-single-delete
+next
+----
+a: (., [a-b) @1=foo UPDATED)
+true
+a@100: (a@100, [a-b) @1=foo)
+true
+b@10: (b@10, . UPDATED)
+true
+b@1: (b@1, .)
+true
+bb: (., [bb-c) @2=bar UPDATED)
+
+
+reset
+----
+
+batch commit
+set a a
+set b b
+range-key-set b c @1 foo
+set d d
+----
+committed 4 keys
+
+combined-iter
+first
+internal-next
+next
+internal-next
+next
+prev
+internal-next
+----
+a: (a, .)
+.
+b: (b, [b-c) @1=foo UPDATED)
+.
+d: (d, . UPDATED)
+b: (b, [b-c) @1=foo UPDATED)
+err: switching from reverse to forward via internalNext is prohibited
+
+combined-iter
+first
+can-deterministically-single-delete
+next
+can-deterministically-single-delete
+next
+prev
+can-deterministically-single-delete
+----
+a: (a, .)
+true
+b: (b, [b-c) @1=foo UPDATED)
+true
+d: (d, . UPDATED)
+b: (b, [b-c) @1=foo UPDATED)
+err: switching from reverse to forward via internalNext is prohibited
+
+# Perform a test where we produce two internal versions (both SETs) for each
+# user key. Note that this test disables automatic compactions, so the presence
+# of the internal keys will be deterministic.
+
+reset
+----
+
+populate keylen=1 timestamps=(1, 10, 100)
+----
+wrote 78 keys
+
+flush
+----
+
+populate keylen=1 timestamps=(1, 10, 100)
+----
+wrote 78 keys
+
+combined-iter
+first
+next-prefix
+internal-next
+internal-next
+next
+next-prefix
+internal-next
+internal-next
+next
+internal-next
+next
+internal-next
+----
+a@100: (a@100, .)
+b@100: (b@100, .)
+SET
+.
+b@10: (b@10, .)
+c@100: (c@100, .)
+SET
+.
+c@10: (c@10, .)
+SET
+c@1: (c@1, .)
+SET
+
+combined-iter
+seek-ge z
+internal-next
+next
+next
+internal-next
+internal-next
+next
+internal-next
+----
+z@100: (z@100, .)
+SET
+z@10: (z@10, .)
+z@1: (z@1, .)
+SET
+.
+.
+.
+
+combined-iter
+first
+next-prefix
+can-deterministically-single-delete
+next
+next-prefix
+can-deterministically-single-delete
+next
+can-deterministically-single-delete
+next
+can-deterministically-single-delete
+----
+a@100: (a@100, .)
+b@100: (b@100, .)
+false
+b@10: (b@10, .)
+c@100: (c@100, .)
+false
+c@10: (c@10, .)
+false
+c@1: (c@1, .)
+false
+
+# Test that a CanDeterministicallySingleDelete is true if the old SETs are all
+# deleted by a range delete.
+
+batch commit
+del-range a zzz
+----
+committed 1 keys
+
+populate keylen=1 timestamps=(1, 10, 100)
+----
+wrote 78 keys
+
+combined-iter
+first
+next-prefix
+can-deterministically-single-delete
+next
+next-prefix
+can-deterministically-single-delete
+next
+can-deterministically-single-delete
+next
+can-deterministically-single-delete
+----
+a@100: (a@100, .)
+b@100: (b@100, .)
+true
+b@10: (b@10, .)
+c@100: (c@100, .)
+true
+c@10: (c@10, .)
+true
+c@1: (c@1, .)
+true
+
+# Set fmv=FormatDeleteSizedAndObsolete.
+
+reset format-major-version=15
+----
+
+# Test that a SET > SINGLEDEL > SET sequence yields
+# CanDeterministicallySingleDelete() = false.
+
+batch commit
+set a a
+singledel a
+set a a
+----
+committed 3 keys
+
+combined-iter
+first
+internal-next
+internal-next
+first
+can-deterministically-single-delete
+----
+a: (a, .)
+SINGLEDEL
+SET
+a: (a, .)
+false
+
+# Deleting with a DEL[SIZED] should then allow deterministic single delete
+# again.
+
+batch commit
+del a
+set a a
+----
+committed 2 keys
+
+combined-iter
+first
+internal-next
+internal-next
+internal-next
+internal-next
+internal-next
+first
+can-deterministically-single-delete
+----
+a: (a, .)
+DEL
+SET
+SINGLEDEL
+SET
+.
+a: (a, .)
+true
+
+# The above case tested DEL. Explicitly test DELSIZED by setting the key again,
+# then writing a DELSIZED, then another key.
+
+batch commit
+del-sized a 1
+set a a
+----
+committed 2 keys
+
+combined-iter
+first
+internal-next
+internal-next
+internal-next
+internal-next
+internal-next
+internal-next
+internal-next
+first
+can-deterministically-single-delete
+----
+a: (a, .)
+DELSIZED
+SET
+DEL
+SET
+SINGLEDEL
+SET
+.
+a: (a, .)
+true