diff --git a/internal/crdbtest/crdbtest.go b/internal/crdbtest/crdbtest.go index 5a65f3ef65..09fe0e0e3b 100644 --- a/internal/crdbtest/crdbtest.go +++ b/internal/crdbtest/crdbtest.go @@ -155,7 +155,9 @@ func EncodeTimestamp(key []byte, walltime uint64, logical uint32) []byte { } // DecodeTimestamp decodes a MVCC timestamp from a serialized MVCC key. -func DecodeTimestamp(mvccKey []byte) ([]byte, []byte, uint64, uint32) { +func DecodeTimestamp( + mvccKey []byte, +) (prefix []byte, untypedSuffix []byte, wallTime uint64, logicalTime uint32) { tsLen := int(mvccKey[len(mvccKey)-1]) keyPartEnd := len(mvccKey) - tsLen if keyPartEnd < 0 { diff --git a/sstable/block/block.go b/sstable/block/block.go index f67f73bebc..740a9ebc51 100644 --- a/sstable/block/block.go +++ b/sstable/block/block.go @@ -256,6 +256,14 @@ type IterTransforms struct { // NoTransforms is the default value for IterTransforms. var NoTransforms = IterTransforms{} +// NoTransforms returns true if there are no transforms enabled. +func (t *IterTransforms) NoTransforms() bool { + return t.SyntheticSeqNum == 0 && + t.HideObsoletePoints && + t.SyntheticPrefix.IsSet() && + t.SyntheticSuffix.IsSet() +} + // FragmentIterTransforms allow on-the-fly transformation of range deletion or // range key data at iteration time. type FragmentIterTransforms struct { @@ -293,7 +301,7 @@ const NoSyntheticSeqNum SyntheticSeqNum = 0 // RangeKeyUnset keys are not supported when a synthetic suffix is used. type SyntheticSuffix []byte -// IsSet returns true if the synthetic suffix is not enpty. +// IsSet returns true if the synthetic suffix is not empty. func (ss SyntheticSuffix) IsSet() bool { return len(ss) > 0 } diff --git a/sstable/colblk/cockroach_test.go b/sstable/colblk/cockroach_test.go index 00de833623..e81d9b5ce6 100644 --- a/sstable/colblk/cockroach_test.go +++ b/sstable/colblk/cockroach_test.go @@ -10,6 +10,7 @@ import ( "encoding/binary" "fmt" "io" + "slices" "sync" "testing" "time" @@ -19,6 +20,7 @@ import ( "github.com/cockroachdb/crlib/crstrings" "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/crdbtest" + "github.com/cockroachdb/pebble/internal/invariants" "github.com/cockroachdb/pebble/sstable/block" "github.com/stretchr/testify/require" "golang.org/x/exp/rand" @@ -198,12 +200,18 @@ func (ks *cockroachKeySeeker) Init(r *DataBlockReader) error { // contained within the data block. It's equivalent to performing // // Compare(firstUserKey, k) -func (ks *cockroachKeySeeker) IsLowerBound(k []byte) bool { +func (ks *cockroachKeySeeker) IsLowerBound(k []byte, syntheticSuffix []byte) bool { prefix, untypedSuffix, wallTime, logicalTime := crdbtest.DecodeTimestamp(k) if v := crdbtest.Compare(ks.prefixes.UnsafeFirstSlice(), prefix); v != 0 { return v > 0 } + if len(syntheticSuffix) > 0 { + return crdbtest.Compare(syntheticSuffix, k[len(prefix):]) >= 0 + } if len(untypedSuffix) > 0 { + if invariants.Enabled && ks.mvccWallTimes.At(0) != 0 { + panic("comparing timestamp with untyped suffix") + } return crdbtest.Compare(ks.untypedSuffixes.At(0), untypedSuffix) >= 0 } if v := cmp.Compare(ks.mvccWallTimes.At(0), wallTime); v != 0 { @@ -250,12 +258,12 @@ func (ks *cockroachKeySeeker) seekGEOnSuffix(index int, seekSuffix []byte) (row seekWallTime = binary.LittleEndian.Uint64(seekSuffix) default: // The suffix is untyped. Compare the untyped suffixes. - // Binary search between [index, prefixChanged.SeekSetBitGE(index)]. + // Binary search between [index, prefixChanged.SeekSetBitGE(index+1)]. // // Define f(l-1) == false and f(u) == true. // Invariant: f(l-1) == false, f(u) == true. l := index - u := ks.reader.prefixChanged.SeekSetBitGE(index) + u := ks.reader.prefixChanged.SeekSetBitGE(index + 1) for l < u { h := int(uint(l+u) >> 1) // avoid overflow when computing h // l ≤ h < u @@ -309,6 +317,7 @@ func (ks *cockroachKeySeeker) MaterializeUserKey(ki *PrefixBytesIter, prevRow, r if mvccWall == 0 && mvccLogical == 0 { // This is not an MVCC key. Use the untyped suffix. untypedSuffixed := ks.untypedSuffixes.At(row) + // Slice first, to check that the capacity is sufficient. res := ki.buf[:len(ki.buf)+len(untypedSuffixed)] memmove(ptr, unsafe.Pointer(unsafe.SliceData(untypedSuffixed)), uintptr(len(untypedSuffixed))) return res @@ -341,6 +350,23 @@ func (ks *cockroachKeySeeker) MaterializeUserKey(ki *PrefixBytesIter, prevRow, r return ki.buf[:len(ki.buf)+13] } +// MaterializeUserKeyWithSyntheticSuffix is part of the KeySeeker interface. +func (ks *cockroachKeySeeker) MaterializeUserKeyWithSyntheticSuffix( + ki *PrefixBytesIter, suffix []byte, prevRow, row int, +) []byte { + if prevRow+1 == row && prevRow >= 0 { + ks.prefixes.SetNext(ki) + } else { + ks.prefixes.SetAt(ki, row) + } + + // Slice first, to check that the capacity is sufficient. + res := ki.buf[:len(ki.buf)+len(suffix)] + ptr := unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(ki.buf))) + uintptr(len(ki.buf))) + memmove(ptr, unsafe.Pointer(unsafe.SliceData(suffix)), uintptr(len(suffix))) + return res +} + // Release is part of the KeySeeker interface. func (ks *cockroachKeySeeker) Release() { *ks = cockroachKeySeeker{} @@ -371,9 +397,13 @@ func TestCockroachDataBlock(t *testing.T) { var reader DataBlockReader var it DataBlockIter reader.Init(cockroachKeySchema, serializedBlock) - it.Init(&reader, cockroachKeySchema.NewKeySeeker(), getLazyValuer(func([]byte) base.LazyValue { - return base.LazyValue{ValueOrHandle: []byte("mock external value")} - }), block.IterTransforms{}) + it.Init( + &reader, cockroachKeySchema.NewKeySeeker(), crdbtest.Compare, crdbtest.Split, + getLazyValuer(func([]byte) base.LazyValue { + return base.LazyValue{ValueOrHandle: []byte("mock external value")} + }), + block.IterTransforms{}, + ) t.Run("Next", func(t *testing.T) { // Scan the block using Next and ensure that all the keys values match. @@ -518,6 +548,18 @@ func BenchmarkCockroachDataBlockIterTransforms(b *testing.B) { HideObsoletePoints: true, }, }, + { + description: "SyntheticPrefix", + transforms: block.IterTransforms{ + SyntheticPrefix: []byte("prefix_"), + }, + }, + { + description: "SyntheticSuffix", + transforms: block.IterTransforms{ + SyntheticSuffix: crdbtest.EncodeTimestamp(make([]byte, 0, 20), 1_000_000_000_000, 0)[1:], + }, + }, } for _, cfg := range shortBenchConfigs { for _, t := range transforms { @@ -561,11 +603,21 @@ func benchmarkCockroachDataBlockIter( var reader DataBlockReader var it DataBlockIter reader.Init(cockroachKeySchema, serializedBlock) - it.Init(&reader, cockroachKeySchema.NewKeySeeker(), getLazyValuer(func([]byte) base.LazyValue { - return base.LazyValue{ValueOrHandle: []byte("mock external value")} - }), transforms) + it.Init( + &reader, cockroachKeySchema.NewKeySeeker(), crdbtest.Compare, crdbtest.Split, + getLazyValuer(func([]byte) base.LazyValue { + return base.LazyValue{ValueOrHandle: []byte("mock external value")} + }), + transforms, + ) avgRowSize := float64(len(serializedBlock)) / float64(count) + if transforms.SyntheticPrefix.IsSet() { + for i := range keys { + keys[i] = slices.Concat(transforms.SyntheticPrefix, keys[i]) + } + } + b.Run("Next", func(b *testing.B) { kv := it.First() b.ResetTimer() diff --git a/sstable/colblk/data_block.go b/sstable/colblk/data_block.go index bdab29ef4e..dda0aa1d01 100644 --- a/sstable/colblk/data_block.go +++ b/sstable/colblk/data_block.go @@ -100,9 +100,10 @@ func (kcmp KeyComparison) PrefixEqual() bool { return kcmp.PrefixLen == kcmp.Com type KeySeeker interface { // Init initializes the iterator to read from the provided DataBlockReader. Init(b *DataBlockReader) error - // IsLowerBound returns true if all keys in the data block are >= the given - // key. If the data block contains no keys, returns true. - IsLowerBound(k []byte) bool + // IsLowerBound returns true if all keys in the data block (after suffix + // replacement if syntheticSuffix is not empty) are >= the given key. If the + // data block contains no keys, returns true. + IsLowerBound(k []byte, syntheticSuffix []byte) bool // SeekGE returns the index of the first row with a key greater than or equal // to [key], and whether that row has the same prefix as [key]. // @@ -119,9 +120,22 @@ type KeySeeker interface { // // The provided keyIter must have a buffer large enough to hold the key. // - // The prevRow parameter is the row MaterializeUserKey was last invoked with. - // Implementations may take advantage of that knowledge to reduce work. + // The prevRow parameter is the row MaterializeUserKey was last invoked with + // (or a negative number if not applicable). Implementations may take + // advantage of that knowledge to reduce work. MaterializeUserKey(keyIter *PrefixBytesIter, prevRow, row int) []byte + // MaterializeUserKeyWithSyntheticSuffix is a variant of MaterializeUserKey + // where the suffix is replaced. + // + // The provided keyIter must have a buffer large enough to hold the key after + // suffix replacement. + // + // The prevRow parameter is the row MaterializeUserKeyWithSyntheticSuffix was + // last invoked with (or a negative number if not applicable). Implementations + // may take advantage of that knowledge to reduce work. + MaterializeUserKeyWithSyntheticSuffix( + keyIter *PrefixBytesIter, syntheticSuffix []byte, prevRow, row int, + ) []byte // Release releases the KeySeeker. It's called when the seeker is no longer // in use. Implementations may pool KeySeeker objects. Release() @@ -297,12 +311,16 @@ func (ks *defaultKeySeeker) Init(r *DataBlockReader) error { } // IsLowerBound is part of the KeySeeker interface. -func (ks *defaultKeySeeker) IsLowerBound(k []byte) bool { +func (ks *defaultKeySeeker) IsLowerBound(k []byte, syntheticSuffix []byte) bool { si := ks.comparer.Split(k) if v := ks.comparer.Compare(ks.prefixes.UnsafeFirstSlice(), k[:si]); v != 0 { return v > 0 } - return ks.comparer.Compare(ks.suffixes.At(0), k[si:]) >= 0 + suffix := syntheticSuffix + if len(suffix) == 0 { + suffix = ks.suffixes.At(0) + } + return ks.comparer.Compare(suffix, k[si:]) >= 0 } // SeekGE is part of the KeySeeker interface. @@ -363,6 +381,24 @@ func (ks *defaultKeySeeker) MaterializeUserKey(keyIter *PrefixBytesIter, prevRow return res } +// MaterializeUserKeyWithSyntheticSuffix is part of the colblk.KeySeeker interface. +func (ks *defaultKeySeeker) MaterializeUserKeyWithSyntheticSuffix( + keyIter *PrefixBytesIter, suffix []byte, prevRow, row int, +) []byte { + if row == prevRow+1 && prevRow >= 0 { + ks.prefixes.SetNext(keyIter) + } else { + ks.prefixes.SetAt(keyIter, row) + } + res := keyIter.buf[:len(keyIter.buf)+len(suffix)] + memmove( + unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(keyIter.buf)))+uintptr(len(keyIter.buf))), + unsafe.Pointer(unsafe.SliceData(suffix)), + uintptr(len(suffix)), + ) + return res +} + func (ks *defaultKeySeeker) Release() { *ks = defaultKeySeeker{} defaultKeySeekerPool.Put(ks) @@ -585,6 +621,8 @@ type DataBlockRewriter struct { reader DataBlockReader iter DataBlockIter keySeeker KeySeeker + compare base.Compare + split base.Split keyBuf []byte // keyAlloc grown throughout the lifetime of the rewriter. keyAlloc bytealloc.A @@ -592,6 +630,17 @@ type DataBlockRewriter struct { initialized bool } +// NewDataBlockRewriter creates a block rewriter. +func NewDataBlockRewriter( + keySchema KeySchema, compare base.Compare, split base.Split, +) *DataBlockRewriter { + return &DataBlockRewriter{ + KeySchema: keySchema, + compare: compare, + split: split, + } +} + // RewriteSuffixes rewrites the input block. It expects the input block to only // contain keys with the suffix `from`. It rewrites the block to contain the // same keys with the suffix `to`. @@ -634,7 +683,7 @@ func (rw *DataBlockRewriter) RewriteSuffixes( rw.reader.Init(rw.KeySchema, input) rw.keySeeker.Init(&rw.reader) rw.writer.Reset() - if err = rw.iter.Init(&rw.reader, rw.keySeeker, nil, block.IterTransforms{}); err != nil { + if err = rw.iter.Init(&rw.reader, rw.keySeeker, rw.compare, rw.split, nil, block.IterTransforms{}); err != nil { return base.InternalKey{}, base.InternalKey{}, nil, err } @@ -781,7 +830,7 @@ func (c *MultiDataBlockIter) InitHandle( c.h.Release() c.h = h c.r.Init(c.KeySchema, h.Get()) - return c.DataBlockIter.Init(&c.r, c.KeySchema.NewKeySeeker(), c.GetLazyValuer, t) + return c.DataBlockIter.Init(&c.r, c.KeySchema.NewKeySeeker(), cmp, split, c.GetLazyValuer, t) } // Handle returns the handle to the block. @@ -844,8 +893,11 @@ type DataBlockIter struct { r *DataBlockReader maxRow int keySeeker KeySeeker + cmp base.Compare + split base.Split getLazyValuer block.GetLazyValueForPrefixAndValueHandler transforms block.IterTransforms + noTransforms bool // state keyIter PrefixBytesIter @@ -863,43 +915,85 @@ type DataBlockIter struct { // provided reader. func (i *DataBlockIter) Init( r *DataBlockReader, - keyIterator KeySeeker, + keySeeker KeySeeker, + cmp base.Compare, + split base.Split, getLazyValuer block.GetLazyValueForPrefixAndValueHandler, transforms block.IterTransforms, ) error { numRows := int(r.r.header.Rows) + keyIterBuf := i.keyIter.buf *i = DataBlockIter{ r: r, maxRow: numRows - 1, - keySeeker: keyIterator, + keySeeker: keySeeker, + cmp: cmp, + split: split, getLazyValuer: getLazyValuer, transforms: transforms, row: -1, kvRow: math.MinInt, kv: base.InternalKV{}, - keyIter: PrefixBytesIter{}, + keyIter: PrefixBytesIter{buf: keyIterBuf}, } if i.transforms.HideObsoletePoints && r.isObsolete.SeekSetBitGE(0) == numRows { // There are no obsolete points in the block; don't bother checking. i.transforms.HideObsoletePoints = false } - // Allocate a keyIter buffer that's large enough to hold the largest user - // key in the block with 1 byte to spare (so that pointer arithmetic is - // never pointing beyond the allocation, which would violate Go rules). - n := int(r.maximumKeyLength) + 1 - if cap(i.keyIter.buf) < n { - ptr := mallocgc(uintptr(n), nil, false) - i.keyIter.buf = unsafe.Slice((*byte)(ptr), n)[:0] - } else { - i.keyIter.buf = i.keyIter.buf[:0] - } + + i.noTransforms = i.transforms.NoTransforms() + + // The worst case is when the largest key in the block has no suffix. + maxKeyLength := len(i.transforms.SyntheticPrefix) + int(r.maximumKeyLength) + len(i.transforms.SyntheticSuffix) + i.keyIter.Init(maxKeyLength, i.transforms.SyntheticPrefix) return i.keySeeker.Init(r) } // IsLowerBound implements the block.DataBlockIterator interface. func (i *DataBlockIter) IsLowerBound(k []byte) bool { + if i.transforms.SyntheticPrefix.IsSet() { + var keyPrefix []byte + keyPrefix, k = splitKey(k, len(i.transforms.SyntheticPrefix)) + if cmp := bytes.Compare(keyPrefix, i.transforms.SyntheticPrefix); cmp != 0 { + return cmp < 0 + } + } + // If we are hiding obsolete points, it is possible that all points < k are + // hidden. // Note: we ignore HideObsoletePoints, but false negatives are allowed. - return i.keySeeker.IsLowerBound(k) + return i.keySeeker.IsLowerBound(k, i.transforms.SyntheticSuffix) +} + +func splitKey(k []byte, at int) (before, after []byte) { + if len(k) <= at { + return k, nil + } + return k[:at], k[at:] +} + +// seekGEInternal is a wrapper around keySeeker.SeekGE which takes into account +// the synthetic prefix and suffix. +func (i *DataBlockIter) seekGEInternal(key []byte, boundRow int, searchDir int8) (row int) { + if i.transforms.SyntheticPrefix.IsSet() { + var keyPrefix []byte + keyPrefix, key = splitKey(key, len(i.transforms.SyntheticPrefix)) + if cmp := bytes.Compare(keyPrefix, i.transforms.SyntheticPrefix); cmp != 0 { + if cmp < 0 { + return 0 + } + return i.maxRow + 1 + } + } + if i.transforms.SyntheticSuffix.IsSet() { + n := i.split(key) + row, eq := i.keySeeker.SeekGE(key[:n], boundRow, searchDir) + if eq && i.cmp(key[n:], i.transforms.SyntheticSuffix) > 0 { + row = i.r.prefixChanged.SeekSetBitGE(row + 1) + } + return row + } + row, _ = i.keySeeker.SeekGE(key, boundRow, searchDir) + return row } // SeekGE implements the base.InternalIterator interface. @@ -911,7 +1005,12 @@ func (i *DataBlockIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.Interna if flags.TrySeekUsingNext() { searchDir = +1 } - i.row, _ = i.keySeeker.SeekGE(key, i.row, searchDir) + if i.noTransforms { + // Fast path. + i.row, _ = i.keySeeker.SeekGE(key, i.row, searchDir) + return i.decodeRow() + } + i.row = i.seekGEInternal(key, i.row, searchDir) if i.transforms.HideObsoletePoints { i.nextObsoletePoint = i.r.isObsolete.SeekSetBitGE(i.row) if i.atObsoletePointForward() { @@ -943,8 +1042,7 @@ func (i *DataBlockIter) SeekLT(key []byte, _ base.SeekLTFlags) *base.InternalKV if i.r == nil { return nil } - geRow, _ := i.keySeeker.SeekGE(key, i.row, 0 /* searchDir */) - i.row = geRow - 1 + i.row = i.seekGEInternal(key, i.row, 0 /* searchDir */) - 1 if i.transforms.HideObsoletePoints { i.nextObsoletePoint = i.r.isObsolete.SeekSetBitGE(max(i.row, 0)) if i.atObsoletePointBackward() { @@ -1004,19 +1102,29 @@ func (i *DataBlockIter) Next() *base.InternalKV { return nil } i.row++ - if i.transforms.HideObsoletePoints && i.atObsoletePointForward() { - i.skipObsoletePointsForward() - if i.row > i.maxRow { - return nil + // Inline decodeKey(), adding obsolete points logic. + if i.noTransforms { + // Fast path. + i.kv.K = base.InternalKey{ + UserKey: i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row), + Trailer: base.InternalKeyTrailer(i.r.trailers.At(i.row)), + } + } else { + if i.transforms.HideObsoletePoints && i.atObsoletePointForward() { + i.skipObsoletePointsForward() + if i.row > i.maxRow { + return nil + } + } + if suffix := i.transforms.SyntheticSuffix; suffix.IsSet() { + i.kv.K.UserKey = i.keySeeker.MaterializeUserKeyWithSyntheticSuffix(&i.keyIter, suffix, i.kvRow, i.row) + } else { + i.kv.K.UserKey = i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row) + } + i.kv.K.Trailer = base.InternalKeyTrailer(i.r.trailers.At(i.row)) + if n := i.transforms.SyntheticSeqNum; n != 0 { + i.kv.K.SetSeqNum(base.SeqNum(n)) } - } - // Inline decodeKey(). - i.kv.K = base.InternalKey{ - UserKey: i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row), - Trailer: base.InternalKeyTrailer(i.r.trailers.At(i.row)), - } - if n := i.transforms.SyntheticSeqNum; n != 0 { - i.kv.K.SetSeqNum(base.SeqNum(n)) } // Inline i.r.values.At(row). v := i.r.values.slice(i.r.values.offsets.At2(i.row)) @@ -1167,12 +1275,22 @@ func (i *DataBlockIter) decodeRow() *base.InternalKV { return &i.kv default: // Inline decodeKey(). - i.kv.K = base.InternalKey{ - UserKey: i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row), - Trailer: base.InternalKeyTrailer(i.r.trailers.At(i.row)), - } - if n := i.transforms.SyntheticSeqNum; n != 0 { - i.kv.K.SetSeqNum(base.SeqNum(n)) + if i.noTransforms { + // Fast path. + i.kv.K = base.InternalKey{ + UserKey: i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row), + Trailer: base.InternalKeyTrailer(i.r.trailers.At(i.row)), + } + } else { + if suffix := i.transforms.SyntheticSuffix; suffix.IsSet() { + i.kv.K.UserKey = i.keySeeker.MaterializeUserKeyWithSyntheticSuffix(&i.keyIter, suffix, i.kvRow, i.row) + } else { + i.kv.K.UserKey = i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row) + } + i.kv.K.Trailer = base.InternalKeyTrailer(i.r.trailers.At(i.row)) + if n := i.transforms.SyntheticSeqNum; n != 0 { + i.kv.K.SetSeqNum(base.SeqNum(n)) + } } // Inline i.r.values.At(row). startOffset := i.r.values.offsets.At(i.row) @@ -1191,12 +1309,22 @@ func (i *DataBlockIter) decodeRow() *base.InternalKV { // This function does not inline, so we copy its code verbatim. For any updates // to this code, all code preceded by "Inline decodeKey" must be updated. func (i *DataBlockIter) decodeKey() { - i.kv.K = base.InternalKey{ - UserKey: i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row), - Trailer: base.InternalKeyTrailer(i.r.trailers.At(i.row)), - } - if n := i.transforms.SyntheticSeqNum; n != 0 { - i.kv.K.SetSeqNum(base.SeqNum(n)) + if i.noTransforms { + // Fast path. + i.kv.K = base.InternalKey{ + UserKey: i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row), + Trailer: base.InternalKeyTrailer(i.r.trailers.At(i.row)), + } + } else { + if suffix := i.transforms.SyntheticSuffix; suffix.IsSet() { + i.kv.K.UserKey = i.keySeeker.MaterializeUserKeyWithSyntheticSuffix(&i.keyIter, suffix, i.kvRow, i.row) + } else { + i.kv.K.UserKey = i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row) + } + i.kv.K.Trailer = base.InternalKeyTrailer(i.r.trailers.At(i.row)) + if n := i.transforms.SyntheticSeqNum; n != 0 { + i.kv.K.SetSeqNum(base.SeqNum(n)) + } } } diff --git a/sstable/colblk/data_block_test.go b/sstable/colblk/data_block_test.go index a25a215a51..72f1a5b5e1 100644 --- a/sstable/colblk/data_block_test.go +++ b/sstable/colblk/data_block_test.go @@ -28,8 +28,7 @@ func TestDataBlock(t *testing.T) { var w DataBlockWriter var r DataBlockReader var it DataBlockIter - var rw DataBlockRewriter - rw.KeySchema = testKeysSchema + rw := NewDataBlockRewriter(testKeysSchema, testkeys.Comparer.Compare, testkeys.Comparer.Split) var sizes []int datadriven.Walk(t, "testdata/data_block", func(t *testing.T, path string) { datadriven.RunTest(t, path, func(t *testing.T, td *datadriven.TestData) string { @@ -111,9 +110,13 @@ func TestDataBlock(t *testing.T) { SyntheticPrefix: []byte(syntheticPrefix), SyntheticSuffix: []byte(syntheticSuffix), } - it.Init(&r, testKeysSchema.NewKeySeeker(), getLazyValuer(func([]byte) base.LazyValue { - return base.LazyValue{ValueOrHandle: []byte("mock external value")} - }), transforms) + it.Init( + &r, testKeysSchema.NewKeySeeker(), testkeys.Comparer.Compare, testkeys.Comparer.Split, + getLazyValuer(func([]byte) base.LazyValue { + return base.LazyValue{ValueOrHandle: []byte("mock external value")} + }), + transforms, + ) o := []itertest.IterOpt{itertest.ShowCommands} if td.HasArg("verbose") { o = append(o, itertest.Verbose) diff --git a/sstable/colblk/prefix_bytes.go b/sstable/colblk/prefix_bytes.go index 66c41a243b..71f17f0b64 100644 --- a/sstable/colblk/prefix_bytes.go +++ b/sstable/colblk/prefix_bytes.go @@ -17,6 +17,7 @@ import ( "github.com/cockroachdb/errors" "github.com/cockroachdb/pebble/internal/binfmt" "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/sstable/block" ) // PrefixBytes holds an array of lexicographically ordered byte slices. It @@ -248,11 +249,31 @@ type PrefixBytesIter struct { // buf is used for materializing a user key. It is preallocated to the maximum // key length in the data block. buf []byte + syntheticPrefixLen uint32 sharedAndBundlePrefixLen uint32 offsetIndex int nextBundleOffsetIndex int } +// Init initializes the prefix bytes iterator; maxKeyLength must be +// large enough to fit any key in the block after applying any synthetic prefix +// and/or suffix. +func (i *PrefixBytesIter) Init(maxKeyLength int, syntheticPrefix block.SyntheticPrefix) { + // Allocate a buffer that's large enough to hold the largest user key in the + // block with 1 byte to spare (so that pointer arithmetic is never pointing + // beyond the allocation, which would violate Go rules). + n := maxKeyLength + 1 + if cap(i.buf) < n { + ptr := mallocgc(uintptr(n), nil, false) + i.buf = unsafe.Slice((*byte)(ptr), n) + } + i.buf = i.buf[:0] + i.syntheticPrefixLen = uint32(len(syntheticPrefix)) + if syntheticPrefix.IsSet() { + i.buf = append(i.buf, syntheticPrefix...) + } +} + // SetAt updates the provided PrefixBytesIter to hold the i'th []byte slice in // the PrefixBytes. The PrefixBytesIter's buffer must be sufficiently large to // hold the i'th []byte slice, and the caller is required to statically ensure @@ -272,21 +293,24 @@ func (b *PrefixBytes) SetAt(it *PrefixBytesIter, i int) { rowSuffixStart, rowSuffixEnd := b.rowSuffixOffsets(i, it.offsetIndex) rowSuffixLen := rowSuffixEnd - rowSuffixStart - it.sharedAndBundlePrefixLen = uint32(b.sharedPrefixLen) + bundlePrefixLen + it.sharedAndBundlePrefixLen = it.syntheticPrefixLen + uint32(b.sharedPrefixLen) + bundlePrefixLen it.buf = it.buf[:it.sharedAndBundlePrefixLen+rowSuffixLen] ptr := unsafe.Pointer(unsafe.SliceData(it.buf)) + ptr = unsafe.Pointer(uintptr(ptr) + uintptr(it.syntheticPrefixLen)) // Copy the shared key prefix. memmove(ptr, b.rawBytes.data, uintptr(b.sharedPrefixLen)) // Copy the bundle prefix. + ptr = unsafe.Pointer(uintptr(ptr) + uintptr(b.sharedPrefixLen)) memmove( - unsafe.Pointer(uintptr(ptr)+uintptr(b.sharedPrefixLen)), + ptr, unsafe.Pointer(uintptr(b.rawBytes.data)+uintptr(bundleOffsetStart)), uintptr(bundlePrefixLen)) // Copy the per-row suffix. + ptr = unsafe.Pointer(uintptr(ptr) + uintptr(bundlePrefixLen)) memmove( - unsafe.Pointer(uintptr(ptr)+uintptr(it.sharedAndBundlePrefixLen)), + ptr, unsafe.Pointer(uintptr(b.rawBytes.data)+uintptr(rowSuffixStart)), uintptr(rowSuffixLen)) // Set nextBundleOffsetIndex so that a call to SetNext can cheaply determine @@ -335,17 +359,19 @@ func (b *PrefixBytes) SetNext(it *PrefixBytesIter) { bundlePrefixLen := rowSuffixStart - bundlePrefixStart it.nextBundleOffsetIndex = it.offsetIndex + (1 << b.bundleShift) - it.sharedAndBundlePrefixLen = uint32(b.sharedPrefixLen) + bundlePrefixLen + it.sharedAndBundlePrefixLen = it.syntheticPrefixLen + uint32(b.sharedPrefixLen) + bundlePrefixLen it.buf = it.buf[:it.sharedAndBundlePrefixLen+rowSuffixLen] // Copy in the new bundle suffix. ptr := unsafe.Pointer(unsafe.SliceData(it.buf)) + ptr = unsafe.Pointer(uintptr(ptr) + uintptr(it.syntheticPrefixLen) + uintptr(b.sharedPrefixLen)) memmove( - unsafe.Pointer(uintptr(ptr)+uintptr(b.sharedPrefixLen)), + ptr, unsafe.Pointer(uintptr(b.rawBytes.data)+uintptr(bundlePrefixStart)), uintptr(bundlePrefixLen)) // Copy in the per-row suffix. + ptr = unsafe.Pointer(uintptr(ptr) + uintptr(bundlePrefixLen)) memmove( - unsafe.Pointer(uintptr(ptr)+uintptr(it.sharedAndBundlePrefixLen)), + ptr, unsafe.Pointer(uintptr(b.rawBytes.data)+uintptr(rowSuffixStart)), uintptr(rowSuffixLen)) } diff --git a/sstable/colblk/testdata/data_block/transforms b/sstable/colblk/testdata/data_block/transforms index b8b7a47a2c..aa01df4d27 100644 --- a/sstable/colblk/testdata/data_block/transforms +++ b/sstable/colblk/testdata/data_block/transforms @@ -1,3 +1,4 @@ +# Test synthetic sequence numbers. write-block a@10#1,SET:apple b@5#2,SET:banana @@ -207,3 +208,194 @@ seek-lt z: . seek-lt b: . next: . prev: . + +# Test synthetic prefix. +write-block +blockprefix_a@10#1,SET:apple +blockprefix_b@5#3,SET:banana +blockprefix_blueberry#3,SET:blueberry +blockprefix_c@9#4,SET:coconut +blockprefix_c@6#5,SET:cantaloupe +blockprefix_c@1#7,SET:clementine +---- + +iter synthetic-prefix=foo_ +first +next +next +next +next +next +prev +---- +first: foo_blockprefix_a@10:apple + next: foo_blockprefix_b@5:banana + next: foo_blockprefix_blueberry:blueberry + next: foo_blockprefix_c@9:coconut + next: foo_blockprefix_c@6:cantaloupe + next: foo_blockprefix_c@1:clementine + prev: foo_blockprefix_c@6:cantaloupe + +iter synthetic-prefix=foo_ +last +prev +next +prev +prev +prev +prev +prev +prev +next +---- +last: foo_blockprefix_c@1:clementine +prev: foo_blockprefix_c@6:cantaloupe +next: foo_blockprefix_c@1:clementine +prev: foo_blockprefix_c@6:cantaloupe +prev: foo_blockprefix_c@9:coconut +prev: foo_blockprefix_blueberry:blueberry +prev: foo_blockprefix_b@5:banana +prev: foo_blockprefix_a@10:apple +prev: . +next: foo_blockprefix_a@10:apple + +iter synthetic-prefix=foo_ +seek-ge foo_blockprefix +seek-ge foo_blockprefix_b +seek-ge foo_blockprefix_bb +seek-ge foo_blockprefix_d +prev +seek-ge fo +seek-ge foa_a +seek-ge foz_a +prev +---- + seek-ge foo_blockprefix: foo_blockprefix_a@10:apple + seek-ge foo_blockprefix_b: foo_blockprefix_b@5:banana +seek-ge foo_blockprefix_bb: foo_blockprefix_blueberry:blueberry + seek-ge foo_blockprefix_d: . + prev: foo_blockprefix_c@1:clementine + seek-ge fo: foo_blockprefix_a@10:apple + seek-ge foa_a: foo_blockprefix_a@10:apple + seek-ge foz_a: . + prev: foo_blockprefix_c@1:clementine + +iter synthetic-prefix=foo_ +seek-lt foo_blockprefix_d +seek-lt foo_blockprefix_bb +seek-lt foo_blockprefix_b +seek-lt foo_blockprefix +next +seek-lt foz_a +seek-lt foa_a +next +seek-lt fo +next +---- + seek-lt foo_blockprefix_d: foo_blockprefix_c@1:clementine +seek-lt foo_blockprefix_bb: foo_blockprefix_b@5:banana + seek-lt foo_blockprefix_b: foo_blockprefix_a@10:apple + seek-lt foo_blockprefix: . + next: foo_blockprefix_a@10:apple + seek-lt foz_a: foo_blockprefix_c@1:clementine + seek-lt foa_a: . + next: foo_blockprefix_a@10:apple + seek-lt fo: . + next: foo_blockprefix_a@10:apple + +iter synthetic-prefix=foo_ +is-lower-bound fo +is-lower-bound foo_ +is-lower-bound foo_blockprefix_a@10 +is-lower-bound foo_blockprefix_a@9 +is-lower-bound fop +---- + is-lower-bound fo: true + is-lower-bound foo_: true +is-lower-bound foo_blockprefix_a@10: true + is-lower-bound foo_blockprefix_a@9: false + is-lower-bound fop: false + +# Test synthetic suffix. +write-block +a@1#1,SET:a +b@2#1,SET:b +c@3#1,SET:c +d@1#1,SET:d +---- + +iter synthetic-suffix=@10 +first +next +next +next +next +prev +---- +first: a@10:a + next: b@10:b + next: c@10:c + next: d@10:d + next: . + prev: d@10:d + +iter synthetic-suffix=@10 +last +prev +prev +prev +prev +next +---- +last: d@10:d +prev: c@10:c +prev: b@10:b +prev: a@10:a +prev: . +next: a@10:a + +iter synthetic-suffix=@10 +seek-ge b +seek-ge b@11 +seek-ge b@10 +seek-ge b@9 +seek-ge d +seek-ge d@11 +seek-ge d@10 +seek-ge d@9 +---- + seek-ge b: b@10:b +seek-ge b@11: b@10:b +seek-ge b@10: b@10:b + seek-ge b@9: c@10:c + seek-ge d: d@10:d +seek-ge d@11: d@10:d +seek-ge d@10: d@10:d + seek-ge d@9: . + +iter synthetic-suffix=@10 +seek-lt e +seek-lt d@9 +seek-lt d@10 +seek-lt d@11 +seek-lt d +seek-lt c@9 +seek-lt c@10 +seek-lt c@11 +seek-lt c +seek-lt a@9 +seek-lt a@10 +seek-lt a +---- + seek-lt e: d@10:d + seek-lt d@9: d@10:d +seek-lt d@10: c@10:c +seek-lt d@11: c@10:c + seek-lt d: c@10:c + seek-lt c@9: c@10:c +seek-lt c@10: b@10:b +seek-lt c@11: b@10:b + seek-lt c: b@10:b + seek-lt a@9: a@10:a +seek-lt a@10: . + seek-lt a: . diff --git a/sstable/colblk_writer.go b/sstable/colblk_writer.go index 85de5620d8..997abd70f3 100644 --- a/sstable/colblk_writer.go +++ b/sstable/colblk_writer.go @@ -959,7 +959,7 @@ func (w *RawColumnWriter) rewriteSuffixes( } // Copy data blocks in parallel, rewriting suffixes as we go. blocks, err := rewriteDataBlocksInParallel(r, wo, l.Data, from, to, concurrency, func() blockRewriter { - return &colblk.DataBlockRewriter{KeySchema: wo.KeySchema} + return colblk.NewDataBlockRewriter(wo.KeySchema, w.comparer.Compare, w.comparer.Split) }) if err != nil { return errors.Wrap(err, "rewriting data blocks") diff --git a/sstable/layout.go b/sstable/layout.go index fc6a5ebac1..7fa5a40254 100644 --- a/sstable/layout.go +++ b/sstable/layout.go @@ -332,7 +332,7 @@ func formatColblkDataBlock( if fmtRecord != nil { var iter colblk.DataBlockIter if err := iter.Init( - &reader, r.keySchema.NewKeySeeker(), describingLazyValueHandler{}, block.IterTransforms{}, + &reader, r.keySchema.NewKeySeeker(), r.Compare, r.Split, describingLazyValueHandler{}, block.IterTransforms{}, ); err != nil { return err }