From 59f79a528f1cc6d8f59547d5de159077a5ae6195 Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Thu, 1 Jul 2021 20:43:59 -0400 Subject: [PATCH] compaction: remove nonzero-seqnum splitting logic Remove the `nonZeroSeqNumSplitter`. The logic preventing splitting at nonzero-seqnum keys was responsible for the creation of arbitrarily large files (#1181). The condition that the `nonZeroSeqNumSplitter` guards against is no longer possible. Fix #1181. --- compaction.go | 236 ++++++++++----------------- compaction_test.go | 27 +-- testdata/compaction_output_splitters | 120 -------------- testdata/manual_compaction | 57 ++++++- 4 files changed, 143 insertions(+), 297 deletions(-) diff --git a/compaction.go b/compaction.go index d20603151b..c86b8b0fdd 100644 --- a/compaction.go +++ b/compaction.go @@ -315,7 +315,7 @@ func (a *splitterGroup) onNewOutput(key *InternalKey) []byte { // the compaction output is at the boundary between two user keys (also // the boundary between atomic compaction units). Use this splitter to wrap // any splitters that don't guarantee user key splits (i.e. splitters that make -// their determinatino in ways other than comparing the current key against a +// their determination in ways other than comparing the current key against a // limit key. type userKeyChangeSplitter struct { cmp Compare @@ -344,49 +344,6 @@ func (u *userKeyChangeSplitter) onNewOutput(key *InternalKey) []byte { return u.splitter.onNewOutput(key) } -// nonZeroSeqNumSplitter is a compactionOutputSplitter that takes in a child -// splitter, and advises a split when 1) that child splitter advises a split, -// and 2) the compaction output is at a point where the previous point sequence -// number is nonzero. -type nonZeroSeqNumSplitter struct { - c *compaction - splitter compactionOutputSplitter - prevPointSeqNum uint64 - splitOnNonZeroSeqNum bool -} - -func (n *nonZeroSeqNumSplitter) shouldSplitBefore( - key *InternalKey, tw *sstable.Writer, -) compactionSplitSuggestion { - curSeqNum := key.SeqNum() - keyKind := key.Kind() - prevPointSeqNum := n.prevPointSeqNum - if keyKind != InternalKeyKindRangeDelete { - n.prevPointSeqNum = curSeqNum - } - - if n.splitOnNonZeroSeqNum { - if prevPointSeqNum > 0 || n.c.rangeDelFrag.Empty() { - n.splitOnNonZeroSeqNum = false - return splitNow - } - } else if split := n.splitter.shouldSplitBefore(key, tw); split == splitNow { - userKeyChange := curSeqNum > prevPointSeqNum - if prevPointSeqNum > 0 || n.c.rangeDelFrag.Empty() || userKeyChange { - return splitNow - } - n.splitOnNonZeroSeqNum = true - return splitSoon - } - return noSplit -} - -func (n *nonZeroSeqNumSplitter) onNewOutput(key *InternalKey) []byte { - n.prevPointSeqNum = InternalKeySeqNumMax - n.splitOnNonZeroSeqNum = false - return n.splitter.onNewOutput(key) -} - // compactionFile is a vfs.File wrapper that, on every write, updates a metric // in `versions` on bytes written by in-progress compactions so far. It also // increments a per-compaction `written` int. @@ -951,7 +908,7 @@ func (c *compaction) errorOnUserKeyOverlap(ve *versionEdit) error { // snapshots requiring them to be kept. It performs this determination by // looking for an sstable which overlaps the bounds of the compaction at a // lower level in the LSM. -func (c *compaction) allowZeroSeqNum(iter internalIterator) bool { +func (c *compaction) allowZeroSeqNum() bool { return c.elideRangeTombstone(c.smallest.UserKey, c.largest.UserKey) } @@ -2060,7 +2017,7 @@ func (d *DB) runCompaction( if err != nil { return nil, pendingOutputs, err } - c.allowedZeroSeqNum = c.allowZeroSeqNum(iiter) + c.allowedZeroSeqNum = c.allowZeroSeqNum() iter := newCompactionIter(c.cmp, c.formatKey, d.merge, iiter, snapshots, &c.rangeDelFrag, c.allowedZeroSeqNum, c.elideTombstone, c.elideRangeTombstone) @@ -2149,14 +2106,15 @@ func (d *DB) runCompaction( splitL0Outputs := c.outputLevel.level == 0 && d.opts.FlushSplitBytes > 0 - // finishOutput is called for an sstable with the first key of the next sstable, and for the - // last sstable with an empty key. - finishOutput := func(key []byte) error { + // finishOutput is called with the a user key up to which all tombstones + // should be flushed. Typically, this is the first key of the next + // sstable or an empty key if this output is the final sstable. + finishOutput := func(splitKey []byte) error { // If we haven't output any point records to the sstable (tw == nil) // then the sstable will only contain range tombstones. The smallest // key in the sstable will be the start key of the first range // tombstone added. We need to ensure that this start key is distinct - // from the limit (key) passed to finishOutput (if set), otherwise we + // from the splitKey passed to finishOutput (if set), otherwise we // would generate an sstable where the largest key is smaller than the // smallest key due to how the largest key boundary is set below. // NB: It is permissible for the range tombstone start key to be the @@ -2169,15 +2127,15 @@ func (d *DB) runCompaction( if len(iter.tombstones) > 0 { startKey = iter.tombstones[0].Start.UserKey } - if key != nil && d.cmp(startKey, key) == 0 { + if splitKey != nil && d.cmp(startKey, splitKey) == 0 { return nil } } // NB: clone the key because the data can be held on to by the call to // compactionIter.Tombstones via rangedel.Fragmenter.FlushTo. - key = append([]byte(nil), key...) - for _, v := range iter.Tombstones(key, splitL0Outputs) { + splitKey = append([]byte(nil), splitKey...) + for _, v := range iter.Tombstones(splitKey, splitL0Outputs) { if tw == nil { if err := newOutput(); err != nil { return err @@ -2294,7 +2252,7 @@ func (d *DB) runCompaction( } } - if key != nil && writerMeta.LargestRange.UserKey != nil { + if splitKey != nil && writerMeta.LargestRange.UserKey != nil { // The current file is not the last output file and there is a range tombstone in it. // If the tombstone extends into the next file, then truncate it for the purposes of // computing meta.Largest. For example, say the next file's first key is c#7,1 and the @@ -2303,8 +2261,8 @@ func (d *DB) runCompaction( // c#inf where inf is the InternalKeyRangeDeleteSentinel. Note that this is just for // purposes of bounds computation -- the current sstable will end up with a Largest key // of c#7,1 so the range tombstone in the current file will be able to delete c#7. - if d.cmp(writerMeta.LargestRange.UserKey, key) >= 0 { - writerMeta.LargestRange.UserKey = key + if d.cmp(writerMeta.LargestRange.UserKey, splitKey) >= 0 { + writerMeta.LargestRange.UserKey = splitKey writerMeta.LargestRange.Trailer = InternalKeyRangeDeleteSentinel } } @@ -2365,27 +2323,11 @@ func (d *DB) runCompaction( // we start off with splitters for file sizes, grandparent limits, and (for // L0 splits) L0 limits, before wrapping them in an splitterGroup. // - // There is a complication here: we can't split outputs where the largest - // key on the left side has a seqnum of zero. This limitation - // exists because a range tombstone which extends into the next sstable - // will cause the smallest key for the next sstable to have the same user - // key, but we need the two tables to be disjoint in key space. Consider - // the scenario: - // - // a#RANGEDEL-c,3 b#SET,0 - // - // If b#SET,0 is the last key added to an sstable, the range tombstone - // [b-c)#3 will extend into the next sstable. The boundary generation - // code in finishOutput() will compute the smallest key for that sstable - // as b#RANGEDEL,3 which sorts before b#SET,0. Normally we just adjust - // the seqnum of this key, but that isn't possible for seqnum 0. To ensure - // we only split where the previous point key has a zero seqnum, we wrap - // our splitters with a nonZeroSeqNumSplitter. - // - // Another case where we may not be able to switch SSTables right away is - // when we are splitting an L0 output. We do not split the same user key - // across different sstables within one flush, so the userKeyChangeSplitter - // ensures we are at a user key change boundary when doing a split. + // There is a complication here: We may not be able to switch SSTables + // right away is when we are splitting an L0 output. We do not split the + // same user key across different sstables within one flush, so the + // userKeyChangeSplitter ensures we are at a user key change boundary when + // doing a split. outputSplitters := []compactionOutputSplitter{ &fileSizeSplitter{maxFileSize: c.maxOutputFileSize}, &grandparentLimitSplitter{c: c, ve: ve}, @@ -2405,18 +2347,6 @@ func (d *DB) runCompaction( cmp: c.cmp, splitters: outputSplitters, } - // Compactions to L0 don't need nonzero last-point-key seqnums at split - // boundaries because when writing to L0, we are able to guarantee that - // the end key of tombstones will also be truncated (through the - // TruncateAndFlushTo call), and no user keys will - // be split between sstables. So a nonZeroSeqNumSplitter is unnecessary - // in that case. - if !splitL0Outputs { - splitter = &nonZeroSeqNumSplitter{ - c: c, - splitter: splitter, - } - } // NB: we avoid calling maybeThrottle on a nilPacer because the cost of // dynamic dispatch in the hot loop below is pronounced in CPU profiles (see @@ -2435,36 +2365,22 @@ func (d *DB) runCompaction( // progress guarantees ensure that eventually the input iterator will be // exhausted and the range tombstone fragments will all be flushed. for key, val := iter.First(); key != nil || !c.rangeDelFrag.Empty(); { - limit := splitter.onNewOutput(key) + splitterSuggestion := splitter.onNewOutput(key) // Each inner loop iteration processes one key from the input iterator. prevPointSeqNum := InternalKeySeqNumMax for ; key != nil; key, val = iter.Next() { - if split := splitter.shouldSplitBefore(key, tw); split != noSplit { - if split == splitNow { - limit = key.UserKey - if splitL0Outputs { - // Flush all tombstones up until key.UserKey, and - // truncate them at that key. - // - // The fragmenter could save the passed-in key. As this - // key could live beyond the write into the current - // sstable output file, make a copy. - c.rangeDelFrag.TruncateAndFlushTo(key.Clone().UserKey) - } - break + if split := splitter.shouldSplitBefore(key, tw); split == splitNow { + if splitL0Outputs { + // Flush all tombstones up until key.UserKey, and + // truncate them at that key. + // + // The fragmenter could save the passed-in key. As this + // key could live beyond the write into the current + // sstable output file, make a copy. + c.rangeDelFrag.TruncateAndFlushTo(key.Clone().UserKey) } - // split == splitSoon - // - // Invalidate the limit here. It has probably been exceeded - // by the current key, but we can't split just yet, such as to - // maintain the nonzero sequence number invariant mentioned - // above. Setting limit to nil is okay as it's just a transient - // setting, as when split eventually equals splitNow, we will - // set the limit to the key after that. If the compaction were - // to run out of keys before we get to that point, limit would - // be nil as it should be for all end-of-compaction cases. - limit = nil + break } atomic.StoreUint64(c.atomicBytesIterated, c.bytesIterated) @@ -2492,49 +2408,77 @@ func (d *DB) runCompaction( prevPointSeqNum = key.SeqNum() } + // A splitter requested a split, and we're ready to finish the output. + // We need to choose the key at which to split any pending range + // tombstones. + // + // There's a complication here. We need to ensure that for a user key + // k we never end up with one output's largest key as k#0 and the + // next output's smallest key as k#RANGEDEL,#x where x > 0. This is a + // problem because k#RANGEDEL,#x sorts before k#0. Normally, we just + // adjust the seqnum of the next output's smallest boundary to be + // less, but that's not possible with the zero seqnum. We can avoid + // this case with careful picking of where to split pending range + // tombstones. + var splitKey []byte switch { - case key == nil && prevPointSeqNum == 0 && !c.rangeDelFrag.Empty(): - // We ran out of keys and the last key added to the sstable has a zero - // seqnum and there are buffered range tombstones, so we're unable to use - // the grandparent/flush limit for the sstable boundary. See the example in the - // in the loop above with range tombstones straddling sstables. Setting - // limit to nil ensures that we flush the entirety of the rangedel - // fragmenter when writing the last output. - limit = nil - case key == nil && splitL0Outputs && !c.rangeDelFrag.Empty(): - // We ran out of keys with flush splits enabled, and have remaining - // buffered range tombstones. Set limit to nil so all range - // tombstones get flushed in the current sstable. Consider this - // example: + case key != nil: + // We hit the size, grandparent, or L0 limit for the sstable. + // The next key either has a greater user key than the previous + // key, or if not, the previous key must not have had a zero + // sequence number. + + // TODO(jackson): If we hit the grandparent limit, the next + // grandparent's smallest key may be less than the current key. + // Splitting at the current key will cause this output to overlap + // a potentially unbounded number of grandparents. + splitKey = key.UserKey + case key == nil && splitL0Outputs: + // We ran out of keys with flush splits enabled. Set splitKey to + // nil so all range tombstones get flushed in the current sstable. + // Consider this example: // // a.SET.4 // d.MERGE.5 // d.RANGEDEL.3:f // (no more keys remaining) // - // Where d is a flush split key (i.e. limit = 'd'). Since d.MERGE.5 - // has already been written to this output by this point (as it's - // <= limit), and flushes cannot have user keys split across - // multiple sstables, we have to set limit to a key greater than - // 'd' to ensure the range deletion also gets flushed. Setting - // the limit to nil is the simplest way to ensure that. - limit = nil - case key == nil /* && (prevPointSeqNum != 0 || c.rangeDelFrag.Empty()) */ : - // We ran out of keys. Because of the previous case, either rangeDelFrag - // is empty or the last record added to the sstable has a non-zero - // seqnum. If the rangeDelFragmenter is empty we have no concerns as - // there won't be another sstable generated by this compaction and the - // current limit is fine (it won't apply). Otherwise, if the last key - // added to the sstable had a non-zero seqnum we're also in the clear as - // we can decrement that seqnum to create a boundary key for the next - // sstable (if we end up generating a next sstable). - case key != nil: - // We either hit the size, grandparent, or L0 limit for the sstable. + // Where d is a flush split key (i.e. splitterSuggestion = 'd'). + // Since d.MERGE.5 has already been written to this output by this + // point (as it's <= splitterSuggestion), and flushes cannot have + // user keys split across multiple sstables, we have to set + // splitKey to a key greater than 'd' to ensure the range deletion + // also gets flushed. Setting the splitKey to nil is the simplest + // way to ensure that. + // + // TODO(jackson): This case is only problematic if splitKey equals + // the user key of the last point key added. We don't need to + // flush *all* sstables to the current sstable. We could flush up + // to the next grandparent limit greater than `splitterSuggestion` + // instead. + splitKey = nil + case key == nil && prevPointSeqNum != 0: + // The last key added did not have a zero sequence number, so + // we'll always be able to adjust the next table's smallest key. + splitKey = splitterSuggestion + case key == nil && prevPointSeqNum == 0: + // The last key added DID have a zero sequence number. The + // splitters' suggested split point might have the same user key, + // which would cause the next output to have an unadjustable + // smallest key. To prevent that, we ignore the splitter's + // suggestion, leaving splitKey nil to flush all pending range + // tombstones. + // TODO(jackson): This case is only problematic if splitKey equals + // the user key of the last point key added. We don't need to + // flush *all* sstables to the current sstable. We could flush up + // to the next grandparent limit greater than `splitterSuggestion` + // instead. + splitKey = nil default: return nil, nil, errors.New("pebble: not reached") } - if err := finishOutput(limit); err != nil { + if err := finishOutput(splitKey); err != nil { return nil, pendingOutputs, err } } diff --git a/compaction_test.go b/compaction_test.go index 006ac764e5..bc66250ec3 100644 --- a/compaction_test.go +++ b/compaction_test.go @@ -24,7 +24,6 @@ import ( "github.com/cockroachdb/pebble/internal/datadriven" "github.com/cockroachdb/pebble/internal/errorfs" "github.com/cockroachdb/pebble/internal/manifest" - "github.com/cockroachdb/pebble/internal/rangedel" "github.com/cockroachdb/pebble/sstable" "github.com/cockroachdb/pebble/vfs" "github.com/stretchr/testify/require" @@ -2023,7 +2022,6 @@ func TestCompactionAllowZeroSeqNum(t *testing.T) { c.startLevel.level = -1 var startFiles, outputFiles []*fileMetadata - var iter internalIterator switch { case len(parts) == 1 && parts[0] == "flush": @@ -2032,10 +2030,6 @@ func TestCompactionAllowZeroSeqNum(t *testing.T) { c.flushing = d.mu.mem.queue d.mu.Unlock() - var err error - if iter, err = c.newInputIter(nil); err != nil { - return err.Error() - } default: for _, p := range parts { level, meta := parseMeta(p) @@ -2064,7 +2058,7 @@ func TestCompactionAllowZeroSeqNum(t *testing.T) { c.inuseKeyRanges = nil c.setupInuseKeyRanges() - fmt.Fprintf(&buf, "%t\n", c.allowZeroSeqNum(iter)) + fmt.Fprintf(&buf, "%t\n", c.allowZeroSeqNum()) } return buf.String() @@ -2334,25 +2328,6 @@ func TestCompactionOutputSplitters(t *testing.T) { cmp: base.DefaultComparer.Compare, splitter: child0, } - case "nonzeroseqnum": - c := &compaction{ - rangeDelFrag: rangedel.Fragmenter{ - Cmp: base.DefaultComparer.Compare, - Format: base.DefaultFormatter, - Emit: func(fragmented []rangedel.Tombstone) {}, - }, - } - frag := &c.rangeDelFrag - if len(d.CmdArgs) >= 3 { - if d.CmdArgs[2].Key == "tombstone" { - // Add a tombstone so Empty() returns false. - frag.Add(base.ParseInternalKey("foo.RANGEDEL.10"), []byte("pan")) - } - } - *splitterToInit = &nonZeroSeqNumSplitter{ - c: c, - splitter: child0, - } } (*splitterToInit).onNewOutput(nil) case "set-should-split": diff --git a/testdata/compaction_output_splitters b/testdata/compaction_output_splitters index 6361d84a2b..aaec60acae 100644 --- a/testdata/compaction_output_splitters +++ b/testdata/compaction_output_splitters @@ -114,123 +114,3 @@ ok should-split-before food2.SET.4 ---- split-now - -# nonZeroSeqNumSplitter tests - -reset ----- -ok - -init child0 mock ----- -ok - -init main nonzeroseqnum tombstone ----- -ok - -set-should-split child0 no-split ----- -ok - -should-split-before foo.SET.5 ----- -no-split - -should-split-before foo.RANGEDEL.0 ----- -no-split - -set-should-split child0 split-now ----- -ok - -# This should be split-now, as the last point key is foo.SET.5. - -should-split-before foo.SET.0 ----- -split-now - -set-should-split child0 no-split ----- -ok - -should-split-before food.SET.0 ----- -no-split - -set-should-split child0 split-now ----- -ok - -should-split-before food1.SET.0 ----- -split-soon - -# Even though we've set should-split-before to no-split for the child splitter, -# nonZeroSeqNumSplitter "remembers" it and splits at the next good split point. - -set-should-split child0 no-split ----- -ok - -should-split-before food2.SET.0 ----- -no-split - -should-split-before food3.SET.4 ----- -no-split - -# This one should be split-now, as the previous point seqnum is nonzero. - -should-split-before food4.SET.2 ----- -split-now - -should-split-before food4.SET.0 ----- -no-split - -set-should-split child0 split-now ----- -ok - -should-split-before food5.SET.3 ----- -split-now - -reset ----- -ok - -# nonZeroSeqNumSplitter tests, but with an empty rangedel fragmenter. In -# this case, it just passees through the child splitter. - -reset ----- -ok - -init child0 mock ----- -ok - -init main nonzeroseqnum ----- -ok - -set-should-split child0 no-split ----- -ok - -should-split-before food.SET.0 ----- -no-split - -set-should-split child0 split-now ----- -ok - -should-split-before food1.SET.0 ----- -split-now diff --git a/testdata/manual_compaction b/testdata/manual_compaction index 49e01aee48..594c742d1a 100644 --- a/testdata/manual_compaction +++ b/testdata/manual_compaction @@ -775,10 +775,6 @@ compact a-r L1 3: 000006:[q#6,SET-q#6,SET] - -# Test a case where the output ends one key after a grandparent limit is -# crossed, but we couldn't have split outputs at that key due to the point -# key before that having a zero seqnum. define target-file-sizes=(100, 100, 100) L1 a.RANGEDEL.10:b @@ -811,10 +807,61 @@ L4 compact a-r L1 ---- 2: - 000010:[a#10,RANGEDEL-j#10,SET] + 000010:[a#10,RANGEDEL-b#0,SET] + 000011:[d#0,RANGEDEL-j#10,SET] 3: 000006:[c#6,SET-c#6,SET] 000007:[c#5,SET-c#5,SET] 000008:[c#4,SET-c#4,SET] 4: 000009:[a#0,SET-f#0,SET] + +# Test a snapshot that separates a range deletion from all the data that it +# deletes. Ensure that we respect the target-file-size and split into multiple +# outputs. + +define target-file-sizes=(1, 1, 1) snapshots=(14) +L1 + a.RANGEDEL.15:z + b.SET.11:foo + c.SET.11:foo +L2 + c.SET.0:foo + d.SET.0:foo +---- +1: + 000004:[a-z] +2: + 000005:[c-d] + +compact a-z L1 +---- +2: + 000006:[a#15,RANGEDEL-c#72057594037927935,RANGEDEL] + 000007:[c#15,RANGEDEL-d#72057594037927935,RANGEDEL] + 000008:[d#15,RANGEDEL-z#72057594037927935,RANGEDEL] + +# Test an interaction between a range deletion that will be elided with +# output splitting. Ensure that the output is still split (previous versions +# of the code did not, because of intricacies around preventing a zero +# sequence number in an output's largest key). + +define target-file-sizes=(1, 1, 1) +L1 + a.RANGEDEL.10:z + b.SET.11:foo + c.SET.11:foo +L2 + c.SET.0:foo + d.SET.0:foo +---- +1: + 000004:[a-z] +2: + 000005:[c-d] + +compact a-z L1 +---- +2: + 000006:[b#0,SET-b#0,SET] + 000007:[c#0,SET-c#0,SET]