compaction: remove nonzero-seqnum splitting logic
Remove the `nonZeroSeqNumSplitter`. The logic preventing splitting at
zero-seqnum keys was responsible for the creation of arbitrarily
large files (cockroachdb#1181). The condition that the `nonZeroSeqNumSplitter`
guards against is no longer possible.

Fix cockroachdb#1181.
jbowens committed Jul 9, 2021
1 parent 4fcf409 commit 801de10
Showing 4 changed files with 100 additions and 230 deletions.
126 changes: 47 additions & 79 deletions compaction.go
@@ -315,7 +315,7 @@ func (a *splitterGroup) onNewOutput(key *InternalKey) []byte {
// the compaction output is at the boundary between two user keys (also
// the boundary between atomic compaction units). Use this splitter to wrap
// any splitters that don't guarantee user key splits (i.e. splitters that make
// their determinatino in ways other than comparing the current key against a
// their determination in ways other than comparing the current key against a
// limit key.)
type userKeyChangeSplitter struct {
cmp Compare
@@ -344,49 +344,6 @@ func (u *userKeyChangeSplitter) onNewOutput(key *InternalKey) []byte {
return u.splitter.onNewOutput(key)
}
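
To make the user-key-boundary requirement concrete, here is a minimal, runnable sketch. The `suggestion`, `outputSplitter`, and `userKeyBoundarySplitter` names are hypothetical simplifications of Pebble's `compactionSplitSuggestion` and `compactionOutputSplitter`; the real splitter consults the compaction's record of the previous user key and the sstable writer, so treat this as an illustration of the idea rather than the implementation.

```go
package main

import (
	"bytes"
	"fmt"
)

// suggestion is a stand-in for Pebble's compactionSplitSuggestion
// (hypothetical simplification).
type suggestion int

const (
	noSplit suggestion = iota
	splitNow
)

// outputSplitter is a pared-down, assumed version of
// compactionOutputSplitter: it advises whether to end the current
// output before the given user key.
type outputSplitter interface {
	shouldSplitBefore(userKey []byte) suggestion
}

// userKeyBoundarySplitter wraps a child splitter and suppresses its
// splitNow advice whenever the incoming key repeats the previous user
// key, so an output never ends between two entries sharing a user key.
type userKeyBoundarySplitter struct {
	child   outputSplitter
	prevKey []byte
}

func (s *userKeyBoundarySplitter) shouldSplitBefore(userKey []byte) suggestion {
	sameKey := s.prevKey != nil && bytes.Equal(s.prevKey, userKey)
	s.prevKey = append(s.prevKey[:0], userKey...)
	if !sameKey && s.child.shouldSplitBefore(userKey) == splitNow {
		return splitNow
	}
	return noSplit
}

// alwaysSplit is a trivial child splitter used for the demo.
type alwaysSplit struct{}

func (alwaysSplit) shouldSplitBefore([]byte) suggestion { return splitNow }

func main() {
	s := &userKeyBoundarySplitter{child: alwaysSplit{}}
	// The second "b" (think b#SET,1 arriving after b#SET,2) must not
	// split, even though the child always advises splitNow.
	for _, k := range []string{"a", "b", "b", "c"} {
		fmt.Printf("split before %q: %v\n", k, s.shouldSplitBefore([]byte(k)) == splitNow)
	}
}
```
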

// nonZeroSeqNumSplitter is a compactionOutputSplitter that takes in a child
// splitter, and advises a split when 1) that child splitter advises a split,
// and 2) the compaction output is at a point where the previous point sequence
// number is nonzero.
type nonZeroSeqNumSplitter struct {
c *compaction
splitter compactionOutputSplitter
prevPointSeqNum uint64
splitOnNonZeroSeqNum bool
}

func (n *nonZeroSeqNumSplitter) shouldSplitBefore(
key *InternalKey, tw *sstable.Writer,
) compactionSplitSuggestion {
curSeqNum := key.SeqNum()
keyKind := key.Kind()
prevPointSeqNum := n.prevPointSeqNum
if keyKind != InternalKeyKindRangeDelete {
n.prevPointSeqNum = curSeqNum
}

if n.splitOnNonZeroSeqNum {
if prevPointSeqNum > 0 || n.c.rangeDelFrag.Empty() {
n.splitOnNonZeroSeqNum = false
return splitNow
}
} else if split := n.splitter.shouldSplitBefore(key, tw); split == splitNow {
userKeyChange := curSeqNum > prevPointSeqNum
if prevPointSeqNum > 0 || n.c.rangeDelFrag.Empty() || userKeyChange {
return splitNow
}
n.splitOnNonZeroSeqNum = true
return splitSoon
}
return noSplit
}

func (n *nonZeroSeqNumSplitter) onNewOutput(key *InternalKey) []byte {
n.prevPointSeqNum = InternalKeySeqNumMax
n.splitOnNonZeroSeqNum = false
return n.splitter.onNewOutput(key)
}

// compactionFile is a vfs.File wrapper that, on every write, updates a metric
// in `versions` on bytes written by in-progress compactions so far. It also
// increments a per-compaction `written` int.
@@ -951,7 +908,7 @@ func (c *compaction) errorOnUserKeyOverlap(ve *versionEdit) error {
// snapshots requiring them to be kept. It performs this determination by
// looking for an sstable which overlaps the bounds of the compaction at a
// lower level in the LSM.
func (c *compaction) allowZeroSeqNum(iter internalIterator) bool {
func (c *compaction) allowZeroSeqNum() bool {
return c.elideRangeTombstone(c.smallest.UserKey, c.largest.UserKey)
}
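
The overlap check that `allowZeroSeqNum` now delegates to `elideRangeTombstone` can be sketched as follows. This is an illustrative simplification under assumed types (string bounds, a slice of levels); the real code works from the compaction's in-use key ranges and version metadata rather than raw file lists.

```go
package main

import "fmt"

// fileBounds is an assumed stand-in for sstable metadata: the smallest
// and largest user keys a file covers.
type fileBounds struct{ smallest, largest string }

// canZeroSeqNums sketches the decision: sequence numbers may be zeroed
// (and tombstones elided) only if no file at a lower LSM level overlaps
// the compaction's key range, since otherwise older versions of those
// keys are still live below. Not Pebble's implementation.
func canZeroSeqNums(smallest, largest string, lowerLevels [][]fileBounds) bool {
	for _, level := range lowerLevels {
		for _, f := range level {
			// Closed-interval overlap test on user keys.
			if f.largest >= smallest && f.smallest <= largest {
				return false
			}
		}
	}
	return true
}

func main() {
	lower := [][]fileBounds{{{smallest: "m", largest: "p"}}}
	fmt.Println(canZeroSeqNums("a", "f", lower)) // true: nothing below overlaps
	fmt.Println(canZeroSeqNums("a", "n", lower)) // false: overlaps [m, p]
}
```
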

@@ -2060,7 +2017,7 @@ func (d *DB) runCompaction(
if err != nil {
return nil, pendingOutputs, err
}
c.allowedZeroSeqNum = c.allowZeroSeqNum(iiter)
c.allowedZeroSeqNum = c.allowZeroSeqNum()
iter := newCompactionIter(c.cmp, c.formatKey, d.merge, iiter, snapshots,
&c.rangeDelFrag, c.allowedZeroSeqNum, c.elideTombstone, c.elideRangeTombstone)

@@ -2365,27 +2322,11 @@ func (d *DB) runCompaction(
// we start off with splitters for file sizes, grandparent limits, and (for
// L0 splits) L0 limits, before wrapping them in a splitterGroup.
//
// There is a complication here: we can't split outputs where the largest
// key on the left side has a seqnum of zero. This limitation
// exists because a range tombstone which extends into the next sstable
// will cause the smallest key for the next sstable to have the same user
// key, but we need the two tables to be disjoint in key space. Consider
// the scenario:
//
// a#RANGEDEL-c,3 b#SET,0
//
// If b#SET,0 is the last key added to an sstable, the range tombstone
// [b-c)#3 will extend into the next sstable. The boundary generation
// code in finishOutput() will compute the smallest key for that sstable
// as b#RANGEDEL,3 which sorts before b#SET,0. Normally we just adjust
// the seqnum of this key, but that isn't possible for seqnum 0. To ensure
// we only split where the previous point key has a zero seqnum, we wrap
// our splitters with a nonZeroSeqNumSplitter.
//
// Another case where we may not be able to switch SSTables right away is
// when we are splitting an L0 output. We do not split the same user key
// across different sstables within one flush, so the userKeyChangeSplitter
// ensures we are at a user key change boundary when doing a split.
// There is a complication here: we may not be able to switch SSTables
// right away when we are splitting an L0 output. We do not split the
// same user key across different sstables within one flush, so the
// userKeyChangeSplitter ensures we are at a user key change boundary
// when doing a split.
outputSplitters := []compactionOutputSplitter{
&fileSizeSplitter{maxFileSize: c.maxOutputFileSize},
&grandparentLimitSplitter{c: c, ve: ve},
@@ -2405,18 +2346,6 @@
cmp: c.cmp,
splitters: outputSplitters,
}
// Compactions to L0 don't need nonzero last-point-key seqnums at split
// boundaries because when writing to L0, we are able to guarantee that
// the end key of tombstones will also be truncated (through the
// TruncateAndFlushTo call), and no user keys will
// be split between sstables. So a nonZeroSeqNumSplitter is unnecessary
// in that case.
if !splitL0Outputs {
splitter = &nonZeroSeqNumSplitter{
c: c,
splitter: splitter,
}
}
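
With the `nonZeroSeqNumSplitter` wiring gone, the splitter stack reduces to the group built above, wrapped in the `userKeyChangeSplitter` for L0 flushes. As a hedged, self-contained sketch of the group's semantics (split as soon as any member advises it), using the same simplified interface assumed earlier:

```go
package main

import "fmt"

type suggestion int

const (
	noSplit suggestion = iota
	splitNow
)

type outputSplitter interface {
	shouldSplitBefore(userKey []byte) suggestion
}

// splitterGroup advises a split as soon as any member does, which is the
// role Pebble's splitterGroup plays for the file-size, grandparent, and
// L0 splitters. Names and signatures are simplified assumptions.
type splitterGroup struct{ members []outputSplitter }

func (g *splitterGroup) shouldSplitBefore(key []byte) suggestion {
	for _, m := range g.members {
		if m.shouldSplitBefore(key) == splitNow {
			return splitNow
		}
	}
	return noSplit
}

// thresholdSplitter splits before any key >= its threshold; it stands in
// for a real file-size or grandparent-limit splitter.
type thresholdSplitter struct{ at string }

func (t thresholdSplitter) shouldSplitBefore(key []byte) suggestion {
	if string(key) >= t.at {
		return splitNow
	}
	return noSplit
}

func main() {
	g := &splitterGroup{members: []outputSplitter{
		thresholdSplitter{at: "k"}, // think: file-size limit hit near "k"
		thresholdSplitter{at: "q"}, // think: grandparent limit at "q"
	}}
	for _, k := range []string{"c", "k", "q"} {
		fmt.Println(k, g.shouldSplitBefore([]byte(k)) == splitNow)
	}
}
```
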

// NB: we avoid calling maybeThrottle on a nilPacer because the cost of
// dynamic dispatch in the hot loop below is pronounced in CPU profiles (see
@@ -2492,6 +2421,40 @@ func (d *DB) runCompaction(
prevPointSeqNum = key.SeqNum()
}

// We're ready to finish the output. We must take care to ensure we
// don't split outputs where the largest key on the left side has a
// seqnum of zero. This limitation exists because we need the two
// tables to be disjoint in key space. Consider the scenario:
//
// a#RANGEDEL-c,3 b#SET,0
//
// If b#SET,0 is the last key added to an sstable, the range tombstone
// extends into the next sstable. If we let the first key of the next
// table be b#RANGEDEL,3, the boundary generation code in
// finishOutput() would compute the smallest key for that sstable as
// b#RANGEDEL,3 which sorts before b#SET,0. Normally we just adjust
// the seqnum of this key, but that isn't possible for seqnum 0. We
// avoid this problematic split point by adjusting limit, which
// determines the key we pass into (*rangedel.Fragmenter).FlushTo.
//
// If limit > last key added to the sstable: the pending tombstone
// will be truncated to a clean user key boundary.
//
// If limit == last key added to the sstable (this may happen due to
// file size limits or we ran out of keys):
//
// If key != nil: the last key added to the sstable must not have a
// zero sequence number, because we set limit = key.UserKey when we
// decided to split, and there are no keys with the same user key
// greater than the zero-sequence numbered key.
//
// If prevPointSeqNum != 0: There's no problem, because we'll always
// be able to adjust the next sstable's smallest boundary.
//
// If prevPointSeqNum == 0: We flush all pending range tombstones to
// the current table, ensuring that there are no future compaction
// outputs. (See the TODO below.)
//
switch {
case key == nil && prevPointSeqNum == 0 && !c.rangeDelFrag.Empty():
// We ran out of keys and the last key added to the sstable has a zero
@@ -2500,6 +2463,11 @@ func (d *DB) runCompaction(
// in the loop above with range tombstones straddling sstables. Setting
// limit to nil ensures that we flush the entirety of the rangedel
// fragmenter when writing the last output.
//
// TODO(jackson): This case is only problematic if limit equals
// the last point key added. We also don't need to flush *all*
// range tombstones to the current sstable. We could flush up to the next
// grandparent limit greater than `limit` instead.
limit = nil
case key == nil && splitL0Outputs && !c.rangeDelFrag.Empty():
// We ran out of keys with flush splits enabled, and have remaining
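The boundary-adjustment argument in the new comment can be made concrete with a toy model. Internal keys sharing a user key sort by descending seqnum, so the next table's smallest boundary must take a seqnum strictly below the previous table's largest key; below zero nothing exists. The helper below is hypothetical (not Pebble's API) and exists only to capture that arithmetic.

```go
package main

import "fmt"

// adjustBoundarySeqNum models adjusting the next sstable's smallest
// boundary when it shares a user key with the previous sstable's largest
// key: the boundary needs a seqnum strictly smaller than the previous
// table's largest. Hypothetical helper, not Pebble's API.
func adjustBoundarySeqNum(prevLargestSeqNum uint64) (uint64, bool) {
	if prevLargestSeqNum == 0 {
		// Nothing sorts after b#SET,0 within user key b: the two tables
		// could not be kept disjoint, so this split point must be avoided
		// (here: by flushing the pending range tombstones instead).
		return 0, false
	}
	return prevLargestSeqNum - 1, true
}

func main() {
	if s, ok := adjustBoundarySeqNum(3); ok {
		fmt.Printf("prev largest b#SET,3 -> next smallest b#RANGEDEL,%d\n", s)
	}
	if _, ok := adjustBoundarySeqNum(0); !ok {
		fmt.Println("prev largest b#SET,0 -> unadjustable boundary")
	}
}
```
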
27 changes: 1 addition & 26 deletions compaction_test.go
@@ -24,7 +24,6 @@ import (
"github.com/cockroachdb/pebble/internal/datadriven"
"github.com/cockroachdb/pebble/internal/errorfs"
"github.com/cockroachdb/pebble/internal/manifest"
"github.com/cockroachdb/pebble/internal/rangedel"
"github.com/cockroachdb/pebble/sstable"
"github.com/cockroachdb/pebble/vfs"
"github.com/stretchr/testify/require"
@@ -2023,7 +2022,6 @@ func TestCompactionAllowZeroSeqNum(t *testing.T) {
c.startLevel.level = -1

var startFiles, outputFiles []*fileMetadata
var iter internalIterator

switch {
case len(parts) == 1 && parts[0] == "flush":
@@ -2032,10 +2030,6 @@
c.flushing = d.mu.mem.queue
d.mu.Unlock()

var err error
if iter, err = c.newInputIter(nil); err != nil {
return err.Error()
}
default:
for _, p := range parts {
level, meta := parseMeta(p)
@@ -2064,7 +2058,7 @@ func TestCompactionAllowZeroSeqNum(t *testing.T) {

c.inuseKeyRanges = nil
c.setupInuseKeyRanges()
fmt.Fprintf(&buf, "%t\n", c.allowZeroSeqNum(iter))
fmt.Fprintf(&buf, "%t\n", c.allowZeroSeqNum())
}
return buf.String()

@@ -2334,25 +2328,6 @@ func TestCompactionOutputSplitters(t *testing.T) {
cmp: base.DefaultComparer.Compare,
splitter: child0,
}
case "nonzeroseqnum":
c := &compaction{
rangeDelFrag: rangedel.Fragmenter{
Cmp: base.DefaultComparer.Compare,
Format: base.DefaultFormatter,
Emit: func(fragmented []rangedel.Tombstone) {},
},
}
frag := &c.rangeDelFrag
if len(d.CmdArgs) >= 3 {
if d.CmdArgs[2].Key == "tombstone" {
// Add a tombstone so Empty() returns false.
frag.Add(base.ParseInternalKey("foo.RANGEDEL.10"), []byte("pan"))
}
}
*splitterToInit = &nonZeroSeqNumSplitter{
c: c,
splitter: child0,
}
}
(*splitterToInit).onNewOutput(nil)
case "set-should-split":
120 changes: 0 additions & 120 deletions testdata/compaction_output_splitters
@@ -114,123 +114,3 @@ ok
should-split-before food2.SET.4
----
split-now

# nonZeroSeqNumSplitter tests

reset
----
ok

init child0 mock
----
ok

init main nonzeroseqnum tombstone
----
ok

set-should-split child0 no-split
----
ok

should-split-before foo.SET.5
----
no-split

should-split-before foo.RANGEDEL.0
----
no-split

set-should-split child0 split-now
----
ok

# This should be split-now, as the last point key is foo.SET.5.

should-split-before foo.SET.0
----
split-now

set-should-split child0 no-split
----
ok

should-split-before food.SET.0
----
no-split

set-should-split child0 split-now
----
ok

should-split-before food1.SET.0
----
split-soon

# Even though we've set should-split-before to no-split for the child splitter,
# nonZeroSeqNumSplitter "remembers" it and splits at the next good split point.

set-should-split child0 no-split
----
ok

should-split-before food2.SET.0
----
no-split

should-split-before food3.SET.4
----
no-split

# This one should be split-now, as the previous point seqnum is nonzero.

should-split-before food4.SET.2
----
split-now

should-split-before food4.SET.0
----
no-split

set-should-split child0 split-now
----
ok

should-split-before food5.SET.3
----
split-now

reset
----
ok

# nonZeroSeqNumSplitter tests, but with an empty rangedel fragmenter. In
# this case, it just passes through the child splitter.

reset
----
ok

init child0 mock
----
ok

init main nonzeroseqnum
----
ok

set-should-split child0 no-split
----
ok

should-split-before food.SET.0
----
no-split

set-should-split child0 split-now
----
ok

should-split-before food1.SET.0
----
split-now