Skip to content

Commit

Permalink
Merge pull request facebook#15 from cockroachdb/rditer-bug
Browse files Browse the repository at this point in the history
Fix a bug in iteration while skipping range deletion tombstones
  • Loading branch information
benesch authored Sep 12, 2018
2 parents b5ed67f + d62a998 commit f46d346
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 0 deletions.
81 changes: 81 additions & 0 deletions db/db_range_del_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -927,6 +927,87 @@ TEST_F(DBRangeDelTest, IteratorRangeTombstoneOverlapsSstable) {
}
}

TEST_F(DBRangeDelTest, IteratorRangeTombstoneMultipleBlocks) {
Options options = CurrentOptions();
BlockBasedTableOptions table_options;
table_options.block_size = 1024;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
Reopen(options);

// Create one SST with blocks [a, b] and [c, d].
db_->Put(WriteOptions(), "a", std::string(512, 'x'));
db_->Put(WriteOptions(), "b", std::string(512, 'x'));
db_->Put(WriteOptions(), "c", std::string(512, 'x'));
db_->Put(WriteOptions(), "d", std::string(512, 'x'));
ASSERT_OK(db_->Flush(FlushOptions()));
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);

// This test exercises a bookkeeping issue discovered in the
// BlockBasedTableIterator. In short, a range tombstone spanning keys in
// multiple data blocks could cause the iterator to incorrectly reuse its
// cached data block when it in fact needed to fetch a new data block.
//
// The goal is to force BlockBasedTableIterator::FindKeyForward and
// BlockBasedTableIterator::FindKeyBackward to seek between data blocks, which
// occurs when Next or Prev advances to a key covered by a range tombstone.

// Add a range tombstone over [b, d). To reproduce the bug during reverse
// iteration, it is required that the tombstone's end key is not the first key
// in a block.
db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "d");

// Test the forward case.
unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
// First, seek to d to put the [c, d] block in the cache.
iter->Seek("d");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("d", iter->key());
// Then, seek to a. This puts the [a, b] block in the cache.
iter->Seek("a");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("a", iter->key());
// Advance to b. Since this key is covered by the [b, d) tombstone,
// FindKeyForward will need to seek to the next data block to find the next
// non-deleted key (d). With the bug present, the iterator would forget that
// it had put the [a, b] block in the cache, instead thinking that [c, d] was
// still in the cache. It would thus look for d in the [a, b] block and
// incorrectly declare that the key did not exist.
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("d", iter->key());

// Test the reverse case. Beware: this is not quite analogous to the forward
// case.
iter.reset(db_->NewIterator(ReadOptions()));
// First, seek to a to put the [a, b] block in the cache. To be perfectly
// analogous, we'd use SeekForPrev, but DBIter performs a Prev after every
// SeekForPrev. So SeekForPrev(a) winds up invalidating the internal
// BlockBasedTableIterator instead of warming its cache.
iter->Seek("a");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("a", iter->key());
// Seek to d. This first puts the [c, d] block in the cache. As mentioned
// above, DBIter performs a Prev internally, so this also advances backwards
// over the [b, d) tombstone and puts [a, b] in the cache. (The result of this
// internal Prev is stashed away and not visible until the next user call to
// Prev.) As in the forward case, with the bug present, the iterator would
// forget that it had put the [a, b] block in the cache, instead thinking that
// [c, d] was still in the cache. It would thus look for a in the [c, d] block
// and incorrectly declare that the key did not exist.
//
// Note that if the tombstone were instead over [b, c), the bug would not
// occur. Advancing backwards over this tombstone would invalidate the cached
// data block's iterator, thus triggering a code path that correctly reloaded
// the [c,d] block into the cache.
iter->SeekForPrev("d");
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("d", iter->key());
// Observe the result of the internal Prev by advancing backwards to a.
iter->Prev();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("a", iter->key());
}

#ifndef ROCKSDB_UBSAN_RUN
TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) {
db_->Put(WriteOptions(), "key", "val");
Expand Down
2 changes: 2 additions & 0 deletions table/block_based_table_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2023,6 +2023,7 @@ void BlockBasedTableIterator::FindKeyForward() {
}

// Find the next non-deleted key.
SavePrevIndexValue();
index_iter_->Seek(tombstone_internal_end_key());
if (!index_iter_->Valid()) {
ResetDataIter();
Expand Down Expand Up @@ -2098,6 +2099,7 @@ void BlockBasedTableIterator::FindKeyBackward() {
}

// Find the previous non-deleted key.
SavePrevIndexValue();
index_iter_->Seek(tombstone_internal_start_key());
if (!index_iter_->Valid()) {
index_iter_->SeekToLast();
Expand Down

0 comments on commit f46d346

Please sign in to comment.