From 9e2c804fe622a5dde0d8b5321a8e1005e8bdeeea Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Fri, 4 Jan 2019 11:21:32 -0800 Subject: [PATCH] Fix point lookup on range tombstone sentinel endpoint (#4829) Summary: Previously for point lookup we decided which file to look into based on user key overlap only. We also did not truncate range tombstones in the point lookup code path. These two ideas did not interact well in cases like this: - L1 has range tombstone [a, c)#1 and point key b#2. The data is split between file1 with range [a#1,1, b#72057594037927935,15], and file2 with range [b#2, c#1]. - L1's file2 gets compacted to L2. - User issues `Get()` for b#3. - L1's file1 is opened and the range tombstone [a, c)#1 is found for b, while no point-key for b is found in L1. - `Get()` assumes that the range tombstone must cover all data in that range in lower levels, so short circuits and returns `NotFound`. The solution to this problem is to not look into files that only overlap with the point lookup at a range tombstone sentinel endpoint. In the above example, this would mean not opening L1's file1 or its tombstones during the `Get()`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4829 Differential Revision: D13561355 Pulled By: ajkr fbshipit-source-id: a13c21c816870a2f5d32a48af6dbd719a7d9d19f --- HISTORY.md | 1 + db/db_range_del_test.cc | 5 +++++ db/version_set.cc | 19 +++++++++++++++---- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index bc40fac4f..63f3a5d9c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -14,6 +14,7 @@ * Fix a memory leak when files with range tombstones are read in mmap mode and block cache is enabled * Fix handling of corrupt range tombstone blocks such that corruptions cannot cause deleted keys to reappear * Lock free MultiGet +* Fix incorrect `NotFound` point lookup result when querying the endpoint of a file that has been extended by a range tombstone. * Fix with pipelined write, write leaders's callback failure lead to the whole write group fail. ## 5.18.0 (11/30/2018) diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index 508149a34..73bc4d275 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -1041,11 +1041,16 @@ TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) { // L2: // [key000000#1,1, key000000#1,1] // [key000002#6,1, key000004#72057594037927935,15] + // + // At the same time, verify the compaction does not cause the key at the + // endpoint (key000002#6,1) to disappear. + ASSERT_EQ(value, Get(Key(2))); auto begin_str = Key(3); const rocksdb::Slice begin = begin_str; dbfull()->TEST_CompactRange(1, &begin, nullptr); ASSERT_EQ(1, NumTableFilesAtLevel(1)); ASSERT_EQ(2, NumTableFilesAtLevel(2)); + ASSERT_EQ(value, Get(Key(2))); } { diff --git a/db/version_set.cc b/db/version_set.cc index 8de8c86b5..fb7542eb4 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -301,17 +301,28 @@ class FilePicker { // On Level-n (n>=1), files are sorted. Binary search to find the // earliest file whose largest key >= ikey. Search left bound and // right bound are used to narrow the range. - if (search_left_bound_ == search_right_bound_) { - start_index = search_left_bound_; - } else if (search_left_bound_ < search_right_bound_) { + if (search_left_bound_ <= search_right_bound_) { if (search_right_bound_ == FileIndexer::kLevelMaxIndex) { search_right_bound_ = static_cast(curr_file_level_->num_files) - 1; } + // `search_right_bound_` is an inclusive upper-bound, but since it was + // determined based on user key, it is still possible the lookup key + // falls to the right of `search_right_bound_`'s corresponding file. + // So, pass a limit one higher, which allows us to detect this case. start_index = FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_, static_cast(search_left_bound_), - static_cast(search_right_bound_)); + static_cast(search_right_bound_) + 1); + if (start_index == search_right_bound_ + 1) { + // `ikey_` comes after `search_right_bound_`. The lookup key does + // not exist on this level, so let's skip this level and do a full + // binary search on the next level. + search_left_bound_ = 0; + search_right_bound_ = FileIndexer::kLevelMaxIndex; + curr_level_++; + continue; + } } else { // search_left_bound > search_right_bound, key does not exist in // this level. Since no comparison is done in this level, it will