From f0b50491938ec3a5423e9395b2cf2dd3e7db8cb9 Mon Sep 17 00:00:00 2001 From: Abhishek Madan Date: Tue, 27 Nov 2018 15:49:43 -0800 Subject: [PATCH 1/5] Properly set smallest key of subcompaction output Summary: It is possible to see a situation like the following when subcompactions are enabled: 1. A subcompaction boundary is set to `[b, e)`. 2. The first output file in a subcompaction has `c@20` as its smallest key. 3. The range tombstone `[a, d)@30` is encountered. 4. The tombstone is written to the range-del meta block and the new smallest key is set to `b@0` (since no keys in this subcompaction's output can be smaller than `b`). 5. A key `b@10` in a lower level will now reappear, since it is not covered by the truncated start key `b@0`. In general, unless the smallest data key in a file has a seqnum of 0, it is not safe to truncate a tombstone at the start key to have a seqnum of 0, since it can expose keys with a seqnum greater than 0 but less than the tombstone's actual seqnum. To fix this, when the lower bound of a file is from the subcompaction boundaries, we now set the seqnum of an artificially extended smallest key to the tombstone's seqnum. This is safe because subcompactions operate over disjoint sets of keys, and the subcompactions that can experience this problem are not the first subcompaction (which is unbounded on the left). Furthermore, there is now an assertion to detect the described anomalous case. 
Test Plan: run the following command a few times: ``` make db_stress && TEST_TMPDIR=/dev/shm ./db_stress --max_background_compactions=8 --subcompactions=0 --memtablerep=skip_list --acquire_snapshot_one_in=10000 --delpercent=4 --delrangepercent=1 --snapshot_hold_ops=100000 --allow_concurrent_memtable_write=1 --compact_files_one_in=10000 --clear_column_family_one_in=0 --writepercent=35 --readpercent=25 --write_buffer_size=1048576 --max_bytes_for_level_base=4194304 --target_file_size_base=1048576 --column_families=1 --compact_range_one_in=10000 --open_files=-1 --max_key=10000000 --prefixpercent=25 --ops_per_thread=1000000 ``` Reviewers: Subscribers: Tasks: Tags: --- db/compaction_job.cc | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 8a878fe725f..6b29b897ab8 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -1192,10 +1192,12 @@ Status CompactionJob::FinishCompactionOutputFile( Slice lower_bound_guard, upper_bound_guard; std::string smallest_user_key; const Slice *lower_bound, *upper_bound; + bool lower_bound_from_sub_compact = false; if (sub_compact->outputs.size() == 1) { // For the first output table, include range tombstones before the min key // but after the subcompaction boundary. lower_bound = sub_compact->start; + lower_bound_from_sub_compact = true; } else if (meta->smallest.size() > 0) { // For subsequent output tables, only include range tombstones from min // key onwards since the previous file was extended to contain range @@ -1265,11 +1267,22 @@ Status CompactionJob::FinishCompactionOutputFile( // (the max key in the previous table or subcompaction) in order for // files to appear key-space partitioned. // - // Choose lowest seqnum so this file's smallest internal key comes - // after the previous file's/subcompaction's largest. The fake seqnum - // is OK because the read path's file-picking code only considers user - // key. 
- smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion); + // When lower_bound is chosen by a subcompaction, we know that + // subcompactions over smaller keys cannot contain any keys at + // lower_bound. We also know that smaller subcompactions exist, because + // otherwise the subcompaction would be unbounded on the left. As a + // result, we know that no other files on the output level will contain + // keys at lower_bound. Therefore, it is safe to use the tombstone's + // sequence number, to ensure that keys at lower_bound at lower levels + // are covered by truncated tombstones. + // + // If lower_bound was chosen by the smallest data key in the file, + // choose lowest seqnum so this file's smallest internal key comes after + // the previous file's largest. The fake seqnum is OK because the read + // path's file-picking code only considers user key. + smallest_candidate = InternalKey( + *lower_bound, lower_bound_from_sub_compact ? tombstone.seq_ : 0, + kTypeRangeDeletion); } InternalKey largest_candidate = tombstone.SerializeEndKey(); if (upper_bound != nullptr && @@ -1291,9 +1304,19 @@ Status CompactionJob::FinishCompactionOutputFile( largest_candidate = InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion); } + SequenceNumber smallest_ikey_seqnum = + GetInternalKeySeqno(meta->smallest.Encode()); meta->UpdateBoundariesForRange(smallest_candidate, largest_candidate, tombstone.seq_, cfd->internal_comparator()); + + // The smallest key in a file is used for range tombstone truncation, so + // it cannot have a seqnum of 0 (unless the smallest data key in a file + // has a seqnum of 0). Otherwise, the truncated tombstone may expose + // deleted keys at lower levels. 
+ assert(smallest_ikey_seqnum == 0 || + ExtractInternalKeyFooter(meta->smallest.Encode()) != + PackSequenceAndType(0, kTypeRangeDeletion)); } meta->marked_for_compaction = sub_compact->builder->NeedCompact(); } From f410a11cf49800038a19d11d24377f18e9f71db3 Mon Sep 17 00:00:00 2001 From: Abhishek Madan Date: Tue, 27 Nov 2018 16:38:36 -0800 Subject: [PATCH 2/5] Fix compile error in release builds --- db/compaction_job.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 6b29b897ab8..b52b72da51e 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -1304,8 +1304,10 @@ Status CompactionJob::FinishCompactionOutputFile( largest_candidate = InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion); } +#ifndef NDEBUG SequenceNumber smallest_ikey_seqnum = GetInternalKeySeqno(meta->smallest.Encode()); +#endif meta->UpdateBoundariesForRange(smallest_candidate, largest_candidate, tombstone.seq_, cfd->internal_comparator()); From 7a6a66e2ca0ddc72d624cc70f9e7a5eeae72612c Mon Sep 17 00:00:00 2001 From: Abhishek Madan Date: Tue, 27 Nov 2018 17:00:28 -0800 Subject: [PATCH 3/5] Don't decode an empty ikey --- db/compaction_job.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index b52b72da51e..da7b3efd6d0 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -1305,8 +1305,10 @@ Status CompactionJob::FinishCompactionOutputFile( InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion); } #ifndef NDEBUG - SequenceNumber smallest_ikey_seqnum = - GetInternalKeySeqno(meta->smallest.Encode()); + SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber; + if (meta->smallest.Valid()) { + GetInternalKeySeqno(meta->smallest.Encode()); + } #endif meta->UpdateBoundariesForRange(smallest_candidate, largest_candidate, tombstone.seq_, From 3ea12bbb7a8e3ab4232ac3466f185b14807a843b Mon Sep 17 00:00:00 2001 From: Abhishek Madan Date: Tue, 27 Nov 
2018 17:32:12 -0800 Subject: [PATCH 4/5] Do faster sanity check on file bound --- db/compaction_job.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index da7b3efd6d0..1bf502cb1a6 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -1306,7 +1306,7 @@ Status CompactionJob::FinishCompactionOutputFile( } #ifndef NDEBUG SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber; - if (meta->smallest.Valid()) { + if (meta->smallest.size() > 0) { GetInternalKeySeqno(meta->smallest.Encode()); } #endif From a6f133934a0991a350f09d35991629228839cc9d Mon Sep 17 00:00:00 2001 From: Abhishek Madan Date: Mon, 10 Dec 2018 10:27:01 -0800 Subject: [PATCH 5/5] Fix comment, set assertion seqnum --- db/compaction_job.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 1bf502cb1a6..dd04d8b205d 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -1272,9 +1272,11 @@ Status CompactionJob::FinishCompactionOutputFile( // lower_bound. We also know that smaller subcompactions exist, because // otherwise the subcompaction would be unbounded on the left. As a // result, we know that no other files on the output level will contain - // keys at lower_bound. Therefore, it is safe to use the tombstone's - // sequence number, to ensure that keys at lower_bound at lower levels - // are covered by truncated tombstones. + // actual keys at lower_bound (an output file may have a largest key of + // lower_bound@kMaxSequenceNumber, but this only indicates a large range + // tombstone was truncated). Therefore, it is safe to use the + // tombstone's sequence number, to ensure that keys at lower_bound at + // lower levels are covered by truncated tombstones. 
// // If lower_bound was chosen by the smallest data key in the file, // choose lowest seqnum so this file's smallest internal key comes after @@ -1307,7 +1309,7 @@ Status CompactionJob::FinishCompactionOutputFile( #ifndef NDEBUG SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber; if (meta->smallest.size() > 0) { - GetInternalKeySeqno(meta->smallest.Encode()); + smallest_ikey_seqnum = GetInternalKeySeqno(meta->smallest.Encode()); } #endif meta->UpdateBoundariesForRange(smallest_candidate, largest_candidate,