diff --git a/db/column_family.cc b/db/column_family.cc index 2b611fda7ca..eb614c01857 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1201,10 +1201,12 @@ Status ColumnFamilyData::RangesOverlapWithMemtables( read_opts.total_order_seek = true; MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena); merge_iter_builder.AddIterator(super_version->mem->NewIterator( - read_opts, /*seqno_to_time_mapping=*/nullptr, &arena)); - super_version->imm->AddIterators(read_opts, /*seqno_to_time_mapping=*/nullptr, - &merge_iter_builder, - false /* add_range_tombstone_iter */); + read_opts, /*seqno_to_time_mapping=*/nullptr, &arena, + /*prefix_extractor=*/nullptr)); + super_version->imm->AddIterators( + read_opts, /*seqno_to_time_mapping=*/nullptr, + super_version->mutable_cf_options.prefix_extractor.get(), + &merge_iter_builder, false /* add_range_tombstone_iter */); ScopedArenaPtr memtable_iter(merge_iter_builder.Finish()); auto read_seq = super_version->current->version_set()->LastSequence(); diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index 7b42f4ee5e4..15deca2a618 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -2177,24 +2177,118 @@ TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) { db_->ReleaseSnapshot(snapshot); } +namespace { +std::pair GetBloomStat(const Options& options, bool sst) { + if (sst) { + return {options.statistics->getAndResetTickerCount( + NON_LAST_LEVEL_SEEK_FILTER_MATCH), + options.statistics->getAndResetTickerCount( + NON_LAST_LEVEL_SEEK_FILTERED)}; + } else { + auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0); + auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0); + return {hit, miss}; + } +} + +std::pair HitAndMiss(uint64_t hits, uint64_t misses) { + return {hits, misses}; +} +} // namespace -TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) { - constexpr size_t kPrefixSize = 8; - const std::string kKey = "key"; - assert(kKey.size() < kPrefixSize); +TEST_F(DBBloomFilterTest, MemtablePrefixBloom) { Options options = CurrentOptions(); - options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize)); + options.prefix_extractor.reset(NewFixedPrefixTransform(4)); options.memtable_prefix_bloom_size_ratio = 0.25; Reopen(options); - ASSERT_OK(Put(kKey, "v")); - ASSERT_EQ("v", Get(kKey)); - std::unique_ptr iter(dbfull()->NewIterator(ReadOptions())); - iter->Seek(kKey); + ASSERT_FALSE(options.prefix_extractor->InDomain("key")); + ASSERT_OK(Put("key", "v")); + ASSERT_OK(Put("goat1", "g1")); + ASSERT_OK(Put("goat2", "g2")); + + // Out of domain + ASSERT_EQ("v", Get("key")); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + + // In domain + ASSERT_EQ("g1", Get("goat1")); + ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false)); + ASSERT_EQ("NOT_FOUND", Get("goat9")); + ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false)); + ASSERT_EQ("NOT_FOUND", Get("goan1")); + ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false)); + + ReadOptions ropts; + if (options.prefix_seek_opt_in_only) { + ropts.prefix_same_as_start = true; + } + std::unique_ptr iter(db_->NewIterator(ropts)); + // Out of domain + iter->Seek("ke"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("key", iter->key()); + iter->SeekForPrev("kez"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("key", iter->key()); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + + // In domain + iter->Seek("goan"); + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false)); + iter->Seek("goat"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("goat1", iter->key()); + ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false)); + + // Changing prefix extractor should affect prefix query semantics + // and bypass the existing memtable Bloom filter + ASSERT_OK(db_->SetOptions({{"prefix_extractor", "fixed:5"}})); + iter.reset(db_->NewIterator(ropts)); + // Now out of domain + iter->Seek("goan"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("goat1", iter->key()); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + // In domain + iter->Seek("goat2"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("goat2", iter->key()); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + // In domain + if (ropts.prefix_same_as_start) { + iter->Seek("goat0"); + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + } else { + // NOTE: legacy prefix Seek may return keys outside of prefix + } + + // Start a fresh new memtable, using new prefix extractor + ASSERT_OK(SingleDelete("key")); + ASSERT_OK(SingleDelete("goat1")); + ASSERT_OK(SingleDelete("goat2")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("goat1", "g1")); + ASSERT_OK(Put("goat2", "g2")); + + iter.reset(db_->NewIterator(ropts)); + + // Now out of domain + iter->Seek("goan"); ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(kKey, iter->key()); - iter->SeekForPrev(kKey); + ASSERT_EQ("goat1", iter->key()); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + // In domain + iter->Seek("goat2"); ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(kKey, iter->key()); + ASSERT_EQ("goat2", iter->key()); + ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false)); + // In domain + iter->Seek("goat0"); + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false)); } class DBBloomFilterTestVaryPrefixAndFormatVer @@ -2507,7 +2601,11 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { ASSERT_OK(Put(key1, value1, WriteOptions())); ASSERT_OK(Put(key3, value3, WriteOptions())); - std::unique_ptr iter(dbfull()->NewIterator(ReadOptions())); + ReadOptions ropts; + if (options_.prefix_seek_opt_in_only) { + ropts.prefix_same_as_start = true; + } + std::unique_ptr iter(dbfull()->NewIterator(ropts)); // check memtable bloom stats iter->Seek(key1); @@ -2532,7 +2630,7 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { ASSERT_OK(Flush()); - iter.reset(dbfull()->NewIterator(ReadOptions())); + iter.reset(dbfull()->NewIterator(ropts)); // Check SST bloom stats iter->Seek(key1); @@ -2659,9 +2757,14 @@ TEST_F(DBBloomFilterTest, PrefixScan) { PrefixScanInit(this); count = 0; env_->random_read_counter_.Reset(); - iter = db_->NewIterator(ReadOptions()); + ReadOptions ropts; + if (options.prefix_seek_opt_in_only) { + ropts.prefix_same_as_start = true; + } + iter = db_->NewIterator(ropts); for (iter->Seek(prefix); iter->Valid(); iter->Next()) { if (!iter->key().starts_with(prefix)) { + ASSERT_FALSE(ropts.prefix_same_as_start); break; } count++; @@ -3397,23 +3500,6 @@ class FixedSuffix4Transform : public SliceTransform { bool InDomain(const Slice& src) const override { return src.size() >= 4; } }; - -std::pair GetBloomStat(const Options& options, bool sst) { - if (sst) { - return {options.statistics->getAndResetTickerCount( - NON_LAST_LEVEL_SEEK_FILTER_MATCH), - options.statistics->getAndResetTickerCount( - NON_LAST_LEVEL_SEEK_FILTERED)}; - } else { - auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0); - auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0); - return {hit, miss}; - } -} - -std::pair HitAndMiss(uint64_t hits, uint64_t misses) { - return {hits, misses}; -} } // anonymous namespace // This uses a prefix_extractor + comparator combination that violates @@ -3447,9 +3533,7 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter1) { ASSERT_OK(Flush()); } ReadOptions read_options; - if (flushed) { // TODO: support auto_prefix_mode in memtable? - read_options.auto_prefix_mode = true; - } + read_options.auto_prefix_mode = true; EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0)); { Slice ub("999aaaa"); @@ -3517,9 +3601,8 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) { ASSERT_OK(Flush()); } ReadOptions read_options; - if (flushed) { // TODO: support auto_prefix_mode in memtable? - read_options.auto_prefix_mode = true; - } else { + read_options.auto_prefix_mode = true; + if (!flushed) { // TODO: why needed? get_perf_context()->bloom_memtable_hit_count = 0; get_perf_context()->bloom_memtable_miss_count = 0; @@ -3661,9 +3744,8 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) { ASSERT_OK(Flush()); } ReadOptions read_options; - if (flushed) { // TODO: support auto_prefix_mode in memtable? - read_options.auto_prefix_mode = true; - } else { + read_options.auto_prefix_mode = true; + if (!flushed) { // TODO: why needed? get_perf_context()->bloom_memtable_hit_count = 0; get_perf_context()->bloom_memtable_miss_count = 0; diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index e95561efab9..fe98ef0e4b9 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -2066,15 +2066,18 @@ InternalIterator* DBImpl::NewInternalIterator( bool allow_unprepared_value, ArenaWrappedDBIter* db_iter) { InternalIterator* internal_iter; assert(arena != nullptr); + auto prefix_extractor = + super_version->mutable_cf_options.prefix_extractor.get(); // Need to create internal iterator from the arena. MergeIteratorBuilder merge_iter_builder( &cfd->internal_comparator(), arena, - !read_options.total_order_seek && - super_version->mutable_cf_options.prefix_extractor != nullptr, + !read_options.total_order_seek && // FIXME? + prefix_extractor != nullptr, read_options.iterate_upper_bound); // Collect iterator for mutable memtable auto mem_iter = super_version->mem->NewIterator( - read_options, super_version->GetSeqnoToTimeMapping(), arena); + read_options, super_version->GetSeqnoToTimeMapping(), arena, + super_version->mutable_cf_options.prefix_extractor.get()); Status s; if (!read_options.ignore_range_deletions) { std::unique_ptr mem_tombstone_iter; @@ -2098,6 +2101,7 @@ InternalIterator* DBImpl::NewInternalIterator( if (s.ok()) { super_version->imm->AddIterators( read_options, super_version->GetSeqnoToTimeMapping(), + super_version->mutable_cf_options.prefix_extractor.get(), &merge_iter_builder, !read_options.ignore_range_deletions); } TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s); @@ -3805,6 +3809,9 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options, read_options.io_activity = Env::IOActivity::kDBIterator; } + read_options.total_order_seek |= + immutable_db_options_.prefix_seek_opt_in_only; + if (read_options.managed) { return NewErrorIterator( Status::NotSupported("Managed iterator is not supported anymore.")); diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 4fb1008761a..dec61c0504f 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1669,7 +1669,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, TableProperties table_properties; { ScopedArenaPtr iter( - mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); + mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena, + /*prefix_extractor=*/nullptr)); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] [WriteLevel0TableForRecovery]" " Level-0 table #%" PRIu64 ": started", diff --git a/db/db_iter.cc b/db/db_iter.cc index b42acc4bc6a..84acdc4e040 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -65,9 +65,8 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, valid_(false), current_entry_is_merged_(false), is_key_seqnum_zero_(false), - prefix_same_as_start_(mutable_cf_options.prefix_extractor - ? read_options.prefix_same_as_start - : false), + prefix_same_as_start_( + prefix_extractor_ ? read_options.prefix_same_as_start : false), pin_thru_lifetime_(read_options.pin_data), expect_total_order_inner_iter_(prefix_extractor_ == nullptr || read_options.total_order_seek || diff --git a/db/db_test2.cc b/db/db_test2.cc index 92211ad42dd..1d645a07199 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -5597,6 +5597,7 @@ TEST_F(DBTest2, PrefixBloomFilteredOut) { bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); bbto.whole_key_filtering = false; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.prefix_seek_opt_in_only = false; // Use legacy prefix seek DestroyAndReopen(options); // Construct two L1 files with keys: @@ -5987,6 +5988,7 @@ TEST_F(DBTest2, ChangePrefixExtractor) { // create a DB with block prefix index BlockBasedTableOptions table_options; Options options = CurrentOptions(); + options.prefix_seek_opt_in_only = false; // Use legacy prefix seek // Sometimes filter is checked based on upper bound. Assert counters // for that case. Otherwise, only check data correctness. diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc index acb1c05c1d2..60441da0b52 100644 --- a/db/db_with_timestamp_basic_test.cc +++ b/db/db_with_timestamp_basic_test.cc @@ -832,6 +832,7 @@ TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) { TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) { Options options = CurrentOptions(); + options.prefix_seek_opt_in_only = false; // Use legacy prefix seek options.env = env_; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(3)); @@ -1009,6 +1010,7 @@ TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) { TestComparator test_cmp(kTimestampSize); options.comparator = &test_cmp; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.prefix_seek_opt_in_only = false; // Use legacy prefix seek options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); DestroyAndReopen(options); const std::vector timestamps = {Timestamp(1, 1), Timestamp(0, 2), diff --git a/db/flush_job.cc b/db/flush_job.cc index 6bd71dd5624..8206bd298a8 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -421,8 +421,8 @@ Status FlushJob::MemPurge() { std::vector> range_del_iters; for (MemTable* m : mems_) { - memtables.push_back( - m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); + memtables.push_back(m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, + &arena, /*prefix_extractor=*/nullptr)); auto* range_del_iter = m->NewRangeTombstoneIterator( ro, kMaxSequenceNumber, true /* immutable_memtable */); if (range_del_iter != nullptr) { @@ -893,8 +893,8 @@ Status FlushJob::WriteLevel0Table() { db_options_.info_log, "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n", cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber()); - memtables.push_back( - m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); + memtables.push_back(m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, + &arena, /*prefix_extractor=*/nullptr)); auto* range_del_iter = m->NewRangeTombstoneIterator( ro, kMaxSequenceNumber, true /* immutable_memtable */); if (range_del_iter != nullptr) { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index a4cbdb46679..0bf7c15ab8e 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -712,9 +712,11 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { UnownedPtr seqno_to_time_mapping = sv_->GetSeqnoToTimeMapping(); mutable_iter_ = - sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_); - sv_->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_, - &arena_); + sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_, + sv_->mutable_cf_options.prefix_extractor.get()); + sv_->imm->AddIterators(read_options_, seqno_to_time_mapping, + sv_->mutable_cf_options.prefix_extractor.get(), + &imm_iters_, &arena_); if (!read_options_.ignore_range_deletions) { std::unique_ptr range_del_iter( sv_->mem->NewRangeTombstoneIterator( @@ -781,9 +783,11 @@ void ForwardIterator::RenewIterators() { UnownedPtr seqno_to_time_mapping = svnew->GetSeqnoToTimeMapping(); mutable_iter_ = - svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_); - svnew->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_, - &arena_); + svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_, + svnew->mutable_cf_options.prefix_extractor.get()); + svnew->imm->AddIterators(read_options_, seqno_to_time_mapping, + svnew->mutable_cf_options.prefix_extractor.get(), + &imm_iters_, &arena_); ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), kMaxSequenceNumber /* upper_bound */); if (!read_options_.ignore_range_deletions) { diff --git a/db/memtable.cc b/db/memtable.cc index ef1184ded48..e1a71006842 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -365,15 +365,19 @@ const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator : public InternalIterator { public: - MemTableIterator(const MemTable& mem, const ReadOptions& read_options, - UnownedPtr seqno_to_time_mapping, - Arena* arena, bool use_range_del_table = false) + enum Kind { kPointEntries, kRangeDelEntries }; + MemTableIterator( + Kind kind, const MemTable& mem, const ReadOptions& read_options, + UnownedPtr seqno_to_time_mapping = nullptr, + Arena* arena = nullptr, + const SliceTransform* cf_prefix_extractor = nullptr) : bloom_(nullptr), prefix_extractor_(mem.prefix_extractor_), comparator_(mem.comparator_), seqno_to_time_mapping_(seqno_to_time_mapping), status_(Status::OK()), logger_(mem.moptions_.info_log), + auto_prefix_upper_bound_(nullptr), ts_sz_(mem.ts_sz_), protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key), valid_(false), @@ -382,14 +386,23 @@ class MemTableIterator : public InternalIterator { arena_mode_(arena != nullptr), paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks), allow_data_in_error(mem.moptions_.allow_data_in_errors) { - if (use_range_del_table) { + if (kind == kRangeDelEntries) { iter_ = mem.range_del_table_->GetIterator(arena); - } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek && - !read_options.auto_prefix_mode) { - // Auto prefix mode is not implemented in memtable yet. + } else if (prefix_extractor_ != nullptr && + // NOTE: checking extractor equivalence when not pointer + // equivalent is arguably too expensive for memtable + prefix_extractor_ == cf_prefix_extractor && + (read_options.prefix_same_as_start || + read_options.auto_prefix_mode || + !read_options.total_order_seek)) { + assert(kind == kPointEntries); bloom_ = mem.bloom_filter_.get(); iter_ = mem.table_->GetDynamicPrefixIterator(arena); + if (read_options.auto_prefix_mode) { + auto_prefix_upper_bound_ = read_options.iterate_upper_bound; + } } else { + assert(kind == kPointEntries); iter_ = mem.table_->GetIterator(arena); } status_.PermitUncheckedError(); @@ -433,13 +446,18 @@ class MemTableIterator : public InternalIterator { // iterator should only use prefix bloom filter Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_)); if (prefix_extractor_->InDomain(user_k_without_ts)) { - if (!bloom_->MayContain( - prefix_extractor_->Transform(user_k_without_ts))) { - PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); - valid_ = false; - return; - } else { - PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + Slice prefix = prefix_extractor_->Transform(user_k_without_ts); + if (!auto_prefix_upper_bound_ || + (prefix_extractor_->InDomain(*auto_prefix_upper_bound_) && + prefix_extractor_->Transform(*auto_prefix_upper_bound_) + .compare(prefix) == 0)) { + if (!bloom_->MayContain(prefix)) { + PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); + valid_ = false; + return; + } else { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + } } } } @@ -457,7 +475,9 @@ class MemTableIterator : public InternalIterator { status_ = Status::OK(); if (bloom_) { Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_)); - if (prefix_extractor_->InDomain(user_k_without_ts)) { + // NOTE: auto_prefix_mode ineffective for SeekForPrev + if (!auto_prefix_upper_bound_ && + prefix_extractor_->InDomain(user_k_without_ts)) { if (!bloom_->MayContain( prefix_extractor_->Transform(user_k_without_ts))) { PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); @@ -573,6 +593,7 @@ class MemTableIterator : public InternalIterator { UnownedPtr seqno_to_time_mapping_; Status status_; Logger* logger_; + const Slice* auto_prefix_upper_bound_; size_t ts_sz_; uint32_t protection_bytes_per_key_; bool valid_; @@ -594,11 +615,13 @@ class MemTableIterator : public InternalIterator { InternalIterator* MemTable::NewIterator( const ReadOptions& read_options, - UnownedPtr seqno_to_time_mapping, Arena* arena) { + UnownedPtr seqno_to_time_mapping, Arena* arena, + const SliceTransform* prefix_extractor) { assert(arena != nullptr); auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); return new (mem) - MemTableIterator(*this, read_options, seqno_to_time_mapping, arena); + MemTableIterator(MemTableIterator::kPointEntries, *this, read_options, + seqno_to_time_mapping, arena, prefix_extractor); } FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator( @@ -633,8 +656,7 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal( cache->reader_mutex.lock(); if (!cache->tombstones) { auto* unfragmented_iter = new MemTableIterator( - *this, read_options, nullptr /* seqno_to_time_mapping= */, - nullptr /* arena */, true /* use_range_del_table */); + MemTableIterator::kRangeDelEntries, *this, read_options); cache->tombstones.reset(new FragmentedRangeTombstoneList( std::unique_ptr(unfragmented_iter), comparator_.comparator)); @@ -655,8 +677,7 @@ void MemTable::ConstructFragmentedRangeTombstones() { if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) { // TODO: plumb Env::IOActivity, Env::IOPriority auto* unfragmented_iter = new MemTableIterator( - *this, ReadOptions(), nullptr /*seqno_to_time_mapping=*/, - nullptr /* arena */, true /* use_range_del_table */); + MemTableIterator::kRangeDelEntries, *this, ReadOptions()); fragmented_range_tombstone_list_ = std::make_unique( diff --git a/db/memtable.h b/db/memtable.h index ca0652bc044..194b4543c21 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -210,7 +210,8 @@ class MemTable { // data, currently only needed for iterators serving user reads. InternalIterator* NewIterator( const ReadOptions& read_options, - UnownedPtr seqno_to_time_mapping, Arena* arena); + UnownedPtr seqno_to_time_mapping, Arena* arena, + const SliceTransform* prefix_extractor); // Returns an iterator that yields the range tombstones of the memtable. // The caller must ensure that the underlying MemTable remains live diff --git a/db/memtable_list.cc b/db/memtable_list.cc index c3612656e24..c81c096b511 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -214,20 +214,23 @@ Status MemTableListVersion::AddRangeTombstoneIterators( void MemTableListVersion::AddIterators( const ReadOptions& options, UnownedPtr seqno_to_time_mapping, + const SliceTransform* prefix_extractor, std::vector* iterator_list, Arena* arena) { for (auto& m : memlist_) { - iterator_list->push_back( - m->NewIterator(options, seqno_to_time_mapping, arena)); + iterator_list->push_back(m->NewIterator(options, seqno_to_time_mapping, + arena, prefix_extractor)); } } void MemTableListVersion::AddIterators( const ReadOptions& options, UnownedPtr seqno_to_time_mapping, + const SliceTransform* prefix_extractor, MergeIteratorBuilder* merge_iter_builder, bool add_range_tombstone_iter) { for (auto& m : memlist_) { - auto mem_iter = m->NewIterator(options, seqno_to_time_mapping, - merge_iter_builder->GetArena()); + auto mem_iter = + m->NewIterator(options, seqno_to_time_mapping, + merge_iter_builder->GetArena(), prefix_extractor); if (!add_range_tombstone_iter || options.ignore_range_deletions) { merge_iter_builder->AddIterator(mem_iter); } else { diff --git a/db/memtable_list.h b/db/memtable_list.h index dd439de5590..390b4137dd2 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -113,11 +113,13 @@ class MemTableListVersion { void AddIterators(const ReadOptions& options, UnownedPtr seqno_to_time_mapping, + const SliceTransform* prefix_extractor, std::vector* iterator_list, Arena* arena); void AddIterators(const ReadOptions& options, UnownedPtr seqno_to_time_mapping, + const SliceTransform* prefix_extractor, MergeIteratorBuilder* merge_iter_builder, bool add_range_tombstone_iter); diff --git a/db/repair.cc b/db/repair.cc index 114d36a6a82..f7f4fbafb7f 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -443,7 +443,8 @@ class Repairer { ro.total_order_seek = true; Arena arena; ScopedArenaPtr iter( - mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); + mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena, + /*prefix_extractor=*/nullptr)); int64_t _current_time = 0; immutable_db_options_.clock->GetCurrentTime(&_current_time) .PermitUncheckedError(); // ignore error diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 507f9bab80a..47374d13d7d 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1399,6 +1399,17 @@ struct DBOptions { // eventually be obsolete and removed as Identity files are phased out. bool write_identity_file = true; + // Historically, when prefix_extractor != nullptr, iterators have an + // unfortunate default semantics of *possibly* only returning data + // within the same prefix. To avoid "spooky action at a distance," iterator + // bounds should come from the instantiation or seeking of the iterator, + // not from a mutable column family option. + // + // When set to true, it is as if every iterator is created with + // total_order_seek=true and only auto_prefix_mode=true and + // prefix_same_as_start=true can take advantage of prefix seek optimizations. + bool prefix_seek_opt_in_only = false; + // The number of bytes to prefetch when reading the log. This is mostly useful // for reading a remotely located log, as it can save the number of // round-trips. If 0, then the prefetching is disabled. @@ -1848,10 +1859,10 @@ struct ReadOptions { bool auto_prefix_mode = false; // Enforce that the iterator only iterates over the same prefix as the seek. - // This option is effective only for prefix seeks, i.e. prefix_extractor is - // non-null for the column family and total_order_seek is false. Unlike - // iterate_upper_bound, prefix_same_as_start only works within a prefix - // but in both directions. + // This makes the iterator bounds dependent on the column family's current + // prefix_extractor, which is mutable. When SST files have been built with + // the same prefix extractor, prefix filtering optimizations will be used + // for both Seek and SeekForPrev. bool prefix_same_as_start = false; // Keep the blocks loaded by the iterator pinned in memory as long as the diff --git a/options/db_options.cc b/options/db_options.cc index 0b880f4b939..e9e8fc5b7e0 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -403,6 +403,10 @@ static std::unordered_map {offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"prefix_seek_opt_in_only", + {offsetof(struct ImmutableDBOptions, prefix_seek_opt_in_only), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"write_dbid_to_manifest", {offsetof(struct ImmutableDBOptions, write_dbid_to_manifest), OptionType::kBoolean, OptionVerificationType::kNormal, @@ -774,6 +778,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) background_close_inactive_wals(options.background_close_inactive_wals), atomic_flush(options.atomic_flush), avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io), + prefix_seek_opt_in_only(options.prefix_seek_opt_in_only), persist_stats_to_disk(options.persist_stats_to_disk), write_dbid_to_manifest(options.write_dbid_to_manifest), write_identity_file(options.write_identity_file), @@ -948,6 +953,8 @@ void ImmutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.avoid_unnecessary_blocking_io: %d", avoid_unnecessary_blocking_io); + ROCKS_LOG_HEADER(log, " Options.prefix_seek_opt_in_only: %d", + prefix_seek_opt_in_only); ROCKS_LOG_HEADER(log, " Options.persist_stats_to_disk: %u", persist_stats_to_disk); ROCKS_LOG_HEADER(log, " Options.write_dbid_to_manifest: %d", diff --git a/options/db_options.h b/options/db_options.h index 842fefca860..ac76ea40d8e 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -87,6 +87,7 @@ struct ImmutableDBOptions { bool background_close_inactive_wals; bool atomic_flush; bool avoid_unnecessary_blocking_io; + bool prefix_seek_opt_in_only; bool persist_stats_to_disk; bool write_dbid_to_manifest; bool write_identity_file; diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index fe45224d08e..0dfe3e38ab6 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2063,7 +2063,9 @@ InternalIterator* BlockBasedTable::NewIterator( if (arena == nullptr) { return new BlockBasedTableIterator( this, read_options, rep_->internal_comparator, std::move(index_iter), - !skip_filters && !read_options.total_order_seek && + !skip_filters && + (!read_options.total_order_seek || read_options.auto_prefix_mode || + read_options.prefix_same_as_start) && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, caller, compaction_readahead_size, allow_unprepared_value); @@ -2071,7 +2073,9 @@ InternalIterator* BlockBasedTable::NewIterator( auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); return new (mem) BlockBasedTableIterator( this, read_options, rep_->internal_comparator, std::move(index_iter), - !skip_filters && !read_options.total_order_seek && + !skip_filters && + (!read_options.total_order_seek || read_options.auto_prefix_mode || + read_options.prefix_same_as_start) && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, caller, compaction_readahead_size, allow_unprepared_value); diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index d3c968f73a9..9d4e8eccf3e 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -201,8 +201,10 @@ InternalIterator* PlainTableReader::NewIterator( assert(table_properties_); // Auto prefix mode is not implemented in PlainTable. - bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek && - !options.auto_prefix_mode; + bool use_prefix_seek = + !IsTotalOrderMode() && + (options.prefix_same_as_start || + (!options.total_order_seek && !options.auto_prefix_mode)); if (arena == nullptr) { return new PlainTableIterator(this, use_prefix_seek); } else {