From 5c6f9134e759610575ece05dadb6e574aea8c202 Mon Sep 17 00:00:00 2001 From: Udi Date: Mon, 11 Jul 2022 16:48:53 +0300 Subject: [PATCH] Speedb's Paired Block Bloom (#29) --- include/rocksdb/filter_policy.h | 5 + plugin/speedb/CMakeLists.txt | 4 +- .../speedb_db_bloom_filter_test.cc | 2729 +++++++++++++++++ .../paired_filter/speedb_paired_bloom.cc | 120 + .../paired_filter/speedb_paired_bloom.h | 83 + .../speedb_paired_bloom_internal.cc | 833 +++++ .../speedb_paired_bloom_internal.h | 188 ++ plugin/speedb/speedb.mk | 15 +- plugin/speedb/speedb_customizable_test.cc | 105 + plugin/speedb/speedb_registry.cc | 21 + table/block_based/filter_policy.cc | 252 +- table/block_based/filter_policy_internal.h | 123 +- table/block_based/full_filter_block_test.cc | 4 + tools/db_crashtest.py | 6 + util/bloom_impl.h | 195 +- 15 files changed, 4443 insertions(+), 240 deletions(-) create mode 100644 plugin/speedb/paired_filter/speedb_db_bloom_filter_test.cc create mode 100644 plugin/speedb/paired_filter/speedb_paired_bloom.cc create mode 100644 plugin/speedb/paired_filter/speedb_paired_bloom.h create mode 100644 plugin/speedb/paired_filter/speedb_paired_bloom_internal.cc create mode 100644 plugin/speedb/paired_filter/speedb_paired_bloom_internal.h create mode 100644 plugin/speedb/speedb_customizable_test.cc diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index 039b826de7..d83925dd6c 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -103,6 +103,11 @@ class FilterPolicy : public Customizable { // family (rare), implementations may return Name(). 
virtual const char* CompatibilityName() const = 0; + // Utility helper to parse the URI passed to the CreateFromString() + // And extract the value of the bits-per-key passed via that URI + // See CreateFromString() below for more details + static double ExtractBitsPerKeyFromUri(const std::string& uri); + // Creates a new FilterPolicy based on the input value string and returns the // result The value might be an ID, and ID with properties, or an old-style // policy string. diff --git a/plugin/speedb/CMakeLists.txt b/plugin/speedb/CMakeLists.txt index b411820482..5d4f8ed24d 100644 --- a/plugin/speedb/CMakeLists.txt +++ b/plugin/speedb/CMakeLists.txt @@ -13,6 +13,8 @@ set(speedb_SOURCES speedb_registry.cc - memtable/hash_spd_rep.cc) + memtable/hash_spd_rep.cc + paired_filter/speedb_paired_bloom.cc + paired_filter/speedb_paired_bloom_internal.cc) set(speedb_FUNC register_SpeedbPlugins) diff --git a/plugin/speedb/paired_filter/speedb_db_bloom_filter_test.cc b/plugin/speedb/paired_filter/speedb_db_bloom_filter_test.cc new file mode 100644 index 0000000000..5b8bf596ee --- /dev/null +++ b/plugin/speedb/paired_filter/speedb_db_bloom_filter_test.cc @@ -0,0 +1,2729 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include +#include +#include + +#include "cache/cache_entry_roles.h" +#include "cache/cache_reservation_manager.h" +#include "db/db_test_util.h" +#include "options/options_helper.h" +#include "plugin/speedb/paired_filter/speedb_paired_bloom.h" +#include "port/stack_trace.h" +#include "rocksdb/advanced_options.h" +#include "rocksdb/convenience.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/filter_policy_internal.h" +#include "table/format.h" +#include "test_util/testutil.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +std::shared_ptr Create(double bits_per_key, + const std::string& name) { + if (name == SpdbPairedBloomFilterPolicy::kClassName()) { + return std::make_shared(bits_per_key); + } else { + return nullptr; + } +} +const std::string kSpdbPairedBloom = SpdbPairedBloomFilterPolicy::kClassName(); + +} // namespace + +// DB tests related to Speedb's Paired Block Bloom Filter. 
+ +class SpdbDBBloomFilterTest : public DBTestBase { + public: + SpdbDBBloomFilterTest() + : DBTestBase("speedb_db_bloom_filter_test", /*env_do_fsync=*/true) {} +}; + +class SpdbDBBloomFilterTestWithParam + : public DBTestBase, + public testing::WithParamInterface> { + protected: + bool partition_filters_; + + public: + SpdbDBBloomFilterTestWithParam() + : DBTestBase("speedb_db_bloom_filter_test", /*env_do_fsync=*/true) {} + + ~SpdbDBBloomFilterTestWithParam() override {} + + void SetUp() override { partition_filters_ = std::get<0>(GetParam()); } +}; + +class SpdbDBBloomFilterTestDefFormatVersion + : public SpdbDBBloomFilterTestWithParam {}; + +class SliceTransformLimitedDomainGeneric : public SliceTransform { + static constexpr size_t kPrefixLen = 5U; + + const char* Name() const override { + return "SliceTransformLimitedDomainGeneric"; + } + + Slice Transform(const Slice& src) const override { + return Slice(src.data(), kPrefixLen); + } + + bool InDomain(const Slice& src) const override { + // prefix will be x???? + return src.size() >= kPrefixLen; + } + + bool InRange(const Slice& dst) const override { + // prefix will be x???? + return dst.size() == kPrefixLen; + } +}; + +// KeyMayExist can lead to a few false positives, but not false negatives. 
+// To make test deterministic, use a much larger number of bits per key-20 than +// bits in the key, so that false positives are eliminated +TEST_P(SpdbDBBloomFilterTestDefFormatVersion, KeyMayExist) { + do { + ReadOptions ropts; + std::string value; + anon::OptionsOverride options_override; + options_override.filter_policy = Create(20, kSpdbPairedBloom); + options_override.partition_filters = partition_filters_; + options_override.metadata_block_size = 32; + Options options = CurrentOptions(options_override); + if (partition_filters_) { + auto* table_options = + options.table_factory->GetOptions(); + if (table_options != nullptr && + table_options->index_type != + BlockBasedTableOptions::kTwoLevelIndexSearch) { + // In the current implementation partitioned filters depend on + // partitioned indexes + continue; + } + } + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + + ASSERT_OK(Put(1, "a", "b")); + bool value_found = false; + ASSERT_TRUE( + db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); + ASSERT_TRUE(value_found); + ASSERT_EQ("b", value); + + ASSERT_OK(Flush(1)); + value.clear(); + + uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS); + uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE( + db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); + ASSERT_TRUE(!value_found); + // assert that no new files were opened and no new blocks were + // read into block cache. 
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_OK(Delete(1, "a")); + + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow trivial move */)); + + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + ASSERT_OK(Delete(1, "c")); + + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); + ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + // KeyMayExist function only checks data in block caches, which is not used + // by plain table format. 
+ } while ( + ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction)); +} + +TEST_P(SpdbDBBloomFilterTestWithParam, + GetFilterByPrefixBloomCustomPrefixExtractor) { + Options options = last_options_; + options.prefix_extractor = + std::make_shared(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(new SpdbPairedBloomFilterPolicy(20)); + if (partition_filters_) { + bbto.partition_filters = true; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + fo.wait = true; + std::string value; + + ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); + ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); + + ASSERT_OK(dbfull()->Flush(fo)); + + ASSERT_EQ("foo", Get("barbarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ( + 0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + ASSERT_EQ("foo2", Get("barbarbar2")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ( + 0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ( + 0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + + ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ( + 1, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); + ASSERT_EQ( + 2, 
(*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + + ro.total_order_seek = true; + // NOTE: total_order_seek no longer affects Get() + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); + ASSERT_EQ( + 3, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + + // No bloom on extractor changed +#ifndef ROCKSDB_LITE + ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}})); + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); + ASSERT_EQ( + 3, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); +#endif // ROCKSDB_LITE + + get_perf_context()->Reset(); +} + +TEST_P(SpdbDBBloomFilterTestWithParam, GetFilterByPrefixBloom) { + Options options = last_options_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + get_perf_context()->EnablePerLevelPerfContext(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(new SpdbPairedBloomFilterPolicy(20)); + if (partition_filters_) { + bbto.partition_filters = true; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + fo.wait = true; + std::string value; + + ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); + ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); + + ASSERT_OK(dbfull()->Flush(fo)); + + ASSERT_EQ("foo", Get("barbarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("foo2", Get("barbarbar2")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); + ASSERT_EQ(TestGetTickerCount(options, 
BLOOM_FILTER_USEFUL), 0); + + ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); + + ro.total_order_seek = true; + // NOTE: total_order_seek no longer affects Get() + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); + ASSERT_EQ( + 3, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + + // No bloom on extractor changed +#ifndef ROCKSDB_LITE + ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}})); + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); + ASSERT_EQ( + 3, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); +#endif // ROCKSDB_LITE + + get_perf_context()->Reset(); +} + +TEST_P(SpdbDBBloomFilterTestWithParam, WholeKeyFilterProp) { + for (bool partition_filters : {true, false}) { + Options options = last_options_; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + get_perf_context()->EnablePerLevelPerfContext(); + + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(new SpdbPairedBloomFilterPolicy(20)); + bbto.whole_key_filtering = false; + if (partition_filters) { + bbto.partition_filters = true; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + fo.wait = true; + std::string value; + + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. 
+ ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + ASSERT_OK(dbfull()->Flush(fo)); + + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + + // Reopen with whole key filtering enabled and prefix extractor + // NULL. Bloom filter should be off for both of whole key and + // prefix bloom. + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.prefix_extractor.reset(); + Reopen(options); + + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + // Write DB with only full key filtering. + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Reopen with both of whole key off and prefix extractor enabled. + // Still no bloom filter should be used. 
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + + // Try to create a DB with mixed files: + ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); + // Needs insert some keys to make sure files are not filtered out by key + // ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + options.prefix_extractor.reset(); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + // Try to create a DB with mixed files. + ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar")); + // In this case needs insert some keys to make sure files are + // not filtered out by key ranges. + ASSERT_OK(dbfull()->Put(wo, "aaa", "")); + ASSERT_OK(dbfull()->Put(wo, "zzz", "")); + ASSERT_OK(Flush()); + + // Now we have two files: + // File 1: An older file with prefix bloom. + // File 2: A newer file with whole bloom filter. 
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); + ASSERT_EQ("bar", Get("barfoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); + + // Reopen with the same setting: only whole key is used + Reopen(options); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); + ASSERT_EQ("bar", Get("barfoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); + + // Restart with both filters are allowed + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); + // File 1 will has it filtered out. + // File 2 will not, as prefix `foo` exists in the file. + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + ASSERT_EQ("bar", Get("barfoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + + // Restart with only prefix bloom is allowed. 
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + ASSERT_EQ("NOT_FOUND", Get("bar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); + ASSERT_EQ("foo", Get("foobar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); + ASSERT_EQ("bar", Get("barfoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); + uint64_t bloom_filter_useful_all_levels = 0; + for (auto& kv : (*(get_perf_context()->level_to_perf_context))) { + if (kv.second.bloom_filter_useful > 0) { + bloom_filter_useful_all_levels += kv.second.bloom_filter_useful; + } + } + ASSERT_EQ(12, bloom_filter_useful_all_levels); + get_perf_context()->Reset(); + } +} + +TEST_P(SpdbDBBloomFilterTestWithParam, BloomFilter) { + do { + Options options = CurrentOptions(); + env_->count_random_reads_ = true; + options.env = env_; + // ChangeCompactOptions() only changes compaction style, which does not + // trigger reset of table_factory + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + const auto kBpk = 20U; + const auto bytes_per_key = kBpk / 8; + table_options.filter_policy = Create(kBpk, kSpdbPairedBloom); + ASSERT_FALSE(table_options.filter_policy == nullptr); + table_options.partition_filters = partition_filters_; + if (partition_filters_) { + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } + ASSERT_GE(table_options.format_version, 5U); + // value delta encoding challenged more with index interval > 1 + table_options.index_block_restart_interval = 8; + table_options.metadata_block_size = 32; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + 
CreateAndReopenWithCF({"pikachu"}, options); + + // Populate multiple layers + const int N = 10000; + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + Compact(1, "a", "z"); + for (int i = 0; i < N; i += 100) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + ASSERT_OK(Flush(1)); + + // Prevent auto compactions triggered by seeks + env_->delay_sstable_sync_.store(true, std::memory_order_release); + + // Lookup present keys. Should rarely read from small sstable. + env_->random_read_counter_.Reset(); + for (int i = 0; i < N; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + int reads = env_->random_read_counter_.Read(); + fprintf(stderr, "%d present => %d reads\n", N, reads); + ASSERT_GE(reads, N); + if (partition_filters_) { + // Without block cache, we read an extra partition filter per each + // level*read and a partition index per each read + ASSERT_LE(reads, 4 * N + 2 * N / 100); + } else { + ASSERT_LE(reads, N + 2 * N / 100); + } + + // Lookup present keys. Should rarely read from either sstable. + env_->random_read_counter_.Reset(); + for (int i = 0; i < N; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing")); + } + reads = env_->random_read_counter_.Read(); + fprintf(stderr, "%d missing => %d reads\n", N, reads); + if (partition_filters_) { + // With partitioned filter we read one extra filter per level per each + // missed read. + ASSERT_LE(reads, 2 * N + 3 * N / 100); + } else { + ASSERT_LE(reads, 3 * N / 100); + } + +#ifndef ROCKSDB_LITE + // Sanity check some table properties + std::map props; + ASSERT_TRUE(db_->GetMapProperty( + handles_[1], DB::Properties::kAggregatedTableProperties, &props)); + uint64_t nkeys = N + N / 100; + uint64_t filter_size = ParseUint64(props["filter_size"]); + // TODO: Our Filter has a min size of 8192 bytes (64 X 128) => The upper + // limit depends on the number of filters + // => Adapt the caclulation + // // // EXPECT_LE(filter_size, + // // // (partition_filters_ ? 
12 : 11) * nkeys / /*bits / byte*/ + // 8); Always Bloom + EXPECT_GE(filter_size, static_cast(bytes_per_key * nkeys)); + + uint64_t num_filter_entries = ParseUint64(props["num_filter_entries"]); + EXPECT_EQ(num_filter_entries, nkeys); + + // // // fprintf(stderr, "filter_size:%d, num_filter_entries:%d, + // nkeys:%d\n", (int)filter_size, (int)num_filter_entries, (int)nkeys); +#endif // ROCKSDB_LITE + + env_->delay_sstable_sync_.store(false, std::memory_order_release); + Close(); + } while (ChangeCompactOptions()); +} + +namespace { + +class AlwaysTrueBitsBuilder : public FilterBitsBuilder { + public: + void AddKey(const Slice&) override {} + size_t EstimateEntriesAdded() override { return 0U; } + Slice Finish(std::unique_ptr* /* buf */) override { + // Interpreted as "always true" filter (0 probes over 1 byte of + // payload, 5 bytes metadata) + return Slice("\0\0\0\0\0\0", 6); + } + using FilterBitsBuilder::Finish; + size_t ApproximateNumEntries(size_t) override { return SIZE_MAX; } +}; + +class AlwaysTrueFilterPolicy : public ReadOnlyBuiltinFilterPolicy { + public: + explicit AlwaysTrueFilterPolicy(bool skip) : skip_(skip) {} + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const override { + if (skip_) { + return nullptr; + } else { + return new AlwaysTrueBitsBuilder(); + } + } + + private: + bool skip_; +}; + +} // namespace + +TEST_P(SpdbDBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) { + constexpr int maxKey = 10; + auto PutFn = [&]() { + int i; + // Put + for (i = 0; i < maxKey; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + } + Flush(); + }; + auto GetFn = [&]() { + int i; + // Get OK + for (i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(Key(i))); + } + // Get NotFound + for (; i < maxKey * 2; i++) { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + }; + auto PutAndGetFn = [&]() { + PutFn(); + GetFn(); + }; +#ifndef ROCKSDB_LITE + std::map props; + const auto& kAggTableProps = DB::Properties::kAggregatedTableProperties; 
+#endif // ROCKSDB_LITE + + Options options = CurrentOptions(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.partition_filters = partition_filters_; + if (partition_filters_) { + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } + + // Test 1: bits per key < 0.5 means skip filters -> no filter + // constructed or read. + table_options.filter_policy = Create(0.4, kSpdbPairedBloom); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + PutAndGetFn(); + + // Verify no filter access nor contruction + EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0); + EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0); + +#ifndef ROCKSDB_LITE + props.clear(); + ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); + EXPECT_EQ(props["filter_size"], "0"); +#endif // ROCKSDB_LITE + + // Test 2: use custom API to skip filters -> no filter constructed + // or read. + table_options.filter_policy.reset( + new AlwaysTrueFilterPolicy(/* skip */ true)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + PutAndGetFn(); + + // Verify no filter access nor construction + EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0); + EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0); + +#ifndef ROCKSDB_LITE + props.clear(); + ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); + EXPECT_EQ(props["filter_size"], "0"); +#endif // ROCKSDB_LITE + + // Control test: using an actual filter with 100% FP rate -> the filter + // is constructed and checked on read. 
+ table_options.filter_policy.reset( + new AlwaysTrueFilterPolicy(/* skip */ false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + PutAndGetFn(); + + // Verify filter is accessed (and constructed) + EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), + maxKey * 2); + EXPECT_EQ( + TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), + maxKey); +#ifndef ROCKSDB_LITE + props.clear(); + ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); + EXPECT_NE(props["filter_size"], "0"); +#endif // ROCKSDB_LITE + + // Test 3 (options test): Able to read existing filters with longstanding + // generated options file entry `filter_policy=rocksdb.BuiltinBloomFilter` + ASSERT_OK(FilterPolicy::CreateFromString(ConfigOptions(), + "rocksdb.BuiltinBloomFilter", + &table_options.filter_policy)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + GetFn(); + + // Verify filter is accessed + EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), + maxKey * 2); + EXPECT_EQ( + TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), + maxKey); + + // But new filters are not generated (configuration details unknown) + DestroyAndReopen(options); + PutAndGetFn(); + + // Verify no filter access nor construction + EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0); + EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0); + +#ifndef ROCKSDB_LITE + props.clear(); + ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); + EXPECT_EQ(props["filter_size"], "0"); +#endif // ROCKSDB_LITE +} + +INSTANTIATE_TEST_CASE_P(DBBloomFilterTestWithParam, + SpdbDBBloomFilterTestWithParam, + ::testing::Values(false, true)); + +#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +INSTANTIATE_TEST_CASE_P(FormatDef, SpdbDBBloomFilterTestDefFormatVersion, + ::testing::Values(false, 
true)); + +INSTANTIATE_TEST_CASE_P(FormatDef, SpdbDBBloomFilterTestWithParam, + ::testing::Values(false, true)); + +INSTANTIATE_TEST_CASE_P(FormatLatest, SpdbDBBloomFilterTestWithParam, + ::testing::Values(false, true)); +#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) + +TEST_F(SpdbDBBloomFilterTest, BloomFilterRate) { + while (ChangeFilterOptions()) { + anon::OptionsOverride options_override; + options_override.filter_policy = Create(20, kSpdbPairedBloom); + Options options = CurrentOptions(options_override); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + get_perf_context()->EnablePerLevelPerfContext(); + CreateAndReopenWithCF({"pikachu"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + // Add a large key to make the file contain wide range + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + + // Check if filter is useful + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); + } + ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98); + ASSERT_GE( + (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful, + maxKey * 0.98); + get_perf_context()->Reset(); + } +} + +namespace { +struct CompatibilityConfig { + std::shared_ptr policy; + bool partitioned; + uint32_t format_version; + + void SetInTableOptions(BlockBasedTableOptions* table_options) { + table_options->filter_policy = policy; + table_options->partition_filters = partitioned; + if (partitioned) { + table_options->index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } else { + table_options->index_type = + BlockBasedTableOptions::IndexType::kBinarySearch; + } + table_options->format_version = 
format_version; + } +}; +// // // // High bits per key -> almost no FPs +// // // std::shared_ptr kCompatibilityBloomPolicy{ +// // // NewBloomFilterPolicy(20)}; +// // // // bloom_before_level=-1 -> always use Ribbon +// // // std::shared_ptr kCompatibilityRibbonPolicy{ +// // // NewRibbonFilterPolicy(20, -1)}; + +// // // std::vector kCompatibilityConfigs = { +// // // {Create(20, kDeprecatedBlock), false, +// // // BlockBasedTableOptions().format_version}, +// // // {kCompatibilityBloomPolicy, false, +// BlockBasedTableOptions().format_version}, +// // // {kCompatibilityBloomPolicy, true, +// BlockBasedTableOptions().format_version}, +// // // {kCompatibilityBloomPolicy, false, /* legacy Bloom */ 4U}, +// // // {kCompatibilityRibbonPolicy, false, +// // // BlockBasedTableOptions().format_version}, +// // // {kCompatibilityRibbonPolicy, true, +// BlockBasedTableOptions().format_version}, +// // // }; +} // namespace + +// // // TEST_F(SpdbDBBloomFilterTest, BloomFilterCompatibility) { +// // // Options options = CurrentOptions(); +// // // options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); +// // // options.level0_file_num_compaction_trigger = +// // // static_cast(kCompatibilityConfigs.size()) + 1; +// // // options.max_open_files = -1; + +// // // Close(); + +// // // // Create one file for each kind of filter. Each file covers a +// distinct key +// // // // range. 
+// // // for (size_t i = 0; i < kCompatibilityConfigs.size(); ++i) { +// // // BlockBasedTableOptions table_options; +// // // kCompatibilityConfigs[i].SetInTableOptions(&table_options); +// // // ASSERT_TRUE(table_options.filter_policy != nullptr); +// // // options.table_factory.reset(NewBlockBasedTableFactory(table_options)); +// // // Reopen(options); + +// // // std::string prefix = ToString(i) + "_"; +// // // ASSERT_OK(Put(prefix + "A", "val")); +// // // ASSERT_OK(Put(prefix + "Z", "val")); +// // // ASSERT_OK(Flush()); +// // // } + +// // // // Test filter is used between each pair of {reader,writer} +// configurations, +// // // // because any built-in FilterPolicy should be able to read filters +// from any +// // // // other built-in FilterPolicy +// // // for (size_t i = 0; i < kCompatibilityConfigs.size(); ++i) { +// // // BlockBasedTableOptions table_options; +// // // kCompatibilityConfigs[i].SetInTableOptions(&table_options); +// // // options.table_factory.reset(NewBlockBasedTableFactory(table_options)); +// // // Reopen(options); +// // // for (size_t j = 0; j < kCompatibilityConfigs.size(); ++j) { +// // // std::string prefix = ToString(j) + "_"; +// // // ASSERT_EQ("val", Get(prefix + "A")); // Filter positive +// // // ASSERT_EQ("val", Get(prefix + "Z")); // Filter positive +// // // // Filter negative, with high probability +// // // ASSERT_EQ("NOT_FOUND", Get(prefix + "Q")); +// // // // FULL_POSITIVE does not include block-based filter case (j == +// 0) +// // // EXPECT_EQ(TestGetAndResetTickerCount(options, +// BLOOM_FILTER_FULL_POSITIVE), +// // // j == 0 ? 0 : 2); +// // // EXPECT_EQ(TestGetAndResetTickerCount(options, +// BLOOM_FILTER_USEFUL), 1); +// // // } +// // // } +// // // } + +/* + * A cache wrapper that tracks peaks and increments of filter + * construction cache reservation. 
+ *        p0
+ *       / \   p1
+ *      /   \  /\
+ *     /     \/  \
+ *  a /       b   \
+ * peaks = {p0, p1}
+ * increments = {p0-a, p1-b}
+ */
+class FilterConstructResPeakTrackingCache : public CacheWrapper {
+ public:
+  explicit FilterConstructResPeakTrackingCache(std::shared_ptr<Cache> target)
+      : CacheWrapper(std::move(target)),
+        cur_cache_res_(0),
+        cache_res_peak_(0),
+        cache_res_increment_(0),
+        last_peak_tracked_(false),
+        cache_res_increments_sum_(0) {}
+
+  using Cache::Insert;
+  Status Insert(const Slice& key, void* value, size_t charge,
+                void (*deleter)(const Slice& key, void* value),
+                Handle** handle = nullptr,
+                Priority priority = Priority::LOW) override {
+    Status s = target_->Insert(key, value, charge, deleter, handle, priority);
+    if (deleter == kNoopDeleterForFilterConstruction) {
+      if (last_peak_tracked_) {
+        cache_res_peak_ = 0;
+        cache_res_increment_ = 0;
+        last_peak_tracked_ = false;
+      }
+      cur_cache_res_ += charge;
+      cache_res_peak_ = std::max(cache_res_peak_, cur_cache_res_);
+      cache_res_increment_ += charge;
+    }
+    return s;
+  }
+
+  using Cache::Release;
+  bool Release(Handle* handle, bool erase_if_last_ref = false) override {
+    auto deleter = GetDeleter(handle);
+    if (deleter == kNoopDeleterForFilterConstruction) {
+      if (!last_peak_tracked_) {
+        cache_res_peaks_.push_back(cache_res_peak_);
+        cache_res_increments_sum_ += cache_res_increment_;
+        last_peak_tracked_ = true;
+      }
+      cur_cache_res_ -= GetCharge(handle);
+    }
+    bool is_successful = target_->Release(handle, erase_if_last_ref);
+    return is_successful;
+  }
+
+  std::deque<std::size_t> GetReservedCachePeaks() { return cache_res_peaks_; }
+
+  std::size_t GetReservedCacheIncrementSum() {
+    return cache_res_increments_sum_;
+  }
+
+ private:
+  static const Cache::DeleterFn kNoopDeleterForFilterConstruction;
+
+  std::size_t cur_cache_res_;
+  std::size_t cache_res_peak_;
+  std::size_t cache_res_increment_;
+  bool last_peak_tracked_;
+  std::deque<std::size_t> cache_res_peaks_;
+  std::size_t cache_res_increments_sum_;
+};
+
+const Cache::DeleterFn
+    FilterConstructResPeakTrackingCache::kNoopDeleterForFilterConstruction =
+        CacheReservationManagerImpl<
+            CacheEntryRole::kFilterConstruction>::TEST_GetNoopDeleterForRole();
+
+// To align with the type of hash entry being reserved in implementation.
+using FilterConstructionReserveMemoryHash = uint64_t;
+
+class DBFilterConstructionReserveMemoryTestWithParam
+    : public DBTestBase,
+      public testing::WithParamInterface<
+          std::tuple<CacheEntryRoleOptions::Decision, bool, bool>> {
+ public:
+  DBFilterConstructionReserveMemoryTestWithParam()
+      : DBTestBase("db_bloom_filter_tests",
+                   /*env_do_fsync=*/true),
+        num_key_(0),
+        charge_filter_construction_(std::get<0>(GetParam())),
+        partition_filters_(std::get<1>(GetParam())),
+        detect_filter_construct_corruption_(std::get<2>(GetParam())) {
+    if (charge_filter_construction_ ==
+        CacheEntryRoleOptions::Decision::kDisabled) {
+      // For these cases, we are only interested in whether filter
+      // construction cache reservation happens instead of its accuracy.
+      // Therefore we don't need many keys.
+      num_key_ = 5;
+    } else if (partition_filters_) {
+      // For PartitionFilter case, since we set
+      // table_options.metadata_block_size big enough such that each partition
+      // trigger at least 1 dummy entry reservation each for hash entries and
+      // final filter, we need a large number of keys to ensure we have at least
+      // two partitions.
+      num_key_ = 18 *
+                 CacheReservationManagerImpl<
+                     CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() /
+                 sizeof(FilterConstructionReserveMemoryHash);
+    } else {
+      // For Bloom Filter + FullFilter case, since we design the num_key_ to
+      // make hash entry cache reservation be a multiple of dummy entries, the
+      // correct behavior of charging final filter on top of it will trigger at
+      // least another dummy entry insertion. Therefore we can assert that
+      // behavior and we don't need a large number of keys to verify we
+      // indeed charge the final filter for cache reservation, even though final
+      // filter is a lot smaller than hash entries.
+      num_key_ = 1 *
+                 CacheReservationManagerImpl<
+                     CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() /
+                 sizeof(FilterConstructionReserveMemoryHash);
+
+      // TODO: Add support for this test for our filter !!!!!!!!!!!!!!!!!!
+    }
+  }
+
+  BlockBasedTableOptions GetBlockBasedTableOptions() {
+    BlockBasedTableOptions table_options;
+
+    // We set cache capacity big enough to prevent cache full for convenience in
+    // calculation.
+    constexpr std::size_t kCacheCapacity = 100 * 1024 * 1024;
+
+    table_options.cache_usage_options.options_overrides.insert(
+        {CacheEntryRole::kFilterConstruction,
+         {/*.charged = */ charge_filter_construction_}});
+    table_options.filter_policy = Create(10, kSpdbPairedBloom);
+    table_options.partition_filters = partition_filters_;
+    if (table_options.partition_filters) {
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+      // We set table_options.metadata_block_size big enough so that each
+      // partition trigger at least 1 dummy entry insertion each for hash
+      // entries and final filter.
+      table_options.metadata_block_size = 409000;
+    }
+    table_options.detect_filter_construct_corruption =
+        detect_filter_construct_corruption_;
+
+    LRUCacheOptions lo;
+    lo.capacity = kCacheCapacity;
+    lo.num_shard_bits = 0;  // 2^0 shard
+    lo.strict_capacity_limit = true;
+    cache_ = std::make_shared<FilterConstructResPeakTrackingCache>(
+        (NewLRUCache(lo)));
+    table_options.block_cache = cache_;
+
+    return table_options;
+  }
+
+  std::size_t GetNumKey() { return num_key_; }
+
+  CacheEntryRoleOptions::Decision ChargeFilterConstructMemory() {
+    return charge_filter_construction_;
+  }
+
+  bool PartitionFilters() { return partition_filters_; }
+
+  std::shared_ptr<FilterConstructResPeakTrackingCache>
+  GetFilterConstructResPeakTrackingCache() {
+    return cache_;
+  }
+
+ private:
+  std::size_t num_key_;
+  CacheEntryRoleOptions::Decision charge_filter_construction_;
+  bool partition_filters_;
+  std::shared_ptr<FilterConstructResPeakTrackingCache> cache_;
+  bool detect_filter_construct_corruption_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    DBFilterConstructionReserveMemoryTestWithParam,
+    DBFilterConstructionReserveMemoryTestWithParam,
+    ::testing::Values(
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, false,
+                        false),
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, false, true),
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, true, false),
+        std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, true,
+                        true)));
+
+// TODO: Speed up this test, and reduce disk space usage (~700MB)
+// The current test inserts many keys (on the scale of dummy entry size)
+// in order to make small memory user (e.g, final filter, partitioned hash
+// entries/filter/banding) , which is proportional to the number of
+// keys, big enough so that its cache reservation triggers dummy entry insertion
+// and becomes observable in the test.
+//
+// However, inserting that many keys slows down this test and leaves future
+// developers an opportunity to speed it up.
+//
+// Possible approaches & challenges:
+// 1.
Use sync point during cache reservation of filter construction
+//
+//    Benefit: It does not rely on triggering dummy entry insertion
+//    but the sync point to verify small memory user is charged correctly.
+//
+//    Challenge: this approach is intrusive.
+//
+// 2. Make dummy entry size configurable and set it small in the test
+//
+//    Benefit: It increases the precision of cache reservation and therefore
+//    small memory usage can still trigger insertion of dummy entry.
+//
+//    Challenge: change CacheReservationManager related APIs and a hack
+//    might be needed to control the size of dummy entry of
+//    CacheReservationManager used in filter construction for testing
+//    since CacheReservationManager is not exposed at the high level.
+//
+TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) {
+  // // // Options options = CurrentOptions();
+  // // // // We set write_buffer_size big enough so that in the case where
+  // there is
+  // // // // filter construction cache reservation, flush won't be triggered
+  // before we
+  // // // // manually trigger it for clean testing
+  // // // options.write_buffer_size = 640 << 20;
+  // // // BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  // // //
+  // options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  // // // std::shared_ptr<FilterConstructResPeakTrackingCache> cache =
+  // // //     GetFilterConstructResPeakTrackingCache();
+  // // // options.create_if_missing = true;
+  // // // // Disable auto compaction to prevent its unexpected side effect
+  // // // // to the number of keys per partition designed by us in the test
+  // // // options.disable_auto_compactions = true;
+  // // // DestroyAndReopen(options);
+  // // // int num_key = static_cast<int>(GetNumKey());
+  // // // for (int i = 0; i < num_key; i++) {
+  // // //   ASSERT_OK(Put(Key(i), Key(i)));
+  // // // }
+
+  // // // ASSERT_EQ(cache->GetReservedCacheIncrementSum(), 0)
+  // // //     << "Flush was triggered too early in the test case with filter "
+  // // //        "construction cache reservation - please make sure no flush
+  // triggered "
+  // // //        "during the key insertions above";
+
+  // // // ASSERT_OK(Flush());
+
+  // // // bool reserve_table_builder_memory = ReserveTableBuilderMemory();
+  // // // std::string policy = kSpdbPairedBloom;
+  // // // bool partition_filters = PartitionFilters();
+  // // // bool detect_filter_construct_corruption =
+  // // //     table_options.detect_filter_construct_corruption;
+
+  // // // std::deque<std::size_t> filter_construction_cache_res_peaks =
+  // // //     cache->GetReservedCachePeaks();
+  // // // std::size_t filter_construction_cache_res_increments_sum =
+  // // //     cache->GetReservedCacheIncrementSum();
+
+  // // // if (!reserve_table_builder_memory) {
+  // // //   EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0);
+  // // //   return;
+  // // // }
+
+  // // // const std::size_t kDummyEntrySize = CacheReservationManagerImpl<
+  // // //     CacheEntryRole::kFilterConstruction>::GetDummyEntrySize();
+
+  // // // const std::size_t predicted_hash_entries_cache_res =
+  // // //     num_key * sizeof(FilterConstructionReserveMemoryHash);
+  // // // ASSERT_EQ(predicted_hash_entries_cache_res % kDummyEntrySize, 0)
+  // // //     << "It's by this test's design that
+  // predicted_hash_entries_cache_res is "
+  // // //        "a multipe of dummy entry";
+
+  // // // const std::size_t predicted_hash_entries_cache_res_dummy_entry_num =
+  // // //     predicted_hash_entries_cache_res / kDummyEntrySize;
+  // // // const std::size_t predicted_final_filter_cache_res =
+  // // //     static_cast<std::size_t>(
+  // // //         std::ceil(1.0 *
+  // predicted_hash_entries_cache_res_dummy_entry_num / 6 * 1)) *
+  // kDummyEntrySize;
+  // // // const std::size_t predicted_banding_cache_res =
+  // // //     static_cast<std::size_t>(
+  // // //         std::ceil(predicted_hash_entries_cache_res_dummy_entry_num
+  // * 2.5)) *
+  // // //     kDummyEntrySize;
+
+#if 0
+  if (policy == kFastLocalBloom) {
+    /* kFastLocalBloom + FullFilter
+     *        p0
+     *       /  \
+     *    b /    \
+     *     /      \
+     *    /        \
+     *  0/          \
+     *  hash entries = b - 0, final filter = p0 - b
+     *  p0 = hash entries + final filter
+     *
+     *  The test is designed in a way such that the reservation for b is a
+     *  multiple of dummy entries so that reservation for (p0 - b)
+     *  will trigger at least another dummy entry insertion.
+     *
+     * kFastLocalBloom + FullFilter +
+     * detect_filter_construct_corruption
+     *  The peak p0 stays the same as
+     *  (kFastLocalBloom + FullFilter) but just lasts
+     *  longer since we release hash entries reservation later.
+     *
+     * kFastLocalBloom + PartitionedFilter
+     *                   p1
+     *                  /  \
+     *        p0     b'/    \
+     *       /  \     /      \
+     *    b /    \   /        \
+     *     /      \ /          \
+     *    /        a            \
+     *  0/                       \
+     *  partitioned hash entries1 = b - 0, partitioned hash entries2 = b' - a
+     *  partitioned final filter1 = p0 - b, partitioned final filter2 = p1 - b'
+     *
+     *  (increment p0 - 0) + (increment p1 - a)
+     *  = partitioned hash entries1 + partitioned hash entries2
+     *  + partitioned final filter1 + partitioned final filter2
+     *  = hash entries + final filter
+     *
+     * kFastLocalBloom + PartitionedFilter +
+     * detect_filter_construct_corruption
+     *  The peak p0, p1 stay the same as
+     *  (kFastLocalBloom + PartitionedFilter) but just
+     *  last longer since we release hash entries reservation later.
+     *
+     */
+    if (!partition_filters) {
+      EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1)
+          << "Filter construction cache reservation should have only 1 peak in "
+             "case: kFastLocalBloom + FullFilter";
+      std::size_t filter_construction_cache_res_peak =
+          filter_construction_cache_res_peaks[0];
+      EXPECT_GT(filter_construction_cache_res_peak,
+                predicted_hash_entries_cache_res)
+          << "The testing number of hash entries is designed to make hash "
+             "entries cache reservation be multiples of dummy entries"
+             " so the correct behavior of charging final filter on top of it"
+             " should've triggered at least another dummy entry insertion";
+
+      std::size_t predicted_filter_construction_cache_res_peak =
+          predicted_hash_entries_cache_res + predicted_final_filter_cache_res;
+      EXPECT_GE(filter_construction_cache_res_peak,
+                predicted_filter_construction_cache_res_peak * 0.9);
+      EXPECT_LE(filter_construction_cache_res_peak,
+                predicted_filter_construction_cache_res_peak * 1.1);
+      return;
+    } else {
+      EXPECT_GE(filter_construction_cache_res_peaks.size(), 2)
+          << "Filter construction cache reservation should have multiple peaks "
+             "in case: kFastLocalBloom + "
+             "PartitionedFilter";
+      std::size_t predicted_filter_construction_cache_res_increments_sum =
+          predicted_hash_entries_cache_res + predicted_final_filter_cache_res;
+      EXPECT_GE(filter_construction_cache_res_increments_sum,
+                predicted_filter_construction_cache_res_increments_sum * 0.9);
+      EXPECT_LE(filter_construction_cache_res_increments_sum,
+                predicted_filter_construction_cache_res_increments_sum * 1.1);
+      return;
+    }
+  }
+#endif
+}
+
+class DBFilterConstructionCorruptionTestWithParam
+    : public DBTestBase,
+      public testing::WithParamInterface<
+          std::tuple<bool, bool>> {
+ public:
+  DBFilterConstructionCorruptionTestWithParam()
+      : DBTestBase("db_bloom_filter_tests",
+                   /*env_do_fsync=*/true) {}
+
+  BlockBasedTableOptions GetBlockBasedTableOptions() {
+    BlockBasedTableOptions table_options;
+    table_options.detect_filter_construct_corruption = std::get<0>(GetParam());
+    table_options.filter_policy = Create(20, kSpdbPairedBloom);
+    table_options.partition_filters = std::get<1>(GetParam());
+    if (table_options.partition_filters) {
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+      // We set table_options.metadata_block_size small enough so we can
+      // trigger filter partitioning with GetNumKey() amount of keys
+      table_options.metadata_block_size = 10;
+    }
+
+    return table_options;
+  }
+
+  // Return an appropriate amount of keys for testing
+  // to generate a long filter (i.e, size >= 8 + kMetadataLen)
+  std::size_t GetNumKey() { return 5000; }
+};
+
+INSTANTIATE_TEST_CASE_P(DBFilterConstructionCorruptionTestWithParam,
+                        DBFilterConstructionCorruptionTestWithParam,
+                        ::testing::Values(std::make_tuple(false, false),
+                                          std::make_tuple(true, false),
+                                          std::make_tuple(true, true)));
+
+TEST_P(DBFilterConstructionCorruptionTestWithParam, DetectCorruption) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+
+  DestroyAndReopen(options);
+  int num_key = static_cast<int>(GetNumKey());
+  Status s;
+
+  // Case 1: No corruption in filter construction
+  for (int i = 0; i < num_key; i++) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+  }
+  s = Flush();
+  EXPECT_TRUE(s.ok());
+
+  // Case 2: Corruption of hash entries in filter construction
+  for (int i = 0; i < num_key; i++) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+  }
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) {
+        std::deque<uint64_t>* hash_entries_to_corrupt =
+            static_cast<std::deque<uint64_t>*>(arg);
+        assert(!hash_entries_to_corrupt->empty());
+        *(hash_entries_to_corrupt->begin()) =
+            *(hash_entries_to_corrupt->begin()) ^ uint64_t{1};
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  s = Flush();
+
+  if (table_options.detect_filter_construct_corruption) {
+    EXPECT_TRUE(s.IsCorruption());
+    EXPECT_TRUE(
+        s.ToString().find("Filter's hash entries checksum mismatched") !=
+        std::string::npos);
+  } else {
+    EXPECT_TRUE(s.ok());
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearCallBack(
+      "XXPH3FilterBitsBuilder::Finish::"
+      "TamperHashEntries");
+
+  // Case 3: Corruption of filter content in filter construction
+  DestroyAndReopen(options);
+
+  for (int i = 0; i < num_key; i++) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+  }
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "XXPH3FilterBitsBuilder::Finish::TamperFilter", [&](void* arg) {
+        std::pair<std::unique_ptr<char[]>*, std::size_t>* TEST_arg_pair =
+            static_cast<std::pair<std::unique_ptr<char[]>*, std::size_t>*>(arg);
+        std::size_t filter_size = TEST_arg_pair->second;
+        // 5 is the kMetadataLen; 8 is the number of bytes zeroed below
+        assert(filter_size >= 8 + 5);
+        std::unique_ptr<char[]>* filter_content_to_corrupt =
+            TEST_arg_pair->first;
+        std::memset(filter_content_to_corrupt->get(), '\0', 8);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  s = Flush();
+
+  if (table_options.detect_filter_construct_corruption) {
+    EXPECT_TRUE(s.IsCorruption());
+    EXPECT_TRUE(s.ToString().find("Corrupted filter content") !=
+                std::string::npos);
+  } else {
+    EXPECT_TRUE(s.ok());
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearCallBack(
+      "XXPH3FilterBitsBuilder::Finish::"
+      "TamperFilter");
+}
+
+// RocksDB lite does not support dynamic options
+#ifndef ROCKSDB_LITE
+TEST_P(DBFilterConstructionCorruptionTestWithParam,
+       DynamicallyTurnOnAndOffDetectConstructCorruption) {
+  Options options = CurrentOptions();
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  // We intend to turn on
+  // table_options.detect_filter_construct_corruption dynamically
+  // therefore we override this test parameter's value
+  table_options.detect_filter_construct_corruption = false;
+
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.create_if_missing = true;
+
+  int num_key = static_cast<int>(GetNumKey());
+  Status s;
+
+  DestroyAndReopen(options);
+
+  // Case 1: !table_options.detect_filter_construct_corruption
+  for (int i = 0; i < num_key; i++) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+  }
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) {
+        std::deque<uint64_t>* hash_entries_to_corrupt =
+            static_cast<std::deque<uint64_t>*>(arg);
+        assert(!hash_entries_to_corrupt->empty());
+        *(hash_entries_to_corrupt->begin()) =
+            *(hash_entries_to_corrupt->begin()) ^ uint64_t{1};
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  s = Flush();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearCallBack(
+      "XXPH3FilterBitsBuilder::Finish::"
+      "TamperHashEntries");
+
+  ASSERT_FALSE(table_options.detect_filter_construct_corruption);
+  EXPECT_TRUE(s.ok());
+
+  // Case 2: dynamically turn on
+  // table_options.detect_filter_construct_corruption
+  ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+                              "{detect_filter_construct_corruption=true;}"}}));
+
+  for (int i = 0; i < num_key; i++) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+  }
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) {
+        std::deque<uint64_t>* hash_entries_to_corrupt =
+            static_cast<std::deque<uint64_t>*>(arg);
+        assert(!hash_entries_to_corrupt->empty());
+        *(hash_entries_to_corrupt->begin()) =
+            *(hash_entries_to_corrupt->begin()) ^ uint64_t{1};
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  s = Flush();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearCallBack(
+      "XXPH3FilterBitsBuilder::Finish::"
+      "TamperHashEntries");
+
+  auto updated_table_options =
+      db_->GetOptions().table_factory->GetOptions<BlockBasedTableOptions>();
+  EXPECT_TRUE(updated_table_options->detect_filter_construct_corruption);
+  EXPECT_TRUE(s.IsCorruption());
+  EXPECT_TRUE(s.ToString().find("Filter's hash entries checksum mismatched") !=
+              std::string::npos);
+
+  // Case 3: dynamically turn off
+  // table_options.detect_filter_construct_corruption
+  ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+                              "{detect_filter_construct_corruption=false;}"}}));
+  updated_table_options =
+      db_->GetOptions().table_factory->GetOptions<BlockBasedTableOptions>();
+  EXPECT_FALSE(updated_table_options->detect_filter_construct_corruption);
+}
+#endif  // ROCKSDB_LITE
+
+namespace {
+// // // // NOTE: This class is referenced by HISTORY.md as a model for a
+// wrapper
+// // // // FilterPolicy selecting among configurations based on context.
+// // // class LevelAndStyleCustomFilterPolicy : public FilterPolicy {
+// // //  public:
+// // //   explicit LevelAndStyleCustomFilterPolicy(int bpk_fifo, int
+// bpk_l0_other,
+// // //                                            int bpk_otherwise)
+// // //       : policy_fifo_(NewBloomFilterPolicy(bpk_fifo)),
+// // //         policy_l0_other_(NewBloomFilterPolicy(bpk_l0_other)),
+// // //         policy_otherwise_(NewBloomFilterPolicy(bpk_otherwise)) {}
+
+// // //   const char* Name() const override {
+// // //     return "LevelAndStyleCustomFilterPolicy";
+// // //   }
+
+// // //   // OK to use built-in policy name because we are deferring to a
+// // //   // built-in builder. We aren't changing the serialized format.
+// // // const char* CompatibilityName() const override { +// // // return policy_fifo_->CompatibilityName(); +// // // } + +// // // FilterBitsBuilder* GetBuilderWithContext( +// // // const FilterBuildingContext& context) const override { +// // // if (context.compaction_style == kCompactionStyleFIFO) { +// // // return policy_fifo_->GetBuilderWithContext(context); +// // // } else if (context.level_at_creation == 0) { +// // // return policy_l0_other_->GetBuilderWithContext(context); +// // // } else { +// // // return policy_otherwise_->GetBuilderWithContext(context); +// // // } +// // // } + +// // // FilterBitsReader* GetFilterBitsReader(const Slice& contents) const +// override { +// // // // OK to defer to any of them; they all can parse built-in filters +// // // // from any settings. +// // // return policy_fifo_->GetFilterBitsReader(contents); +// // // } + +// // // private: +// // // const std::unique_ptr policy_fifo_; +// // // const std::unique_ptr policy_l0_other_; +// // // const std::unique_ptr policy_otherwise_; +// // // }; + +// // // static std::map +// // // table_file_creation_reason_to_string{ +// // // {TableFileCreationReason::kCompaction, "kCompaction"}, +// // // {TableFileCreationReason::kFlush, "kFlush"}, +// // // {TableFileCreationReason::kMisc, "kMisc"}, +// // // {TableFileCreationReason::kRecovery, "kRecovery"}, +// // // }; + +// // // class TestingContextCustomFilterPolicy +// // // : public LevelAndStyleCustomFilterPolicy { +// // // public: +// // // explicit TestingContextCustomFilterPolicy(int bpk_fifo, int +// bpk_l0_other, +// // // int bpk_otherwise) +// // // : LevelAndStyleCustomFilterPolicy(bpk_fifo, bpk_l0_other, +// bpk_otherwise) { +// // // } + +// // // FilterBitsBuilder* GetBuilderWithContext( +// // // const FilterBuildingContext& context) const override { +// // // test_report_ += "cf="; +// // // test_report_ += context.column_family_name; +// // // test_report_ += ",s="; +// // // test_report_ += +// // // 
OptionsHelper::compaction_style_to_string[context.compaction_style]; +// // // test_report_ += ",n="; +// // // test_report_ += ROCKSDB_NAMESPACE::ToString(context.num_levels); +// // // test_report_ += ",l="; +// // // test_report_ += +// ROCKSDB_NAMESPACE::ToString(context.level_at_creation); +// // // test_report_ += ",b="; +// // // test_report_ += +// ROCKSDB_NAMESPACE::ToString(int{context.is_bottommost}); +// // // test_report_ += ",r="; +// // // test_report_ += +// table_file_creation_reason_to_string[context.reason]; +// // // test_report_ += "\n"; + +// // // return +// LevelAndStyleCustomFilterPolicy::GetBuilderWithContext(context); +// // // } + +// // // std::string DumpTestReport() { +// // // std::string rv; +// // // std::swap(rv, test_report_); +// // // return rv; +// // // } + +// // // private: +// // // mutable std::string test_report_; +// // // }; +} // namespace + +// // // TEST_F(SpdbDBBloomFilterTest, ContextCustomFilterPolicy) { +// // // auto policy = std::make_shared(15, +// 8, 5); +// // // Options options; +// // // for (bool fifo : {true, false}) { +// // // options = CurrentOptions(); +// // // options.max_open_files = fifo ? -1 : options.max_open_files; +// // // options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); +// // // options.compaction_style = +// // // fifo ? kCompactionStyleFIFO : kCompactionStyleLevel; + +// // // BlockBasedTableOptions table_options; +// // // table_options.filter_policy = policy; +// // // table_options.format_version = 5; +// // // options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + +// // // TryReopen(options); +// // // CreateAndReopenWithCF({fifo ? 
"abe" : "bob"}, options); + +// // // const int maxKey = 10000; +// // // for (int i = 0; i < maxKey / 2; i++) { +// // // ASSERT_OK(Put(1, Key(i), Key(i))); +// // // } +// // // // Add a large key to make the file contain wide range +// // // ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); +// // // Flush(1); +// // // EXPECT_EQ(policy->DumpTestReport(), +// // // fifo ? +// "cf=abe,s=kCompactionStyleFIFO,n=1,l=0,b=0,r=kFlush\n" +// // // : +// "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n"); + +// // // for (int i = maxKey / 2; i < maxKey; i++) { +// // // ASSERT_OK(Put(1, Key(i), Key(i))); +// // // } +// // // Flush(1); +// // // EXPECT_EQ(policy->DumpTestReport(), +// // // fifo ? +// "cf=abe,s=kCompactionStyleFIFO,n=1,l=0,b=0,r=kFlush\n" +// // // : +// "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n"); + +// // // // Check that they can be found +// // // for (int i = 0; i < maxKey; i++) { +// // // ASSERT_EQ(Key(i), Get(1, Key(i))); +// // // } +// // // // Since we have two tables / two filters, we might have Bloom +// checks on +// // // // our queries, but no more than one "useful" per query on a found +// key. +// // // EXPECT_LE(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), +// maxKey); + +// // // // Check that we have two filters, each about +// // // // fifo: 0.12% FP rate (15 bits per key) +// // // // level: 2.3% FP rate (8 bits per key) +// // // for (int i = 0; i < maxKey; i++) { +// // // ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); +// // // } +// // // { +// // // auto useful_count = +// // // TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL); +// // // EXPECT_GE(useful_count, maxKey * 2 * (fifo ? 0.9980 : 0.975)); +// // // EXPECT_LE(useful_count, maxKey * 2 * (fifo ? 
0.9995 : 0.98));
+// // //     }
+
+// // //     if (!fifo) {  // FIFO only has L0
+// // //       // Full compaction
+// // //       ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[1],
+// nullptr,
+// // //                                   nullptr));
+// // //       EXPECT_EQ(policy->DumpTestReport(),
+// // //                 "cf=bob,s=kCompactionStyleLevel,n=7,l=1,b=1,r=kCompaction\n");
+
+// // //       // Check that we now have one filter, about 9.2% FP rate (5 bits
+// per key)
+// // //       for (int i = 0; i < maxKey; i++) {
+// // //         ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+// // //       }
+// // //       {
+// // //         auto useful_count =
+// // //             TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+// // //         EXPECT_GE(useful_count, maxKey * 0.90);
+// // //         EXPECT_LE(useful_count, maxKey * 0.91);
+// // //       }
+// // //     } else {
+// // // #ifndef ROCKSDB_LITE
+// // //       // Also try external SST file
+// // //       {
+// // //         std::string file_path = dbname_ + "/external.sst";
+// // //         SstFileWriter sst_file_writer(EnvOptions(), options,
+// handles_[1]);
+// // //         ASSERT_OK(sst_file_writer.Open(file_path));
+// // //         ASSERT_OK(sst_file_writer.Put("key", "value"));
+// // //         ASSERT_OK(sst_file_writer.Finish());
+// // //       }
+// // //       // Note: kCompactionStyleLevel is default, ignored if num_levels
+// == -1
+// // //       EXPECT_EQ(policy->DumpTestReport(),
+// // //                 "cf=abe,s=kCompactionStyleLevel,n=-1,l=-1,b=0,r=kMisc\n");
+// // // #endif
+// // //     }
+
+// // //     // Destroy
+// // //     ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+// // //     ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+// // //     handles_[1] = nullptr;
+// // //   }
+// // // }
+
+class SliceTransformLimitedDomain : public SliceTransform {
+  const char* Name() const override { return "SliceTransformLimitedDomain"; }
+
+  Slice Transform(const Slice& src) const override {
+    return Slice(src.data(), 5);
+  }
+
+  bool InDomain(const Slice& src) const override {
+    // prefix will be x????
+    return src.size() >= 5 && src[0] == 'x';
+  }
+
+  bool InRange(const Slice& dst) const override {
+    // prefix will be x????
+    return dst.size() == 5 && dst[0] == 'x';
+  }
+};
+
+TEST_F(SpdbDBBloomFilterTest, PrefixExtractorFullFilter) {
+  BlockBasedTableOptions bbto;
+  // Full Filter Block
+  bbto.filter_policy.reset(new SpdbPairedBloomFilterPolicy(20));
+  bbto.whole_key_filtering = false;
+
+  Options options = CurrentOptions();
+  options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("x1111_AAAA", "val1"));
+  ASSERT_OK(Put("x1112_AAAA", "val2"));
+  ASSERT_OK(Put("x1113_AAAA", "val3"));
+  ASSERT_OK(Put("x1114_AAAA", "val4"));
+  // Not in domain, won't be added to filter
+  ASSERT_OK(Put("zzzzz_AAAA", "val5"));
+
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ(Get("x1111_AAAA"), "val1");
+  ASSERT_EQ(Get("x1112_AAAA"), "val2");
+  ASSERT_EQ(Get("x1113_AAAA"), "val3");
+  ASSERT_EQ(Get("x1114_AAAA"), "val4");
+  // Was not added to filter but rocksdb will try to read it from the filter
+  ASSERT_EQ(Get("zzzzz_AAAA"), "val5");
+}
+
+TEST_F(SpdbDBBloomFilterTest, PrefixExtractorBlockFilter) {
+  BlockBasedTableOptions bbto;
+  // Full Filter Block
+  bbto.filter_policy.reset(new SpdbPairedBloomFilterPolicy(20));
+  Options options = CurrentOptions();
+  options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("x1113_AAAA", "val3"));
+  ASSERT_OK(Put("x1114_AAAA", "val4"));
+  // Not in domain, won't be added to filter
+  ASSERT_OK(Put("zzzzz_AAAA", "val1"));
+  ASSERT_OK(Put("zzzzz_AAAB", "val2"));
+  ASSERT_OK(Put("zzzzz_AAAC", "val3"));
+  ASSERT_OK(Put("zzzzz_AAAD", "val4"));
+
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> iter_res;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+  // Seek to a key that was not in Domain
+  for (iter->Seek("zzzzz_AAAA"); iter->Valid(); iter->Next()) {
+    iter_res.emplace_back(iter->value().ToString());
+  }
+  ASSERT_OK(iter->status());
+
+  std::vector<std::string> expected_res = {"val1", "val2", "val3", "val4"};
+  ASSERT_EQ(iter_res, expected_res);
+}
+
+TEST_F(SpdbDBBloomFilterTest, MemtableWholeKeyBloomFilter) {
+  // regression test for #2743. the range delete tombstones in memtable should
+  // be added even when Get() skips searching due to its prefix bloom filter
+  const int kMemtableSize = 1 << 20;              // 1MB
+  const int kMemtablePrefixFilterSize = 1 << 13;  // 8KB
+  const int kPrefixLen = 4;
+  anon::OptionsOverride options_override;
+  options_override.filter_policy = Create(20, kSpdbPairedBloom);
+  Options options = CurrentOptions(options_override);
+  options.memtable_prefix_bloom_size_ratio =
+      static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
+  options.prefix_extractor.reset(
+      ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen));
+  options.write_buffer_size = kMemtableSize;
+  options.memtable_whole_key_filtering = false;
+  Reopen(options);
+  std::string key1("AAAABBBB");
+  std::string key2("AAAACCCC");  // not in DB
+  std::string key3("AAAADDDD");
+  std::string key4("AAAAEEEE");
+  std::string value1("Value1");
+  std::string value3("Value3");
+  std::string value4("Value4");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+
+  // check memtable bloom stats
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+  // same prefix, bloom filter false positive
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+
+  // enable whole key bloom filter
+  options.memtable_whole_key_filtering = true;
+  Reopen(options);
+  // check memtable bloom stats
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  // whole key bloom filter kicks in and determines it's a miss
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+
+  // verify whole key filtering does not depend on prefix_extractor
options.prefix_extractor.reset(); + Reopen(options); + // check memtable bloom stats + ASSERT_OK(Put(key4, value4, WriteOptions())); + ASSERT_EQ("NOT_FOUND", Get(key2)); + // whole key bloom filter kicks in and determines it's a miss + ASSERT_EQ(2, get_perf_context()->bloom_memtable_miss_count); + ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); +} + +TEST_F(SpdbDBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) { + anon::OptionsOverride options_override; + options_override.filter_policy = Create(20, kSpdbPairedBloom); + Options options = CurrentOptions(options_override); + options.memtable_prefix_bloom_size_ratio = 0.015; + options.memtable_whole_key_filtering = true; + Reopen(options); + std::string key1("AA"); + std::string key2("BB"); + std::string key3("CC"); + std::string key4("DD"); + std::string key_not("EE"); + std::string value1("Value1"); + std::string value2("Value2"); + std::string value3("Value3"); + std::string value4("Value4"); + + ASSERT_OK(Put(key1, value1, WriteOptions())); + ASSERT_OK(Put(key2, value2, WriteOptions())); + ASSERT_OK(Flush()); + ASSERT_OK(Put(key3, value3, WriteOptions())); + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(Put(key4, value4, WriteOptions())); + + // Delete key2 and key3 + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "BA", "CZ")); + + // Read without snapshot + auto results = MultiGet({key_not, key1, key2, key3, key4}); + ASSERT_EQ(results[0], "NOT_FOUND"); + ASSERT_EQ(results[1], value1); + ASSERT_EQ(results[2], "NOT_FOUND"); + ASSERT_EQ(results[3], "NOT_FOUND"); + ASSERT_EQ(results[4], value4); + + // Also check Get + ASSERT_EQ(Get(key1), value1); + ASSERT_EQ(Get(key2), "NOT_FOUND"); + ASSERT_EQ(Get(key3), "NOT_FOUND"); + ASSERT_EQ(Get(key4), value4); + + // Read with snapshot + results = MultiGet({key_not, key1, key2, key3, key4}, snapshot); + ASSERT_EQ(results[0], "NOT_FOUND"); + ASSERT_EQ(results[1], value1); + ASSERT_EQ(results[2], value2); + 
ASSERT_EQ(results[3], value3); + ASSERT_EQ(results[4], "NOT_FOUND"); + + // Also check Get + ASSERT_EQ(Get(key1, snapshot), value1); + ASSERT_EQ(Get(key2, snapshot), value2); + ASSERT_EQ(Get(key3, snapshot), value3); + ASSERT_EQ(Get(key4, snapshot), "NOT_FOUND"); + + db_->ReleaseSnapshot(snapshot); +} + +TEST_F(SpdbDBBloomFilterTest, MemtablePrefixBloomOutOfDomain) { + constexpr size_t kPrefixSize = 8; + const std::string kKey = "key"; + assert(kKey.size() < kPrefixSize); + anon::OptionsOverride options_override; + options_override.filter_policy = Create(20, kSpdbPairedBloom); + Options options = CurrentOptions(options_override); + options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize)); + options.memtable_prefix_bloom_size_ratio = 0.25; + Reopen(options); + ASSERT_OK(Put(kKey, "v")); + ASSERT_EQ("v", Get(kKey)); + std::unique_ptr iter(dbfull()->NewIterator(ReadOptions())); + iter->Seek(kKey); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(kKey, iter->key()); + iter->SeekForPrev(kKey); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(kKey, iter->key()); +} + +#ifndef ROCKSDB_LITE +namespace { +static const std::string kPlainTable = "test_PlainTableBloom"; +} // namespace + +class BloomStatsTestWithParam + : public SpdbDBBloomFilterTest, + public testing::WithParamInterface> { + public: + BloomStatsTestWithParam() { + partition_filters_ = std::get<1>(GetParam()); + + options_.create_if_missing = true; + options_.prefix_extractor.reset( + ROCKSDB_NAMESPACE::NewFixedPrefixTransform(4)); + options_.memtable_prefix_bloom_size_ratio = + 8.0 * 1024.0 / static_cast(options_.write_buffer_size); + BlockBasedTableOptions table_options; + if (partition_filters_) { + table_options.partition_filters = partition_filters_; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } + table_options.filter_policy = Create(10, kSpdbPairedBloom); + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options_.env = env_; + + 
get_perf_context()->Reset(); + DestroyAndReopen(options_); + } + + ~BloomStatsTestWithParam() override { + get_perf_context()->Reset(); + Destroy(options_); + } + + // Required if inheriting from testing::WithParamInterface<> + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + bool partition_filters_; + Options options_; +}; + +// 1 Insert 2 K-V pairs into DB +// 2 Call Get() for both keys - expect memtable bloom hit stat to be 2 +// 3 Call Get() for nonexisting key - expect memtable bloom miss stat to be 1 +// 4 Call Flush() to create SST +// 5 Call Get() for both keys - expect SST bloom hit stat to be 2 +// 6 Call Get() for nonexisting key - expect SST bloom miss stat to be 1 +// Test both: block and plain SST +TEST_P(BloomStatsTestWithParam, BloomStatsTest) { + std::string key1("AAAA"); + std::string key2("RXDB"); // not in DB + std::string key3("ZBRA"); + std::string value1("Value1"); + std::string value3("Value3"); + + ASSERT_OK(Put(key1, value1, WriteOptions())); + ASSERT_OK(Put(key3, value3, WriteOptions())); + + // check memtable bloom stats + ASSERT_EQ(value1, Get(key1)); + ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); + ASSERT_EQ(value3, Get(key3)); + ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count); + ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count); + + ASSERT_EQ("NOT_FOUND", Get(key2)); + ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count); + ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count); + + // sanity checks + ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count); + ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count); + + Flush(); + + // sanity checks + ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count); + ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count); + + // check SST bloom stats + ASSERT_EQ(value1, Get(key1)); + ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count); + ASSERT_EQ(value3, Get(key3)); + ASSERT_EQ(2, get_perf_context()->bloom_sst_hit_count); + +
ASSERT_EQ("NOT_FOUND", Get(key2)); + ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count); +} + +// Same scenario as in BloomStatsTest but using an iterator +TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { + std::string key1("AAAA"); + std::string key2("RXDB"); // not in DB + std::string key3("ZBRA"); + std::string value1("Value1"); + std::string value3("Value3"); + + ASSERT_OK(Put(key1, value1, WriteOptions())); + ASSERT_OK(Put(key3, value3, WriteOptions())); + + std::unique_ptr iter(dbfull()->NewIterator(ReadOptions())); + + // check memtable bloom stats + iter->Seek(key1); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(value1, iter->value().ToString()); + ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); + ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count); + + iter->Seek(key3); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(value3, iter->value().ToString()); + ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count); + ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count); + + iter->Seek(key2); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count); + ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count); + + Flush(); + + iter.reset(dbfull()->NewIterator(ReadOptions())); + + // Check SST bloom stats + iter->Seek(key1); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(value1, iter->value().ToString()); + ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count); + + iter->Seek(key3); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(value3, iter->value().ToString()); + // The seek doesn't check block-based bloom filter because last index key + // starts with the same prefix we're seeking to. 
+ uint64_t expected_hits = 2; + ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); + + iter->Seek(key2); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count); + ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); +} + +// // INSTANTIATE_TEST_CASE_P( +// // BloomStatsTestWithParam, BloomStatsTestWithParam, +// // ::testing::Values(false, true)); + +namespace { +void PrefixScanInit(SpdbDBBloomFilterTest* dbtest) { + char buf[100]; + std::string keystr; + const int small_range_sstfiles = 5; + const int big_range_sstfiles = 5; + + // Generate 11 sst files with the following prefix ranges. + // GROUP 0: [0,10] (level 1) + // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0) + // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0) + // + // A seek with the previous API would do 11 random I/Os (to all the + // files). With the new API and a prefix filter enabled, we should + // only do 2 random I/O, to the 2 files containing the key. 
+ + // GROUP 0 + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", 10); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + ASSERT_OK(dbtest->Flush()); + ASSERT_OK(dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, + nullptr)); // move to level 1 + + // GROUP 1 + for (int i = 1; i <= small_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", i); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", i + 1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + } + + // GROUP 2 + for (int i = 1; i <= big_range_sstfiles; i++) { + snprintf(buf, sizeof(buf), "%02d______:start", 0); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + snprintf(buf, sizeof(buf), "%02d______:end", small_range_sstfiles + i + 1); + keystr = std::string(buf); + ASSERT_OK(dbtest->Put(keystr, keystr)); + dbtest->Flush(); + } +} +} // namespace + +TEST_F(SpdbDBBloomFilterTest, PrefixScan) { + while (ChangeFilterOptions()) { + int count; + Slice prefix; + Slice key; + char buf[100]; + Iterator* iter; + snprintf(buf, sizeof(buf), "03______:"); + prefix = Slice(buf, 8); + key = Slice(buf, 9); + ASSERT_EQ(key.difference_offset(prefix), 8); + ASSERT_EQ(prefix.difference_offset(key), 8); + // db configs + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.disable_auto_compactions = true; + options.max_background_compactions = 2; + options.create_if_missing = true; + options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + assert(!options.unordered_write); + // It is incompatible with allow_concurrent_memtable_write=false + options.allow_concurrent_memtable_write = false; + + 
BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.filter_policy.reset(new SpdbPairedBloomFilterPolicy(20)); + table_options.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // 11 RAND I/Os + DestroyAndReopen(options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + iter = db_->NewIterator(ReadOptions()); + for (iter->Seek(prefix); iter->Valid(); iter->Next()) { + if (!iter->key().starts_with(prefix)) { + break; + } + count++; + } + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + Close(); + } // end of while +} + +// TODO: The filter builder is created always with OFFM = false, both for us and +// rocksdb. Is that how it's supposed to be? +TEST_F(SpdbDBBloomFilterTest, OptimizeFiltersForHits) { + Options options = CurrentOptions(); + options.write_buffer_size = 64 * 1024; + options.arena_block_size = 4 * 1024; + options.target_file_size_base = 64 * 1024; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.max_bytes_for_level_base = 256 * 1024; + options.max_write_buffer_number = 2; + options.max_background_compactions = 8; + options.max_background_flushes = 8; + options.compression = kNoCompression; + options.compaction_style = kCompactionStyleLevel; + options.level_compaction_dynamic_level_bytes = true; + BlockBasedTableOptions bbto; + bbto.cache_index_and_filter_blocks = true; + bbto.filter_policy.reset(new SpdbPairedBloomFilterPolicy(20)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.optimize_filters_for_hits = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + CreateAndReopenWithCF({"mypikachu"}, options); + 
+ int numkeys = 200000; + + // Generate randomly shuffled keys, so the updates are almost + // random. + std::vector keys; + keys.reserve(numkeys); + for (int i = 0; i < numkeys; i += 2) { + keys.push_back(i); + } + RandomShuffle(std::begin(keys), std::end(keys)); + int num_inserted = 0; + for (int key : keys) { + ASSERT_OK(Put(1, Key(key), "val")); + if (++num_inserted % 1000 == 0) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + } + ASSERT_OK(Put(1, Key(0), "val")); + ASSERT_OK(Put(1, Key(numkeys), "val")); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + if (NumTableFilesAtLevel(0, 1) == 0) { + // No Level 0 file. Create one. + ASSERT_OK(Put(1, Key(0), "val")); + ASSERT_OK(Put(1, Key(numkeys), "val")); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + + for (int i = 1; i < numkeys; i += 2) { + ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND"); + } + + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + + // Now we have three sorted run, L0, L5 and L6 with most files in L6 have + // no bloom filter. Most keys be checked bloom filters twice. 
+ ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 65000 * 2); + ASSERT_LT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 120000 * 2); + uint64_t bloom_filter_useful_all_levels = 0; + for (auto& kv : (*(get_perf_context()->level_to_perf_context))) { + if (kv.second.bloom_filter_useful > 0) { + bloom_filter_useful_all_levels += kv.second.bloom_filter_useful; + } + } + ASSERT_GT(bloom_filter_useful_all_levels, 65000 * 2); + ASSERT_LT(bloom_filter_useful_all_levels, 120000 * 2); + + for (int i = 0; i < numkeys; i += 2) { + ASSERT_EQ(Get(1, Key(i)), "val"); + } + + // Part 2 (read path): rewrite last level with blooms, then verify they get + // cached only if !optimize_filters_for_hits + options.disable_auto_compactions = true; + options.num_levels = 9; + options.optimize_filters_for_hits = false; + options.statistics = CreateDBStatistics(); + bbto.block_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + MoveFilesToLevel(7 /* level */, 1 /* column family index */); + + std::string value = Get(1, Key(0)); + uint64_t prev_cache_filter_hits = + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + value = Get(1, Key(0)); + ASSERT_EQ(prev_cache_filter_hits + 1, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + // Now that we know the filter blocks exist in the last level files, see if + // filter caching is skipped for this optimization + options.optimize_filters_for_hits = true; + options.statistics = CreateDBStatistics(); + bbto.block_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + value = Get(1, Key(0)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(2 /* index and data block */, + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + + // Check filter 
block ignored for files preloaded during DB::Open() + options.max_open_files = -1; + options.statistics = CreateDBStatistics(); + bbto.block_cache.reset(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + uint64_t prev_cache_filter_misses = + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + Get(1, Key(0)); + ASSERT_EQ(prev_cache_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(prev_cache_filter_hits, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + + // Check filter block ignored for file trivially-moved to bottom level + bbto.block_cache.reset(); + options.max_open_files = 100; // setting > -1 makes it not preload all files + options.statistics = CreateDBStatistics(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + ASSERT_OK(Put(1, Key(numkeys + 1), "val")); + ASSERT_OK(Flush(1)); + + int32_t trivial_move = 0; + int32_t non_trivial_move = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial", + [&](void* /*arg*/) { non_trivial_move++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + CompactRangeOptions compact_options; + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kSkip; + compact_options.change_level = true; + compact_options.target_level = 7; + ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr) + .IsNotSupported()); + + ASSERT_EQ(trivial_move, 1); + ASSERT_EQ(non_trivial_move, 0); + + prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + prev_cache_filter_misses 
= + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + value = Get(1, Key(numkeys + 1)); + ASSERT_EQ(prev_cache_filter_hits, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(prev_cache_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + + // Check filter block not cached for iterator + bbto.block_cache.reset(); + options.statistics = CreateDBStatistics(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + ReopenWithColumnFamilies({"default", "mypikachu"}, options); + + std::unique_ptr iter(db_->NewIterator(ReadOptions(), handles_[1])); + iter->SeekToFirst(); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(2 /* index and data block */, + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + get_perf_context()->Reset(); +} + +int CountIter(std::unique_ptr& iter, const Slice& key) { + int count = 0; + for (iter->Seek(key); iter->Valid(); iter->Next()) { + count++; + } + EXPECT_OK(iter->status()); + return count; +} + +// use iterate_upper_bound to hint compatibility of existing bloom filters. +// The BF is considered compatible if 1) upper bound and seek key transform +// into the same string, or 2) the transformed seek key is of the same length +// as the upper bound and two keys are adjacent according to the comparator.
+TEST_F(SpdbDBBloomFilterTest, DynamicBloomFilterUpperBound) {
+  // Scenario: one SST is flushed while prefix_extractor is capped:4. The
+  // extractor is then switched at runtime (fixed:5, fixed:3, back to
+  // capped:4) and bounded prefix seeks are issued after each switch.
+  //
+  // The BLOOM_FILTER_PREFIX_CHECKED / BLOOM_FILTER_PREFIX_USEFUL values
+  // asserted below are running totals: options.statistics is created once and
+  // is not reset anywhere in this test, so the order of the blocks matters.
+  auto bfp_impl = kSpdbPairedBloom;
+  // Used arithmetically in the expected ticker totals, hence an int rather
+  // than a bool.
+  const int using_full_builder = 1;
+  Options options;
+  options.create_if_missing = true;
+  options.env = CurrentOptions().env;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(4));
+  options.disable_auto_compactions = true;
+  options.statistics = CreateDBStatistics();
+  // Enable prefix bloom for SST files
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.filter_policy = Create(20, bfp_impl);
+  table_options.index_shortening = BlockBasedTableOptions::IndexShorteningMode::
+      kShortenSeparatorsAndSuccessor;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("abcdxxx0", "val1"));
+  ASSERT_OK(Put("abcdxxx1", "val2"));
+  ASSERT_OK(Put("abcdxxx2", "val3"));
+  ASSERT_OK(Put("abcdxxx3", "val4"));
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  {
+    // prefix_extractor has not changed, BF will always be read
+    Slice upper_bound("abce");
+    ReadOptions read_options;
+    read_options.prefix_same_as_start = true;
+    read_options.iterate_upper_bound = &upper_bound;
+    // The element type must be spelled out: unique_ptr cannot deduce it from
+    // the raw pointer returned by NewIterator().
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_EQ(CountIter(iter, "abcd0000"), 4);
+  }
+  {
+    Slice upper_bound("abcdzzzz");
+    ReadOptions read_options;
+    read_options.prefix_same_as_start = true;
+    read_options.iterate_upper_bound = &upper_bound;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_EQ(CountIter(iter, "abcd0000"), 4);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+  }
+  ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}}));
+  ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+            "rocksdb.FixedPrefix.5");
+  {
+    // BF changed, [abcdxx00, abce) is a valid bound, will trigger BF read
+    Slice upper_bound("abce");
+    ReadOptions read_options;
+    read_options.prefix_same_as_start = true;
+    read_options.iterate_upper_bound = &upper_bound;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_EQ(CountIter(iter, "abcdxx00"), 4);
+    // should check bloom filter since upper bound meets requirement
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+              2 + using_full_builder);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+  }
+  {
+    // [abcdxx01, abcey) is not valid bound since upper bound is too long for
+    // the BF in SST (capped:4)
+    Slice upper_bound("abcey");
+    ReadOptions read_options;
+    read_options.prefix_same_as_start = true;
+    read_options.iterate_upper_bound = &upper_bound;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_EQ(CountIter(iter, "abcdxx01"), 4);
+    // should skip bloom filter since upper bound is too long
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+              2 + using_full_builder);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+  }
+  {
+    // [abcdxx02, abcdy) is a valid bound since the prefix is the same
+    Slice upper_bound("abcdy");
+    ReadOptions read_options;
+    read_options.prefix_same_as_start = true;
+    read_options.iterate_upper_bound = &upper_bound;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_EQ(CountIter(iter, "abcdxx02"), 4);
+    // should check bloom filter since upper bound matches transformed seek
+    // key
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+              2 + using_full_builder * 2);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+  }
+  {
+    // [aaaaaaaa, abce) is not a valid bound since 1) they don't share the
+    // same prefix, 2) the prefixes are not consecutive
+    Slice upper_bound("abce");
+    ReadOptions read_options;
+    read_options.prefix_same_as_start = true;
+    read_options.iterate_upper_bound = &upper_bound;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0);
+    // should skip bloom filter since mismatch is found
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+              2 + using_full_builder * 2);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+  }
+  ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}}));
+  {
+    // [abc, abd) is not a valid bound since the upper bound is too short
+    // for BF (capped:4)
+    Slice upper_bound("abd");
+    ReadOptions read_options;
+    read_options.prefix_same_as_start = true;
+    read_options.iterate_upper_bound = &upper_bound;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_EQ(CountIter(iter, "abc"), 4);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+              2 + using_full_builder * 2);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+  }
+  ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:4"}}));
+  {
+    // set back to capped:4 and verify BF is always read
+    Slice upper_bound("abd");
+    ReadOptions read_options;
+    read_options.prefix_same_as_start = true;
+    read_options.iterate_upper_bound = &upper_bound;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_EQ(CountIter(iter, "abc"), 0);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+              3 + using_full_builder * 2);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+  }
+}
+
+// Create multiple SST files each with a different prefix_extractor config,
+// verify iterators can read all SST files using the latest config.
+TEST_F(SpdbDBBloomFilterTest, DynamicBloomFilterMultipleSST) {
+  // Scenario: three SSTs are flushed, each under a different prefix_extractor
+  // (fixed:1, capped:3, fixed:2). Iterators created at different points are
+  // then used to seek "foo" and "gpk".
+  //
+  // The BLOOM_FILTER_PREFIX_CHECKED / BLOOM_FILTER_PREFIX_USEFUL values
+  // asserted below are running totals: options.statistics is created once and
+  // is not reset anywhere in this test, so statement order matters.
+  auto bfp_impl = kSpdbPairedBloom;
+  // Used arithmetically in the expected ticker totals, hence an int rather
+  // than a bool.
+  const int using_full_builder = 1;
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  options.disable_auto_compactions = true;
+  options.statistics = CreateDBStatistics();
+  // Enable prefix bloom for SST files
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy = Create(20, bfp_impl);
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  // upper_bound is referenced by pointer from read_options further down, so
+  // it is declared first and outlives every iterator in this test.
+  Slice upper_bound("foz90000");
+  ReadOptions read_options;
+  read_options.prefix_same_as_start = true;
+
+  // first SST with fixed:1 BF
+  ASSERT_OK(Put("foo2", "bar2"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("foq1", "bar1"));
+  ASSERT_OK(Put("fpa", "0"));
+  // The flush status is checked like the two later flushes in this test; a
+  // failed flush would otherwise only show up as confusing count mismatches.
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  // iter_old is created before the first SetOptions() call and is reused near
+  // the end of the test, so it must stay alive and must not be recreated.
+  // The element type must be spelled out: unique_ptr cannot deduce it from
+  // the raw pointer returned by NewIterator().
+  std::unique_ptr<Iterator> iter_old(db_->NewIterator(read_options));
+  ASSERT_EQ(CountIter(iter_old, "foo"), 4);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1);
+
+  ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+  ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+            "rocksdb.CappedPrefix.3");
+  read_options.iterate_upper_bound = &upper_bound;
+  // iter is likewise reused near the end of the test.
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  ASSERT_EQ(CountIter(iter, "foo"), 2);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+            1 + using_full_builder);
+  ASSERT_EQ(CountIter(iter, "gpk"), 0);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+            1 + using_full_builder);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+
+  // second SST with capped:3 BF
+  ASSERT_OK(Put("foo3", "bar3"));
+  ASSERT_OK(Put("foo4", "bar4"));
+  ASSERT_OK(Put("foq5", "bar5"));
+  ASSERT_OK(Put("fpb", "1"));
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  {
+    // BF is capped:3 now
+    std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options));
+    ASSERT_EQ(CountIter(iter_tmp, "foo"), 4);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+              2 + using_full_builder * 2);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+    ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0);
+    // both counters are incremented because BF is "not changed" for 1 of the
+    // 2 SST files, so filter is checked once and found no match.
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+              3 + using_full_builder * 2);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+  }
+
+  ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}}));
+  ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+            "rocksdb.FixedPrefix.2");
+  // third SST with fixed:2 BF
+  ASSERT_OK(Put("foo6", "bar6"));
+  ASSERT_OK(Put("foo7", "bar7"));
+  ASSERT_OK(Put("foq8", "bar8"));
+  ASSERT_OK(Put("fpc", "2"));
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  {
+    // BF is fixed:2 now
+    std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options));
+    ASSERT_EQ(CountIter(iter_tmp, "foo"), 9);
+    // the first and last BF are checked
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+              4 + using_full_builder * 3);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+    ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0);
+    // only last BF is checked and not found
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+              5 + using_full_builder * 3);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
+  }
+
+  // iter_old can only see the first SST, so checked plus 1
+  ASSERT_EQ(CountIter(iter_old, "foo"), 4);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+            6 + using_full_builder * 3);
+  // iter was created after the first setoptions call so only full filter
+  // will check the filter
+  ASSERT_EQ(CountIter(iter, "foo"), 2);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+            6 + using_full_builder * 4);
+
+  {
+    // keys in all three SSTs are visible to iterator
+    // The range of [foo, foz90000] is compatible with (fixed:1) and (fixed:2)
+    // so +2 for checked counter
+    std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options));
+    ASSERT_EQ(CountIter(iter_all, "foo"), 9);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+              7 + using_full_builder * 5);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
+    ASSERT_EQ(CountIter(iter_all, "gpk"), 0);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+              8 + using_full_builder * 5);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+  }
+  ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+  ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+            "rocksdb.CappedPrefix.3");
+  {
+    std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options));
+    ASSERT_EQ(CountIter(iter_all, "foo"), 6);
+    // all three SST are checked because the current options has the same as
+    // the remaining SST (capped:3)
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+              9 + using_full_builder * 7);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+    ASSERT_EQ(CountIter(iter_all, "gpk"), 0);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED),
+              10 + using_full_builder * 7);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4);
+  }
+  // TODO(Zhongyi): Maybe also need to add Get calls to test point look up?
+}
+
+// Create a new column family in a running DB, change prefix_extractor
+// dynamically, verify the iterator created on the new column family behaves
+// as expected
+// TODO: No filter is created here (in rocksdb's test it's the same) => Why is
+// this test in this suite?
+TEST_F(SpdbDBBloomFilterTest, DynamicBloomFilterNewColumnFamily) {
+  auto bfp_impl = kSpdbPairedBloom;
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  options.disable_auto_compactions = true;
+  options.statistics = CreateDBStatistics();
+  // Enable prefix bloom for SST files
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.filter_policy = Create(20, bfp_impl);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu0"}, options);
+  ReadOptions read_options;
+  read_options.prefix_same_as_start = true;
+  // create a new CF and set prefix_extractor dynamically
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  CreateColumnFamilies({"ramen_dojo_0"}, options);
+  ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(),
+            "rocksdb.CappedPrefix.3");
+  ASSERT_OK(Put(2, "foo3", "bar3"));
+  ASSERT_OK(Put(2, "foo4", "bar4"));
+  ASSERT_OK(Put(2, "foo5", "bar5"));
+  ASSERT_OK(Put(2, "foq6", "bar6"));
+  ASSERT_OK(Put(2, "fpq7", "bar7"));
+  // The flush status is checked rather than silently dropped.
+  // NOTE(review): no column family handle is passed here, so this presumably
+  // does not flush handles_[2]; that would explain why both tickers below
+  // stay at 0 -- confirm.
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  {
+    // Prefix extractor is capped:3, only the "foo*" keys are visible
+    std::unique_ptr<Iterator> iter(
+        db_->NewIterator(read_options, handles_[2]));
+    ASSERT_EQ(CountIter(iter, "foo"), 3);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+  }
+  ASSERT_OK(
+      dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}}));
+  ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(),
+            "rocksdb.FixedPrefix.2");
+  {
+    // Prefix extractor is fixed:2 now, so "foq6" shares the "fo" prefix
+    std::unique_ptr<Iterator> iter(
+        db_->NewIterator(read_options, handles_[2]));
+    ASSERT_EQ(CountIter(iter, "foo"), 4);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+  }
+  // Drop both non-default column families and release their handles
+  ASSERT_OK(dbfull()->DropColumnFamily(handles_[2]));
+  ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[2]));
+  handles_[2] = nullptr;
+  ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+  ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+  handles_[1] = nullptr;
+}
+
+// Verify it's possible to change prefix_extractor at runtime and iterators
+// behave as expected
+TEST_F(SpdbDBBloomFilterTest, DynamicBloomFilterOptions) {
+  auto bfp_impl = kSpdbPairedBloom;
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  options.disable_auto_compactions = true;
+  options.statistics = CreateDBStatistics();
+  // Enable prefix bloom for SST files
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.filter_policy = Create(20, bfp_impl);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  // Three groups of keys, each followed by a flush whose status is checked
+  ASSERT_OK(Put("foo2", "bar2"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("fpa", "0"));
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(Put("foo3", "bar3"));
+  ASSERT_OK(Put("foo4", "bar4"));
+  ASSERT_OK(Put("foo5", "bar5"));
+  ASSERT_OK(Put("fpb", "1"));
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));
+  ASSERT_OK(Put("foo6", "bar6"));
+  ASSERT_OK(Put("foo7", "bar7"));
+  ASSERT_OK(Put("foo8", "bar8"));
+  ASSERT_OK(Put("fpc", "2"));
+  ASSERT_OK(dbfull()->Flush(FlushOptions()));
+
+  ReadOptions read_options;
+  read_options.prefix_same_as_start = true;
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_EQ(CountIter(iter, "foo"), 12);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+  }
+  // iter_old is created before the prefix extractor is changed below
+  std::unique_ptr<Iterator> iter_old(db_->NewIterator(read_options));
+  ASSERT_EQ(CountIter(iter_old, "foo"), 12);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+
+  ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+  ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+            "rocksdb.CappedPrefix.3");
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    // "fp*" should be skipped
+    ASSERT_EQ(CountIter(iter, "foo"), 9);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+  }
+
+  // iterator created before should not be affected and see all keys
+  ASSERT_EQ(CountIter(iter_old, "foo"), 12);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+  ASSERT_EQ(CountIter(iter_old, "abc"), 0);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace ROCKSDB_NAMESPACE
+
+// Test entry point: installs the stack trace handler and runs all tests.
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/plugin/speedb/paired_filter/speedb_paired_bloom.cc b/plugin/speedb/paired_filter/speedb_paired_bloom.cc
new file mode 100644
index 0000000000..b5dd78acfa
--- /dev/null
+++ b/plugin/speedb/paired_filter/speedb_paired_bloom.cc
@@ -0,0 +1,120 @@
+// TODO: ADD Speedb's Copyright Notice !!!!!
+
+#include "plugin/speedb/paired_filter/speedb_paired_bloom.h"
+
+#include "plugin/speedb/paired_filter/speedb_paired_bloom_internal.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Sanitizes bits_per_key and stores it as millibits per key:
+//  - below 0.5                  => 0 (no filter is built)
+//  - below the supported minimum => the supported minimum
+//  - kMaxBitsPerKey or more, or NaN => kMaxBitsPerKey
+// The rounding balance is initialized explicitly: before C++20 a
+// default-constructed std::atomic holds an indeterminate value.
+SpdbPairedBloomFilterPolicy::SpdbPairedBloomFilterPolicy(double bits_per_key)
+    : aggregate_rounding_balance_(0) {
+  // Divide by a floating point literal so the result is never truncated
+  constexpr double kMinBitsPerKey =
+      speedb_filter::kMinMillibitsPerKey / 1000.0;
+
+  // Sanitize bits_per_key
+  if (bits_per_key < 0.5) {
+    // Round down to no filter
+    bits_per_key = 0;
+  } else if (bits_per_key < kMinBitsPerKey) {
+    // Minimum 1 bit per key (equiv) when creating filter
+    bits_per_key = kMinBitsPerKey;
+  } else if (!(bits_per_key < kMaxBitsPerKey)) {  // including NaN
+    bits_per_key = kMaxBitsPerKey;
+  }
+
+  // Includes a nudge toward rounding up, to ensure on all platforms
+  // that doubles specified with three decimal digits after the decimal
+  // point are interpreted accurately.
+  millibits_per_key_ = static_cast<int>(bits_per_key * 1000.0 + 0.500001);
+}
+
+// Returns a new builder configured from the table options in `context`, or
+// nullptr when this policy was created with (effectively) 0 bits per key.
+// The builder is returned as a raw owning pointer.
+FilterBitsBuilder* SpdbPairedBloomFilterPolicy::GetBuilderWithContext(
+    const FilterBuildingContext& context) const {
+  if (millibits_per_key_ == 0) {
+    // "No filter" special case
+    return nullptr;
+  }
+
+  // TODO: The code below is duplicated from
+  // BloomLikeFilterPolicy::GetFastLocalBloomBuilderWithContext
+  // TODO: See if it may be refactored to a static method
+  const bool offm = context.table_options.optimize_filters_for_memory;
+  const auto& cache_usage_options = context.table_options.cache_usage_options;
+  const auto options_overrides_iter =
+      cache_usage_options.options_overrides.find(
+          CacheEntryRole::kFilterConstruction);
+  // A per-role override, when present, wins over the general setting
+  const auto filter_construction_charged =
+      options_overrides_iter != cache_usage_options.options_overrides.end()
+          ? options_overrides_iter->second.charged
+          : cache_usage_options.options.charged;
+
+  // TODO: Refactor this to a static method of BloomLikeFilterPolicy
+  std::shared_ptr<CacheReservationManager> cache_res_mgr;
+  if (context.table_options.block_cache &&
+      filter_construction_charged ==
+          CacheEntryRoleOptions::Decision::kEnabled) {
+    cache_res_mgr = std::make_shared<
+        CacheReservationManagerImpl<CacheEntryRole::kFilterConstruction>>(
+        context.table_options.block_cache);
+  }
+
+  // The reader factory captures `this`, so the builder must not be used
+  // after this policy object has been destroyed.
+  return new SpdbPairedBloomBitsBuilder(
+      millibits_per_key_, offm ? &aggregate_rounding_balance_ : nullptr,
+      cache_res_mgr, context.table_options.detect_filter_construct_corruption,
+      [this](const Slice& contents) { return GetFilterBitsReader(contents); });
+}
+
+// Returns a new reader for `contents` (filter bits followed by the metadata
+// trailer):
+//  - no more bytes than the trailer => AlwaysFalseFilter
+//  - paired block bloom             => SpdbPairedBloomBitsReader
+//  - anything else                  => AlwaysTrueFilter
+// The reader is returned as a raw owning pointer.
+FilterBitsReader* SpdbPairedBloomFilterPolicy::GetFilterBitsReader(
+    const Slice& contents) const {
+  uint32_t len_with_meta = static_cast<uint32_t>(contents.size());
+  const auto trailer_len = speedb_filter::FilterMetadata::kMetadataLen;
+  if (len_with_meta <= trailer_len) {
+    // filter is empty or broken. Treat like zero keys added.
+    return new AlwaysFalseFilter();
+  }
+
+  const auto len = len_with_meta - trailer_len;
+  const char* metadata_start = &contents.data()[len];
+
+  auto trailer_data =
+      speedb_filter::FilterMetadata::ReadMetadata(metadata_start);
+  switch (trailer_data.filter_type) {
+    case speedb_filter::FilterType::kPairedBlockBloom:
+      return new SpdbPairedBloomBitsReader(contents.data(),
+                                           trailer_data.num_probes, len);
+
+    case speedb_filter::FilterType::kFutureUnknown:
+      return new AlwaysTrueFilter();
+
+    default:
+      // Unexpected filter type: fail in debug builds, stay safe otherwise
+      assert(0);
+      return new AlwaysTrueFilter();
+  }
+}
+
+// The class name followed by the bits-per-key suffix
+std::string SpdbPairedBloomFilterPolicy::GetId() const {
+  return Name() +
+         BloomLikeFilterPolicy::GetBitsPerKeySuffix(millibits_per_key_);
+}
+
+bool SpdbPairedBloomFilterPolicy::IsInstanceOf(const std::string& name) const {
+  if (name == kClassName()) {
+    return true;
+  } else {
+    return FilterPolicy::IsInstanceOf(name);
+  }
+}
+
+const char* SpdbPairedBloomFilterPolicy::kClassName() {
+  return "speedb_paired_bloom_filter";
+}
+
+const char* SpdbPairedBloomFilterPolicy::kNickName() {
+  return "speedb.PairedBloomFilter";
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/plugin/speedb/paired_filter/speedb_paired_bloom.h b/plugin/speedb/paired_filter/speedb_paired_bloom.h
new file mode 100644
index 0000000000..ad82b00142
--- /dev/null
+++ b/plugin/speedb/paired_filter/speedb_paired_bloom.h
@@ -0,0 +1,83 @@
+// TODO: ADD Speedb's Copyright Notice !!!!!
+
+#pragma once
+#include <atomic>
+#include <string>
+
+#include "rocksdb/filter_policy.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Forward Declarations
+class ObjectLibrary;
+struct FilterBuildingContext;
+
+// In the default cache-local bloom filter in RocksDB
+// (FastLocalBloomFilterPolicy) the trade-off between memory and false positive
+// rate is significantly worse than the theoretical standard bloom filter,
+// however it is significantly faster in terms of CPU.
+// This trade-off
+// deteriorates performance/memory footprint especially in use cases in which
+// large accuracy of the filter is needed (typically from ~20 bits-per-key).
+//
+// For really high bits-per-key there could be orders of magnitude difference in
+// the false positive rate. Ribbon filter is generally better than bloom filter
+// in the trade-off (takes ~30% less memory to obtain the same false positive
+// rate). However, its construction and use is slower by a factor of ~4 than
+// bloom filter, so in use cases that require fast testing and construction
+// ribbon filter cannot be used.
+//
+// This filter is fast and low on CPU consumption on the one hand, but with a
+// better memory footprint - FPR trade-off on the other hand.
+//
+class SpdbPairedBloomFilterPolicy : public FilterPolicy {
+ public:
+  // Max supported BPK. Filters using higher BPK-s will use the max
+  static constexpr double kMaxBitsPerKey = 100.0;
+
+ public:
+  // bits_per_key is sanitized by the constructor (see its definition)
+  explicit SpdbPairedBloomFilterPolicy(double bits_per_key);
+
+  FilterBitsBuilder* GetBuilderWithContext(
+      const FilterBuildingContext& context) const override;
+
+  FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override;
+
+  // Plug-In Support
+ public:
+  static const char* kClassName();
+  const char* Name() const override { return kClassName(); }
+  static const char* kNickName();
+  const char* NickName() const override { return kNickName(); }
+
+  std::string GetId() const override;
+
+  bool IsInstanceOf(const std::string& name) const override;
+
+  // This filter is NOT compatible with RocksDB's built-in filter, only with
+  // itself
+  const char* CompatibilityName() const override {
+    return kCompatibilityName();
+  }
+  static const char* kCompatibilityName() { return kClassName(); }
+
+ private:
+  // This filter supports fractional bits per key. For predictable behavior
+  // of 0.001-precision values across floating point implementations, we
+  // round to thousandths of a bit (on average) per key.
+  int millibits_per_key_;
+
+  // State for implementing optimize_filters_for_memory. Essentially, this
+  // tracks a surplus or deficit in total FP rate of filters generated by
+  // builders under this policy vs. what would have been generated without
+  // optimize_filters_for_memory.
+  //
+  // To avoid floating point weirdness, the actual value is
+  //  Sum over all generated filters f:
+  //   (predicted_fp_rate(f) - predicted_fp_rate(f|o_f_f_m=false)) * 2^32
+  //
+  // Explicitly zero-initialized: before C++20 a default-constructed
+  // std::atomic holds an indeterminate value.
+  mutable std::atomic<int64_t> aggregate_rounding_balance_{0};
+};
+
+// Plug-In Support
+extern "C" {
+int register_SpdbPairedBloomFilter(ROCKSDB_NAMESPACE::ObjectLibrary& library,
+                                   const std::string&);
+}  // extern "C"
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/plugin/speedb/paired_filter/speedb_paired_bloom_internal.cc b/plugin/speedb/paired_filter/speedb_paired_bloom_internal.cc
new file mode 100644
index 0000000000..435f13ef0f
--- /dev/null
+++ b/plugin/speedb/paired_filter/speedb_paired_bloom_internal.cc
@@ -0,0 +1,833 @@
+// TODO: ADD Speedb's Copyright Notice !!!!!
+
+#include "plugin/speedb/paired_filter/speedb_paired_bloom_internal.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <utility>
+
+#include "port/likely.h"  // for LIKELY
+#include "port/port.h"    // for PREFETCH
+#include "test_util/sync_point.h"
+#include "util/bloom_impl.h"
+#include "util/fastrange.h"
+
+#ifdef HAVE_AVX2
+#include <immintrin.h>
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+using InBatchBlockIdx = uint8_t;
+
+// We currently assume the in-batch block index fits within the 1st byte (8
+// bits) of the block and it is a power of 2
+static_assert(speedb_filter::kPairedBloomBatchSizeInBlocks <= (1 << 8U));
+static_assert((speedb_filter::kPairedBloomBatchSizeInBlocks > 0) &&
+              ((speedb_filter::kPairedBloomBatchSizeInBlocks &
+                (speedb_filter::kPairedBloomBatchSizeInBlocks - 1)) == 0));
+
+// Number of bits to point to any block in a batch (in-batch block index)
+static const uint32_t kInBatchIdxNumBits =
+    std::ceil(std::log2(speedb_filter::kPairedBloomBatchSizeInBlocks));
+
+// kBlockSizeInBytes must be a power of 2 (= Cacheline size)
+constexpr uint32_t kBlockSizeInBytes = 64U;
+static_assert((kBlockSizeInBytes > 0) &&
+              ((kBlockSizeInBytes & (kBlockSizeInBytes - 1)) == 0));
+constexpr uint32_t kBlockSizeInBits = kBlockSizeInBytes * 8U;
+static const uint32_t kBlockSizeNumBits =
+    std::ceil(std::log2(kBlockSizeInBits));
+static const uint32_t kNumBlockSizeBitsShiftBits = 32 - kBlockSizeNumBits;
+
+// Number of bits to represent kBlockSizeInBytes
+static const uint32_t kNumBitsForBlockSize = std::log2(kBlockSizeInBytes);
+// Number of bits of a block left for the bloom bits themselves, once the
+// in-batch index of the pair block is accounted for
+static const uint32_t KNumBitsInBlockBloom =
+    kBlockSizeInBits - kInBatchIdxNumBits;
+
+constexpr uint32_t kBatchSizeInBytes =
+    speedb_filter::kPairedBloomBatchSizeInBlocks * kBlockSizeInBytes;
+
+constexpr uint64_t kNumMillibitsInByte = 8 * 1000U;
+
+[[maybe_unused]] constexpr uint32_t kMaxSupportLenWithMetadata = 0xffffffffU;
+constexpr uint32_t kMaxSupportedSizeNoMetadata = 0xffffffc0U;
+
+constexpr size_t kMaxNumProbes = 30U;
+static_assert(kMaxNumProbes % 2 == 0U);
+
+// Masks splitting a block's first byte into the in-batch index bits and the
+// remaining bits
+static const uint8_t kInBatchIdxMask = (uint8_t{1U} << kInBatchIdxNumBits) - 1;
+static const uint8_t kFirstByteBitsMask = ~kInBatchIdxMask;
+
+// ==================================================================================================
+//
+// Helper Functions
+//
+
+// Maps a 32 bits hash to the index of a block in a filter of len_bytes bytes
+inline uint32_t HashToGlobalBlockIdx(uint32_t h1, uint32_t len_bytes) {
+  return FastRange32(h1, len_bytes >> kNumBitsForBlockSize);
+}
+
+// Issues a prefetch for the first and for the last byte of the block
+inline void PrefetchBlock(const char* block_address) {
+  PREFETCH(block_address, 0 /* rw */, 1 /* locality */);
+  PREFETCH(block_address + kBlockSizeInBytes - 1, 0 /* rw */, 1 /* locality */);
+}
+
+inline uint32_t GetContainingBatchIdx(uint32_t global_block_idx) {
+  return (global_block_idx / speedb_filter::kPairedBloomBatchSizeInBlocks);
+}
+
+inline uint8_t GetInBatchBlockIdx(uint32_t global_block_idx) {
+  return (global_block_idx % speedb_filter::kPairedBloomBatchSizeInBlocks);
+}
+
+// Returns 0 when the first in-batch index is smaller than the second one,
+// and 1 otherwise
+inline uint32_t GetHashSetSelector(uint32_t first_in_batch_block_idx,
+                                   uint32_t second_in_batch_block_idx) {
+  assert((first_in_batch_block_idx <
+          speedb_filter::kPairedBloomBatchSizeInBlocks) &&
+         (second_in_batch_block_idx <
+          speedb_filter::kPairedBloomBatchSizeInBlocks));
+  return (first_in_batch_block_idx < second_in_batch_block_idx) ? 0U : 1U;
+}
+
+inline uint32_t GetFirstGlobalBlockIdxOfBatch(uint32_t batch_idx) {
+  return batch_idx * speedb_filter::kPairedBloomBatchSizeInBlocks;
+}
+
+inline char* GetBlockAddress(char* data, uint32_t global_block_idx) {
+  return (data + global_block_idx * kBlockSizeInBytes);
+}
+
+inline const char* GetBlockAddress(const char* data,
+                                   uint32_t global_block_idx) {
+  return (data + global_block_idx * kBlockSizeInBytes);
+}
+
+// Bits-per-key scaled by the share of a block that is usable for bloom bits
+// (KNumBitsInBlockBloom out of kBlockSizeInBits).
+// NOTE(review): every operand is an integer, so the quotient is truncated to
+// a whole number of bits before it is converted to double. Left as-is because
+// it feeds CalcNumProbes(); confirm whether the truncation is intended.
+inline double CalcAdjustedBitsPerKey(size_t millibits_per_key) {
+  return ((millibits_per_key * KNumBitsInBlockBloom) / kBlockSizeInBits / 1000);
+}
+
+// ln(2) * adjusted bits-per-key, before rounding to an even number of probes
+inline double CalcRawNumProbes(size_t millibits_per_key) {
+  static const auto log_2 = std::log(2);
+  return (log_2 * CalcAdjustedBitsPerKey(millibits_per_key));
+}
+
+// The raw number of probes rounded up to an even number and capped at
+// kMaxNumProbes
+inline size_t CalcNumProbes(size_t millibits_per_key) {
+  double raw_num_probes = CalcRawNumProbes(millibits_per_key);
+
+  // Num probes must be even
+  auto num_probes = static_cast<size_t>(std::ceil(raw_num_probes / 2.0) * 2);
+  assert(num_probes % 2 == 0U);
+
+  return std::min(num_probes, kMaxNumProbes);
+}
+
+// False positive rate of a standard Bloom filter, for given ratio of
+// filter memory bits to added keys, and number of probes per operation.
+// (The false positive rate is effectively independent of scale, assuming
+// the implementation scales OK.)
+inline double SpdbStandardFpRate(double bits_per_key, double raw_num_probes) {
+  // Standard very-good-estimate formula. See
+  // https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
+  return std::pow(1.0 - std::exp(-raw_num_probes / bits_per_key),
+                  raw_num_probes);
+}
+
+// A writable view of a single block of the filter while it is being built.
+// Does not own the memory it points to.
+class BuildBlock {
+ public:
+  BuildBlock() = default;
+  BuildBlock(char* data, uint32_t global_block_idx, bool prefetch_block);
+
+  uint8_t GetInBatchBlockIdxOfPair() const;
+  void SetInBatchBlockIdxOfPair(InBatchBlockIdx pair_batch_block_idx);
+  void SetBlockBloomBits(uint32_t hash, uint32_t set_idx, size_t hash_set_size);
+
+ private:
+  char* const block_address_ = nullptr;
+};
+
+inline BuildBlock::BuildBlock(char* data, uint32_t global_block_idx,
+                              bool prefetch_block)
+    : block_address_(GetBlockAddress(data, global_block_idx)) {
+  if (prefetch_block) {
+    PrefetchBlock(block_address_);
+  }
+}
+
+// The in-batch index of the pair block, stored in the low bits of the
+// block's first byte
+inline uint8_t BuildBlock::GetInBatchBlockIdxOfPair() const {
+  return static_cast<uint8_t>(*block_address_) & kInBatchIdxMask;
+}
+
+// Stores the pair's in-batch index in the low bits of the first byte,
+// leaving the other bits of that byte untouched
+inline void BuildBlock::SetInBatchBlockIdxOfPair(
+    InBatchBlockIdx pair_batch_block_idx) {
+  assert(((*block_address_ & kInBatchIdxMask) == 0U) ||
+         ((*block_address_ & kInBatchIdxMask) == pair_batch_block_idx));
+
+  *block_address_ =
+      (pair_batch_block_idx | (*block_address_ & kFirstByteBitsMask));
+}
+
+// Maps a hash to a bit position in the block, for hash set 0 or 1.
+// Each set takes its candidate position from a different 9 bits field of
+// the hash. Candidates of 6 or less would land on the bits of the pair's
+// in-batch index, so they are re-mapped by a fast-range computation offset
+// by kInBatchIdxNumBits.
+inline int GetBitPosInBlockForHash(uint32_t hash, uint32_t set_idx) {
+  assert(set_idx <= 1U);
+
+  int bitpos = 0;
+
+  if (set_idx == 0) {
+    bitpos = hash >> 23;
+    if (LIKELY(bitpos > 6)) {
+      return bitpos;
+    }
+    hash <<= 9;
+  } else {
+    constexpr uint32_t mask = 0x007FC000;
+    bitpos = (hash & mask) >> 14;
+    if (LIKELY(bitpos > 6)) {
+      return bitpos;
+    }
+  }
+
+  return kInBatchIdxNumBits +
+         (static_cast<uint32_t>(KNumBitsInBlockBloom *
+                                (hash >> kBlockSizeNumBits)) >>
+          (kNumBlockSizeBitsShiftBits));
+}
+
+// Sets hash_set_size bits in the block. Each probe multiplies the hash by
+// the 32 bits golden ratio to obtain the next one.
+inline void BuildBlock::SetBlockBloomBits(uint32_t hash, uint32_t set_idx,
+                                          size_t hash_set_size) {
+  // Mask selecting the bit within its byte (bitpos % 8). The previous code
+  // used kInBatchIdxNumBits here, which only produced the right mask because
+  // that constant happens to equal 7.
+  constexpr int kBitInByteMask = 7;
+
+  for (auto i = 0U; i < hash_set_size; ++i) {
+    int bitpos = GetBitPosInBlockForHash(hash, set_idx);
+    block_address_[bitpos >> 3] |= (char{1} << (bitpos & kBitInByteMask));
+    hash *= 0x9e3779b9;
+  }
+}
+
+// A read-only view of a single block of a built filter.
+// Does not own the memory it points to.
+class ReadBlock {
+ public:
+  ReadBlock(const char* data, uint32_t global_block_idx, bool prefetch_block);
+
+  uint8_t GetInBatchBlockIdxOfPair() const;
+  bool AreAllBlockBloomBitsSet(uint32_t hash, uint32_t set_idx,
+                               size_t hash_set_size) const;
+
+ private:
+#ifdef HAVE_AVX2
+  bool AreAllBlockBloomBitsSetAvx2(uint32_t hash, uint32_t set_idx,
+                                   size_t hash_set_size) const;
+#endif
+  bool AreAllBlockBloomBitsSetNonAvx2(uint32_t hash, uint32_t set_idx,
+                                      size_t hash_set_size) const;
+
+ private:
+  const char* const block_address_;
+};
+
+inline ReadBlock::ReadBlock(const char* data, uint32_t global_block_idx,
+                            bool prefetch_block)
+    : block_address_(GetBlockAddress(data, global_block_idx)) {
+  if (prefetch_block) {
+    PrefetchBlock(block_address_);
+  }
+}
+
+inline uint8_t ReadBlock::GetInBatchBlockIdxOfPair() const {
+  return static_cast<uint8_t>(*block_address_) & kInBatchIdxMask;
+}
+
+// Dispatches to the AVX2 implementation when it is compiled in and the block
+// size is the one it supports, and to the portable one otherwise
+bool ReadBlock::AreAllBlockBloomBitsSet(uint32_t hash, uint32_t set_idx,
+                                        size_t hash_set_size) const {
+#ifdef HAVE_AVX2
+  // The AVX2 code currently supports only cache-line / block sizes of 64 bytes
+  // (512 bits)
+  if (kBlockSizeInBits == 512) {
+    return AreAllBlockBloomBitsSetAvx2(hash, set_idx, hash_set_size);
+  } else {
+    return AreAllBlockBloomBitsSetNonAvx2(hash, set_idx, hash_set_size);
+  }
+#else
+  return AreAllBlockBloomBitsSetNonAvx2(hash, set_idx, hash_set_size);
+#endif
+}
+
+#ifdef HAVE_AVX2
+// Vector forms of the constants used by GetBitPosInBlockForHash()
+const __m256i mask_vec = _mm256_set1_epi32(0x007FC000);
+const __m256i max_bitpos_vec = _mm256_set1_epi32(7);
+const __m256i fast_range_vec = _mm256_set1_epi32(KNumBitsInBlockBloom);
+const __m256i num_idx_bits_vec = _mm256_set1_epi32(kInBatchIdxNumBits);
+
+// Powers of 32-bit golden ratio, mod 2**32.
+const __m256i multipliers = + _mm256_setr_epi32(0x00000001, 0x9e3779b9, 0xe35e67b1, 0x734297e9, + 0x35fbe861, 0xdeb7c719, 0x448b211, 0x3459b749); + +bool ReadBlock::AreAllBlockBloomBitsSetAvx2(uint32_t hash, uint32_t set_idx, + size_t hash_set_size) const { + assert(kBlockSizeInBytes == 64U); + + int rem_probes = static_cast(hash_set_size); + + // NOTE: This code is an adaptation of the equivalent code for RocksDB's + // bloom filter testing code using AVX2. + // See bloom_impl.h for more details + + for (;;) { + // Eight copies of hash + __m256i hash_vector = _mm256_set1_epi32(hash); + + // Same effect as repeated multiplication by 0x9e3779b9 thanks to + // associativity of multiplication. + hash_vector = _mm256_mullo_epi32(hash_vector, multipliers); + + __m256i orig_hash_vector = hash_vector; + + if (set_idx == 0) { + // hash >> 23 + hash_vector = _mm256_srli_epi32(hash_vector, 23); + } else { + // hash & mask (0x007FC000) + hash_vector = _mm256_and_si256(hash_vector, mask_vec); + + // hash >> 14 + hash_vector = _mm256_srli_epi32(hash_vector, 14); + } + + // // Find the bit positions that are < 7 + __m256i smaller_than_7_vec = + _mm256_cmpgt_epi32(max_bitpos_vec, hash_vector); + + if (_mm256_testz_si256(smaller_than_7_vec, smaller_than_7_vec) == false) { + __m256i hash_vector_fast_range = orig_hash_vector; + + if (set_idx == 0) { + // << 9 + hash_vector_fast_range = _mm256_slli_epi32(orig_hash_vector, 9); + } + + // AVX2 code to calculate the equivalent of + // GetBitPosInBlockForHash1stPass() for up to 8 hashes + + // Shift right the hashes by kBlockSizeNumBits + hash_vector_fast_range = + _mm256_srli_epi32(hash_vector_fast_range, kBlockSizeNumBits); + + // Multiplying by 505 => The result (lower 32 bits will be in the range + // 0-504 (in the 9 MSB bits). 
+ hash_vector_fast_range = + _mm256_mullo_epi32(hash_vector_fast_range, fast_range_vec); + hash_vector_fast_range = + _mm256_srli_epi32(hash_vector_fast_range, kNumBlockSizeBitsShiftBits); + + // Add 7 to get the final bit position in the range 7 - 511 (In the 9 MSB + // bits) + hash_vector_fast_range = + _mm256_add_epi32(hash_vector_fast_range, num_idx_bits_vec); + + hash_vector = _mm256_blendv_epi8(hash_vector, hash_vector_fast_range, + smaller_than_7_vec); + } + + hash_vector = _mm256_slli_epi32(hash_vector, kNumBlockSizeBitsShiftBits); + + auto [is_done, answer] = FastLocalBloomImpl::CheckBitsPositionsInBloomBlock( + rem_probes, hash_vector, block_address_); + if (is_done) { + return answer; + } + + // otherwise + // Need another iteration. 0xab25f4c1 == golden ratio to the 8th power + hash *= 0xab25f4c1; + rem_probes -= 8; + } +} + +#endif // HAVE_AVX2 + +bool ReadBlock::AreAllBlockBloomBitsSetNonAvx2(uint32_t hash, uint32_t set_idx, + size_t hash_set_size) const { + for (auto i = 0U; i < hash_set_size; ++i) { + int bitpos = GetBitPosInBlockForHash(hash, set_idx); + if ((block_address_[bitpos >> 3] & + (char{1} << (bitpos & kInBatchIdxNumBits))) == 0) { + return false; + } + hash *= 0x9e3779b9; + } + return true; +} + +} // Unnamed namespace + +// ================================================================================================== +namespace speedb_filter { + +void FilterMetadata::WriteMetadata(char* metadata, [[maybe_unused]] size_t len, + const Fields& fields) { + assert(len == kMetadataLen); + + // Init the metadata to all Zeros + std::memset(metadata, 0x0, kMetadataLen); + + metadata[0] = static_cast(speedb_filter::FilterType::kPairedBlockBloom); + + assert(fields.num_probes <= 30U); + metadata[1] = static_cast(fields.num_probes); + // rest of metadata stays zero +} + +auto FilterMetadata::ReadMetadata(const char* metadata) -> Fields { + char filter_type = *metadata; + char block_and_probes = *(metadata + 1); + + // TODO: Avoid the use of 
magic numbers + size_t num_probes = (block_and_probes & 0x1F); + if (num_probes < 1 || num_probes > 30) { + // Reserved / future safe + return {num_probes, FilterType::kFutureUnknown}; + } + + uint16_t rest = DecodeFixed16(metadata + 2); + if (rest != 0) { + // Reserved, possibly for hash seed + // Future safe + return {num_probes, FilterType::kFutureUnknown}; + } + + if (speedb_filter::FilterType(filter_type) == + speedb_filter::FilterType::kPairedBlockBloom) { // FastLocalBloom + // TODO: Avoid the use of magic numbers + auto log2_block_bytes = ((block_and_probes >> 5) & 7); + if (log2_block_bytes == 0U) { // Only block size supported for now + return {num_probes, FilterType::kPairedBlockBloom}; + } + } + + return {num_probes, FilterType::kFutureUnknown}; +} + +} // namespace speedb_filter + +// ================================================================================================== +SpdbPairedBloomBitsBuilder::SpdbPairedBloomBitsBuilder( + const int millibits_per_key, + std::atomic* aggregate_rounding_balance, + const std::shared_ptr& cache_res_mgr, + bool detect_filter_construct_corruption, + const FilterBitsReaderCreateFunc& reader_create_func) + : XXPH3FilterBitsBuilder(aggregate_rounding_balance, + std::move(cache_res_mgr), + detect_filter_construct_corruption), + millibits_per_key_(millibits_per_key), + reader_create_func_(reader_create_func) { + assert(millibits_per_key >= speedb_filter::kMinMillibitsPerKey); +} + +void SpdbPairedBloomBitsBuilder::InitVars(uint64_t len_no_metadata) { + assert((len_no_metadata % kBatchSizeInBytes) == 0U); + num_blocks_ = len_no_metadata / kBlockSizeInBytes; + num_blocks_ = std::max(num_blocks_, + speedb_filter::kPairedBloomBatchSizeInBlocks); + // num_blocks must be event and a multiple of the batch size + assert(num_blocks_ > 0U); + assert(num_blocks_ % 2 == 0); + assert(num_blocks_ % speedb_filter::kPairedBloomBatchSizeInBlocks == 0); + + num_batches_ = num_blocks_ / speedb_filter::kPairedBloomBatchSizeInBlocks; 
+ // There must be at least 1 batch + assert(num_batches_ > 0U); + + pairing_table_.resize(num_batches_); + AddCacheReservation(num_batches_ * + sizeof(decltype(pairing_table_)::value_type)); + + num_probes_ = CalcNumProbes(millibits_per_key_); +} + +Slice SpdbPairedBloomBitsBuilder::Finish(std::unique_ptr* buf, + Status* status) { + const size_t num_entries = hash_entries_info_.entries.size(); + size_t len_with_metadata = CalculateSpace(num_entries); + + std::unique_ptr mutable_buf; + std::unique_ptr + final_filter_cache_res_handle; + len_with_metadata = + AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf); + assert(mutable_buf); + assert(len_with_metadata >= speedb_filter::FilterMetadata::kMetadataLen); + // Max size supported by implementation + assert(len_with_metadata <= kMaxSupportLenWithMetadata); + + // Cache reservation for mutable_buf + if (cache_res_mgr_) { + Status s = cache_res_mgr_->MakeCacheReservation( + len_with_metadata * sizeof(char), &final_filter_cache_res_handle); + s.PermitUncheckedError(); + } + + uint32_t len_no_metadata = static_cast( + len_with_metadata - speedb_filter::FilterMetadata::kMetadataLen); + InitVars(len_no_metadata); + + if (len_no_metadata > 0) { + TEST_SYNC_POINT_CALLBACK( + "XXPH3FilterBitsBuilder::Finish::" + "TamperHashEntries", + &hash_entries_info_.entries); + AddAllEntries(mutable_buf.get(), len_no_metadata); + Status verify_hash_entries_checksum_status = + MaybeVerifyHashEntriesChecksum(); + if (!verify_hash_entries_checksum_status.ok()) { + if (status) { + *status = verify_hash_entries_checksum_status; + } + return FinishAlwaysTrue(buf); + } + } + + bool keep_entries_for_postverify = detect_filter_construct_corruption_; + if (!keep_entries_for_postverify) { + ResetEntries(); + } + + speedb_filter::FilterMetadata::Fields metadata_fields{ + num_probes_, speedb_filter::FilterType::kPairedBlockBloom}; + speedb_filter::FilterMetadata::WriteMetadata( + &mutable_buf[len_no_metadata], + 
speedb_filter::FilterMetadata::kMetadataLen, metadata_fields); + + auto TEST_arg_pair __attribute__((__unused__)) = + std::make_pair(&mutable_buf, len_with_metadata); + TEST_SYNC_POINT_CALLBACK("XXPH3FilterBitsBuilder::Finish::TamperFilter", + &TEST_arg_pair); + + Slice rv(mutable_buf.get(), len_with_metadata); + *buf = std::move(mutable_buf); + final_filter_cache_res_handles_.push_back( + std::move(final_filter_cache_res_handle)); + if (status) { + *status = Status::OK(); + } + return rv; +} + +// Estimates how many entries fit in a filter of len_with_metadata bytes: the +// rounded usable size without the metadata trailer, expressed in millibits and +// divided by millibits_per_key_. Returns 0 when the length is smaller than the +// metadata trailer. +size_t SpdbPairedBloomBitsBuilder::ApproximateNumEntries( + size_t len_with_metadata) { + size_t len_no_meta = + len_with_metadata >= speedb_filter::FilterMetadata::kMetadataLen + ? RoundDownUsableSpace(len_with_metadata) - + speedb_filter::FilterMetadata::kMetadataLen + : 0; + return static_cast(kNumMillibitsInByte * len_no_meta / + millibits_per_key_); +} + +// Returns the filter size (metadata included) to use for num_entries entries. +size_t SpdbPairedBloomBitsBuilder::CalculateSpace(size_t num_entries) { + size_t len_without_metadata = + num_entries * millibits_per_key_ / kNumMillibitsInByte; + // Make sure we have enough space for at least 1 batch + len_without_metadata = + std::max(len_without_metadata, kBatchSizeInBytes); + return RoundDownUsableSpace(len_without_metadata + + speedb_filter::FilterMetadata::kMetadataLen); +} + +size_t SpdbPairedBloomBitsBuilder::GetNumProbes() { + return CalcNumProbes(millibits_per_key_); +} + +// Both arguments are ignored: the estimate depends only on millibits_per_key_. +double SpdbPairedBloomBitsBuilder::EstimatedFpRate( + size_t /*num_entries*/, size_t /*len_with_metadata*/) { + auto raw_num_probes = CalcRawNumProbes(millibits_per_key_); + + double adjusted_bits_per_key = CalcAdjustedBitsPerKey(millibits_per_key_); + return SpdbStandardFpRate(adjusted_bits_per_key, raw_num_probes); +} + +// Maps available_size (metadata included) to a supported filter size (metadata +// included): the data part is capped at kMaxSupportedSizeNoMetadata and rounded +// down to a whole number of batches, but never below a single batch. The result +// can therefore exceed available_size when available_size is smaller than one +// batch plus the metadata trailer. +size_t SpdbPairedBloomBitsBuilder::RoundDownUsableSpace(size_t available_size) { + // Guard the subtraction: size_t arithmetic would wrap around for an input + // smaller than the metadata trailer, and the wrapped value would then be + // clamped to the maximum supported size instead of the minimum. + size_t rv = + available_size > speedb_filter::FilterMetadata::kMetadataLen + ? available_size - speedb_filter::FilterMetadata::kMetadataLen + : 0; + + if (rv >= kMaxSupportedSizeNoMetadata) { + // Max supported for this data structure implementation + rv = kMaxSupportedSizeNoMetadata; + } + +
// Round down to a multiple of a batch, keeping at least one batch + rv = std::max((rv / kBatchSizeInBytes) * kBatchSizeInBytes, + kBatchSizeInBytes); + + return rv + speedb_filter::FilterMetadata::kMetadataLen; +} + +// Creates a reader through reader_create_func_; returns nullptr when no +// creation callback was supplied (asserted against in debug builds). +FilterBitsReader* SpdbPairedBloomBitsBuilder::GetBitsReader( + const Slice& filter_content) { + assert(reader_create_func_ != nullptr); + return reader_create_func_ ? reader_create_func_(filter_content) : nullptr; +} + +// Sizes the histogram to num_batches_ batches, charges that memory via +// AddCacheReservation() and records every block's pre-sort in-batch index. +void SpdbPairedBloomBitsBuilder::InitBlockHistogram() { + blocks_histogram_.resize(num_batches_); + AddCacheReservation(num_batches_ * + sizeof(decltype(blocks_histogram_)::value_type)); + + for (auto batch_idx = 0U; batch_idx < blocks_histogram_.size(); ++batch_idx) { + for (auto in_batch_block_idx = 0U; + in_batch_block_idx < blocks_histogram_[batch_idx].size(); + ++in_batch_block_idx) { + blocks_histogram_[batch_idx][in_batch_block_idx] + .original_in_batch_block_idx = in_batch_block_idx; + } + } +} + +// Counts, per block, the hash entries whose lower 32 bits map to that block. +void SpdbPairedBloomBitsBuilder::BuildBlocksHistogram(uint32_t data_len_bytes) { + for (const auto& hash : hash_entries_info_.entries) { + const uint32_t global_block_idx = + HashToGlobalBlockIdx(Lower32of64(hash), data_len_bytes); + const uint8_t in_batch_block_idx = GetInBatchBlockIdx(global_block_idx); + const uint32_t batch_idx = GetContainingBatchIdx(global_block_idx); + + ++blocks_histogram_[batch_idx][in_batch_block_idx].num_keys; + } +} + +// Stable-sorts the batch's histogram entries with BlockHistogramInfo's +// operator< (ascending num_keys). +void SpdbPairedBloomBitsBuilder::SortBatchBlocks(uint32_t batch_idx) { + assert(batch_idx < num_batches_); + BatchBlocksHistogram& batch_blocks_histrogram = blocks_histogram_[batch_idx]; + std::stable_sort(batch_blocks_histrogram.begin(), + batch_blocks_histrogram.end()); +} + +// Pairs the block at sorted position i with the block at sorted position +// (size - 1 - i) and records the pair and its hash-set selector in +// pairing_table_ under the block's original in-batch index. +void SpdbPairedBloomBitsBuilder::PairBatchBlocks(uint32_t batch_idx) { + assert(batch_idx < num_batches_); + BatchBlocksHistogram& batch_blocks_histrogram = blocks_histogram_[batch_idx]; + auto& batch_pairing_info = pairing_table_[batch_idx]; + + for (auto in_batch_block_idx = 0U; + in_batch_block_idx <
speedb_filter::kPairedBloomBatchSizeInBlocks; + ++in_batch_block_idx) { + const auto pair_in_batch_block_idx = + batch_blocks_histrogram.size() - in_batch_block_idx - 1; + auto original_in_batch_block_idx = + batch_blocks_histrogram[in_batch_block_idx].original_in_batch_block_idx; + + batch_pairing_info[original_in_batch_block_idx].pair_in_batch_block_idx = + batch_blocks_histrogram[pair_in_batch_block_idx] + .original_in_batch_block_idx; + batch_pairing_info[original_in_batch_block_idx].hash_set_selector = + GetHashSetSelector(original_in_batch_block_idx, + batch_blocks_histrogram[pair_in_batch_block_idx] + .original_in_batch_block_idx); + } +} + +void SpdbPairedBloomBitsBuilder::PairBlocks() { + for (auto batch_idx = 0U; batch_idx < num_batches_; ++batch_idx) { + SortBatchBlocks(batch_idx); + PairBatchBlocks(batch_idx); + } +} + +void SpdbPairedBloomBitsBuilder::SetBlocksPairs(char* data) { + for (auto batch_idx = 0U; batch_idx < pairing_table_.size(); ++batch_idx) { + for (auto in_batch_block_idx = 0U; + in_batch_block_idx < speedb_filter::kPairedBloomBatchSizeInBlocks; + ++in_batch_block_idx) { + uint32_t global_block_idx = + batch_idx * speedb_filter::kPairedBloomBatchSizeInBlocks + + in_batch_block_idx; + BuildBlock block(data, global_block_idx, false /* prefetch */); + const uint32_t pair_in_batch_block_idx = + pairing_table_[batch_idx][in_batch_block_idx].pair_in_batch_block_idx; + block.SetInBatchBlockIdxOfPair(pair_in_batch_block_idx); + } + } +} + +// +// Builds the blocks similarly to how RocksDB does it. +// The idea is to trigger block prefetching in batches, and to access the +// prefetched blocks in batches: entries are staged in a ring of 8 slots +// (kBufferMask plus one), and a slot's primary and secondary blocks are +// constructed with the prefetch argument set one full ring before their bits +// are set.
+void SpdbPairedBloomBitsBuilder::BuildBlocks(char* data, + uint32_t data_len_bytes) { + const size_t num_entries = hash_entries_info_.entries.size(); + constexpr size_t kBufferMask = 7; + static_assert(((kBufferMask + 1) & kBufferMask) == 0, + "Must be power of 2 minus 1"); + + constexpr auto kArraySize = kBufferMask + 1; + std::array primary_blocks; + std::array secondary_blocks; + std::array primary_hash_selectors; + std::array upper_32_bits_of_hashes; + + auto const hash_set_size = num_probes_ / 2; + + size_t i = 0; + std::deque::iterator hash_entries_it = + hash_entries_info_.entries.begin(); + + for (; i <= kBufferMask && i < num_entries; ++i) { + uint64_t hash = *hash_entries_it; + + // Primary Block + uint32_t primary_global_block_idx = + HashToGlobalBlockIdx(Lower32of64(hash), data_len_bytes); + new (&primary_blocks[i]) BuildBlock(data, primary_global_block_idx, true); + + const uint32_t batch_idx = GetContainingBatchIdx(primary_global_block_idx); + const uint8_t primary_in_batch_block_idx = + GetInBatchBlockIdx(primary_global_block_idx); + const uint32_t secondary_in_batch_block_idx = + pairing_table_[batch_idx][primary_in_batch_block_idx] + .pair_in_batch_block_idx; + + primary_hash_selectors[i] = GetHashSetSelector( + primary_in_batch_block_idx, secondary_in_batch_block_idx); + + const uint32_t secondary_global_block_idx = + GetFirstGlobalBlockIdxOfBatch(batch_idx) + secondary_in_batch_block_idx; + new (&secondary_blocks[i]) + BuildBlock(data, secondary_global_block_idx, true); + + upper_32_bits_of_hashes[i] = Upper32of64(hash); + ++hash_entries_it; + } + + // Process and buffer + for (; i < num_entries; ++i) { + auto idx = i & kBufferMask; + uint32_t& upper_32_bits_of_hash_ref = upper_32_bits_of_hashes[idx]; + auto& primary_block_ref = primary_blocks[idx]; + auto& secondary_block_ref = secondary_blocks[idx]; + auto& primary_hash_selector_ref = primary_hash_selectors[idx]; + + primary_block_ref.SetBlockBloomBits( + upper_32_bits_of_hash_ref, 
primary_hash_selector_ref, hash_set_size); + secondary_block_ref.SetBlockBloomBits(upper_32_bits_of_hash_ref, + 1 - primary_hash_selector_ref, + hash_set_size); + // And buffer + uint64_t hash = *hash_entries_it; + + // Primary Block + uint32_t primary_global_block_idx = + HashToGlobalBlockIdx(Lower32of64(hash), data_len_bytes); + new (&primary_block_ref) BuildBlock(data, primary_global_block_idx, true); + + const uint32_t batch_idx = GetContainingBatchIdx(primary_global_block_idx); + const uint8_t primary_in_batch_block_idx = + GetInBatchBlockIdx(primary_global_block_idx); + const uint32_t secondary_in_batch_block_idx = + pairing_table_[batch_idx][primary_in_batch_block_idx] + .pair_in_batch_block_idx; + primary_hash_selector_ref = GetHashSetSelector( + primary_in_batch_block_idx, secondary_in_batch_block_idx); + const uint32_t secondary_global_block_idx = + GetFirstGlobalBlockIdxOfBatch(batch_idx) + secondary_in_batch_block_idx; + new (&secondary_block_ref) + BuildBlock(data, secondary_global_block_idx, true); + + upper_32_bits_of_hash_ref = Upper32of64(hash); + ++hash_entries_it; + } + + // Finish processing + for (i = 0; i <= kBufferMask && i < num_entries; ++i) { + primary_blocks[i].SetBlockBloomBits( + upper_32_bits_of_hashes[i], primary_hash_selectors[i], hash_set_size); + secondary_blocks[i].SetBlockBloomBits(upper_32_bits_of_hashes[i], + 1 - primary_hash_selectors[i], + hash_set_size); + } +} + +void SpdbPairedBloomBitsBuilder::AddAllEntries(char* data, + uint32_t data_len_bytes) { + InitBlockHistogram(); + BuildBlocksHistogram(data_len_bytes); + PairBlocks(); + SetBlocksPairs(data); + BuildBlocks(data, data_len_bytes); + CleanupBuildData(); +} + +void SpdbPairedBloomBitsBuilder::CleanupBuildData() { + blocks_histogram_.clear(); + blocks_histogram_.shrink_to_fit(); + + pairing_table_.clear(); + pairing_table_.shrink_to_fit(); + + internal_cache_res_handles_.clear(); + internal_cache_res_handles_.shrink_to_fit(); +} + +void 
SpdbPairedBloomBitsBuilder::AddCacheReservation( + std::size_t incremental_memory_used) { + if (cache_res_mgr_) { + std::unique_ptr + filter_cache_res_handle; + Status s = cache_res_mgr_->MakeCacheReservation(incremental_memory_used, + &filter_cache_res_handle); + s.PermitUncheckedError(); + + internal_cache_res_handles_.push_back(std::move(filter_cache_res_handle)); + } +} + +// ======================================================================================================================= +// Checks a 64-bit key hash against the filter. The lower 32 bits select the +// primary block; the upper 32 bits are probed in the primary block and then in +// the block it is paired with, each with num_probes_ / 2 probes and with +// complementary hash-set selectors. Returns false as soon as the primary block +// misses; otherwise returns the result of the check on the paired block. +bool SpdbPairedBloomBitsReader::HashMayMatch(const uint64_t hash) { + uint32_t primary_global_block_idx = + HashToGlobalBlockIdx(Lower32of64(hash), data_len_bytes_); + // NOTE(review): both blocks below are constructed with the prefetch argument + // set to true, although this comment used to read "Not prefetching as + // performance seems to improve" - confirm which behavior is intended. + // TODO: Needs additional verification + ReadBlock primary_block(data_, primary_global_block_idx, true /* prefetch */); + + uint8_t primary_in_batch_block_idx = + GetInBatchBlockIdx(primary_global_block_idx); + uint8_t secondary_in_batch_block_idx = + primary_block.GetInBatchBlockIdxOfPair(); + auto primary_block_hash_selector = GetHashSetSelector( + primary_in_batch_block_idx, secondary_in_batch_block_idx); + + auto const hash_set_size = num_probes_ / 2; + + const uint32_t upper_32_bits_of_hash = Upper32of64(hash); + if (primary_block.AreAllBlockBloomBitsSet(upper_32_bits_of_hash, + primary_block_hash_selector, + hash_set_size) == false) { + return false; + } + + uint32_t secondary_block_hash_selector = 1 - primary_block_hash_selector; + uint32_t batch_idx = GetContainingBatchIdx(primary_global_block_idx); + uint32_t secondary_global_block_idx = + GetFirstGlobalBlockIdxOfBatch(batch_idx) + secondary_in_batch_block_idx; + + ReadBlock secondary_block(data_, secondary_global_block_idx, + true /* prefetch */); + return secondary_block.AreAllBlockBloomBitsSet( + upper_32_bits_of_hash, secondary_block_hash_selector, hash_set_size); +} + +bool SpdbPairedBloomBitsReader::MayMatch(const Slice& key) { + uint64_t hash = GetSliceHash64(key); + return
HashMayMatch(hash); +} + +// TODO: Copy RocksDB's approach for multi-keys to improve performance +// (prefetch blocks) +void SpdbPairedBloomBitsReader::MayMatch(int num_keys, Slice** keys, + bool* may_match) { + for (auto i = 0; i < num_keys; ++i) { + may_match[i] = MayMatch(*keys[i]); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/plugin/speedb/paired_filter/speedb_paired_bloom_internal.h b/plugin/speedb/paired_filter/speedb_paired_bloom_internal.h new file mode 100644 index 0000000000..2a260a9027 --- /dev/null +++ b/plugin/speedb/paired_filter/speedb_paired_bloom_internal.h @@ -0,0 +1,188 @@ +#pragma once + +#include +#include +#include + +#include "table/block_based/filter_policy_internal.h" + +namespace ROCKSDB_NAMESPACE { + +namespace speedb_filter { +inline constexpr size_t kPairedBloomBatchSizeInBlocks = 128U; +// Bits-per-key (BPK) limit, in millibits (thousandths of a bit) per key. +// NOTE(review): this comment used to read "Max supported BPK. Filters using +// higher BPK-s will use the max", but the constant is named +// kMinMillibitsPerKey - confirm whether it is a lower or an upper bound. +inline constexpr int kMinMillibitsPerKey = 1000.0; + +// Types of proprietary Speedb's filters +enum class FilterType : uint8_t { + kPairedBlockBloom = 1, + kFutureUnknown = 0xFF, // Used to indicate an unrecognized filter type from a + // future version +}; + +// Bloom Filter's data provided by Speedb: +// 0 |-----------------------------------| +// | Raw Paired Bloom filter data | +// | ... | +// len |-----------------------------------| +// | byte for Spdb filter type | +// | 1: SpdbPairedBloom | +// | other: reserved | +// len+1 |-----------------------------------| +// | byte for block_and_probes | +// | 0 in top 3 bits -> 6 -> 64-byte | +// | reserved: | +// | 1 in top 3 bits -> 7 -> 128-byte| +// | 2 in top 3 bits -> 8 -> 256-byte| +// | ... | +// | num_probes in bottom 5 bits, | +// | except 0 and 31 reserved | +// len+2 |-----------------------------------| +// | two bytes reserved | +// | possibly for hash seed | +// len_with_meta |-----------------------------------| +class FilterMetadata { + public: + // Metadata trailer size for Speedb's filters.
(This is separate from + // block-based table block trailer). Starting at len in the diagram above + static constexpr uint32_t kMetadataLen = 4U; + + struct Fields { + size_t num_probes; + FilterType filter_type; + }; + + public: + static void WriteMetadata(char* metadata, size_t len, const Fields& fields); + static Fields ReadMetadata(const char* metadata); +}; + +} // namespace speedb_filter + +// =========================================================================================================== +class SpdbPairedBloomBitsBuilder : public XXPH3FilterBitsBuilder { + public: + // Callback function to create a compatible reader. This is needed when + // performing post-verify during filter construction / filter block writing + // (See BlockBasedTableBuilder::WriteRawBlock() + using FilterBitsReaderCreateFunc = + std::function; + + public: + // Non-null aggregate_rounding_balance implies optimize_filters_for_memory + explicit SpdbPairedBloomBitsBuilder( + const int millibits_per_key, + std::atomic* aggregate_rounding_balance, + const std::shared_ptr& cache_res_mgr, + bool detect_filter_construct_corruption, + const FilterBitsReaderCreateFunc& reader_create_func); + + ~SpdbPairedBloomBitsBuilder() override {} + + // No Copy allowed + SpdbPairedBloomBitsBuilder(const SpdbPairedBloomBitsBuilder&) = delete; + void operator=(const SpdbPairedBloomBitsBuilder&) = delete; + + protected: + size_t RoundDownUsableSpace(size_t available_size) override; + + FilterBitsReader* GetBitsReader(const Slice& filter_content) override; + + private: + // Stores the per-block information used to sort and pair blocks in the + // algorithm + struct BlockHistogramInfo { + // Number of keys mapped to this block + uint16_t num_keys = 0U; + + // Records the original in-batch block idx of the block before sorting + uint8_t original_in_batch_block_idx = std::numeric_limits::max(); + + // Allows block to be sorted using std sorting algorithms + bool operator<(const BlockHistogramInfo& other) 
const { + return (num_keys < other.num_keys); + } + }; + + // Records the info about a block's pair in the batch + struct PairingInfo { + uint8_t pair_in_batch_block_idx; + uint8_t hash_set_selector; + }; + + using BatchBlocksHistogram = + std::array; + using BatchPairingInfo = + std::array; + + public: + Slice Finish(std::unique_ptr* buf) override { + return Finish(buf, nullptr); + } + + Slice Finish(std::unique_ptr* buf, Status* status) override; + + size_t ApproximateNumEntries(size_t len_with_metadata) override; + size_t CalculateSpace(size_t num_entries) override; + double EstimatedFpRate(size_t /*num_entries*/, + size_t /*len_with_metadata*/) override; + + private: + size_t GetNumProbes(); + + void InitVars(uint64_t len_no_metadata); + void InitBlockHistogram(); + void BuildBlocksHistogram(uint32_t data_len_bytes); + void SortBatchBlocks(uint32_t batch_idx); + void PairBatchBlocks(uint32_t batch_idx); + void PairBlocks(); + void SetBlocksPairs(char* data); + void BuildBlocks(char* data, uint32_t data_len_bytes); + void CleanupBuildData(); + + void AddAllEntries(char* data, uint32_t data_len_bytes); + + void AddCacheReservation(std::size_t incremental_memory_used); + + private: + // Target allocation per added key, in thousandths of a bit. 
+ int millibits_per_key_; + + size_t num_blocks_ = 0U; + size_t num_batches_ = 0U; + size_t num_probes_ = 0U; + + std::vector blocks_histogram_; + std::vector pairing_table_; + + // For managing cache reservations needed for the building of the filter + std::vector> + internal_cache_res_handles_; + + FilterBitsReaderCreateFunc reader_create_func_; +}; + +class SpdbPairedBloomBitsReader : public BuiltinFilterBitsReader { + public: + SpdbPairedBloomBitsReader(const char* data, size_t num_probes, + uint32_t data_len_bytes) + : data_(data), num_probes_(num_probes), data_len_bytes_(data_len_bytes) {} + + ~SpdbPairedBloomBitsReader() override {} + + // No Copy allowed + SpdbPairedBloomBitsReader(const SpdbPairedBloomBitsReader&) = delete; + void operator=(const SpdbPairedBloomBitsReader&) = delete; + + bool HashMayMatch(const uint64_t /*hash*/) override; + bool MayMatch(const Slice& key) override; + void MayMatch(int num_keys, Slice** keys, bool* may_match) override; + + private: + const char* data_; + const size_t num_probes_; + const uint32_t data_len_bytes_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/plugin/speedb/speedb.mk b/plugin/speedb/speedb.mk index ed5b64a757..a19cfd6967 100644 --- a/plugin/speedb/speedb.mk +++ b/plugin/speedb/speedb.mk @@ -12,9 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-speedb_SOURCES = \ - speedb_registry.cc \ - memtable/hash_spd_rep.cc \ +speedb_SOURCES = \ + speedb_registry.cc \ + memtable/hash_spd_rep.cc \ + paired_filter/speedb_paired_bloom.cc \ + paired_filter/speedb_paired_bloom_internal.cc \ speedb_FUNC = register_SpeedbPlugins + +speedb_HEADERS = \ + paired_filter/speedb_paired_bloom.h \ + +speedb_TESTS = \ + speedb_customizable_test.cc \ + paired_filter/speedb_db_bloom_filter_test.cc \ diff --git a/plugin/speedb/speedb_customizable_test.cc b/plugin/speedb/speedb_customizable_test.cc new file mode 100644 index 0000000000..ca63e188c0 --- /dev/null +++ b/plugin/speedb/speedb_customizable_test.cc @@ -0,0 +1,105 @@ +// TODO: ADD Speedb's Copyright Notice !!!!! + +#include +#include +#include +#include + +#include "db/db_test_util.h" +#include "plugin/speedb/paired_filter/speedb_paired_bloom.h" +#include "port/stack_trace.h" +#include "rocksdb/customizable.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" +#include "table/block_based/filter_policy_internal.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +#ifdef GFLAGS +#include "util/gflags_compat.h" +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +DEFINE_bool(enable_print, false, "Print options generated to console."); +#endif // GFLAGS + +namespace ROCKSDB_NAMESPACE { + +class LoadCustomizableTest : public testing::Test { + public: + LoadCustomizableTest() { + config_options_.ignore_unsupported_options = false; + config_options_.invoke_prepare_options = false; + } + bool RegisterTests(const std::string& arg) { + (void)arg; + return false; + } + + protected: + DBOptions db_opts_; + ColumnFamilyOptions cf_opts_; + ConfigOptions config_options_; +}; + +// ========================================================================================== +TEST_F(LoadCustomizableTest, LoadSpdbPairedFilterPolicyTest) { + std::shared_ptr table; + std::shared_ptr result; + 
ASSERT_NOK(FilterPolicy::CreateFromString( + config_options_, SpdbPairedBloomFilterPolicy::kClassName(), &result)); + + ASSERT_OK(FilterPolicy::CreateFromString(config_options_, "", &result)); + ASSERT_EQ(result, nullptr); + ASSERT_OK(FilterPolicy::CreateFromString( + config_options_, ReadOnlyBuiltinFilterPolicy::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), ReadOnlyBuiltinFilterPolicy::kClassName()); + +#ifndef ROCKSDB_LITE + std::string table_opts = "id=BlockBasedTable; filter_policy="; + ASSERT_OK(TableFactory::CreateFromString(config_options_, + table_opts + "nullptr", &table)); + ASSERT_NE(table.get(), nullptr); + auto bbto = table->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_EQ(bbto->filter_policy.get(), nullptr); + ASSERT_OK(TableFactory::CreateFromString( + config_options_, table_opts + ReadOnlyBuiltinFilterPolicy::kClassName(), + &table)); + bbto = table->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_NE(bbto->filter_policy.get(), nullptr); + ASSERT_STREQ(bbto->filter_policy->Name(), + ReadOnlyBuiltinFilterPolicy::kClassName()); + ASSERT_OK(TableFactory::CreateFromString( + config_options_, table_opts + SpdbPairedBloomFilterPolicy::kClassName(), + &table)); + bbto = table->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_EQ(bbto->filter_policy.get(), nullptr); + if (RegisterTests("Test")) { + ASSERT_OK(FilterPolicy::CreateFromString( + config_options_, SpdbPairedBloomFilterPolicy::kClassName(), &result)); + ASSERT_NE(result, nullptr); + ASSERT_STREQ(result->Name(), SpdbPairedBloomFilterPolicy::kClassName()); + ASSERT_OK(TableFactory::CreateFromString( + config_options_, table_opts + SpdbPairedBloomFilterPolicy::kClassName(), + &table)); + bbto = table->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_NE(bbto->filter_policy.get(), nullptr); + ASSERT_STREQ(bbto->filter_policy->Name(), + SpdbPairedBloomFilterPolicy::kClassName()); + } +#endif // ROCKSDB_LITE +} + +} // namespace ROCKSDB_NAMESPACE +int 
main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); +#ifdef GFLAGS + ParseCommandLineFlags(&argc, &argv, true); +#endif // GFLAGS + return RUN_ALL_TESTS(); +} diff --git a/plugin/speedb/speedb_registry.cc b/plugin/speedb/speedb_registry.cc index 9742bc5f7a..fe4436d8f4 100644 --- a/plugin/speedb/speedb_registry.cc +++ b/plugin/speedb/speedb_registry.cc @@ -14,6 +14,7 @@ #include "plugin/speedb/speedb_registry.h" +#include "paired_filter/speedb_paired_bloom.h" #include "plugin/speedb/memtable/hash_spd_rep.h" #include "rocksdb/utilities/object_registry.h" #include "util/string_util.h" @@ -21,6 +22,14 @@ namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE +// Similar to the NewBuiltinFilterPolicyWithBits template for RocksDB built-in +// filters +SpdbPairedBloomFilterPolicy* NewSpdbPairedBloomFilterWithBits( + const std::string& uri) { + return new SpdbPairedBloomFilterPolicy( + FilterPolicy::ExtractBitsPerKeyFromUri(uri)); +} + int register_SpeedbPlugins(ObjectLibrary& library, const std::string&) { library.AddFactory( ObjectLibrary::PatternEntry(HashSpdRepFactory::kClassName(), true) @@ -36,6 +45,18 @@ int register_SpeedbPlugins(ObjectLibrary& library, const std::string&) { } return guard->get(); }); + + library.AddFactory( + ObjectLibrary::PatternEntry(SpdbPairedBloomFilterPolicy::kClassName(), + false) + .AnotherName(SpdbPairedBloomFilterPolicy::kNickName()) + .AddNumber(":", false), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(NewSpdbPairedBloomFilterWithBits(uri)); + return guard->get(); + }); + size_t num_types; return static_cast(library.GetFactoryCount(&num_types)); } diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc index 19b880a900..aa672597fb 100644 --- a/table/block_based/filter_policy.cc +++ b/table/block_based/filter_policy.cc @@ -13,12 +13,10 @@ #include #include #include -#include #include 
#include #include "cache/cache_entry_roles.h" -#include "cache/cache_reservation_manager.h" #include "logging/logging.h" #include "port/lang.h" #include "rocksdb/convenience.h" @@ -55,83 +53,67 @@ Slice FinishAlwaysFalse(std::unique_ptr* /*buf*/) { return Slice(nullptr, 0); } -Slice FinishAlwaysTrue(std::unique_ptr* /*buf*/) { - return Slice("\0\0\0\0\0\0", 6); -} +} // namespace + +// Number of hash entries to accumulate before charging their memory usage to +// the cache when cache reservation is available +const std::size_t XXPH3FilterBitsBuilder::kUint64tHashEntryCacheResBucketSize = + CacheReservationManagerImpl< + CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() / + sizeof(uint64_t); // Base class for filter builders using the XXH3 preview hash, // also known as Hash64 or GetSliceHash64. -class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { - public: - explicit XXPH3FilterBitsBuilder( - std::atomic* aggregate_rounding_balance, - std::shared_ptr cache_res_mgr, - bool detect_filter_construct_corruption) - : aggregate_rounding_balance_(aggregate_rounding_balance), - cache_res_mgr_(cache_res_mgr), - detect_filter_construct_corruption_( - detect_filter_construct_corruption) {} - - ~XXPH3FilterBitsBuilder() override {} - - virtual void AddKey(const Slice& key) override { - uint64_t hash = GetSliceHash64(key); - // Especially with prefixes, it is common to have repetition, - // though only adjacent repetition, which we want to immediately - // recognize and collapse for estimating true filter space - // requirements. 
- if (hash_entries_info_.entries.empty() || - hash != hash_entries_info_.entries.back()) { - if (detect_filter_construct_corruption_) { - hash_entries_info_.xor_checksum ^= hash; - } - hash_entries_info_.entries.push_back(hash); - if (cache_res_mgr_ && - // Traditional rounding to whole bucket size - ((hash_entries_info_.entries.size() % - kUint64tHashEntryCacheResBucketSize) == - kUint64tHashEntryCacheResBucketSize / 2)) { - hash_entries_info_.cache_res_bucket_handles.emplace_back(nullptr); - Status s = cache_res_mgr_->MakeCacheReservation( - kUint64tHashEntryCacheResBucketSize * sizeof(hash), - &hash_entries_info_.cache_res_bucket_handles.back()); - s.PermitUncheckedError(); - } +XXPH3FilterBitsBuilder::XXPH3FilterBitsBuilder( + std::atomic* aggregate_rounding_balance, + std::shared_ptr cache_res_mgr, + bool detect_filter_construct_corruption) + : aggregate_rounding_balance_(aggregate_rounding_balance), + cache_res_mgr_(cache_res_mgr), + detect_filter_construct_corruption_(detect_filter_construct_corruption) {} + +void XXPH3FilterBitsBuilder::AddKey(const Slice& key) { + uint64_t hash = GetSliceHash64(key); + // Especially with prefixes, it is common to have repetition, + // though only adjacent repetition, which we want to immediately + // recognize and collapse for estimating true filter space + // requirements. 
+ if (hash_entries_info_.entries.empty() || + hash != hash_entries_info_.entries.back()) { + if (detect_filter_construct_corruption_) { + hash_entries_info_.xor_checksum ^= hash; + } + hash_entries_info_.entries.push_back(hash); + if (cache_res_mgr_ && + // Traditional rounding to whole bucket size + ((hash_entries_info_.entries.size() % + kUint64tHashEntryCacheResBucketSize) == + kUint64tHashEntryCacheResBucketSize / 2)) { + hash_entries_info_.cache_res_bucket_handles.emplace_back(nullptr); + Status s = cache_res_mgr_->MakeCacheReservation( + kUint64tHashEntryCacheResBucketSize * sizeof(hash), + &hash_entries_info_.cache_res_bucket_handles.back()); + s.PermitUncheckedError(); } } +} - virtual size_t EstimateEntriesAdded() override { - return hash_entries_info_.entries.size(); - } - - virtual Status MaybePostVerify(const Slice& filter_content) override; - - protected: - static constexpr uint32_t kMetadataLen = 5; - - // Number of hash entries to accumulate before charging their memory usage to - // the cache when cache charging is available - static const std::size_t kUint64tHashEntryCacheResBucketSize = - CacheReservationManagerImpl< - CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() / - sizeof(uint64_t); +size_t XXPH3FilterBitsBuilder::EstimateEntriesAdded() { + return hash_entries_info_.entries.size(); +} // For delegating between XXPH3FilterBitsBuilders - void SwapEntriesWith(XXPH3FilterBitsBuilder* other) { - assert(other != nullptr); - hash_entries_info_.Swap(&(other->hash_entries_info_)); - } - - void ResetEntries() { hash_entries_info_.Reset(); } - - virtual size_t RoundDownUsableSpace(size_t available_size) = 0; +void XXPH3FilterBitsBuilder::SwapEntriesWith(XXPH3FilterBitsBuilder* other) { + assert(other != nullptr); + hash_entries_info_.Swap(&(other->hash_entries_info_)); +} // To choose size using malloc_usable_size, we have to actually allocate. 
- size_t AllocateMaybeRounding(size_t target_len_with_metadata, - size_t num_entries, - std::unique_ptr* buf) { - // Return value set to a default; overwritten in some cases - size_t rv = target_len_with_metadata; +size_t XXPH3FilterBitsBuilder::AllocateMaybeRounding( + size_t target_len_with_metadata, size_t num_entries, + std::unique_ptr* buf) { + // Return value set to a default; overwritten in some cases + size_t rv = target_len_with_metadata; #ifdef ROCKSDB_MALLOC_USABLE_SIZE if (aggregate_rounding_balance_ != nullptr) { // Do optimize_filters_for_memory, using malloc_usable_size. @@ -222,7 +204,7 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { buf->reset(new char[rv]()); #endif // ROCKSDB_MALLOC_USABLE_SIZE return rv; - } +} // TODO: Ideally we want to verify the hash entry // as it is added to the filter and eliminate this function @@ -231,73 +213,25 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { // Possible solution: // pass a custom iterator that tracks the xor checksum as // it iterates to ResetAndFindSeedToSolve - Status MaybeVerifyHashEntriesChecksum() { - if (!detect_filter_construct_corruption_) { - return Status::OK(); - } - - uint64_t actual_hash_entries_xor_checksum = 0; - for (uint64_t h : hash_entries_info_.entries) { - actual_hash_entries_xor_checksum ^= h; - } - - if (actual_hash_entries_xor_checksum == hash_entries_info_.xor_checksum) { - return Status::OK(); - } else { - // Since these hash entries are corrupted and they will not be used - // anymore, we can reset them and release memory. - ResetEntries(); - return Status::Corruption("Filter's hash entries checksum mismatched"); - } +Status XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum() { + if (!detect_filter_construct_corruption_) { + return Status::OK(); } - // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr, - // always "round up" like historic behavior. 
- std::atomic* aggregate_rounding_balance_; - - // For reserving memory used in (new) Bloom and Ribbon Filter construction - std::shared_ptr cache_res_mgr_; - - // For managing cache charge for final filter in (new) Bloom and Ribbon - // Filter construction - std::deque> - final_filter_cache_res_handles_; - - bool detect_filter_construct_corruption_; - - struct HashEntriesInfo { - // A deque avoids unnecessary copying of already-saved values - // and has near-minimal peak memory use. - std::deque entries; - - // If cache_res_mgr_ != nullptr, - // it manages cache charge for buckets of hash entries in (new) Bloom - // or Ribbon Filter construction. - // Otherwise, it is empty. - std::deque> - cache_res_bucket_handles; - - // If detect_filter_construct_corruption_ == true, - // it records the xor checksum of hash entries. - // Otherwise, it is 0. - uint64_t xor_checksum = 0; - - void Swap(HashEntriesInfo* other) { - assert(other != nullptr); - std::swap(entries, other->entries); - std::swap(cache_res_bucket_handles, other->cache_res_bucket_handles); - std::swap(xor_checksum, other->xor_checksum); - } - - void Reset() { - entries.clear(); - cache_res_bucket_handles.clear(); - xor_checksum = 0; - } - }; + uint64_t actual_hash_entries_xor_checksum = 0; + for (uint64_t h : hash_entries_info_.entries) { + actual_hash_entries_xor_checksum ^= h; + } - HashEntriesInfo hash_entries_info_; -}; + if (actual_hash_entries_xor_checksum == hash_entries_info_.xor_checksum) { + return Status::OK(); + } else { + // Since these hash entries are corrupted and they will not be used + // anymore, we can reset them and release memory. 
+ ResetEntries(); + return Status::Corruption("Filter's hash entries checksum mismatched"); + } +} // #################### FastLocalBloom implementation ################## // // ############## also known as format_version=5 Bloom filter ########## // @@ -1261,21 +1195,10 @@ class LegacyBloomBitsReader : public BuiltinFilterBitsReader { const uint32_t log2_cache_line_size_; }; -class AlwaysTrueFilter : public BuiltinFilterBitsReader { - public: - bool MayMatch(const Slice&) override { return true; } - using FilterBitsReader::MayMatch; // inherit overload - bool HashMayMatch(const uint64_t) override { return true; } - using BuiltinFilterBitsReader::HashMayMatch; // inherit overload -}; - -class AlwaysFalseFilter : public BuiltinFilterBitsReader { - public: - bool MayMatch(const Slice&) override { return false; } - using FilterBitsReader::MayMatch; // inherit overload - bool HashMayMatch(const uint64_t) override { return false; } - using BuiltinFilterBitsReader::HashMayMatch; // inherit overload -}; +FilterBitsReader* XXPH3FilterBitsBuilder::GetBitsReader( + const Slice& filter_content) { + return BuiltinFilterPolicy::GetBuiltinFilterBitsReader(filter_content); +} Status XXPH3FilterBitsBuilder::MaybePostVerify(const Slice& filter_content) { Status s = Status::OK(); @@ -1284,8 +1207,7 @@ Status XXPH3FilterBitsBuilder::MaybePostVerify(const Slice& filter_content) { return s; } - std::unique_ptr bits_reader( - BuiltinFilterPolicy::GetBuiltinFilterBitsReader(filter_content)); + std::unique_ptr bits_reader(GetBitsReader(filter_content)); for (uint64_t h : hash_entries_info_.entries) { // The current approach will not detect corruption from XXPH3Filter to @@ -1302,7 +1224,6 @@ Status XXPH3FilterBitsBuilder::MaybePostVerify(const Slice& filter_content) { ResetEntries(); return s; } -} // namespace const char* BuiltinFilterPolicy::kClassName() { return "rocksdb.internal.BuiltinFilter"; @@ -1377,7 +1298,7 @@ const char* ReadOnlyBuiltinFilterPolicy::kClassName() { } std::string 
BloomLikeFilterPolicy::GetId() const { - return Name() + GetBitsPerKeySuffix(); + return Name() + GetBitsPerKeySuffix(millibits_per_key_); } BloomFilterPolicy::BloomFilterPolicy(double bits_per_key) @@ -1480,9 +1401,9 @@ BloomLikeFilterPolicy::GetStandard128RibbonBuilderWithContext( context.info_log); } -std::string BloomLikeFilterPolicy::GetBitsPerKeySuffix() const { - std::string rv = ":" + std::to_string(millibits_per_key_ / 1000); - int frac = millibits_per_key_ % 1000; +std::string BloomLikeFilterPolicy::GetBitsPerKeySuffix(int millibits_per_key) { + std::string rv = ":" + std::to_string(millibits_per_key / 1000); + int frac = millibits_per_key % 1000; if (frac > 0) { rv.push_back('.'); rv.push_back(static_cast('0' + (frac / 100))); @@ -1835,9 +1756,7 @@ static ObjectLibrary::PatternEntry FilterPatternEntryWithBits( template T* NewBuiltinFilterPolicyWithBits(const std::string& uri) { - const std::vector vals = StringSplit(uri, ':'); - double bits_per_key = ParseDouble(vals[1]); - return new T(bits_per_key); + return new T(FilterPolicy::ExtractBitsPerKeyFromUri(uri)); } static int RegisterBuiltinFilterPolicies(ObjectLibrary& library, const std::string& /*arg*/) { @@ -1936,6 +1855,11 @@ static int RegisterBuiltinFilterPolicies(ObjectLibrary& library, } } // namespace +double FilterPolicy::ExtractBitsPerKeyFromUri(const std::string& uri) { + const std::vector vals = StringSplit(uri, ':'); + return ParseDouble(vals.size() > 1 ? vals[1] : std::string()); +} + Status FilterPolicy::CreateFromString( const ConfigOptions& options, const std::string& value, std::shared_ptr* policy) { @@ -1981,4 +1905,14 @@ const std::vector& BloomLikeFilterPolicy::GetAllFixedImpls() { return impls; } +int BloomLikeFilterPolicy::GetAllFixedImplIndex(const std::string& name) { + const auto& all_names = GetAllFixedImpls(); + for (size_t idx = 0; idx < all_names.size(); idx++) { + if (name == all_names[idx]) { + return static_cast(idx); + } + } + return -1; +} + } // namespace ROCKSDB_NAMESPACE diff --git
a/table/block_based/filter_policy_internal.h b/table/block_based/filter_policy_internal.h index 3919c8c6d2..f5615373d9 100644 --- a/table/block_based/filter_policy_internal.h +++ b/table/block_based/filter_policy_internal.h @@ -9,10 +9,12 @@ #pragma once #include +#include #include #include #include +#include "cache/cache_reservation_manager.h" #include "rocksdb/filter_policy.h" #include "rocksdb/table.h" @@ -95,6 +97,8 @@ class FilterBitsReader { may_match[i] = MayMatch(*keys[i]); } } + + virtual bool HashMayMatch(const uint64_t /* h */) = 0; }; // Exposes any extra information needed for testing built-in @@ -115,12 +119,102 @@ class BuiltinFilterBitsBuilder : public FilterBitsBuilder { virtual double EstimatedFpRate(size_t num_entries, size_t bytes) = 0; }; +class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder { + public: + explicit XXPH3FilterBitsBuilder( + std::atomic* aggregate_rounding_balance, + std::shared_ptr cache_res_mgr, + bool detect_filter_construct_corruption); + + ~XXPH3FilterBitsBuilder() override {} + + virtual void AddKey(const Slice& key) override; + virtual size_t EstimateEntriesAdded() override; + virtual Status MaybePostVerify(const Slice& filter_content) override; + + protected: + static constexpr uint32_t kMetadataLen = 5; + + // Number of hash entries to accumulate before charging their memory usage to + // the cache when cache reservation is available + static const std::size_t kUint64tHashEntryCacheResBucketSize; + + // For delegating between XXPH3FilterBitsBuilders + void SwapEntriesWith(XXPH3FilterBitsBuilder* other); + void ResetEntries() { hash_entries_info_.Reset(); } + + virtual size_t RoundDownUsableSpace(size_t available_size) = 0; + + // To choose size using malloc_usable_size, we have to actually allocate. 
+ size_t AllocateMaybeRounding(size_t target_len_with_metadata, + size_t num_entries, + std::unique_ptr* buf); + + // TODO: Ideally we want to verify the hash entry + // as it is added to the filter and eliminate this function + // for speeding up and leaving fewer spaces for undetected memory/CPU + // corruption. For Ribbon Filter, it's a bit harder. + // Possible solution: + // pass a custom iterator that tracks the xor checksum as + // it iterates to ResetAndFindSeedToSolve + Status MaybeVerifyHashEntriesChecksum(); + + virtual FilterBitsReader* GetBitsReader(const Slice& filter_content); + + // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr, + // always "round up" like historic behavior. + std::atomic* aggregate_rounding_balance_; + + // For reserving memory used in (new) Bloom and Ribbon Filter construction + std::shared_ptr cache_res_mgr_; + + // For managing cache reservation for final filter in (new) Bloom and Ribbon + // Filter construction + std::deque> + final_filter_cache_res_handles_; + + bool detect_filter_construct_corruption_; + + struct HashEntriesInfo { + // A deque avoids unnecessary copying of already-saved values + // and has near-minimal peak memory use. + std::deque entries; + + // If cache_res_mgr_ != nullptr, + // it manages cache reservation for buckets of hash entries in (new) Bloom + // or Ribbon Filter construction. + // Otherwise, it is empty. + std::deque> + cache_res_bucket_handles; + + // If detect_filter_construct_corruption_ == true, + // it records the xor checksum of hash entries. + // Otherwise, it is 0.
+ uint64_t xor_checksum = 0; + + void Swap(HashEntriesInfo* other) { + assert(other != nullptr); + std::swap(entries, other->entries); + std::swap(cache_res_bucket_handles, other->cache_res_bucket_handles); + std::swap(xor_checksum, other->xor_checksum); + } + + void Reset() { + entries.clear(); + cache_res_bucket_handles.clear(); + xor_checksum = 0; + } + }; + + HashEntriesInfo hash_entries_info_; +}; + // Base class for RocksDB built-in filter reader with // extra useful functionalities for inernal. class BuiltinFilterBitsReader : public FilterBitsReader { public: // Check if the hash of the entry match the bits in filter - virtual bool HashMayMatch(const uint64_t /* h */) { return true; } + bool HashMayMatch(const uint64_t /* h */) override { return true; } }; // Base class for RocksDB built-in filter policies. This provides the @@ -191,6 +285,8 @@ class BloomLikeFilterPolicy : public BuiltinFilterPolicy { std::string GetId() const override; + static std::string GetBitsPerKeySuffix(int millibits_per_key); + // Essentially for testing only: configured millibits/key int GetMillibitsPerKey() const { return millibits_per_key_; } // Essentially for testing only: legacy whole bits/key @@ -201,6 +297,9 @@ class BloomLikeFilterPolicy : public BuiltinFilterPolicy { // "always use this implementation." Only appropriate for unit tests. static const std::vector& GetAllFixedImpls(); + // Returns the index in GetAllFixedImpls of "name" if found, -1 if not + static int GetAllFixedImplIndex(const std::string& name); + // Convenience function for creating by name for fixed impls static std::shared_ptr Create(const std::string& name, double bits_per_key); @@ -214,8 +313,6 @@ class BloomLikeFilterPolicy : public BuiltinFilterPolicy { FilterBitsBuilder* GetStandard128RibbonBuilderWithContext( const FilterBuildingContext& context) const; - std::string GetBitsPerKeySuffix() const; - private: // Bits per key settings are for configuring Bloom filters. 
@@ -297,6 +394,26 @@ class RibbonFilterPolicy : public BloomLikeFilterPolicy { std::atomic bloom_before_level_; }; +class AlwaysTrueFilter : public BuiltinFilterBitsReader { + public: + bool MayMatch(const Slice&) override { return true; } + using FilterBitsReader::MayMatch; // inherit overload + bool HashMayMatch(const uint64_t) override { return true; } + using BuiltinFilterBitsReader::HashMayMatch; // inherit overload +}; + +class AlwaysFalseFilter : public BuiltinFilterBitsReader { + public: + bool MayMatch(const Slice&) override { return false; } + using FilterBitsReader::MayMatch; // inherit overload + bool HashMayMatch(const uint64_t) override { return false; } + using BuiltinFilterBitsReader::HashMayMatch; // inherit overload +}; + +inline Slice FinishAlwaysTrue(std::unique_ptr* /*buf*/) { + return Slice("\0\0\0\0\0\0", 6); +} + // For testing only, but always constructable with internal names namespace test { diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index 0268b7b271..a47ee68add 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -67,6 +67,10 @@ class TestFilterBitsReader : public FilterBitsReader { using FilterBitsReader::MayMatch; bool MayMatch(const Slice& entry) override { uint32_t h = Hash(entry.data(), entry.size(), 1); + return HashMayMatch(h); + } + + bool HashMayMatch(const uint64_t h) override { for (size_t i = 0; i + 4 <= len_; i += 4) { if (h == DecodeFixed32(data_ + i)) { return true; diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 5b796999da..9173c86bb1 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -239,6 +239,7 @@ "num_iterations": lambda: random.randint(0, 100), "sync_wal_one_in": 100000, "customopspercent": 0, + "filter_uri": lambda: random.choice(["speedb.PairedBloomFilter", ""]), } _TEST_DIR_ENV_VAR = "TEST_TMPDIR" @@ -839,6 +840,11 @@ def finalize_and_sanitize(src_params, counter): 
if (dest_params["cache_size"] <= 0 or dest_params["db_write_buffer_size"] <= 0): dest_params["use_write_buffer_manager"] = 0 + + # make sure bloom_bits is not 0 when filter_uri is used since it fails in CreateFilterPolicy. + if dest_params.get("filter_uri") != "": + dest_params["bloom_bits"] = random.choice([random.randint(1,19), + random.lognormvariate(2.3, 1.3)]) return dest_params diff --git a/util/bloom_impl.h b/util/bloom_impl.h index c9bbb125b8..7178e72b8e 100644 --- a/util/bloom_impl.h +++ b/util/bloom_impl.h @@ -11,6 +11,7 @@ #include #include +#include #include #include "port/port.h" // for PREFETCH @@ -24,6 +25,18 @@ namespace ROCKSDB_NAMESPACE { class BloomMath { + public: + // Powers of 32-bit golden ratio, mod 2**32. + static constexpr size_t kNumGoldenRatioPowers = 30U; + static constexpr std::array + GoldenRatioPowers{ + 0x00000001, 0x9e3779b9, 0xe35e67b1, 0x734297e9, 0x35fbe861, + 0xdeb7c719, 0x0448b211, 0x3459b749, 0xab25f4c1, 0x52941879, + 0x9c95e071, 0xf5ab9aa9, 0x2d6ba521, 0x8bededd9, 0x9bfb72d1, + 0x3ae1c209, 0x7fca7981, 0xc576c739, 0xd23ee931, 0x0335ad69, + 0xc04ff1e1, 0x98702499, 0x7535c391, 0x9f70dcc9, 0x0e198e41, + 0xf2ab85f9, 0xe6c581f1, 0xc7ecd029, 0x6f54cea1, 0x4c8a6b59}; + public: // False positive rate of a standard Bloom filter, for given ratio of // filter memory bits to added keys, and number of probes per operation. @@ -228,6 +241,105 @@ class FastLocalBloomImpl { return HashMayMatchPrepared(h2, num_probes, data + bytes_to_cache_line); } +#ifdef HAVE_AVX2 + // Receives an intrinsic (__m256i) hash_vector comprised of num_probes (1-8) + // 32-bits bit positions (0-511) to test within a 512 bits bloom block + // + // Returns a pair: + // first: Whether testing is complete + // second: If testing is complete, the answer, otherwise N/A + // + // IMPORTANT: THIS CODE ASSUMES A BLOCK (CACHE-LINE) SIZE OF 64 BYTES !!!! 
+ // + static inline std::pair CheckBitsPositionsInBloomBlock( + int num_probes, __m256i &hash_vector, const char *const block_address_) { + // Now the top 9 bits of each of the eight 32-bit values in + // hash_vector are bit addresses for probes within the cache line. + // While the platform-independent code uses byte addressing (6 bits + // to pick a byte + 3 bits to pick a bit within a byte), here we work + // with 32-bit words (4 bits to pick a word + 5 bits to pick a bit + // within a word) because that works well with AVX2 and is equivalent + // under little-endian. + + // Shift each right by 28 bits to get 4-bit word addresses. + const __m256i word_addresses = _mm256_srli_epi32(hash_vector, 28); + + // Gather 32-bit values spread over 512 bits by 4-bit address. In + // essence, we are dereferencing eight pointers within the cache + // line. + // + // Option 1: AVX2 gather (seems to be a little slow - understandable) + // const __m256i value_vector = + // _mm256_i32gather_epi32(static_cast(data_at_cache_line), + // word_addresses, + // /*bytes / i32*/ 4); + // END Option 1 + // Potentially unaligned as we're not *always* cache-aligned -> loadu + const __m256i *mm_data = reinterpret_cast(block_address_); + // lower = block[0:255], higher = block[256:511] + __m256i lower = _mm256_loadu_si256(mm_data); + __m256i upper = _mm256_loadu_si256(mm_data + 1); + + // Option 2: AVX512VL permute hack + // Only negligibly faster than Option 3, so not yet worth supporting + // const __m256i value_vector = + // _mm256_permutex2var_epi32(lower, word_addresses, upper); + // END Option 2 + // Option 3: AVX2 permute+blend hack + // Use lowest three bits to order probing values, as if all from same + // 256 bit piece. + + // UDI: The last 3 bits of each integer of b are used as addresses into + // the 8 integers of a. 
+ lower = _mm256_permutevar8x32_epi32(lower, word_addresses); + upper = _mm256_permutevar8x32_epi32(upper, word_addresses); + // Just top 1 bit of address, to select between lower and upper. + // UDI: Shifts packed 32-bit integers in a right by IMM8 while shifting in + // sign bits. + const __m256i upper_lower_selector = _mm256_srai_epi32(hash_vector, 31); + // Finally: the next 8 probed 32-bit values, in probing sequence order. + const __m256i value_vector = + _mm256_blendv_epi8(lower, upper, upper_lower_selector); + // END Option 3 + + // We might not need to probe all 8, so build a mask for selecting only + // what we need. (The k_selector(s) could be pre-computed but that + // doesn't seem to make a noticeable performance difference.) + const __m256i zero_to_seven = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + // Subtract num_probes from each of those constants + __m256i k_selector = + _mm256_sub_epi32(zero_to_seven, _mm256_set1_epi32(num_probes)); + // Negative after subtract -> use/select + // Keep only high bit (logical shift right each by 31). + k_selector = _mm256_srli_epi32(k_selector, 31); + + // Strip off the 4 bit word address (shift LEFT) + // Strips the 4 MSB bits + __m256i bit_addresses = _mm256_slli_epi32(hash_vector, 4); + + // And keep only 5-bit (32 - 27) bit-within-32-bit-word addresses. + // Shifts RIGHT 27 => 5 lower bit pos bits remain + bit_addresses = _mm256_srli_epi32(bit_addresses, 27); + // Build a bit mask + // Performs a logical shift of 32 (doublewords) in the individual data + // elements in k_selector to the left by the bit_addresses value + const __m256i bit_mask = _mm256_sllv_epi32(k_selector, bit_addresses); + + // Like ((~value_vector) & bit_mask) == 0) + bool match = _mm256_testc_si256(value_vector, bit_mask) != 0; + + // This check first so that it's easy for branch predictor to optimize + // num_probes <= 8 case, making it free of unpredictable branches. 
+ if (num_probes <= 8) { + return {true, match}; + } else if (!match) { + return {true, false}; + } + return {false, false}; + } +#endif // HAVE_AVX2 + static inline bool HashMayMatchPrepared(uint32_t h2, int num_probes, const char *data_at_cache_line) { uint32_t h = h2; @@ -242,9 +354,11 @@ class FastLocalBloomImpl { // in doubt, don't add unnecessary code. // Powers of 32-bit golden ratio, mod 2**32. - const __m256i multipliers = - _mm256_setr_epi32(0x00000001, 0x9e3779b9, 0xe35e67b1, 0x734297e9, - 0x35fbe861, 0xdeb7c719, 0x448b211, 0x3459b749); + const __m256i multipliers = _mm256_setr_epi32( + BloomMath::GoldenRatioPowers[0], BloomMath::GoldenRatioPowers[1], + BloomMath::GoldenRatioPowers[2], BloomMath::GoldenRatioPowers[3], + BloomMath::GoldenRatioPowers[4], BloomMath::GoldenRatioPowers[5], + BloomMath::GoldenRatioPowers[6], BloomMath::GoldenRatioPowers[7]); for (;;) { // Eight copies of hash @@ -254,77 +368,10 @@ class FastLocalBloomImpl { // associativity of multiplication. hash_vector = _mm256_mullo_epi32(hash_vector, multipliers); - // Now the top 9 bits of each of the eight 32-bit values in - // hash_vector are bit addresses for probes within the cache line. - // While the platform-independent code uses byte addressing (6 bits - // to pick a byte + 3 bits to pick a bit within a byte), here we work - // with 32-bit words (4 bits to pick a word + 5 bits to pick a bit - // within a word) because that works well with AVX2 and is equivalent - // under little-endian. - - // Shift each right by 28 bits to get 4-bit word addresses. - const __m256i word_addresses = _mm256_srli_epi32(hash_vector, 28); - - // Gather 32-bit values spread over 512 bits by 4-bit address. In - // essence, we are dereferencing eight pointers within the cache - // line. 
- // - // Option 1: AVX2 gather (seems to be a little slow - understandable) - // const __m256i value_vector = - // _mm256_i32gather_epi32(static_cast(data_at_cache_line), - // word_addresses, - // /*bytes / i32*/ 4); - // END Option 1 - // Potentially unaligned as we're not *always* cache-aligned -> loadu - const __m256i *mm_data = - reinterpret_cast(data_at_cache_line); - __m256i lower = _mm256_loadu_si256(mm_data); - __m256i upper = _mm256_loadu_si256(mm_data + 1); - // Option 2: AVX512VL permute hack - // Only negligibly faster than Option 3, so not yet worth supporting - // const __m256i value_vector = - // _mm256_permutex2var_epi32(lower, word_addresses, upper); - // END Option 2 - // Option 3: AVX2 permute+blend hack - // Use lowest three bits to order probing values, as if all from same - // 256 bit piece. - lower = _mm256_permutevar8x32_epi32(lower, word_addresses); - upper = _mm256_permutevar8x32_epi32(upper, word_addresses); - // Just top 1 bit of address, to select between lower and upper. - const __m256i upper_lower_selector = _mm256_srai_epi32(hash_vector, 31); - // Finally: the next 8 probed 32-bit values, in probing sequence order. - const __m256i value_vector = - _mm256_blendv_epi8(lower, upper, upper_lower_selector); - // END Option 3 - - // We might not need to probe all 8, so build a mask for selecting only - // what we need. (The k_selector(s) could be pre-computed but that - // doesn't seem to make a noticeable performance difference.) - const __m256i zero_to_seven = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - // Subtract rem_probes from each of those constants - __m256i k_selector = - _mm256_sub_epi32(zero_to_seven, _mm256_set1_epi32(rem_probes)); - // Negative after subtract -> use/select - // Keep only high bit (logical shift right each by 31). 
- k_selector = _mm256_srli_epi32(k_selector, 31); - - // Strip off the 4 bit word address (shift left) - __m256i bit_addresses = _mm256_slli_epi32(hash_vector, 4); - // And keep only 5-bit (32 - 27) bit-within-32-bit-word addresses. - bit_addresses = _mm256_srli_epi32(bit_addresses, 27); - // Build a bit mask - const __m256i bit_mask = _mm256_sllv_epi32(k_selector, bit_addresses); - - // Like ((~value_vector) & bit_mask) == 0) - bool match = _mm256_testc_si256(value_vector, bit_mask) != 0; - - // This check first so that it's easy for branch predictor to optimize - // num_probes <= 8 case, making it free of unpredictable branches. - if (rem_probes <= 8) { - return match; - } else if (!match) { - return false; + auto [is_done, answer] = CheckBitsPositionsInBloomBlock( + rem_probes, hash_vector, data_at_cache_line); + if (is_done) { + return answer; } // otherwise // Need another iteration. 0xab25f4c1 == golden ratio to the 8th power