From f4e304f987590ef89dcd31f4f7c36b71d496a1ef Mon Sep 17 00:00:00 2001 From: darionyaphet Date: Mon, 3 Jul 2023 09:41:48 -0700 Subject: [PATCH 1/6] Simplify conditional judgment (#11580) Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/11580 Reviewed By: ajkr Differential Revision: D47158687 Pulled By: cbi42 fbshipit-source-id: 4841b77eee78ddcf35da6ea33da71861c5f1e773 --- db/file_indexer.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/db/file_indexer.cc b/db/file_indexer.cc index 608f1cb28dac..ee6cfdc03f08 100644 --- a/db/file_indexer.cc +++ b/db/file_indexer.cc @@ -56,17 +56,15 @@ void FileIndexer::GetNextLevelIndex(const size_t level, const size_t file_index, } else if (cmp_smallest == 0) { *left_bound = index.smallest_lb; *right_bound = index.smallest_rb; - } else if (cmp_smallest > 0 && cmp_largest < 0) { + } else if (cmp_largest < 0) { *left_bound = index.smallest_lb; *right_bound = index.largest_rb; } else if (cmp_largest == 0) { *left_bound = index.largest_lb; *right_bound = index.largest_rb; - } else if (cmp_largest > 0) { + } else { *left_bound = index.largest_lb; *right_bound = level_rb_[level + 1]; - } else { - assert(false); } assert(*left_bound >= 0); From 25b08eb4386768b05a0748bfdb505ab58921281a Mon Sep 17 00:00:00 2001 From: leipeng Date: Mon, 3 Jul 2023 15:05:38 -0700 Subject: [PATCH 2/6] MemTable::Add: first_seqno_.compare_exchange_weak to earliest_seqno_ (#11398) Summary: This should be a benign bug caused by a long lived typo, this PR fix this issue. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11398 Reviewed By: ajkr Differential Revision: D47163379 Pulled By: cbi42 fbshipit-source-id: 531728cae496fd7ac1371bbbd64fc103c3a90dcf --- db/memtable.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/memtable.cc b/db/memtable.cc index 3801b67a9b53..dfef13a15b2c 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -810,7 +810,7 @@ Status MemTable::Add(SequenceNumber s, ValueType type, earliest_seqno_.load(std::memory_order_relaxed); while ( (cur_earliest_seqno == kMaxSequenceNumber || s < cur_earliest_seqno) && - !first_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) { + !earliest_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) { } } if (type == kTypeRangeDeletion) { From c53d604f4114baa6e06e90e204850c36d6f35765 Mon Sep 17 00:00:00 2001 From: Changyu Bi Date: Wed, 5 Jul 2023 14:12:06 -0700 Subject: [PATCH 3/6] `sst_dump --command=verify` should verify block checksums (#11576) Summary: `sst_dump --command=verify` did not set read_options.verify_checksum to true so it was not verifying checksum. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11576 Test Plan: ran the same command on an SST file with bad checksum: ``` sst_dump --command=verify --file=...sst_file_with_bad_block_checksum Before this PR: options.env is 0x6ba048 Process ...sst_file_with_bad_block_checksum Sst file format: block-based The file is ok After this PR: options.env is 0x7f43f6690000 Process ...sst_file_with_bad_block_checksum Sst file format: block-based ... is corrupted: Corruption: block checksum mismatch: stored = 2170109798, computed = 2170097510, type = 4 ... ``` Reviewed By: ajkr Differential Revision: D47136284 Pulled By: cbi42 fbshipit-source-id: 07d68db715c00347145e5b83d649aef2c3f2acd9 --- table/sst_file_dumper.cc | 1 + tools/sst_dump_tool.cc | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc index da91c1130bde..85a264d59a22 100644 --- a/table/sst_file_dumper.cc +++ b/table/sst_file_dumper.cc @@ -196,6 +196,7 @@ Status SstFileDumper::NewTableReader( } Status SstFileDumper::VerifyChecksum() { + assert(read_options_.verify_checksums); // We could pass specific readahead setting into read options if needed. return table_reader_->VerifyChecksum(read_options_, TableReaderCaller::kSSTDumpTool); diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 87577ab8e1c3..1b269043ab2c 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -419,6 +419,10 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) { filename = std::string(dir_or_file) + "/" + filename; } + if (command == "verify") { + verify_checksum = true; + } + ROCKSDB_NAMESPACE::SstFileDumper dumper( options, filename, Temperature::kUnknown, readahead_size, verify_checksum, output_hex, decode_blob_index); From df082c8d1ddf5a90b195941064b56e853f104ff0 Mon Sep 17 00:00:00 2001 From: Changyu Bi <102700264+cbi42@users.noreply.github.com> Date: Wed, 5 Jul 2023 14:40:45 -0700 Subject: [PATCH 4/6] Deprecate option `periodic_compaction_seconds` for FIFO compaction (#11550) Summary: both options `ttl` and `periodic_compaction_seconds` have the same meaning for FIFO compaction, which is redundant and can be confusing to use. For example, setting TTL to 0 does not disable TTL: user needs to also set periodic_compaction_seconds to 0. Another example is that dynamically setting `periodic_compaction_seconds` (surprisingly) has no effect on TTL compaction. This is because FIFO compaction picker internally only looks at value of `ttl`. The value of `ttl` is in `SanitizeOptions()` which take into account the value of `periodic_compaction_seconds`, but dynamically setting an option does not invoke this method. This PR clarifies the usage of both options for FIFO compaction: only `ttl` should be used, `periodic_compaction_seconds` will not have any effect on FIFO compaction. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11550 Test Plan: - updated existing unit test `DBOptionsTest.SanitizeFIFOPeriodicCompaction` - checked existing values of both options in feature matrix: https://fburl.com/daiquery/xxd0gs9w. All current uses cases either have `periodic_compaction_seconds = 0` or have `periodic_compaction_seconds > ttl`, so should not cause change of behavior. Reviewed By: ajkr Differential Revision: D46902959 Pulled By: cbi42 fbshipit-source-id: a9ede235b276783b4906aaec443551fa62ceff4c --- db/column_family.cc | 20 +++++------ db/db_options_test.cc | 15 +++++---- include/rocksdb/advanced_options.h | 33 ++++++++----------- .../fifo_ttl_periodic_compaction.md | 1 + 4 files changed, 30 insertions(+), 39 deletions(-) create mode 100644 unreleased_history/behavior_changes/fifo_ttl_periodic_compaction.md diff --git a/db/column_family.cc b/db/column_family.cc index 42e13a13aaad..23349ba586d9 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -382,8 +382,8 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60; if (result.ttl == kDefaultTtl) { - if (is_block_based_table && - result.compaction_style != kCompactionStyleFIFO) { + if (is_block_based_table) { + // For FIFO, max_open_files is checked in ValidateOptions(). result.ttl = kAdjustedTtl; } else { result.ttl = 0; @@ -403,16 +403,12 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs; } } else { - // result.compaction_style == kCompactionStyleFIFO - if (result.ttl == 0) { - if (is_block_based_table) { - if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) { - result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs; - } - result.ttl = result.periodic_compaction_seconds; - } - } else if (result.periodic_compaction_seconds != 0) { - result.ttl = std::min(result.ttl, result.periodic_compaction_seconds); + if (result.periodic_compaction_seconds != kDefaultPeriodicCompSecs && + result.periodic_compaction_seconds > 0) { + ROCKS_LOG_WARN( + db_options.info_log.get(), + "periodic_compaction_seconds does not support FIFO compaction. You" + "may want to set option TTL instead."); } } diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 533103f3cba4..d64d0eae5790 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -880,9 +880,13 @@ TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) { Options options; options.compaction_style = kCompactionStyleFIFO; options.env = CurrentOptions().env; + // Default value allows RocksDB to set ttl to 30 days. + ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); + + // Disable options.ttl = 0; Reopen(options); - ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); + ASSERT_EQ(0, dbfull()->GetOptions().ttl); options.ttl = 100; Reopen(options); @@ -892,15 +896,12 @@ TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) { Reopen(options); ASSERT_EQ(100 * 24 * 60 * 60, dbfull()->GetOptions().ttl); - options.ttl = 200; - options.periodic_compaction_seconds = 300; - Reopen(options); - ASSERT_EQ(200, dbfull()->GetOptions().ttl); - + // periodic_compaction_seconds should have no effect + // on FIFO compaction. options.ttl = 500; options.periodic_compaction_seconds = 300; Reopen(options); - ASSERT_EQ(300, dbfull()->GetOptions().ttl); + ASSERT_EQ(500, dbfull()->GetOptions().ttl); } TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 3af820b661a7..df7bb4e32fa5 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -858,24 +858,23 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API bool report_bg_io_stats = false; - // Files containing updates older than TTL will go through the compaction - // process. This usually happens in a cascading way so that those entries - // will be compacted to bottommost level/file. - // The feature is used to remove stale entries that have been deleted or - // updated from the file system. - // Pre-req: This needs max_open_files to be set to -1. - // In Level: Non-bottom-level files older than TTL will go through the - // compaction process. - // In FIFO: Files older than TTL will be deleted. + // This option has different meanings for different compaction styles: + // + // Leveled: Non-bottom-level files with all keys older than TTL will go + // through the compaction process. This usually happens in a cascading + // way so that those entries will be compacted to bottommost level/file. + // The feature is used to remove stale entries that have been deleted or + // updated from the file system. + // + // FIFO: Files with all keys older than TTL will be deleted. TTL is only + // supported if option max_open_files is set to -1. + // // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60 - // In FIFO, this option will have the same meaning as - // periodic_compaction_seconds. Whichever stricter will be used. // 0 means disabling. // UINT64_MAX - 1 (0xfffffffffffffffe) is special flag to allow RocksDB to // pick default. // - // Default: 30 days for leveled compaction + block based table. disable - // otherwise. + // Default: 30 days if using block based table. 0 (disable) otherwise. // // Dynamically changeable through SetOptions() API uint64_t ttl = 0xfffffffffffffffe; @@ -891,12 +890,9 @@ struct AdvancedColumnFamilyOptions { // age is based on the file's last modified time (given by the underlying // Env). // - // Supported in all compaction styles. + // Supported in leveled and universal compaction. // In Universal compaction, rocksdb will try to do a full compaction when // possible, see more in UniversalCompactionBuilder::PickPeriodicCompaction(). - // In FIFO compaction, this option has the same meaning as TTL and whichever - // stricter will be used. - // Pre-req: max_open_file == -1. // unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60 // // Values: @@ -905,9 +901,6 @@ struct AdvancedColumnFamilyOptions { // as needed. For now, RocksDB will change this value to 30 days // (i.e 30 * 24 * 60 * 60) so that every file goes through the compaction // process at least once every 30 days if not compacted sooner. - // In FIFO compaction, since the option has the same meaning as ttl, - // when this value is left default, and ttl is left to 0, 30 days will be - // used. Otherwise, min(ttl, periodic_compaction_seconds) will be used. // // Default: UINT64_MAX - 1 (allow RocksDB to auto-tune) // diff --git a/unreleased_history/behavior_changes/fifo_ttl_periodic_compaction.md b/unreleased_history/behavior_changes/fifo_ttl_periodic_compaction.md new file mode 100644 index 000000000000..6297ccc91380 --- /dev/null +++ b/unreleased_history/behavior_changes/fifo_ttl_periodic_compaction.md @@ -0,0 +1 @@ +Option `periodic_compaction_seconds` no longer supports FIFO compaction: setting it has no effect on FIFO compactions. FIFO compaction users should only set option `ttl` instead. \ No newline at end of file From 1f410ff95f623216c6d1c72f8d0788ed333e829c Mon Sep 17 00:00:00 2001 From: Changyu Bi Date: Fri, 7 Jul 2023 13:16:20 -0700 Subject: [PATCH 5/6] Make `rocksdb_options_add_compact_on_deletion_collector_factory` backward compatible (#11593) Summary: https://github.com/facebook/rocksdb/issues/11542 added a parameter to the C API `rocksdb_options_add_compact_on_deletion_collector_factory` which causes some internal builds to fail. External users using this API would also require code change. Making the API backward compatible by restoring the old C API and add the parameter to a new C API `rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio`. Also updated change log for 8.4 and will backport this change to 8.4 branch once landed. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11593 Test Plan: `make c_test && ./c_test` Reviewed By: akankshamahajan15 Differential Revision: D47299555 Pulled By: cbi42 fbshipit-source-id: 517dc093ef4cf02cac2fe4af4f1af13754bbda63 --- HISTORY.md | 2 +- db/c.cc | 8 ++++++++ db/c_test.c | 4 +++- include/rocksdb/c.h | 3 +++ 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 91f28f3e2e53..f4dce5cd8751 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -14,7 +14,7 @@ * Add `WriteBatch::Release()` that releases the batch's serialized data to the caller. ### Public API Changes -* Add parameter `deletion_ratio` to C API `rocksdb_options_add_compact_on_deletion_collector_factory`. +* Add C API `rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio`. * change the FileSystem::use_async_io() API to SupportedOps API in order to extend it to various operations supported by underlying FileSystem. Right now it contains FSSupportedOps::kAsyncIO and FSSupportedOps::kFSBuffer. More details about FSSupportedOps in filesystem.h * Add new tickers: `rocksdb.error.handler.bg.error.count`, `rocksdb.error.handler.bg.io.error.count`, `rocksdb.error.handler.bg.retryable.io.error.count` to replace the misspelled ones: `rocksdb.error.handler.bg.errro.count`, `rocksdb.error.handler.bg.io.errro.count`, `rocksdb.error.handler.bg.retryable.io.errro.count` ('error' instead of 'errro'). Users should switch to use the new tickers before 9.0 release as the misspelled old tickers will be completely removed then. * Overload the API CreateColumnFamilyWithImport() to support creating ColumnFamily by importing multiple ColumnFamilies It requires that CFs should not overlap in user key range. diff --git a/db/c.cc b/db/c.cc index 88527f99c8fe..6da280b46a0b 100644 --- a/db/c.cc +++ b/db/c.cc @@ -3916,6 +3916,14 @@ void rocksdb_options_set_row_cache(rocksdb_options_t* opt, } void rocksdb_options_add_compact_on_deletion_collector_factory( + rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger) { + std::shared_ptr + compact_on_del = + NewCompactOnDeletionCollectorFactory(window_size, num_dels_trigger); + opt->rep.table_properties_collector_factories.emplace_back(compact_on_del); +} + +void rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio( rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger, double deletion_ratio) { std::shared_ptr diff --git a/db/c_test.c b/db/c_test.c index 454aa162281b..a694643bcde3 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -720,7 +720,9 @@ int main(int argc, char** argv) { rocksdb_compactoptions_set_exclusive_manual_compaction(coptions, 1); rocksdb_options_add_compact_on_deletion_collector_factory(options, 10000, - 10001, 0.0); + 10001); + rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio( + options, 10000, 10001, 0.0); StartPhase("destroy"); rocksdb_destroy_db(options, dbname, &err); diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 808617db6f5f..81d05f778582 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -1626,6 +1626,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_row_cache( extern ROCKSDB_LIBRARY_API void rocksdb_options_add_compact_on_deletion_collector_factory( + rocksdb_options_t*, size_t window_size, size_t num_dels_trigger); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio( rocksdb_options_t*, size_t window_size, size_t num_dels_trigger, double deletion_ratio); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manual_wal_flush( From baf37a0e818dc334a0ed94f3d315155e2c138c93 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Fri, 7 Jul 2023 16:47:49 -0700 Subject: [PATCH 6/6] Fix a unit test hole for recovering UDTs with WAL files (#11577) Summary: Thanks pdillinger for pointing out this test hole. The test `DBWALTestWithTimestamp.Recover` that is intended to test recovery from WAL including user-defined timestamps doesn't achieve its promised coverage. Specifically, after https://github.com/facebook/rocksdb/issues/11557, timestamps will be removed during flush, and RocksDB by default flush memtables during recovery with `avoid_flush_during_recovery` defaults to false. This test didn't fail even if all the timestamps are quickly lost due to the default flush behavior. This PR renamed test `Recover` to `RecoverAndNoFlush`, and updated it to verify timestamps are successfully recovered from WAL with some time-travel reads. `avoid_flush_during_recovery` is set to true to help do this verification. On the other hand, for test `DBWALTestWithTimestamp.RecoverAndFlush`, since flush on reopen is DB's default behavior. Setting the flags `max_write_buffer` and `arena_block_size` are not really the factors that enforces the flush, so these flags are removed. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11577 Test Plan: ./db_wal_test Reviewed By: pdillinger Differential Revision: D47142892 Pulled By: jowlyzhang fbshipit-source-id: 9465e278806faa5885b541b4e32d99e698edef7d --- db/db_wal_test.cc | 132 +++++++++++++++++++++++++++++++++------------- 1 file changed, 95 insertions(+), 37 deletions(-) diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 2b7edc6eb0d0..232d329727b9 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -318,16 +318,25 @@ class DBWALTestWithTimestamp DBWALTestWithTimestamp() : DBBasicTestWithTimestampBase("db_wal_test_with_timestamp") {} + void SetUp() override { + persist_udt_ = test::ShouldPersistUDT(GetParam()); + DBBasicTestWithTimestampBase::SetUp(); + } + Status CreateAndReopenWithCFWithTs(const std::vector& cfs, - const Options& options) { + const Options& options, + bool avoid_flush_during_recovery = false) { CreateColumnFamilies(cfs, options); - return ReopenColumnFamiliesWithTs(cfs, options); + return ReopenColumnFamiliesWithTs(cfs, options, + avoid_flush_during_recovery); } Status ReopenColumnFamiliesWithTs(const std::vector& cfs, - Options ts_options) { + Options ts_options, + bool avoid_flush_during_recovery = false) { Options default_options = CurrentOptions(); default_options.create_if_missing = false; + default_options.avoid_flush_during_recovery = avoid_flush_during_recovery; ts_options.create_if_missing = false; std::vector cf_options(cfs.size(), ts_options); @@ -345,46 +354,88 @@ class DBWALTestWithTimestamp } void CheckGet(const ReadOptions& read_opts, uint32_t cf, const Slice& key, - const std::string& expected_value) { + const std::string& expected_value, + const std::string& expected_ts) { std::string actual_value; - ASSERT_OK(db_->Get(read_opts, handles_[cf], key, &actual_value)); + std::string actual_ts; + ASSERT_OK( + db_->Get(read_opts, handles_[cf], key, &actual_value, &actual_ts)); ASSERT_EQ(expected_value, actual_value); + ASSERT_EQ(expected_ts, actual_ts); } + + protected: + bool persist_udt_; }; -TEST_F(DBWALTestWithTimestamp, Recover) { +TEST_P(DBWALTestWithTimestamp, RecoverAndNoFlush) { // Set up the option that enables user defined timestmp size. - std::string ts = Timestamp(1, 0); - const size_t kTimestampSize = ts.size(); + std::string ts1 = Timestamp(1, 0); + const size_t kTimestampSize = ts1.size(); TestComparator test_cmp(kTimestampSize); Options ts_options; ts_options.create_if_missing = true; ts_options.comparator = &test_cmp; + // Test that user-defined timestamps are recovered from WAL regardless of + // the value of this flag because UDTs are saved in WAL nonetheless. + // We however need to explicitly disable flush during recovery by setting + // `avoid_flush_during_recovery=true` so that we can avoid timestamps getting + // stripped when the `persist_user_defined_timestamps` flag is false, so that + // all written timestamps are available for testing user-defined time travel + // read. + ts_options.persist_user_defined_timestamps = persist_udt_; + bool avoid_flush_during_recovery = true; ReadOptions read_opts; - Slice ts_slice = ts; - read_opts.timestamp = &ts_slice; do { - ASSERT_OK(CreateAndReopenWithCFWithTs({"pikachu"}, ts_options)); - ASSERT_OK(Put(1, "foo", ts, "v1")); - ASSERT_OK(Put(1, "baz", ts, "v5")); - - ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options)); - CheckGet(read_opts, 1, "foo", "v1"); - CheckGet(read_opts, 1, "baz", "v5"); - ASSERT_OK(Put(1, "bar", ts, "v2")); - ASSERT_OK(Put(1, "foo", ts, "v3")); - - ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options)); - CheckGet(read_opts, 1, "foo", "v3"); - ASSERT_OK(Put(1, "foo", ts, "v4")); - CheckGet(read_opts, 1, "foo", "v4"); - CheckGet(read_opts, 1, "bar", "v2"); - CheckGet(read_opts, 1, "baz", "v5"); + Slice ts_slice = ts1; + read_opts.timestamp = &ts_slice; + ASSERT_OK(CreateAndReopenWithCFWithTs({"pikachu"}, ts_options, + avoid_flush_during_recovery)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U); + ASSERT_OK(Put(1, "foo", ts1, "v1")); + ASSERT_OK(Put(1, "baz", ts1, "v5")); + + ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options, + avoid_flush_during_recovery)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U); + // Do a timestamped read with ts1 after second reopen. + CheckGet(read_opts, 1, "foo", "v1", ts1); + CheckGet(read_opts, 1, "baz", "v5", ts1); + + // Write more value versions for key "foo" and "bar" before and after second + // reopen. + std::string ts2 = Timestamp(2, 0); + ASSERT_OK(Put(1, "bar", ts2, "v2")); + ASSERT_OK(Put(1, "foo", ts2, "v3")); + + ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options, + avoid_flush_during_recovery)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U); + std::string ts3 = Timestamp(3, 0); + ASSERT_OK(Put(1, "foo", ts3, "v4")); + + // Do a timestamped read with ts1 after third reopen. + CheckGet(read_opts, 1, "foo", "v1", ts1); + std::string value; + ASSERT_TRUE(db_->Get(read_opts, handles_[1], "bar", &value).IsNotFound()); + CheckGet(read_opts, 1, "baz", "v5", ts1); + + // Do a timestamped read with ts2 after third reopen. + ts_slice = ts2; + CheckGet(read_opts, 1, "foo", "v3", ts2); + CheckGet(read_opts, 1, "bar", "v2", ts2); + CheckGet(read_opts, 1, "baz", "v5", ts1); + + // Do a timestamped read with ts3 after third reopen. + ts_slice = ts3; + CheckGet(read_opts, 1, "foo", "v4", ts3); + CheckGet(read_opts, 1, "bar", "v2", ts2); + CheckGet(read_opts, 1, "baz", "v5", ts1); } while (ChangeWalOptions()); } -TEST_F(DBWALTestWithTimestamp, RecoverInconsistentTimestamp) { +TEST_P(DBWALTestWithTimestamp, RecoverInconsistentTimestamp) { // Set up the option that enables user defined timestmp size. std::string ts = Timestamp(1, 0); const size_t kTimestampSize = ts.size(); @@ -392,11 +443,19 @@ TEST_F(DBWALTestWithTimestamp, RecoverInconsistentTimestamp) { Options ts_options; ts_options.create_if_missing = true; ts_options.comparator = &test_cmp; + ts_options.persist_user_defined_timestamps = persist_udt_; ASSERT_OK(CreateAndReopenWithCFWithTs({"pikachu"}, ts_options)); ASSERT_OK(Put(1, "foo", ts, "v1")); ASSERT_OK(Put(1, "baz", ts, "v5")); + // In real use cases, switching to a different user comparator is prohibited + // by a sanity check during DB open that does a user comparator name + // comparison. This test mocked and bypassed that sanity check because the + // before and after user comparator are both named "TestComparator". This is + // to test the user-defined timestamp recovery logic for WAL files have + // the intended consistency check. + // `HandleWriteBatchTimestampSizeDifference` in udt_util.h has more details. TestComparator diff_test_cmp(kTimestampSize + 1); ts_options.comparator = &diff_test_cmp; ASSERT_TRUE( @@ -404,7 +463,7 @@ TEST_F(DBWALTestWithTimestamp, RecoverInconsistentTimestamp) { } TEST_P(DBWALTestWithTimestamp, RecoverAndFlush) { - // Set up the option that enables user defined timestmp size. + // Set up the option that enables user defined timestamp size. std::string min_ts = Timestamp(0, 0); std::string write_ts = Timestamp(1, 0); const size_t kTimestampSize = write_ts.size(); @@ -412,22 +471,21 @@ TEST_P(DBWALTestWithTimestamp, RecoverAndFlush) { Options ts_options; ts_options.create_if_missing = true; ts_options.comparator = &test_cmp; - bool persist_udt = test::ShouldPersistUDT(GetParam()); - ts_options.persist_user_defined_timestamps = persist_udt; + ts_options.persist_user_defined_timestamps = persist_udt_; std::string smallest_ukey_without_ts = "baz"; std::string largest_ukey_without_ts = "foo"; ASSERT_OK(CreateAndReopenWithCFWithTs({"pikachu"}, ts_options)); + // No flush, no sst files, because of no data. + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U); ASSERT_OK(Put(1, largest_ukey_without_ts, write_ts, "v1")); ASSERT_OK(Put(1, smallest_ukey_without_ts, write_ts, "v5")); - // Very small write buffer size to force flush memtables recovered from WAL. - ts_options.write_buffer_size = 16; - ts_options.arena_block_size = 16; ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(1)); + // Memtable recovered from WAL flushed because `avoid_flush_during_recovery` + // defaults to false, created one L0 file. + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 1U); std::vector> level_to_files; dbfull()->TEST_GetFilesMetaData(handles_[1], &level_to_files); @@ -435,7 +493,7 @@ TEST_P(DBWALTestWithTimestamp, RecoverAndFlush) { // L0 only has one SST file. ASSERT_EQ(level_to_files[0].size(), 1); auto meta = level_to_files[0][0]; - if (persist_udt) { + if (persist_udt_) { ASSERT_EQ(smallest_ukey_without_ts + write_ts, meta.smallest.user_key()); ASSERT_EQ(largest_ukey_without_ts + write_ts, meta.largest.user_key()); } else { @@ -446,7 +504,7 @@ TEST_P(DBWALTestWithTimestamp, RecoverAndFlush) { // Param 0: test mode for the user-defined timestamp feature INSTANTIATE_TEST_CASE_P( - RecoverAndFlush, DBWALTestWithTimestamp, + DBWALTestWithTimestamp, DBWALTestWithTimestamp, ::testing::Values( test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp, test::UserDefinedTimestampTestMode::kNormal));