From 6c2bd68a65966548e8ae3daff61d80a0f49ca8cb Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Sun, 26 Dec 2021 15:17:50 -0800 Subject: [PATCH] toggle disable_wal in crash test --- db_stress_tool/db_stress_test_base.cc | 22 +++++++++++++++++++--- tools/db_crashtest.py | 15 ++++++++++++++- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 8930bf0aead..00304f240cf 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -312,7 +312,7 @@ void StressTest::FinishInitDb(SharedState* shared) { } } - if (FLAGS_sync_fault_injection && IsStateTracked()) { + if ((FLAGS_sync_fault_injection || FLAGS_disable_wal) && IsStateTracked()) { Status s = shared->SaveAtAndAfter(db_); if (!s.ok()) { fprintf(stderr, "Error enabling history tracing: %s\n", @@ -1449,7 +1449,20 @@ Status StressTest::TestBackupRestore( test_opts.file_sizes = thread->rand.OneIn(2) == 0; TEST_EnableWriteFutureSchemaVersion2(backup_engine, test_opts); } - s = backup_engine->CreateNewBackup(db_); + CreateBackupOptions create_opts; + if (FLAGS_disable_wal) { + // The verification can only work when latest value of `key` is backed up, + // which requires flushing in case of WAL disabled. + // + // Note this triggers a flush with a key lock held. Meanwhile, operations + // like flush/compaction may attempt to grab key locks like in + // `DbStressCompactionFilter`. The philosophy around preventing deadlock + // is the background operation key lock acquisition only tries but does + // not wait for the lock. So here in the foreground it is OK to hold the + // lock and wait on a background operation (flush). + create_opts.flush_before_backup = true; + } + s = backup_engine->CreateNewBackup(create_opts, db_); if (!s.ok()) { from = "BackupEngine::CreateNewBackup"; } @@ -1902,6 +1915,9 @@ void StressTest::TestCompactFiles(ThreadState* thread, Status StressTest::TestFlush(const std::vector& rand_column_families) { FlushOptions flush_opts; + if (FLAGS_atomic_flush) { + return db_->Flush(flush_opts, column_families_); + } std::vector cfhs; std::for_each(rand_column_families.begin(), rand_column_families.end(), [this, &cfhs](int k) { cfhs.push_back(column_families_[k]); }); @@ -2827,7 +2843,7 @@ void StressTest::Reopen(ThreadState* thread) { clock_->TimeToString(now / 1000000).c_str(), num_times_reopened_); Open(); - if (FLAGS_sync_fault_injection && IsStateTracked()) { + if ((FLAGS_sync_fault_injection || FLAGS_disable_wal) && IsStateTracked()) { Status s = thread->shared->SaveAtAndAfter(db_); if (!s.ok()) { fprintf(stderr, "Error enabling history tracing: %s\n", diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index cb095c9d017..617745979d5 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -91,7 +91,6 @@ "progress_reports": 0, "readpercent": 45, "recycle_log_file_num": lambda: random.randint(0, 1), - "reopen": 20, "snapshot_hold_ops": 100000, "sst_file_manager_bytes_per_sec": lambda: random.choice([0, 104857600]), "sst_file_manager_bytes_per_truncate": lambda: random.choice([0, 1048576]), @@ -207,20 +206,28 @@ def is_direct_io_supported(dbname): blackbox_default_params = { + "disable_wal": lambda: random.choice([0, 0, 0, 1]), # total time for this script to test db_stress "duration": 6000, # time for one db_stress instance to run "interval": 120, # since we will be killing anyway, use large value for ops_per_thread "ops_per_thread": 100000000, + "reopen": 0, "set_options_one_in": 10000, } whitebox_default_params = { + # TODO: enable this once we figure out how to adjust kill odds for WAL- + # disabled runs, and either (1) separate full `db_stress` runs out of + # whitebox crash or (2) support verification at end of `db_stress` runs + # that ran with WAL disabled. + "disable_wal": 0, "duration": 10000, "log2_keys_per_lock": 10, "ops_per_thread": 200000, "random_kill_odd": 888887, + "reopen": 20, } simple_default_params = { @@ -269,6 +276,8 @@ def is_direct_io_supported(dbname): # Avoid lambda to set it once for the entire test "txn_write_policy": random.randint(0, 2), "unordered_write": random.randint(0, 1), + # TODO: there is such a thing as transactions with WAL disabled. We should + # cover that case. "disable_wal": 0, # OpenReadOnly after checkpoint is not currnetly compatible with WritePrepared txns "checkpoint_one_in": 0, @@ -352,6 +361,10 @@ def finalize_and_sanitize(src_params): dest_params["allow_concurrent_memtable_write"] = 1 if dest_params.get("disable_wal", 0) == 1: dest_params["atomic_flush"] = 1 + # The `DbStressCompactionFilter` can apply memtable updates to SST + # files, which would be problematic without WAL since such updates are + # expected to be lost in crash recoveries. + dest_params["enable_compaction_filter"] = 0 dest_params["sync"] = 0 dest_params["write_fault_one_in"] = 0 if dest_params.get("open_files", 1) != -1: