Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test correctness with WAL disabled in non-txn blackbox crash tests #9338

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions db_stress_tool/db_stress_test_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ void StressTest::FinishInitDb(SharedState* shared) {
}
}

if (FLAGS_sync_fault_injection && IsStateTracked()) {
if ((FLAGS_sync_fault_injection || FLAGS_disable_wal) && IsStateTracked()) {
Status s = shared->SaveAtAndAfter(db_);
if (!s.ok()) {
fprintf(stderr, "Error enabling history tracing: %s\n",
Expand Down Expand Up @@ -1449,7 +1449,20 @@ Status StressTest::TestBackupRestore(
test_opts.file_sizes = thread->rand.OneIn(2) == 0;
TEST_EnableWriteFutureSchemaVersion2(backup_engine, test_opts);
}
s = backup_engine->CreateNewBackup(db_);
CreateBackupOptions create_opts;
if (FLAGS_disable_wal) {
// The verification can only work when latest value of `key` is backed up,
// which requires flushing in case of WAL disabled.
//
// Note this triggers a flush with a key lock held. Meanwhile, operations
// like flush/compaction may attempt to grab key locks like in
// `DbStressCompactionFilter`. The philosophy around preventing deadlock
// is the background operation key lock acquisition only tries but does
// not wait for the lock. So here in the foreground it is OK to hold the
// lock and wait on a background operation (flush).
create_opts.flush_before_backup = true;
}
s = backup_engine->CreateNewBackup(create_opts, db_);
if (!s.ok()) {
from = "BackupEngine::CreateNewBackup";
}
Expand Down Expand Up @@ -1902,6 +1915,9 @@ void StressTest::TestCompactFiles(ThreadState* thread,

Status StressTest::TestFlush(const std::vector<int>& rand_column_families) {
FlushOptions flush_opts;
if (FLAGS_atomic_flush) {
return db_->Flush(flush_opts, column_families_);
}
std::vector<ColumnFamilyHandle*> cfhs;
std::for_each(rand_column_families.begin(), rand_column_families.end(),
[this, &cfhs](int k) { cfhs.push_back(column_families_[k]); });
Expand Down Expand Up @@ -2827,7 +2843,7 @@ void StressTest::Reopen(ThreadState* thread) {
clock_->TimeToString(now / 1000000).c_str(), num_times_reopened_);
Open();

if (FLAGS_sync_fault_injection && IsStateTracked()) {
if ((FLAGS_sync_fault_injection || FLAGS_disable_wal) && IsStateTracked()) {
Status s = thread->shared->SaveAtAndAfter(db_);
if (!s.ok()) {
fprintf(stderr, "Error enabling history tracing: %s\n",
Expand Down
15 changes: 14 additions & 1 deletion tools/db_crashtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@
"progress_reports": 0,
"readpercent": 45,
"recycle_log_file_num": lambda: random.randint(0, 1),
"reopen": 20,
"snapshot_hold_ops": 100000,
"sst_file_manager_bytes_per_sec": lambda: random.choice([0, 104857600]),
"sst_file_manager_bytes_per_truncate": lambda: random.choice([0, 1048576]),
Expand Down Expand Up @@ -207,20 +206,28 @@ def is_direct_io_supported(dbname):


blackbox_default_params = {
"disable_wal": lambda: random.choice([0, 0, 0, 1]),
# total time for this script to test db_stress
"duration": 6000,
# time for one db_stress instance to run
"interval": 120,
# since we will be killing anyway, use large value for ops_per_thread
"ops_per_thread": 100000000,
"reopen": 0,
"set_options_one_in": 10000,
}

whitebox_default_params = {
# TODO: enable this once we figure out how to adjust kill odds for WAL-
# disabled runs, and either (1) separate full `db_stress` runs out of
# whitebox crash or (2) support verification at end of `db_stress` runs
# that ran with WAL disabled.
"disable_wal": 0,
"duration": 10000,
"log2_keys_per_lock": 10,
"ops_per_thread": 200000,
"random_kill_odd": 888887,
"reopen": 20,
}

simple_default_params = {
Expand Down Expand Up @@ -269,6 +276,8 @@ def is_direct_io_supported(dbname):
# Avoid lambda to set it once for the entire test
"txn_write_policy": random.randint(0, 2),
"unordered_write": random.randint(0, 1),
# TODO: there is such a thing as transactions with WAL disabled. We should
# cover that case.
"disable_wal": 0,
# OpenReadOnly after checkpoint is not currnetly compatible with WritePrepared txns
"checkpoint_one_in": 0,
Expand Down Expand Up @@ -352,6 +361,10 @@ def finalize_and_sanitize(src_params):
dest_params["allow_concurrent_memtable_write"] = 1
if dest_params.get("disable_wal", 0) == 1:
dest_params["atomic_flush"] = 1
# The `DbStressCompactionFilter` can apply memtable updates to SST
# files, which would be problematic without WAL since such updates are
# expected to be lost in crash recoveries.
dest_params["enable_compaction_filter"] = 0
dest_params["sync"] = 0
dest_params["write_fault_one_in"] = 0
if dest_params.get("open_files", 1) != -1:
Expand Down