From add46b112c62e94a0230141c51bf10e88d0bc588 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Tue, 24 Jan 2023 21:16:55 -0800 Subject: [PATCH 1/4] Change SubmitBackup to only reboot in Attrition Otherwise, the Attrition workload can RebootAndDelete tlogs in the remote DC such that the remote becomes unusable, blocking recovery from reaching the fully_recovered state. In fact, the FirstCycleTest can only reach the accepting_commits state. In part 2 of the restarting test, runTests() waits for quietDatabase() to reach the fully_recovered state, but was stuck in the accepting_commits state. --- tests/restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml b/tests/restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml index 289d5c3d1da..ddfd93f103d 100644 --- a/tests/restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml +++ b/tests/restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml @@ -12,6 +12,13 @@ runConsistencyCheck=false delayFor = 0 stopWhenDone = false + [[test.workload]] + testName = 'Attrition' + machinesToKill = 10 + machinesToLeave = 3 + reboot = true + testDuration = 30.0 + [[test]] testTitle = 'FirstCycleTest' clearAfterTest=false From 56e488b2ff44075f8a8af0a03a09c065dacf4d07 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Wed, 25 Jan 2023 14:57:05 -0800 Subject: [PATCH 2/4] Add a new toml option to disable failure injection workload This is needed for UpgradeAndBackupRestore-1 to make sure the DB is recoverable so that part 2 can start. 
--- .../include/fdbserver/TesterInterface.actor.h | 2 ++ .../fdbserver/workloads/workloads.actor.h | 1 + fdbserver/tester.actor.cpp | 24 +++++++++++++++++++ .../from_7.0.0/UpgradeAndBackupRestore-1.toml | 2 ++ 4 files changed, 29 insertions(+) diff --git a/fdbserver/include/fdbserver/TesterInterface.actor.h b/fdbserver/include/fdbserver/TesterInterface.actor.h index b48bd66415a..f7c43c4349f 100644 --- a/fdbserver/include/fdbserver/TesterInterface.actor.h +++ b/fdbserver/include/fdbserver/TesterInterface.actor.h @@ -85,6 +85,7 @@ struct WorkloadRequest { int clientId; // the "id" of the client receiving the request (0 indexed) int clientCount; // the total number of test clients participating in the workload ReplyPromise reply; + std::vector disabledFailureInjectionWorkloads; template void serialize(Ar& ar) { @@ -100,6 +101,7 @@ struct WorkloadRequest { reply, defaultTenant, runFailureWorkloads, + disabledFailureInjectionWorkloads, arena); } }; diff --git a/fdbserver/include/fdbserver/workloads/workloads.actor.h b/fdbserver/include/fdbserver/workloads/workloads.actor.h index e0f29e2c4b1..a496214ec42 100644 --- a/fdbserver/include/fdbserver/workloads/workloads.actor.h +++ b/fdbserver/include/fdbserver/workloads/workloads.actor.h @@ -335,6 +335,7 @@ class TestSpec { ISimulator::BackupAgentType simDrAgents; KnobKeyValuePairs overrideKnobs; + std::vector disabledFailureInjectionWorkloads; }; ACTOR Future runWorkload(Database cx, diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 871df633457..84d18e4422b 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -45,6 +45,7 @@ #include "fdbserver/Knobs.h" #include "fdbserver/WorkerInterface.actor.h" #include "fdbrpc/SimulatorProcessInfo.h" +#include "flow/Platform.h" #include "flow/actorcompiler.h" // This must be the last #include. 
FDB_DEFINE_BOOLEAN_PARAM(UntrustedMode); @@ -402,8 +403,18 @@ void CompoundWorkload::addFailureInjection(WorkloadRequest& work) { if (disabledWorkloads.count(workload->description()) > 0) { continue; } + if (std::count(work.disabledFailureInjectionWorkloads.begin(), + work.disabledFailureInjectionWorkloads.end(), + workload->description()) > 0) { + continue; + } while (shouldInjectFailure(random, work, workload)) { workload->initFailureInjectionMode(random); + TraceEvent("AddFailureInjectionWorkload") + .detail("Name", workload->description()) + .detail("ClientID", work.clientId) + .detail("ClientCount", clientCount) + .detail("Title", work.title); failureInjection.push_back(workload); workload = factory->create(*this); } @@ -986,6 +997,7 @@ ACTOR Future runWorkload(Database cx, req.clientCount = testers.size(); req.sharedRandomNumber = sharedRandom; req.defaultTenant = defaultTenant.castTo(); + req.disabledFailureInjectionWorkloads = spec.disabledFailureInjectionWorkloads; workRequests.push_back(testers[i].recruitments.getReply(req)); } @@ -1414,6 +1426,18 @@ std::maprunFailureWorkloads = (value == "true"); } }, + { "disabledFailureInjectionWorkloads", + [](const std::string& value, TestSpec* spec) { + std::stringstream ss(value); + while (ss.good()) { + std::string substr; + getline(ss, substr, ','); + substr = removeWhitespace(substr); + if (!substr.empty()) { + spec->disabledFailureInjectionWorkloads.push_back(substr); + } + } + } }, }; std::vector readTests(std::ifstream& ifs) { diff --git a/tests/restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml b/tests/restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml index ddfd93f103d..699554c17a9 100644 --- a/tests/restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml +++ b/tests/restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml @@ -6,6 +6,7 @@ testTitle = 'SubmitBackup' simBackupAgents= 'BackupToFile' clearAfterTest = false runConsistencyCheck=false +disabledFailureInjectionWorkloads = 'Attrition' 
[[test.workload]] testName = 'SubmitBackup' @@ -23,6 +24,7 @@ runConsistencyCheck=false testTitle = 'FirstCycleTest' clearAfterTest=false runConsistencyCheck = false +disabledFailureInjectionWorkloads = 'Attrition' [[test.workload]] testName = 'Cycle' From 73524c960ce0f1582fb54a14c087c0d56cb2bbb7 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Wed, 25 Jan 2023 15:36:40 -0800 Subject: [PATCH 3/4] Change UpgradeAndBackupRestore to from_7.2.4 Because the new option "disabledFailureInjectionWorkloads" is not available until 7.2.4. --- tests/CMakeLists.txt | 4 ++-- .../{from_7.0.0 => from_7.2.4}/UpgradeAndBackupRestore-1.toml | 0 .../{from_7.0.0 => from_7.2.4}/UpgradeAndBackupRestore-2.toml | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename tests/restarting/{from_7.0.0 => from_7.2.4}/UpgradeAndBackupRestore-1.toml (100%) rename tests/restarting/{from_7.0.0 => from_7.2.4}/UpgradeAndBackupRestore-2.toml (100%) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2a8220230ee..c3f20b45920 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -276,8 +276,8 @@ if(WITH_PYTHON) TEST_FILES restarting/from_6.3.13_until_7.2.0/DrUpgradeRestart-1.txt restarting/from_6.3.13_until_7.2.0/DrUpgradeRestart-2.txt) add_fdb_test( - TEST_FILES restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml - restarting/from_7.0.0/UpgradeAndBackupRestore-2.toml) + TEST_FILES restarting/from_7.2.4/UpgradeAndBackupRestore-1.toml + restarting/from_7.2.4/UpgradeAndBackupRestore-2.toml) add_fdb_test( TEST_FILES restarting/to_7.1.0/CycleTestRestart-1.toml restarting/to_7.1.0/CycleTestRestart-2.toml) diff --git a/tests/restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml b/tests/restarting/from_7.2.4/UpgradeAndBackupRestore-1.toml similarity index 100% rename from tests/restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml rename to tests/restarting/from_7.2.4/UpgradeAndBackupRestore-1.toml diff --git a/tests/restarting/from_7.0.0/UpgradeAndBackupRestore-2.toml 
b/tests/restarting/from_7.2.4/UpgradeAndBackupRestore-2.toml similarity index 100% rename from tests/restarting/from_7.0.0/UpgradeAndBackupRestore-2.toml rename to tests/restarting/from_7.2.4/UpgradeAndBackupRestore-2.toml From e1bf6d5cb0913146431893e304394d76f38d1b8d Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Wed, 25 Jan 2023 17:37:16 -0800 Subject: [PATCH 4/4] Add from_7.0.0_until_7.2.0 for UpgradeAndBackupRestore tests --- tests/CMakeLists.txt | 7 ++- .../UpgradeAndBackupRestore-1.toml | 58 ++++++++++++++++++ .../UpgradeAndBackupRestore-2.toml | 61 +++++++++++++++++++ 3 files changed, 124 insertions(+), 2 deletions(-) create mode 100644 tests/restarting/from_7.0.0_until_7.2.0/UpgradeAndBackupRestore-1.toml create mode 100644 tests/restarting/from_7.0.0_until_7.2.0/UpgradeAndBackupRestore-2.toml diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c3f20b45920..741a255b161 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -276,8 +276,8 @@ if(WITH_PYTHON) TEST_FILES restarting/from_6.3.13_until_7.2.0/DrUpgradeRestart-1.txt restarting/from_6.3.13_until_7.2.0/DrUpgradeRestart-2.txt) add_fdb_test( - TEST_FILES restarting/from_7.2.4/UpgradeAndBackupRestore-1.toml - restarting/from_7.2.4/UpgradeAndBackupRestore-2.toml) + TEST_FILES restarting/from_7.0.0_until_7.2.0/UpgradeAndBackupRestore-1.toml + restarting/from_7.0.0_until_7.2.0/UpgradeAndBackupRestore-2.toml) add_fdb_test( TEST_FILES restarting/to_7.1.0/CycleTestRestart-1.toml restarting/to_7.1.0/CycleTestRestart-2.toml) @@ -308,6 +308,9 @@ if(WITH_PYTHON) add_fdb_test( TEST_FILES restarting/from_7.2.0/DrUpgradeRestart-1.txt restarting/from_7.2.0/DrUpgradeRestart-2.txt) + add_fdb_test( + TEST_FILES restarting/from_7.2.4/UpgradeAndBackupRestore-1.toml + restarting/from_7.2.4/UpgradeAndBackupRestore-2.toml) add_fdb_test(TEST_FILES slow/ApiCorrectness.toml) diff --git a/tests/restarting/from_7.0.0_until_7.2.0/UpgradeAndBackupRestore-1.toml 
b/tests/restarting/from_7.0.0_until_7.2.0/UpgradeAndBackupRestore-1.toml new file mode 100644 index 00000000000..289d5c3d1da --- /dev/null +++ b/tests/restarting/from_7.0.0_until_7.2.0/UpgradeAndBackupRestore-1.toml @@ -0,0 +1,58 @@ +[configuration] +storageEngineExcludeTypes=[3] + +[[test]] +testTitle = 'SubmitBackup' +simBackupAgents= 'BackupToFile' +clearAfterTest = false +runConsistencyCheck=false + + [[test.workload]] + testName = 'SubmitBackup' + delayFor = 0 + stopWhenDone = false + +[[test]] +testTitle = 'FirstCycleTest' +clearAfterTest=false +runConsistencyCheck = false + + [[test.workload]] + testName = 'Cycle' + nodeCount = 30000 + transactionsPerSecond = 2500.0 + testDuration = 30.0 + expectedRate = 0 + keyPrefix = 'BeforeRestart' + + [[test.workload]] + testName = 'RandomClogging' + testDuration = 90.0 + + [[test.workload]] + testName = 'Rollback' + meanDelay = 90.0 + testDuration = 90.0 + + [[test.workload]] + testName = 'Attrition' + machinesToKill = 10 + machinesToLeave = 3 + reboot = true + testDuration = 90.0 + + [[test.workload]] + testName='Attrition' + machinesToKill = 10 + machinesToLeave = 3 + reboot = true + testDuration = 90.0 + +[[test]] +testTitle = 'SaveDatabase' +clearAfterTest = false + + [[test.workload]] + testName = 'SaveAndKill' + restartInfoLocation = 'simfdb/restartInfo.ini' + testDuration=30.0 diff --git a/tests/restarting/from_7.0.0_until_7.2.0/UpgradeAndBackupRestore-2.toml b/tests/restarting/from_7.0.0_until_7.2.0/UpgradeAndBackupRestore-2.toml new file mode 100644 index 00000000000..7be9a98cc67 --- /dev/null +++ b/tests/restarting/from_7.0.0_until_7.2.0/UpgradeAndBackupRestore-2.toml @@ -0,0 +1,61 @@ +[[test]] +testTitle = 'SecondCycleTest' +simBackupAgents = 'BackupToFile' +clearAfterTest=false +runConsistencyCheck=false + + [[test.workload]] + testName = 'Cycle' + nodeCount = 30000 + transactionsPerSecond = 2500.0 + testDuration = 30.0 + expectedRate = 0 + keyPrefix = 'AfterRestart' + + [[test.workload]] + testName = 
'RandomClogging' + testDuration = 90.0 + + [[test.workload]] + testName = 'Rollback' + meanDelay = 90.0 + testDuration = 90.0 + + [[test.workload]] + testName = 'Attrition' + machinesToKill = 10 + machinesToLeave = 3 + reboot = true + testDuration = 90.0 + + [[test.workload]] + testName = 'Attrition' + machinesToKill = 10 + machinesToLeave = 3 + reboot = true + testDuration = 90.0 + +[[test]] +testTitle= 'RestoreBackup' +simBackupAgents = 'BackupToFile' +clearAfterTest=false + + [[test.workload]] + testName = 'RestoreBackup' + tag = 'default' + +[[test]] +testTitle = 'CheckCycles' +checkOnly=true + + [[test.workload]] + testName = 'Cycle' + nodeCount=30000 + keyPrefix = 'AfterRestart' + expectedRate=0 + + [[test.workload]] + testName = 'Cycle' + nodeCount = 30000 + keyPrefix= 'BeforeRestart' + expectedRate = 0