Fix backup/restore in stress/crash test (facebook#7357)
Summary:
(1) Skip the check on a specific key when restoring an old backup
(a small minority of cases), because the check can fail in those cases.
(2) Remove an old assertion that the number of column families matches
the number of keys passed in, which is broken by the atomic flush
(cf_consistency) test. Like other code (for better or worse), assume a
single key and iterate over the column families. (3) Apply
mock_direct_io to NewSequentialFile so that db_stress backup works on
/dev/shm.

Also add more context to the output in case of a backup/restore
db_stress failure.

Also a minor fix to BackupEngine to report the first failure status
when creating a new backup, and to drop another clue about the
potential source of a "Backup failed" status.

Reverts "Disable backup/restore stress test (facebook#7350)"

Pull Request resolved: facebook#7357

Test Plan:
Using backup_one_in=10000,
"USE_CLANG=1 make crash_test_with_atomic_flush" for 30+ minutes
"USE_CLANG=1 make blackbox_crash_test" for 30+ minutes
Also with use_direct_reads and TEST_TMPDIR=/dev/shm/rocksdb

Reviewed By: riversand963

Differential Revision: D23567244

Pulled By: pdillinger

fbshipit-source-id: e77171c2e8394d173917e36898c02dead1c40b77
pdillinger authored and codingrhythm committed Mar 5, 2021
1 parent 7ced56e commit eb34054
Showing 5 changed files with 63 additions and 14 deletions.
62 changes: 50 additions & 12 deletions db_stress_tool/db_stress_test_base.cc
@@ -1217,10 +1217,6 @@ void StressTest::TestCompactFiles(ThreadState* /* thread */,
Status StressTest::TestBackupRestore(
ThreadState* thread, const std::vector<int>& rand_column_families,
const std::vector<int64_t>& rand_keys) {
// Note the column families chosen by `rand_column_families` cannot be
// dropped while the locks for `rand_keys` are held. So we should not have
// to worry about accessing those column families throughout this function.
assert(rand_column_families.size() == rand_keys.size());
std::string backup_dir = FLAGS_db + "/.backup" + ToString(thread->tid);
std::string restore_dir = FLAGS_db + "/.restore" + ToString(thread->tid);
BackupableDBOptions backup_opts(backup_dir);
@@ -1249,32 +1245,60 @@ Status StressTest::TestBackupRestore(
}
}
BackupEngine* backup_engine = nullptr;
std::string from = "a backup/restore operation";
Status s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine);
if (!s.ok()) {
from = "BackupEngine::Open";
}
if (s.ok()) {
s = backup_engine->CreateNewBackup(db_);
if (!s.ok()) {
from = "BackupEngine::CreateNewBackup";
}
}
if (s.ok()) {
delete backup_engine;
backup_engine = nullptr;
s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine);
if (!s.ok()) {
from = "BackupEngine::Open (again)";
}
}
std::vector<BackupInfo> backup_info;
if (s.ok()) {
backup_engine->GetBackupInfo(&backup_info);
if (backup_info.empty()) {
s = Status::NotFound("no backups found");
from = "BackupEngine::GetBackupInfo";
}
}
if (s.ok() && thread->rand.OneIn(2)) {
s = backup_engine->VerifyBackup(
backup_info.front().backup_id,
thread->rand.OneIn(2) /* verify_with_checksum */);
if (!s.ok()) {
from = "BackupEngine::VerifyBackup";
}
}
bool from_latest = false;
if (s.ok()) {
int count = static_cast<int>(backup_info.size());
s = backup_engine->RestoreDBFromBackup(
RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id,
restore_dir /* db_dir */, restore_dir /* wal_dir */);
if (count > 1) {
s = backup_engine->RestoreDBFromBackup(
RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id,
restore_dir /* db_dir */, restore_dir /* wal_dir */);
if (!s.ok()) {
from = "BackupEngine::RestoreDBFromBackup";
}
} else {
from_latest = true;
s = backup_engine->RestoreDBFromLatestBackup(RestoreOptions(),
restore_dir /* db_dir */,
restore_dir /* wal_dir */);
if (!s.ok()) {
from = "BackupEngine::RestoreDBFromLatestBackup";
}
}
}
if (s.ok()) {
uint32_t to_keep = 0;
@@ -1283,6 +1307,9 @@ Status StressTest::TestBackupRestore(
to_keep = thread->rand.Uniform(3);
}
s = backup_engine->PurgeOldBackups(to_keep);
if (!s.ok()) {
from = "BackupEngine::PurgeOldBackups";
}
}
DB* restored_db = nullptr;
std::vector<ColumnFamilyHandle*> restored_cf_handles;
@@ -1300,11 +1327,19 @@ Status StressTest::TestBackupRestore(
}
s = DB::Open(DBOptions(restore_options), restore_dir, cf_descriptors,
&restored_cf_handles, &restored_db);
if (!s.ok()) {
from = "DB::Open in backup/restore";
}
}
// for simplicity, currently only verifies existence/non-existence of a few
// keys
for (size_t i = 0; s.ok() && i < rand_column_families.size(); ++i) {
std::string key_str = Key(rand_keys[i]);
// Note the column families chosen by `rand_column_families` cannot be
// dropped while the locks for `rand_keys` are held. So we should not have
// to worry about accessing those column families throughout this function.
//
// For simplicity, currently only verifies existence/non-existence of a
// single key
for (size_t i = 0; from_latest && s.ok() && i < rand_column_families.size();
++i) {
std::string key_str = Key(rand_keys[0]);
Slice key = key_str;
std::string restored_value;
Status get_status = restored_db->Get(
@@ -1321,6 +1356,9 @@ Status StressTest::TestBackupRestore(
}
} else {
s = get_status;
if (!s.ok()) {
from = "DB::Get in backup/restore";
}
}
}
if (backup_engine != nullptr) {
@@ -1335,7 +1373,7 @@ Status StressTest::TestBackupRestore(
restored_db = nullptr;
}
if (!s.ok()) {
fprintf(stderr, "A backup/restore operation failed with: %s\n",
fprintf(stderr, "Failure in %s with: %s\n", from.c_str(),
s.ToString().c_str());
}
return s;
1 change: 1 addition & 0 deletions env/fs_posix.cc
@@ -159,6 +159,7 @@ class PosixFileSystem : public FileSystem {
#endif // !ROCKSDB_LITE
#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
flags |= O_DIRECT;
TEST_SYNC_POINT_CALLBACK("NewSequentialFile:O_DIRECT", &flags);
#endif
}

5 changes: 5 additions & 0 deletions test_util/sync_point.cc
@@ -83,6 +83,11 @@ void SetupSyncPointsToMockDirectIO() {
int* val = static_cast<int*>(arg);
*val &= ~O_DIRECT;
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"NewSequentialFile:O_DIRECT", [&](void* arg) {
int* val = static_cast<int*>(arg);
*val &= ~O_DIRECT;
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
#endif
}
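
For context (illustration only, not part of this diff): a minimal sketch of how a test is assumed to use SetupSyncPointsToMockDirectIO() so that direct I/O flags are stripped on a filesystem that rejects O_DIRECT, such as tmpfs at /dev/shm. The header location and function names are assumptions based on the diff above; sync points are only active in debug builds.

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "test_util/sync_point.h"  // assumed location of SetupSyncPointsToMockDirectIO()

// Open a DB with use_direct_reads on tmpfs. The sync-point callbacks
// registered by SetupSyncPointsToMockDirectIO() clear O_DIRECT before
// open(), so the direct I/O request is mocked instead of failing.
void OpenWithMockedDirectIO(const std::string& path) {
  rocksdb::SetupSyncPointsToMockDirectIO();

  rocksdb::Options options;
  options.create_if_missing = true;
  options.use_direct_reads = true;  // would otherwise fail on /dev/shm

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, path, &db);
  assert(s.ok());
  delete db;
}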
3 changes: 3 additions & 0 deletions tools/db_crashtest.py
@@ -29,6 +29,9 @@

default_params = {
"acquire_snapshot_one_in": 10000,
"backup_max_size": 100 * 1024 * 1024,
# Consider larger number when backups considered more stable
"backup_one_in": 100000,
"block_size": 16384,
"bloom_bits": lambda: random.choice([random.randint(0,19),
random.lognormvariate(2.3, 1.3)]),
6 changes: 4 additions & 2 deletions utilities/backupable/backupable_db.cc
@@ -1182,7 +1182,9 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata(
new_backup->SetSequenceNumber(sequence_number);
}
}
ROCKS_LOG_INFO(options_.info_log, "add files for backup done, wait finish.");
ROCKS_LOG_INFO(options_.info_log,
"add files for backup done (%s), wait finish.",
s.ok() ? "OK" : "not OK");
Status item_status;
for (auto& item : backup_items_to_finish) {
item.result.wait();
@@ -1198,7 +1200,7 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata(
result.custom_checksum_hex, result.checksum_func_name, result.db_id,
result.db_session_id));
}
if (!item_status.ok()) {
if (s.ok() && !item_status.ok()) {
s = item_status;
}
}
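
For illustration only (not RocksDB code): the s.ok() && !item_status.ok() guard above is the usual keep-the-first-failure pattern, sketched here with a hypothetical aggregation loop.

#include <vector>

#include "rocksdb/status.h"

using rocksdb::Status;

// Fold per-item results into one status, keeping only the first
// failure so a later item's error cannot overwrite the status that
// actually caused the backup to fail.
Status AggregateFirstFailure(const std::vector<Status>& item_results) {
  Status s;  // default-constructed Status is OK
  for (const Status& item_status : item_results) {
    if (s.ok() && !item_status.ok()) {
      s = item_status;
    }
  }
  return s;
}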
