Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix backup/restore in stress/crash test #7357

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 50 additions & 12 deletions db_stress_tool/db_stress_test_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1217,10 +1217,6 @@ void StressTest::TestCompactFiles(ThreadState* /* thread */,
Status StressTest::TestBackupRestore(
ThreadState* thread, const std::vector<int>& rand_column_families,
const std::vector<int64_t>& rand_keys) {
// Note the column families chosen by `rand_column_families` cannot be
// dropped while the locks for `rand_keys` are held. So we should not have
// to worry about accessing those column families throughout this function.
assert(rand_column_families.size() == rand_keys.size());
std::string backup_dir = FLAGS_db + "/.backup" + ToString(thread->tid);
std::string restore_dir = FLAGS_db + "/.restore" + ToString(thread->tid);
BackupableDBOptions backup_opts(backup_dir);
Expand Down Expand Up @@ -1249,32 +1245,60 @@ Status StressTest::TestBackupRestore(
}
}
BackupEngine* backup_engine = nullptr;
std::string from = "a backup/restore operation";
Status s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine);
if (!s.ok()) {
from = "BackupEngine::Open";
}
if (s.ok()) {
s = backup_engine->CreateNewBackup(db_);
if (!s.ok()) {
from = "BackupEngine::CreateNewBackup";
}
}
if (s.ok()) {
delete backup_engine;
backup_engine = nullptr;
s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine);
if (!s.ok()) {
from = "BackupEngine::Open (again)";
}
}
std::vector<BackupInfo> backup_info;
if (s.ok()) {
backup_engine->GetBackupInfo(&backup_info);
if (backup_info.empty()) {
s = Status::NotFound("no backups found");
from = "BackupEngine::GetBackupInfo";
}
}
if (s.ok() && thread->rand.OneIn(2)) {
s = backup_engine->VerifyBackup(
backup_info.front().backup_id,
thread->rand.OneIn(2) /* verify_with_checksum */);
if (!s.ok()) {
from = "BackupEngine::VerifyBackup";
}
}
bool from_latest = false;
if (s.ok()) {
int count = static_cast<int>(backup_info.size());
s = backup_engine->RestoreDBFromBackup(
RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id,
restore_dir /* db_dir */, restore_dir /* wal_dir */);
if (count > 1) {
s = backup_engine->RestoreDBFromBackup(
RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id,
restore_dir /* db_dir */, restore_dir /* wal_dir */);
if (!s.ok()) {
from = "BackupEngine::RestoreDBFromBackup";
}
} else {
from_latest = true;
s = backup_engine->RestoreDBFromLatestBackup(RestoreOptions(),
restore_dir /* db_dir */,
restore_dir /* wal_dir */);
if (!s.ok()) {
from = "BackupEngine::RestoreDBFromLatestBackup";
}
}
}
if (s.ok()) {
uint32_t to_keep = 0;
Expand All @@ -1283,6 +1307,9 @@ Status StressTest::TestBackupRestore(
to_keep = thread->rand.Uniform(3);
}
s = backup_engine->PurgeOldBackups(to_keep);
if (!s.ok()) {
from = "BackupEngine::PurgeOldBackups";
}
}
DB* restored_db = nullptr;
std::vector<ColumnFamilyHandle*> restored_cf_handles;
Expand All @@ -1300,11 +1327,19 @@ Status StressTest::TestBackupRestore(
}
s = DB::Open(DBOptions(restore_options), restore_dir, cf_descriptors,
&restored_cf_handles, &restored_db);
if (!s.ok()) {
from = "DB::Open in backup/restore";
}
}
// for simplicity, currently only verifies existence/non-existence of a few
// keys
for (size_t i = 0; s.ok() && i < rand_column_families.size(); ++i) {
std::string key_str = Key(rand_keys[i]);
// Note the column families chosen by `rand_column_families` cannot be
// dropped while the locks for `rand_keys` are held. So we should not have
// to worry about accessing those column families throughout this function.
//
// For simplicity, currently only verifies existence/non-existence of a
// single key
for (size_t i = 0; from_latest && s.ok() && i < rand_column_families.size();
++i) {
std::string key_str = Key(rand_keys[0]);
Slice key = key_str;
std::string restored_value;
Status get_status = restored_db->Get(
Expand All @@ -1321,6 +1356,9 @@ Status StressTest::TestBackupRestore(
}
} else {
s = get_status;
if (!s.ok()) {
from = "DB::Get in backup/restore";
}
}
}
if (backup_engine != nullptr) {
Expand All @@ -1335,7 +1373,7 @@ Status StressTest::TestBackupRestore(
restored_db = nullptr;
}
if (!s.ok()) {
fprintf(stderr, "A backup/restore operation failed with: %s\n",
fprintf(stderr, "Failure in %s with: %s\n", from.c_str(),
s.ToString().c_str());
}
return s;
Expand Down
1 change: 1 addition & 0 deletions env/fs_posix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ class PosixFileSystem : public FileSystem {
#endif // !ROCKSDB_LITE
#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
flags |= O_DIRECT;
TEST_SYNC_POINT_CALLBACK("NewSequentialFile:O_DIRECT", &flags);
#endif
}

Expand Down
5 changes: 5 additions & 0 deletions test_util/sync_point.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,11 @@ void SetupSyncPointsToMockDirectIO() {
int* val = static_cast<int*>(arg);
*val &= ~O_DIRECT;
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"NewSequentialFile:O_DIRECT", [&](void* arg) {
int* val = static_cast<int*>(arg);
*val &= ~O_DIRECT;
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
#endif
}
Expand Down
3 changes: 3 additions & 0 deletions tools/db_crashtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@

default_params = {
"acquire_snapshot_one_in": 10000,
"backup_max_size": 100 * 1024 * 1024,
# Consider larger number when backups considered more stable
"backup_one_in": 100000,
"block_size": 16384,
"bloom_bits": lambda: random.choice([random.randint(0,19),
random.lognormvariate(2.3, 1.3)]),
Expand Down
6 changes: 4 additions & 2 deletions utilities/backupable/backupable_db.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1182,7 +1182,9 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata(
new_backup->SetSequenceNumber(sequence_number);
}
}
ROCKS_LOG_INFO(options_.info_log, "add files for backup done, wait finish.");
ROCKS_LOG_INFO(options_.info_log,
"add files for backup done (%s), wait finish.",
s.ok() ? "OK" : "not OK");
Status item_status;
for (auto& item : backup_items_to_finish) {
item.result.wait();
Expand All @@ -1198,7 +1200,7 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata(
result.custom_checksum_hex, result.checksum_func_name, result.db_id,
result.db_session_id));
}
if (!item_status.ok()) {
if (s.ok() && !item_status.ok()) {
s = item_status;
}
}
Expand Down