Skip to content

Commit

Permalink
UnitTest to repro recovery_in_prog_ being false when Recover starts
Browse files Browse the repository at this point in the history
  • Loading branch information
jaykorean committed Oct 23, 2023
1 parent 76b9502 commit 87fe776
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 2 deletions.
4 changes: 2 additions & 2 deletions db/error_handler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -664,11 +664,11 @@ const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
// Wait for the previous recovery thread to finish, then create a new thread
// to recover from the bg error.
db_mutex_->Unlock();
TEST_SYNC_POINT("StartRecoverFromRetryableBGIOError:WaitingForOtherThread");
old_recovery_thread->join();
db_mutex_->Lock();
}

TEST_SYNC_POINT("StartRecoverFromRetryableBGIOError::in_progress");
recovery_thread_.reset(
new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this));

Expand All @@ -682,7 +682,7 @@ const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
// Automatically recover from a retryable BG IO error. Must be called after
// the db mutex is released.
void ErrorHandler::RecoverFromRetryableBGIOError() {
assert(recovery_in_prog_);
// assert(recovery_in_prog_);
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart2");
InstrumentedMutexLock l(db_mutex_);
Expand Down
63 changes: 63 additions & 0 deletions db/error_handler_fs_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1827,6 +1827,69 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover1) {
Destroy(options);
}

// Intended to reproduce a second recovery being started while the first
// recovery thread is still finishing: the first recovery is paused inside
// NotifyOnErrorRecoveryEnd (after it has released the DB mutex), a second
// failing flush is issued from another thread, and only once the second
// recovery is waiting on the old recovery thread is the first one released.
TEST_F(DBErrorHandlingFSTest, MultipleRecoveryThreads) {
// Set up a DB on the fault-injection env with background-error auto resume
// enabled (two resume attempts, retried every 0.1 second).
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
Options options = GetDefaultOptions();
options.env = fault_env_.get();
options.create_if_missing = true;
options.listeners.emplace_back(listener);
options.max_bgerror_resume_count = 2;
options.bgerror_resume_retry_interval = 100000; // 0.1 second
options.statistics = CreateDBStatistics();

// NOTE(review): presumably keeps the listener from driving recovery itself;
// confirm against ErrorHandlerFSListener.
listener->EnableAutoRecovery(false);
DestroyAndReopen(options);

// The injected error is marked retryable; the flushes below are expected to
// fail with soft-error severity.
IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
error_msg.SetRetryable(true);

// WAL is disabled for the writes; the Puts below are expected to succeed even
// while the filesystem is inactive, and only the Flush calls fail.
WriteOptions wo = WriteOptions();
wo.disableWAL = true;
fault_fs_->SetFilesystemActive(false, error_msg);
// Sync point ordering (in each pair the second point waits for the first):
//  - "MultipleRecoveryThreads:1" (this test) waits until the first recovery
//    reaches NotifyOnErrorRecoveryEnd with the mutex unlocked.
//  - "NotifyOnErrorRecoveryEnd:MutexUnlocked:2" (first recovery) waits until
//    this test reaches "MultipleRecoveryThreads:2", holding the first
//    recovery in place while the second error is injected.
//  - "MultipleRecoveryThreads:3" (this test) waits until the second recovery
//    attempt is about to join the previous recovery thread.
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
{{"NotifyOnErrorRecoveryEnd:MutexUnlocked:1",
"MultipleRecoveryThreads:1"},
{"MultipleRecoveryThreads:2",
"NotifyOnErrorRecoveryEnd:MutexUnlocked:2"},
{"StartRecoverFromRetryableBGIOError:WaitingForOtherThread",
"MultipleRecoveryThreads:3"}});
SyncPoint::GetInstance()->EnableProcessing();

// First write while the filesystem is inactive: the flush fails with a soft
// error and the first recovery will start
{
ASSERT_OK(Put(Key(1), "val1", wo));
Status s = Flush();
ASSERT_NOK(s);
ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
}
// Reactivate the filesystem so that first recovery can go through
fault_fs_->SetFilesystemActive(true);

// Block until the first recovery is at NotifyOnErrorRecoveryEnd and the mutex
// is released. Once this returns, the first recovery stays parked there until
// "MultipleRecoveryThreads:2" below.
TEST_SYNC_POINT("MultipleRecoveryThreads:1");

// Inject failure again to create second recovery
fault_fs_->SetFilesystemActive(false, error_msg);
ROCKSDB_NAMESPACE::port::Thread second_write([&] {
// Second write while the filesystem is inactive again; its flush is expected
// to fail with a soft error as well
ASSERT_OK(Put(Key(2), "val2", wo));
Status s = Flush();
ASSERT_NOK(s);
ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
});
// Wait until the second recovery attempt is waiting for the old recovery
// thread, then let the first recovery continue past NotifyOnErrorRecoveryEnd.
TEST_SYNC_POINT("MultipleRecoveryThreads:3");
TEST_SYNC_POINT("MultipleRecoveryThreads:2");
// Remove error injection so that second thread recovery can go through
fault_fs_->SetFilesystemActive(true);
second_write.join();
// Wait for second write to recover
ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
Destroy(options);
}

TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover2) {
// Activate the FS before the first resume
std::shared_ptr<ErrorHandlerFSListener> listener(
Expand Down
2 changes: 2 additions & 0 deletions db/event_helpers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,8 @@ void EventHelpers::NotifyOnErrorRecoveryEnd(
db_mutex->AssertHeld();
// release lock while notifying events
db_mutex->Unlock();
TEST_SYNC_POINT("NotifyOnErrorRecoveryEnd:MutexUnlocked:1");
TEST_SYNC_POINT("NotifyOnErrorRecoveryEnd:MutexUnlocked:2");
for (auto& listener : listeners) {
BackgroundErrorRecoveryInfo info;
info.old_bg_error = old_bg_error;
Expand Down

0 comments on commit 87fe776

Please sign in to comment.