From c035675c7749de9164f3fe69dd4c78e7582d4665 Mon Sep 17 00:00:00 2001 From: VasuDevrani <101383635+VasuDevrani@users.noreply.github.com> Date: Thu, 16 May 2024 14:06:20 +0000 Subject: [PATCH 1/7] improve logging message for retryableIOErrors --- src/server/server.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/server/server.cc b/src/server/server.cc index 0ebc142cbbb..38bcbd2f294 100644 --- a/src/server/server.cc +++ b/src/server/server.cc @@ -808,9 +808,13 @@ void Server::cron() { // In order to properly handle all possible situations on rocksdb, we manually resume here // when encountering no space error and disk quota exceeded error. if (counter != 0 && counter % 600 == 0 && storage->IsDBInRetryableIOError()) { - storage->GetDB()->Resume(); - LOG(INFO) << "[server] Schedule to resume DB after retryable IO error"; - storage->SetDBInRetryableIOError(false); + rocksdb::Status status = storage->GetDB()->Resume(); + if (status.ok()) { + LOG(WARNING) << "[server] Successfully resumed DB after retryable IO error: " << status.ToString(); + storage->SetDBInRetryableIOError(false); + } else { + LOG(ERROR) << "[server] Failed to resume DB after retryable IO error: " << status.ToString(); + } } // check if we need to clean up exited worker threads every 5s From f715076005cc85b52cef69dea241826e60b64d1c Mon Sep 17 00:00:00 2001 From: VasuDevrani <101383635+VasuDevrani@users.noreply.github.com> Date: Thu, 16 May 2024 19:48:01 +0530 Subject: [PATCH 2/7] Update src/server/server.cc Co-authored-by: Twice --- src/server/server.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/server.cc b/src/server/server.cc index 38bcbd2f294..55c8d53046b 100644 --- a/src/server/server.cc +++ b/src/server/server.cc @@ -810,7 +810,7 @@ void Server::cron() { if (counter != 0 && counter % 600 == 0 && storage->IsDBInRetryableIOError()) { rocksdb::Status status = storage->GetDB()->Resume(); if (status.ok()) { - LOG(WARNING) << "[server] Successfully resumed DB after retryable IO error: " << status.ToString(); + LOG(WARNING) << "[server] Successfully resumed DB after retryable IO error."; storage->SetDBInRetryableIOError(false); } else { LOG(ERROR) << "[server] Failed to resume DB after retryable IO error: " << status.ToString(); From c695aa5542a1333e65d3a2810e3d3da35cdfdd6f Mon Sep 17 00:00:00 2001 From: Twice Date: Thu, 16 May 2024 23:27:59 +0900 Subject: [PATCH 3/7] Update src/server/server.cc Co-authored-by: Myth --- src/server/server.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/server.cc b/src/server/server.cc index 55c8d53046b..62ad7319130 100644 --- a/src/server/server.cc +++ b/src/server/server.cc @@ -810,7 +810,7 @@ void Server::cron() { if (counter != 0 && counter % 600 == 0 && storage->IsDBInRetryableIOError()) { rocksdb::Status status = storage->GetDB()->Resume(); if (status.ok()) { - LOG(WARNING) << "[server] Successfully resumed DB after retryable IO error."; + LOG(WARNING) << "[server] Successfully resumed DB after retryable IO error"; storage->SetDBInRetryableIOError(false); } else { LOG(ERROR) << "[server] Failed to resume DB after retryable IO error: " << status.ToString(); From ddd77712a005e75f430d5aeded6831b66ee43e96 Mon Sep 17 00:00:00 2001 From: Twice Date: Thu, 16 May 2024 23:28:24 +0900 Subject: [PATCH 4/7] Update src/server/server.cc Co-authored-by: Myth --- src/server/server.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/server.cc b/src/server/server.cc index 62ad7319130..ce05103e232 100644 --- a/src/server/server.cc +++ b/src/server/server.cc @@ -808,7 +808,7 @@ void Server::cron() { // In order to properly handle all possible situations on rocksdb, we manually resume here // when encountering no space error and disk quota exceeded error. if (counter != 0 && counter % 600 == 0 && storage->IsDBInRetryableIOError()) { - rocksdb::Status status = storage->GetDB()->Resume(); + auto s = storage->GetDB()->Resume(); if (status.ok()) { LOG(WARNING) << "[server] Successfully resumed DB after retryable IO error"; storage->SetDBInRetryableIOError(false); From e9346f07a706bae463c82f07f751e56a79134edc Mon Sep 17 00:00:00 2001 From: Myth Date: Thu, 16 May 2024 22:30:09 +0800 Subject: [PATCH 5/7] Update src/server/server.cc --- src/server/server.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/server.cc b/src/server/server.cc index ce05103e232..42398da3bde 100644 --- a/src/server/server.cc +++ b/src/server/server.cc @@ -809,7 +809,7 @@ void Server::cron() { // when encountering no space error and disk quota exceeded error. if (counter != 0 && counter % 600 == 0 && storage->IsDBInRetryableIOError()) { auto s = storage->GetDB()->Resume(); - if (status.ok()) { + if (s.ok()) { LOG(WARNING) << "[server] Successfully resumed DB after retryable IO error"; storage->SetDBInRetryableIOError(false); } else { From 2193497ffac3942aed03721a4b7fa22b8081ebfa Mon Sep 17 00:00:00 2001 From: Myth Date: Thu, 16 May 2024 22:30:16 +0800 Subject: [PATCH 6/7] Update src/server/server.cc --- src/server/server.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/server.cc b/src/server/server.cc index 42398da3bde..c73fbf89060 100644 --- a/src/server/server.cc +++ b/src/server/server.cc @@ -813,7 +813,7 @@ void Server::cron() { LOG(WARNING) << "[server] Successfully resumed DB after retryable IO error"; storage->SetDBInRetryableIOError(false); } else { - LOG(ERROR) << "[server] Failed to resume DB after retryable IO error: " << status.ToString(); + LOG(ERROR) << "[server] Failed to resume DB after retryable IO error: " << s.ToString(); } } From ccaf7e629f9b8ce718cf174449de05e50f725d9c Mon Sep 17 00:00:00 2001 From: VasuDevrani <101383635+VasuDevrani@users.noreply.github.com> Date: Thu, 16 May 2024 21:13:32 +0530 Subject: [PATCH 7/7] Update src/server/server.cc Co-authored-by: Twice --- src/server/server.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/server.cc b/src/server/server.cc index c73fbf89060..7ed970bfc24 100644 --- a/src/server/server.cc +++ b/src/server/server.cc @@ -811,10 +811,10 @@ void Server::cron() { auto s = storage->GetDB()->Resume(); if (s.ok()) { LOG(WARNING) << "[server] Successfully resumed DB after retryable IO error"; - storage->SetDBInRetryableIOError(false); } else { LOG(ERROR) << "[server] Failed to resume DB after retryable IO error: " << s.ToString(); } + storage->SetDBInRetryableIOError(false); } // check if we need to clean up exited worker threads every 5s