From 843533ec75f0d6d0ffa98490af875d0f1f1a44b0 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 2 Jul 2024 15:13:27 +0200 Subject: [PATCH] fix: noisy logging when download gets cancelled during shutdown (#8224) Before this PR, during timeline shutdown, we'd occasionally see log lines like this one: ``` 2024-06-26T18:28:11.063402Z INFO initial_size_calculation{tenant_id=$TENANT,shard_id=0000 timeline_id=$TIMELINE}:logical_size_calculation_task:get_or_maybe_download{layer=000000000000000000000000000000000000-000000067F0001A3950001C1630100000000__0000000D88265898}: layer file download failed, and caller has been cancelled: Cancelled, shutting down Stack backtrace: 0: as core::ops::try_trait::FromResidual>>::from_residual at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/core/src/result.rs:1964:27 pageserver::tenant::remote_timeline_client::RemoteTimelineClient::download_layer_file::{{closure}} at /home/nonroot/pageserver/src/tenant/remote_timeline_client.rs:531:13 pageserver::tenant::storage_layer::layer::LayerInner::download_and_init::{{closure}} at /home/nonroot/pageserver/src/tenant/storage_layer/layer.rs:1136:14 pageserver::tenant::storage_layer::layer::LayerInner::download_init_and_wait::{{closure}}::{{closure}} at /home/nonroot/pageserver/src/tenant/storage_layer/layer.rs:1082:74 ``` We can eliminate the anyhow backtrace with no loss of information because the conversion to anyhow::Error happens in exactly one place. refs #7427 --- pageserver/src/tenant/remote_timeline_client.rs | 2 +- pageserver/src/tenant/storage_layer/layer.rs | 17 ++++------------- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index e33e4b84aa978..bc9364de61d4b 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -519,7 +519,7 @@ impl RemoteTimelineClient { local_path: &Utf8Path, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( &RemoteOpFileKind::Layer, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 5dd947253578d..02069c29d2644 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1096,19 +1096,10 @@ impl LayerInner { match rx.await { Ok(Ok(res)) => Ok(res), - Ok(Err(e)) => { - // sleep already happened in the spawned task, if it was not cancelled - match e.downcast_ref::() { - // If the download failed due to its cancellation token, - // propagate the cancellation error upstream. - Some(remote_storage::DownloadError::Cancelled) => { - Err(DownloadError::DownloadCancelled) - } - // FIXME: this is not embedding the error because historically it would had - // been output to compute, however that is no longer the case. - _ => Err(DownloadError::DownloadFailed), - } + Ok(Err(remote_storage::DownloadError::Cancelled)) => { + Err(DownloadError::DownloadCancelled) } + Ok(Err(_)) => Err(DownloadError::DownloadFailed), Err(_gone) => Err(DownloadError::DownloadCancelled), } } @@ -1118,7 +1109,7 @@ impl LayerInner { timeline: Arc, permit: heavier_once_cell::InitPermit, ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> Result, remote_storage::DownloadError> { let result = timeline .remote_client .download_layer_file(