Skip to content

Commit

Permalink
fix: Treat 502s and 503s as transient for GCS OS (#2202)
Browse files Browse the repository at this point in the history
A refactoring introduced lately caused multiple restarts in provers
(namely BWGs) when GCS was unavailable (502 or 503). This is a sporadic,
once in a while, but still invalides tens of minutes of work and makes
proving fickle and slow. This PR addresses the issue and restores old
behavior pre-refactoring, treating 502s and 503s as transient errors.
  • Loading branch information
EmilLuta authored Jun 11, 2024
1 parent b43a881 commit 0a12c52
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 2 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions core/lib/object_store/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ rand.workspace = true
tokio = { workspace = true, features = ["full"] }
tracing.workspace = true
prost.workspace = true
reqwest.workspace = true

[dev-dependencies]
assert_matches.workspace = true
Expand Down
9 changes: 7 additions & 2 deletions core/lib/object_store/src/gcs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ impl From<AuthError> for ObjectStoreError {
fn from(err: AuthError) -> Self {
let is_transient = matches!(
&err,
AuthError::HttpError(err) if err.is_timeout() || err.is_connect() || has_transient_io_source(err)
AuthError::HttpError(err) if err.is_timeout() || err.is_connect() || has_transient_io_source(err) || upstream_unavailable(err)
);
Self::Initialization {
source: err.into(),
Expand All @@ -111,6 +111,11 @@ fn has_transient_io_source(mut err: &(dyn StdError + 'static)) -> bool {
}
}

fn upstream_unavailable(err: &reqwest::Error) -> bool {
err.status() == Some(StatusCode::BAD_GATEWAY)
|| err.status() == Some(StatusCode::SERVICE_UNAVAILABLE)
}

impl From<HttpError> for ObjectStoreError {
fn from(err: HttpError) -> Self {
let is_not_found = match &err {
Expand All @@ -126,7 +131,7 @@ impl From<HttpError> for ObjectStoreError {
} else {
let is_transient = matches!(
&err,
HttpError::HttpClient(err) if err.is_timeout() || err.is_connect() || has_transient_io_source(err)
HttpError::HttpClient(err) if err.is_timeout() || err.is_connect() || has_transient_io_source(err) || upstream_unavailable(err)
);
ObjectStoreError::Other {
is_transient,
Expand Down
1 change: 1 addition & 0 deletions prover/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 0a12c52

Please sign in to comment.