Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Retry check workflow query to be more resilient to backend failures #1740

Merged
7 changes: 6 additions & 1 deletion crates/rover-client/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,9 +200,14 @@ pub enum RoverClientError {
     )]
     PlanError { msg: String },
 
-    #[error("Your check took too long to run")]
+    #[error("The check workflow took too long to run.")]
     ChecksTimeoutError { url: Option<String> },
 
+    #[error(
+        "A check workflow status was reported but it was not specified as a pass or a failure."
+    )]
+    UnknownCheckWorkflowStatus,
+
     #[error("You cannot publish a new subgraph without specifying a routing URL.")]
     MissingRoutingUrlError {
         subgraph_name: String,
Expand Down
33 changes: 18 additions & 15 deletions crates/rover-client/src/operations/graph/check_workflow/runner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,26 +42,31 @@ pub fn run(
     client: &StudioClient,
 ) -> Result<CheckWorkflowResponse, RoverClientError> {
     let graph_ref = input.graph_ref.clone();
-    let mut data;
+    let mut url: Option<String> = None;
     let now = Instant::now();
     loop {
-        data = client.post::<GraphCheckWorkflowQuery>(input.clone().into())?;
-        let graph = data.clone().graph.ok_or(RoverClientError::GraphNotFound {
-            graph_ref: graph_ref.clone(),
-        })?;
-        if let Some(check_workflow) = graph.check_workflow {
-            if !matches!(check_workflow.status, CheckWorkflowStatus::PENDING) {
-                break;
+        let result = client.post::<GraphCheckWorkflowQuery>(input.clone().into());
+        match result {
+            Ok(data) => {
+                let graph = data.clone().graph.ok_or(RoverClientError::GraphNotFound {
+                    graph_ref: graph_ref.clone(),
+                })?;
+                if let Some(check_workflow) = graph.check_workflow {
+                    if !matches!(check_workflow.status, CheckWorkflowStatus::PENDING) {
+                        return get_check_response_from_data(data, graph_ref);
+                    }
+                }
+                url = get_target_url_from_data(data);
+            }
+            Err(e) => {
+                eprintln!("error while checking status of check: {e}\nthis error may be transient... retrying");
             }
         }
         if now.elapsed() > Duration::from_secs(input.checks_timeout_seconds) {
-            return Err(RoverClientError::ChecksTimeoutError {
-                url: get_target_url_from_data(data),
-            });
+            return Err(RoverClientError::ChecksTimeoutError { url });
[Review thread on the line above: EverlastingBugstopper marked this conversation as resolved.]
         }
         std::thread::sleep(Duration::from_secs(5));
     }
-    get_check_response_from_data(data, graph_ref)
 }
 
 fn get_check_response_from_data(
Expand Down Expand Up @@ -141,9 +146,7 @@ fn get_check_response_from_data(
             graph_ref,
             check_response: Box::new(check_response),
         }),
-        _ => Err(RoverClientError::ChecksTimeoutError {
-            url: Some(default_target_url),
-        }),
+        _ => Err(RoverClientError::UnknownCheckWorkflowStatus),
     }
 }

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,26 +47,31 @@ pub fn run(
     client: &StudioClient,
 ) -> Result<CheckWorkflowResponse, RoverClientError> {
     let graph_ref = input.graph_ref.clone();
-    let mut data;
+    let mut url: Option<String> = None;
     let now = Instant::now();
     loop {
-        data = client.post::<SubgraphCheckWorkflowQuery>(input.clone().into())?;
-        let graph = data.clone().graph.ok_or(RoverClientError::GraphNotFound {
-            graph_ref: graph_ref.clone(),
-        })?;
-        if let Some(check_workflow) = graph.check_workflow {
-            if !matches!(check_workflow.status, CheckWorkflowStatus::PENDING) {
-                break;
+        let result = client.post::<SubgraphCheckWorkflowQuery>(input.clone().into());
+        match result {
+            Ok(data) => {
+                let graph = data.clone().graph.ok_or(RoverClientError::GraphNotFound {
+                    graph_ref: graph_ref.clone(),
+                })?;
+                if let Some(check_workflow) = graph.check_workflow {
+                    if !matches!(check_workflow.status, CheckWorkflowStatus::PENDING) {
+                        return get_check_response_from_data(data, graph_ref, subgraph);
+                    }
+                }
+                url = get_target_url_from_data(data);
+            }
+            Err(e) => {
+                eprintln!("error while checking status of check: {e}\nthis error may be transient... retrying");
             }
         }
         if now.elapsed() > Duration::from_secs(input.checks_timeout_seconds) {
-            return Err(RoverClientError::ChecksTimeoutError {
-                url: get_target_url_from_data(data),
-            });
+            return Err(RoverClientError::ChecksTimeoutError { url });
         }
         std::thread::sleep(Duration::from_secs(5));
     }
-    get_check_response_from_data(data, graph_ref, subgraph)
 }
 
 fn get_check_response_from_data(
Expand Down Expand Up @@ -203,9 +208,7 @@ fn get_check_response_from_data(
             graph_ref,
             check_response: Box::new(check_response),
         }),
-        _ => Err(RoverClientError::ChecksTimeoutError {
-            url: Some(default_target_url),
-        }),
+        _ => Err(RoverClientError::UnknownCheckWorkflowStatus),
     }
 }

Expand Down
3 changes: 3 additions & 0 deletions src/error/metadata/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,9 @@ impl From<&mut anyhow::Error> for RoverErrorMetadata {
                 Some(RoverErrorSuggestion::IncreaseChecksTimeout { url: url.clone() }),
                 None,
             ),
+            RoverClientError::UnknownCheckWorkflowStatus => {
+                (Some(RoverErrorSuggestion::SubmitIssue), None)
+            }
             RoverClientError::MissingRoutingUrlError {
                 subgraph_name,
                 graph_ref,
Expand Down