Skip to content

Commit

Permalink
fixup! update-agent: react to persistent deploy failure
Browse files Browse the repository at this point in the history
  • Loading branch information
lucab committed May 20, 2020
1 parent 43e74d2 commit 3c5beba
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 44 deletions.
35 changes: 22 additions & 13 deletions src/update_agent/actor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,19 +219,17 @@ impl UpdateAgent {
fn tick_stage_update(&mut self, release: Release) -> ResponseActFuture<Self, Result<(), ()>> {
trace!("trying to stage an update");

let target = release.clone();
let can_fetch = self.strategy.can_check_and_fetch(&self.identity);
let state_change = actix::fut::wrap_future::<_, Self>(can_fetch)
.then(|can_fetch, actor, _ctx| actor.locked_upgrade(can_fetch, release))
.map(|res, actor, _ctx| {
let success = res.is_ok();
if let Some(abandoned) = actor.state.deploy_attempted(success) {
log::warn!(
"persistent deploy failure detected, target release '{}' abandoned",
abandoned.version
);
};
Ok(())
});
let deploy_outcome = actix::fut::wrap_future::<_, Self>(can_fetch)
.then(|can_fetch, actor, _ctx| actor.attempt_deploy(can_fetch, target));
let state_change = deploy_outcome.map(move |res, actor, _ctx| {
match res {
Ok(_) => actor.state.update_staged(release),
Err(_) => actor.deploy_attempt_failed(release),
};
Ok(())
});

Box::new(state_change)
}
Expand Down Expand Up @@ -263,7 +261,7 @@ impl UpdateAgent {
}

/// Fetch and stage an update, in finalization-locked mode.
fn locked_upgrade(
fn attempt_deploy(
&mut self,
can_fetch: bool,
release: Release,
Expand All @@ -290,6 +288,17 @@ impl UpdateAgent {
Box::new(upgrade)
}

/// Account for a deploy attempt that did not succeed.
///
/// The failure is registered on the agent state via `record_failed_deploy`.
/// When that call reports that the target has been abandoned, a warning
/// naming the version of `release` is logged; otherwise nothing is logged.
fn deploy_attempt_failed(&mut self, release: Release) {
    // Guard clause: nothing to report unless the state reports abandonment.
    if !self.state.record_failed_deploy() {
        return;
    }

    log::warn!(
        "persistent deploy failure detected, target release '{}' abandoned",
        release.version
    );
}

/// List local deployments.
fn local_deployments(
&mut self,
Expand Down
91 changes: 60 additions & 31 deletions src/update_agent/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,16 @@ use chrono::prelude::*;
use prometheus::IntGauge;
use std::time::Duration;

/// Default tick/refresh period for the state machine (in seconds).
const DEFAULT_REFRESH_PERIOD_SECS: u64 = 300; // 5 minutes.
/// Default refresh interval for steady state (in seconds).
pub(crate) const DEFAULT_STEADY_INTERVAL_SECS: u64 = 300; // 5 minutes.

/// Default tick/refresh period for the state machine (in seconds).
const DEFAULT_REFRESH_PERIOD_SECS: u64 = 300; // 5 minutes.

/// Maximum failed deploy attempts in a row in `UpdateAvailable` state
/// before abandoning a target update.
const MAX_DEPLOY_ATTEMPTS: u8 = 12;

lazy_static::lazy_static! {
static ref LATEST_STATE_CHANGE: IntGauge = register_int_gauge!(opts!(
"zincati_update_agent_latest_state_change_timestamp",
Expand All @@ -36,6 +41,10 @@ enum UpdateAgentState {
/// No further updates available yet.
NoNewUpdate,
/// Update available from Cincinnati.
///
/// The integer counter keeps track of how many times in a row this
/// update was attempted, but deploying failed. At `MAX_DEPLOY_ATTEMPTS`
/// a state transition is triggered to abandon the target update.
UpdateAvailable((Release, u8)),
/// Update staged by rpm-ostree.
UpdateStaged(Release),
Expand Down Expand Up @@ -117,40 +126,30 @@ impl UpdateAgentState {
self.transition_to(target);
}

/// Record a deploy attempt in UpdateAvailable state.
/// Record a failed deploy attempt in UpdateAvailable state.
///
/// In case of persistent failure, this returns the update version which
/// is being abandoned.
fn deploy_attempted(&mut self, success: bool) -> Option<Release> {
// Maximum failed deploy attempts before declaring a persistent error.
const MAX_DEPLOY_ATTEMPTS: u8 = 12;

/// This returns whether a persistent deploy failure was detected
/// and the target update abandoned.
fn record_failed_deploy(&mut self) -> bool {
let (release, attempts) = match self.clone() {
UpdateAgentState::UpdateAvailable((r, a)) => (r, a),
_ => unreachable!("transition not allowed: deploy_attempted on {:?}", self,),
_ => unreachable!("transition not allowed: record_failed_deploy on {:?}", self,),
};
let fail_count = attempts.saturating_add(1);
let persistent_err = fail_count >= MAX_DEPLOY_ATTEMPTS;

let persistent_err = attempts >= MAX_DEPLOY_ATTEMPTS;
match (success, persistent_err) {
(true, _) => {
self.update_staged(release);
None
}
(false, false) => {
self.deploy_failed(release, attempts);
None
}
(false, true) => {
self.update_abandoned();
Some(release)
}
if persistent_err {
self.update_abandoned();
} else {
self.deploy_failed(release, fail_count);
}

persistent_err
}

/// Transition to the UpdateAvailable state after a deploy failure.
fn deploy_failed(&mut self, update: Release, attempts: u8) {
let failed = attempts.saturating_add(1);
let target = UpdateAgentState::UpdateAvailable((update, failed));
fn deploy_failed(&mut self, update: Release, fail_count: u8) {
let target = UpdateAgentState::UpdateAvailable((update, fail_count));

self.transition_to(target);
}
Expand Down Expand Up @@ -268,15 +267,14 @@ mod tests {
UpdateAgentState::UpdateAvailable((update.clone(), 0))
);

let a1 = machine.deploy_attempted(false);
assert_eq!(a1, None);
let persistent_err = machine.record_failed_deploy();
assert_eq!(persistent_err, false);
assert_eq!(
machine,
UpdateAgentState::UpdateAvailable((update.clone(), 1))
);

let a2 = machine.deploy_attempted(true);
assert_eq!(a2, None);
machine.update_staged(update.clone());
assert_eq!(machine, UpdateAgentState::UpdateStaged(update.clone()));

machine.update_finalized(update.clone());
Expand All @@ -285,4 +283,35 @@ mod tests {
machine.end();
assert_eq!(machine, UpdateAgentState::EndState);
}

/// Checks that repeated deploy failures eventually abandon the target update:
/// failures below `MAX_DEPLOY_ATTEMPTS` keep the machine in `UpdateAvailable`
/// with an increasing counter, and the failure that reaches the threshold
/// moves it back to `NoNewUpdate`.
#[test]
fn test_fsm_abandon_update() {
    let update = Release {
        version: "v1".to_string(),
        checksum: "ostree-checksum".to_string(),
        age_index: None,
    };
    let mut machine = UpdateAgentState::NoNewUpdate;

    // A newly available update starts with a failure counter of zero.
    machine.update_available(update.clone());
    assert_eq!(
        machine,
        UpdateAgentState::UpdateAvailable((update.clone(), 0))
    );

    // MAX-1 temporary failures: the target is kept and the counter tracks
    // the number of failures recorded so far. The range is over the `u8`
    // constant, so the loop variable needs no cast.
    for expected_failures in 1..MAX_DEPLOY_ATTEMPTS {
        assert!(!machine.record_failed_deploy());
        assert_eq!(
            machine,
            UpdateAgentState::UpdateAvailable((update.clone(), expected_failures))
        );
    }

    // Persistent error threshold reached: the target update is abandoned.
    assert!(machine.record_failed_deploy());
    assert_eq!(machine, UpdateAgentState::NoNewUpdate);
}
}

0 comments on commit 3c5beba

Please sign in to comment.