diff --git a/config/template.toml b/config/template.toml
index 7734585..ca73215 100644
--- a/config/template.toml
+++ b/config/template.toml
@@ -61,6 +61,13 @@ snapshot_console_on_crash = true
snapshot_console_on_manual_stop = false
# Save last console out on user invoked service kill, default false
snapshot_console_on_manual_kill = false
+# Base delay in milliseconds to wait before an automatic restart
+retry_backoff_ms = 1000
+# Whether to back off exponentially, using retry_backoff_ms as the base delay
+retry_expotential_backoff = true
+# Maximum number of consecutive automatic restarts before giving up
+retry_max = 10
+
[[services]]
id = 1
diff --git a/examples/crash_backoff.rs b/examples/crash_backoff.rs
new file mode 100644
index 0000000..6aefc08
--- /dev/null
+++ b/examples/crash_backoff.rs
@@ -0,0 +1,9 @@
+use std::io::{Error, ErrorKind, Result};
+
+/// Example program that fails on every start.
+///
+/// Always returns an `ErrorKind::Other` I/O error carrying the message "oh no!".
+fn main() -> Result<()> {
+    let failure = Error::new(ErrorKind::Other, "oh no!");
+    Err(failure)
+}
\ No newline at end of file
diff --git a/frontend/src/lib/Api.js b/frontend/src/lib/Api.js
index 35d37c2..a1b5d8f 100644
--- a/frontend/src/lib/Api.js
+++ b/frontend/src/lib/Api.js
@@ -14,6 +14,8 @@ export const ServiceState = {
Crashed: "Crashed",
Stopping: "Stopping",
Killed: "Killed",
+ EndedBackoff: "EndedBackoff",
+ CrashedBackoff: "CrashedBackoff",
};
export const ConsoleType = {
@@ -175,6 +177,7 @@ export class Permissions {
}
export class Log {
+ static ServiceMaxRetries = "ServiceMaxRetries";
static SystemStart = "SystemStartup";
static KilledCmd = "ServiceCmdKilled";
static Killed = "ServiceKilled";
@@ -213,6 +216,7 @@ export function formatLog(entry) {
switch (Object.keys(entry.action)[0]) {
case Log.StartFailure: return "Startup failure: "+entry.action[Log.StartFailure];
case Log.Crash: return "Service crashed, signal "+entry.action[Log.Crash];
+ case Log.ServiceMaxRetries: return "Maximum start retries reached: "+entry.action[Log.ServiceMaxRetries];
case Log.Input: return "Console input by "+entry.invoker.name+": "+entry.action[Log.Input];
}
}
diff --git a/frontend/src/views/service.js b/frontend/src/views/service.js
index e4bb4e5..e812d40 100644
--- a/frontend/src/views/service.js
+++ b/frontend/src/views/service.js
@@ -193,7 +193,8 @@ export default class Service extends React.Component {
render () {
const running = this.state.state === ServiceState.Running;
const stopping = this.state.state === ServiceState.Stopping;
- const stopped = !running && !stopping;
+ const backoff = this.state.state === ServiceState.EndedBackoff || this.state.state === ServiceState.CrashedBackoff;
+ const stopped = !running && !stopping && !backoff;
const perms = this.state.permissions;
const perm_console = Permissions.hasFlag(perms, Permissions.OUTPUT) || Permissions.hasFlag(perms, Permissions.STDIN_ALL);
const perm_log = Permissions.hasFlag(perms, Permissions.LOG);
@@ -231,6 +232,10 @@ export default class Service extends React.Component {
}
+ {backoff &&
+
+ }
{(running || stopping) &&
diff --git a/src/db/models.rs b/src/db/models.rs
index a24cc66..a4b27ac 100644
--- a/src/db/models.rs
+++ b/src/db/models.rs
@@ -200,6 +200,7 @@ impl NewLogEntry {
#[derive(Debug, Serialize, Deserialize, PartialEq)]
pub enum LogAction {
SystemStartup,
+ ServiceMaxRetries(usize),
ServiceCmdKilled,
ServiceKilled,
ServiceCmdStop,
diff --git a/src/handler/error.rs b/src/handler/error.rs
index 036c0d2..afd1667 100644
--- a/src/handler/error.rs
+++ b/src/handler/error.rs
@@ -129,6 +129,8 @@ pub enum ControllerError {
DBError(db::Error),
#[fail(display = "Service has no soft-stop parameter")]
NoSoftStop,
+ #[fail(display = "Service has no backoff handle!")]
+ NoBackoffHandle,
}
impl From for ControllerError {
diff --git a/src/handler/messages.rs b/src/handler/messages.rs
index 427683e..4ee486b 100644
--- a/src/handler/messages.rs
+++ b/src/handler/messages.rs
@@ -193,7 +193,7 @@ pub mod unchecked {
#[rtype(result = "Result<(), ControllerError>")]
pub struct StartService {
pub id: SID,
- /// Invoker to use for logging
+ /// Invoker used for logging; `Some` marks a user-initiated start, `None` an internal (re)start
pub user: Option,
}
diff --git a/src/handler/service.rs b/src/handler/service.rs
index 35b26a2..4bb5962 100644
--- a/src/handler/service.rs
+++ b/src/handler/service.rs
@@ -15,7 +15,7 @@ use failure::Fallible;
use futures::stream::StreamExt;
use metrohash::MetroHashMap;
use serde::Serialize;
-use std::env::current_dir;
+use std::{env::current_dir, time::Duration};
use std::ffi::OsString;
use std::path::Path;
use strip_ansi_escapes as ansi_esc;
@@ -105,7 +105,7 @@ impl Handler for ServiceController {
return Err(ControllerError::ServiceRunning);
}
trace!("starting..");
- if let Err(e) = instance.run(ctx.address()) {
+ if let Err(e) = instance.run(ctx.address(),msg.user.is_some()) {
return Err(ControllerError::StartupIOError(e));
}
Self::log(
@@ -165,8 +165,12 @@ impl Handler for ServiceController {
None,
);
return Ok(());
+ } else if service.state.in_backoff() {
+ // handle kill during backoff, abort backoff restart
+ service.stop_backoff()
+ } else {
+ Err(ControllerError::NoServiceHandle)
}
- Err(ControllerError::NoServiceHandle)
} else {
Err(ControllerError::InvalidInstance(msg.id))
}
@@ -178,7 +182,10 @@ impl Handler for ServiceController {
fn handle(&mut self, msg: StopService, _ctx: &mut Context) -> Self::Result {
if let Some(service) = self.services.get_mut(&msg.id) {
- if !service.running.load(Ordering::Acquire) {
-                return Err(ControllerError::ServiceStopped);
+            if !service.running.load(Ordering::Acquire) {
+                // Not running: if a delayed restart is pending, abort it; otherwise the
+                // service is already stopped. A running service must skip this guard
+                // entirely so the regular stop logic below can handle it.
+                if service.state.in_backoff() {
+                    service.stop_backoff()?;
+                } else {
+                    return Err(ControllerError::ServiceStopped);
+                }
}
let stdin = match service.stdin.as_mut() {
@@ -229,9 +236,18 @@ impl Handler for ServiceController {
snapshot = instance.model.snapshot_console_on_manual_kill;
LogAction::ServiceKilled
}
+ State::ServiceMaxRetries => {
+ LogAction::ServiceMaxRetries(instance.backoff_counter)
+ }
State::Stopping => {
unreachable!("unreachable: service-stopping-state in state update!")
}
+ State::EndedBackoff => {
+ unreachable!("unreachable: service-ended-backoff-state in state update!")
+ }
+ State::CrashedBackoff => {
+ unreachable!("unreachable: service-crashed-backoff-state in state update!")
+ }
};
let log_data = match snapshot {
@@ -250,15 +266,57 @@ impl Handler for ServiceController {
instance.model.restart && state == State::Crashed
};
+ trace!("restart: {}",restart);
if restart {
- ctx.address().do_send(StartService {
- id: instance.model.id,
- user: None,
- });
+ instance.backoff_counter += 1;
+ // if no max retry limit and no backoff time is set, restart instantly
+ if instance.model.retry_max.is_none() && instance.model.retry_backoff_ms.is_none() {
+ info!("No backoff limit/time configured, restarting \"{}\" instantly.",instance.model.name);
+ ctx.address().do_send(StartService {
+ id: instance.model.id,
+ user: None,
+ });
+ } else if instance.can_backoff() {
+ let backoff_time = instance.get_backoff_time();
+ let id = instance.model.id.clone();
+ let name = instance.model.name.clone();
+ let flag = instance.backoff_kill_flag.clone();
+ let addr = ctx.address();
+ let (fut,aborter) = future::abortable(async move {
+ tokio::time::delay_for(backoff_time).await;
+ if flag.load(Ordering::Acquire) {
+ return;
+ }
+ trace!("Restarting from backoff");
+ if let Err(e) = addr.try_send(StartService {
+ id,
+ user: None,
+ }) {
+ warn!("Unable to send restart message from backoff for {} {}", name, e);
+ }
+ });
+ let id = instance.model.id.clone();
+ spawn(fut.map(move |v| {
+ if let Err(e) = v {
+ error!("Backoff error instance {}: {}", id, e);
+ }
+ }));
+ instance.backoff_kill_handle = Some(aborter);
+ } else {
+ trace!("Reached max retries!");
+ instance.state.set_state(State::ServiceMaxRetries);
+ ctx.address().do_send(ServiceStateChanged {
+ id: instance.model.id,
+ running: false,
+ });
+ // TODO: log max retries
+ }
} else {
// cleanup
instance.kill_handle = None;
instance.stdin = None;
+ // reset backoff
+ instance.reset_backoff(true);
}
}
}
@@ -460,6 +518,12 @@ struct Instance {
stdin: Option>,
start_time: Option,
end_time: Option,
+ last_backoff: Option,
+ backoff_counter: usize,
+ /// Handle to kill backoff timer to abort a delayed restart
+ backoff_kill_handle: Option,
+ /// Flag checked by the backoff future once its delay ends, to avoid a race between the abort handle and the future being polled
+ backoff_kill_flag: Arc,
}
#[derive(PartialEq, Serialize)]
@@ -467,9 +531,12 @@ pub enum State {
Stopped = 0,
Running = 1,
Ended = 2,
- Crashed = 3,
- Stopping = 4,
- Killed = 5,
+ EndedBackoff = 3,
+ Crashed = 4,
+ CrashedBackoff = 5,
+ Stopping = 6,
+ Killed = 7,
+ ServiceMaxRetries = 8,
}
// derived from https://gist.github.com/polypus74/eabc7bb00873e6b90abe230f9e632989
@@ -492,6 +559,12 @@ impl StateFlag {
pub fn set_state(&self, state: State) {
self.inner.store(state as usize, Ordering::SeqCst)
}
+    /// Reports whether the service is currently waiting out a backoff delay,
+    /// i.e. a delayed restart is pending.
+    ///
+    /// True exactly when the state is `CrashedBackoff` or `EndedBackoff`.
+    pub fn in_backoff(&self) -> bool {
+        match self.get_state() {
+            State::CrashedBackoff | State::EndedBackoff => true,
+            _ => false,
+        }
+    }
}
impl From for State {
@@ -501,10 +574,13 @@ impl From for State {
0 => Stopped,
1 => Running,
2 => Ended,
- 3 => Crashed,
- 4 => Stopping,
- 5 => Killed,
- _ => unreachable!(),
+ 3 => EndedBackoff,
+ 4 => Crashed,
+ 5 => CrashedBackoff,
+ 6 => Stopping,
+ 7 => Killed,
+ 8 => ServiceMaxRetries,
+ _ => unreachable!("Invalid service state: {}",val),
}
}
}
@@ -539,8 +615,8 @@ impl Instance {
msg
}
/// Run instance, outer catch function to log startup errors to tty
- fn run(&mut self, addr: Addr) -> Result<(), ::std::io::Error> {
- let res = self.run_internal(addr);
+ fn run(&mut self, addr: Addr, user_initiated: bool) -> Result<(), ::std::io::Error> {
+ let res = self.run_internal(addr, user_initiated);
if let Err(e) = &res {
let mut buffer_w = self.tty.write().expect("Can't write buffer!");
buffer_w.push_back(ConsoleType::State(
@@ -588,13 +664,14 @@ impl Instance {
}
/// real service starter
- fn run_internal(&mut self, addr: Addr) -> Result<(), ::std::io::Error> {
+ fn run_internal(&mut self, addr: Addr, user_initiated: bool) -> Result<(), ::std::io::Error> {
if self.model.enabled
&& !self
.running
- .compare_and_swap(false, true, Ordering::Acquire)
+ .compare_and_swap(false, true, Ordering::AcqRel)
{
- trace!("Starting {}", self.model.name);
+ self.reset_backoff(user_initiated);
+ trace!("Starting {}, through user: {}", self.model.name,user_initiated);
{
let mut buffer_w = self.tty.write().expect("Can't write buffer!");
buffer_w.push_back(ConsoleType::State(
@@ -774,6 +851,38 @@ impl Instance {
}
Ok(())
}
+    /// Clears the backoff bookkeeping of this instance.
+    ///
+    /// Forgets `last_backoff` and lowers the backoff kill flag. The retry counter
+    /// is zeroed only when `backoff` is `true`.
+    fn reset_backoff(&mut self, backoff: bool) {
+        trace!("Resetting backoff, counter: {}",backoff);
+        if backoff {
+            self.backoff_counter = 0;
+        }
+        self.backoff_kill_flag.store(false, Ordering::Release);
+        self.last_backoff = None;
+    }
+    /// Tells whether another automatic restart is still allowed.
+    ///
+    /// Without a configured `retry_max` this is always `true`; otherwise the
+    /// retry counter has to be below that limit.
+    fn can_backoff(&self) -> bool {
+        trace!("Backoff retries: {}/{:?}",self.backoff_counter,self.model.retry_max);
+        match self.model.retry_max {
+            Some(limit) => self.backoff_counter < limit,
+            None => true,
+        }
+    }
+    /// Computes how long to wait before the next automatic restart.
+    ///
+    /// The delay is `retry_backoff_ms` (10 000 ms when unset) multiplied by the
+    /// current retry counter, so it grows linearly with each retry. The product
+    /// saturates at `u64::MAX` milliseconds instead of overflowing.
+    fn get_backoff_time(&self) -> Duration {
+        trace!("get_backoff_time");
+        // NOTE(review): the template's `retry_expotential_backoff` option is not read here.
+        let base_ms = self.model.retry_backoff_ms.unwrap_or(10_000);
+        // A plain `*` would panic in debug builds (and wrap in release builds)
+        // for very large configured delays.
+        Duration::from_millis(base_ms.saturating_mul(self.backoff_counter as u64))
+    }
+    /// Cancels a pending delayed restart.
+    ///
+    /// Takes the stored abort handle, raises the backoff kill flag and aborts the
+    /// timer future.
+    ///
+    /// # Errors
+    /// Returns `ControllerError::NoBackoffHandle` when no abort handle is stored.
+    fn stop_backoff(&mut self) -> Result<(), ControllerError> {
+        trace!("stop_backoff");
+        match self.backoff_kill_handle.take() {
+            None => Err(ControllerError::NoBackoffHandle),
+            Some(aborter) => {
+                // The flag goes up before the abort so that a future which has
+                // already finished its delay still sees the cancellation.
+                self.backoff_kill_flag.store(true, Ordering::Release);
+                aborter.abort();
+                Ok(())
+            }
+        }
+    }
}
impl From for Instance {
@@ -788,6 +897,14 @@ impl From for Instance {
stdin: None,
start_time: None,
end_time: None,
+ backoff_counter: 0,
+ last_backoff: None,
+ backoff_kill_handle: None,
+ backoff_kill_flag: Arc::new(AtomicBool::new(false)),
+ // TODO:
+ // add kill-switch for delayed future, https://docs.rs/tokio/0.2.22/tokio/time/fn.delay_for.html
+ // to allow delayed backoff future that restarts, but can also be cancelled on manual interaction
+ // need to also add some kind of additional state flag to show the user a running backoff
}
}
-}
+}
\ No newline at end of file
diff --git a/src/settings.rs b/src/settings.rs
index 2cef33e..87c6bd0 100644
--- a/src/settings.rs
+++ b/src/settings.rs
@@ -76,6 +76,10 @@ pub struct Service {
pub snapshot_console_on_manual_stop: bool,
#[serde(default)]
pub snapshot_console_on_manual_kill: bool,
+ #[serde(default)]
+ pub retry_max: Option,
+ #[serde(default)]
+ pub retry_backoff_ms: Option,
}
impl Settings {
@@ -166,6 +170,8 @@ mod tests {
snapshot_console_on_manual_kill: true,
id: 0,
restart: true,
+ retry_backoff_ms: Some(0),
+ retry_max: Some(0)
},
Service {
name: "some service2".to_owned(),
@@ -183,6 +189,8 @@ mod tests {
args: vec!["asd".to_owned(), "def".to_owned()],
id: 1,
restart: true,
+ retry_backoff_ms: Some(0),
+ retry_max: Some(0)
},
],
};