Skip to content

Commit

Permalink
auto stop-restart: allow force mode to override scheduled restart
Browse files Browse the repository at this point in the history
  • Loading branch information
oliver-sanders committed Nov 9, 2018
1 parent 32d78a9 commit 4d8b623
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 8 deletions.
31 changes: 23 additions & 8 deletions lib/cylc/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1443,25 +1443,37 @@ def set_auto_restart(self, restart_delay=None,
bool: False if it is not possible to automatically stop/restart
the suite due to its configuration/runtime state.
"""
# Check that the suite itn't already shutting down.
if self.stop_mode or self.auto_restart_time is not None:
# Check that the suite isn't already shutting down.
if self.stop_mode:
return True

# Force mode, stop the suite now, don't restart it.
if mode == self.AUTO_STOP_RESTART_FORCE:
if self.auto_restart_time:
LOG.info('Scheduled automatic restart canceled')
self.auto_restart_time = time()
self.auto_restart_mode = mode
return True

# Check suite isn't already scheduled to auto-stop.
if self.auto_restart_time is not None:
return True

# Check suite is able to be safely restarted.
if not self.can_auto_restart():
return False

LOG.info('Suite will automatically restart on a new host.')
if restart_delay > 0:
# Delay shutdown by a random interval to avoid many
# suites restarting simultaneously.
from random import random
shutdown_delay = int(random() * restart_delay)
if restart_delay is not None and restart_delay != 0:
if restart_delay > 0:
# Delay shutdown by a random interval to avoid many
# suites restarting simultaneously.
from random import random
shutdown_delay = int(random() * restart_delay)
else:
# Un-documented feature, schedule exact restart interval for
# testing purposes.
shutdown_delay = abs(int(restart_delay))
shutdown_time = time() + shutdown_delay
LOG.info('Suite will restart in %ss (at %s)' % (
shutdown_delay, time2str(shutdown_time)))
Expand Down Expand Up @@ -1516,6 +1528,8 @@ def suite_health_check(self, has_changes):
4. Suite contact file has the right info?
"""
LOG.debug('Performing suite health check')

# 1. check if suite is stalled - if so call handler if defined
if self.stop_mode is None and not has_changes:
self.check_suite_stalled()
Expand All @@ -1525,7 +1539,7 @@ def suite_health_check(self, has_changes):
now > self.time_next_health_check):

# 2. check if suite host is condemned - if so auto restart.
if self.auto_restart_time is None and self.stop_mode is None:
if self.stop_mode is None:
current_glbl_cfg = glbl_cfg(cached=False)
for host in current_glbl_cfg.get(['suite servers',
'condemned hosts']):
Expand All @@ -1534,6 +1548,7 @@ def suite_health_check(self, has_changes):
mode = self.AUTO_STOP_RESTART_FORCE
host = host[:-1]
else:
# normal mode (stop and restart the suite)
mode = self.AUTO_STOP_RESTART_NORMAL
if self.auto_restart_time is not None:
# suite is already scheduled to stop-restart only
Expand Down
91 changes: 91 additions & 0 deletions tests/restart/43-auto-restart-force-override-normal.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/bin/bash
# THIS FILE IS PART OF THE CYLC SUITE ENGINE.
# Copyright (C) 2008-2018 NIWA & British Crown (Met Office) & Contributors.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#-------------------------------------------------------------------------------
# Check that "Force Mode" can override a scheduler "Normal Mode" restart.
#
# Outline:
#   1. Start a suite on the local host.
#   2. Condemn the local host in normal mode so that the suite schedules an
#      automatic stop-restart 60 seconds in the future.
#   3. Condemn the local host again in force mode (trailing "!") and check
#      that the scheduled restart is canceled and the suite shuts down.

# Load the shared test harness; the helper commands used below (skip_all,
# init_suite, create_test_globalrc, set_test_number, poll, log_scan,
# purge_suite) are not defined in this file - presumably they come from here.
. "$(dirname "$0")/test_header"

# A second suite host is read from the global config; the whole test is
# skipped when none is configured.
export CYLC_TEST_HOST_2=$( \
cylc get-global-config -i '[test battery]remote host with shared fs' \
2>'/dev/null')
if [[ -z "${CYLC_TEST_HOST_2}" ]]; then
skip_all '"[test battery]remote host with shared fs": not defined'
fi
# The local host is where the suite is started and the host that gets
# condemned later on.
export CYLC_TEST_HOST_1="$(hostname)"

# Global config settings shared by every stage of the test: a short health
# check interval plus inactivity/timeout aborts (2 minutes each).
BASE_GLOBALRC='
[cylc]
health check interval = PT5S
[[events]]
abort on inactivity = True
abort on timeout = True
inactivity = PT2M
timeout = PT2M
'

# Suite definition: a yearly cycling task "foo" where each cycle depends on
# the previous one; no final cycle point is set.
TEST_DIR="$HOME/cylc-run/" init_suite "${TEST_NAME_BASE}" <<< '
[cylc]
abort if any task fails = True
[scheduling]
initial cycle point = 2000
[[dependencies]]
[[[P1Y]]]
graph = foo[-P1Y] => foo
'

# Stage 0 global config: the local host is the only run host.
create_test_globalrc '' "
${BASE_GLOBALRC}
[suite servers]
run hosts = ${CYLC_TEST_HOST_1}
"

# 7 equals the number of patterns given to the two log_scan calls below
# (2 + 5) - presumably log_scan reports one test per pattern; confirm against
# test_header if the pattern lists are changed.
set_test_number 7
#-------------------------------------------------------------------------------
# run suite
cylc run "${SUITE_NAME}"
# NOTE(review): looks like this waits for the suite contact file to appear
# (i.e. for the suite to have started) - confirm against poll in test_header.
poll ! test -f "${SUITE_RUN_DIR}/.service/contact"
sleep 1
# Canonical (symlink-resolved) path of the suite log, which is scanned below.
# Presumably "-m p" makes cat-log print the log path rather than its content.
FILE=$(cylc cat-log "${SUITE_NAME}" -m p |xargs readlink -f)

# condemn the host, the suite will schedule restart in PT60S
# (the delay is given as a negative value; see the inline note in the config)
create_test_globalrc '' "
${BASE_GLOBALRC}
[suite servers]
run hosts = ${CYLC_TEST_HOST_1}, ${CYLC_TEST_HOST_2}
condemned hosts = ${CYLC_TEST_HOST_1}
auto restart delay = -PT60S # results in +PT60S delay
"
# Expect the suite to notice the condemned host and schedule the restart.
log_scan "${TEST_NAME_BASE}-stop" "${FILE}" 40 1 \
'The Cylc suite host will soon become un-available' \
'Suite will restart in 60s'

# condemn the host in "Force Mode", this should cancel the scheduled restart
# (force mode is requested by the "!" suffix on the condemned host name)
create_test_globalrc '' "
${BASE_GLOBALRC}
[suite servers]
condemned hosts = ${CYLC_TEST_HOST_1}!
"
# Expect the pending restart to be canceled and the suite to shut down.
log_scan "${TEST_NAME_BASE}-stop" "${FILE}" 40 1 \
'This suite will be shutdown as the suite host is' \
'When another suite host becomes available the suite can' \
'Scheduled automatic restart canceled' \
'Suite shutting down - REQUEST(NOW)' \
'DONE'

# Clean up: make sure the suite is stopped before purging it.
cylc stop --now --now 2>/dev/null # in case the test fails
# NOTE(review): looks like this waits for the contact file to be removed
# (i.e. for the suite to have stopped) - confirm against poll in test_header.
poll test -f "${SUITE_RUN_DIR}/.service/contact"
purge_suite "${SUITE_NAME}"

exit

0 comments on commit 4d8b623

Please sign in to comment.