Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

3695.abort if any task fails #3785

Merged
merged 6 commits into from
Aug 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions cylc/flow/cfgspec/suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,6 @@
Conf('abort if timeout handler fails', VDR.V_BOOLEAN)
Conf('abort if inactivity handler fails', VDR.V_BOOLEAN)
Conf('abort if stalled handler fails', VDR.V_BOOLEAN)
Conf('abort if any task fails', VDR.V_BOOLEAN)
Conf('abort on stalled', VDR.V_BOOLEAN)
Conf('abort on timeout', VDR.V_BOOLEAN)
Conf('abort on inactivity', VDR.V_BOOLEAN)
Expand Down Expand Up @@ -1303,11 +1302,9 @@ def upg(cfg, descr):
u.obsolete(
'8.0.0',
['cylc', 'health check interval'])
u.deprecate(
'8.0.0',
['cylc', 'abort if any task fails'],
['cylc', 'events', 'abort if any task fails'])
wxtim marked this conversation as resolved.
Show resolved Hide resolved
u.obsolete('8.0.0', ['runtime', '__MANY__', 'job', 'shell'])
u.obsolete('8.0.0', ['cylc', 'abort if any task fails'])
u.obsolete('8.0.0', ['cylc', 'events', 'abort if any task fails'])
# TODO uncomment these deprecations when ready - see todo in
# [runtime][__MANY__] section.
# for job_setting in [
Expand Down
2 changes: 1 addition & 1 deletion cylc/flow/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2305,7 +2305,7 @@ def get_expected_failed_tasks(self):
"""
if self.options.reftest:
return self.cfg['cylc']['reference test']['expected task failures']
elif self.cfg['cylc']['events']['abort if any task fails']:
elif self.options.abort_if_any_task_fails:
return []
else:
return None
1 change: 0 additions & 1 deletion cylc/flow/etc/syntax/cylc.lang
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,6 @@
<keyword>abort if stalled handler fails</keyword>
<keyword>abort if shutdown handler fails</keyword>
<keyword>abort if inactivity handler fails</keyword>
<keyword>abort if any task fails</keyword>
<keyword>UTC mode</keyword>
<keyword>URL</keyword>
</context>
Expand Down
1 change: 0 additions & 1 deletion cylc/flow/etc/syntax/cylc.xml
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,6 @@
<RegExpr attribute='Keyword' String=' abort if stalled handler fails '/>
<RegExpr attribute='Keyword' String=' abort if shutdown handler fails '/>
<RegExpr attribute='Keyword' String=' abort if inactivity handler fails '/>
<RegExpr attribute='Keyword' String=' abort if any task fails '/>
<RegExpr attribute='Keyword' String=' UTC mode '/>
<RegExpr attribute='Keyword' String=' URL '/>
<!-- Non-keyword syntax -->
Expand Down
10 changes: 9 additions & 1 deletion cylc/flow/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1545,7 +1545,15 @@ async def suite_shutdown(self):

def suite_auto_restart(self, max_retries=3):
"""Attempt to restart the suite assuming it has already stopped."""
cmd = ['cylc', 'restart', quote(self.suite)]
if self.options.abort_if_any_task_fails:
cmd = [
'cylc',
'restart',
'--abort-if-any-task-fails',
quote(self.suite)
]
else:
cmd = ['cylc', 'restart', quote(self.suite)]

for attempt_no in range(max_retries):
new_host = select_suite_host(cached=False)[0]
Expand Down
6 changes: 6 additions & 0 deletions cylc/flow/scheduler_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,12 @@ def get_option_parser(is_restart, add_std_opts=False):
metavar="PLUGIN_NAME", action="append", dest="main_loop"
)

parser.add_option(
"--abort-if-any-task-fails",
help="If set workflow will abort with status 1 if any task fails.",
wxtim marked this conversation as resolved.
Show resolved Hide resolved
action="store_true", default=False, dest="abort_if_any_task_fails"
)

parser.set_defaults(stop_point_string=None)
if add_std_opts:
# This is for the API wrapper for integration tests. Otherwise (CLI
Expand Down
2 changes: 1 addition & 1 deletion tests/flakyfunctional/execution-time-limit/00-background.t
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ batch system = ${CYLC_TEST_BATCH_SYS}
run_ok "${TEST_NAME_BASE}-validate" \
cylc validate "${SUITE_NAME}"
suite_run_fail "${TEST_NAME_BASE}-run" \
cylc run --reference-test --debug --no-detach "${SUITE_NAME}"
cylc run --reference-test --debug --no-detach --abort-if-any-task-fails "${SUITE_NAME}"

LOGD="${RUN_DIR}/${SUITE_NAME}/log/job/1/foo"
grep_ok '# Execution time limit: 5.0' "${LOGD}/01/job"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
[cylc]
[[events]]
abort if any task fails = True
abort on inactivity = True
abort on stalled = True
inactivity = PT2M
Expand Down
1 change: 1 addition & 0 deletions tests/flakyfunctional/execution-time-limit/04-poll.t
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,6 @@
# Test execution time limit polling.
. "$(dirname "$0")/test_header"
set_test_number 2
export ABORT_ON_TASK_FAIL=true
reftest
exit
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!jinja2
[cylc]
[[events]]
abort if any task fails = True
abort on inactivity = True
inactivity = PT2M

Expand Down
3 changes: 1 addition & 2 deletions tests/functional/authentication/02-suite2-stop-suite1.t
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ mkdir -p "${SUITE2_RUND}"
cat >"${SUITE2_RUND}/flow.cylc" <<__FLOW_CONFIG__
[cylc]
[[events]]
abort if any task fails=True
[scheduling]
[[graph]]
R1=t1
Expand All @@ -43,7 +42,7 @@ __FLOW_CONFIG__
cylc register "${NAME2}" "${SUITE2_RUND}"
cylc run --no-detach "${NAME1}" 1>'1.out' 2>&1 &
SUITE_RUN_DIR="${SUITE1_RUND}" poll_suite_running
run_ok "${TEST_NAME_BASE}" cylc run --no-detach "${NAME2}"
run_ok "${TEST_NAME_BASE}" cylc run --no-detach --abort-if-any-task-fails "${NAME2}"
cylc shutdown "${NAME1}" --max-polls=20 --interval=1 1>'/dev/null' 2>&1 || true
purge_suite "${NAME1}"
purge_suite "${NAME2}"
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/broadcast/05-bad-point.t
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ set_test_number 2
install_suite "${TEST_NAME_BASE}" "${TEST_NAME_BASE}"

run_ok "${TEST_NAME_BASE}-validate" cylc validate "${SUITE_NAME}"
suite_run_ok "${TEST_NAME_BASE}" cylc run --debug --no-detach "${SUITE_NAME}"
suite_run_ok "${TEST_NAME_BASE}" cylc run --debug --no-detach --abort-if-any-task-fails "${SUITE_NAME}"

purge_suite "${SUITE_NAME}"
exit
1 change: 0 additions & 1 deletion tests/functional/broadcast/05-bad-point/flow.cylc
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
# And see github #1415 - it did cause the suite server program to abort.
[cylc]
[[events]]
abort if any task fails = True
abort on timeout = True
timeout=PT1M
[scheduling]
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/broadcast/06-bad-namespace.t
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ set_test_number 2
install_suite "${TEST_NAME_BASE}" "${TEST_NAME_BASE}"

run_ok "${TEST_NAME_BASE}-validate" cylc validate "${SUITE_NAME}"
suite_run_ok "${TEST_NAME_BASE}" cylc run --debug --no-detach "${SUITE_NAME}"
suite_run_ok "${TEST_NAME_BASE}" cylc run --debug --no-detach --abort-if-any-task-fails "${SUITE_NAME}"

purge_suite "${SUITE_NAME}"
exit
1 change: 0 additions & 1 deletion tests/functional/broadcast/06-bad-namespace/flow.cylc
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
description=Test broadcast to an undefined namespace fails.
[cylc]
[[events]]
abort if any task fails = True
abort on timeout = True
timeout=PT1M
[scheduling]
Expand Down
1 change: 1 addition & 0 deletions tests/functional/broadcast/08-space.t
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,6 @@
# Test broadcast -s '[foo] bar=baz' syntax. cylc/cylc-flow#1680
. "$(dirname "$0")/test_header"
set_test_number 2
export ABORT_ON_TASK_FAIL=true
reftest
exit
1 change: 0 additions & 1 deletion tests/functional/broadcast/08-space/flow.cylc
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
[cylc]
UTC mode = True
[[events]]
abort if any task fails = True
abort on timeout = True
timeout=PT1M
[scheduling]
Expand Down
3 changes: 2 additions & 1 deletion tests/functional/clock-expire/00-basic.t
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ install_suite "${TEST_NAME_BASE}" "${TEST_NAME_BASE}"

run_ok "${TEST_NAME_BASE}-validate" cylc validate "${SUITE_NAME}"

suite_run_ok "${TEST_NAME_BASE}-run" cylc run --debug --no-detach "${SUITE_NAME}"
suite_run_ok "${TEST_NAME_BASE}-run" \
cylc run --debug --no-detach --abort-if-any-task-fails "${SUITE_NAME}"

purge_suite "${SUITE_NAME}"
exit
1 change: 0 additions & 1 deletion tests/functional/clock-expire/00-basic/flow.cylc
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ Skip a daily post-processing workflow if the 'copy' task has expired."""
[cylc]
cycle point format = %Y-%m-%dT%H
[[events]]
abort if any task fails = True
abort on timeout = True
timeout = PT1M
[scheduling]
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/deprecations/01-cylc8-basic.t
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ cmp_ok val.out <<__END__
* (8.0.0) [cylc][reference test][simulation mode suite timeout] - DELETED (OBSOLETE)
* (8.0.0) [cylc][reference test][required run mode] - DELETED (OBSOLETE)
* (8.0.0) [cylc][reference test][suite shutdown event handler] - DELETED (OBSOLETE)
* (8.0.0) [cylc][abort if any task fails] -> [cylc][events][abort if any task fails] - value unchanged
* (8.0.0) [runtime][foo, cat, dog][job][shell] - DELETED (OBSOLETE)
* (8.0.0) [cylc][abort if any task fails] - DELETED (OBSOLETE)
__END__

purge_suite "${SUITE_NAME}"
4 changes: 2 additions & 2 deletions tests/functional/hold-release/05-release.t
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ init_suite "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__'
[[events]]
timeout = PT1M
abort on timeout = True
abort if any task fails = True
[scheduling]
[[graph]]
R1 = "spawner & holdrelease => STUFF & TOAST & CATS & DOGS & stop"
Expand Down Expand Up @@ -70,7 +69,8 @@ __FLOW_CONFIG__

run_ok "${TEST_NAME_BASE}-val" cylc validate "${SUITE_NAME}"

suite_run_ok "${TEST_NAME_BASE}-run" cylc run --debug --no-detach "${SUITE_NAME}"
suite_run_ok "${TEST_NAME_BASE}-run" \
cylc run --debug --no-detach --abort-if-any-task-fails "${SUITE_NAME}"

# Should shut down with all non-released tasks in the held state, and dog1.1
# finished and gone from the task pool.
Expand Down
4 changes: 2 additions & 2 deletions tests/functional/hold-release/08-hold.t
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ init_suite "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__'
[[events]]
timeout = PT1M
abort on timeout = True
abort if any task fails = True
[scheduling]
[[graph]]
R1 = "spawner & holdrelease => STUFF & TOAST & CATS & DOGS & stop"
Expand Down Expand Up @@ -70,7 +69,8 @@ __FLOW_CONFIG__

run_ok "${TEST_NAME_BASE}-val" cylc validate "${SUITE_NAME}"

suite_run_ok "${TEST_NAME_BASE}-run" cylc run --debug --no-detach "${SUITE_NAME}"
suite_run_ok "${TEST_NAME_BASE}-run" \
cylc run --debug --no-detach --abort-if-any-task-fails "${SUITE_NAME}"

# Should shut down with all the held tasks in the held state, and dog.2
# finished and gone from the task pool.
Expand Down
1 change: 1 addition & 0 deletions tests/functional/hold-release/18-hold-cycle-globs.t
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,6 @@
# Test hold cycle point glob
. "$(dirname "$0")/test_header"
set_test_number 2
export ABORT_ON_TASK_FAIL=true
reftest
exit
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
[cylc]
UTC mode = True
[[events]]
abort if any task fails = True
timeout = PT1M
abort on timeout = True
[scheduling]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,6 @@
# Test that releasing a waiting task does not reset its prerequisites
. "$(dirname "$0")/test_header"
set_test_number 2
export ABORT_ON_TASK_FAIL=true
reftest
exit
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
[cylc]
[[events]]
abort on stalled = True
abort if any task fails = True
abort on timeout = True
timeout = PT1M
[scheduling]
Expand Down
9 changes: 7 additions & 2 deletions tests/functional/lib/bash/test_header
Original file line number Diff line number Diff line change
Expand Up @@ -699,8 +699,13 @@ reftest() {
local TEST_NAME="${1:-${TEST_NAME_BASE}}"
install_suite "$@"
run_ok "${TEST_NAME}-validate" cylc validate "${SUITE_NAME}"
suite_run_ok "${TEST_NAME}-run" \
cylc run --reference-test --debug --no-detach "${SUITE_NAME}"
if [[ "${ABORT_ON_TASK_FAIL:-}" == true ]]; then
suite_run_ok "${TEST_NAME}-run" \
cylc run --reference-test --debug --no-detach --abort-if-any-task-fails "${SUITE_NAME}"
else
suite_run_ok "${TEST_NAME}-run" \
cylc run --reference-test --debug --no-detach "${SUITE_NAME}"
fi
purge_suite "${SUITE_NAME}"
}

Expand Down
2 changes: 1 addition & 1 deletion tests/functional/message-triggers/02-action.t
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ run_ok "${TEST_NAME}" cylc validate "${SUITE_NAME}"

# The suite tests that two tasks suicide immediately on message triggers.
TEST_NAME="${TEST_NAME_BASE}-run"
suite_run_ok "${TEST_NAME}" cylc run --no-detach "${SUITE_NAME}"
suite_run_ok "${TEST_NAME}" cylc run --no-detach --abort-if-any-task-fails "${SUITE_NAME}"

# Check that final task pool indicates bar and baz ran
# TODO: some final null task pool tests would be better on task_states table!
Expand Down
1 change: 0 additions & 1 deletion tests/functional/message-triggers/02-action/flow.cylc
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
[cylc]
[[events]]
abort if any task fails = True
abort on inactivity = True
inactivity = PT30S
[scheduling]
Expand Down
4 changes: 2 additions & 2 deletions tests/functional/restart/01-broadcast.t
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ run_ok "${TEST_NAME}" cylc validate "${SUITE_NAME}"
cmp_ok "${TEST_NAME}.stderr" <'/dev/null'
#-------------------------------------------------------------------------------
TEST_NAME="${TEST_NAME_BASE}-run"
suite_run_ok "${TEST_NAME}" cylc run --no-detach "${SUITE_NAME}"
suite_run_ok "${TEST_NAME}" cylc run --no-detach --abort-if-any-task-fails "${SUITE_NAME}"
#-------------------------------------------------------------------------------
TEST_NAME="${TEST_NAME_BASE}-restart-run"
suite_run_ok "${TEST_NAME}" cylc restart --no-detach "${SUITE_NAME}"
suite_run_ok "${TEST_NAME}" cylc restart --no-detach --abort-if-any-task-fails "${SUITE_NAME}"
#-------------------------------------------------------------------------------
grep_ok "send_a_broadcast_task|20130923T0000Z|1|1|succeeded" \
"${TEST_DIR}/pre-restart-db"
Expand Down
4 changes: 2 additions & 2 deletions tests/functional/restart/13-bad-job-host.t
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@ set_test_number 4
install_suite "${TEST_NAME_BASE}" bad-job-host
#-------------------------------------------------------------------------------
run_ok "${TEST_NAME_BASE}-validate" cylc validate "${SUITE_NAME}"
suite_run_ok "${TEST_NAME_BASE}-run" cylc run --debug --no-detach "${SUITE_NAME}"
suite_run_ok "${TEST_NAME_BASE}-run" cylc run --debug --no-detach --abort-if-any-task-fails "${SUITE_NAME}"
# Modify DB with garbage host
CYLC_SUITE_RUN_DIR="$RUN_DIR/${SUITE_NAME}"
for DB_NAME in 'log/db' '.service/db'; do
sqlite3 "${CYLC_SUITE_RUN_DIR}/${DB_NAME}" \
'UPDATE task_jobs SET platform_name="garbage" WHERE name=="t-remote";'
done
suite_run_fail "${TEST_NAME_BASE}-restart" cylc restart --debug --no-detach "${SUITE_NAME}"
suite_run_fail "${TEST_NAME_BASE}-restart" cylc restart --debug --no-detach --abort-if-any-task-fails "${SUITE_NAME}"
grep_ok PlatformLookupError "${CYLC_SUITE_RUN_DIR}/log/suite/log"
#-------------------------------------------------------------------------------
purge_suite_platform "${CYLC_TEST_PLATFORM}" "${SUITE_NAME}"
Expand Down
3 changes: 2 additions & 1 deletion tests/functional/restart/25-hold-suite.t
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ set_test_number 7
install_suite "${TEST_NAME_BASE}" "${TEST_NAME_BASE}"

run_ok "${TEST_NAME_BASE}-validate" cylc validate "${SUITE_NAME}"
suite_run_ok "${TEST_NAME_BASE}-run" cylc run "${SUITE_NAME}" --debug --no-detach
suite_run_ok "${TEST_NAME_BASE}-run" \
cylc run "${SUITE_NAME}" --debug --no-detach --abort-if-any-task-fails

sqlite3 "${SUITE_RUN_DIR}/log/db" \
'SELECT value FROM suite_params WHERE key=="is_held"' >'suite-is-held.out'
Expand Down
1 change: 0 additions & 1 deletion tests/functional/restart/25-hold-suite/flow.cylc
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
UTC mode=True
cycle point format = %Y
[[events]]
abort if any task fails = True
abort on inactivity = True
inactivity = P2M
[scheduling]
Expand Down
5 changes: 2 additions & 3 deletions tests/functional/restart/33-simulation.t
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ init_suite "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__'
[cylc]
cycle point format = %Y
[[events]]
abort if any task fails = True
abort on stalled = True
[scheduling]
initial cycle point = 2018
Expand All @@ -36,13 +35,13 @@ __FLOW_CONFIG__

run_ok "${TEST_NAME_BASE}-validate" cylc validate "${SUITE_NAME}"
suite_run_ok "${TEST_NAME_BASE}-run" \
cylc run --no-detach --stop-point=2019 --mode=simulation "${SUITE_NAME}"
cylc run --no-detach --stop-point=2019 --mode=simulation --abort-if-any-task-fails "${SUITE_NAME}"
# Force a waiting task into a running task
sqlite3 "${HOME}/cylc-run/${SUITE_NAME}/.service/db" \
'UPDATE task_states SET status="running" WHERE name=="t1" AND cycle=="2019"'
sqlite3 "${HOME}/cylc-run/${SUITE_NAME}/.service/db" \
'UPDATE task_pool SET status="running" WHERE name=="t1" AND cycle=="2019"'
suite_run_ok "${TEST_NAME_BASE}-restart" \
cylc restart --debug --no-detach --until=2020 --mode=simulation "${SUITE_NAME}"
cylc restart --debug --no-detach --until=2020 --mode=simulation --abort-if-any-task-fails "${SUITE_NAME}"
purge_suite "${SUITE_NAME}"
exit
6 changes: 1 addition & 5 deletions tests/functional/restart/42-auto-restart-ping-pong.t
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,6 @@ BASE_GLOBAL_CONFIG='
'

TEST_DIR="$HOME/cylc-run/" init_suite "${TEST_NAME_BASE}" <<< '
[cylc]
[[events]]
abort if any task fails = True
wxtim marked this conversation as resolved.
Show resolved Hide resolved
[scheduling]
initial cycle point = 2000
final cycle point = 9999 # test cylc/cylc-flow/issues/2799
Expand Down Expand Up @@ -68,7 +65,6 @@ kill_suite() {
}

log_scan2() {
# abort if any test fails = True
NO_TESTS="$(( NO_TESTS - $# + 4 ))"
if ! log_scan "$@"; then
skip $NO_TESTS # skip remaining tests
Expand All @@ -83,7 +79,7 @@ set_test_number "${NO_TESTS}"

# run the suite
stuck_in_the_middle
cylc run "${SUITE_NAME}" --host="${JOKERS}"
cylc run "${SUITE_NAME}" --host="${JOKERS}" --abort-if-any-task-fails
poll_suite_running
sleep 1

Expand Down
Loading