diff --git a/cylc/flow/scheduler.py b/cylc/flow/scheduler.py
index 27d5d89dab5..d9d9c2bec9f 100644
--- a/cylc/flow/scheduler.py
+++ b/cylc/flow/scheduler.py
@@ -94,7 +94,6 @@ from cylc.flow.task_state import (
     TASK_STATUSES_ACTIVE,
     TASK_STATUSES_NEVER_ACTIVE,
-    TASK_STATUSES_SUCCESS,
     TASK_STATUS_FAILED)
 from cylc.flow.templatevars import load_template_vars
 from cylc.flow import __version__ as CYLC_VERSION
@@ -1524,7 +1523,7 @@ async def update_data_structure(self):
         updated_nodes = set(updated_tasks).union(
             self.pool.get_pool_change_tasks())
         if (
-                has_updated or
+                updated_nodes or
                 self.data_store_mgr.updates_pending or
                 self.job_pool.updates_pending
         ):
@@ -1586,10 +1585,8 @@ def check_suite_stalled(self):
             return
         self.is_stalled = self.pool.is_stalled()
         if self.is_stalled:
-            message = 'suite stalled'
-            LOG.warning(message)
-            self.run_event_handlers(self.EVENT_STALLED, message)
-            self.pool.report_stalled_task_deps()
+            self.run_event_handlers(self.EVENT_STALLED, 'suite stalled')
+            self.pool.report_unmet_deps()
             if self._get_events_conf('abort on stalled'):
                 raise SchedulerError('Abort on suite stalled is set')
         # Start suite timeout timer
@@ -1671,6 +1668,9 @@ async def shutdown(self, reason):
             self.proc_pool.process()
         if self.pool is not None:
+            if not self.is_stalled:
+                # (else already reported)
+                self.pool.report_unmet_deps()
             self.pool.warn_stop_orphans()
             try:
                 self.suite_db_mgr.put_task_event_timers(self.task_events_mgr)
@@ -1746,30 +1746,18 @@ def stop_clock_done(self):
         return False
 
     def check_auto_shutdown(self):
-        """Check if we should do a normal automatic shutdown."""
+        """Check if we should do an automatic shutdown: main pool empty."""
         if not self.can_auto_stop:
             return False
-        can_shutdown = True
-        for itask in self.pool.get_all_tasks():
-            if self.pool.stop_point is None:
-                # Don't if any unsucceeded task exists.
-                if not itask.state(*TASK_STATUSES_SUCCESS):
-                    can_shutdown = False
-                    break
-            elif (
-                    itask.point <= self.pool.stop_point
-                    and not itask.state(*TASK_STATUSES_SUCCESS)
-            ):
-                # Don't if any unsucceeded task exists < stop point...
-                if itask.identity not in self.pool.stuck_future_tasks:
-                    # ...unless it has a future trigger extending > stop point.
-                    can_shutdown = False
-                    break
-        if can_shutdown and self.pool.stop_point:
+        self.pool.release_runahead_tasks()
+        if self.pool.get_tasks():
+            return False
+        # can shut down
+        if self.pool.stop_point:
             self.options.stopcp = None
             self.pool.stop_point = None
             self.suite_db_mgr.delete_suite_stop_cycle_point()
-        return can_shutdown
+        return True
 
     def hold_suite(self, point=None):
         """Hold all tasks in suite."""
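[Reviewer sketch -- not part of the patch. The rewritten check_auto_shutdown()
above reduces to: release whatever the runahead pool will give up, then shut
down iff the n=0 main pool is empty. "Pool" below is a hypothetical stand-in
for TaskPool, not the real cylc.flow API.]

# Hypothetical Pool stand-in -- illustrates the new check_auto_shutdown()
# control flow only; the real TaskPool also applies the runahead limit and
# prerequisite checks on release (see task_pool.py below).
class Pool:
    def __init__(self, runahead, main):
        self.runahead = list(runahead)  # tasks held back from the main pool
        self.main = list(main)          # the n=0 active window

    def release_runahead_tasks(self):
        # Simplified: release everything; the real method filters.
        self.main.extend(self.runahead)
        self.runahead.clear()

    def get_tasks(self):
        return self.main

def check_auto_shutdown(pool, can_auto_stop=True):
    """Shut down automatically iff the main pool drains after a release."""
    if not can_auto_stop:
        return False
    pool.release_runahead_tasks()
    return not pool.get_tasks()

assert check_auto_shutdown(Pool([], []))               # nothing left: stop
assert not check_auto_shutdown(Pool(["t1.2021"], []))  # work remains: run on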
diff --git a/cylc/flow/task_pool.py b/cylc/flow/task_pool.py
index 62a0c1947f0..5656dbca7e8 100644
--- a/cylc/flow/task_pool.py
+++ b/cylc/flow/task_pool.py
@@ -40,7 +40,7 @@ from cylc.flow.task_proxy import TaskProxy
 from cylc.flow.task_state import (
     TASK_STATUSES_ACTIVE,
-    TASK_STATUSES_NOT_STALLED,
+    TASK_STATUSES_FAILURE,
     TASK_STATUS_WAITING,
     TASK_STATUS_EXPIRED,
     TASK_STATUS_QUEUED,
@@ -176,7 +176,6 @@ def __init__(self, config, suite_db_mgr, task_events_mgr, job_pool):
 
         self.is_held = False
         self.hold_point = None
-        self.stuck_future_tasks = []
         self.abs_outputs_done = set()
 
         self.stop_task_id = None
@@ -241,7 +240,11 @@ def add_to_runahead_pool(self, itask, is_new=True):
         return itask
 
     def release_runahead_tasks(self):
-        """Restrict the number of active cycle points.
+        """Release tasks from the runahead pool to the main pool.
+
+        This serves to:
+        - restrict the number of active cycle points
+        - keep partially-satisfied waiting tasks out of the n=0 active pool
 
         Compute runahead limit, and release tasks to the main pool if they
         are below that point (and <= the stop point, if there is a stop point).
@@ -347,6 +350,9 @@ def release_runahead_tasks(self):
         for point, itask_id_map in self.runahead_pool.copy().items():
             if point <= latest_allowed_point:
                 for itask in itask_id_map.copy().values():
+                    if itask.is_task_prereqs_not_done():
+                        # Only release if all prerequisites are satisfied.
+                        continue
                     self.release_runahead_task(itask)
                     released = True
         return released
@@ -728,15 +734,6 @@ def get_ready_tasks(self):
 
         return ready_tasks
 
-    def task_has_future_trigger_overrun(self, itask):
-        """Check for future triggers extending beyond the final cycle."""
-        if not self.stop_point:
-            return False
-        for pct in itask.state.prerequisites_get_target_points():
-            if pct > self.stop_point:
-                return True
-        return False
-
     def get_min_point(self):
         """Return the minimum cycle point currently in the pool."""
         cycles = list(self.pool)
@@ -914,16 +911,31 @@ def can_stop(self, stop_mode):
 
     def warn_stop_orphans(self):
         """Log (warning) orphaned tasks on suite stop."""
+        orphans = []
+        orphans_kill_failed = []
         for itask in self.get_tasks():
-            if (
-                    itask.state(*TASK_STATUSES_ACTIVE)
-                    and itask.state.kill_failed
-            ):
-                LOG.warning("%s: orphaned task (%s, kill failed)" % (
-                    itask.identity, itask.state.status))
-            elif itask.state(*TASK_STATUSES_ACTIVE):
-                LOG.warning("%s: orphaned task (%s)" % (
-                    itask.identity, itask.state.status))
+            if itask.state(*TASK_STATUSES_ACTIVE):
+                if itask.state.kill_failed:
+                    orphans_kill_failed.append(itask)
+                else:
+                    orphans.append(itask)
+        if orphans_kill_failed:
+            LOG.warning(
+                "Orphaned task jobs (kill failed):\n"
+                + "\n".join(
+                    f"* {itask.identity} ({itask.state.status})"
+                    for itask in orphans_kill_failed
+                )
+            )
+        if orphans:
+            LOG.warning(
+                "Orphaned task jobs:\n"
+                + "\n".join(
+                    f"* {itask.identity} ({itask.state.status})"
+                    for itask in orphans
+                )
+            )
+
         for key1, point, name, submit_num in self.task_events_mgr.event_timers:
             LOG.warning("%s/%s/%s: incomplete task event handler %s" % (
                 point, name, submit_num, key1))
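[Reviewer sketch -- not part of the patch. The grouped orphan warning above
produces the multi-line format that the updated shutdown test at the end of
this diff (tests/functional/shutdown/09-now2.t) greps for. "Task" is an
assumed minimal stand-in for the TaskProxy fields used above.]

# Assumed stand-in for the TaskProxy identity/status fields.
from dataclasses import dataclass

@dataclass
class Task:
    identity: str
    status: str

def format_orphans(header, tasks):
    # One warning per group, one "* id (status)" line per orphaned task.
    return header + ":\n" + "\n".join(
        f"* {task.identity} ({task.status})" for task in tasks
    )

print(format_orphans("Orphaned task jobs", [Task("t1.1", "running")]))
# Orphaned task jobs:
# * t1.1 (running)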
@@ -931,60 +943,45 @@ def is_stalled(self):
         """Return True if the suite is stalled.
 
-        A suite is stalled when:
-        * It is not held.
-        * It has no active tasks.
-        * It has waiting tasks with unmet prerequisites
-          (ignoring clock triggers).
+        A suite is stalled if it is not held and the active pool contains only
+        unhandled failed tasks.
         """
         if self.is_held:
             return False
-        can_be_stalled = False
+        unhandled_failed = []
         for itask in self.get_tasks():
-            if (
-                    self.stop_point
-                    and itask.point > self.stop_point
-                    or itask.state(
-                        TASK_STATUS_SUCCEEDED,
-                        TASK_STATUS_EXPIRED,
-                    )
-            ):
-                # Ignore: Task beyond stop point.
-                # Ignore: Succeeded and expired tasks.
-                continue
-            if itask.state(*TASK_STATUSES_NOT_STALLED):
-                # Pool contains active tasks (or held active tasks)
-                # Return "not stalled" immediately.
-                return False
-            if (
-                    itask.state(TASK_STATUS_WAITING)
-                    and itask.state.prerequisites_all_satisfied()
-            ):
-                # Waiting tasks with all prerequisites satisfied,
-                # probably waiting for clock trigger only.
-                # This task can be considered active.
-                # Return "not stalled" immediately.
+            if itask.state(*TASK_STATUSES_FAILURE):
+                unhandled_failed.append(itask)
+            else:
                 return False
-            # We should be left with (submission) failed tasks and
-            # waiting tasks with unsatisfied prerequisites.
-            can_be_stalled = True
-        return can_be_stalled
+        if unhandled_failed:
+            LOG.warning(
+                "Suite stalled with unhandled failed tasks:\n"
+                + "\n".join(
+                    f"* {itask.identity} ({itask.state.status})"
+                    for itask in unhandled_failed
+                )
+            )
+            return True
+        else:
+            return False
 
-    def report_stalled_task_deps(self):
-        """Log unmet dependencies on stalled."""
+    def report_unmet_deps(self):
+        """Log unmet dependencies on stall or shutdown."""
         prereqs_map = {}
-        for itask in self.get_tasks():
-            if (
-                    itask.state(TASK_STATUS_WAITING)
-                    and itask.state.prerequisites_are_not_all_satisfied()
-            ):
-                prereqs_map[itask.identity] = []
-                for prereq_str, is_met in itask.state.prerequisites_dump():
-                    if not is_met:
-                        prereqs_map[itask.identity].append(prereq_str)
+        # Partially satisfied tasks are hidden in the runahead pool.
+        for itask in self.get_rh_tasks():
+            prereqs_map[itask.identity] = []
+            for prereq_str, is_met in itask.state.prerequisites_dump():
+                if not is_met:
+                    prereqs_map[itask.identity].append(prereq_str)
 
         # prune tree to ignore items that are elsewhere in it
         for id_, prereqs in list(prereqs_map.copy().items()):
+            if not prereqs:
+                # (runahead tasks with no unsatisfied prerequisites)
+                del prereqs_map[id_]
+                continue
             for prereq in prereqs:
                 prereq_strs = prereq.split()
                 if prereq_strs[0] == "LABEL:":
@@ -998,10 +995,16 @@ def report_unmet_deps(self):
                     del prereqs_map[id_]
                     break
 
-        for id_, prereqs in prereqs_map.items():
-            LOG.warning("Unmet prerequisites for %s:" % id_)
-            for prereq in prereqs:
-                LOG.warning(" * %s" % prereq)
+        if prereqs_map:
+            LOG.warning(
+                "Some partially satisfied prerequisites left over:\n"
+                + "\n".join(
+                    f"{id_} is waiting on:"
+                    + "\n".join(
+                        f"\n* {prereq}" for prereq in prereqs
+                    ) for id_, prereqs in prereqs_map.items()
+                )
+            )
 
     def set_hold_point(self, point):
         """Set the point after which tasks must be held."""
@@ -1217,13 +1220,16 @@ def spawn_task(self, name, point, flow_label=None, reflow=True,
                 "[%s] -holding (beyond suite hold point) %s",
                 itask, self.hold_point)
             itask.state.reset(is_held=True)
-        elif (self.stop_point and itask.point <= self.stop_point and
-                self.task_has_future_trigger_overrun(itask)):
-            # Record tasks waiting on a future trigger beyond the stop point.
-            # (We ignore these waiting tasks when considering shutdown).
-            LOG.info("[%s] -holding (future trigger beyond stop point)", itask)
-            self.stuck_future_tasks.append(itask.identity)
-        elif (self.is_held
+        if self.stop_point and itask.point <= self.stop_point:
+            future_trigger_overrun = False
+            for pct in itask.state.prerequisites_get_target_points():
+                if pct > self.stop_point:
+                    future_trigger_overrun = True
+                    break
+            if future_trigger_overrun:
+                LOG.warning("[%s] -won't run: depends on a "
+                            "task beyond the stop point", itask)
+        if (self.is_held
                 and itask.state(TASK_STATUS_WAITING, is_held=False)):
             # Hold newly-spawned tasks in a held suite (e.g. due to manual
            # triggering of a held task).
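[Reviewer sketch -- not part of the patch. With spawn-on-demand the two new
task_pool methods above split the old stall report between them: is_stalled()
only has to spot unhandled failures in the n=0 main pool, while partially
satisfied tasks now sit in the runahead pool and are reported separately by
report_unmet_deps(). A minimal model, assuming TASK_STATUSES_FAILURE covers
failed and submit-failed:]

# Assumed value of TASK_STATUSES_FAILURE (failed + submit-failed).
FAILURE_STATUSES = {"failed", "submit-failed"}

def is_stalled(main_pool_statuses, is_held=False):
    """Stalled iff not held, non-empty, and only unhandled failures remain."""
    if is_held or not main_pool_statuses:
        # An empty main pool is the auto-shutdown case, not a stall.
        return False
    return all(status in FAILURE_STATUSES for status in main_pool_statuses)

assert is_stalled(["failed"])
assert not is_stalled(["failed", "running"])  # still active: not stalled
assert not is_stalled([])                     # empty pool: auto shutdown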
diff --git a/cylc/flow/task_state.py b/cylc/flow/task_state.py
index 3cf5ee0ee18..c07585c22ae 100644
--- a/cylc/flow/task_state.py
+++ b/cylc/flow/task_state.py
@@ -147,21 +147,12 @@
     TASK_STATUS_READY,
 ])
 
-# Task statuses that are to be externally active
-TASK_STATUSES_TO_BE_ACTIVE = set([
-    TASK_STATUS_QUEUED,
-    TASK_STATUS_READY,
-])
-
 # Task statuses that are externally active
 TASK_STATUSES_ACTIVE = set([
     TASK_STATUS_SUBMITTED,
     TASK_STATUS_RUNNING,
 ])
 
-# Task statuses in which tasks cannot be considered stalled
-TASK_STATUSES_NOT_STALLED = TASK_STATUSES_ACTIVE | TASK_STATUSES_TO_BE_ACTIVE
-
 # Task statuses that can be manually triggered.
 TASK_STATUSES_TRIGGERABLE = set([
     TASK_STATUS_WAITING,
diff --git a/tests/flakyfunctional/cylc-take-checkpoints/00-basic.t b/tests/flakyfunctional/cylc-take-checkpoints/00-basic.t
deleted file mode 100755
index abf73eec632..00000000000
--- a/tests/flakyfunctional/cylc-take-checkpoints/00-basic.t
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env bash
-# THIS FILE IS PART OF THE CYLC SUITE ENGINE.
-# Copyright (C) NIWA & British Crown (Met Office) & Contributors.
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#-------------------------------------------------------------------------------
-# Test checkpoint basic
-. "$(dirname "$0")/test_header"
"$(dirname "$0")/test_header" - -date-remove() { - sed 's/[0-9]\+\(-[0-9]\{2\}\)\{2\}T[0-9]\{2\}\(:[0-9]\{2\}\)\{2\}Z/DATE/' -} - -set_test_number 4 - -install_suite "${TEST_NAME_BASE}" "${TEST_NAME_BASE}" - -run_ok "${TEST_NAME_BASE}-validate" cylc validate "${SUITE_NAME}" - -# Suite reloads+inserts new task to mess up prerequisites - suite should stall -suite_run_ok "${TEST_NAME_BASE}-run" \ - cylc run --debug --no-detach --reference-test "${SUITE_NAME}" -cylc ls-checkpoints "${SUITE_NAME}" | date-remove >'cylc-ls-checkpoints.out' -contains_ok 'cylc-ls-checkpoints.out' <<'__OUT__' -####################################################################### -# CHECKPOINT ID (ID|TIME|EVENT) -1|DATE|snappy -0|DATE|latest -__OUT__ - -cylc ls-checkpoints "${SUITE_NAME}" 1 | date-remove >'cylc-ls-checkpoints-1.out' -contains_ok 'cylc-ls-checkpoints-1.out' <<'__OUT__' -####################################################################### -# CHECKPOINT ID (ID|TIME|EVENT) -1|DATE|snappy - -# SUITE PARAMS (KEY|VALUE) - -# TASK POOL (CYCLE|NAME|STATUS|IS_HELD) -2017|t1|running|0 -__OUT__ - -purge_suite "${SUITE_NAME}" -exit diff --git a/tests/flakyfunctional/cylc-take-checkpoints/00-basic/flow.cylc b/tests/flakyfunctional/cylc-take-checkpoints/00-basic/flow.cylc deleted file mode 100644 index e57721a0e68..00000000000 --- a/tests/flakyfunctional/cylc-take-checkpoints/00-basic/flow.cylc +++ /dev/null @@ -1,26 +0,0 @@ -#!jinja2 -[cylc] - UTC mode=True - cycle point format = %Y - [[events]] - abort on stalled = True - abort on inactivity = True - inactivity = P1M -[scheduling] - initial cycle point = 2016 - final cycle point = 2020 - [[graph]] - P1Y=t1[-P1Y] => t1 -[runtime] - [[t1]] - script = """ -if [[ "${CYLC_TASK_CYCLE_POINT}" == '2017' ]]; then - wait "${CYLC_TASK_MESSAGE_STARTED_PID}" 2>/dev/null || true - sleep 2 # state of current task should be recorded after 2 seconds - cylc checkpoint "${CYLC_SUITE_NAME}" 'snappy' - cylc__job__poll_grep_suite_log -F "Command succeeded: take_checkpoints(snappy)" - sleep 2 # checkpoint should be recorded after 2 seconds -fi -""" - [[[job]]] - execution time limit = PT50S diff --git a/tests/flakyfunctional/cylc-take-checkpoints/00-basic/reference.log b/tests/flakyfunctional/cylc-take-checkpoints/00-basic/reference.log deleted file mode 100644 index e49908e5492..00000000000 --- a/tests/flakyfunctional/cylc-take-checkpoints/00-basic/reference.log +++ /dev/null @@ -1,7 +0,0 @@ -2016-10-10T14:01:04Z INFO - Initial point: 2016 -2016-10-10T14:01:04Z INFO - Final point: 2020 -2016-10-10T14:01:05Z INFO - [t1.2016] -triggered off [] -2016-10-10T14:01:05Z INFO - [t1.2017] -triggered off ['t1.2016'] -2016-10-10T14:01:05Z INFO - [t1.2018] -triggered off ['t1.2017'] -2016-10-10T14:01:08Z INFO - [t1.2019] -triggered off ['t1.2018'] -2016-10-10T14:01:11Z INFO - [t1.2020] -triggered off ['t1.2019'] diff --git a/tests/flakyfunctional/cylc-take-checkpoints/test_header b/tests/flakyfunctional/cylc-take-checkpoints/test_header deleted file mode 120000 index 0126592858e..00000000000 --- a/tests/flakyfunctional/cylc-take-checkpoints/test_header +++ /dev/null @@ -1 +0,0 @@ -../../functional/lib/bash/test_header \ No newline at end of file diff --git a/tests/flakyfunctional/events/40-stall-despite-clock-trig.t b/tests/flakyfunctional/events/40-stall-despite-clock-trig.t index 67c5777ffcb..daf2ec27b9e 100755 --- a/tests/flakyfunctional/events/40-stall-despite-clock-trig.t +++ b/tests/flakyfunctional/events/40-stall-despite-clock-trig.t @@ -21,14 +21,11 @@ set_test_number 3 install_suite 
"${TEST_NAME_BASE}" "${TEST_NAME_BASE}" run_ok "${TEST_NAME_BASE}-validate" cylc validate "${SUITE_NAME}" -# Saw evidence in a failed test that timeout 60 isn't long enough under load?: -run_fail "${TEST_NAME_BASE}-run" \ - timeout 120 cylc run --debug --no-detach "${SUITE_NAME}" -sed -n 's/^.* WARNING - //p' "${SUITE_RUN_DIR}/log/suite/log" \ - >"${SUITE_RUN_DIR}/log/suite/log.edited" -contains_ok "${SUITE_RUN_DIR}/log/suite/log.edited" <<__OUT__ -suite stalled -__OUT__ + +TEST_NAME="${TEST_NAME_BASE}-run" +suite_run_fail "${TEST_NAME}" cylc run --debug --no-detach "${SUITE_NAME}" + +grep_ok "suite stalled" "${TEST_NAME}.stderr" purge_suite "${SUITE_NAME}" exit diff --git a/tests/flakyfunctional/events/40-stall-despite-clock-trig/flow.cylc b/tests/flakyfunctional/events/40-stall-despite-clock-trig/flow.cylc index 3ece887ca1f..66e3d583c8e 100644 --- a/tests/flakyfunctional/events/40-stall-despite-clock-trig/flow.cylc +++ b/tests/flakyfunctional/events/40-stall-despite-clock-trig/flow.cylc @@ -1,4 +1,8 @@ -# Stall with t3 waiting due to unhandled failure of t2 +# Stall due to unhandled failure of t2 +# TODO: I think this test can be removed. Since SoD it only tests that a suite +# can stall due to unhandled failed tasks, which is tested elsewhere. It was +# probably meant to test that stall was not affected by the clock trigger on +# waiting t1 in the next cycle under SoS. [cylc] UTC mode = True cycle point format = %Y%m%d @@ -11,7 +15,6 @@ [[special tasks]] clock-trigger = t1(P0D) [[graph]] - # Stall with t2 failed, no waiting tasks. P1D=t3[-P1D] => t1 => t2 => t3 [runtime] [[t1]] diff --git a/tests/flakyfunctional/restart/21-task-elapsed.t b/tests/flakyfunctional/restart/21-task-elapsed.t index 0f0acd7257b..77596c7b4f7 100755 --- a/tests/flakyfunctional/restart/21-task-elapsed.t +++ b/tests/flakyfunctional/restart/21-task-elapsed.t @@ -15,7 +15,6 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . #------------------------------------------------------------------------------- -# Test restart from a checkpoint before a reload . "$(dirname "$0")/test_header" set_test_number 8 install_suite "${TEST_NAME_BASE}" "${TEST_NAME_BASE}" diff --git a/tests/functional/events/26-suite-stalled-dump-prereq.t b/tests/functional/events/26-suite-stalled-dump-prereq.t index 21e8672ccc3..429768e741d 100755 --- a/tests/functional/events/26-suite-stalled-dump-prereq.t +++ b/tests/functional/events/26-suite-stalled-dump-prereq.t @@ -17,19 +17,28 @@ #------------------------------------------------------------------------------- # Test suite event handler, dump unmet prereqs on stall . 
"$(dirname "$0")/test_header" -set_test_number 5 +set_test_number 8 install_suite "${TEST_NAME_BASE}" "${TEST_NAME_BASE}" run_ok "${TEST_NAME_BASE}-validate" \ cylc validate "${SUITE_NAME}" + suite_run_fail "${TEST_NAME_BASE}-run" \ cylc run --reference-test --debug --no-detach "${SUITE_NAME}" -grep_ok "Abort on suite stalled is set" \ - "${TEST_NAME_BASE}-run.stderr" -grep_ok "WARNING - Unmet prerequisites for foo.20100101T0600Z:" \ - "${TEST_NAME_BASE}-run.stderr" -grep_ok "WARNING - \\* bar.20100101T0000Z succeeded" \ - "${TEST_NAME_BASE}-run.stderr" + +grep_ok "Abort on suite stalled is set" "${TEST_NAME_BASE}-run.stderr" + +grep_ok "WARNING - Suite stalled with unhandled failed tasks:" \ + "${TEST_NAME_BASE}-run.stderr" +grep_ok "\* bar.20100101T0000Z (failed)" \ + "${TEST_NAME_BASE}-run.stderr" + +grep_ok "WARNING - Some partially satisfied prerequisites left over:" \ + "${TEST_NAME_BASE}-run.stderr" +grep_ok "foo.20100101T0600Z is waiting on:" \ + "${TEST_NAME_BASE}-run.stderr" +grep_ok "\* bar.20100101T0000Z succeeded" \ + "${TEST_NAME_BASE}-run.stderr" purge_suite "${SUITE_NAME}" exit diff --git a/tests/functional/events/26-suite-stalled-dump-prereq/flow.cylc b/tests/functional/events/26-suite-stalled-dump-prereq/flow.cylc index 5e418725c94..3c8230591cf 100644 --- a/tests/functional/events/26-suite-stalled-dump-prereq/flow.cylc +++ b/tests/functional/events/26-suite-stalled-dump-prereq/flow.cylc @@ -7,7 +7,7 @@ [scheduling] initial cycle point = 20100101T0000Z [[graph]] - # will abort on stalled with failed bar, waiting foo, at T00 + # will abort on stalled with unhandled failed bar T00, T06, T12, T18 = foo[-PT6H] & bar[-PT6H] => foo => bar => qux T12 = qux[-PT6H] => baz [runtime] diff --git a/tests/functional/events/27-suite-stalled-dump-prereq-fam.t b/tests/functional/events/27-suite-stalled-dump-prereq-fam.t index 4563036cc62..aff3b20228d 100755 --- a/tests/functional/events/27-suite-stalled-dump-prereq-fam.t +++ b/tests/functional/events/27-suite-stalled-dump-prereq-fam.t @@ -17,22 +17,37 @@ #------------------------------------------------------------------------------- # Test suite event handler, dump unmet prereqs on stall . 
"$(dirname "$0")/test_header" -set_test_number 7 +set_test_number 12 install_suite "${TEST_NAME_BASE}" "${TEST_NAME_BASE}" + run_ok "${TEST_NAME_BASE}-validate" \ cylc validate "${SUITE_NAME}" + suite_run_fail "${TEST_NAME_BASE}-run" \ cylc run --reference-test --debug --no-detach "${SUITE_NAME}" -grep_ok "Abort on suite stalled is set" \ + +grep_ok "Abort on suite stalled is set" "${TEST_NAME_BASE}-run.stderr" + +grep_ok "WARNING - Suite stalled with unhandled failed tasks:" \ + "${TEST_NAME_BASE}-run.stderr" +grep_ok "\* foo.1 (failed)" \ "${TEST_NAME_BASE}-run.stderr" -grep_ok "WARNING - Unmet prerequisites for f_1.1:" \ + +grep_ok "WARNING - Some partially satisfied prerequisites left over:" \ + "${TEST_NAME_BASE}-run.stderr" +grep_ok "f_1.1 is waiting on:" \ + "${TEST_NAME_BASE}-run.stderr" +grep_ok "\* foo.1 succeeded" \ "${TEST_NAME_BASE}-run.stderr" -grep_ok "WARNING - Unmet prerequisites for f_3.1:" \ +grep_ok "f_2.1 is waiting on:" \ "${TEST_NAME_BASE}-run.stderr" -grep_ok "WARNING - Unmet prerequisites for f_2.1" \ +grep_ok "\* foo.1 succeeded" \ "${TEST_NAME_BASE}-run.stderr" -grep_ok "WARNING - \\* foo.1 succeeded" \ +grep_ok "f_3.1 is waiting on:" \ "${TEST_NAME_BASE}-run.stderr" +grep_ok "\* foo.1 succeeded" \ + "${TEST_NAME_BASE}-run.stderr" + purge_suite "${SUITE_NAME}" exit diff --git a/tests/functional/events/27-suite-stalled-dump-prereq-fam/flow.cylc b/tests/functional/events/27-suite-stalled-dump-prereq-fam/flow.cylc index d23faa16e39..0626d58f4ed 100644 --- a/tests/functional/events/27-suite-stalled-dump-prereq-fam/flow.cylc +++ b/tests/functional/events/27-suite-stalled-dump-prereq-fam/flow.cylc @@ -6,8 +6,7 @@ expected task failures = foo.1 [scheduling] [[graph]] - # Goo added to spawn waiting FAM and thereby cause a stall with - # unsatisfied waiting tasks. + # will abort on stalled with unhandled failed foo R1 = """foo & goo => FAM FAM:succeed-any => bar""" [runtime] diff --git a/tests/functional/hold-release/02-hold-on-spawn.t b/tests/functional/hold-release/02-hold-on-spawn.t index 76903ec8a65..cbbc2e7c49f 100755 --- a/tests/functional/hold-release/02-hold-on-spawn.t +++ b/tests/functional/hold-release/02-hold-on-spawn.t @@ -30,7 +30,7 @@ __FLOW_CONFIG__ suite_run_ok "${TEST_NAME_BASE}-run" cylc run --hold "${SUITE_NAME}" cylc release "${SUITE_NAME}" foo.1 -# foo.1 should run and spawn bar.1 as waiting +# foo.1 should run and spawn bar.1 as waiting and held poll_grep_suite_log 'spawned bar\.1' diff --git a/tests/functional/restart/22-hold/flow.cylc b/tests/functional/restart/22-hold/flow.cylc index a89643f7141..d044cb8c9a1 100644 --- a/tests/functional/restart/22-hold/flow.cylc +++ b/tests/functional/restart/22-hold/flow.cylc @@ -18,7 +18,7 @@ [[t1]] script = """ if [[ "${CYLC_TASK_CYCLE_POINT}" == '2016' ]]; then - cylc__job__poll_grep_suite_log -F '[t2.2016] -released to the task pool' + cylc__job__poll_grep_suite_log -F 'spawned t2.2016' cylc hold "${CYLC_SUITE_NAME}" t2.2016 cylc stop "${CYLC_SUITE_NAME}" else diff --git a/tests/functional/runahead/03-check-default-future.t b/tests/functional/runahead/03-check-default-future.t index 3928761ab1a..4081ff5428a 100644 --- a/tests/functional/runahead/03-check-default-future.t +++ b/tests/functional/runahead/03-check-default-future.t @@ -37,6 +37,6 @@ run_ok "${TEST_NAME}" sqlite3 "${DB}" \ cmp_ok "${TEST_NAME}.stdout" <<< "20100101T0400Z" # i.e. 
 #-------------------------------------------------------------------------------
-grep_ok 'Suite shutting down - Abort on suite stalled is set' "${SUITE_RUN_DIR}/log/suite/log"
+grep_ok 'Suite shutting down - Abort on suite inactivity is set' "${SUITE_RUN_DIR}/log/suite/log"
 #-------------------------------------------------------------------------------
 purge_suite "${SUITE_NAME}"
diff --git a/tests/functional/runahead/05-check-default-future-2.t b/tests/functional/runahead/05-check-default-future-2.t
index 66740d05cc1..33340a1a706 100644
--- a/tests/functional/runahead/05-check-default-future-2.t
+++ b/tests/functional/runahead/05-check-default-future-2.t
@@ -36,10 +36,10 @@ run_ok "${TEST_NAME}" sqlite3 "${DB}" \
     "select max(cycle) from task_states where name=='foo' and status=='failed'"
 cmp_ok "${TEST_NAME}.stdout" <<< "20100101T1000Z"
 # i.e. should have spawned 5 cycle points from initial T00, and then raised
-# this by PT6H due to fact that wibble spawned
+# this by PT6H due to the fact that wibble spawned.
 #-------------------------------------------------------------------------------
-TEST_NAME=${TEST_NAME_BASE}-check-stalled
+TEST_NAME=${TEST_NAME_BASE}-check-aborted
 LOG="$RUN_DIR/${SUITE_NAME}/log/suite/log"
-grep_ok 'Suite shutting down - Abort on suite stalled is set' "${LOG}"
+grep_ok 'Suite shutting down - Abort on suite inactivity is set' "${LOG}"
 #-------------------------------------------------------------------------------
 purge_suite "${SUITE_NAME}"
diff --git a/tests/functional/runahead/06-release-update.t b/tests/functional/runahead/06-release-update.t
index 503c8ab8c02..8687358fa98 100644
--- a/tests/functional/runahead/06-release-update.t
+++ b/tests/functional/runahead/06-release-update.t
@@ -15,7 +15,7 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #-------------------------------------------------------------------------------
-# Test that the state summary is updated when runahead tasks are released.
+# Test that the datastore is updated when runahead tasks are released.
 # GitHub #1981
 . "$(dirname "$0")/test_header"
 set_test_number 3
@@ -27,11 +27,13 @@ CYLC_RUN_PID="$!"
 poll_suite_running
 YYYY="$(date +%Y)"
 NEXT1=$(( YYYY + 1 ))
-poll_grep_suite_log -F "[bar.${NEXT1}] -released to the task pool"
+poll_grep_suite_log -F "spawned bar.${NEXT1}"
+# Sleep a little to allow the datastore to update (`cylc dump` reads the
+# datastore). TODO: can we avoid this flaky sleep somehow?
+sleep 10
 cylc dump -t "${SUITE_NAME}" | awk '{print $1 $2 $3}' >'log'
 cmp_ok 'log' - <<__END__
-bar,$NEXT1,waiting,
 foo,$NEXT1,waiting,
 __END__
diff --git a/tests/functional/runahead/default-future/flow.cylc b/tests/functional/runahead/default-future/flow.cylc
index 9c4650f4fca..abd5ef57691 100644
--- a/tests/functional/runahead/default-future/flow.cylc
+++ b/tests/functional/runahead/default-future/flow.cylc
@@ -2,31 +2,31 @@
 [cylc]
     UTC mode = True
     [[events]]
-        abort on stalled = True
-        timeout = PT30S
-        abort on timeout = True
+        inactivity = PT10S
+        abort on inactivity = True
 [scheduling]
     initial cycle point = 20100101T00
     final cycle point = 20100105T00
+    [[xtriggers]]
+        never = wall_clock(P100Y)
     [[graph]]
-        # oops makes bar spawn as waiting, to hold back runahead
-        PT1H = "foo & oops => bar"
+        R1 = spawner
+        PT1H = """
+            @never => bar
+            foo
+        """
         # If wibble gets into the pool, it will demand a +PT6H raise
         # of the 'runahead limit'.
         {{ FUTURE_TRIGGER_START_POINT }}/PT6H = """
-            baz[+PT6H] => wibble
-            baz
+            foo[+PT6H] => wibble
         """
 [runtime]
-    [[foo]]
-        script = """
-if [[ "$CYLC_TASK_CYCLE_POINT" == "20100101T0000Z" ]]; then
-    # SoD: spawn wibble (it's not sitting there waiting like in SoS)
-    cylc set-outputs $CYLC_SUITE_NAME baz.20100101T0800Z
-fi
-false"""
-    [[bar,baz,wibble]]
+    [[root]]
         script = true
-
-[visualization]
-    number of cycle points = 10
+    [[spawner]]
+        script = """
+            # spawn wibble
+            cylc set-outputs $CYLC_SUITE_NAME foo.20100101T0800Z
+        """
+    [[foo]]
+        script = false
diff --git a/tests/functional/runahead/no_final/flow.cylc b/tests/functional/runahead/no_final/flow.cylc
index ca4e3466f8d..d412cd129d2 100644
--- a/tests/functional/runahead/no_final/flow.cylc
+++ b/tests/functional/runahead/no_final/flow.cylc
@@ -2,15 +2,18 @@
 [cylc]
     cycle point time zone = Z
     [[events]]
-        abort on stalled = True
+        abort on inactivity = True
+        inactivity = PT10S
 [scheduling]
     runahead limit = P4
     initial cycle point = 20100101T00
+    [[xtriggers]]
+        never = wall_clock(P100Y)
     [[graph]]
-        # oops makes bar spawn as waiting, to hold back the runahead
-        PT6H = "foo & oops => bar"
+        PT6H = """
+            foo
+            @never => bar
+        """
 [runtime]
     [[foo]]
         script = false
-    [[bar]]
-        script = true
diff --git a/tests/functional/shutdown/09-now2.t b/tests/functional/shutdown/09-now2.t
index 4534f2a3cd4..6a51ac09c1c 100755
--- a/tests/functional/shutdown/09-now2.t
+++ b/tests/functional/shutdown/09-now2.t
@@ -18,14 +18,15 @@
 # Test "cylc stop --now --now".
 . "$(dirname "$0")/test_header"
-set_test_number 8
+set_test_number 9
 install_suite "${TEST_NAME_BASE}" "${TEST_NAME_BASE}"
 run_ok "${TEST_NAME_BASE}-validate" cylc validate "${SUITE_NAME}"
 suite_run_ok "${TEST_NAME_BASE}-run" cylc run --no-detach "${SUITE_NAME}"
 LOGD="$RUN_DIR/${SUITE_NAME}/log"
 grep_ok 'INFO - Suite shutting down - REQUEST(NOW-NOW)' "${LOGD}/suite/log"
-grep_ok 'WARNING - t1.1: orphaned task (running)' "${LOGD}/suite/log"
+grep_ok 'WARNING - Orphaned task jobs' "${LOGD}/suite/log"
+grep_ok '\* t1.1 (running)' "${LOGD}/suite/log"
 JLOGD="${LOGD}/job/1/t1/01"
 # Check that t1.1 event handler runs
 run_fail "${TEST_NAME_BASE}-activity-log-succeeded" \