From 9c3cd49912d7c006572b62b6ae81320aca612e1a Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Wed, 12 Jun 2024 12:17:22 +0100 Subject: [PATCH 1/3] platforms: fix unreachable hosts not reset on platform group failure (#6109) --- changes.d/fix.6109.md | 1 + cylc/flow/exceptions.py | 9 ++++- cylc/flow/platforms.py | 9 +++-- cylc/flow/task_job_mgr.py | 13 ++++++- cylc/flow/task_remote_mgr.py | 5 ++- tests/integration/test_platforms.py | 53 +++++++++++++++++++++++++++++ 6 files changed, 85 insertions(+), 5 deletions(-) create mode 100644 changes.d/fix.6109.md create mode 100644 tests/integration/test_platforms.py diff --git a/changes.d/fix.6109.md b/changes.d/fix.6109.md new file mode 100644 index 00000000000..36f22c3d4fc --- /dev/null +++ b/changes.d/fix.6109.md @@ -0,0 +1 @@ +Fixed bug affecting job submission where the list of bad hosts was not always reset correctly. \ No newline at end of file diff --git a/cylc/flow/exceptions.py b/cylc/flow/exceptions.py index 79a726d7bbe..1f9092c9a29 100644 --- a/cylc/flow/exceptions.py +++ b/cylc/flow/exceptions.py @@ -21,6 +21,7 @@ Callable, Dict, Iterable, + Set, NoReturn, Optional, Tuple, @@ -444,15 +445,21 @@ class NoPlatformsError(PlatformLookupError): Args: identity: The name of the platform group or install target + hosts_consumed: Hosts which have already been tried. set_type: Whether the set of platforms is a platform group or an install target place: Where the attempt to get the platform failed. """ def __init__( - self, identity: str, set_type: str = 'group', place: str = '' + self, + identity: str, + hosts_consumed: Set[str], + set_type: str = 'group', + place: str = '', ): self.identity = identity self.type = set_type + self.hosts_consumed = hosts_consumed if place: self.place = f' during {place}.' else: diff --git a/cylc/flow/platforms.py b/cylc/flow/platforms.py index 72ccebffe52..d06c84ade92 100644 --- a/cylc/flow/platforms.py +++ b/cylc/flow/platforms.py @@ -302,9 +302,14 @@ def get_platform_from_group( else: platform_names = group['platforms'] - # Return False if there are no platforms available to be selected. + # If there are no platforms available to be selected: if not platform_names: - raise NoPlatformsError(group_name) + hosts_consumed = { + host + for platform in group['platforms'] + for host in platform_from_name(platform)['hosts']} + raise NoPlatformsError( + group_name, hosts_consumed) # Get the selection method method = group['selection']['method'] diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index e41e05dfd30..019de580ea6 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -267,11 +267,13 @@ def submit_task_jobs(self, workflow, itasks, curve_auth, # Prepare tasks for job submission prepared_tasks, bad_tasks = self.prep_submit_task_jobs( workflow, itasks) + # Reset consumed host selection results self.task_remote_mgr.subshell_eval_reset() if not prepared_tasks: return bad_tasks + auth_itasks = {} # {platform: [itask, ...], ...} for itask in prepared_tasks: @@ -279,6 +281,7 @@ def submit_task_jobs(self, workflow, itasks, curve_auth, auth_itasks.setdefault(platform_name, []) auth_itasks[platform_name].append(itask) # Submit task jobs for each platform + # Non-prepared tasks can be considered done for now: done_tasks = bad_tasks for _, itasks in sorted(auth_itasks.items()): @@ -1087,7 +1090,7 @@ def _prep_submit_task_job( Returns: * itask - preparation complete. * None - preparation in progress. - * False - perparation failed. + * False - preparation failed. """ if itask.local_job_file_path: @@ -1181,6 +1184,14 @@ def _prep_submit_task_job( itask.summary['platforms_used'][itask.submit_num] = '' # Retry delays, needed for the try_num self._create_job_log_path(workflow, itask) + if isinstance(exc, NoPlatformsError): + # Clear all hosts from all platforms in group from + # bad_hosts: + self.bad_hosts -= exc.hosts_consumed + self._set_retry_timers(itask, rtconfig) + self._prep_submit_task_job_error( + workflow, itask, '(no platforms available)', exc) + return False self._prep_submit_task_job_error( workflow, itask, '(platform not defined)', exc) return False diff --git a/cylc/flow/task_remote_mgr.py b/cylc/flow/task_remote_mgr.py index 2797672617b..5d7c092336b 100644 --- a/cylc/flow/task_remote_mgr.py +++ b/cylc/flow/task_remote_mgr.py @@ -388,7 +388,10 @@ def remote_tidy(self) -> None: else: LOG.error( NoPlatformsError( - install_target, 'install target', 'remote tidy')) + install_target, + set(), + 'install target', + 'remote tidy')) # Wait for commands to complete for a max of 10 seconds timeout = time() + 10.0 while queue and time() < timeout: diff --git a/tests/integration/test_platforms.py b/tests/integration/test_platforms.py new file mode 100644 index 00000000000..f8187227ceb --- /dev/null +++ b/tests/integration/test_platforms.py @@ -0,0 +1,53 @@ +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +"""Integration testing for platforms functionality.""" + + +async def test_prep_submit_task_tries_multiple_platforms( + flow, scheduler, start, mock_glbl_cfg +): + """Preparation tries multiple platforms within a group if the + task platform setting matches a group, and that after all platforms + have been tried that the hosts matching that platform group are + cleared. + + See https://github.com/cylc/cylc-flow/pull/6109 + """ + global_conf = ''' + [platforms] + [[myplatform]] + hosts = broken + [[anotherbad]] + hosts = broken2 + [platform groups] + [[mygroup]] + platforms = myplatform, anotherbad''' + mock_glbl_cfg('cylc.flow.platforms.glbl_cfg', global_conf) + + wid = flow({ + "scheduling": {"graph": {"R1": "foo"}}, + "runtime": {"foo": {"platform": "mygroup"}} + }) + schd = scheduler(wid, run_mode='live') + async with start(schd): + itask = schd.pool.get_tasks()[0] + itask.submit_num = 1 + # simulate failed attempts to contact the job hosts + schd.task_job_mgr.bad_hosts = {'broken', 'broken2'} + res = schd.task_job_mgr._prep_submit_task_job(schd.workflow, itask) + assert res is False + # ensure the bad hosts have been cleared + assert not schd.task_job_mgr.bad_hosts From 48ece7d134e1f2fa3c95e47b5ec93fe8a9c18dcf Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Wed, 12 Jun 2024 16:49:31 +0100 Subject: [PATCH 2/3] Update exceptions.py --- cylc/flow/exceptions.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cylc/flow/exceptions.py b/cylc/flow/exceptions.py index 63ecca9afce..05efc541600 100644 --- a/cylc/flow/exceptions.py +++ b/cylc/flow/exceptions.py @@ -78,9 +78,7 @@ class InputError(CylcError): class CylcConfigError(CylcError): """Generic exception to handle an error in a Cylc configuration file.""" - # TODO: reference the configuration el - - ement causing the problem + # TODO: reference the configuration element causing the problem class WorkflowConfigError(CylcConfigError): From 9ed0cc989847abe837094609ea0f39023bdd9abe Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Wed, 12 Jun 2024 16:52:20 +0100 Subject: [PATCH 3/3] Update exceptions.py --- cylc/flow/exceptions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cylc/flow/exceptions.py b/cylc/flow/exceptions.py index 05efc541600..7235455cace 100644 --- a/cylc/flow/exceptions.py +++ b/cylc/flow/exceptions.py @@ -21,6 +21,7 @@ Dict, Optional, Sequence, + Set, Union, TYPE_CHECKING, )