From ddddfcdcb49fab6c9f800c02b572e31255dc953d Mon Sep 17 00:00:00 2001 From: Tim Pillinger <26465611+wxtim@users.noreply.github.com> Date: Tue, 10 Sep 2024 16:14:47 +0100 Subject: [PATCH] Ensure that platform from group selection checks broadcast manager (#6330) --- changes.d/6330.fix.md | 1 + cylc/flow/task_job_mgr.py | 25 +++++----- .../07-cylc7-badhost.t | 2 +- tests/integration/test_task_job_mgr.py | 49 +++++++++++++++++++ 4 files changed, 64 insertions(+), 13 deletions(-) create mode 100644 changes.d/6330.fix.md diff --git a/changes.d/6330.fix.md b/changes.d/6330.fix.md new file mode 100644 index 00000000000..190a9637a93 --- /dev/null +++ b/changes.d/6330.fix.md @@ -0,0 +1 @@ +Fix bug where broadcasting failed to change platform selected after host selection failure. \ No newline at end of file diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index 185966ff12d..e8b77f14d85 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -306,18 +306,19 @@ def submit_task_jobs(self, workflow, itasks, curve_auth, # Get another platform, if task config platform is a group use_next_platform_in_group = False - if itask.tdef.rtconfig['platform']: - try: - platform = get_platform( - itask.tdef.rtconfig['platform'], - bad_hosts=self.bad_hosts - ) - except PlatformLookupError: - pass - else: - # If were able to select a new platform; - if platform and platform != itask.platform: - use_next_platform_in_group = True + bc_mgr = self.task_events_mgr.broadcast_mgr + rtconf = bc_mgr.get_updated_rtconfig(itask) + try: + platform = get_platform( + rtconf, + bad_hosts=self.bad_hosts + ) + except PlatformLookupError: + pass + else: + # If were able to select a new platform; + if platform and platform != itask.platform: + use_next_platform_in_group = True if use_next_platform_in_group: # store the previous platform's hosts so that when diff --git a/tests/functional/intelligent-host-selection/07-cylc7-badhost.t b/tests/functional/intelligent-host-selection/07-cylc7-badhost.t index 29c72e6c5a5..a35f81c8d5a 100644 --- a/tests/functional/intelligent-host-selection/07-cylc7-badhost.t +++ b/tests/functional/intelligent-host-selection/07-cylc7-badhost.t @@ -52,7 +52,7 @@ workflow_run_fail "${TEST_NAME_BASE}-run" \ cylc play --no-detach "${WORKFLOW_NAME}" grep_workflow_log_ok "${TEST_NAME_BASE}-grep-1" \ - "platform: badhostplatform - initialisation did not complete (no hosts were reachable)" + "platform: badhostplatform - initialisation did not complete" grep_workflow_log_ok "${TEST_NAME_BASE}-grep-2" "CRITICAL - Workflow stalled" diff --git a/tests/integration/test_task_job_mgr.py b/tests/integration/test_task_job_mgr.py index 9265f2198dc..48a49eb30aa 100644 --- a/tests/integration/test_task_job_mgr.py +++ b/tests/integration/test_task_job_mgr.py @@ -187,3 +187,52 @@ async def test__prep_submit_task_job_impl_handles_execution_time_limit( schd.task_job_mgr._prep_submit_task_job( schd.workflow, task_a) assert not task_a.summary.get('execution_time_limit', '') + + +async def test_broadcast_platform_change( + mock_glbl_cfg, + flow, + scheduler, + start, + log_filter, +): + """Broadcast can change task platform. + + Even after host selection failure. + + see https://github.com/cylc/cylc-flow/issues/6320 + """ + mock_glbl_cfg( + 'cylc.flow.platforms.glbl_cfg', + ''' + [platforms] + [[foo]] + hosts = food + ''') + + id_ = flow({ + "scheduling": {"graph": {"R1": "mytask"}}, + # Platform = None doesn't cause this issue! + "runtime": {"mytask": {"platform": "localhost"}}}) + + schd = scheduler(id_, run_mode='live') + + async with start(schd) as log: + # Change the task platform with broadcast: + schd.broadcast_mgr.put_broadcast( + ['1'], ['mytask'], [{'platform': 'foo'}]) + + # Simulate prior failure to contact hosts: + schd.task_job_mgr.task_remote_mgr.bad_hosts = {'food'} + + # Attempt job submission: + schd.task_job_mgr.submit_task_jobs( + schd.workflow, + schd.pool.get_tasks(), + schd.server.curve_auth, + schd.server.client_pub_key_dir) + + # Check that task platform hasn't become "localhost": + assert schd.pool.get_tasks()[0].platform['name'] == 'foo' + # ... and that remote init failed because all hosts bad: + assert log_filter(log, contains="(no hosts were reachable)")