Skip to content

Commit

Permalink
[Jobs] Refactor: Extract task failure state update helper (#4185)
Browse files Browse the repository at this point in the history
refactor: a unified exception handling utility
  • Loading branch information
andylizf authored Oct 26, 2024
1 parent 647fcea commit c0c1748
Showing 1 changed file with 28 additions and 33 deletions.
61 changes: 28 additions & 33 deletions sky/jobs/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,48 +340,28 @@ def run(self):
common_utils.format_exception(reason, use_bracket=True)
for reason in e.reasons))
logger.error(failure_reason)
managed_job_state.set_failed(
self._job_id,
task_id=task_id,
failure_type=managed_job_state.ManagedJobStatus.
FAILED_PRECHECKS,
failure_reason=failure_reason,
callback_func=managed_job_utils.event_callback_func(
job_id=self._job_id,
task_id=task_id,
task=self._dag.tasks[task_id]))
self._update_failed_task_state(
task_id, managed_job_state.ManagedJobStatus.FAILED_PRECHECKS,
failure_reason)
except exceptions.ManagedJobReachedMaxRetriesError as e:
# Please refer to the docstring of self._run for the cases when
# this exception can occur.
logger.error(common_utils.format_exception(e))
failure_reason = common_utils.format_exception(e)
logger.error(failure_reason)
# The managed job should be marked as FAILED_NO_RESOURCE, as the
# managed job may be able to launch next time.
managed_job_state.set_failed(
self._job_id,
task_id=task_id,
failure_type=managed_job_state.ManagedJobStatus.
FAILED_NO_RESOURCE,
failure_reason=common_utils.format_exception(e),
callback_func=managed_job_utils.event_callback_func(
job_id=self._job_id,
task_id=task_id,
task=self._dag.tasks[task_id]))
self._update_failed_task_state(
task_id, managed_job_state.ManagedJobStatus.FAILED_NO_RESOURCE,
failure_reason)
except (Exception, SystemExit) as e: # pylint: disable=broad-except
with ux_utils.enable_traceback():
logger.error(traceback.format_exc())
msg = ('Unexpected error occurred: '
f'{common_utils.format_exception(e, use_bracket=True)}')
msg = ('Unexpected error occurred: ' +
common_utils.format_exception(e, use_bracket=True))
logger.error(msg)
managed_job_state.set_failed(
self._job_id,
task_id=task_id,
failure_type=managed_job_state.ManagedJobStatus.
FAILED_CONTROLLER,
failure_reason=msg,
callback_func=managed_job_utils.event_callback_func(
job_id=self._job_id,
task_id=task_id,
task=self._dag.tasks[task_id]))
self._update_failed_task_state(
task_id, managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
msg)
finally:
# This will set all unfinished tasks to CANCELLING, and will not
# affect the jobs in terminal states.
Expand All @@ -396,6 +376,21 @@ def run(self):
managed_job_state.set_cancelled(job_id=self._job_id,
callback_func=callback_func)

def _update_failed_task_state(
self, task_id: int,
failure_type: managed_job_state.ManagedJobStatus,
failure_reason: str):
"""Update the state of the failed task."""
managed_job_state.set_failed(
self._job_id,
task_id=task_id,
failure_type=failure_type,
failure_reason=failure_reason,
callback_func=managed_job_utils.event_callback_func(
job_id=self._job_id,
task_id=task_id,
task=self._dag.tasks[task_id]))


def _run_controller(job_id: int, dag_yaml: str, retry_until_up: bool):
"""Runs the controller in a remote process for interruption."""
Expand Down

0 comments on commit c0c1748

Please sign in to comment.