diff --git a/python/ray/serve/_private/application_state.py b/python/ray/serve/_private/application_state.py index 23958d1b87bc..872020423964 100644 --- a/python/ray/serve/_private/application_state.py +++ b/python/ray/serve/_private/application_state.py @@ -41,7 +41,7 @@ def __init__( self._name = name self._deploy_obj_ref = deploy_obj_ref - self._app_msg = "" + self._status_msg = "" self._deployment_state_manager = deployment_state_manager self._deployment_params: List[Dict] = [] # This set tracks old deployments that are being deleted @@ -92,7 +92,7 @@ def deployments(self) -> List[str]: def delete(self): """Delete the application""" - self._status = ApplicationStatus.DELETING + self._update_status(ApplicationStatus.DELETING) def deploy(self, deployment_params: List[Dict]) -> List[str]: """Deploy the application. @@ -141,13 +141,13 @@ def deploy(self, deployment_params: List[Dict]) -> List[str]: "path in your application to avoid this issue." ) - self._status = ApplicationStatus.DEPLOYING + self._update_status(ApplicationStatus.DEPLOYING) return cur_deployments_to_delete def update_obj_ref(self, deploy_obj_ref: ObjectRef, deployment_time: int): self._deploy_obj_ref = deploy_obj_ref self._deployment_timestamp = deployment_time - self._status = ApplicationStatus.DEPLOYING + self._update_status(ApplicationStatus.DEPLOYING) def _process_terminating_deployments(self): """Update the tracking for all deployments being deleted @@ -202,42 +202,60 @@ def update(self): ray.get(finished[0]) logger.info(f"Deploy task for app '{self._name}' ran successfully.") except RayTaskError as e: - self._status = ApplicationStatus.DEPLOY_FAILED # NOTE(zcin): we should use str(e) instead of traceback.format_exc() # here because the full details of the error is not displayed # properly with traceback.format_exc(). RayTaskError has its own # custom __str__ function. - self._app_msg = f"Deploying app '{self._name}' failed:\n{str(e)}" - logger.warning(self._app_msg) + self._update_status( + ApplicationStatus.DEPLOY_FAILED, + status_msg=f"Deploying app '{self._name}' failed:\n{str(e)}", + ) + logger.warning(self._status_msg) return except RuntimeEnvSetupError: - self._status = ApplicationStatus.DEPLOY_FAILED - self._app_msg = ( - f"Runtime env setup for app '{self._name}' " - f"failed:\n{traceback.format_exc()}" + self._update_status( + ApplicationStatus.DEPLOY_FAILED, + status_msg=( + f"Runtime env setup for app '{self._name}' " + f"failed:\n{traceback.format_exc()}" + ), ) - logger.warning(self._app_msg) + logger.warning(self._status_msg) return except Exception: - self._status = ApplicationStatus.DEPLOY_FAILED - self._app_msg = ( - "Unexpected error occured while deploying application " - f"'{self._name}':\n{traceback.format_exc()}" + self._update_status( + ApplicationStatus.DEPLOY_FAILED, + status_msg=( + "Unexpected error occured while deploying " + f"application '{self._name}':" + f"\n{traceback.format_exc()}" + ), ) - logger.warning(self._app_msg) + logger.warning(self._status_msg) return deployments_statuses = ( self._deployment_state_manager.get_deployment_statuses(self.deployments) ) num_health_deployments = 0 + unhealthy_deployment_names = [] for deployment_status in deployments_statuses: if deployment_status.status == DeploymentStatus.UNHEALTHY: - self._status = ApplicationStatus.DEPLOY_FAILED - return + unhealthy_deployment_names.append(deployment_status.name) if deployment_status.status == DeploymentStatus.HEALTHY: num_health_deployments += 1 + + if len(unhealthy_deployment_names) != 0: + self._update_status( + ApplicationStatus.DEPLOY_FAILED, + status_msg=( + "The following deployments are UNHEALTHY: " + f"{unhealthy_deployment_names}" + ), + ) + return + if num_health_deployments == len(deployments_statuses): - self._status = ApplicationStatus.RUNNING + self._update_status(ApplicationStatus.RUNNING) self._process_terminating_deployments() @@ -249,7 +267,7 @@ def get_application_status_info(self) -> ApplicationStatusInfo: """Return the application status information""" return ApplicationStatusInfo( self._status, - message=self._app_msg, + message=self._status_msg, deployment_timestamp=self._deployment_timestamp, ) @@ -269,6 +287,10 @@ def list_deployment_details(self) -> Dict[str, DeploymentDetails]: } return {k: v for k, v in details.items() if v is not None} + def _update_status(self, status: ApplicationStatus, status_msg: str = ""): + self._status = status + self._status_msg = status_msg + class ApplicationStateManager: def __init__(self, deployment_state_manager): diff --git a/python/ray/serve/tests/test_application_state.py b/python/ray/serve/tests/test_application_state.py index 28aeea2499b2..f0c70214d75e 100644 --- a/python/ray/serve/tests/test_application_state.py +++ b/python/ray/serve/tests/test_application_state.py @@ -115,7 +115,7 @@ def test_update_app_running(mocked_application_state_manager): def test_update_app_deploy_failed(mocked_application_state_manager): - """Test DEPLOYING -> DEPLOY_FAILED""" + """Test DEPLOYING -> DEPLOY_FAILED -> DEPLOYING -> RUNNING""" app_state_manager, deployment_state_manager = mocked_application_state_manager app_state_manager.deploy_application("test_app", [{"name": "d1"}]) # Simulate controller @@ -128,8 +128,31 @@ def test_update_app_deploy_failed(mocked_application_state_manager): app_status = app_state_manager.get_app_status("test_app") assert app_status.status == ApplicationStatus.DEPLOY_FAILED # rerun update, application status should not make difference + deploy_failed_msg = app_status.message + assert len(deploy_failed_msg) != 0 app_state_manager.update() assert app_status.status == ApplicationStatus.DEPLOY_FAILED + assert app_status.message == deploy_failed_msg + + app_state_manager.deploy_application("test_app", [{"name": "d1"}, {"name": "d2"}]) + # Simulate controller + deployment_state_manager.deploy("d1", None) + deployment_state_manager.deploy("d2", None) + + app_status = app_state_manager.get_app_status("test_app") + assert app_status.status == ApplicationStatus.DEPLOYING + assert app_status.message != deploy_failed_msg + deployment_state_manager.set_deployment_statuses_healthy("d1") + deployment_state_manager.set_deployment_statuses_healthy("d2") + app_state_manager.update() + app_status = app_state_manager.get_app_status("test_app") + assert app_status.status == ApplicationStatus.RUNNING + running_msg = app_status.message + assert running_msg != deploy_failed_msg + # rerun update, application status should not make difference + app_state_manager.update() + assert app_status.status == ApplicationStatus.RUNNING + assert app_status.message == running_msg @pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.")