Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Serve] Propagate replica constructor error to deployment status message and print num retries left #48531

Merged
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions python/ray/serve/_private/deployment_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -1271,6 +1271,7 @@ def __init__(
self._last_retry: float = 0.0
self._backoff_time_s: int = 1
self._replica_constructor_retry_counter: int = 0
self._prev_replica_constructor_retry_counter: int = 0
self._replica_constructor_error_msg: Optional[str] = None
self._replicas: ReplicaStateContainer = ReplicaStateContainer()
self._curr_status_info: DeploymentStatusInfo = DeploymentStatusInfo(
Expand Down Expand Up @@ -1619,6 +1620,7 @@ def deploy(self, deployment_info: DeploymentInfo) -> bool:
f"(initial target replicas: {target_num_replicas})."
)
self._replica_constructor_retry_counter = 0
self._prev_replica_constructor_retry_counter = 0
self._backoff_time_s = 1
return True

Expand Down Expand Up @@ -1942,6 +1944,22 @@ def check_curr_status(self) -> Tuple[bool, bool]:
)
return False, any_replicas_recovering

if failed_to_start_count > self._prev_replica_constructor_retry_counter:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is _prev_replica_constructor_retry_counter necessary? Can we just always call update_message and change it to a no-op if the message is the same?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah you're right

self._prev_replica_constructor_retry_counter = failed_to_start_count
if failed_to_start_threshold == 0:
message = (
"A replica failed to start with exception. Retrying. Error:\n"
f"{self._replica_constructor_error_msg}"
)
else:
remaining_retries = failed_to_start_threshold - failed_to_start_count
message = (
f"A replica failed to start with exception. Retrying "
f"{remaining_retries} more time(s). Error:\n"
f"{self._replica_constructor_error_msg}"
)
self._curr_status_info = self._curr_status_info.update_message(message)

# If we have pending ops, the current goal is *not* ready.
if (
self._replicas.count(
Expand Down
28 changes: 28 additions & 0 deletions python/ray/serve/tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,34 @@ def check_for_failed_deployment():
wait_for_condition(check_for_failed_deployment)


@pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.")
def test_status_constructor_retry_error(ray_start_stop):
"""Deploys Serve deployment that errors out in constructor, checks that the
retry message is surfaced.
"""

config_file_name = os.path.join(
os.path.dirname(__file__), "test_config_files", "deployment_fail_2.yaml"
)

subprocess.check_output(["serve", "deploy", config_file_name])

def check_for_failed_deployment():
cli_output = subprocess.check_output(
["serve", "status", "-a", "http://localhost:52365/"]
)
status = yaml.safe_load(cli_output)["applications"][SERVE_DEFAULT_APP_NAME]
assert status["status"] == "DEPLOYING"

deployment_status = status["deployments"]["A"]
assert deployment_status["status"] == "UPDATING"
assert deployment_status["status_trigger"] == "CONFIG_UPDATE_STARTED"
assert "ZeroDivisionError" in deployment_status["message"]
return True

wait_for_condition(check_for_failed_deployment)
zcin marked this conversation as resolved.
Show resolved Hide resolved


@pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.")
def test_status_package_unavailable_in_controller(ray_start_stop):
"""Test that exceptions raised from packages that are installed on deployment actors
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
applications:
- name: default
import_path: ray.serve.tests.test_config_files.fail_2.node
13 changes: 13 additions & 0 deletions python/ray/serve/tests/test_config_files/fail_2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import time

from ray import serve


@serve.deployment
class A:
def __init__(self):
time.sleep(5)
1 / 0


node = A.bind()
8 changes: 4 additions & 4 deletions python/ray/serve/tests/unit/test_deployment_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -2492,8 +2492,8 @@ def create_deployment_state(

check_counts(ds1, total=3, by_state=[(ReplicaState.STOPPING, 3, None)])
assert ds1._replica_constructor_retry_counter == 3
# An error message should show up after
# 3 * num_replicas startup failures.
# In the update sequence, the status is populated
# prior to scheduling, so the message is empty still.
assert "" == ds1.curr_status_info.message

# Set all of ds1's replicas to stopped.
Expand All @@ -2512,7 +2512,7 @@ def create_deployment_state(
assert ds1.curr_status_info.status == DeploymentStatus.UPDATING
check_counts(ds1, total=3, by_state=[(ReplicaState.STOPPING, 3, None)])
assert ds1._replica_constructor_retry_counter == 6
assert "" == ds1.curr_status_info.message
assert "Retrying 6 more time(s)" in ds1.curr_status_info.message

# Set all of ds1's replicas to stopped.
for replica in ds1._replicas.get():
Expand All @@ -2527,7 +2527,7 @@ def create_deployment_state(
assert ds1.curr_status_info.status == DeploymentStatus.UPDATING
check_counts(ds1, total=3, by_state=[(ReplicaState.STOPPING, 3, None)])
assert ds1._replica_constructor_retry_counter == 9
assert "" == ds1.curr_status_info.message
assert "Retrying 3 more time(s)" in ds1.curr_status_info.message

# Set all of ds1's replicas to stopped.
for replica in ds1._replicas.get():
Expand Down