Skip to content

Commit

Permalink
[Serve] [release test] Add max_retries and max_restarts (ray-project#32011)
Browse files Browse the repository at this point in the history

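The long_running_serve_failure test uses a long-running actor, RandomKiller, to randomly kill Serve actors. This change sets RandomKiller's max_restarts and max_task_retries to -1 (unlimited), so that the actor is restarted and its in-flight tasks are retried after a crash. The RandomKiller actor is now also created at module level and its handle is passed into RandomTest, rather than being created inside RandomTest.__init__.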

Related issue number
Addresses ray-project#31741
  • Loading branch information
shrekris-anyscale authored Jan 27, 2023
1 parent 2a7dd31 commit dd36360
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions release/long_running_tests/workloads/serve_failure.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def update_progress(result):
serve.start(detached=True)


@ray.remote
@ray.remote(max_restarts=-1, max_task_retries=-1)
class RandomKiller:
def __init__(self, kill_period_s=1):
self.kill_period_s = kill_period_s
Expand Down Expand Up @@ -95,15 +95,15 @@ def _get_serve_actors(self):


class RandomTest:
def __init__(self, max_deployments=1):
def __init__(self, random_killer_handle, max_deployments=1):
self.max_deployments = max_deployments
self.weighted_actions = [
(self.create_deployment, 1),
(self.verify_deployment, 4),
]
self.deployments = []
self.random_killer = RandomKiller.remote()

self.random_killer = random_killer_handle
for _ in range(max_deployments):
self.create_deployment()
self.random_killer.run.remote()
Expand Down Expand Up @@ -172,5 +172,6 @@ def run(self):
break


tester = RandomTest(max_deployments=NUM_NODES * CPUS_PER_NODE)
random_killer = RandomKiller.remote()
tester = RandomTest(random_killer, max_deployments=NUM_NODES * CPUS_PER_NODE)
tester.run()

0 comments on commit dd36360

Please sign in to comment.