
Commit eaf2af4

[Serve] Prioritize stopping most recently scaled-up replicas during downscaling (#52929)
## Why are these changes needed?

This PR improves the downscaling behavior in Ray Serve by modifying the logic in `_get_replicas_to_stop()` within the default `DeploymentScheduler`.

Previously, the scheduler selected replicas to stop by traversing the least loaded nodes in ascending order. This often resulted in stopping replicas that had been scheduled earlier and placed optimally using the `_best_fit_node()` strategy. This led to several drawbacks:

- Long-lived replicas, which were scheduled on best-fit nodes, were removed first, leading to inefficient reuse of resources.
- Recently scaled-up replicas, which were placed on less utilized nodes, were kept longer despite their suboptimal placement.
- Cold-start overhead increased, as newer replicas were removed before fully warming up.

This PR reverses the replica traversal order during downscaling so that **more recently added replicas are prioritized for termination**, *in cases where other conditions (e.g., running state and number of replicas per node) are equal*. These newer replicas are typically less optimally placed and not yet fully warmed up. Preserving long-lived replicas improves performance stability and reduces unnecessary resource fragmentation.

## Related issue number

N/A

## Checks

- [x] I've signed off every commit (by using the -s flag, i.e., `git commit -s`) in this PR.
- [x] I've run `scripts/format.sh` to lint the changes in this PR.
- [ ] I've included any doc changes needed for https://docs.ray.io/en/master/.
- [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file.
- [ ] I've made sure the tests are passing. Note that there might be a few flaky tests; see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
  - [ ] Unit tests
  - [ ] Release tests
  - [ ] This PR is not tested :(

Signed-off-by: kitae <ryugitae777@gmail.com>
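To make the intended ordering concrete before reading the diff, here is a minimal, self-contained sketch of "newest replicas first, least-loaded nodes first" selection. This is not the Serve scheduler itself: the function name `pick_replicas_to_stop`, the plain-dict inputs, and the simplified head-node handling are illustrative assumptions; the real logic lives in `_get_replicas_to_stop()` in the diff below.

```python
from collections import defaultdict
from typing import Dict, List


def pick_replicas_to_stop(
    running_replicas: Dict[str, str],  # replica_id -> node_id, in scale-up order (oldest first)
    node_load: Dict[str, int],         # node_id -> running replicas of all deployments
    head_node_id: str,
    max_num_to_stop: int,
) -> List[str]:
    """Illustrative sketch: stop the newest replicas first, visiting least-loaded nodes first."""
    # Bucket replicas by node in reverse insertion order (newest -> oldest),
    # mirroring the reversed traversal introduced by this PR.
    newest_first_by_node: Dict[str, List[str]] = defaultdict(list)
    for replica_id, node_id in reversed(list(running_replicas.items())):
        newest_first_by_node[node_id].append(replica_id)

    to_stop: List[str] = []
    # Visit the least-loaded nodes first; the head node is always visited last
    # because it can never be relinquished.
    for node_id in sorted(node_load, key=lambda n: (n == head_node_id, node_load[n])):
        for replica_id in newest_first_by_node.get(node_id, []):
            to_stop.append(replica_id)
            if len(to_stop) == max_num_to_stop:
                return to_stop
    return to_stop


# Example: "r3" was scaled up last onto the lightly loaded node2, so it is chosen first.
print(pick_replicas_to_stop(
    {"r1": "node1", "r2": "node1", "r3": "node2"},
    {"node1": 2, "node2": 1},
    head_node_id="node1",
    max_num_to_stop=1,
))  # -> ['r3']
```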
1 parent 0325fab commit eaf2af4

File tree

2 files changed (+21 lines, -14 lines):

- python/ray/serve/_private/deployment_scheduler.py
- python/ray/serve/tests/unit/test_deployment_scheduler.py

python/ray/serve/_private/deployment_scheduler.py

Lines changed: 18 additions & 11 deletions

@@ -735,18 +735,26 @@ def _get_replicas_to_stop(
         for (
             pending_launching_recovering_replica
         ) in pending_launching_recovering_replicas:
+            replicas_to_stop.add(pending_launching_recovering_replica)
             if len(replicas_to_stop) == max_num_to_stop:
                 return replicas_to_stop
-            else:
-                replicas_to_stop.add(pending_launching_recovering_replica)
 
-        node_to_running_replicas_of_target_deployment = (
-            self._get_node_to_running_replicas(deployment_id)
-        )
         node_to_running_replicas_of_all_deployments = (
             self._get_node_to_running_replicas()
         )
 
+        # _running_replicas preserves insertion order (oldest → newest).
+        # Reverse once so we have newest → oldest, then bucket by node.
+        ordered_running_replicas = list(self._running_replicas[deployment_id].items())
+        ordered_running_replicas.reverse()
+        ordered_running_replicas_of_target_deployment: Dict[
+            str, List[ReplicaID]
+        ] = defaultdict(list)
+        for replica_id, replica_node_id in ordered_running_replicas:
+            ordered_running_replicas_of_target_deployment[replica_node_id].append(
+                replica_id
+            )
+
         # Replicas on the head node has the lowest priority for downscaling
         # since we cannot relinquish the head node.
         def key(node_and_num_running_replicas_of_all_deployments):
@@ -760,15 +768,14 @@ def key(node_and_num_running_replicas_of_all_deployments):
         for node_id, _ in sorted(
             node_to_running_replicas_of_all_deployments.items(), key=key
         ):
-            if node_id not in node_to_running_replicas_of_target_deployment:
+            if node_id not in ordered_running_replicas_of_target_deployment:
                 continue
-            for running_replica in node_to_running_replicas_of_target_deployment[
-                node_id
-            ]:
+
+            # Newest-first list for this node.
+            for replica_id in ordered_running_replicas_of_target_deployment[node_id]:
+                replicas_to_stop.add(replica_id)
                 if len(replicas_to_stop) == max_num_to_stop:
                     return replicas_to_stop
-                else:
-                    replicas_to_stop.add(running_replica)
 
         return replicas_to_stop

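The new bucketing step relies on the fact that Python dicts preserve insertion order (3.7+), so iterating `_running_replicas[deployment_id]` yields replicas oldest to newest. Below is a tiny standalone illustration of the reverse-then-bucket pattern; the replica and node IDs are made up for the example.

```python
from collections import defaultdict

# Insertion order stands in for scale-up order (oldest -> newest);
# Python dicts preserve it during iteration.
running = {"replica_a": "node1", "replica_b": "node2", "replica_c": "node1"}

# Iterate newest -> oldest and group by node, so each node's list is newest-first.
newest_first_by_node = defaultdict(list)
for replica_id, node_id in reversed(list(running.items())):
    newest_first_by_node[node_id].append(replica_id)

print(dict(newest_first_by_node))
# {'node1': ['replica_c', 'replica_a'], 'node2': ['replica_b']}
```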
python/ray/serve/tests/unit/test_deployment_scheduler.py

Lines changed: 3 additions & 3 deletions

@@ -674,7 +674,7 @@ def test_downscale_multiple_deployments():
     # but it has more replicas of all deployments so
     # we should stop replicas from node2.
     assert len(deployment_to_replicas_to_stop[d1_id]) == 1
-    assert deployment_to_replicas_to_stop[d1_id] < {d1_r2_id, d1_r3_id}
+    assert deployment_to_replicas_to_stop[d1_id].issubset({d1_r2_id, d1_r3_id})
 
     scheduler.on_replica_stopping(d1_r3_id)
     scheduler.on_replica_stopping(d2_r3_id)
@@ -737,7 +737,7 @@ def test_downscale_head_node():
         },
     )
     assert len(deployment_to_replicas_to_stop) == 1
-    assert deployment_to_replicas_to_stop[dep_id] < {r2_id, r3_id}
+    assert deployment_to_replicas_to_stop[dep_id].issubset({r2_id, r3_id})
     scheduler.on_replica_stopping(deployment_to_replicas_to_stop[dep_id].pop())
 
     deployment_to_replicas_to_stop = scheduler.schedule(
@@ -861,7 +861,7 @@ def test_downscale_single_deployment():
         },
     )
     assert len(deployment_to_replicas_to_stop) == 1
-    assert deployment_to_replicas_to_stop[dep_id] == {r1_id, r2_id}
+    assert deployment_to_replicas_to_stop[dep_id] <= {r1_id, r2_id}
     scheduler.on_replica_stopping(r1_id)
     scheduler.on_replica_stopping(r2_id)
     scheduler.on_deployment_deleted(dep_id)

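For reference, the relaxed test assertions lean on Python's set comparison semantics: `<` is a strict (proper) subset check that fails when the two sets are equal, while `issubset()` and `<=` also accept equality. A quick standalone check of those semantics (the IDs are placeholders, not the test fixtures):

```python
candidates = {"r1", "r2"}

assert {"r1"} < candidates              # proper subset: passes
assert not (candidates < candidates)    # "<" is strict, so equal sets fail it
assert candidates <= candidates         # "<=" accepts equal sets
assert candidates.issubset(candidates)  # issubset() is equivalent to "<="
assert {"r1"}.issubset(candidates)
print("set comparison semantics confirmed")
```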