
Commit eaf2af4

[Serve] Prioritize stopping most recently scaled-up replicas during downscaling (#52929)
## Why are these changes needed?

This PR improves the downscaling behavior in Ray Serve by modifying the logic in `_get_replicas_to_stop()` within the default `DeploymentScheduler`.

Previously, the scheduler selected replicas to stop by traversing the least loaded nodes in ascending order. This often resulted in stopping replicas that had been scheduled earlier and placed optimally using the `_best_fit_node()` strategy. This led to several drawbacks:

- Long-lived replicas, which were scheduled on best-fit nodes, were removed first, leading to inefficient reuse of resources.
- Recently scaled-up replicas, which were placed on less utilized nodes, were kept longer despite their suboptimal placement.
- Cold-start overhead increased, as newer replicas were removed before fully warming up.

This PR reverses the replica traversal order during downscaling so that **more recently added replicas are prioritized for termination**, *in cases where other conditions (e.g., running state and number of replicas per node) are equal*. These newer replicas are typically less optimally placed and not yet fully warmed up. Preserving long-lived replicas improves performance stability and reduces unnecessary resource fragmentation.

## Related issue number

N/A

## Checks

- [x] I've signed off every commit (by using the -s flag, i.e., `git commit -s`) in this PR.
- [x] I've run `scripts/format.sh` to lint the changes in this PR.
- [ ] I've included any doc changes needed for https://docs.ray.io/en/master/.
- [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file.
- [ ] I've made sure the tests are passing. Note that there might be a few flaky tests; see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
  - [ ] Unit tests
  - [ ] Release tests
  - [ ] This PR is not tested :(

Signed-off-by: kitae <ryugitae777@gmail.com>
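To make the intended ordering concrete before reading the diff, here is a minimal, self-contained sketch of "newest replicas first, least-loaded nodes first" selection. This is not the Serve scheduler itself: the function name `pick_replicas_to_stop`, the plain-dict inputs, and the simplified head-node handling are illustrative assumptions; the real logic lives in `_get_replicas_to_stop()` in the diff below.

```python
from collections import defaultdict
from typing import Dict, List


def pick_replicas_to_stop(
    running_replicas: Dict[str, str],  # replica_id -> node_id, in scale-up order (oldest first)
    node_load: Dict[str, int],         # node_id -> running replicas of all deployments
    head_node_id: str,
    max_num_to_stop: int,
) -> List[str]:
    """Illustrative sketch: stop the newest replicas first, visiting least-loaded nodes first."""
    # Bucket replicas by node in reverse insertion order (newest -> oldest),
    # mirroring the reversed traversal introduced by this PR.
    newest_first_by_node: Dict[str, List[str]] = defaultdict(list)
    for replica_id, node_id in reversed(list(running_replicas.items())):
        newest_first_by_node[node_id].append(replica_id)

    to_stop: List[str] = []
    # Visit the least-loaded nodes first; the head node is always visited last
    # because it can never be relinquished.
    for node_id in sorted(node_load, key=lambda n: (n == head_node_id, node_load[n])):
        for replica_id in newest_first_by_node.get(node_id, []):
            to_stop.append(replica_id)
            if len(to_stop) == max_num_to_stop:
                return to_stop
    return to_stop


# Example: "r3" was scaled up last onto the lightly loaded node2, so it is chosen first.
print(pick_replicas_to_stop(
    {"r1": "node1", "r2": "node1", "r3": "node2"},
    {"node1": 2, "node2": 1},
    head_node_id="node1",
    max_num_to_stop=1,
))  # -> ['r3']
```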
1 parent 0325fab commit eaf2af4

File tree

2 files changed (+21 lines, -14 lines):

- python/ray/serve/_private/deployment_scheduler.py
- python/ray/serve/tests/unit/test_deployment_scheduler.py

python/ray/serve/_private/deployment_scheduler.py

Lines changed: 18 additions & 11 deletions

@@ -735,18 +735,26 @@ def _get_replicas_to_stop(
         for (
             pending_launching_recovering_replica
         ) in pending_launching_recovering_replicas:
+            replicas_to_stop.add(pending_launching_recovering_replica)
             if len(replicas_to_stop) == max_num_to_stop:
                 return replicas_to_stop
-            else:
-                replicas_to_stop.add(pending_launching_recovering_replica)
 
-        node_to_running_replicas_of_target_deployment = (
-            self._get_node_to_running_replicas(deployment_id)
-        )
         node_to_running_replicas_of_all_deployments = (
             self._get_node_to_running_replicas()
         )
 
+        # _running_replicas preserves insertion order (oldest → newest).
+        # Reverse once so we have newest → oldest, then bucket by node.
+        ordered_running_replicas = list(self._running_replicas[deployment_id].items())
+        ordered_running_replicas.reverse()
+        ordered_running_replicas_of_target_deployment: Dict[
+            str, List[ReplicaID]
+        ] = defaultdict(list)
+        for replica_id, replica_node_id in ordered_running_replicas:
+            ordered_running_replicas_of_target_deployment[replica_node_id].append(
+                replica_id
+            )
+
         # Replicas on the head node has the lowest priority for downscaling
         # since we cannot relinquish the head node.
         def key(node_and_num_running_replicas_of_all_deployments):
@@ -760,15 +768,14 @@ def key(node_and_num_running_replicas_of_all_deployments):
         for node_id, _ in sorted(
             node_to_running_replicas_of_all_deployments.items(), key=key
         ):
-            if node_id not in node_to_running_replicas_of_target_deployment:
+            if node_id not in ordered_running_replicas_of_target_deployment:
                 continue
-            for running_replica in node_to_running_replicas_of_target_deployment[
-                node_id
-            ]:
+
+            # Newest-first list for this node.
+            for replica_id in ordered_running_replicas_of_target_deployment[node_id]:
+                replicas_to_stop.add(replica_id)
                 if len(replicas_to_stop) == max_num_to_stop:
                     return replicas_to_stop
-                else:
-                    replicas_to_stop.add(running_replica)
 
         return replicas_to_stop

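The new bucketing step relies on the fact that Python dicts preserve insertion order (3.7+), so iterating `_running_replicas[deployment_id]` yields replicas oldest to newest. Below is a tiny standalone illustration of the reverse-then-bucket pattern; the replica and node IDs are made up for the example.

```python
from collections import defaultdict

# Insertion order stands in for scale-up order (oldest -> newest);
# Python dicts preserve it during iteration.
running = {"replica_a": "node1", "replica_b": "node2", "replica_c": "node1"}

# Iterate newest -> oldest and group by node, so each node's list is newest-first.
newest_first_by_node = defaultdict(list)
for replica_id, node_id in reversed(list(running.items())):
    newest_first_by_node[node_id].append(replica_id)

print(dict(newest_first_by_node))
# {'node1': ['replica_c', 'replica_a'], 'node2': ['replica_b']}
```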
python/ray/serve/tests/unit/test_deployment_scheduler.py

Lines changed: 3 additions & 3 deletions

@@ -674,7 +674,7 @@ def test_downscale_multiple_deployments():
     # but it has more replicas of all deployments so
     # we should stop replicas from node2.
     assert len(deployment_to_replicas_to_stop[d1_id]) == 1
-    assert deployment_to_replicas_to_stop[d1_id] < {d1_r2_id, d1_r3_id}
+    assert deployment_to_replicas_to_stop[d1_id].issubset({d1_r2_id, d1_r3_id})
 
     scheduler.on_replica_stopping(d1_r3_id)
     scheduler.on_replica_stopping(d2_r3_id)
@@ -737,7 +737,7 @@ def test_downscale_head_node():
         },
     )
     assert len(deployment_to_replicas_to_stop) == 1
-    assert deployment_to_replicas_to_stop[dep_id] < {r2_id, r3_id}
+    assert deployment_to_replicas_to_stop[dep_id].issubset({r2_id, r3_id})
     scheduler.on_replica_stopping(deployment_to_replicas_to_stop[dep_id].pop())
 
     deployment_to_replicas_to_stop = scheduler.schedule(
@@ -861,7 +861,7 @@ def test_downscale_single_deployment():
         },
     )
     assert len(deployment_to_replicas_to_stop) == 1
-    assert deployment_to_replicas_to_stop[dep_id] == {r1_id, r2_id}
+    assert deployment_to_replicas_to_stop[dep_id] <= {r1_id, r2_id}
     scheduler.on_replica_stopping(r1_id)
     scheduler.on_replica_stopping(r2_id)
     scheduler.on_deployment_deleted(dep_id)

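For reference, the relaxed test assertions lean on Python's set comparison semantics: `<` is a strict (proper) subset check that fails when the two sets are equal, while `issubset()` and `<=` also accept equality. A quick standalone check of those semantics (the IDs are placeholders, not the test fixtures):

```python
candidates = {"r1", "r2"}

assert {"r1"} < candidates              # proper subset: passes
assert not (candidates < candidates)    # "<" is strict, so equal sets fail it
assert candidates <= candidates         # "<=" accepts equal sets
assert candidates.issubset(candidates)  # issubset() is equivalent to "<="
assert {"r1"}.issubset(candidates)
print("set comparison semantics confirmed")
```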