[serve] deflake test_autoscaling_policy_with_metr_disab (#54458)

zcin · web-flow · commit 7e05314ac127 · 2025-07-09T10:48:10.000-07:00
Deflake `test_autoscaling_policy_with_metr_disab.py::TestAutoscalingMetrics::test_basic`. When `RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE=0`, we collect ongoing request metrics at the replica and queued request metrics at the handle -- but ongoing request metrics are updated very quickly, while queued request metrics are only sent every 10s. Because of this delay, almost every request is double counted until the queued request metrics are flushed, so the total number of ongoing requests climbs to almost 100. Example: https://buildkite.com/ray-project/postmerge/builds/11322#0197eaca-62e1-457d-947b-a981210e98b9/177-852 Note that we are sending exactly 50 requests and expect the number of replicas to scale to exactly 5. However, the metrics grow above 50 here, almost to 100, which makes the test flaky and causes it to fail. This PR sets the env var `RAY_SERVE_HANDLE_METRIC_PUSH_INTERVAL_S=0.1` for this test target and pairs it with other stabilizing changes: `downscale_delay_s` is raised from 0 to 5, the `graceful_shutdown_timeout_s=1` override is removed, and the test now waits for the handle's queued request count to reach 0 before checking that the deployment scaled to 5 replicas. Signed-off-by: Cindy Zhang <cindyzyx9@gmail.com>
diff --git a/python/ray/serve/tests/BUILD b/python/ray/serve/tests/BUILD
@@ -291,7 +291,11 @@ py_test(
     name = "test_autoscaling_policy_with_metr_disab",
     size = "large",
     srcs = ["test_autoscaling_policy.py"],
-    env = {"RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE": "0"},
+    env = {
+        "RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE": "0",
+        # Make sure queued metrics are cleared out quickly.
+        "RAY_SERVE_HANDLE_METRIC_PUSH_INTERVAL_S": "0.1",
+    },
     main = "test_autoscaling_policy.py",
     tags = [
         "autoscaling",
diff --git a/python/ray/serve/tests/test_autoscaling_policy.py b/python/ray/serve/tests/test_autoscaling_policy.py
@@ -58,6 +58,13 @@ def get_deployment_start_time(controller: ServeController, name: str):
     return deployment_info.start_time_ms
 
 
+def check_num_queued_requests_eq(handle: DeploymentHandle, expected: int):
+    assert (
+        handle._router._asyncio_router._metrics_manager.num_queued_requests == expected
+    )
+    return True
+
+
 def assert_no_replicas_deprovisioned(
     replica_ids_1: Iterable[ReplicaID], replica_ids_2: Iterable[ReplicaID]
 ) -> None:
@@ -135,12 +142,9 @@ def test_basic(self, serve_instance):
                 "max_replicas": 10,
                 "target_ongoing_requests": 10,
                 "upscale_delay_s": 0,
-                "downscale_delay_s": 0,
+                "downscale_delay_s": 5,
                 "look_back_period_s": 1,
             },
-            # We will send many requests. This will make sure replicas are
-            # killed quickly during cleanup.
-            graceful_shutdown_timeout_s=1,
             max_ongoing_requests=25,
             version="v1",
         )
@@ -154,24 +158,27 @@ async def __call__(self):
 
         # Wait for metrics to propagate
         wait_for_condition(check_num_requests_ge, client=client, id=dep_id, expected=1)
-        print("Autoscaling metrics started recording on controller.")
+        tlog("Autoscaling metrics started recording on controller.")
 
         # Many queries should be inflight.
         wait_for_condition(check_num_requests_ge, client=client, id=dep_id, expected=45)
-        print("Confirmed many queries are inflight.")
+        tlog("Confirmed many queries are inflight.")
+
+        wait_for_condition(check_num_queued_requests_eq, handle=handle, expected=0)
+        tlog("Confirmed all requests are assigned to replicas.")
 
         wait_for_condition(check_num_replicas_eq, name="A", target=5)
-        print("Confirmed deployment scaled to 5 replicas.")
-        print("Releasing signal.")
+        tlog("Confirmed deployment scaled to 5 replicas.")
+        tlog("Releasing signal.")
         signal.send.remote()
 
         # After traffic stops, num replica should drop to 1
         wait_for_condition(check_num_replicas_eq, name="A", target=1, timeout=15)
-        print("Num replicas dropped to 1.")
+        tlog("Num replicas dropped to 1.")
 
         # Request metrics should drop to 0
         wait_for_condition(check_num_requests_eq, client=client, id=dep_id, expected=0)
-        print("Queued and ongoing requests dropped to 0.")
+        tlog("Queued and ongoing requests dropped to 0.")
 
     @pytest.mark.parametrize("use_generator", [True, False])
     def test_replicas_die(self, serve_instance_with_signal, use_generator):