From b96ac42c8f69202ca043c217d0b03e4468431924 Mon Sep 17 00:00:00 2001
From: "Lucia (Lu) Fang"
Date: Mon, 22 Sep 2025 13:27:26 -0700
Subject: [PATCH 1/3] Allow skipping ready check for bench serve

Summary: Allow skipping the ready check in bench serve via
`--skip-ready-check`.

Test Plan: `vllm bench serve --skip-ready-check`

Differential Revision: D82995002

Signed-off-by: Lu Fang
---
 vllm/benchmarks/serve.py | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 7382782f1165..3279f068c02e 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -476,6 +476,7 @@ async def benchmark(
     ramp_up_start_rps: Optional[int] = None,
     ramp_up_end_rps: Optional[int] = None,
     ready_check_timeout_sec: int = 600,
+    skip_ready_check: bool = False,
 ):
     task_type = (TaskType.EMBEDDING if api_url.endswith("/v1/embeddings")
                  else TaskType.GENERATION)
@@ -531,18 +532,19 @@ async def benchmark(
         extra_body=extra_body,
     )
 
-    test_output = await wait_for_endpoint(
-        request_func,
-        test_input,
-        session,
-        timeout_seconds=ready_check_timeout_sec,
-    )
-    if not test_output.success:
-        raise ValueError(
-            "Initial test run failed - Please make sure benchmark arguments "
-            f"are correctly specified. Error: {test_output.error}")
-    else:
-        print("Initial test run completed. Starting main benchmark run...")
+    if not skip_ready_check:
+        test_output = await wait_for_endpoint(
+            request_func,
+            test_input,
+            session,
+            timeout_seconds=ready_check_timeout_sec,
+        )
+        if not test_output.success:
+            raise ValueError(
+                "Initial test run failed - Please make sure benchmark arguments "
+                f"are correctly specified. Error: {test_output.error}")
+        else:
+            print("Initial test run completed. Starting main benchmark run...")
 
     if lora_modules:
         # For each input request, choose a LoRA module at random.
@@ -1153,6 +1155,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="Maximum time to wait for the endpoint to become ready "
         "in seconds (default: 600 seconds / 10 minutes).",
     )
+    parser.add_argument(
+        "--skip-ready-check",
+        action="store_true",
+        help="Skip the ready check. This is useful when the endpoint "
+        "is already ready and the ready check is not needed.",
+    )
 
 
 def main(args: argparse.Namespace) -> dict[str, Any]:
@@ -1272,6 +1280,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         ramp_up_start_rps=args.ramp_up_start_rps,
         ramp_up_end_rps=args.ramp_up_end_rps,
         ready_check_timeout_sec=args.ready_check_timeout_sec,
+        skip_ready_check=args.skip_ready_check,
     )
 
     # Save config and results to json

From 95173828455bfcb30ae6f41547b964933594bb48 Mon Sep 17 00:00:00 2001
From: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Date: Mon, 22 Sep 2025 13:29:37 -0700
Subject: [PATCH 2/3] Update vllm/benchmarks/serve.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Signed-off-by: Lu Fang
---
 vllm/benchmarks/serve.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 3279f068c02e..a0457b874306 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -545,6 +545,8 @@ async def benchmark(
                 f"are correctly specified. Error: {test_output.error}")
         else:
             print("Initial test run completed. Starting main benchmark run...")
+    else:
+        print("Skipping ready check as requested.")
 
     if lora_modules:
         # For each input request, choose a LoRA module at random.

From f52445dc3390e872f91c1b3ce740fc00a562cea4 Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Mon, 22 Sep 2025 16:03:39 -0700
Subject: [PATCH 3/3] Switch to reusing ready_check_timeout_sec=0

Signed-off-by: Lu Fang
---
 vllm/benchmarks/serve.py | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index a0457b874306..2a042802d0d5 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -476,7 +476,6 @@ async def benchmark(
     ramp_up_start_rps: Optional[int] = None,
     ramp_up_end_rps: Optional[int] = None,
     ready_check_timeout_sec: int = 600,
-    skip_ready_check: bool = False,
 ):
     task_type = (TaskType.EMBEDDING if api_url.endswith("/v1/embeddings")
                  else TaskType.GENERATION)
@@ -532,7 +531,7 @@ async def benchmark(
         extra_body=extra_body,
     )
 
-    if not skip_ready_check:
+    if ready_check_timeout_sec > 0:
         test_output = await wait_for_endpoint(
             request_func,
             test_input,
@@ -541,12 +540,13 @@ async def benchmark(
         )
         if not test_output.success:
             raise ValueError(
-                "Initial test run failed - Please make sure benchmark arguments "
-                f"are correctly specified. Error: {test_output.error}")
+                "Initial test run failed - Please make sure benchmark "
+                "arguments are correctly specified. "
+                f"Error: {test_output.error}")
         else:
             print("Initial test run completed. Starting main benchmark run...")
     else:
-        print("Skipping ready check as requested.")
+        print("Skipping endpoint ready check.")
 
     if lora_modules:
         # For each input request, choose a LoRA module at random.
@@ -1155,13 +1155,8 @@ def add_cli_args(parser: argparse.ArgumentParser):
         type=int,
         default=600,
         help="Maximum time to wait for the endpoint to become ready "
-        "in seconds (default: 600 seconds / 10 minutes).",
-    )
-    parser.add_argument(
-        "--skip-ready-check",
-        action="store_true",
-        help="Skip the ready check. This is useful when the endpoint "
-        "is already ready and the ready check is not needed.",
+        "in seconds (default: 600 seconds / 10 minutes). If set to 0, "
+        "the ready check will be skipped."
     )
 
 
@@ -1282,7 +1277,6 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         ramp_up_start_rps=args.ramp_up_start_rps,
         ramp_up_end_rps=args.ramp_up_end_rps,
         ready_check_timeout_sec=args.ready_check_timeout_sec,
-        skip_ready_check=args.skip_ready_check,
     )
 
     # Save config and results to json
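
Note: after PATCH 3/3 there is no dedicated skip flag; the ready check is
gated entirely by `ready_check_timeout_sec > 0` inside `benchmark()`. A
usage sketch, assuming the CLI flag is `--ready-check-timeout-sec` (inferred
from the `args.ready_check_timeout_sec` destination above; the flag string
itself sits outside the quoted hunk context, and `<model>` is a placeholder):

  # Default behavior: wait up to 600 seconds for the endpoint to become ready.
  vllm bench serve --model <model>

  # Skip the ready check entirely (replaces the removed --skip-ready-check).
  vllm bench serve --model <model> --ready-check-timeout-sec 0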