From b96ac42c8f69202ca043c217d0b03e4468431924 Mon Sep 17 00:00:00 2001
From: "Lucia (Lu) Fang"
Date: Mon, 22 Sep 2025 13:27:26 -0700
Subject: [PATCH 1/3] Allow skipping ready check for bench serve

Summary: Allow skipping the ready check in bench serve via
`--skip-ready-check`.

Test Plan: `vllm bench serve --skip-ready-check`

Differential Revision: D82995002

Signed-off-by: Lu Fang
---
 vllm/benchmarks/serve.py | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 7382782f1165..3279f068c02e 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -476,6 +476,7 @@ async def benchmark(
     ramp_up_start_rps: Optional[int] = None,
     ramp_up_end_rps: Optional[int] = None,
     ready_check_timeout_sec: int = 600,
+    skip_ready_check: bool = False,
 ):
     task_type = (TaskType.EMBEDDING if api_url.endswith("/v1/embeddings")
                  else TaskType.GENERATION)
@@ -531,18 +532,19 @@ async def benchmark(
         extra_body=extra_body,
     )
 
-    test_output = await wait_for_endpoint(
-        request_func,
-        test_input,
-        session,
-        timeout_seconds=ready_check_timeout_sec,
-    )
-    if not test_output.success:
-        raise ValueError(
-            "Initial test run failed - Please make sure benchmark arguments "
-            f"are correctly specified. Error: {test_output.error}")
-    else:
-        print("Initial test run completed. Starting main benchmark run...")
+    if not skip_ready_check:
+        test_output = await wait_for_endpoint(
+            request_func,
+            test_input,
+            session,
+            timeout_seconds=ready_check_timeout_sec,
+        )
+        if not test_output.success:
+            raise ValueError(
+                "Initial test run failed - Please make sure benchmark arguments "
+                f"are correctly specified. Error: {test_output.error}")
+        else:
+            print("Initial test run completed. Starting main benchmark run...")
 
     if lora_modules:
         # For each input request, choose a LoRA module at random.
@@ -1153,6 +1155,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="Maximum time to wait for the endpoint to become ready "
         "in seconds (default: 600 seconds / 10 minutes).",
     )
+    parser.add_argument(
+        "--skip-ready-check",
+        action="store_true",
+        help="Skip the ready check. This is useful when the endpoint "
+        "is already ready and the ready check is not needed.",
+    )
 
 
 def main(args: argparse.Namespace) -> dict[str, Any]:
@@ -1272,6 +1280,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         ramp_up_start_rps=args.ramp_up_start_rps,
         ramp_up_end_rps=args.ramp_up_end_rps,
         ready_check_timeout_sec=args.ready_check_timeout_sec,
+        skip_ready_check=args.skip_ready_check,
     )
 
     # Save config and results to json

From 95173828455bfcb30ae6f41547b964933594bb48 Mon Sep 17 00:00:00 2001
From: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Date: Mon, 22 Sep 2025 13:29:37 -0700
Subject: [PATCH 2/3] Update vllm/benchmarks/serve.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Signed-off-by: Lu Fang
---
 vllm/benchmarks/serve.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 3279f068c02e..a0457b874306 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -545,6 +545,8 @@ async def benchmark(
                 f"are correctly specified. Error: {test_output.error}")
         else:
             print("Initial test run completed. Starting main benchmark run...")
+    else:
+        print("Skipping ready check as requested.")
 
     if lora_modules:
         # For each input request, choose a LoRA module at random.

From f52445dc3390e872f91c1b3ce740fc00a562cea4 Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Mon, 22 Sep 2025 16:03:39 -0700
Subject: [PATCH 3/3] Switch to reusing ready_check_timeout_sec=0

Signed-off-by: Lu Fang
---
 vllm/benchmarks/serve.py | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index a0457b874306..2a042802d0d5 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -476,7 +476,6 @@ async def benchmark(
     ramp_up_start_rps: Optional[int] = None,
     ramp_up_end_rps: Optional[int] = None,
     ready_check_timeout_sec: int = 600,
-    skip_ready_check: bool = False,
 ):
     task_type = (TaskType.EMBEDDING if api_url.endswith("/v1/embeddings")
                  else TaskType.GENERATION)
@@ -532,7 +531,7 @@ async def benchmark(
         extra_body=extra_body,
     )
 
-    if not skip_ready_check:
+    if ready_check_timeout_sec > 0:
         test_output = await wait_for_endpoint(
             request_func,
             test_input,
@@ -541,12 +540,13 @@ async def benchmark(
         )
         if not test_output.success:
             raise ValueError(
-                "Initial test run failed - Please make sure benchmark arguments "
-                f"are correctly specified. Error: {test_output.error}")
+                "Initial test run failed - Please make sure benchmark "
+                "arguments are correctly specified. "
+                f"Error: {test_output.error}")
         else:
             print("Initial test run completed. Starting main benchmark run...")
     else:
-        print("Skipping ready check as requested.")
+        print("Skipping endpoint ready check.")
 
     if lora_modules:
         # For each input request, choose a LoRA module at random.
@@ -1155,13 +1155,8 @@ def add_cli_args(parser: argparse.ArgumentParser):
         type=int,
         default=600,
         help="Maximum time to wait for the endpoint to become ready "
-        "in seconds (default: 600 seconds / 10 minutes).",
-    )
-    parser.add_argument(
-        "--skip-ready-check",
-        action="store_true",
-        help="Skip the ready check. This is useful when the endpoint "
-        "is already ready and the ready check is not needed.",
+        "in seconds (default: 600 seconds / 10 minutes). If set to 0, "
+        "the ready check will be skipped."
     )
 
 
@@ -1282,7 +1277,6 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         ramp_up_start_rps=args.ramp_up_start_rps,
         ramp_up_end_rps=args.ramp_up_end_rps,
         ready_check_timeout_sec=args.ready_check_timeout_sec,
-        skip_ready_check=args.skip_ready_check,
     )
 
     # Save config and results to json
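
Note: after PATCH 3/3 there is no dedicated skip flag; the ready check is
gated entirely by `ready_check_timeout_sec > 0` inside `benchmark()`. A
usage sketch, assuming the CLI flag is `--ready-check-timeout-sec` (inferred
from the `args.ready_check_timeout_sec` destination above; the flag string
itself sits outside the quoted hunk context, and `<model>` is a placeholder):

  # Default behavior: wait up to 600 seconds for the endpoint to become ready.
  vllm bench serve --model <model>

  # Skip the ready check entirely (replaces the removed --skip-ready-check).
  vllm bench serve --model <model> --ready-check-timeout-sec 0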