Commit e4c8629

pre-commit run -a
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Parent: ceb7cc2

594 files changed: 33,881 additions, 28,948 deletions

Note: large commits have some content hidden by default; only a subset of the 594 changed files is shown below.
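
Nearly every hunk below is mechanical restyling: single quotes become double quotes, continuation lines aligned under the opening parenthesis become four-space hanging indents, and a trailing comma after the last argument keeps multi-line calls expanded. As a hedged illustration only (this snippet is a sketch assuming the old layout was yapf-style and the new one ruff-format/black-style; it is not vLLM code, though the endpoint mirrors the test diffs below), the two layouts compare like this:

# Sketch of the restyle. Both functions behave identically; only the
# layout differs.
import requests


def query_old_style(prompt: str, max_tokens: int = 5) -> dict:
    # Old: continuation arguments aligned under the opening parenthesis.
    response = requests.post("http://localhost:8000/generate",
                             json={
                                 "prompt": prompt,
                                 "max_tokens": max_tokens,
                             })
    return response.json()


def query_new_style(prompt: str, max_tokens: int = 5) -> dict:
    # New: four-space hanging indent; the "magic" trailing comma after
    # the final argument forces one argument per line.
    response = requests.post(
        "http://localhost:8000/generate",
        json={
            "prompt": prompt,
            "max_tokens": max_tokens,
        },
    )
    return response.json()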

tests/async_engine/api_server_async_engine.py

Lines changed: 8 additions & 6 deletions

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """vllm.entrypoints.api_server with some extra logging for testing."""
+
 from collections.abc import Iterable
 from typing import Any

@@ -17,7 +18,6 @@


 class AsyncLLMEngineWithStats(AsyncLLMEngine):
-
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._num_aborts = 0
@@ -47,8 +47,10 @@ def stats() -> Response:
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
     vllm.entrypoints.api_server.engine = engine
-    uvicorn.run(app,
-                host=args.host,
-                port=args.port,
-                log_level="debug",
-                timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE)
+    uvicorn.run(
+        app,
+        host=args.host,
+        port=args.port,
+        log_level="debug",
+        timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
+    )
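
For context, this test helper wraps vllm.entrypoints.api_server with an engine subclass that counts aborted requests and serves them over HTTP. A minimal, self-contained sketch of that pattern follows; the FastAPI app and the EngineWithStats stand-in are hypothetical, and only the /stats route, the num_aborted_requests key, and the uvicorn.run call mirror the real file:

from fastapi import FastAPI
from fastapi.responses import JSONResponse
import uvicorn

app = FastAPI()


class EngineWithStats:
    """Hypothetical stand-in for AsyncLLMEngineWithStats."""

    def __init__(self) -> None:
        # The real engine increments this when a request is aborted.
        self._num_aborts = 0


engine = EngineWithStats()


@app.get("/stats")
def stats() -> JSONResponse:
    # Mirrors the JSON key the API-server test below reads.
    return JSONResponse({"num_aborted_requests": engine._num_aborts})


if __name__ == "__main__":
    uvicorn.run(
        app,
        host="localhost",
        port=8000,
        log_level="debug",
    )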

tests/async_engine/conftest.py

Lines changed: 1 addition & 1 deletion

@@ -9,4 +9,4 @@ def use_v0_only(monkeypatch):
     Since this module is V0 only, set VLLM_USE_V1=0 for
     all tests in the module.
     """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
+    monkeypatch.setenv("VLLM_USE_V1", "0")

tests/async_engine/test_api_server.py

Lines changed: 18 additions & 13 deletions

@@ -13,13 +13,15 @@


 def _query_server(prompt: str, max_tokens: int = 5) -> dict:
-    response = requests.post("http://localhost:8000/generate",
-                             json={
-                                 "prompt": prompt,
-                                 "max_tokens": max_tokens,
-                                 "temperature": 0,
-                                 "ignore_eos": True
-                             })
+    response = requests.post(
+        "http://localhost:8000/generate",
+        json={
+            "prompt": prompt,
+            "max_tokens": max_tokens,
+            "temperature": 0,
+            "ignore_eos": True,
+        },
+    )
     response.raise_for_status()
     return response.json()

@@ -30,8 +32,9 @@ def _query_server_long(prompt: str) -> dict:

 @pytest.fixture
 def api_server(distributed_executor_backend: str):
-    script_path = Path(__file__).parent.joinpath(
-        "api_server_async_engine.py").absolute()
+    script_path = (
+        Path(__file__).parent.joinpath("api_server_async_engine.py").absolute()
+    )
     commands = [
         sys.executable,
         "-u",
@@ -80,8 +83,9 @@ def test_api_server(api_server, distributed_executor_backend: str):
     for result in pool.map(_query_server, prompts):
         assert result

-    num_aborted_requests = requests.get(
-        "http://localhost:8000/stats").json()["num_aborted_requests"]
+    num_aborted_requests = requests.get("http://localhost:8000/stats").json()[
+        "num_aborted_requests"
+    ]
     assert num_aborted_requests == 0

     # Try with 100 prompts
@@ -101,8 +105,9 @@ def test_api_server(api_server, distributed_executor_backend: str):
     # give it some times to update the stats
     time.sleep(1)

-    num_aborted_requests = requests.get(
-        "http://localhost:8000/stats").json()["num_aborted_requests"]
+    num_aborted_requests = requests.get("http://localhost:8000/stats").json()[
+        "num_aborted_requests"
+    ]
     assert num_aborted_requests > 0

     # check that server still runs after cancellations

tests/async_engine/test_async_llm_engine.py

Lines changed: 30 additions & 30 deletions

@@ -36,7 +36,6 @@ class MockModelConfig:


 class MockEngine:
-
     def __init__(self):
         self.step_calls = 0
         self.add_request_calls = 0
@@ -49,8 +48,7 @@ def __init__(self):
     async def step_async(self, virtual_engine):
         # PP size is 1, ignore virtual engine
         self.step_calls += 1
-        return [RequestOutput(
-            request_id=self.request_id)] if self.request_id else []
+        return [RequestOutput(request_id=self.request_id)] if self.request_id else []

     async def process_model_inputs_async(self, *args, **kwargs):
         pass
@@ -67,7 +65,7 @@ def stop_generating(self):
     def add_request(self, **kwargs):
         del kwargs  # Unused
         self.add_request_calls += 1
-        print(f'Request calls: {self.add_request_calls}')
+        print(f"Request calls: {self.add_request_calls}")

     async def add_request_async(self, **kwargs):
         self.add_request_calls += 1
@@ -142,9 +140,12 @@ def start_engine():
     print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")

     return AsyncLLMEngine.from_engine_args(
-        AsyncEngineArgs(model="facebook/opt-125m",
-                        enforce_eager=True,
-                        num_scheduler_steps=num_scheduler_steps))
+        AsyncEngineArgs(
+            model="facebook/opt-125m",
+            enforce_eager=True,
+            num_scheduler_steps=num_scheduler_steps,
+        )
+    )


 def uid() -> str:
@@ -157,8 +158,9 @@ async def async_engine():
     # scoped fixture and monkeypatch is function scoped.
     previous_value = os.getenv("VLLM_USE_V1", None)
     os.environ["VLLM_USE_V1"] = "0"
-    engine = await asyncio.get_event_loop().run_in_executor(executor=None,
-                                                            func=start_engine)
+    engine = await asyncio.get_event_loop().run_in_executor(
+        executor=None, func=start_engine
+    )
     try:
         yield engine
     finally:
@@ -182,7 +184,6 @@ def should_do_global_cleanup_after_test(request) -> bool:
 @pytest.mark.asyncio(scope="module")
 @pytest.mark.parametrize("stop", [None, ["a stop string"]])
 async def test_asyncio_run(async_engine, stop):
-
     scheduler_config = await async_engine.get_scheduler_config()
     num_scheduler_steps = scheduler_config.num_scheduler_steps

@@ -196,9 +197,9 @@ async def run(prompt: str):

         output_count = 0
         final_output = None
-        async for output in async_engine.generate(prompt,
-                                                  sampling_params,
-                                                  request_id=uid()):
+        async for output in async_engine.generate(
+            prompt, sampling_params, request_id=uid()
+        ):
             output_count += 1
             final_output = output
         return final_output, output_count
@@ -247,18 +248,19 @@ async def run(prompt: str, kind: RequestOutputKind):

         output_count = 0
         final_output = None
-        async for output in async_engine.generate(prompt,
-                                                  params,
-                                                  request_id=uid()):
+        async for output in async_engine.generate(prompt, params, request_id=uid()):
             output_count += 1
             final_output = output

         assert final_output is not None
         assert final_output.finished

-        return (final_output.prompt_token_ids,
-                final_output.outputs[0].token_ids,
-                final_output.outputs[0].text, output_count)
+        return (
+            final_output.prompt_token_ids,
+            final_output.outputs[0].token_ids,
+            final_output.outputs[0].text,
+            output_count,
+        )

     async def run_deltas(prompt: str):
         params = copy(sampling_params)
@@ -269,9 +271,7 @@ async def run_deltas(prompt: str):
         output_text = ""
         output_count = 0
         final_output = None
-        async for output in async_engine.generate(prompt,
-                                                  params,
-                                                  request_id=uid()):
+        async for output in async_engine.generate(prompt, params, request_id=uid()):
             token_ids = output.outputs[0].token_ids
             text = output.outputs[0].text
             final_output = output
@@ -298,7 +298,8 @@ async def run_deltas(prompt: str):
     results = await asyncio.gather(
         run("common input prompt", RequestOutputKind.CUMULATIVE),
         run("common input prompt", RequestOutputKind.FINAL_ONLY),
-        run_deltas("common input prompt"))
+        run_deltas("common input prompt"),
+    )

     # Make sure outputs are the same
     prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results)
@@ -342,9 +343,9 @@ async def test_cancellation(async_engine, stop):

     i = 0
     with pytest.raises(CancelledError):
-        async for output in async_engine.generate("test2",
-                                                  sampling_params,
-                                                  request_id=request_id):
+        async for output in async_engine.generate(
+            "test2", sampling_params, request_id=request_id
+        ):
             assert not output.finished
             i += 1
             if i == stop_at:
@@ -402,8 +403,7 @@ async def test_invalid_argument(async_engine):

     # Targeting specific DP rank only supported in v1 multi-instance DP
     with pytest.raises(ValueError):
-        async for _ in async_engine.generate("test",
-                                             sampling_params,
-                                             request_id=uid(),
-                                             data_parallel_rank=0):
+        async for _ in async_engine.generate(
+            "test", sampling_params, request_id=uid(), data_parallel_rank=0
+        ):
             pass
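
The test_cancellation hunk above exercises the pattern these reformats touch throughout: iterate an async generator and abort it mid-stream, expecting CancelledError. A self-contained sketch of that flow follows; fake_generate is a hypothetical stand-in for async_engine.generate:

import asyncio


async def fake_generate(prompt: str):
    # Hypothetical stand-in for async_engine.generate(): an async
    # generator yielding a stream of incremental outputs.
    for i in range(100):
        await asyncio.sleep(0.01)
        yield f"{prompt}-output-{i}"


async def consume() -> None:
    async for output in fake_generate("test2"):
        print(output)


async def main() -> None:
    task = asyncio.create_task(consume())
    await asyncio.sleep(0.05)  # let a few outputs stream first
    task.cancel()  # abort the request mid-stream
    try:
        await task
    except asyncio.CancelledError:
        # The condition the test asserts via pytest.raises(CancelledError).
        print("generation cancelled")


asyncio.run(main())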

tests/async_engine/test_request_tracker.py

Lines changed: 2 additions & 1 deletion

@@ -60,7 +60,8 @@ async def test_request_tracker():
     stream_5 = tracker.add_request("5")
     assert tracker.new_requests_event.is_set()
     tracker.process_request_output(
-        RequestOutput("2", "output", [], [], [], finished=True))
+        RequestOutput("2", "output", [], [], [], finished=True)
+    )
     await tracker.wait_for_new_requests()
     new, aborted = tracker.get_new_and_aborted_requests()
     assert not tracker.new_requests_event.is_set()
