Commit eb6dd74

Author: Andrew Xia (committed)

make a helper fct

Signed-off-by: Andrew Xia <axia@fb.com>

1 parent bf20956 · commit eb6dd74

File tree: 2 files changed, +39 -29 lines changed

tests/entrypoints/openai/test_response_api_with_harmony.py

Lines changed: 19 additions & 17 deletions
@@ -3,6 +3,7 @@

 import json
 import os
+import textwrap
 import time

 import pytest
@@ -771,43 +772,44 @@ async def test_output_messages_enabled(client: OpenAI, model_name: str,


 @pytest.fixture(scope="module")
-def server_with_mock(monkeypatch_module: pytest.MonkeyPatch, tmp_path_factory):
-    import textwrap
+def server_with_mock_render_for_completion(
+        monkeypatch_module: pytest.MonkeyPatch, tmp_path_factory):

     args = ["--enforce-eager", "--tool-server", "demo"]

     # Create a sitecustomize.py that patches render_for_completion
     # Python automatically imports sitecustomize on startup if it's in sys.path
     tmp_dir = tmp_path_factory.mktemp("test_setup")
     sitecustomize = tmp_dir / "sitecustomize.py"
-    sitecustomize.write_text(textwrap.dedent("""
+    sitecustomize.write_text(
+        textwrap.dedent("""
 import os
-if os.environ.get('VLLM_TEST_MOCK_LARGE_PROMPT') == '1':
-    from unittest.mock import patch
+from unittest.mock import patch

-    # Mock render_for_completion to return a large token list
-    def mock_render_for_completion(messages):
-        return list(range(1000000))  # Return 1M tokens for testing
+# Mock render_for_completion to return a large token list
+def mock_render_for_completion(messages):
+    return list(range(1000000))  # Return 1M tokens for testing

-    # Patch it at module level before it's imported
-    patch('vllm.entrypoints.harmony_utils.render_for_completion',
-          mock_render_for_completion).start()
+# Patch it at module level before it's imported
+patch('vllm.entrypoints.harmony_utils.render_for_completion',
+      mock_render_for_completion).start()
 """))

     with monkeypatch_module.context() as m:
         m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
-        m.setenv("VLLM_TEST_MOCK_LARGE_PROMPT", "1")
         # Add tmp_dir to PYTHONPATH so sitecustomize.py is found
-        current_pythonpath = os.environ.get("PYTHONPATH", "")
-        new_pythonpath = f"{tmp_dir}:{current_pythonpath}" if current_pythonpath else str(tmp_dir)
+        curr_path = os.environ.get("PYTHONPATH", "")
+        new_pythonpath = f"{tmp_dir}:{curr_path}" if curr_path else str(
+            tmp_dir)
         m.setenv("PYTHONPATH", new_pythonpath)
         with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
             yield remote_server


 @pytest_asyncio.fixture
-async def client_with_mock(server_with_mock):
-    async with server_with_mock.get_async_client() as async_client:
+async def client_with_mock(server_with_mock_render_for_completion):
+    async with server_with_mock_render_for_completion.get_async_client(
+    ) as async_client:
         yield async_client


@@ -822,7 +824,7 @@ async def test_prompt_length_exceeds_max_model_len(client_with_mock: OpenAI,
         input="hello",
     )

-    # Verify the error message matches what's expected from lines 287-294
+    # Verify the error message matches what's expected
     error = exc_info.value
     assert "'The engine prompt length" in str(error)
     assert "Please reduce prompt" in str(error)

vllm/entrypoints/openai/serving_responses.py

Lines changed: 20 additions & 12 deletions
@@ -189,6 +189,23 @@ def __init__(

         self.tool_server = tool_server

+    def _validate_generator_input(
+            self,
+            engine_prompt: EngineTokensPrompt) -> Optional[ErrorResponse]:
+        """Add validations to the input to the generator here."""
+        if self.max_model_len <= len(engine_prompt["prompt_token_ids"]):
+            error_message = (
+                "The engine prompt length"
+                f" {len(engine_prompt['prompt_token_ids'])} "
+                f"exceeds the max_model_len {self.max_model_len}. "
+                "Please reduce prompt.")
+            return self.create_error_response(
+                err_type="invalid_request_error",
+                message=error_message,
+                status_code=HTTPStatus.BAD_REQUEST,
+            )
+        return None
+
     async def create_responses(
         self,
         request: ResponsesRequest,
@@ -284,19 +301,10 @@ async def create_responses(
         available_tools = []
         try:
             for i, engine_prompt in enumerate(engine_prompts):
+                maybe_error = self._validate_generator_input(engine_prompt)
+                if maybe_error is not None:
+                    return maybe_error

-                if self.max_model_len <= len(
-                        engine_prompt["prompt_token_ids"]):
-                    error_message = (
-                        "The engine prompt length"
-                        f" {len(engine_prompt['prompt_token_ids'])} "
-                        f"exceeds the max_model_len {self.max_model_len}. "
-                        "Please reduce prompt.")
-                    return self.create_error_response(
-                        err_type="invalid_request_error",
-                        message=error_message,
-                        status_code=HTTPStatus.BAD_REQUEST,
-                    )
                 default_max_tokens = self.max_model_len - len(
                     engine_prompt["prompt_token_ids"])

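The new _validate_generator_input helper keeps create_responses readable: it returns an ErrorResponse when the tokenized prompt cannot fit in max_model_len and None otherwise, and the caller short-circuits on the first failure. A simplified, self-contained sketch of that return-error-or-None pattern follows; the ErrorResponse dataclass and ResponsesServing class are stand-ins for the real vLLM types, not the actual implementation.

# sketch: hypothetical stand-ins for the vLLM serving classes.
from dataclasses import dataclass
from http import HTTPStatus
from typing import Optional

@dataclass
class ErrorResponse:
    err_type: str
    message: str
    status_code: int

class ResponsesServing:
    def __init__(self, max_model_len: int):
        self.max_model_len = max_model_len

    def _validate_generator_input(
            self, prompt_token_ids: list[int]) -> Optional[ErrorResponse]:
        # Return an error if the prompt cannot fit in the model context.
        if self.max_model_len <= len(prompt_token_ids):
            return ErrorResponse(
                err_type="invalid_request_error",
                message=(f"The engine prompt length {len(prompt_token_ids)} "
                         f"exceeds the max_model_len {self.max_model_len}. "
                         "Please reduce prompt."),
                status_code=HTTPStatus.BAD_REQUEST,
            )
        return None

    def create_responses(self, engine_prompts: list[list[int]]):
        for prompt_token_ids in engine_prompts:
            # Short-circuit on the first invalid prompt, as in the diff.
            maybe_error = self._validate_generator_input(prompt_token_ids)
            if maybe_error is not None:
                return maybe_error
            # ... generation for valid prompts would continue here ...
        return "ok"

# A 5-token prompt against a 4-token context limit triggers the error path.
serving = ResponsesServing(max_model_len=4)
assert isinstance(serving.create_responses([[1, 2, 3, 4, 5]]), ErrorResponse)

Returning an optional error instead of raising keeps the caller's try block focused on engine failures and makes the length check easy to unit test on its own, which is what the fixture and test in the first file exercise end to end.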