[Core] Rename PromptInputs and inputs(vllm-project#8673)

agt · Sep 24, 2024 · f072d49 · f072d49
1 parent 21d1a4b
commit f072d49
Show file tree

Hide file tree

Showing 18 changed files with 157 additions and 162 deletions.
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
@@ -11,7 +11,7 @@
 
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
-from vllm.inputs import PromptInputs
+from vllm.inputs import PromptType
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser
 
@@ -61,7 +61,7 @@ def main(args: argparse.Namespace):
     dummy_prompt_token_ids = np.random.randint(10000,
                                                size=(args.batch_size,
                                                      args.input_len))
-    dummy_inputs: List[PromptInputs] = [{
+    dummy_prompts: List[PromptType] = [{
         "prompt_token_ids": batch
     } for batch in dummy_prompt_token_ids.tolist()]
 
@@ -74,13 +74,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
                     ],
                     on_trace_ready=torch.profiler.tensorboard_trace_handler(
                         str(profile_dir))) as p:
-                llm.generate(dummy_inputs,
+                llm.generate(dummy_prompts,
                              sampling_params=sampling_params,
                              use_tqdm=False)
             print(p.key_averages())
         else:
             start_time = time.perf_counter()
-            llm.generate(dummy_inputs,
+            llm.generate(dummy_prompts,
                          sampling_params=sampling_params,
                          use_tqdm=False)
             end_time = time.perf_counter()

diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
@@ -8,7 +8,7 @@ Multi-Modality
 vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
 
 Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
-via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptInputs`.
+via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptType`.
 
 Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
 by following :ref:`this guide <adding_multimodal_plugin>`.

diff --git a/docs/source/dev/offline_inference/llm_inputs.rst b/docs/source/dev/offline_inference/llm_inputs.rst
@@ -1,7 +1,7 @@
 LLM Inputs
 ==========
 
-.. autodata:: vllm.inputs.PromptInputs
+.. autodata:: vllm.inputs.PromptType
 
 .. autoclass:: vllm.inputs.TextPrompt
     :show-inheritance:

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
@@ -27,7 +27,7 @@ The :class:`~vllm.LLM` class can be instantiated in much the same way as languag
     We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
     the above snippet. Specifically, ``image_feature_size`` can no longer be specified as we now calculate that internally for each model.
 
-To pass an image to the model, note the following in :class:`vllm.inputs.PromptInputs`:
+To pass an image to the model, note the following in :class:`vllm.inputs.PromptType`:
 
 * ``prompt``: The prompt should follow the format that is documented on HuggingFace.
 * ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. 

diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py
@@ -61,15 +61,15 @@ async def test_evil_forward(tmp_socket):
 
         # Throws an error in first forward pass.
         with pytest.raises(RAISED_ERROR):
-            async for _ in client.generate(inputs="Hello my name is",
+            async for _ in client.generate(prompt="Hello my name is",
                                            sampling_params=SamplingParams(),
                                            request_id=uuid.uuid4()):
                 pass
         assert client.errored
 
         # Engine is errored, should get ENGINE_DEAD_ERROR.
         with pytest.raises(MQEngineDeadError):
-            async for _ in client.generate(inputs="Hello my name is",
+            async for _ in client.generate(prompt="Hello my name is",
                                            sampling_params=SamplingParams(),
                                            request_id=uuid.uuid4()):
                 pass
@@ -118,7 +118,7 @@ async def test_failed_health_check(tmp_socket):
 
         # Generate call should throw ENGINE_DEAD_ERROR
         with pytest.raises(MQEngineDeadError):
-            async for _ in client.generate(inputs="Hello my name is",
+            async for _ in client.generate(prompt="Hello my name is",
                                            sampling_params=SamplingParams(),
                                            request_id=uuid.uuid4()):
                 pass
@@ -165,7 +165,7 @@ async def bad_abort_after_2s():
         # with reference to the original KeyError("foo")
         with pytest.raises(MQEngineDeadError) as execinfo:
             async for _ in client.generate(
-                    inputs="Hello my name is",
+                    prompt="Hello my name is",
                     sampling_params=SamplingParams(max_tokens=2000),
                     request_id=uuid.uuid4()):
                 pass
@@ -190,7 +190,7 @@ async def test_bad_request(tmp_socket):
 
         # Invalid request should fail, but not crash the server.
         with pytest.raises(ValueError):
-            async for _ in client.generate(inputs="Hello my name is",
+            async for _ in client.generate(prompt="Hello my name is",
                                            sampling_params=SamplingParams(),
                                            request_id="abcd-1",
                                            lora_request=LoRARequest(
@@ -199,7 +199,7 @@ async def test_bad_request(tmp_socket):
                 pass
 
         # This request should be okay.
-        async for _ in client.generate(inputs="Hello my name is",
+        async for _ in client.generate(prompt="Hello my name is",
                                        sampling_params=SamplingParams(),
                                        request_id="abcd-2"):
             pass

diff --git a/tests/mq_llm_engine/utils.py b/tests/mq_llm_engine/utils.py
@@ -20,7 +20,7 @@ async def generate(
     count = 0
     async for out in client.generate(
             request_id=request_id,
-            inputs="Hello my name is Robert and",
+            prompt="Hello my name is Robert and",
             sampling_params=SamplingParams(max_tokens=num_tokens,
                                            temperature=0)):
 

diff --git a/vllm/__init__.py b/vllm/__init__.py
@@ -5,7 +5,7 @@
 from vllm.engine.llm_engine import LLMEngine
 from vllm.entrypoints.llm import LLM
 from vllm.executor.ray_utils import initialize_ray_cluster
-from vllm.inputs import PromptInputs, TextPrompt, TokensPrompt
+from vllm.inputs import PromptType, TextPrompt, TokensPrompt
 from vllm.model_executor.models import ModelRegistry
 from vllm.outputs import (CompletionOutput, EmbeddingOutput,
                           EmbeddingRequestOutput, RequestOutput)
@@ -19,7 +19,7 @@
     "__version__",
     "LLM",
     "ModelRegistry",
-    "PromptInputs",
+    "PromptType",
     "TextPrompt",
     "TokensPrompt",
     "SamplingParams",

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
@@ -17,7 +17,7 @@
 from vllm.executor.executor_base import ExecutorAsyncBase
 from vllm.executor.gpu_executor import GPUExecutorAsync
 from vllm.executor.ray_utils import initialize_ray_cluster
-from vllm.inputs import PromptInputs
+from vllm.inputs import PromptType
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
@@ -405,7 +405,7 @@ async def stop_remote_worker_execution_loop_async(self) -> None:
     async def add_request_async(
         self,
         request_id: str,
-        inputs: PromptInputs,
+        prompt: PromptType,
         params: Union[SamplingParams, PoolingParams],
         arrival_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,
@@ -420,7 +420,7 @@ async def add_request_async(
             arrival_time = time.time()
 
         preprocessed_inputs = await self.input_preprocessor.preprocess_async(
-            inputs,
+            prompt,
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
@@ -777,7 +777,7 @@ async def run_engine_loop(engine_ref: ReferenceType):
     async def add_request(
         self,
         request_id: str,
-        inputs: PromptInputs,
+        prompt: PromptType,
         params: Union[SamplingParams, PoolingParams],
         arrival_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,
@@ -797,7 +797,7 @@ async def add_request(
         stream = self._request_tracker.add_request(
             request_id,
             verbose=self.log_requests,
-            inputs=inputs,
+            prompt=prompt,
             params=params,
             arrival_time=arrival_time or time.time(),
             lora_request=lora_request,
@@ -808,7 +808,7 @@ async def add_request(
 
     async def generate(
         self,
-        inputs: PromptInputs,
+        prompt: PromptType,
         sampling_params: SamplingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
@@ -822,8 +822,7 @@ async def generate(
         from the LLMEngine to the caller.
 
         Args:
-            inputs: The inputs to the LLM. See
-                :class:`~vllm.inputs.PromptInputs`
+            prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
                 for more details about the format of each input.
             sampling_params: The sampling parameters of the request.
             request_id: The unique id of the request.
@@ -881,7 +880,7 @@ async def generate(
         """
         async for output in await self.add_request(
                 request_id,
-                inputs,
+                prompt,
                 sampling_params,
                 lora_request=lora_request,
                 trace_headers=trace_headers,
@@ -891,7 +890,7 @@ async def generate(
 
     async def encode(
         self,
-        inputs: PromptInputs,
+        prompt: PromptType,
         pooling_params: PoolingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
@@ -904,8 +903,7 @@ async def encode(
         from the LLMEngine to the caller.
 
         Args:
-            inputs: The inputs to the LLM. See
-                :class:`~vllm.inputs.PromptInputs`
+            prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
                 for more details about the format of each input.
             pooling_params: The pooling parameters of the request.
             request_id: The unique id of the request.
@@ -959,7 +957,7 @@ async def encode(
         """
         async for output in await self.add_request(
                 request_id,
-                inputs,
+                prompt,
                 pooling_params,
                 lora_request=lora_request,
                 trace_headers=trace_headers,

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
@@ -29,7 +29,7 @@
 from vllm.executor.gpu_executor import GPUExecutor
 from vllm.executor.ray_utils import initialize_ray_cluster
 from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs,
-                         InputRegistry, LLMInputs, PromptInputs)
+                         InputRegistry, LLMInputs, PromptType)
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -680,7 +680,7 @@ def stop_remote_worker_execution_loop(self) -> None:
     def add_request(
         self,
         request_id: str,
-        inputs: PromptInputs,
+        prompt: PromptType,
         params: Union[SamplingParams, PoolingParams],
         arrival_time: Optional[float] = None,
         lora_request: Optional[LoRARequest] = None,
@@ -695,8 +695,7 @@ def add_request(
 
         Args:
             request_id: The unique ID of the request.
-            inputs: The inputs to the LLM. See
-                :class:`~vllm.inputs.PromptInputs`
+            prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
                 for more details about the format of each input.
             params: Parameters for sampling or pooling.
                 :class:`~vllm.SamplingParams` for text generation.
@@ -736,7 +735,7 @@ def add_request(
             arrival_time = time.time()
 
         preprocessed_inputs = self.input_preprocessor.preprocess(
-            inputs,
+            prompt,
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,

diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py
@@ -3,7 +3,7 @@
 from typing import List, Mapping, Optional, Union
 
 from vllm import PoolingParams
-from vllm.inputs import PromptInputs
+from vllm.inputs import PromptType
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
@@ -23,7 +23,7 @@ class MQEngineDeadError(RuntimeError):
 
 @dataclass
 class RPCProcessRequest:
-    inputs: PromptInputs
+    prompt: PromptType
     params: Union[SamplingParams, PoolingParams]
     request_id: str
     lora_request: Optional[LoRARequest] = None

diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
@@ -25,7 +25,7 @@
                                          RPCStartupResponse)
 # yapf: enable
 from vllm.envs import VLLM_RPC_TIMEOUT
-from vllm.inputs import PromptInputs
+from vllm.inputs import PromptType
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.outputs import EmbeddingRequestOutput, RequestOutput
@@ -375,7 +375,7 @@ def dead_error(self) -> BaseException:
 
     def generate(
         self,
-        inputs: PromptInputs,
+        prompt: PromptType,
         sampling_params: SamplingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
@@ -389,8 +389,7 @@ def generate(
         from the LLMEngine to the caller.
 
         Args:
-            inputs: The inputs to the LLM. See
-                :class:`~vllm.inputs.PromptInputs`
+            prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
                 for more details about the format of each input.
             sampling_params: The sampling parameters of the request.
             request_id: The unique id of the request.
@@ -399,13 +398,13 @@ def generate(
             prompt_adapter_request: Prompt Adapter request to use
                                             for generation, if any.
         """
-        return self._process_request(inputs, sampling_params, request_id,
+        return self._process_request(prompt, sampling_params, request_id,
                                      lora_request, trace_headers,
                                      prompt_adapter_request)
 
     def encode(
         self,
-        inputs: PromptInputs,
+        prompt: PromptType,
         pooling_params: PoolingParams,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
@@ -418,8 +417,7 @@ def encode(
         from the LLMEngine to the caller.
 
         Args:
-            inputs: The inputs to the LLM. See
-                :class:`~vllm.inputs.PromptInputs`
+            prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
                 for more details about the format of each input.
             pooling_params: The pooling parameters of the request.
             request_id: The unique id of the request.
@@ -430,12 +428,12 @@ def encode(
             The output `EmbeddingRequestOutput` objects from the LLMEngine
             for the request.
         """
-        return self._process_request(inputs, pooling_params, request_id,
+        return self._process_request(prompt, pooling_params, request_id,
                                      lora_request, trace_headers)
 
     async def _process_request(
         self,
-        inputs: PromptInputs,
+        prompt: PromptType,
         params: Union[SamplingParams, PoolingParams],
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
@@ -468,7 +466,7 @@ async def _process_request(
 
             request_bytes = pickle.dumps(
                 RPCProcessRequest(
-                    inputs=inputs,
+                    prompt=prompt,
                     params=params,
                     request_id=request_id,
                     lora_request=lora_request,

diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
@@ -245,7 +245,7 @@ def _handle_process_request(self, request: RPCProcessRequest):
         try:
             self.engine.add_request(
                 request_id=request_id,
-                inputs=request.inputs,
+                prompt=request.prompt,
                 params=request.params,
                 lora_request=request.lora_request,
                 trace_headers=request.trace_headers,