2525from vllm .model_executor .guided_decoding import (
2626 get_guided_decoding_logits_processor )
2727from vllm .model_executor .layers .sampler import SamplerOutput
28- from vllm .outputs import EmbeddingRequestOutput , RequestOutput
28+ from vllm .outputs import PoolingRequestOutput , RequestOutput
2929from vllm .pooling_params import PoolingParams
3030from vllm .prompt_adapter .request import PromptAdapterRequest
3131from vllm .sampling_params import SamplingParams
@@ -74,7 +74,7 @@ def _log_task_completion(task: asyncio.Task,
7474
7575
7676class AsyncStream :
77- """A stream of RequestOutputs or EmbeddingRequestOutputs for a request
77+ """A stream of RequestOutputs or PoolingRequestOutputs for a request
7878 that can be iterated over asynchronously via an async generator."""
7979
8080 def __init__ (self , request_id : str , cancel : Callable [[str ], None ]) -> None :
@@ -83,7 +83,7 @@ def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None:
8383 self ._queue : asyncio .Queue = asyncio .Queue ()
8484 self ._finished = False
8585
86- def put (self , item : Union [RequestOutput , EmbeddingRequestOutput ,
86+ def put (self , item : Union [RequestOutput , PoolingRequestOutput ,
8787 Exception ]) -> None :
8888 if not self ._finished :
8989 self ._queue .put_nowait (item )
@@ -103,7 +103,7 @@ def finished(self) -> bool:
103103
104104 async def generator (
105105 self
106- ) -> AsyncGenerator [Union [RequestOutput , EmbeddingRequestOutput ], None ]:
106+ ) -> AsyncGenerator [Union [RequestOutput , PoolingRequestOutput ], None ]:
107107 try :
108108 while True :
109109 result = await self ._queue .get ()
@@ -154,7 +154,7 @@ def propagate_exception(self,
154154
155155 def process_request_output (self ,
156156 request_output : Union [RequestOutput ,
157- EmbeddingRequestOutput ],
157+ PoolingRequestOutput ],
158158 * ,
159159 verbose : bool = False ) -> None :
160160 """Process a request output from the engine."""
@@ -265,7 +265,7 @@ def __init__(self, *args, **kwargs):
265265
266266 async def step_async (
267267 self , virtual_engine : int
268- ) -> List [Union [RequestOutput , EmbeddingRequestOutput ]]:
268+ ) -> List [Union [RequestOutput , PoolingRequestOutput ]]:
269269 """Performs one decoding iteration and returns newly generated results.
270270 The workers are run asynchronously if possible.
271271
@@ -907,7 +907,7 @@ def add_request(
907907 prompt_adapter_request : Optional [PromptAdapterRequest ] = None ,
908908 priority : int = 0 ,
909909 ) -> Coroutine [None , None , AsyncGenerator [Union [
910- RequestOutput , EmbeddingRequestOutput ], None ]]:
910+ RequestOutput , PoolingRequestOutput ], None ]]:
911911 ...
912912
913913 @overload
@@ -922,7 +922,7 @@ def add_request(
922922 prompt_adapter_request : Optional [PromptAdapterRequest ] = None ,
923923 priority : int = 0 ,
924924 ) -> Coroutine [None , None , AsyncGenerator [Union [
925- RequestOutput , EmbeddingRequestOutput ], None ]]:
925+ RequestOutput , PoolingRequestOutput ], None ]]:
926926 ...
927927
928928 @deprecate_kwargs (
@@ -941,7 +941,7 @@ async def add_request(
941941 priority : int = 0 ,
942942 * ,
943943 inputs : Optional [PromptType ] = None , # DEPRECATED
944- ) -> AsyncGenerator [Union [RequestOutput , EmbeddingRequestOutput ], None ]:
944+ ) -> AsyncGenerator [Union [RequestOutput , PoolingRequestOutput ], None ]:
945945 if inputs is not None :
946946 prompt = inputs
947947 assert prompt is not None and params is not None
@@ -1070,7 +1070,7 @@ async def encode(
10701070 lora_request : Optional [LoRARequest ] = None ,
10711071 trace_headers : Optional [Mapping [str , str ]] = None ,
10721072 priority : int = 0 ,
1073- ) -> AsyncGenerator [EmbeddingRequestOutput , None ]:
1073+ ) -> AsyncGenerator [PoolingRequestOutput , None ]:
10741074 """Generate outputs for a request from an embedding model.
10751075
10761076 Generate outputs for a request. This method is a coroutine. It adds the
@@ -1088,7 +1088,7 @@ async def encode(
10881088 Only applicable with priority scheduling.
10891089
10901090 Yields:
1091- The output `EmbeddingRequestOutput ` objects from the LLMEngine
1091+ The output `PoolingRequestOutput ` objects from the LLMEngine
10921092 for the request.
10931093
10941094 Details:
@@ -1141,7 +1141,7 @@ async def encode(
11411141 trace_headers = trace_headers ,
11421142 priority = priority ,
11431143 ):
1144- yield LLMEngine .validate_output (output , EmbeddingRequestOutput )
1144+ yield LLMEngine .validate_output (output , PoolingRequestOutput )
11451145
11461146 async def abort (self , request_id : str ) -> None :
11471147 """Abort a request.
0 commit comments