from fastapi import Request

+from vllm import envs
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
                                              ScoreResponseData, UsageInfo)
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+# yapf conflicts with isort for this block
+# yapf: disable
from vllm.entrypoints.score_utils import (ScoreContentPartParam,
                                          ScoreMultiModalParam,
                                          _cosine_similarity,
                                          _validate_score_input_lens,
+                                          compress_token_type_ids,
                                          get_score_prompt)
+# yapf: enable
from vllm.entrypoints.utils import _validate_truncation_size
from vllm.inputs.data import TokensPrompt
from vllm.logger import init_logger
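
The newly imported `compress_token_type_ids` is defined in `vllm.entrypoints.score_utils` and is not shown in this diff. Cross-encoder inputs carry token type ids that are a run of zeros (the first text) followed by a run of ones (the second text), so the whole list can be reduced to a single integer: the index of the first 1. A minimal sketch of that contract, assuming exactly this zeros-then-ones invariant:

```python
def compress_token_type_ids(token_type_ids: list[int]) -> int:
    """Reduce a zeros-then-ones token_type_ids list to the index of the
    first 1 (or len(token_type_ids) if the list is all zeros)."""
    first_one = len(token_type_ids)
    for i, type_id in enumerate(token_type_ids):
        if type_id == 1 and i < first_one:
            first_one = i
        elif type_id == 0 and i > first_one:
            # A 0 after a 1 breaks the zeros-then-ones assumption.
            raise ValueError("token_type_ids must be a sequence of zeros "
                             "followed by a sequence of ones")
    return first_one
```
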
@@ -158,6 +163,8 @@ def _preprocess_score(
            tokenizer=tokenizer,
            tokenization_kwargs=tokenization_kwargs,
        )
+        self._validate_input(request, engine_prompt["prompt_token_ids"],
+                             full_prompt)
        if request.mm_processor_kwargs is not None:
            engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs

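
Hoisting `_validate_input` into `_preprocess_score` length-checks the pair right after `get_score_prompt` builds it, instead of in the per-branch code removed below. For score requests the check amounts to rejecting prompts that exceed the model's context window; a rough standalone rendering (the function name and return shape here are illustrative, not the exact `OpenAIServing` implementation):

```python
def validate_score_input(input_ids: list[int], input_text: str,
                         max_model_len: int) -> dict:
    """Illustrative length check: score/pooling prompts must fit entirely
    in the context window, since nothing is generated afterwards."""
    if len(input_ids) > max_model_len:
        raise ValueError(
            f"This model's maximum context length is {max_model_len} "
            f"tokens, but the request has {len(input_ids)} input tokens.")
    return {"prompt": input_text, "prompt_token_ids": input_ids}
```
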
@@ -188,64 +195,27 @@ async def _cross_encoding_score(

        input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

-        if self.model_config.is_multimodal_model:
+        preprocess_async = make_async(self._preprocess_score,
+                                      executor=self._tokenizer_executor)

-            preprocess_async = make_async(self._preprocess_score,
-                                          executor=self._tokenizer_executor)
+        preprocessed_prompts = await asyncio.gather(
+            *(preprocess_async(request=request,
+                               tokenizer=tokenizer,
+                               tokenization_kwargs=tokenization_kwargs,
+                               data_1=t1,
+                               data_2=t2) for t1, t2 in input_pairs))

-            preprocessed_prompts = await asyncio.gather(
-                *(preprocess_async(request=request,
-                                   tokenizer=tokenizer,
-                                   tokenization_kwargs=tokenization_kwargs,
-                                   data_1=t1,
-                                   data_2=t2) for t1, t2 in input_pairs))
-
-            for full_prompt, engine_prompt in preprocessed_prompts:
-                request_prompts.append(full_prompt)
-                engine_prompts.append(engine_prompt)
-
-        else:
-            tokenize_async = make_async(tokenizer.__call__,
-                                        executor=self._tokenizer_executor)
-            use_pad_token = self.model_config.use_pad_token
-
-            if use_pad_token:
-                # cross_encoder models defaults to using pad_token.
-                tokenized_prompts = await asyncio.gather(*(
-                    tokenize_async(
-                        text=t1,  # type: ignore[arg-type]
-                        text_pair=t2,  # type: ignore[arg-type]
-                        **tokenization_kwargs) for t1, t2 in input_pairs))
-            else:
-                # `llm as reranker` models defaults to not using pad_token.
-                tokenized_prompts = await asyncio.gather(*(
-                    tokenize_async(
-                        text=t1 +  # type: ignore[operator]
-                        t2,
-                        **tokenization_kwargs) for t1, t2 in input_pairs))
-
-            for prompt_inputs, (t1, t2) in zip(tokenized_prompts, input_pairs):
-                sep_token = tokenizer.sep_token if (tokenizer.sep_token
-                                                    and use_pad_token) else ''
-                request_prompt = f"{t1}{sep_token}{t2}"
-
-                input_ids = prompt_inputs["input_ids"]
-                text_token_prompt = \
-                    self._validate_input(request, input_ids, request_prompt)
-                engine_prompt = TokensPrompt(
-                    prompt_token_ids=text_token_prompt["prompt_token_ids"],
-                    token_type_ids=prompt_inputs.get("token_type_ids"))
-
-                request_prompts.append(request_prompt)
-                engine_prompts.append(engine_prompt)
+        for full_prompt, engine_prompt in preprocessed_prompts:
+            request_prompts.append(full_prompt)
+            engine_prompts.append(engine_prompt)

        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []

-        pooling_params = request.to_pooling_params()
+        default_pooling_params = request.to_pooling_params()

        try:
-            pooling_params.verify("score", self.model_config)
+            default_pooling_params.verify("score", self.model_config)
        except ValueError as e:
            return self.create_error_response(str(e))

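
With the multimodal/text branches collapsed, every pair now flows through `make_async`, vLLM's helper for running blocking tokenizer work on an executor so that `asyncio.gather` can fan the pairs out concurrently. A minimal sketch of the pattern, assuming the general shape of the helper in `vllm.utils`:

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial


def make_async(func, executor=None):
    """Wrap a blocking callable so each call returns an awaitable that
    runs on the given executor (a sketch, not the exact vLLM helper)."""
    def _async_wrapper(*args, **kwargs):
        loop = asyncio.get_running_loop()
        return loop.run_in_executor(executor, partial(func, *args, **kwargs))
    return _async_wrapper


def slow_preprocess(pair):  # stands in for the blocking _preprocess_score
    return f"processed {pair}"


async def main():
    pool = ThreadPoolExecutor(max_workers=4)
    preprocess_async = make_async(slow_preprocess, executor=pool)
    results = await asyncio.gather(
        *(preprocess_async(p) for p in [("q1", "d1"), ("q2", "d2")]))
    print(results)


asyncio.run(main())
```
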
@@ -254,9 +224,19 @@ async def _cross_encoding_score(

            self._log_inputs(request_id_item,
                             request_prompts[i],
-                             params=pooling_params,
+                             params=default_pooling_params,
                             lora_request=lora_request)

+            if envs.VLLM_USE_V1 and (token_type_ids := engine_prompt.pop(
+                    "token_type_ids", None)):
+                pooling_params = default_pooling_params.clone()
+                compressed = compress_token_type_ids(token_type_ids)
+                pooling_params.extra_kwargs = {
+                    "compressed_token_type_ids": compressed
+                }
+            else:
+                pooling_params = default_pooling_params
+
            generator = self.engine_client.encode(
                engine_prompt,
                pooling_params,
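
On the V1 path the full `token_type_ids` list no longer travels with the engine prompt; only the compressed split point is attached to a cloned `PoolingParams` via `extra_kwargs`, and the consumer can rebuild the list because it already knows the prompt length. An illustrative round trip (the reconstruction step is hypothetical, written out only for the example):

```python
token_type_ids = [0, 0, 0, 1, 1]   # 3 query tokens, then 2 document tokens
split = token_type_ids.index(1)    # what compress_token_type_ids yields: 3
num_prompt_tokens = len(token_type_ids)  # known on the model-runner side
restored = [0] * split + [1] * (num_prompt_tokens - split)
assert restored == token_type_ids
```
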