Commit da6ea29

[V1] Avoid redundant input processing in n>1 case (#14985)
Signed-off-by: Nick Hill <nhill@redhat.com>
1 parent 7297941 commit da6ea29

File tree

13 files changed: +82 additions, -142 deletions

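At the API level, this commit removes the `request_id` parameter from the tokenizer-group and input-preprocessing call chain, so prompt processing no longer has to be repeated per request id when a single prompt fans out into multiple requests (the n>1 case). A minimal before/after sketch of the call-site change; the `TokenizerGroup` import path and constructor arguments below are assumptions for illustration, while the `encode` keyword arguments match the signatures in this diff:

```python
# Sketch only: illustrates the call-site change introduced by this commit.
# The TokenizerGroup construction is an assumption for illustration; the
# encode() keyword arguments follow the new signature shown in this diff.
from vllm.transformers_utils.tokenizer_group import TokenizerGroup

tokenizer_group = TokenizerGroup(
    tokenizer_id="gpt2",
    enable_lora=False,
    max_num_seqs=1,
    max_input_length=None,
)

# Before: tokenizer_group.encode(request_id="request_id",
#                                prompt="prompt", lora_request=None)
# After: no request_id is threaded through at all.
token_ids = tokenizer_group.encode(prompt="prompt", lora_request=None)
print(token_ids)
```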

tests/lora/test_tokenizer_group.py

Lines changed: 2 additions & 4 deletions
@@ -24,12 +24,10 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
     )
     lora_request = LoRARequest("1", 1, sql_lora_files)
     assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
-        request_id="request_id", prompt="prompt", lora_request=lora_request)
+        prompt="prompt", lora_request=lora_request)
     assert reference_tokenizer.encode(
         "prompt") == await tokenizer_group.encode_async(
-            request_id="request_id",
-            prompt="prompt",
-            lora_request=lora_request)
+            prompt="prompt", lora_request=lora_request)
     assert isinstance(tokenizer_group.get_lora_tokenizer(None),
                       PreTrainedTokenizerBase)
     assert tokenizer_group.get_lora_tokenizer(

tests/tokenization/test_tokenizer_group.py

Lines changed: 9 additions & 18 deletions
@@ -41,10 +41,10 @@ async def test_tokenizer_group(tokenizer_group_type):
         max_input_length=None,
     )
     assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
-        request_id="request_id", prompt="prompt", lora_request=None)
+        prompt="prompt", lora_request=None)
     assert reference_tokenizer.encode(
-        "prompt") == await tokenizer_group.encode_async(
-            request_id="request_id", prompt="prompt", lora_request=None)
+        "prompt") == await tokenizer_group.encode_async(prompt="prompt",
+                                                        lora_request=None)
     assert isinstance(tokenizer_group.get_lora_tokenizer(None),
                       PreTrainedTokenizerBase)
     assert tokenizer_group.get_lora_tokenizer(
@@ -69,8 +69,7 @@ async def test_tokenizer_group_pool(tokenizer_group_type):
     # and check that all requests are processed correctly.
     num_requests = tokenizer_group_pool.pool_size * 5
     requests = [
-        tokenizer_group_pool.encode_async(request_id=str(i),
-                                          prompt=f"prompt {i}",
+        tokenizer_group_pool.encode_async(prompt=f"prompt {i}",
                                           lora_request=None)
         for i in range(num_requests)
     ]
@@ -161,12 +160,8 @@ class FailingRayTokenizerGroupPool(RayTokenizerGroupPool):
     fail_at[0] = 1000
 
     # We should recover successfully.
-    await tokenizer_group_pool.encode_async(request_id="1",
-                                            prompt="prompt",
-                                            lora_request=None)
-    await tokenizer_group_pool.encode_async(request_id="1",
-                                            prompt="prompt",
-                                            lora_request=None)
+    await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None)
+    await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None)
 
     # Check that we have a new actor
     assert len(tokenizer_group_pool.tokenizer_actors) == len(tokenizer_actors)
@@ -184,8 +179,7 @@ class FailingRayTokenizerGroupPool(RayTokenizerGroupPool):
 
     # We should fail after re-initialization.
     with pytest.raises(RuntimeError):
-        await tokenizer_group_pool.encode_async(request_id="1",
-                                                prompt="prompt",
+        await tokenizer_group_pool.encode_async(prompt="prompt",
                                                 lora_request=None)
 
     # check_health should raise the same thing
@@ -206,11 +200,8 @@ class FailingRayTokenizerGroupPool(RayTokenizerGroupPool):
 
     # Prompt too long error
     with pytest.raises(ValueError):
-        await tokenizer_group_pool.encode_async(request_id="1",
-                                                prompt="prompt" * 100,
+        await tokenizer_group_pool.encode_async(prompt="prompt" * 100,
                                                 lora_request=None)
-    await tokenizer_group_pool.encode_async(request_id="1",
-                                            prompt="prompt",
-                                            lora_request=None)
+    await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None)
     # Actors should stay the same.
     assert tokenizer_group_pool.tokenizer_actors == tokenizer_actors
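As the updated pool test suggests, per-call request IDs are no longer needed to fan tokenization out across the pool. A short sketch of that usage pattern; the pool object is assumed to already be constructed (e.g. a `RayTokenizerGroupPool`, as in this test file):

```python
import asyncio

# Sketch of the concurrent usage exercised above: many encode_async()
# calls, none of which pass a request_id. `pool` is assumed to be an
# existing tokenizer group pool such as RayTokenizerGroupPool.
async def encode_many(pool, num_requests: int) -> list:
    requests = [
        pool.encode_async(prompt=f"prompt {i}", lora_request=None)
        for i in range(num_requests)
    ]
    return await asyncio.gather(*requests)
```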

vllm/engine/async_llm_engine.py

Lines changed: 0 additions & 1 deletion
@@ -492,7 +492,6 @@ async def add_request_async(
 
         preprocessed_inputs = await self.input_preprocessor.preprocess_async(
             prompt,
-            request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
         )

vllm/engine/llm_engine.py

Lines changed: 0 additions & 1 deletion
@@ -783,7 +783,6 @@ def add_request(
 
         preprocessed_inputs = self.input_preprocessor.preprocess(
             prompt,
-            request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
         )

vllm/engine/protocol.py

Lines changed: 1 addition & 4 deletions
@@ -81,10 +81,7 @@ async def beam_search(
         if is_explicit_encoder_decoder_prompt(prompt):
             raise NotImplementedError
         else:
-            processed_inputs = preprocessor._prompt_to_llm_inputs(
-                prompt,
-                request_id=request_id,
-            )
+            processed_inputs = preprocessor._prompt_to_llm_inputs(prompt)
 
         prompt_token_ids = processed_inputs["prompt_token_ids"]
         prompt_text = processed_inputs.get("prompt")
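The `beam_search` path now passes only the prompt to the preprocessor. A hedged sketch of that step as a standalone helper; `extract_prompt_inputs` is a hypothetical name, and `_prompt_to_llm_inputs` is the private helper called exactly as in the diff above:

```python
# Hypothetical helper mirroring the beam_search preprocessing step after
# this commit; `preprocessor` is assumed to be an InputPreprocessor.
def extract_prompt_inputs(preprocessor, prompt):
    # No request_id argument anymore.
    processed_inputs = preprocessor._prompt_to_llm_inputs(prompt)
    prompt_token_ids = processed_inputs["prompt_token_ids"]
    prompt_text = processed_inputs.get("prompt")
    return prompt_token_ids, prompt_text
```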

vllm/inputs/preprocess.py

Lines changed: 9 additions & 54 deletions
@@ -182,7 +182,6 @@ def _apply_prompt_adapter(
     def _tokenize_prompt(
         self,
         prompt: str,
-        request_id: str,
         lora_request: Optional[LoRARequest],
     ) -> list[int]:
         """
@@ -202,15 +201,13 @@ def _tokenize_prompt(
                     "do_lower_case", False)):
             prompt = prompt.lower()
 
-        return tokenizer.encode(request_id=request_id,
-                                prompt=prompt,
+        return tokenizer.encode(prompt=prompt,
                                 lora_request=lora_request,
                                 add_special_tokens=add_special_tokens)
 
     async def _tokenize_prompt_async(
         self,
         prompt: str,
-        request_id: str,
         lora_request: Optional[LoRARequest],
     ) -> list[int]:
         """Async version of :meth:`_tokenize_prompt`."""
@@ -222,7 +219,6 @@ async def _tokenize_prompt_async(
             # appending an EOS token to the prompt which disrupts generation.
             add_special_tokens = False
         return await tokenizer.encode_async(
-            request_id=request_id,
             prompt=prompt,
             lora_request=lora_request,
             add_special_tokens=add_special_tokens)
@@ -309,7 +305,6 @@ async def _process_multimodal_async(
     def _prompt_to_llm_inputs(
         self,
         prompt: SingletonPrompt,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         return_mm_hashes: bool = False,
     ) -> SingletonInputs:
@@ -318,7 +313,6 @@ def _prompt_to_llm_inputs(
 
         Arguments:
 
-        * request_id
         * prompt: single encoder or decoder input prompt
         * lora_request: this is only valid for decoder prompts
         * return_mm_hashes: whether to return multimodal hashes
@@ -333,7 +327,6 @@ def _prompt_to_llm_inputs(
             prompt_text = parsed["content"]
             prompt_token_ids = self._tokenize_prompt(
                 prompt_text,
-                request_id=request_id,
                 lora_request=lora_request,
             )
 
@@ -384,7 +377,6 @@ def _prompt_to_llm_inputs(
 
             prompt_token_ids = self._tokenize_prompt(
                 prompt_text,
-                request_id=request_id,
                 lora_request=lora_request,
             )
 
@@ -400,7 +392,6 @@ async def _prompt_to_llm_inputs_async(
     async def _prompt_to_llm_inputs_async(
         self,
         prompt: SingletonPrompt,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         return_mm_hashes: bool = False,
     ) -> SingletonInputs:
@@ -411,7 +402,6 @@ async def _prompt_to_llm_inputs_async(
             prompt_text = parsed["content"]
             prompt_token_ids = await self._tokenize_prompt_async(
                 prompt_text,
-                request_id=request_id,
                 lora_request=lora_request,
             )
 
@@ -460,7 +450,6 @@ async def _prompt_to_llm_inputs_async(
 
             prompt_token_ids = await self._tokenize_prompt_async(
                 prompt_text,
-                request_id=request_id,
                 lora_request=lora_request,
             )
 
@@ -560,7 +549,6 @@ def _separate_enc_dec_inputs_from_mm_processor_outputs(
     def _process_encoder_decoder_prompt(
         self,
         prompt: PromptType,
-        request_id: str,
     ) -> EncoderDecoderInputs:
         """
         For encoder/decoder models only:
@@ -587,7 +575,6 @@ def _process_encoder_decoder_prompt(
         Arguments:
 
         * prompt: an input prompt
-        * request_id
 
         Returns:
 
@@ -598,16 +585,11 @@ def _process_encoder_decoder_prompt(
 
         if is_explicit_encoder_decoder_prompt(prompt):
             encoder_inputs = self._prompt_to_llm_inputs(
-                prompt["encoder_prompt"],
-                request_id=request_id,
-            )
+                prompt["encoder_prompt"])
             if (decoder_input := prompt["decoder_prompt"]) is None:
                 decoder_inputs = None
             else:
-                decoder_inputs = self._prompt_to_llm_inputs(
-                    decoder_input,
-                    request_id=request_id,
-                )
+                decoder_inputs = self._prompt_to_llm_inputs(decoder_input)
             # For multimodal model, override decoder prompt from processor
             # with explicit decoder prompt.
             if self.model_config.is_multimodal_model and (
@@ -616,10 +598,7 @@ def _process_encoder_decoder_prompt(
                 self._separate_enc_dec_inputs_from_mm_processor_outputs(
                     encoder_inputs, decoder_inputs))
         else:
-            inputs = self._prompt_to_llm_inputs(
-                prompt,
-                request_id=request_id,
-            )
+            inputs = self._prompt_to_llm_inputs(prompt)
             if self.model_config.is_multimodal_model and (
                     self._can_process_multimodal()):
                 # Encoder-Decoder Multimodal model
@@ -636,26 +615,20 @@ def _process_encoder_decoder_prompt(
     async def _process_encoder_decoder_prompt_async(
         self,
         prompt: PromptType,
-        request_id: str,
     ) -> EncoderDecoderInputs:
         """Async version of :meth:`_process_encoder_decoder_prompt`."""
         encoder_inputs: SingletonInputs
         decoder_inputs: Optional[SingletonInputs]
 
         if is_explicit_encoder_decoder_prompt(prompt):
             encoder_task = self._prompt_to_llm_inputs_async(
-                prompt["encoder_prompt"],
-                request_id=request_id,
-            )
+                prompt["encoder_prompt"])
 
             if (decoder_input := prompt["decoder_prompt"]) is None:
                 encoder_inputs = await encoder_task
                 decoder_inputs = None
             else:
-                decoder_task = self._prompt_to_llm_inputs_async(
-                    decoder_input,
-                    request_id=request_id,
-                )
+                decoder_task = self._prompt_to_llm_inputs_async(decoder_input)
 
                 encoder_inputs, decoder_inputs = await asyncio.gather(
                     encoder_task, decoder_task)
@@ -668,10 +641,7 @@ async def _process_encoder_decoder_prompt_async(
                 self._separate_enc_dec_inputs_from_mm_processor_outputs(
                     encoder_inputs, decoder_inputs))
         else:
-            inputs = await self._prompt_to_llm_inputs_async(
-                prompt,
-                request_id=request_id,
-            )
+            inputs = await self._prompt_to_llm_inputs_async(prompt)
             if self.model_config.is_multimodal_model and (
                     self._can_process_multimodal()):
                 # Encoder-Decoder Multimodal model
@@ -704,7 +674,6 @@ def _build_decoder_only_llm_inputs(
     def _process_decoder_only_prompt(
         self,
         prompt: SingletonPrompt,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         return_mm_hashes: bool = False,
@@ -716,7 +685,6 @@ def _process_decoder_only_prompt(
         Arguments:
 
         * prompt: input prompt
-        * request_id
         * lora_request
         * prompt_adapter_request
         * return_mm_hashes
@@ -728,7 +696,6 @@ def _process_decoder_only_prompt(
 
         prompt_comps = self._prompt_to_llm_inputs(
             prompt,
-            request_id=request_id,
             lora_request=lora_request,
             return_mm_hashes=return_mm_hashes,
         )
@@ -741,15 +708,13 @@ def _process_decoder_only_prompt(
     async def _process_decoder_only_prompt_async(
         self,
         prompt: SingletonPrompt,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         return_mm_hashes: bool = False,
     ) -> DecoderOnlyInputs:
         """Async version of :meth:`_process_decoder_only_prompt`."""
         prompt_comps = await self._prompt_to_llm_inputs_async(
             prompt,
-            request_id=request_id,
             lora_request=lora_request,
             return_mm_hashes=return_mm_hashes,
         )
@@ -762,7 +727,6 @@ async def _process_decoder_only_prompt_async(
     def preprocess(
         self,
         prompt: PromptType,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         return_mm_hashes: bool = False,
@@ -774,10 +738,7 @@ def preprocess(
                 "returned until they are supported on vLLM V1.")
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder
-            return self._process_encoder_decoder_prompt(
-                prompt,
-                request_id=request_id,
-            )
+            return self._process_encoder_decoder_prompt(prompt)
 
         if is_explicit_encoder_decoder_prompt(prompt):
             raise ValueError("Cannot pass encoder-decoder prompt "
@@ -786,7 +747,6 @@ def preprocess(
         # Decoder-only operation
         return self._process_decoder_only_prompt(
             prompt,
-            request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
             return_mm_hashes=return_mm_hashes,
@@ -795,7 +755,6 @@ def preprocess(
     async def preprocess_async(
         self,
         prompt: PromptType,
-        request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         return_mm_hashes: bool = False,
@@ -807,10 +766,7 @@ async def preprocess_async(
                 "returned until they are supported on vLLM V1.")
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder
-            return await self._process_encoder_decoder_prompt_async(
-                prompt,
-                request_id=request_id,
-            )
+            return await self._process_encoder_decoder_prompt_async(prompt)
 
         if is_explicit_encoder_decoder_prompt(prompt):
             raise ValueError("Cannot pass encoder-decoder prompt "
@@ -819,7 +775,6 @@ async def preprocess_async(
         # Decoder-only operation
         return await self._process_decoder_only_prompt_async(
             prompt,
-            request_id=request_id,
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
            return_mm_hashes=return_mm_hashes,
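With the changes above, the preprocessor entry points take only the prompt plus optional LoRA and prompt-adapter arguments. A minimal sketch of an engine-side call after this change; the wrapper function name and the pre-existing `input_preprocessor` object are assumptions, while the keyword arguments match the new `preprocess` signature in this diff:

```python
# Sketch of the new call shape for InputPreprocessor.preprocess().
# `input_preprocessor`, `prompt`, and the optional request objects are
# assumed to exist already, as they do inside LLMEngine.add_request.
def preprocess_for_engine(input_preprocessor,
                          prompt,
                          lora_request=None,
                          prompt_adapter_request=None):
    # request_id is no longer part of the preprocessing chain.
    return input_preprocessor.preprocess(
        prompt,
        lora_request=lora_request,
        prompt_adapter_request=prompt_adapter_request,
    )
```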

vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py

Lines changed: 0 additions & 2 deletions
@@ -33,7 +33,6 @@ def get_max_input_len(
     @abstractmethod
     def encode(self,
                prompt: str,
-               request_id: Optional[str] = None,
                lora_request: Optional[LoRARequest] = None,
                add_special_tokens: Optional[bool] = None) -> List[int]:
         """Encode a prompt using the tokenizer group."""
@@ -43,7 +42,6 @@ def encode(self,
     async def encode_async(
             self,
             prompt: str,
-            request_id: Optional[str] = None,
             lora_request: Optional[LoRARequest] = None,
             add_special_tokens: Optional[bool] = None) -> List[int]:
         """Encode a prompt using the tokenizer group."""

0 commit comments
