
Commit dd93d89

improve embedding input
1 parent 8bcec12 commit dd93d89

8 files changed: +27, -48 lines changed

vllm/attention/backends/flash_attn.py

Lines changed: 0 additions & 1 deletion
@@ -732,7 +732,6 @@ def forward(
         prefill_output = output[:num_prefill_query_tokens]
         assert query.shape[0] == num_prefill_query_tokens
         assert decode_query.shape[0] == num_decode_query_tokens
-
         if prefill_meta := attn_metadata.prefill_metadata:
             # Prompt run.
             if (kv_cache.numel() == 0 or prefill_meta.block_tables is None

vllm/engine/llm_engine.py

Lines changed: 2 additions & 3 deletions
@@ -753,10 +753,9 @@ def add_request(
         if arrival_time is None:
             arrival_time = time.time()
 
-        if isinstance(prompt, dict) and prompt.get("prompt_embeds",
-                                                    None) is not None:
+        if isinstance(prompt, dict) and prompt.get("prompt_embeds", None) is not None:
             if not prompt.get("prompt_token_ids", None):
-                prompt["prompt_token_ids"] = [0] * len(prompt["prompt_embeds"])
+                prompt["prompt_token_ids"] = [0] * prompt["prompt_embeds"].shape[0]
 
         if self.tokenizer is not None:
             self._validate_token_prompt(

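Note: the add_request change above backfills placeholder token IDs when a caller supplies only embeddings, sizing the placeholder list by the embedding tensor's first dimension (one dummy ID per embedded position). A minimal standalone sketch of that logic; the prompt dict and tensor shape here are made up for illustration:

```python
import torch

# Hypothetical prompt dict: embeddings supplied, token IDs absent.
prompt = {"prompt_embeds": torch.randn(5, 4096)}  # (seq_len, hidden_size)

# Backfill dummy token IDs so downstream code that expects one ID per
# position keeps working; shape[0] is the number of embedded positions.
if isinstance(prompt, dict) and prompt.get("prompt_embeds", None) is not None:
    if not prompt.get("prompt_token_ids", None):
        prompt["prompt_token_ids"] = [0] * prompt["prompt_embeds"].shape[0]

print(len(prompt["prompt_token_ids"]))  # -> 5
```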
vllm/entrypoints/llm.py

Lines changed: 0 additions & 14 deletions
@@ -9,12 +9,7 @@
 import cloudpickle
 import torch.nn as nn
 from tqdm import tqdm
-<<<<<<< HEAD
 from typing_extensions import TypeVar, deprecated
-=======
-from typing_extensions import deprecated
-import torch
->>>>>>> 0d69ec2f ((vllm) add input embedding)
 
 from vllm import envs
 from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
@@ -386,12 +381,8 @@ def generate(
                        Optional[Union[str, list[str]]]] = None,
         sampling_params: Optional[Union[SamplingParams,
                                         Sequence[SamplingParams]]] = None,
-<<<<<<< HEAD
-        prompt_token_ids: Optional[Union[list[int], list[list[int]]]] = None,
-=======
         prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None,
         prompt_embeds: Optional[torch.Tensor] = None,
->>>>>>> 0d69ec2f ((vllm) add input embedding)
         use_tqdm: bool = True,
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
@@ -1238,14 +1229,9 @@ def wake_up(self):
     # LEGACY
     def _convert_v1_inputs(
         self,
-<<<<<<< HEAD
-        prompts: Optional[Union[str, list[str]]],
-        prompt_token_ids: Optional[Union[list[int], list[list[int]]]],
-=======
         prompts: Optional[Union[str, List[str]]],
         prompt_token_ids: Optional[Union[List[int], List[List[int]]]],
         prompt_embeds: Optional[torch.Tensor] = None,
->>>>>>> 0d69ec2f ((vllm) add input embedding)
     ):
         # skip_tokenizer_init is now checked in engine

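Note: with the conflicts resolved, generate() keeps the prompt_embeds parameter from the embedding branch alongside the legacy prompt_token_ids path. A hedged usage sketch; the model name, dtype, and tensor shape are placeholders, and it assumes this branch accepts embeddings without an accompanying text prompt:

```python
import torch
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2-7B-Instruct")  # placeholder model name

# Precomputed prompt embeddings, e.g. from a soft prompt or external encoder.
# Assumed shape: (num_prompt_tokens, hidden_size) for a single prompt.
embeds = torch.randn(16, 3584, dtype=torch.bfloat16)

outputs = llm.generate(
    prompt_embeds=embeds,
    sampling_params=SamplingParams(max_tokens=32),
)
print(outputs[0].outputs[0].text)
```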
vllm/inputs/data.py

Lines changed: 3 additions & 3 deletions
@@ -145,9 +145,6 @@ class TokenInputs(TypedDict):
     prompt_token_ids: List[int]
     """The token IDs of the prompt."""
 
-    prompt_embeds: NotRequired[torch.Tensor]
-    """The embeddings of the prompt, if available."""
-
     token_type_ids: NotRequired[List[int]]
     """The token type IDs of the prompt."""
 
@@ -156,6 +153,9 @@ class TokenInputs(TypedDict):
     The original prompt text corresponding to the token IDs, if available.
     """
 
+    prompt_embeds: NotRequired[torch.Tensor]
+    """The embeddings of the prompt, if available."""
+
     multi_modal_data: NotRequired["MultiModalDataDict"]
     """
     Optional multi-modal data to pass to the model,

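Note: prompt_embeds now sits with the other optional fields of TokenInputs. A self-contained stand-in showing the same layout (this class is illustrative, not the vllm definition; only the fields visible in the diff are included):

```python
from typing import List

import torch
from typing_extensions import NotRequired, TypedDict


class TokenInputsSketch(TypedDict):
    """Illustrative stand-in mirroring the fields shown in the diff above."""

    prompt_token_ids: List[int]
    token_type_ids: NotRequired[List[int]]
    prompt: NotRequired[str]
    prompt_embeds: NotRequired[torch.Tensor]


inputs: TokenInputsSketch = {
    "prompt_token_ids": [0, 0, 0],
    "prompt_embeds": torch.randn(3, 1024),  # optional, one row per placeholder ID
}
```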
vllm/inputs/preprocess.py

Lines changed: 4 additions & 4 deletions
@@ -360,7 +360,7 @@ def _prompt_to_llm_inputs(
 
             return token_inputs(
                 prompt_token_ids=prompt_token_ids,
-                prompt_embeds=tokens_content.get('prompt_embeds'),
+                prompt_embeds=tokens_content.get("prompt_embeds"),
                 token_type_ids=token_type_ids,
                 multi_modal_data=multi_modal_data,
                 mm_processor_kwargs=mm_processor_kwargs,
@@ -390,7 +390,7 @@ def _prompt_to_llm_inputs(
         return token_inputs(
             prompt=prompt_text,
             prompt_token_ids=prompt_token_ids,
-            prompt_embeds=text_content.get('prompt_embeds'),
+            prompt_embeds=text_content.get("prompt_embeds"),
             multi_modal_data=multi_modal_data,
             mm_processor_kwargs=mm_processor_kwargs,
         )
@@ -436,7 +436,7 @@ async def _prompt_to_llm_inputs_async(
 
             return token_inputs(
                 prompt_token_ids=prompt_token_ids,
-                prompt_embeds=tokens_content.get('prompt_embeds'),
+                prompt_embeds=tokens_content.get("prompt_embeds"),
                 multi_modal_data=multi_modal_data,
                 mm_processor_kwargs=mm_processor_kwargs,
             )
@@ -465,7 +465,7 @@ async def _prompt_to_llm_inputs_async(
         return token_inputs(
             prompt=prompt_text,
             prompt_token_ids=prompt_token_ids,
-            prompt_embeds=text_content.get('prompt_embeds'),
+            prompt_embeds=text_content.get("prompt_embeds"),
             multi_modal_data=multi_modal_data,
             mm_processor_kwargs=mm_processor_kwargs,
         )

vllm/model_executor/models/qwen2.py

Lines changed: 1 addition & 5 deletions
@@ -460,14 +460,10 @@ def forward(
         intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, IntermediateTensors]:
-<<<<<<< HEAD
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
-=======
+
         hidden_states = self.model(input_ids, positions, kv_caches,
                                    attn_metadata, intermediate_tensors,
                                    inputs_embeds, self.lm_head.bias)
->>>>>>> 0d69ec2f ((vllm) add input embedding)
         return hidden_states
 
     def compute_logits(

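Note: the resolved forward simply threads inputs_embeds through to self.model. Inside such a model the usual pattern is to skip the token-embedding lookup when embeddings are already provided; a tiny illustrative module (not the Qwen2 code) showing that dispatch:

```python
from typing import Optional

import torch
import torch.nn as nn


class TinyModelSketch(nn.Module):
    """Illustrative only: input_ids vs. inputs_embeds dispatch."""

    def __init__(self, vocab_size: int = 128, hidden_size: int = 32):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
        self.layer = nn.Linear(hidden_size, hidden_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # Prefer caller-provided embeddings; otherwise look up the token table.
        hidden_states = (inputs_embeds if inputs_embeds is not None
                         else self.embed_tokens(input_ids))
        return self.layer(hidden_states)
```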
vllm/sequence.py

Lines changed: 9 additions & 9 deletions
@@ -264,14 +264,6 @@ def output_token_ids(self,
                                        new_output_token_ids)
         self._update_cached_all_tokens()
 
-    @property
-    def prompt_embeds(self) -> Optional[torch.Tensor]:
-        return self._prompt_embeds
-
-    @prompt_embeds.setter
-    def prompt_embeds(self, prompt_embeds: Optional[torch.Tensor]) -> None:
-        self._prompt_embeds = prompt_embeds
-
     @property
     def output_token_ids_array(self) -> array:
         """Return the prompt token ids in array type.
@@ -281,6 +273,14 @@ def output_token_ids_array(self) -> array:
         """
         assert isinstance(self._output_token_ids, array)
         return self._output_token_ids
+
+    @property
+    def prompt_embeds(self) -> Optional[torch.Tensor]:
+        return self._prompt_embeds
+
+    @prompt_embeds.setter
+    def prompt_embeds(self, prompt_embeds: torch.Tensor) -> None:
+        self._prompt_embeds = prompt_embeds
 
     @property
     def mrope_position_delta(self) -> Optional[int]:
@@ -389,8 +389,8 @@ def stage(self) -> SequenceStage:
     def __repr__(self) -> str:
         return (f"SequenceData("
                 f"prompt_token_ids={self._prompt_token_ids}, "
+                f"prompt_embeds={getattr(self._prompt_embeds, 'shape', None)}, "
                 f"output_token_ids={self.output_token_ids}, "
-                f"prompt_embeds={getattr(self.prompt_embeds, 'shape', None)}, "
                 f"cumulative_logprob={self.cumulative_logprob}, "
                 f"get_num_computed_tokens={self.get_num_computed_tokens()})")

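Note: the __repr__ uses getattr(..., 'shape', None) so it reports the embedding shape when present and None otherwise, instead of dumping the whole tensor. The idiom in isolation:

```python
import torch

for embeds in (torch.zeros(7, 64), None):
    # Prints torch.Size([7, 64]) for a tensor and None when embeddings are absent.
    print(f"prompt_embeds={getattr(embeds, 'shape', None)}")
```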
vllm/worker/model_runner.py

Lines changed: 8 additions & 9 deletions
@@ -365,8 +365,9 @@ def __init__(
 
         else:
             self.input_tokens = input_tokens or []
-            self.inputs_embeds = (inputs_embeds
-                                  if inputs_embeds is not None else None)
+            self.inputs_embeds = (
+                inputs_embeds if inputs_embeds is not None else None
+            )
             self.input_positions = input_positions or []
             self.token_types = token_types or []
             self.mrope_input_positions = mrope_input_positions or None
@@ -544,12 +545,12 @@ def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int,
 
         # Compute tokens.
         tokens = seq_data.get_token_ids()[context_len:seq_len]
-        if seq_data.prompt_embeds is not None and seq_data.get_output_len(
-        ) == 0:
-            prompt_embeds = seq_data.prompt_embeds[context_len:seq_len]
+        if seq_data.prompt_embeds is not None and seq_data.get_output_len() == 0:
+            prompt_embeds = seq_data.prompt_embeds[context_len:seq_len]
         else:
-            seq_data.prompt_embeds = None
+            seq_data.prompt_embeds = None  # release memory
             prompt_embeds = None
+
         token_types = seq_group_metadata.token_type_ids
 
         inter_data.seq_lens[seq_idx] = seq_len
@@ -870,9 +871,7 @@ def build(self) -> ModelInputForGPU:
             for cur_token_types in inter_data.token_types:
                 token_types.extend(cur_token_types)
             if inter_data.inputs_embeds is not None:
-                inputs_embeds.append(
-                    inter_data.inputs_embeds.to(self.runner.device))
-
+                inputs_embeds.append(inter_data.inputs_embeds.to(self.runner.device))
         if len(inputs_embeds) == 0:
             inputs_embeds = None
         elif len(inputs_embeds) == 1:

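Note: _compute_lens slices the prompt embeddings with the same [context_len:seq_len] window as the token IDs, so chunked prefill only carries the rows still to be computed, and drops the embeddings once the sequence has produced output. A self-contained sketch of that windowing; the lengths and hidden size are made up:

```python
import torch

# Assumed example: a 10-token prompt, 4 tokens already computed, and a
# scheduling window covering the remaining 6 positions.
prompt_embeds = torch.randn(10, 64)   # (prompt_len, hidden_size)
context_len, seq_len = 4, 10
output_len = 0                        # still in prefill, nothing generated yet

if prompt_embeds is not None and output_len == 0:
    # Same windowing as the token IDs: only the rows prefilled this step.
    step_embeds = prompt_embeds[context_len:seq_len]
else:
    prompt_embeds = None              # release memory once decoding has started
    step_embeds = None

print(None if step_embeds is None else tuple(step_embeds.shape))  # (6, 64)
```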