
Commit d586343

feat: working for Qwen 2.5 VL

1 parent 36eacb9

File tree

5 files changed: +22, -13 lines

- examples/multimodal/README.md
- examples/multimodal/components/decode_worker.py
- examples/multimodal/components/prefill_worker.py
- examples/multimodal/configs/agg-qwen.yaml
- examples/multimodal/utils/model.py

examples/multimodal/README.md

Lines changed: 4 additions & 4 deletions
@@ -52,7 +52,7 @@ flowchart LR
 cd $DYNAMO_HOME/examples/multimodal
 # Serve a LLaVA 1.5 7B model:
 dynamo serve graphs.agg:Frontend -f ./configs/agg-llava.yaml
-# Serve a Qwen2 VL model:
+# Serve a Qwen2.5-VL model:
 # dynamo serve graphs.agg:Frontend -f ./configs/agg-qwen.yaml
 # Serve a Phi3V model:
 # dynamo serve graphs.agg:Frontend -f ./configs/agg-phi3v.yaml
@@ -89,7 +89,7 @@ curl http://localhost:8000/v1/chat/completions \
 }'
 ```

-If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`.
+If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2.5-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`.

 You should see a response similar to this:
 ```json
@@ -204,7 +204,7 @@ DYNAMO_TAG=$(dynamo build graphs.agg:Frontend | grep "Successfully built" | awk
 export DEPLOYMENT_NAME=multimodal-agg
 # For aggregated serving with LLaVA:
 dynamo deploy $DYNAMO_TAG -n $DEPLOYMENT_NAME -f ./configs/agg-llava.yaml
-# For aggregated serving with Qwen2-VL:
+# For aggregated serving with Qwen2.5-VL:
 # dynamo deploy $DYNAMO_TAG -n $DEPLOYMENT_NAME -f ./configs/agg-qwen.yaml
 # For aggregated serving with Phi3V:
 # dynamo deploy $DYNAMO_TAG -n $DEPLOYMENT_NAME -f ./configs/agg-phi3v.yaml
@@ -249,6 +249,6 @@ curl localhost:8000/v1/chat/completions \
 }'
 ```

-If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`.
+If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2.5-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`.

 For more details on managing deployments, testing, and troubleshooting, please refer to the [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md).
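
For readers who prefer scripting the request, the following is a minimal Python sketch of the call the README's curl examples make, with the Qwen2.5-VL model name swapped in as the updated text describes. It assumes the frontend exposes the standard OpenAI-compatible chat schema on localhost:8000; the prompt text, image URL, and response handling are placeholders, not values from the repository.

```python
# Sketch only: assumes an OpenAI-compatible /v1/chat/completions endpoint as used by
# the README's curl examples; prompt and image URL are placeholders.
import requests

payload = {
    "model": "Qwen/Qwen2.5-VL-7B-Instruct",  # replaces "llava-hf/llava-1.5-7b-hf" per the README
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                {"type": "image_url", "image_url": {"url": "https://example.com/some-image.jpg"}},
            ],
        }
    ],
    "max_tokens": 300,
    "stream": False,
}

resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```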

examples/multimodal/components/decode_worker.py

Lines changed: 4 additions & 3 deletions
@@ -117,7 +117,7 @@ async def async_init(self):
         )

         runtime = dynamo_context["runtime"]
-        embeddings_shape, embeddings_dtype = get_vision_embeddings_info(
+        embeddings_shape, self.embeddings_dtype = get_vision_embeddings_info(
             self.engine_args.model, self.engine_args.num_patches
         )
         logger.debug(f"Embeddings shape: {embeddings_shape}")
@@ -139,6 +139,7 @@ async def async_init(self):
             else:
                 self.disaggregated_router = None
         else:
+            EMBEDDINGS_DTYPE = torch.float16
             EMBEDDINGS_DEVICE = "cuda"

             enc_comp_ns, enc_comp_name = VllmEncodeWorker.dynamo_address()  # type: ignore
@@ -154,7 +155,7 @@ async def async_init(self):

             # Create a longer-lived buffer for receiving the image embeddings.
             embeddings = torch.empty(
-                embeddings_shape, dtype=embeddings_dtype, device=EMBEDDINGS_DEVICE
+                embeddings_shape, dtype=EMBEDDINGS_DTYPE, device=EMBEDDINGS_DEVICE
             )
             descriptor = connect.Descriptor(embeddings)
             # Register the descriptor w/ NIXL (this is optional, if not done here the connect subsytem will take care of this automatically).
@@ -290,7 +291,7 @@ async def local_prefill(self, request: vLLMMultimodalRequest) -> tuple:
             )
             # When using disaggregated serving, the encode worker will have provided the key-value cache updates via the encode worker.
             multi_modal_data = construct_mm_data(
-                self.engine_args.model, encode_output, embeddings
+                self.engine_args.model, encode_output, embeddings, self.embeddings_dtype
             )

         return prompt_ids, multi_modal_data, remote_prefill_params
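
The substance of this diff is that the decode worker now remembers the embeddings dtype reported by get_vision_embeddings_info (as self.embeddings_dtype) and threads it into construct_mm_data, while the local receive buffer keeps a hard-coded float16. Below is a compressed, hypothetical sketch of that pattern; the stubbed helper, shapes, and dtype are placeholders, not the repository's worker class.

```python
# Illustrative sketch of the dtype-threading pattern in this diff; the stub helper
# and shapes are placeholders, not values from get_vision_embeddings_info().
import torch


def get_vision_embeddings_info_stub(model: str, num_patches: int):
    # Stand-in for utils/model.py's get_vision_embeddings_info(): returns the
    # embeddings buffer shape and the dtype the model expects.
    return (1, num_patches, 3584), torch.float16


class DecodeWorkerSketch:
    def __init__(self, model: str, num_patches: int) -> None:
        # Remember the dtype so later requests can cast embeddings to match it,
        # instead of hard-coding float16 at the construct_mm_data call site.
        shape, self.embeddings_dtype = get_vision_embeddings_info_stub(model, num_patches)
        # Longer-lived receive buffer, allocated once (CPU here; "cuda" in the worker).
        self.embeddings = torch.empty(shape, dtype=self.embeddings_dtype)

    def mm_data_args(self):
        # Everything construct_mm_data now takes after the model and encode output:
        # the buffer plus the remembered dtype.
        return self.embeddings, self.embeddings_dtype


worker = DecodeWorkerSketch("Qwen/Qwen2.5-VL-7B-Instruct", num_patches=345)
print(worker.mm_data_args()[1])  # torch.float16
```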

examples/multimodal/components/prefill_worker.py

Lines changed: 6 additions & 3 deletions
@@ -111,12 +111,12 @@ async def async_init(self):
         await self._connector.initialize()

         # Create a longer-lived buffer for receiving the image embeddings.
-        embeddings_shape, embeddings_dtype = get_vision_embeddings_info(
+        embeddings_shape, self.embeddings_dtype = get_vision_embeddings_info(
             self.engine_args.model, self.engine_args.num_patches
         )
         embeddings = torch.empty(
             embeddings_shape,
-            dtype=embeddings_dtype,
+            dtype=self.embeddings_dtype,
             device=EMBEDDINGS_DEVICE,
         )
         descriptor = connect.Descriptor(embeddings)
@@ -265,7 +265,10 @@ async def generate(self, request: RemotePrefillRequest):
             prompt=TokensPrompt(
                 prompt_token_ids=prompt_token_ids,
                 multi_modal_data=construct_mm_data(
-                    self.engine_args.model, encode_output, embeddings
+                    self.engine_args.model,
+                    encode_output,
+                    embeddings,
+                    self.embeddings_dtype,
                 ),
             ),
             sampling_params=sampling_params,

examples/multimodal/configs/agg-qwen.yaml

Lines changed: 2 additions & 1 deletion
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 Common:
-  model: Qwen/Qwen2-VL-7B-Instruct
+  model: Qwen/Qwen2.5-VL-7B-Instruct
   block-size: 64
   max-model-len: 4096

@@ -29,6 +29,7 @@ VllmDecodeWorker:
   mm-processor-kwargs:
     min_pixels: 784
     max_pixels: 1003520
+    fps: 1
   enable-prefix-caching: true
   image-token-id: 151655
   num-patches: 345

examples/multimodal/utils/model.py

Lines changed: 6 additions & 2 deletions
@@ -68,13 +68,17 @@ def get_vision_embeddings_info(


 def construct_mm_data(
-    model: str, encode_output: EncodeResponse, image_embeds: torch.Tensor
+    model: str,
+    encode_output: EncodeResponse,
+    image_embeds: torch.Tensor,
+    embeddings_dtype: torch.dtype,
 ) -> Dict[str, torch.Tensor | Dict[str, Any]]:
     """Construct multimodal data for a vLLM request for models that require additional parameters alongside the embeddings"""
+    image_embeds = image_embeds.to(embeddings_dtype)
     if "Qwen2" in model:
         return {
             "image": {
-                "image_embeds": image_embeds.squeeze(0).to(torch.float16),
+                "image_embeds": image_embeds.squeeze(0),
                 "image_grid_thw": torch.tensor(encode_output.image_grid_thw).squeeze(0),
             }
         }