
Commit d736895

feat: working code for phi3v
1 parent 0eff4e0 commit d736895

7 files changed, +95 -30 lines changed


examples/llm/configs/disagg.yaml

Lines changed: 0 additions & 1 deletion
@@ -26,7 +26,6 @@ Frontend:
 Processor:
   router: round-robin
   common-configs: [model, block-size]
-  prompt-template: "USER: <image>\n<prompt> ASSISTANT:"
 
 VllmWorker:
   remote-prefill: true

examples/multimodal/components/decode_worker.py

Lines changed: 3 additions & 4 deletions
@@ -25,7 +25,7 @@
 from components.encode_worker import VllmEncodeWorker
 from components.prefill_worker import VllmPrefillWorker
 from utils.logging import check_required_workers
-from utils.model import construct_mm_data, get_vision_embeddings_size
+from utils.model import construct_mm_data, get_vision_embeddings_info
 from utils.nixl import NixlMetadataStore
 from utils.prefill_queue import PrefillQueue
 from utils.protocol import (
@@ -117,7 +117,7 @@ async def async_init(self):
         )
 
         runtime = dynamo_context["runtime"]
-        embeddings_shape = get_vision_embeddings_size(
+        embeddings_shape, embeddings_dtype = get_vision_embeddings_info(
             self.engine_args.model, self.engine_args.num_patches
         )
         logger.debug(f"Embeddings shape: {embeddings_shape}")
@@ -139,7 +139,6 @@ async def async_init(self):
             else:
                 self.disaggregated_router = None
         else:
-            EMBEDDINGS_DTYPE = torch.float16
             EMBEDDINGS_DEVICE = "cuda"
 
             enc_comp_ns, enc_comp_name = VllmEncodeWorker.dynamo_address()  # type: ignore
@@ -155,7 +154,7 @@ async def async_init(self):
 
             # Create a longer-lived buffer for receiving the image embeddings.
             embeddings = torch.empty(
-                embeddings_shape, dtype=EMBEDDINGS_DTYPE, device=EMBEDDINGS_DEVICE
+                embeddings_shape, dtype=embeddings_dtype, device=EMBEDDINGS_DEVICE
             )
             descriptor = connect.Descriptor(embeddings)
             # Register the descriptor w/ NIXL (this is optional, if not done here the connect subsytem will take care of this automatically).
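Net effect: the decode worker now takes both the shape and the dtype of its NIXL receive buffer from the model's Hugging Face config instead of assuming torch.float16. A minimal sketch of the allocation pattern, assuming the examples/multimodal directory is on the Python path so utils.model resolves; the model id and patch count are illustrative values taken from the new Phi-3.5 config:

import torch

from utils.model import get_vision_embeddings_info

# Shape is (1, num_patches, hidden_size); dtype is the config's torch_dtype.
embeddings_shape, embeddings_dtype = get_vision_embeddings_info(
    "microsoft/Phi-3.5-vision-instruct", num_patches=757
)

# Long-lived buffer the encode worker writes image embeddings into; in the
# worker it is then wrapped in connect.Descriptor and registered with NIXL.
embeddings = torch.empty(embeddings_shape, dtype=embeddings_dtype, device="cuda")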

examples/multimodal/components/encode_worker.py

Lines changed: 16 additions & 11 deletions
@@ -165,27 +165,32 @@ async def encode(self, request: EncodeRequest) -> AsyncIterator[EncodeResponse]:
 
         logger.debug(f"Processing image for request: {{ id: {request_id} }}")
         image_embeds = self.image_processor(images=image, return_tensors="pt")
-        # Add a batch dimension to the pixel values
-        image_embeds["pixel_values"] = (
-            image_embeds["pixel_values"].unsqueeze(0).to(DEVICE)
-        )
+        # Add a batch dimension to everything
+        for item in image_embeds:
+            image_embeds[item] = image_embeds[item].unsqueeze(0).to(DEVICE)
         logger.debug(f"Image embeds: {image_embeds}")
-        image_grid_thw = None
-        if "image_grid_thw" in image_embeds:
-            image_grid_thw = image_embeds["image_grid_thw"].tolist()
-        image_sizes = [image.size]
+
+        image_grid_thw = (
+            image_embeds["image_grid_thw"].tolist()
+            if "image_grid_thw" in image_embeds
+            else None
+        )
+        image_sizes = (
+            image_embeds["image_sizes"].tolist()
+            if "image_sizes" in image_embeds
+            else [image.size]
+        )
         logger.debug(
             f"Pixel values stats: mean={image_embeds['pixel_values'].mean().item()}, std={image_embeds['pixel_values'].std().item()}, min={image_embeds['pixel_values'].min().item()}, max={image_embeds['pixel_values'].max().item()}"
         )
 
         with torch.no_grad():
             embeddings = self.vision_model.get_multimodal_embeddings(**image_embeds)
-            if isinstance(embeddings, tuple):
-                # The result multimodal_embeddings is tuple of tensors, with each
+            if isinstance(embeddings, tuple) or isinstance(embeddings, list):
+                # The result multimodal_embeddings may be a list or tuple of tensors, with each
                 # tensor corresponding to a multimodal data item (image or video).
                 # TODO: for multi-image support, this result will contain multiple tensors.
                 embeddings = embeddings[0].unsqueeze(0)
-
         logger.debug(
             f"Embeddings: {{ shape: {embeddings.shape}, dtype: {embeddings.dtype}, device: {embeddings.device}, ptr: {embeddings.data_ptr()}, elements: {{ count: {embeddings.numel()}, size: {embeddings.element_size()} }} }}."
         )
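Phi-3.5-vision's processor returns more tensors than just pixel_values (notably image_sizes), so every entry now gets the batch dimension and device move, and the per-model metadata (image_grid_thw for Qwen2-VL-style models, image_sizes for Phi-3.5-vision) is picked up when present. A standalone sketch of that post-processing using stand-in tensors, so nothing needs to be downloaded; shapes and values are illustrative:

import torch

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Stand-in for what the HF image processor returns for a single image.
image_embeds = {
    "pixel_values": torch.randn(17, 3, 336, 336),
    "image_sizes": torch.tensor([1344, 1008]),
}

# Add a batch dimension to every tensor and move it to the target device.
for key in image_embeds:
    image_embeds[key] = image_embeds[key].unsqueeze(0).to(DEVICE)

# Grid/size metadata is optional and model-dependent.
image_grid_thw = (
    image_embeds["image_grid_thw"].tolist() if "image_grid_thw" in image_embeds else None
)
image_sizes = (
    image_embeds["image_sizes"].tolist()
    if "image_sizes" in image_embeds
    else [(336, 336)]  # the real worker falls back to the PIL image size here
)

print(image_grid_thw)  # None for this Phi-3.5-style input
print(image_sizes)     # [[1344, 1008]]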

examples/multimodal/components/prefill_worker.py

Lines changed: 3 additions & 5 deletions
@@ -25,7 +25,7 @@
 from components.encode_worker import VllmEncodeWorker
 from pydantic import BaseModel
 from utils.logging import check_required_workers
-from utils.model import construct_mm_data, get_vision_embeddings_size
+from utils.model import construct_mm_data, get_vision_embeddings_info
 from utils.nixl import NixlMetadataStore
 from utils.prefill_queue import PrefillQueue
 from utils.protocol import EncodeRequest, EncodeResponse
@@ -40,8 +40,6 @@
 
 logger = logging.getLogger(__name__)
 
-# Constants for the dtype and device of the embeddings tensor.
-EMBEDDINGS_DTYPE = torch.float16
 EMBEDDINGS_DEVICE = "cuda"
 
 
@@ -113,12 +111,12 @@ async def async_init(self):
         await self._connector.initialize()
 
         # Create a longer-lived buffer for receiving the image embeddings.
-        embeddings_shape = get_vision_embeddings_size(
+        embeddings_shape, embeddings_dtype = get_vision_embeddings_info(
             self.engine_args.model, self.engine_args.num_patches
        )
         embeddings = torch.empty(
             embeddings_shape,
-            dtype=EMBEDDINGS_DTYPE,
+            dtype=embeddings_dtype,
             device=EMBEDDINGS_DEVICE,
         )
         descriptor = connect.Descriptor(embeddings)

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: microsoft/Phi-3.5-vision-instruct
+  block-size: 64
+  max-model-len: 4096
+  trust-remote-code: true
+
+Processor:
+  router: round-robin
+  prompt-template: "<|user|>\n<|image_1|>\n<prompt><|end|>\n<|assistant|>\n"
+  common-configs: [model, block-size, max-model-len, trust-remote-code]
+
+VllmDecodeWorker:
+  enforce-eager: true
+  max-num-batched-tokens: 16384
+  max-num-seqs: 2
+  mm-processor-kwargs:
+    num_crops: 16
+  enable-prefix-caching: true
+  image-token-id: 32000
+  num-patches: 757
+  router: random
+  tensor-parallel-size: 1
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len, trust-remote-code]
+
+VllmEncodeWorker:
+  tensor-parallel-size: 1
+  router: random
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '1'
+  common-configs: [model]
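The Processor's prompt-template wraps the user's text in Phi-3.5-vision's chat format, with <|image_1|> marking where the image goes. A quick illustration of how such a template expands; the helper below is hypothetical and only substitutes the "<prompt>" placeholder used by the config above:

PHI35V_TEMPLATE = "<|user|>\n<|image_1|>\n<prompt><|end|>\n<|assistant|>\n"

def apply_prompt_template(template: str, user_prompt: str) -> str:
    # Replace the "<prompt>" placeholder with the user's message.
    return template.replace("<prompt>", user_prompt)

print(apply_prompt_template(PHI35V_TEMPLATE, "Describe this image."))
# <|user|>
# <|image_1|>
# Describe this image.<|end|>
# <|assistant|>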

examples/multimodal/utils/model.py

Lines changed: 21 additions & 7 deletions
@@ -13,7 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, Dict
+import logging
+from typing import Any, Dict, Tuple
 
 import torch
 from transformers import AutoConfig
@@ -22,6 +23,8 @@
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.worker.worker import Worker
 
+logger = logging.getLogger(__name__)
+
 
 def load_vision_model(model_id: str) -> torch.nn.Module:
     """
@@ -44,13 +47,24 @@ def load_vision_model(model_id: str) -> torch.nn.Module:
     return worker.model_runner.model
 
 
-def get_vision_embeddings_size(model_id: str, num_patches: int) -> tuple[int, int, int]:
-    """Calculate vision embeddings size using model config and image processor
-    Returns a tuple of (batch_size, num_patches, hidden_dim).
+def get_vision_embeddings_info(
+    model_id: str, num_patches: int
+) -> Tuple[Tuple[int, int, int], torch.dtype]:
+    """Calculate vision embeddings size and dtype using model config
+    Returns a tuple of (batch_size, num_patches, hidden_dim), dtype.
     """
     config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
     assert num_patches > 0, "Number of patches must be positive"
-    return 1, num_patches, getattr(config, "hidden_size", 4096)
+    if not hasattr(config, "torch_dtype"):
+        raise ValueError("Model config missing required 'torch_dtype' attribute")
+    if not hasattr(config, "hidden_size"):
+        logger.warning(
+            "Model config missing required 'hidden_size' attribute, using 4096"
+        )
+        hidden_size = 4096
+    else:
+        hidden_size = config.hidden_size
+    return (1, num_patches, hidden_size), config.torch_dtype
 
 
 def construct_mm_data(
@@ -60,8 +74,8 @@ def construct_mm_data(
     if "Qwen2" in model:
         return {
             "image": {
-                "image_embeds": image_embeds.squeeze(0),
-                "image_grid_thw": torch.tensor(encode_output.image_grid_thw),
+                "image_embeds": image_embeds.squeeze(0).to(torch.float16),
+                "image_grid_thw": torch.tensor(encode_output.image_grid_thw).squeeze(0),
             }
         }
     elif "MiniCPM-V" in model:

examples/multimodal/utils/protocol.py

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
 
 
 import json
-from typing import Any, List, Literal, Optional, Tuple, Union
+from typing import Any, List, Literal, Optional, Union
 
 import connect
 import msgspec
@@ -143,7 +143,7 @@ class EncodeResponse(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
     request_id: str
     image_grid_thw: Optional[List[Any]] = None
-    image_sizes: Optional[List[Tuple[int, int]]] = None
+    image_sizes: Optional[List[Any]] = None
 
 
 class MyRequestOutput(BaseModel):
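The EncodeResponse.image_sizes annotation is loosened from List[Tuple[int, int]] to List[Any] so that size entries can arrive in whatever nesting the encode worker produces (e.g. tensor.tolist() output) as well as PIL-style tuples. A minimal pydantic sketch; the class name is hypothetical and only the touched fields are reproduced:

from typing import Any, List, Optional

from pydantic import BaseModel, ConfigDict

class EncodeResponseSketch(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    request_id: str
    image_grid_thw: Optional[List[Any]] = None
    image_sizes: Optional[List[Any]] = None

# Both size formats validate against List[Any]:
print(EncodeResponseSketch(request_id="a", image_sizes=[(336, 336)]))    # PIL-style (width, height)
print(EncodeResponseSketch(request_id="b", image_sizes=[[1344, 1008]]))  # tensor.tolist() style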
