
Commit d586343

feat: working for Qwen 2.5 VL

1 parent 36eacb9

File tree

5 files changed: +22, -13 lines

- examples/multimodal/README.md
- examples/multimodal/components/decode_worker.py
- examples/multimodal/components/prefill_worker.py
- examples/multimodal/configs/agg-qwen.yaml
- examples/multimodal/utils/model.py

examples/multimodal/README.md

Lines changed: 4 additions & 4 deletions
@@ -52,7 +52,7 @@ flowchart LR
 cd $DYNAMO_HOME/examples/multimodal
 # Serve a LLaVA 1.5 7B model:
 dynamo serve graphs.agg:Frontend -f ./configs/agg-llava.yaml
-# Serve a Qwen2 VL model:
+# Serve a Qwen2.5-VL model:
 # dynamo serve graphs.agg:Frontend -f ./configs/agg-qwen.yaml
 # Serve a Phi3V model:
 # dynamo serve graphs.agg:Frontend -f ./configs/agg-phi3v.yaml
@@ -89,7 +89,7 @@ curl http://localhost:8000/v1/chat/completions \
 }'
 ```

-If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`.
+If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2.5-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`.

 You should see a response similar to this:
 ```json
@@ -204,7 +204,7 @@ DYNAMO_TAG=$(dynamo build graphs.agg:Frontend | grep "Successfully built" | awk
 export DEPLOYMENT_NAME=multimodal-agg
 # For aggregated serving with LLaVA:
 dynamo deploy $DYNAMO_TAG -n $DEPLOYMENT_NAME -f ./configs/agg-llava.yaml
-# For aggregated serving with Qwen2-VL:
+# For aggregated serving with Qwen2.5-VL:
 # dynamo deploy $DYNAMO_TAG -n $DEPLOYMENT_NAME -f ./configs/agg-qwen.yaml
 # For aggregated serving with Phi3V:
 # dynamo deploy $DYNAMO_TAG -n $DEPLOYMENT_NAME -f ./configs/agg-phi3v.yaml
@@ -249,6 +249,6 @@ curl localhost:8000/v1/chat/completions \
 }'
 ```

-If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`.
+If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2.5-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`.

 For more details on managing deployments, testing, and troubleshooting, please refer to the [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md).
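
For readers who prefer scripting the request, the following is a minimal Python sketch of the call the README's curl examples make, with the Qwen2.5-VL model name swapped in as the updated text describes. It assumes the frontend exposes the standard OpenAI-compatible chat schema on localhost:8000; the prompt text, image URL, and response handling are placeholders, not values from the repository.

```python
# Sketch only: assumes an OpenAI-compatible /v1/chat/completions endpoint as used by
# the README's curl examples; prompt and image URL are placeholders.
import requests

payload = {
    "model": "Qwen/Qwen2.5-VL-7B-Instruct",  # replaces "llava-hf/llava-1.5-7b-hf" per the README
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                {"type": "image_url", "image_url": {"url": "https://example.com/some-image.jpg"}},
            ],
        }
    ],
    "max_tokens": 300,
    "stream": False,
}

resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```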

examples/multimodal/components/decode_worker.py

Lines changed: 4 additions & 3 deletions
@@ -117,7 +117,7 @@ async def async_init(self):
         )

         runtime = dynamo_context["runtime"]
-        embeddings_shape, embeddings_dtype = get_vision_embeddings_info(
+        embeddings_shape, self.embeddings_dtype = get_vision_embeddings_info(
             self.engine_args.model, self.engine_args.num_patches
         )
         logger.debug(f"Embeddings shape: {embeddings_shape}")
@@ -139,6 +139,7 @@ async def async_init(self):
             else:
                 self.disaggregated_router = None
         else:
+            EMBEDDINGS_DTYPE = torch.float16
             EMBEDDINGS_DEVICE = "cuda"

             enc_comp_ns, enc_comp_name = VllmEncodeWorker.dynamo_address()  # type: ignore
@@ -154,7 +155,7 @@ async def async_init(self):

             # Create a longer-lived buffer for receiving the image embeddings.
             embeddings = torch.empty(
-                embeddings_shape, dtype=embeddings_dtype, device=EMBEDDINGS_DEVICE
+                embeddings_shape, dtype=EMBEDDINGS_DTYPE, device=EMBEDDINGS_DEVICE
             )
             descriptor = connect.Descriptor(embeddings)
             # Register the descriptor w/ NIXL (this is optional, if not done here the connect subsytem will take care of this automatically).
@@ -290,7 +291,7 @@ async def local_prefill(self, request: vLLMMultimodalRequest) -> tuple:
             )
             # When using disaggregated serving, the encode worker will have provided the key-value cache updates via the encode worker.
             multi_modal_data = construct_mm_data(
-                self.engine_args.model, encode_output, embeddings
+                self.engine_args.model, encode_output, embeddings, self.embeddings_dtype
             )

         return prompt_ids, multi_modal_data, remote_prefill_params
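
The substance of this diff is that the decode worker now remembers the embeddings dtype reported by get_vision_embeddings_info (as self.embeddings_dtype) and threads it into construct_mm_data, while the local receive buffer keeps a hard-coded float16. Below is a compressed, hypothetical sketch of that pattern; the stubbed helper, shapes, and dtype are placeholders, not the repository's worker class.

```python
# Illustrative sketch of the dtype-threading pattern in this diff; the stub helper
# and shapes are placeholders, not values from get_vision_embeddings_info().
import torch


def get_vision_embeddings_info_stub(model: str, num_patches: int):
    # Stand-in for utils/model.py's get_vision_embeddings_info(): returns the
    # embeddings buffer shape and the dtype the model expects.
    return (1, num_patches, 3584), torch.float16


class DecodeWorkerSketch:
    def __init__(self, model: str, num_patches: int) -> None:
        # Remember the dtype so later requests can cast embeddings to match it,
        # instead of hard-coding float16 at the construct_mm_data call site.
        shape, self.embeddings_dtype = get_vision_embeddings_info_stub(model, num_patches)
        # Longer-lived receive buffer, allocated once (CPU here; "cuda" in the worker).
        self.embeddings = torch.empty(shape, dtype=self.embeddings_dtype)

    def mm_data_args(self):
        # Everything construct_mm_data now takes after the model and encode output:
        # the buffer plus the remembered dtype.
        return self.embeddings, self.embeddings_dtype


worker = DecodeWorkerSketch("Qwen/Qwen2.5-VL-7B-Instruct", num_patches=345)
print(worker.mm_data_args()[1])  # torch.float16
```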

examples/multimodal/components/prefill_worker.py

Lines changed: 6 additions & 3 deletions
@@ -111,12 +111,12 @@ async def async_init(self):
         await self._connector.initialize()

         # Create a longer-lived buffer for receiving the image embeddings.
-        embeddings_shape, embeddings_dtype = get_vision_embeddings_info(
+        embeddings_shape, self.embeddings_dtype = get_vision_embeddings_info(
             self.engine_args.model, self.engine_args.num_patches
         )
         embeddings = torch.empty(
             embeddings_shape,
-            dtype=embeddings_dtype,
+            dtype=self.embeddings_dtype,
             device=EMBEDDINGS_DEVICE,
         )
         descriptor = connect.Descriptor(embeddings)
@@ -265,7 +265,10 @@ async def generate(self, request: RemotePrefillRequest):
             prompt=TokensPrompt(
                 prompt_token_ids=prompt_token_ids,
                 multi_modal_data=construct_mm_data(
-                    self.engine_args.model, encode_output, embeddings
+                    self.engine_args.model,
+                    encode_output,
+                    embeddings,
+                    self.embeddings_dtype,
                 ),
             ),
             sampling_params=sampling_params,

examples/multimodal/configs/agg-qwen.yaml

Lines changed: 2 additions & 1 deletion
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 Common:
-  model: Qwen/Qwen2-VL-7B-Instruct
+  model: Qwen/Qwen2.5-VL-7B-Instruct
   block-size: 64
   max-model-len: 4096

@@ -29,6 +29,7 @@ VllmDecodeWorker:
   mm-processor-kwargs:
     min_pixels: 784
     max_pixels: 1003520
+    fps: 1
   enable-prefix-caching: true
   image-token-id: 151655
   num-patches: 345

examples/multimodal/utils/model.py

Lines changed: 6 additions & 2 deletions
@@ -68,13 +68,17 @@ def get_vision_embeddings_info(


 def construct_mm_data(
-    model: str, encode_output: EncodeResponse, image_embeds: torch.Tensor
+    model: str,
+    encode_output: EncodeResponse,
+    image_embeds: torch.Tensor,
+    embeddings_dtype: torch.dtype,
 ) -> Dict[str, torch.Tensor | Dict[str, Any]]:
     """Construct multimodal data for a vLLM request for models that require additional parameters alongside the embeddings"""
+    image_embeds = image_embeds.to(embeddings_dtype)
     if "Qwen2" in model:
         return {
             "image": {
-                "image_embeds": image_embeds.squeeze(0).to(torch.float16),
+                "image_embeds": image_embeds.squeeze(0),
                 "image_grid_thw": torch.tensor(encode_output.image_grid_thw).squeeze(0),
             }
         }