Commit 74b2ee2

Isotr0py authored and mawong-amd committed
[Model] Broadcast Ovis2 implementation to fit Ovis1.6 (vllm-project#17861)
Signed-off-by: Isotr0py <2037008807@qq.com>
1 parent cf52511 commit 74b2ee2

16 files changed: +330 -212 lines changed

docs/source/models/supported_models.md

Lines changed: 3 additions & 3 deletions
```diff
@@ -1045,10 +1045,10 @@ Specified using `--task generate`.
   *
   * ✅︎
   * ✅︎
-- * `Ovis2ForConditionalGeneration`<sup>^</sup>
-  * Ovis2
+- * `Ovis`
+  * Ovis2, Ovis1.6
   * T + I<sup>+</sup>
-  * `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis2-2B`, etc.
+  * `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc.
   *
   *
   * ✅︎
```
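With this change both Ovis2 and Ovis1.6 checkpoints load under the single `Ovis` architecture, so the old `hf_overrides={"architectures": [...]}` workaround is no longer needed. A minimal offline-inference sketch, assuming a vLLM build that includes this commit and a hypothetical local image file; the hard-coded prompt below is the Qwen2-style format used by the Ovis2 checkpoints (Ovis1.6 models use their own chat templates, which is why the updated example scripts below switch to `apply_chat_template`):

```python
from PIL import Image

from vllm import LLM, SamplingParams

# Qwen2-style prompt for Ovis2; Ovis1.6 checkpoints need their own template.
prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
          "<|im_start|>user\n<image>\nWhat is in this image?<|im_end|>\n"
          "<|im_start|>assistant\n")

llm = LLM(
    model="AIDC-AI/Ovis2-1B",      # or "AIDC-AI/Ovis1.6-Llama3.2-3B"
    trust_remote_code=True,        # Ovis checkpoints ship custom code
    dtype="half",
    max_model_len=4096,
    limit_mm_per_prompt={"image": 1},
)

image = Image.open("example.jpg").convert("RGB")  # hypothetical local file
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```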

examples/offline_inference/vision_language.py

Lines changed: 12 additions & 9 deletions
```diff
@@ -725,8 +725,8 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
-# Ovis2
-def run_ovis2(questions: list[str], modality: str) -> ModelRequestData:
+# Ovis
+def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
 
     model_name = "AIDC-AI/Ovis2-1B"
@@ -737,15 +737,18 @@ def run_ovis2(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         trust_remote_code=True,
         dtype="half",
-        hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]},
         limit_mm_per_prompt={modality: 1},
     )
 
-    placeholder = "<image>\n"
-    prompts = [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-                f"<|im_start|>user\n{placeholder}"
-                f"{question}<|im_end|>\n"
-                "<|im_start|>assistant\n") for question in questions]
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    messages = [[{
+        'role': 'user',
+        'content': f"<image>\n{question}"
+    }] for question in questions]
+    prompts = tokenizer.apply_chat_template(messages,
+                                            tokenize=False,
+                                            add_generation_prompt=True)
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -1069,7 +1072,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
     "llama4": run_llama4,
     "molmo": run_molmo,
     "NVLM_D": run_nvlm_d,
-    "ovis2": run_ovis2,
+    "ovis": run_ovis,
     "paligemma": run_paligemma,
    "paligemma2": run_paligemma2,
     "phi3_v": run_phi3v,
```

examples/offline_inference/vision_language_multi_image.py

Lines changed: 12 additions & 10 deletions
```diff
@@ -436,8 +436,8 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
-# Ovis2
-def load_ovis2(question: str, image_urls: list[str]) -> ModelRequestData:
+# Ovis
+def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "AIDC-AI/Ovis2-1B"
 
     engine_args = EngineArgs(
@@ -447,15 +447,17 @@ def load_ovis2(question: str, image_urls: list[str]) -> ModelRequestData:
         trust_remote_code=True,
         dtype="half",
         limit_mm_per_prompt={"image": len(image_urls)},
-        hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]},
     )
 
-    placeholder = '\n'.join(
-        [f'Image {i+1}: <image>' for i in range(len(image_urls))]) + '\n'
-    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-              f"<|im_start|>user\n{placeholder}"
-              f"{question}<|im_end|>\n"
-              "<|im_start|>assistant\n")
+    placeholders = "\n".join(f"Image-{i}: <image>\n"
+                             for i, _ in enumerate(image_urls, start=1))
+    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -713,7 +715,7 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     "mistral3": load_mistral3,
     "mllama": load_mllama,
     "NVLM_D": load_nvlm_d,
-    "ovis2": load_ovis2,
+    "ovis": load_ovis,
     "phi3_v": load_phi3v,
     "phi4_mm": load_phi4mm,
     "pixtral_hf": load_pixtral_hf,
```

tests/conftest.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -355,10 +355,16 @@ def __init__(
             **model_kwargs,
         )
 
+        # in case some unquantized custom models are not in same dtype
+        if (getattr(model, "quantization_method", None) is None
+                and any(p.dtype != self.dtype
+                        for p in model.parameters())):
+            model = model.to(dtype=self.dtype)
+
         if (getattr(model, "quantization_method", None) != "bitsandbytes"
                 and len({p.device
                          for p in model.parameters()}) < 2):
-            model = model.to(self.device)
+            model = model.to(device=self.device)
 
         self.model = model
 
```
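The new guard exists because some remote-code models (the Ovis checkpoints among them) can come back from `from_pretrained` with mixed parameter dtypes; it casts only unquantized models and leaves quantized ones alone. The same pattern in isolation, as a standalone sketch:

```python
import torch
import torch.nn as nn


def normalize_dtype(model: nn.Module, target_dtype: torch.dtype) -> nn.Module:
    """Cast the model only if some parameter deviates from target_dtype."""
    if (getattr(model, "quantization_method", None) is None
            and any(p.dtype != target_dtype for p in model.parameters())):
        model = model.to(dtype=target_dtype)
    return model


# Example: a float32 module normalized to half precision.
layer = normalize_dtype(nn.Linear(4, 4), torch.float16)
assert all(p.dtype == torch.float16 for p in layer.parameters())
```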

tests/models/multimodal/generation/test_common.py

Lines changed: 26 additions & 1 deletion
```diff
@@ -476,6 +476,31 @@
         max_num_seqs=2,
         patch_hf_runner=model_utils.molmo_patch_hf_runner,
     ),
+    "ovis1_6-gemma2": VLMTestInfo(
+        models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<image>\n",  # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        dtype="half",
+        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
+        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
+        patch_hf_runner=model_utils.ovis_patch_hf_runner,
+        marks=[large_gpu_mark(min_gb=32)],
+    ),
+    "ovis1_6": VLMTestInfo(
+        models=["AIDC-AI/Ovis1.6-Llama3.2-3B"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful and honest multimodal assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<image>\n",  # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        dtype="half",
+        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
+        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
+        patch_hf_runner=model_utils.ovis_patch_hf_runner,
+    ),
     "ovis2": VLMTestInfo(
         models=["AIDC-AI/Ovis2-1B"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
@@ -486,7 +511,7 @@
         dtype="half",
         # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
         hf_model_kwargs={"llm_attn_implementation": "sdpa"},
-        patch_hf_runner=model_utils.ovis2_patch_hf_runner,
+        patch_hf_runner=model_utils.ovis_patch_hf_runner,
     ),
     "phi3v": VLMTestInfo(
         models=["microsoft/Phi-3.5-vision-instruct"],
```

tests/models/multimodal/generation/vlm_utils/model_utils.py

Lines changed: 11 additions & 6 deletions
```diff
@@ -678,20 +678,25 @@ def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
     return hf_model
 
 
-def ovis2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
     """Patches and returns an instance of the HfRunner to use for Ovis2."""
-    hf_model.model.visual_tokenizer.to(hf_model.dtype)
-    hf_model.model.vte.to(hf_model.dtype)
-    hf_model.model.llm.to(hf_model.dtype)
-
     hf_model.model.get_output_embeddings = lambda: \
         hf_model.model.llm.get_output_embeddings()
 
     def processor(*args, text="", images=None, **kwargs):
         text_tokenizer = hf_model.model.get_text_tokenizer()
         images = [images] if isinstance(images, Image) else images
 
-        text = text.split("<|im_start|>user\n")[1].split("<|im_end|>\n")[0]
+        prompt_start_and_end = {
+            "qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "llama":
+            ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
+            "gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
+        }
+        for start, end in prompt_start_and_end.values():
+            if start in text and end in text:
+                text = text.split(start)[1].split(end)[0]
+                break
 
         prompt, input_ids, pixel_values = hf_model.model.preprocess_inputs(
             text_or_conversations=text, images=images)
```
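The patched HF processor now has to strip whichever chat framing the test prompt uses, since Ovis2 prompts arrive in Qwen2 format while Ovis1.6 prompts arrive in Llama-3 or Gemma-2 format. The extraction step in isolation, applied to a hypothetical Gemma-2-style prompt:

```python
# Marker pairs copied from the patched processor above.
PROMPT_START_AND_END = {
    "qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
    "llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
    "gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
}


def extract_user_turn(text: str) -> str:
    """Return the user turn content, stripped of any known chat framing."""
    for start, end in PROMPT_START_AND_END.values():
        if start in text and end in text:
            return text.split(start)[1].split(end)[0]
    return text  # already plain text


prompt = ("<bos><start_of_turn>user\n<image>\nWhat is the season?"
          "<end_of_turn>\n<start_of_turn>model\n")
assert extract_user_turn(prompt) == "<image>\nWhat is the season?"
```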

tests/models/multimodal/processing/test_common.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -146,7 +146,8 @@ def _test_processing_correctness_hf(
     batch_idx: int,
     ignore_mm_keys: Optional[set[str]] = None,
 ):
-    if model_config.hf_config.model_type in ("mllama", "whisper", "ultravox"):
+    if model_config.hf_config.model_type in ("mllama", "ovis", "ultravox",
+                                             "whisper"):
         # For some multimodal models, tokenizer will always add bos_token
         # at the beginning of prompt by default, causing hf_processor outputs
         # incorrect token ids. So we need use `add_special_tokens=False` here
@@ -274,6 +275,8 @@ def _test_processing_correctness_mistral(
     "allenai/Molmo-7B-D-0924",
     "allenai/Molmo-7B-O-0924",
     "nvidia/NVLM-D-72B",
+    "AIDC-AI/Ovis1.6-Gemma2-9B",
+    "AIDC-AI/Ovis1.6-Llama3.2-3B",
     "AIDC-AI/Ovis2-1B",
     "google/paligemma-3b-mix-224",
     "google/paligemma2-3b-ft-docci-448",
```

tests/models/registry.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -355,9 +355,9 @@ def check_available_online(
                                         max_transformers_version="4.48",
                                         transformers_version_reason="Use of deprecated imports which have been removed.",  # noqa: E501
                                         extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"}),  # noqa: E501
-    "Ovis2ForConditionalGeneration": _HfExamplesInfo("AIDC-AI/Ovis2-1B",
-                                        trust_remote_code=True,
-                                        hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]}),  # noqa: E501
+    "Ovis": _HfExamplesInfo("AIDC-AI/Ovis2-1B", trust_remote_code=True,
+                            extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B",
+                                    "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}),  # noqa: E501
     "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
                                          trust_remote_code=True),
     "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409",  # noqa: E501
```

vllm/entrypoints/chat_utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -512,7 +512,7 @@ def _placeholder_str(self, modality: ModalityStr,
                              hf_config.image_token_index)
 
         if model_type in ("aya_vision", "chameleon", "deepseek_vl_v2",
-                          "internvl_chat", "ovis2", "skywork_chat",
+                          "internvl_chat", "ovis", "skywork_chat",
                           "NVLM_D", "h2ovl_chat", "idefics3", "smolvlm"):
             return "<image>"
         if model_type in ("mllama", "llama4"):
```

vllm/model_executor/models/aimv2.py

Lines changed: 2 additions & 125 deletions
```diff
@@ -5,129 +5,14 @@
 from typing import Optional
 
 import torch
-from torch import nn, softmax
+import torch.nn as nn
 from torch.nn import functional as F
-from torch.nn.functional import gumbel_softmax, pad
 
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
-from vllm.transformers_utils.configs.ovis2 import (AIMv2Config,
-                                                   Aimv2VisualTokenizerConfig)
-
-IMAGE_INDICATOR_IDS = [-301, -302, -303, -304,
-                       -305]  # kept for vocab prefixed tokens
-
-
-def st_argmax(y_soft: torch.Tensor, dim: int):  # straight-through softmax
-    index = y_soft.max(dim, keepdim=True)[1]
-    y_hard = torch.zeros_like(
-        y_soft, memory_format=torch.legacy_contiguous_format).scatter_(
-            dim, index, 1.0)
-    ret = y_hard - y_soft.detach() + y_soft
-    return ret
-
-
-class Aimv2VisualTokenizer(torch.nn.Module):
-
-    def __init__(self,
-                 config: Aimv2VisualTokenizerConfig,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = "",
-                 **kwargs):
-        super().__init__()
-        self.config = config
-        self.backbone = AIMv2Model(
-            config=config.backbone_config,  # noqa
-            quant_config=quant_config,
-            prefix=f"{prefix}.visual_tokenizer")
-        # reserved tokens for IMAGE_INDICATORS
-        head_dim = config.vocab_size - len(IMAGE_INDICATOR_IDS)
-        self.head = torch.nn.Sequential(
-            ReplicatedLinear(
-                config.backbone_config.hidden_size * config.hidden_stride *
-                config.hidden_stride,
-                head_dim,
-                bias=False,
-            ), torch.nn.LayerNorm(head_dim))
-
-    @property
-    def dtype(self):
-        return self.backbone.dtype
-
-    @property
-    def device(self):
-        return self.backbone.device
-
-    def tokenize(self, logits):
-        if self.config.tokenize_function == 'softmax':
-            tokens = softmax(logits, dim=-1)
-        elif self.config.tokenize_function == 'gumbel_argmax':
-            tokens = gumbel_softmax(logits, tau=self.config.tau, hard=True)
-        elif self.config.tokenize_function == 'st_argmax':
-            tokens = st_argmax(logits, dim=-1)
-        else:
-            raise ValueError(
-                'Invalid `max_type`, expected softmax or gumbel_argmax '
-                f'or st_argmax, but got {self.config.tokenize_function}')
-        return tokens
-
-    def encode(self, pixel_values):
-        features = self.backbone(pixel_values)
-        if self.config.drop_cls_token:
-            features = features[:, 1:, :]
-
-        # merge number of `hidden_stride * hidden_stride` hidden states together
-        # to reduce token sequence length
-        # e.g., for hidden_stride=2, this leads to a token length reduction:
-        # 1024 -> 256 for aimv2
-        if self.config.hidden_stride > 1:
-            # this `d` maybe different from the above `d``
-            n, L, d = features.shape
-            sqrt_l = int(L**0.5)
-            assert sqrt_l**2 == L, (
-                "The token sequence length should be a perfect square.")
-            features = features.reshape(n, sqrt_l, sqrt_l, d)
-            pl = (self.config.hidden_stride -
-                  (sqrt_l %
-                   self.config.hidden_stride)) % self.config.hidden_stride
-            features = pad(features, (0, 0, 0, pl, 0, pl), "constant", 0)
-            sqrt_l += pl
-            features = features.reshape(n, sqrt_l // self.config.hidden_stride,
-                                        self.config.hidden_stride,
-                                        sqrt_l // self.config.hidden_stride,
-                                        self.config.hidden_stride, d)
-            # [n, sqrt_l/hs, sqrt_l/hs, hs, hs, d]
-            features = features.permute(0, 1, 3, 2, 4, 5)
-            # [n, sqrt_l/hs, sqrt_l/hs, hs*hs*d]
-            features = features.flatten(3)
-            # [n, sqrt_l/hs*sqrt_l/hs, hs*hs*d]
-            features = features.reshape(
-                n, -1,
-                self.config.hidden_stride * self.config.hidden_stride * d)
-
-        return features
-
-    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
-        """[BatchSize, ImageShape] -> [BatchSize, Token, VocabSize]"""
-        features = self.encode(pixel_values)
-        logits, _ = self.head[0](
-            features)  # we spllit the sequncial here for not throwing an error
-        logits = self.head[1](logits)
-        tokens = self.tokenize(logits)
-        # tokens' shape is [BatchSize, #Token, VocabSize-5], so padding with
-        # [BatchSize, #Token, 5], after which, tokens' shape should become
-        # [BatchSize, #Token, VocabSize]
-        batch_size, token_len, _ = tokens.shape
-        padding_tensor = torch.zeros(size=(batch_size, token_len,
-                                           len(IMAGE_INDICATOR_IDS)),
-                                     dtype=tokens.dtype,
-                                     device=tokens.device,
-                                     layout=tokens.layout,
-                                     requires_grad=False)
-        tokens = torch.cat((tokens, padding_tensor), dim=2)
-        return tokens
+from vllm.transformers_utils.configs.ovis import AIMv2Config
 
 
 class AIMv2SwiGLUFFN(nn.Module):
@@ -302,14 +187,6 @@ def __init__(self,
                            quant_config=quant_config,
                            prefix=f"{prefix}.trunk")
 
-    @property
-    def dtype(self):
-        return self.trunk.blocks[0].attn.qkv.weight.dtype
-
-    @property
-    def device(self):
-        return self.trunk.blocks[0].attn.qkv.device
-
     def forward(
         self,
         pixel_values: torch.Tensor,
```
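The `Aimv2VisualTokenizer` and its `st_argmax` helper are removed from `aimv2.py` here (presumably relocated to the shared Ovis model code; the remaining changed files are not shown in this view). For reference, the straight-through argmax trick returns hard one-hot tokens in the forward pass while letting gradients flow through the soft probabilities; a simplified sketch of the removed helper:

```python
import torch


def st_argmax(y_soft: torch.Tensor, dim: int) -> torch.Tensor:
    """Straight-through argmax: hard one-hot forward, soft gradient backward."""
    index = y_soft.max(dim, keepdim=True)[1]
    y_hard = torch.zeros_like(y_soft).scatter_(dim, index, 1.0)
    # Forward value equals y_hard; the backward pass sees only y_soft.
    return y_hard - y_soft.detach() + y_soft


logits = torch.randn(2, 5, requires_grad=True)
tokens = st_argmax(torch.softmax(logits, dim=-1), dim=-1)
print(tokens)  # exact one-hot rows

loss = (tokens * torch.arange(5.0)).sum()  # any downstream use of the tokens
loss.backward()
print(logits.grad)  # non-zero: gradients reached the logits via the soft path
```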
