1 change: 1 addition & 0 deletions docs/models/supported_models.md
@@ -562,6 +562,7 @@ Specified using `--task generate`.
| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-research/Tarsier-7b`,`omni-research/Tarsier-34b` | | ✅︎ | ✅︎ |
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`,`omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ |

<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
&nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:
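For Tarsier2, the override described in the footnote above looks like the following minimal offline-inference sketch (model name taken from the new table row); the equivalent server-side form is `--hf-overrides '{"architectures": ["Tarsier2ForConditionalGeneration"]}'`.

# Minimal sketch: map the Tarsier2 checkpoint, which reports a llava-style
# architecture, onto vLLM's Tarsier2ForConditionalGeneration via hf_overrides.
from vllm import LLM

llm = LLM(
    model="omni-research/Tarsier2-Recap-7b",
    hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
)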
32 changes: 32 additions & 0 deletions examples/offline_inference/vision_language.py
@@ -1040,6 +1040,37 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
)


def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
model_name = "omni-research/Tarsier2-Recap-7b"

engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
limit_mm_per_prompt={modality: 1},
)

if modality == "image":
placeholder = "<|image_pad|>"
elif modality == "video":
placeholder = "<|video_pad|>"

prompts = [
(
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
for question in questions
]

return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)


# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@@ -1112,6 +1143,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
"skywork_chat": run_skyworkr1v,
"smolvlm": run_smolvlm,
"tarsier": run_tarsier,
"tarsier2": run_tarsier2,
}


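A hedged usage sketch for the new run_tarsier2 helper, mirroring how the example script consumes ModelRequestData; the image path is a placeholder.

from dataclasses import asdict

from PIL import Image
from vllm import LLM, SamplingParams

# Build engine args and a chat-formatted prompt via the helper added above.
req = run_tarsier2(["Describe the image."], modality="image")
llm = LLM(**asdict(req.engine_args))

outputs = llm.generate(
    {
        "prompt": req.prompts[0],
        # Placeholder image; any PIL image works here.
        "multi_modal_data": {"image": Image.open("example.jpg")},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)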
27 changes: 27 additions & 0 deletions examples/offline_inference/vision_language_multi_image.py
@@ -828,6 +828,32 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
)


def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "omni-research/Tarsier2-Recap-7b"

engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=32768,
limit_mm_per_prompt={"image": len(image_urls)},
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
)

prompt = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{'<|image_pad|>' * len(image_urls)}"
f"<|vision_end|>{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
image_data = [fetch_image(url) for url in image_urls]

return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=image_data,
)


model_example_map = {
"aria": load_aria,
"aya_vision": load_aya_vision,
@@ -853,6 +879,7 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
"qwen2_5_vl": load_qwen2_5_vl,
"smolvlm": load_smolvlm,
"tarsier": load_tarsier,
"tarsier2": load_tarsier2,
}


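A similar hedged sketch for the multi-image loader added above; the URLs are placeholders, and the fetched images are passed through as a list.

from dataclasses import asdict

from vllm import LLM, SamplingParams

urls = ["https://example.com/a.jpg", "https://example.com/b.jpg"]  # placeholders
req = load_tarsier2("What differs between the two images?", urls)
llm = LLM(**asdict(req.engine_args))

outputs = llm.generate(
    {
        "prompt": req.prompt,
        # The loader already fetched the images as PIL objects.
        "multi_modal_data": {"image": req.image_data},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)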
1 change: 1 addition & 0 deletions tests/models/multimodal/processing/test_common.py
@@ -284,6 +284,7 @@ def _test_processing_correctness_one(
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
"openai/whisper-large-v3",
"omni-research/Tarsier-7b",
"omni-research/Tarsier2-Recap-7b"
])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
2 changes: 2 additions & 0 deletions tests/models/registry.py
@@ -397,6 +397,8 @@ def check_available_online(
trust_remote_code=True),
"TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b", # noqa: E501
hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}), # noqa: E501
"Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b", # noqa: E501
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}), # noqa: E501
# [Encoder-decoder]
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Therefore, we borrow the BartTokenizer from the original Bart model
89 changes: 88 additions & 1 deletion vllm/model_executor/models/qwen2_vl.py
@@ -32,12 +32,14 @@
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat
from transformers import BatchFeature
from transformers import AutoConfig, BatchFeature
from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
Qwen2VLProcessor)
from transformers.models.qwen2_vl.configuration_qwen2_vl import (
Qwen2VLConfig, Qwen2VLVisionConfig)
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
from transformers.models.qwen2_vl.video_processing_qwen2_vl import (
Qwen2VLVideoProcessor)

from vllm.config import VllmConfig
from vllm.distributed import parallel_state, tensor_model_parallel_all_gather
@@ -69,6 +71,7 @@
from vllm.transformers_utils.config import uses_mrope
from vllm.transformers_utils.processor import (
cached_image_processor_from_config)
from vllm.transformers_utils.tokenizer import AnyTokenizer

from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
SupportsMultiModal, SupportsPP)
@@ -1403,3 +1406,87 @@ def get_mm_mapping(self) -> MultiModelKeys:
connector="visual.merger.",
tower_model="visual.",
)


class Tarsier2MultiModalProcessor(Qwen2VLMultiModalProcessor):
pass


class Tarsier2ImageProcessor(Qwen2VLImageProcessor):

def __init__(
self,
size: Optional[dict[str, int]] = None,
**kwargs,
) -> None:
if size is not None and "min_pixels" in size and "max_pixels" in size:
# Remap if Tarsier2-specific format is provided
remapped_size = {
"shortest_edge": size["min_pixels"],
"longest_edge": size["max_pixels"]
}
super().__init__(size=remapped_size, **kwargs)
else:
super().__init__(size=size, **kwargs)


class Tarsier2Processor(Qwen2VLProcessor):

def __init__(
self,
vision_config: dict,
tokenizer: AnyTokenizer,
**kwargs,
):
self.image_processor = Tarsier2ImageProcessor(**vision_config)
super().__init__(image_processor=self.image_processor,
tokenizer=tokenizer,
video_processor=Qwen2VLVideoProcessor(),
chat_template=None,
**kwargs)


class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo):

def get_hf_config(self) -> Qwen2VLConfig:
model_path = self.ctx.model_config.model
original_config = AutoConfig.from_pretrained(model_path)
config_dict = original_config.to_dict()
correct_config = Qwen2VLConfig.from_dict(config_dict)

return correct_config

def get_hf_processor(self, **kwargs: object) -> Tarsier2Processor:
return Tarsier2Processor(
vision_config=self.ctx.get_hf_image_processor_config(),
tokenizer=self.get_tokenizer(),
**kwargs,
)

def get_image_processor(self) -> Tarsier2ImageProcessor:
return Tarsier2ImageProcessor(
**self.ctx.get_hf_image_processor_config())


@MULTIMODAL_REGISTRY.register_processor(Tarsier2MultiModalProcessor,
info=Tarsier2ProcessingInfo,
dummy_inputs=Qwen2VLDummyInputsBuilder)
class Tarsier2ForConditionalGeneration(Qwen2VLForConditionalGeneration):
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
"vision_tower.": "visual.",
})

def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
# Tarsier2 uses llava as model_type, which will create a Qwen2VLConfig
# as text_config, we need to reconstruct Qwen2VLConfig from LlavaConfig.
config = vllm_config.model_config.hf_config
qwen2vl_config = config.text_config
qwen2vl_config.architectures = config.architectures
vllm_config.model_config.hf_config = qwen2vl_config
super().__init__(vllm_config=vllm_config, prefix=prefix)

def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]:

loader = AutoWeightsLoader(self)
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
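An illustrative sketch (not the actual WeightsMapper implementation) of the prefix remapping declared in hf_to_vllm_mapper above: checkpoint parameter names beginning with "vision_tower." are loaded into vLLM modules prefixed "visual.". The parameter name below is only an example.

def remap(name: str) -> str:
    # Mirror of the orig_to_new_prefix mapping used by the class above.
    prefix_map = {"vision_tower.": "visual."}
    for old, new in prefix_map.items():
        if name.startswith(old):
            return new + name[len(old):]
    return name

assert remap("vision_tower.blocks.0.attn.qkv.weight") == "visual.blocks.0.attn.qkv.weight"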
1 change: 1 addition & 0 deletions vllm/model_executor/models/registry.py
@@ -216,6 +216,7 @@
"UltravoxModel": ("ultravox", "UltravoxModel"),
"Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
"TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501
"Tarsier2ForConditionalGeneration": ("qwen2_vl", "Tarsier2ForConditionalGeneration"), # noqa: E501
# [Encoder-decoder]
"Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"), # noqa: E501
"MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501