
Commit 38327cf

[Model] Aya Vision (#15441)

JenZhao and ywang96 authored

Signed-off-by: Jennifer Zhao <ai.jenniferzhao@gmail.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Roger Wang <ywang@roblox.com>

1 parent dfa82e2 · commit 38327cf

File tree: 10 files changed, +617 −2 lines changed

docs/source/models/supported_models.md

Lines changed: 7 additions & 0 deletions

@@ -753,6 +753,13 @@ See [this page](#generative-models) for more information on how to use generativ
   *
   * ✅︎
   * ✅︎
+- * `AyaVisionForConditionalGeneration`
+  * Aya Vision
+  * T + I<sup>+</sup>
+  * `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc.
+  *
+  * ✅︎
+  * ✅︎
 - * `Blip2ForConditionalGeneration`
   * BLIP-2
   * T + I<sup>E</sup>

examples/offline_inference/vision_language.py

Lines changed: 23 additions & 0 deletions

@@ -60,6 +60,28 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
     )


+# Aya Vision
+def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "CohereForAI/aya-vision-8b"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=2048,
+        max_num_seqs=2,
+        mm_processor_kwargs={"crop_to_patches": True},
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+    prompts = [
+        f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
+        for question in questions
+    ]
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # BLIP-2
 def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"

@@ -865,6 +887,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:

 model_example_map = {
     "aria": run_aria,
+    "aya_vision": run_aya_vision,
     "blip-2": run_blip2,
     "chameleon": run_chameleon,
     "deepseek_vl_v2": run_deepseek_vl2,

examples/offline_inference/vision_language_multi_image.py

Lines changed: 36 additions & 0 deletions

@@ -61,6 +61,41 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
     )


+def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "CohereForAI/aya-vision-8b"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            *placeholders,
+            {
+                "type": "text",
+                "text": question
+            },
+        ],
+    }]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_deepseek_vl2(question: str,
                       image_urls: list[str]) -> ModelRequestData:
     model_name = "deepseek-ai/deepseek-vl2-tiny"

@@ -526,6 +561,7 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:

 model_example_map = {
     "aria": load_aria,
+    "aya_vision": load_aya_vision,
     "deepseek_vl_v2": load_deepseek_vl2,
     "gemma3": load_gemma3,
     "h2ovl_chat": load_h2ovl,

tests/models/decoder_only/vision_language/test_models.py

Lines changed: 14 additions & 0 deletions

@@ -158,6 +158,20 @@
         max_tokens=64,
         marks=[large_gpu_mark(min_gb=64)],
     ),
+    "aya_vision": VLMTestInfo(
+        models=["CohereForAI/aya-vision-8b"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts({
+            "stop_sign": "<image>What's the content in the center of the image?",  # noqa: E501
+            "cherry_blossom": "<image>What is the season?",  # noqa: E501
+        }),
+        multi_image_prompt="<image><image>Describe the two images in detail.",  # noqa: E501
+        max_model_len=8192,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}}
+    ),
     "blip2": VLMTestInfo(
         # TODO: Change back to 2.7b once head_dim = 80 is supported
         models=["Salesforce/blip2-opt-6.7b"],

tests/models/multimodal/processing/test_common.py

Lines changed: 1 addition & 0 deletions

@@ -246,6 +246,7 @@ def _test_processing_correctness_mistral(
 # yapf: disable
 @pytest.mark.parametrize("model_id", [
     "rhymes-ai/Aria",
+    "CohereForAI/aya-vision-8b",
     "Salesforce/blip2-opt-2.7b",
     "facebook/chameleon-7b",
     "deepseek-ai/deepseek-vl2-tiny",

tests/models/registry.py

Lines changed: 1 addition & 0 deletions

@@ -259,6 +259,7 @@ def check_available_online(
 _MULTIMODAL_EXAMPLE_MODELS = {
     # [Decoder-only]
     "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
+    "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"),  # noqa: E501
     "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b",  # noqa: E501
                                                      extras={"6b": "Salesforce/blip2-opt-6.7b"}),  # noqa: E501
     "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"),  # noqa: E501

vllm/config.py

Lines changed: 4 additions & 0 deletions

@@ -2716,6 +2716,10 @@ def _get_and_verify_max_len(
             max_len_key = key if max_len < derived_max_model_len \
                 else max_len_key
             derived_max_model_len = min(derived_max_model_len, max_len)
+    # For Command-R / Cohere, Cohere2 / Aya Vision models
+    if tmp_max_len := getattr(hf_config, "model_max_length", None):
+        max_len_key = "model_max_length"
+        derived_max_model_len = tmp_max_len

     # If sliding window is manually disabled, max_length should be less
     # than the sliding window length in the model config.
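The added branch means that when the HF config exposes a model_max_length attribute (as the comment indicates Cohere-family and Aya Vision configs do), that value overrides the length derived from keys such as max_position_embeddings. A simplified sketch of that precedence, using made-up config objects rather than vLLM's real code path:

```python
# Simplified, illustrative sketch of the new precedence in _get_and_verify_max_len;
# the config objects here are invented and only mimic the relevant attributes.
from types import SimpleNamespace

def derive_max_len(hf_config) -> int:
    derived = getattr(hf_config, "max_position_embeddings", 2048)
    # New in this commit: honour model_max_length when the config defines it.
    if tmp_max_len := getattr(hf_config, "model_max_length", None):
        derived = tmp_max_len
    return derived

cohere_like = SimpleNamespace(max_position_embeddings=8192, model_max_length=16384)
plain = SimpleNamespace(max_position_embeddings=8192)
print(derive_max_len(cohere_like))  # 16384 -- model_max_length wins
print(derive_max_len(plain))        # 8192  -- falls back to the derived value
```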

vllm/entrypoints/chat_utils.py

Lines changed: 3 additions & 2 deletions

@@ -496,8 +496,9 @@ def _placeholder_str(self, modality: ModalityStr,
         if model_type.startswith("llava"):
             return self._cached_token_str(self._tokenizer,
                                           hf_config.image_token_index)
-        if model_type in ("chameleon", "deepseek_vl_v2", "internvl_chat",
-                          "skywork_chat", "NVLM_D", "h2ovl_chat"):
+        if model_type in ("aya_vision", "chameleon", "deepseek_vl_v2",
+                          "internvl_chat", "skywork_chat", "NVLM_D",
+                          "h2ovl_chat"):
             return "<image>"
         if model_type == "mllama":
             return "<|image|>"
