
Commit c2e1332

russellb and NickLucche authored and committed
[v1] Add Whisper model support (encoder-decoder) (vllm-project#21088)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: NickLucche <nlucches@redhat.com>
Signed-off-by: xuebwang-amd <xuebwang@amd.com>

1 parent 215180c · commit c2e1332

File tree

Showing 31 changed files with 429 additions and 92 deletions.
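
What this enables, end to end: Whisper now runs as an encoder-decoder model on the V1 engine. A minimal offline-transcription sketch, modeled on the example script updated below — the AudioAsset helper and the "<|startoftranscript|>" prompt format are taken from vLLM's Whisper examples, and the spawn setting mirrors this commit's test changes:

    import os

    # Whisper on V1 currently needs spawn-based workers (see the pipeline
    # and test changes below).
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

    from vllm import LLM, SamplingParams
    from vllm.assets.audio import AudioAsset

    llm = LLM(
        model="openai/whisper-large-v3-turbo",
        max_model_len=448,  # matches the example script in this commit
    )

    # Any (waveform, sample_rate) tuple works as the audio input.
    audio = AudioAsset("mary_had_lamb").audio_and_sample_rate

    outputs = llm.generate(
        {
            "prompt": "<|startoftranscript|>",
            "multi_modal_data": {"audio": audio},
        },
        SamplingParams(temperature=0, max_tokens=200),
    )
    print(outputs[0].outputs[0].text)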

.buildkite/test-pipeline.yaml

Lines changed: 3 additions & 3 deletions
@@ -321,7 +321,6 @@ steps:
   - python3 offline_inference/vision_language_pooling.py --seed 0
   - python3 offline_inference/vision_language_multi_image.py --seed 0
   - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-  - python3 offline_inference/encoder_decoder.py
   - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
   - python3 offline_inference/basic/classify.py
   - python3 offline_inference/basic/embed.py
@@ -644,7 +643,7 @@ steps:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pip freeze | grep -E 'torch'
   - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
-  - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+  - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
 
 - label: Multi-Modal Models Test (Extended) 1
   mirror_hardwares: [amdexperimental]
@@ -818,7 +817,8 @@ steps:
   # Avoid importing model tests that cause CUDA reinitialization error
   - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/language -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
   # test sequence parallel
   - pytest -v -s distributed/test_sequence_parallel.py
   # this test fails consistently.
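
The VLLM_WORKER_MULTIPROC_METHOD=spawn prefix in these steps works around the standard CUDA-and-fork constraint: once the parent process has initialized CUDA, a forked child cannot re-initialize it, so worker processes must be spawned instead. A standalone illustration of the failure mode (not vLLM code, just the underlying behavior, assuming a CUDA-capable torch install):

    import multiprocessing as mp

    def use_gpu() -> None:
        import torch
        torch.zeros(1, device="cuda")  # initializes CUDA in the child

    if __name__ == "__main__":
        import torch
        torch.zeros(1, device="cuda")  # parent touches CUDA first
        # With "fork", the child inherits a poisoned CUDA context and fails;
        # "spawn" starts a fresh interpreter and succeeds.
        ctx = mp.get_context("spawn")
        p = ctx.Process(target=use_gpu)
        p.start()
        p.join()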

examples/offline_inference/encoder_decoder.py

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@
 encoder/decoder models, specifically BART and mBART.
 
 This script is refactored to allow model selection via command-line arguments.
+
+NOTE: This example is not yet supported in V1.
 """
 
 import argparse
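
Until BART/mBART land in V1, running this example presumably means pinning the V0 engine, mirroring the VLLM_USE_V1=0 usage in the pipeline step above. A sketch, assuming the flag must be set before vLLM is imported:

    import os

    # Assumption: engine selection reads this at import/startup time.
    os.environ["VLLM_USE_V1"] = "0"

    from vllm import LLM  # noqa: E402

    llm = LLM(model="facebook/bart-large-cnn")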

examples/offline_inference/encoder_decoder_multimodal.py

Lines changed: 3 additions & 0 deletions
@@ -5,6 +5,7 @@
 the explicit/implicit prompt format on enc-dec LMMs for text generation.
 """
 
+import os
 import time
 from collections.abc import Sequence
 from dataclasses import asdict
@@ -130,6 +131,8 @@ def run_mllama():
 
 
 def run_whisper():
+    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
     engine_args = EngineArgs(
         model="openai/whisper-large-v3-turbo",
         max_model_len=448,
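
The truncated EngineArgs(...) above pairs with the from dataclasses import asdict visible in the first hunk: the example scripts presumably build an EngineArgs dataclass and expand it into LLM's keyword arguments. A sketch of that pattern:

    from dataclasses import asdict

    from vllm import LLM, EngineArgs

    engine_args = EngineArgs(
        model="openai/whisper-large-v3-turbo",
        max_model_len=448,
    )
    # Expand the dataclass into LLM's keyword arguments.
    llm = LLM(**asdict(engine_args))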

tests/encoder_decoder/test_e2e_correctness.py

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@ def clear_cache():
     current_platform.is_cpu(),
     reason="CPU backend is not currently supported with encoder/decoder models"
 )
+@pytest.mark.skip(reason="bart not supported in V1")
 def test_encoder_decoder_e2e(
     hf_runner,
     vllm_runner,

tests/entrypoints/openai/test_encoder_decoder.py

Lines changed: 1 addition & 0 deletions
@@ -30,6 +30,7 @@ async def client(server):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.skip(reason="bart is not yet supported in V1")
 async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
     completion = await client.completions.create(model=model_name,
                                                  prompt="Hello, my name is",

tests/models/language/generation/test_bart.py

Lines changed: 2 additions & 0 deletions
@@ -178,6 +178,7 @@ def run_test(
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
+@pytest.mark.skip(reason="bart not supported in V1")
 def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
                 dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None:
 
@@ -201,6 +202,7 @@ def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
+@pytest.mark.skip(reason="bart not supported in V1")
 def test_models_distributed(hf_runner, vllm_runner,
                             example_encoder_decoder_prompts,
                             distributed_executor_backend, model, dtype,

tests/models/multimodal/generation/test_whisper.py

Lines changed: 1 addition & 2 deletions
@@ -122,8 +122,7 @@ def run_test(
 
 
 @pytest.mark.core_model
-@pytest.mark.parametrize(
-    "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
+@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
 @create_new_process_for_each_test()
 def test_models(vllm_runner, model) -> None:
     run_test(

tests/models/multimodal/processing/test_tensor_schema.py

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@
 
 ARCH_TO_SKIP = {
     "MolmoForCausalLM": "incompatible requirements",
+    "Florence2ForConditionalGeneration": "not supported in V1",
 }
 ARCH_NEEDS_EXTRAS = [
     "InternVLChatModel",

tests/models/test_initialization.py

Lines changed: 6 additions & 0 deletions
@@ -68,6 +68,12 @@ def _initialize_kv_caches_v1(self, vllm_config):
             # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
             # L4 supports FA3.
             m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
+        if model_arch == "Florence2ForConditionalGeneration":
+            # An encoder-decoder model that's V0-only. Just skip it
+            # since V0 is about to be removed.
+            pytest.skip("Skipping Florence2ForConditionalGeneration")
+        if model_arch == "WhisperForConditionalGeneration":
+            m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
         LLM(
             model_info.default,
             tokenizer=model_info.tokenizer,
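
The m.setenv calls rely on pytest's monkeypatch context manager, which restores the environment when the block exits, so the spawn requirement doesn't leak into other tests in the session. A generic sketch of the pattern:

    import os

    import pytest

    def test_spawn_env_is_scoped(monkeypatch: pytest.MonkeyPatch) -> None:
        with monkeypatch.context() as m:
            m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
            assert os.environ["VLLM_WORKER_MULTIPROC_METHOD"] == "spawn"
        # On exit, the previous value (or absence) is restored automatically.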

tests/v1/test_oracle.py

Lines changed: 0 additions & 1 deletion
@@ -10,7 +10,6 @@
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 
 UNSUPPORTED_MODELS_V1 = [
-    "openai/whisper-large-v3",  # transcription
     "facebook/bart-large-cnn",  # encoder decoder
 ]
 
