
Commit a7907d2

[0.9.1][Fixbug] Fix num_hidden_layers for Qwen2-Audio and Qwen2.5-Omni
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
1 parent 40c2c05 commit a7907d2

File tree

5 files changed: +81 -4 lines

- docs/source/user_guide/support_matrix/supported_models.md
- requirements-dev.txt
- tests/singlecard/test_offline_inference.py
- vllm_ascend/utils.py
- vllm_ascend/worker/model_runner.py

docs/source/user_guide/support_matrix/supported_models.md

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ Get the newest info here: https://github.com/vllm-project/vllm-ascend/issues/160
 | InternVL2 || |
 | InternVL2.5 || |
 | Qwen2-Audio || |
+| Qwen2.5-Omni || |
 | LLaVA-Next | | Need test |
 | LLaVA-Next-Video | | Need test |
 | Phi-3-Vison/Phi-3.5-Vison | | Need test |

requirements-dev.txt

Lines changed: 2 additions & 0 deletions
@@ -13,3 +13,5 @@ types-psutil
 networkx
 ray>=2.47.1
 protobuf>3.20.0
+librosa
+soundfile
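
The two new dev requirements appear to exist so the audio test below can decode its sample clips. As a minimal sketch of the kind of decoding these libraries provide (not part of the commit; `sample.wav` is a placeholder path):

```python
# Sketch only: decode a local clip with the newly added test dependencies.
# "sample.wav" is a placeholder, not a file shipped by the commit.
import librosa
import soundfile as sf

waveform, sample_rate = librosa.load("sample.wav", sr=None)  # keep native rate
print(f"decoded {len(waveform)} samples at {sample_rate} Hz")

# soundfile handles the inverse direction, e.g. writing a one-second copy back out.
sf.write("sample_trimmed.wav", waveform[:sample_rate], sample_rate)
```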

tests/singlecard/test_offline_inference.py

Lines changed: 48 additions & 3 deletions
@@ -27,6 +27,7 @@
 import vllm  # noqa: F401
 from modelscope import snapshot_download  # type: ignore[import-untyped]
 from vllm import SamplingParams
+from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset

 import vllm_ascend  # noqa: F401
@@ -36,7 +37,8 @@
     "Qwen/Qwen2.5-0.5B-Instruct",
     "Qwen/Qwen3-0.6B-Base",
 ]
-MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+MULTIMODALITY_VL_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+MULTIMODALITY_AUDIO_MODELS = ["Qwen/Qwen2-Audio-7B-Instruct"]

 QUANTIZATION_MODELS = [
     "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8",
@@ -47,6 +49,11 @@
     "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8",
 ]

+AUDIO_ASSETS = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
+AUDIO_PROMPT_TEMPLATES = {
+    1: "What is recited in the audio?",
+    2: "What sport and what nursery rhyme are referenced?"
+}

 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half", "float16"])
@@ -88,8 +95,8 @@ def test_quantization_models(model: str, max_tokens: int) -> None:
         vllm_model.generate_greedy(example_prompts, max_tokens)


-@pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
-def test_multimodal(model, prompt_template, vllm_runner):
+@pytest.mark.parametrize("model", MULTIMODALITY_VL_MODELS)
+def test_multimodal_vl(model, prompt_template, vllm_runner):
     image = ImageAsset("cherry_blossom") \
         .pil_image.convert("RGB")
     img_questions = [
@@ -131,3 +138,41 @@ def test_models_topk() -> None:
                     enforce_eager=True,
                     gpu_memory_utilization=0.7) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
+
+def prepare_audio_inputs(audio_count: int):
+    audio_prompt = "".join([
+        f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
+        for idx in range(audio_count)
+    ])
+    question = AUDIO_PROMPT_TEMPLATES[audio_count]
+    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+              "<|im_start|>user\n"
+              f"{audio_prompt}{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+    mm_data = {
+        "audio":
+        [asset.audio_and_sample_rate for asset in AUDIO_ASSETS[:audio_count]]
+    }
+    inputs = {"prompt": prompt, "multi_modal_data": mm_data}
+    return inputs
+
+
+@pytest.mark.parametrize("model", MULTIMODALITY_AUDIO_MODELS)
+@pytest.mark.parametrize("audio_count", [2])
+@pytest.mark.parametrize("max_tokens", [10])
+def test_multimodal_audio(model: str, audio_count: int,
+                          max_tokens: int) -> None:
+    inputs = prepare_audio_inputs(audio_count)
+
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=max_tokens,
+                                     stop_token_ids=None)
+
+    with VllmRunner(model,
+                    max_model_len=4096,
+                    max_num_seqs=5,
+                    enforce_eager=False,
+                    dtype="bfloat16",
+                    limit_mm_per_prompt={"audio": audio_count},
+                    gpu_memory_utilization=0.9) as vllm_model:
+        vllm_model.generate(inputs, sampling_params=sampling_params)
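
Outside the test harness, the same audio path can be exercised with vLLM's offline API directly. A hedged sketch of that equivalent call is below; it assumes the Qwen2-Audio weights are reachable and that this build accepts audio multimodal inputs (the prompt mirrors the template in `prepare_audio_inputs`):

```python
# Standalone sketch (not part of the commit) of the offline inference the test
# drives through VllmRunner; assumes Qwen/Qwen2-Audio-7B-Instruct is available.
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset

audio = AudioAsset("mary_had_lamb").audio_and_sample_rate  # (waveform, sample_rate)

prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
          "<|im_start|>user\n"
          "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
          "What is recited in the audio?<|im_end|>\n"
          "<|im_start|>assistant\n")

llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct",
          max_model_len=4096,
          limit_mm_per_prompt={"audio": 1})
outputs = llm.generate({"prompt": prompt, "multi_modal_data": {"audio": [audio]}},
                       SamplingParams(temperature=0.2, max_tokens=10))
print(outputs[0].outputs[0].text)
```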

vllm_ascend/utils.py

Lines changed: 26 additions & 1 deletion
@@ -175,6 +175,27 @@ def vllm_version_is(target_vllm_version: str):
                      "format of x.y.z.")


+def get_max_hidden_layers(hf_config) -> int:
+    cfg_dict = hf_config.to_dict()
+    layer_counts = []
+
+    def _rec_find(d):
+        if isinstance(d, dict):
+            for k, v in d.items():
+                if k == "num_hidden_layers" and isinstance(v, int):
+                    layer_counts.append(v)
+                else:
+                    _rec_find(v)
+        elif isinstance(d, list):
+            for item in d:
+                _rec_find(item)
+
+    _rec_find(cfg_dict)
+    if not layer_counts:
+        raise ValueError("num_hidden_layers not found in model config.")
+    return max(layer_counts)
+
+
 def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
     """Update ACL graph capture sizes based on hardware limitations"""
     # Store original configuration and temporarily clear it
@@ -204,7 +225,11 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
         return

     # Calculate parallel configuration factor
-    num_hidden_layers = vllm_config.model_config.hf_config.num_hidden_layers
+    hf_config = vllm_config.model_config.hf_config
+    if hasattr(hf_config, 'num_hidden_layers'):
+        num_hidden_layers = hf_config.num_hidden_layers
+    else:
+        num_hidden_layers = get_max_hidden_layers(hf_config)
     parallel_config = vllm_config.parallel_config

     # TODO: Find out whether we need to take into account the pp_size
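
`get_max_hidden_layers` walks the whole config dictionary because audio and omni models such as Qwen2-Audio and Qwen2.5-Omni nest per-component sub-configs, each carrying its own `num_hidden_layers`, instead of exposing a single top-level value; the ACL graph sizing then uses the deepest stack found. A self-contained sketch of that behaviour, using a stand-in config object with illustrative layer counts rather than the real HuggingFace configs:

```python
# Sketch only: StubConfig mimics a nested HuggingFace config; the layer counts
# are illustrative, not the real Qwen2-Audio values.
from vllm_ascend.utils import get_max_hidden_layers


class StubConfig:
    """Stand-in for a config whose to_dict() nests sub-model configs."""

    def to_dict(self):
        return {
            "model_type": "qwen2_audio",
            "audio_config": {"num_hidden_layers": 32},
            "text_config": {"num_hidden_layers": 28},
        }


assert get_max_hidden_layers(StubConfig()) == 32  # the largest nested count wins
```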

vllm_ascend/worker/model_runner.py

Lines changed: 4 additions & 0 deletions
@@ -866,6 +866,8 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup,
                     "mrope embedding type requires multi-modal input mapper "
                     "returns 'image_grid_thw' or 'video_grid_thw'.")
             second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None)
+            audio_feature_lengths = mm_kwargs.get("audio_feature_lengths", None)
+            use_audio_in_video = mm_kwargs.get("use_audio_in_video", False)

             hf_config = self.runner.model_config.hf_config

@@ -884,6 +886,8 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup,
                 second_per_grid_ts=second_per_grid_ts,
                 context_len=inter_data.context_lens[seq_idx],
                 seq_len=inter_data.seq_lens[seq_idx],
+                audio_feature_lengths=audio_feature_lengths,
+                use_audio_in_video=use_audio_in_video,
             )

         seq_data.mrope_position_delta = mrope_position_delta
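
The audio-related kwargs are read with `mm_kwargs.get(...)` defaults, presumably so image- and video-only models, whose multimodal mappers never emit these keys, follow the existing MRoPE path unchanged. A minimal illustration of that fallback with made-up kwargs dictionaries:

```python
# Illustration only: both kwargs dicts are made up. An image-only model lacks
# the audio keys, so the .get() defaults leave the MRoPE inputs unchanged.
image_only_kwargs = {"image_grid_thw": [[1, 34, 46]]}
omni_style_kwargs = {"audio_feature_lengths": [3000], "use_audio_in_video": True}

for mm_kwargs in (image_only_kwargs, omni_style_kwargs):
    audio_feature_lengths = mm_kwargs.get("audio_feature_lengths", None)
    use_audio_in_video = mm_kwargs.get("use_audio_in_video", False)
    print(audio_feature_lengths, use_audio_in_video)
# prints: None False
#         [3000] True
```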
