 
 import pytest
 import vllm  # noqa: F401
+from modelscope import snapshot_download  # type: ignore[import-untyped]
 from vllm import SamplingParams
 from vllm.assets.image import ImageAsset
 
 
 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",
-    "vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8",
     "Qwen/Qwen3-0.6B-Base",
 ]
 MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+
+QUANTIZATION_MODELS = [
+    "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new",
+]
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 
 
@@ -59,6 +63,27 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
+@pytest.mark.parametrize("model", QUANTIZATION_MODELS)
+@pytest.mark.parametrize("max_tokens", [5])
+def test_quantization_models(model: str, max_tokens: int) -> None:
+    prompt = "The following numbers of the sequence " + ", ".join(
+        str(i) for i in range(1024)) + " are:"
+    example_prompts = [prompt]
+
+    # NOTE: Loading the quantized model by its ModelScope repo id currently
+    # hits an issue; https://github.com/vllm-project/vllm/pull/19212 fixes it.
+    # Once that PR is merged, the explicit download below can be dropped.
+    model_path = snapshot_download(model)
+
+    with VllmRunner(model_path,
+                    max_model_len=8192,
+                    enforce_eager=True,
+                    dtype="auto",
+                    gpu_memory_utilization=0.7,
+                    quantization="ascend") as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
 @pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
 def test_multimodal(model, prompt_template, vllm_runner):
     image = ImageAsset("cherry_blossom") \
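The NOTE in the new test refers to vllm-project/vllm#19212 as the fix that should make the explicit snapshot_download() step unnecessary. Below is a minimal sketch, not part of this change, of how the test might look once that fix lands, assuming vLLM's VLLM_USE_MODELSCOPE environment variable lets the ModelScope repo id be passed straight to VllmRunner; the test name and prompt are illustrative only.

import os

import pytest

# Assumption: ask vLLM to resolve model ids against ModelScope instead of Hugging Face.
os.environ["VLLM_USE_MODELSCOPE"] = "True"

@pytest.mark.parametrize("model", QUANTIZATION_MODELS)
@pytest.mark.parametrize("max_tokens", [5])
def test_quantization_models_direct(model: str, max_tokens: int) -> None:
    # Hypothetical variant: the repo id goes straight to VllmRunner,
    # with no snapshot_download() step.
    with VllmRunner(model,
                    max_model_len=8192,
                    enforce_eager=True,
                    dtype="auto",
                    gpu_memory_utilization=0.7,
                    quantization="ascend") as vllm_model:
        vllm_model.generate_greedy(["The future of AI is"], max_tokens)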