diff --git a/tests/conftest.py b/tests/conftest.py
index 7b060f3a07..16bbc8027d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -78,6 +78,7 @@ def __init__(
         enable_chunked_prefill: bool = False,
         swap_space: int = 4,
         enforce_eager: Optional[bool] = True,
+        quantization: Optional[str] = None,
         **kwargs,
     ) -> None:
         self.model = LLM(
@@ -94,6 +95,7 @@ def __init__(
             max_model_len=max_model_len,
             block_size=block_size,
             enable_chunked_prefill=enable_chunked_prefill,
+            quantization=quantization,
             **kwargs,
         )
 
diff --git a/tests/singlecard/test_offline_inference.py b/tests/singlecard/test_offline_inference.py
index a65451d4ff..5b58df7928 100644
--- a/tests/singlecard/test_offline_inference.py
+++ b/tests/singlecard/test_offline_inference.py
@@ -25,6 +25,7 @@
 import pytest
 import vllm  # noqa: F401
+from modelscope import snapshot_download  # type: ignore[import-untyped]
 from vllm import SamplingParams
 from vllm.assets.image import ImageAsset
@@ -33,10 +34,13 @@
 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",
-    "vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8",
     "Qwen/Qwen3-0.6B-Base",
 ]
 MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+
+QUANTIZATION_MODELS = [
+    "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new",
+]
 
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 
@@ -59,6 +63,27 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
             vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
+@pytest.mark.parametrize("model", QUANTIZATION_MODELS)
+@pytest.mark.parametrize("max_tokens", [5])
+def test_quantization_models(model: str, max_tokens: int) -> None:
+    prompt = "The following numbers of the sequence " + ", ".join(
+        str(i) for i in range(1024)) + " are:"
+    example_prompts = [prompt]
+
+    # NOTE: Loading the quantized model directly by its modelscope repo id
+    # currently fails; https://github.com/vllm-project/vllm/pull/19212 fixes
+    # this, so the explicit download can be dropped once that PR is merged.
+    model_path = snapshot_download(model)
+
+    with VllmRunner(model_path,
+                    max_model_len=8192,
+                    enforce_eager=True,
+                    dtype="auto",
+                    gpu_memory_utilization=0.7,
+                    quantization="ascend") as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
 @pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
 def test_multimodal(model, prompt_template, vllm_runner):
     image = ImageAsset("cherry_blossom")
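
For context, the new quantization kwarg added to VllmRunner.__init__ is forwarded unchanged to vllm.LLM, so the new test is equivalent to constructing the engine directly. Below is a minimal sketch of that path, assuming the vllm-ascend plugin is installed (which registers the "ascend" quantization method) and the W8A8 weights are reachable; the prompt and print are illustrative only.

    # Sketch only: mirrors what test_quantization_models exercises via VllmRunner.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new",  # or a local snapshot path
        max_model_len=8192,
        enforce_eager=True,
        gpu_memory_utilization=0.7,
        quantization="ascend",  # same value VllmRunner now passes through to LLM
    )
    outputs = llm.generate(["1, 2, 3, 4, 5 are:"],
                           SamplingParams(temperature=0.0, max_tokens=5))
    print(outputs[0].outputs[0].text)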