
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"

+MODELS = ["Qwen/QwQ-32B", "deepseek-ai/DeepSeek-V2-Lite"]
+DIST_EXECUTOR_BACKENDS = ["mp", "ray"]
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("distributed_executor_backend",
+                         DIST_EXECUTOR_BACKENDS)
+def test_models_distributed(model: str,
+                            distributed_executor_backend: str,
+                            monkeypatch: pytest.MonkeyPatch,
+                            ) -> None:
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_MODELSCOPE", "True")
+        example_prompts = [
+            "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
+            "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
+            "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
+        ]
+        dtype = "half"
+        max_tokens = 5
+        with VllmRunner(
+                model,
+                dtype=dtype,
+                tensor_parallel_size=4,
+                distributed_executor_backend=distributed_executor_backend,
+        ) as vllm_model:
+            vllm_model.generate_greedy(example_prompts, max_tokens)
-def test_models_distributed_QwQ():
-    example_prompts = [
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
-        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
-        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
-    ]
-    dtype = "half"
-    max_tokens = 5
-    with VllmRunner(
-            "Qwen/QwQ-32B",
-            dtype=dtype,
-            tensor_parallel_size=4,
-            distributed_executor_backend="mp",
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
-
-
-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "1",
-                    reason="deepseek v2 lite is not supported on v1")
-def test_models_distributed_DeepSeek():
-    example_prompts = [
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
-        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
-        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
-    ]
-    dtype = "half"
-    max_tokens = 5
-    with VllmRunner(
-            "deepseek-ai/DeepSeek-V2-Lite",
-            dtype=dtype,
-            tensor_parallel_size=4,
-            distributed_executor_backend="mp",
-    ) as vllm_model:
-        vllm_model.generate_greedy(example_prompts, max_tokens)
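
Note: stacking two pytest.mark.parametrize decorators makes pytest collect the cross product of the argument lists, so the single parametrized test above replaces the two hand-written tests with 2 models x 2 backends = 4 collected cases (adding "ray" coverage the old tests lacked). A minimal self-contained sketch of that expansion, reusing the same lists; the test name and body here are illustrative, not part of the PR:

import pytest

MODELS = ["Qwen/QwQ-32B", "deepseek-ai/DeepSeek-V2-Lite"]
DIST_EXECUTOR_BACKENDS = ["mp", "ray"]

# Stacked parametrize decorators expand to the cross product:
# pytest collects len(MODELS) * len(DIST_EXECUTOR_BACKENDS) = 4 tests.
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKENDS)
def test_param_cross_product(model: str, distributed_executor_backend: str) -> None:
    # Each collected test receives exactly one (model, backend) pair.
    assert model in MODELS
    assert distributed_executor_backend in DIST_EXECUTOR_BACKENDS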
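
The new test also scopes VLLM_USE_MODELSCOPE with monkeypatch.context() instead of mutating os.environ at module level, so the variable is set only for the duration of the with block and reverted afterwards, preventing leakage into other tests. A small standalone sketch of that behavior (hypothetical test name, not part of the PR):

import os

import pytest

def test_env_var_is_reverted(monkeypatch: pytest.MonkeyPatch) -> None:
    before = os.environ.get("VLLM_USE_MODELSCOPE")
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_MODELSCOPE", "True")
        # Inside the block the variable is visible to any code that reads it.
        assert os.environ["VLLM_USE_MODELSCOPE"] == "True"
    # On exit, monkeypatch.context() undoes the modification.
    assert os.environ.get("VLLM_USE_MODELSCOPE") == before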