# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/blob/main/tests/entrypoints/llm/test_accuracy.py
#
import gc
import multiprocessing
import os
import signal
import subprocess
import sys
import time
from multiprocessing import Queue

import lm_eval
import pytest
import requests
import torch

SERVER_HOST = "127.0.0.1"
SERVER_PORT = 8000
HEALTH_URL = f"http://{SERVER_HOST}:{SERVER_PORT}/health"
COMPLETIONS_URL = f"http://{SERVER_HOST}:{SERVER_PORT}/v1/completions"

# Pre-trained model paths on Hugging Face.
# Qwen/Qwen2.5-0.5B-Instruct: accuracy test for unimodal model and DP.
# Qwen/Qwen2.5-VL-3B-Instruct: accuracy test for multimodal model.
# Qwen/Qwen3-30B-A3B: accuracy test for EP.
# deepseek-ai/DeepSeek-V2-Lite: accuracy test for TP.
MODEL_NAME = [
    "Qwen/Qwen2.5-0.5B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct",
    "Qwen/Qwen3-30B-A3B", "deepseek-ai/DeepSeek-V2-Lite"
]

# Benchmark configuration mapping models to evaluation tasks:
# - Text models: GSM8K (grade school math reasoning)
# - Vision-language model: MMMU Art & Design validation (multimodal understanding)
TASK = {
    "Qwen/Qwen2.5-0.5B-Instruct": "gsm8k",
    "Qwen/Qwen2.5-VL-3B-Instruct": "mmmu_val_art_and_design",
    "Qwen/Qwen3-30B-A3B": "gsm8k",
    "deepseek-ai/DeepSeek-V2-Lite": "gsm8k"
}
# Answer validation requiring format consistency.
FILTER = {
    "Qwen/Qwen2.5-0.5B-Instruct": "exact_match,strict-match",
    "Qwen/Qwen2.5-VL-3B-Instruct": "acc,none",
    "Qwen/Qwen3-30B-A3B": "exact_match,strict-match",
    "deepseek-ai/DeepSeek-V2-Lite": "exact_match,strict-match"
}
# Tolerance of 0.03, applied as an absolute margin around the expected accuracy.
RTOL = 0.03
# Baseline accuracy measured with vLLM optimizations enabled.
EXPECTED_VALUE = {
    "Qwen/Qwen2.5-0.5B-Instruct": 0.316,
    "Qwen/Qwen2.5-VL-3B-Instruct": 0.541,
    "Qwen/Qwen3-30B-A3B": 0.888,
    "deepseek-ai/DeepSeek-V2-Lite": 0.376
}
# Maximum context length configuration for each model.
MAX_MODEL_LEN = {
    "Qwen/Qwen2.5-0.5B-Instruct": 4096,
    "Qwen/Qwen2.5-VL-3B-Instruct": 8192,
    "Qwen/Qwen3-30B-A3B": 4096,
    "deepseek-ai/DeepSeek-V2-Lite": 4096
}
# lm_eval backend per model: "vllm" for text-only, "vllm-vlm" for vision-language.
MODEL_TYPE = {
    "Qwen/Qwen2.5-0.5B-Instruct": "vllm",
    "Qwen/Qwen2.5-VL-3B-Instruct": "vllm-vlm",
    "Qwen/Qwen3-30B-A3B": "vllm",
    "deepseek-ai/DeepSeek-V2-Lite": "vllm"
}
# Whether to wrap prompts in a chat-style template.
APPLY_CHAT_TEMPLATE = {
    "Qwen/Qwen2.5-0.5B-Instruct": False,
    "Qwen/Qwen2.5-VL-3B-Instruct": True,
    "Qwen/Qwen3-30B-A3B": False,
    "deepseek-ai/DeepSeek-V2-Lite": False
}
# Whether to present few-shot examples as multi-turn dialogues.
FEWSHOT_AS_MULTITURN = {
    "Qwen/Qwen2.5-0.5B-Instruct": False,
    "Qwen/Qwen2.5-VL-3B-Instruct": True,
    "Qwen/Qwen3-30B-A3B": False,
    "deepseek-ai/DeepSeek-V2-Lite": False
}
# Extra per-model arguments appended to the base model_args string in run_test.
MORE_ARGS = {
    "Qwen/Qwen2.5-0.5B-Instruct": None,
    "Qwen/Qwen2.5-VL-3B-Instruct": None,
    "Qwen/Qwen3-30B-A3B":
    "tensor_parallel_size=4,enable_expert_parallel=True,enforce_eager=True",
    "deepseek-ai/DeepSeek-V2-Lite":
    "tensor_parallel_size=4,trust_remote_code=True,enforce_eager=True"
}
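# Note: lm_eval parses each comma-separated "key=value" pair in model_args into
# keyword arguments for the vLLM engine, so the keys above must match vLLM
# engine argument names (an assumption about lm_eval's vLLM backend behaviour).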

# Use "spawn" so each worker starts in a fresh process; NPU device contexts
# generally cannot be shared safely across a fork.
multiprocessing.set_start_method("spawn", force=True)


def get_available_npu_count():
    return torch.npu.device_count()
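

def wait_for_server_ready(log_file, timeout_s=300):
    # Shared helper for the serve-based tests below (factored out of their
    # duplicated polling loops): poll the health endpoint once per second
    # until the server answers, otherwise fail the test with the server log.
    for _ in range(timeout_s):
        try:
            if requests.get(HEALTH_URL, timeout=1).status_code == 200:
                return
        except requests.exceptions.RequestException:
            pass
        time.sleep(1)
    log_file.flush()
    log_file.seek(0)
    log_content = log_file.read()
    pytest.fail(
        f"vLLM serve did not become healthy after {timeout_s}s: {HEALTH_URL}\n"
        f"==== vLLM Serve Log Start ====\n{log_content}\n"
        f"==== vLLM Serve Log End ====")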


def run_test(queue, model, max_model_len, model_type, more_args):
    try:
        if model_type == "vllm-vlm":
            model_args = (f"pretrained={model},max_model_len={max_model_len},"
                          "dtype=auto,max_images=2")
        else:
            model_args = (f"pretrained={model},max_model_len={max_model_len},"
                          "dtype=auto")
        if more_args is not None:
            model_args = f"{model_args},{more_args}"
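        # For example, for deepseek-ai/DeepSeek-V2-Lite this assembles (as one
        # comma-separated string, wrapped here for readability):
        #   pretrained=deepseek-ai/DeepSeek-V2-Lite,max_model_len=4096,
        #   dtype=auto,tensor_parallel_size=4,trust_remote_code=True,enforce_eager=True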
        results = lm_eval.simple_evaluate(
            model=model_type,
            model_args=model_args,
            tasks=TASK[model],
            batch_size="auto",
            apply_chat_template=APPLY_CHAT_TEMPLATE[model],
            fewshot_as_multiturn=FEWSHOT_AS_MULTITURN[model],
        )
        result = results["results"][TASK[model]][FILTER[model]]
        print("result:", result)
        queue.put(result)
    except Exception as e:
        # Report failures as strings; exceptions may not pickle across the
        # process boundary.
        error_msg = f"{type(e).__name__}: {str(e)}"
        queue.put(error_msg)
        sys.exit(1)
    finally:
        gc.collect()


@pytest.mark.parametrize("model", MODEL_NAME)
@pytest.mark.parametrize("VLLM_USE_V1", ["0", "1"])
def test_lm_eval_accuracy(monkeypatch: pytest.MonkeyPatch, model, VLLM_USE_V1):
    monkeypatch.setenv("VLLM_USE_V1", VLLM_USE_V1)
    npu_count = get_available_npu_count()
    if model == "Qwen/Qwen2.5-VL-3B-Instruct" and VLLM_USE_V1 == "1":
        pytest.skip(f"skip multimodal accuracy test for {model} when "
                    f"VLLM_USE_V1={VLLM_USE_V1}")
    if model in ("Qwen/Qwen2.5-VL-3B-Instruct",
                 "Qwen/Qwen2.5-0.5B-Instruct") and npu_count != 1:
        pytest.skip(f"skip accuracy test for {model} when "
                    f"VLLM_USE_V1={VLLM_USE_V1} and npu_count={npu_count}")
    if model in ("Qwen/Qwen3-30B-A3B",
                 "deepseek-ai/DeepSeek-V2-Lite") and (VLLM_USE_V1 != "1"
                                                      or npu_count != 4):
        pytest.skip(f"skip accuracy test for {model} when "
                    f"VLLM_USE_V1={VLLM_USE_V1} and npu_count={npu_count}")
    result_queue: Queue[float] = multiprocessing.Queue()
    p = multiprocessing.Process(target=run_test,
                                args=(result_queue, model,
                                      MAX_MODEL_LEN[model], MODEL_TYPE[model],
                                      MORE_ARGS[model]))
    p.start()
    p.join()
    result = result_queue.get()
    if isinstance(result, str):
        # run_test reports failures as "ExceptionType: message" strings.
        pytest.fail(f"run_test subprocess failed: {result}")
    print(result)
    assert (EXPECTED_VALUE[model] - RTOL < result <
            EXPECTED_VALUE[model] + RTOL), \
        f"Expected: {EXPECTED_VALUE[model]}±{RTOL} | Measured: {result}"


@pytest.mark.parametrize("max_tokens", [10])
@pytest.mark.parametrize("VLLM_USE_V1", ["1"])
@pytest.mark.parametrize("model", ["Qwen/Qwen2.5-0.5B-Instruct"])
def test_lm_eval_accuracy_dp(model, max_tokens, VLLM_USE_V1):
    os.environ["VLLM_USE_V1"] = VLLM_USE_V1
    npu_count = get_available_npu_count()
    if npu_count != 4:
        pytest.skip(f"skip DP accuracy test for {model} when "
                    f"VLLM_USE_V1={VLLM_USE_V1} and npu_count={npu_count}")

    # Open with "a+" so wait_for_server_ready can read the log back on failure.
    log_file = open("accuracy.log", "a+")
    cmd = [
        "vllm", "serve", model, "--max_model_len", "4096",
        "--tensor_parallel_size", "2", "--data_parallel_size", "2"
    ]
    # Send stderr to the same log; server errors would otherwise be lost.
    server_proc = subprocess.Popen(cmd,
                                   stdout=log_file,
                                   stderr=subprocess.STDOUT)

    try:
        wait_for_server_ready(log_file)

        # The OpenAI-compatible completions endpoint takes the sampling
        # parameters at the top level and requires a "model" field.
        # NB: the prompt is kept verbatim (including the typo); the expected
        # completion below was recorded for this exact string.
        prompt = "bejing is a"
        payload = {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": 0.0,
            "top_p": 1.0,
            "seed": 123
        }
        resp = requests.post(COMPLETIONS_URL, json=payload, timeout=30)
        resp.raise_for_status()
        data = resp.json()

        generated = data["choices"][0]["text"].strip()
        expected = "city in north china, it has many famous attractions"
        assert generated == expected, f"Expected `{expected}`, got `{generated}`"

    finally:
        server_proc.send_signal(signal.SIGINT)
        try:
            server_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            server_proc.kill()
            server_proc.wait()
        log_file.close()


@pytest.mark.parametrize("max_tokens", [10])
@pytest.mark.parametrize("VLLM_USE_V1", ["1"])
@pytest.mark.parametrize("model", ["Qwen/Qwen3-30B-A3B"])
def test_lm_eval_accuracy_etp(model, max_tokens, VLLM_USE_V1):
    os.environ["VLLM_USE_V1"] = VLLM_USE_V1
    npu_count = get_available_npu_count()
    if npu_count != 4:
        pytest.skip(f"skip ETP accuracy test for {model} when "
                    f"VLLM_USE_V1={VLLM_USE_V1} and npu_count={npu_count}")
    # Open with "a+" so wait_for_server_ready can read the log back on failure.
    log_file = open("accuracy.log", "a+")
    # --enforce_eager and --enable_expert_parallel are boolean flags and take
    # no value on the vllm CLI.
    cmd = [
        "vllm", "serve", model, "--tensor_parallel_size", "4",
        "--enforce_eager", "--enable_expert_parallel", "--additional_config",
        '{"expert_tensor_parallel_size": "4"}'
    ]
    # Send stderr to the same log; server errors would otherwise be lost.
    server_proc = subprocess.Popen(cmd,
                                   stdout=log_file,
                                   stderr=subprocess.STDOUT)

    try:
        wait_for_server_ready(log_file)

        # Same OpenAI-compatible payload shape as in the DP test above; the
        # prompt string is again kept verbatim to match the recorded output.
        prompt = "bejing is a"
        payload = {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": 0.0,
            "top_p": 1.0,
            "seed": 123
        }
        resp = requests.post(COMPLETIONS_URL, json=payload, timeout=30)
        resp.raise_for_status()
        data = resp.json()

        generated = data["choices"][0]["text"].strip()
        expected = "city in china. it is the capital city of"
        assert generated == expected, f"Expected `{expected}`, got `{generated}`"

    finally:
        server_proc.send_signal(signal.SIGINT)
        try:
            server_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            server_proc.kill()
            server_proc.wait()
        log_file.close()
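

# Convenience entry point for local runs (a sketch; CI is assumed to invoke
# this file through pytest directly).
if __name__ == "__main__":
    sys.exit(pytest.main([__file__, "-v"]))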