1919
2020import gc
2121import multiprocessing
22+ import os
23+ import sys
2224from multiprocessing import Queue
2325
2426import lm_eval
2527import pytest
2628import torch
2729
# Pre-trained model paths on Hugging Face; the accuracy test is parametrized
# over this list, and every table below is keyed by these names.
MODEL_NAME = ["Qwen/Qwen2.5-0.5B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct"]
# Benchmark configuration mapping models to evaluation tasks:
# - Text model: GSM8K (grade school math reasoning)
# - Vision-language model: MMMU Art & Design validation (multimodal understanding)
TASK = {
    "Qwen/Qwen2.5-0.5B-Instruct": "gsm8k",
    "Qwen/Qwen2.5-VL-3B-Instruct": "mmmu_val_art_and_design",
}
# Key of the metric read from each task's lm_eval results
# (answer validation requiring format consistency for the text model).
FILTER = {
    "Qwen/Qwen2.5-0.5B-Instruct": "exact_match,strict-match",
    "Qwen/Qwen2.5-VL-3B-Instruct": "acc,none",
}
# Tolerance on the measured score. NOTE: it is applied as an absolute band,
# i.e. EXPECTED_VALUE - RTOL < result < EXPECTED_VALUE + RTOL (±0.03),
# not as a fraction of the expected value.
RTOL = 0.03
# Baseline accuracy after VLLM optimization.
EXPECTED_VALUE = {
    "Qwen/Qwen2.5-0.5B-Instruct": 0.316,
    "Qwen/Qwen2.5-VL-3B-Instruct": 0.541,
}
# Maximum context length configuration for each model.
MAX_MODEL_LEN = {
    "Qwen/Qwen2.5-0.5B-Instruct": 4096,
    "Qwen/Qwen2.5-VL-3B-Instruct": 8192,
}
# lm_eval model backend per model, distinguishing text-only and
# vision-language models.
MODEL_TYPE = {
    "Qwen/Qwen2.5-0.5B-Instruct": "vllm",
    "Qwen/Qwen2.5-VL-3B-Instruct": "vllm-vlm",
}
# Whether to wrap prompts in a chat-style template.
APPLY_CHAT_TEMPLATE = {
    "Qwen/Qwen2.5-0.5B-Instruct": False,
    "Qwen/Qwen2.5-VL-3B-Instruct": True,
}
# Whether few-shot examples are presented as multi-turn dialogues.
FEWSHOT_AS_MULTITURN = {
    "Qwen/Qwen2.5-0.5B-Instruct": False,
    "Qwen/Qwen2.5-VL-3B-Instruct": True,
}
3871
3972
def run_test(queue, model, max_model_len, model_type):
    """Evaluate ``model`` with lm_eval and report the score through ``queue``.

    Intended to be used as the target of a ``multiprocessing.Process``.

    Args:
        queue: Queue shared with the parent. Receives the measured metric on
            success, or a ``RuntimeError`` describing the failure.
        model: Model name; also the key into the per-model tables
            (``TASK``, ``FILTER``, ``APPLY_CHAT_TEMPLATE``,
            ``FEWSHOT_AS_MULTITURN``).
        max_model_len: Value passed as ``max_model_len`` in ``model_args``.
        model_type: lm_eval model backend, ``"vllm"`` or ``"vllm-vlm"``.

    Raises:
        SystemExit: With status 1 after a failure has been queued.
    """
    try:
        model_args = (f"pretrained={model},max_model_len={max_model_len},"
                      "dtype=auto")
        if model_type == "vllm-vlm":
            # The vision-language backend additionally gets an image limit.
            model_args += ",max_images=2"
        results = lm_eval.simple_evaluate(
            model=model_type,
            model_args=model_args,
            tasks=TASK[model],
            batch_size="auto",
            apply_chat_template=APPLY_CHAT_TEMPLATE[model],
            fewshot_as_multiturn=FEWSHOT_AS_MULTITURN[model],
        )
        result = results["results"][TASK[model]][FILTER[model]]
        print("result:", result)
        queue.put(result)
    except Exception as e:
        import traceback

        # Send a plain RuntimeError carrying the formatted traceback rather
        # than the original exception object: it is always picklable (an
        # unpicklable exception would never reach the parent), and the
        # traceback would otherwise be lost when crossing the process
        # boundary.
        queue.put(
            RuntimeError(f"{type(e).__name__}: {e}\n{traceback.format_exc()}"))
        sys.exit(1)
    finally:
        # Runs on success, on failure, and while SystemExit propagates.
        gc.collect()
        torch.npu.empty_cache()
5698
57-
def _collect_result(process, result_queue, poll_interval=1.0):
    """Return the first item the worker queues, without hanging if it dies.

    The queue is read while the worker is still running (rather than after
    ``join()``), and the worker's liveness is checked between reads so that a
    worker that exits without queuing anything is reported instead of
    blocking forever.

    Raises:
        RuntimeError: If the worker exited and the queue stayed empty.
    """
    from queue import Empty

    while True:
        try:
            return result_queue.get(timeout=poll_interval)
        except Empty:
            if process.is_alive():
                continue
            # The worker is gone; allow one more read in case its last item
            # arrived between the timeout and the liveness check.
            try:
                return result_queue.get(timeout=poll_interval)
            except Empty:
                raise RuntimeError(
                    "lm_eval worker exited with code "
                    f"{process.exitcode} without reporting a result"
                ) from None


@pytest.mark.parametrize("model", MODEL_NAME)
def test_lm_eval_accuracy(monkeypatch: pytest.MonkeyPatch, model):
    """Check that each model's lm_eval score is within ``RTOL`` of baseline.

    The evaluation runs in a separate process (``run_test``); the measured
    score must satisfy
    ``EXPECTED_VALUE[model] - RTOL < result < EXPECTED_VALUE[model] + RTOL``.
    """
    if model == "Qwen/Qwen2.5-VL-3B-Instruct" and os.getenv(
            "VLLM_USE_V1") == "1":
        pytest.skip(
            "Qwen2.5-VL-3B-Instruct is not supported when VLLM_USE_V1=1")
    with monkeypatch.context():
        # Carries the measured score, or an exception if the worker failed.
        result_queue: Queue[float] = multiprocessing.Queue()
        p = multiprocessing.Process(target=run_test,
                                    args=(result_queue, model,
                                          MAX_MODEL_LEN[model],
                                          MODEL_TYPE[model]))
        p.start()
        try:
            # Read the result before joining: joining first can block while
            # the worker still has queued data to flush.
            result = _collect_result(p, result_queue)
        finally:
            p.join()
        print(result)
        if isinstance(result, Exception):
            # Surface the worker's failure instead of letting the numeric
            # comparison below raise an unrelated TypeError.
            pytest.fail(f"lm_eval worker failed: {result}")
        assert (EXPECTED_VALUE[model] - RTOL < result < EXPECTED_VALUE[model] + RTOL), \
            f"Expected: {EXPECTED_VALUE[model]}±{RTOL} | Measured: {result}"
0 commit comments