
 import gc
 import multiprocessing
+import os
+import sys
 from multiprocessing import Queue

 import lm_eval
 import pytest
 import torch

-# pre-trained model path on Hugging Face.
-MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
-# Math reasoning benchmark (Grade School Math 8K).
-TASK = "gsm8k"
+# Pre-trained model paths on Hugging Face.
+MODEL_NAME = ["Qwen/Qwen2.5-0.5B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct"]
+# Evaluation task for each model:
+# - text model: GSM8K (grade-school math reasoning)
+# - vision-language model: MMMU Art & Design validation split (multimodal understanding)
+TASK = {
+    "Qwen/Qwen2.5-0.5B-Instruct": "gsm8k",
+    "Qwen/Qwen2.5-VL-3B-Instruct": "mmmu_val_art_and_design"
+}
-# Answer validation requiring format consistency.
+# Metric/filter key used to read each model's score from the lm_eval results.
+FILTER = {
+    "Qwen/Qwen2.5-0.5B-Instruct": "exact_match,strict-match",
+    "Qwen/Qwen2.5-VL-3B-Instruct": "acc,none"
+}
-# 3% relative tolerance for numerical accuracy.
+# Allowed absolute deviation (±0.03) from the expected accuracy.
 RTOL = 0.03
-# Baseline accuracy after VLLM optimization.
-EXPECTED_VALUE = 0.316
+# Expected baseline accuracy for each model.
+EXPECTED_VALUE = {
+    "Qwen/Qwen2.5-0.5B-Instruct": 0.316,
+    "Qwen/Qwen2.5-VL-3B-Instruct": 0.541
+}
+# Maximum context length configuration for each model.
+MAX_MODEL_LEN = {
+    "Qwen/Qwen2.5-0.5B-Instruct": 4096,
+    "Qwen/Qwen2.5-VL-3B-Instruct": 8192
+}
+# lm_eval backend for each model: "vllm" (text-only) or "vllm-vlm" (vision-language).
+MODEL_TYPE = {
+    "Qwen/Qwen2.5-0.5B-Instruct": "vllm",
+    "Qwen/Qwen2.5-VL-3B-Instruct": "vllm-vlm"
+}
+# Whether to wrap prompts in a chat-style template, keyed by backend.
+APPLY_CHAT_TEMPLATE = {
+    "vllm": False,
+    "vllm-vlm": True
+}
+# Whether to present few-shot examples as multi-turn dialogue, keyed by backend.
+FEWSHOT_AS_MULTITURN = {
+    "vllm": False,
+    "vllm-vlm": True
+}


-def run_test(queue, more_args=None):
-    model_args = f"pretrained={MODEL_NAME},max_model_len=4096"
-    if more_args is not None:
-        model_args = f"{model_args},{more_args}"
-    results = lm_eval.simple_evaluate(
-        model="vllm",
-        model_args=model_args,
-        tasks=TASK,
-        batch_size="auto",
-    )
-    result = results["results"][TASK][FILTER]
-    print("result:", result)
-    queue.put(result)
-    del results
-    torch.npu.empty_cache()
-    gc.collect()
+def run_test(queue, model, max_model_len, model_type):
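+    """Worker entry point: run lm_eval on one model and put the measured
+    accuracy (or the raised Exception) on the result queue."""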
+    try:
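+        # Build lm_eval model args; the VLM case also caps images per prompt.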
+        if model_type == "vllm-vlm":
+            model_args = (f"pretrained={model},max_model_len={max_model_len},"
+                          "dtype=auto,max_images=2")
+        else:
+            model_args = (f"pretrained={model},max_model_len={max_model_len},"
+                          "dtype=auto")
+        results = lm_eval.simple_evaluate(
+            model=model_type,
+            model_args=model_args,
+            tasks=TASK[model],
+            batch_size="auto",
+            apply_chat_template=APPLY_CHAT_TEMPLATE[model_type],
+            fewshot_as_multiturn=FEWSHOT_AS_MULTITURN[model_type],
+        )
+        result = results["results"][TASK[model]][FILTER[model]]
+        print("result:", result)
+        queue.put(result)
+    except Exception as e:
+        # Forward the exception so the parent process can fail the test.
+        queue.put(e)
+        sys.exit(1)
+    finally:
+        # Release NPU memory before the worker exits.
+        gc.collect()
+        torch.npu.empty_cache()


-def test_lm_eval_accuracy(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context():
+@pytest.mark.parametrize("model", MODEL_NAME)
+@pytest.mark.parametrize("VLLM_USE_V1", ["0", "1"])
+def test_lm_eval_accuracy(monkeypatch: pytest.MonkeyPatch, model, VLLM_USE_V1):
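+    """Each model must reproduce its expected benchmark accuracy within
+    RTOL under both engine versions."""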
+    if model == "Qwen/Qwen2.5-VL-3B-Instruct" and VLLM_USE_V1 == "1":
+        pytest.skip(
+            "Qwen2.5-VL-3B-Instruct is not supported when VLLM_USE_V1=1")
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", VLLM_USE_V1)
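+        # Run the evaluation in a subprocess so NPU memory is fully
+        # reclaimed between parametrized cases.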
         result_queue: Queue[float] = multiprocessing.Queue()
-        p = multiprocessing.Process(target=run_test, args=(result_queue, ))
+        p = multiprocessing.Process(target=run_test,
+                                    args=(result_queue, model,
+                                          MAX_MODEL_LEN[model],
+                                          MODEL_TYPE[model]))
         p.start()
         p.join()
         result = result_queue.get()
-        assert (EXPECTED_VALUE - RTOL < result < EXPECTED_VALUE + RTOL), \
-            f"Expected: {EXPECTED_VALUE}±{RTOL} | Measured: {result}"
+        # Fail fast if the worker forwarded an exception instead of a score.
+        if isinstance(result, Exception):
+            pytest.fail(f"run_test subprocess failed: {result}")
+        print(result)
+        assert (EXPECTED_VALUE[model] - RTOL < result < EXPECTED_VALUE[model] + RTOL), \
+            f"Expected: {EXPECTED_VALUE[model]}±{RTOL} | Measured: {result}"
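
A minimal sketch for running one case by hand outside pytest (assumes an
Ascend NPU host with vllm-ascend and lm_eval installed; the module name
tests.test_lm_eval used for the imports is hypothetical):

    import multiprocessing
    from tests.test_lm_eval import (MODEL_NAME, MAX_MODEL_LEN, MODEL_TYPE,
                                    run_test)

    if __name__ == "__main__":
        model = MODEL_NAME[0]  # "Qwen/Qwen2.5-0.5B-Instruct"
        q = multiprocessing.Queue()
        p = multiprocessing.Process(target=run_test,
                                    args=(q, model, MAX_MODEL_LEN[model],
                                          MODEL_TYPE[model]))
        p.start()
        p.join()
        # Prints either the measured accuracy or a forwarded Exception.
        print("result:", q.get())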