[CI] Add accuracy test for qwen2.5-vl-3b-instruct

zhangxinyuehfad · zhangxinyuehfad · commit 04f59719fa1e · 2025-05-09T22:04:41.000+08:00
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
@@ -111,6 +111,7 @@ jobs:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
         run: |
           if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
+            pytest -sv tests/singlecard/test_accuracy.py
             pytest -sv tests/singlecard/test_offline_inference.py
             pytest -sv tests/ops
             pytest -sv tests/compile
@@ -125,6 +126,7 @@ jobs:
           VLLM_USE_V1: 0
         run: |
           if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
+            pytest -sv tests/singlecard/test_accuracy.py
             pytest -sv tests/singlecard/test_offline_inference.py
             pytest -sv tests/ops
           else
diff --git a/tests/singlecard/test_accuracy.py b/tests/singlecard/test_accuracy.py
@@ -19,48 +19,94 @@
 
 import gc
 import multiprocessing
+import sys
 from multiprocessing import Queue
 
 import lm_eval
 import pytest
 import torch
 
 # pre-trained model path on Hugging Face.
-MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
-# Math reasoning benchmark (Grade School Math 8K).
-TASK = "gsm8k"
+MODEL_NAME = ["Qwen/Qwen2.5-0.5B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct"]
+# Benchmark configuration mapping models to evaluation tasks:
+# - Text model: GSM8K (grade school math reasoning)
+# - Vision-language model: MMMU Art & Design validation (multimodal understanding)
+TASK = {
+    "Qwen/Qwen2.5-0.5B-Instruct": "gsm8k",
+    "Qwen/Qwen2.5-VL-3B-Instruct": "mmmu_val_art_and_design"
+}
 # Answer validation requiring format consistency.
-FILTER = "exact_match,strict-match"
+FILTER = {
+    "Qwen/Qwen2.5-0.5B-Instruct": "exact_match,strict-match",
+    "Qwen/Qwen2.5-VL-3B-Instruct": "acc,none"
+}
 # 3% relative tolerance for numerical accuracy.
 RTOL = 0.03
 # Baseline accuracy after VLLM optimization.
-EXPECTED_VALUE = 0.316
+EXPECTED_VALUE = {
+    "Qwen/Qwen2.5-0.5B-Instruct": 0.316,
+    "Qwen/Qwen2.5-VL-3B-Instruct": 0.541
+}
+# Maximum context length configuration for each model.
+MAX_MODEL_LEN = {
+    "Qwen/Qwen2.5-0.5B-Instruct": 4096,
+    "Qwen/Qwen2.5-VL-3B-Instruct": 8192
+}
+# lm_eval model backend per model: "vllm" (text-only) or "vllm-vlm" (vision-language).
+MODEL_TYPE = {
+    "Qwen/Qwen2.5-0.5B-Instruct": "vllm",
+    "Qwen/Qwen2.5-VL-3B-Instruct": "vllm-vlm"
+}
+# Whether to wrap prompts in a chat-style template for each model.
+APPLY_CHAT_TEMPLATE = {
+    "Qwen/Qwen2.5-0.5B-Instruct": False,
+    "Qwen/Qwen2.5-VL-3B-Instruct": True
+}
+# Whether to format few-shot examples as multi-turn dialogues for each model.
+FEWSHOT_AS_MULTITURN = {
+    "Qwen/Qwen2.5-0.5B-Instruct": False,
+    "Qwen/Qwen2.5-VL-3B-Instruct": True
+}
 
 
-def run_test(queue, more_args=None):
-    model_args = f"pretrained={MODEL_NAME},max_model_len=4096"
-    if more_args is not None:
-        model_args = f"{model_args},{more_args}"
-    results = lm_eval.simple_evaluate(
-        model="vllm",
-        model_args=model_args,
-        tasks=TASK,
-        batch_size="auto",
-    )
-    result = results["results"][TASK][FILTER]
-    print("result:", result)
-    queue.put(result)
-    del results
-    torch.npu.empty_cache()
-    gc.collect()
+def run_test(queue, model, max_model_len, model_type):
+    try:
+        if model_type == "vllm-vlm":
+            model_args = (f"pretrained={model},max_model_len={max_model_len},"
+                          "dtype=auto,max_images=2")
+        else:
+            model_args = (f"pretrained={model},max_model_len={max_model_len},"
+                          "dtype=auto")
+        results = lm_eval.simple_evaluate(
+            model=model_type,
+            model_args=model_args,
+            tasks=TASK[model],
+            batch_size="auto",
+            apply_chat_template=APPLY_CHAT_TEMPLATE[model],
+            fewshot_as_multiturn=FEWSHOT_AS_MULTITURN[model],
+        )
+        result = results["results"][TASK[model]][FILTER[model]]
+        print("result:", result)
+        queue.put(result)
+    except Exception as e:
+        queue.put(e)
+        sys.exit(1)
+    finally:
+        gc.collect()
+        torch.npu.empty_cache()
 
 
 def test_lm_eval_accuracy(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context():
-        result_queue: Queue[float] = multiprocessing.Queue()
-        p = multiprocessing.Process(target=run_test, args=(result_queue, ))
-        p.start()
-        p.join()
-        result = result_queue.get()
-        assert (EXPECTED_VALUE - RTOL < result < EXPECTED_VALUE + RTOL), \
-            f"Expected: {EXPECTED_VALUE}±{RTOL} | Measured: {result}"
+    for model in MODEL_NAME:
+        with monkeypatch.context():
+            result_queue: Queue[float] = multiprocessing.Queue()
+            p = multiprocessing.Process(target=run_test,
+                                        args=(result_queue, model,
+                                              MAX_MODEL_LEN[model],
+                                              MODEL_TYPE[model]))
+            p.start()
+            p.join()
+            result = result_queue.get()
+            print(result)
+            assert (EXPECTED_VALUE[model] - RTOL < result < EXPECTED_VALUE[model] + RTOL), \
+                f"Expected: {EXPECTED_VALUE[model]}±{RTOL} | Measured: {result}"