#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/vllm/blob/main/tests/entrypoints/llm/test_accuracy.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import gc
import multiprocessing
import os
from multiprocessing import Queue

import lm_eval
import pytest
import torch

# Pre-trained model name on Hugging Face.
MODEL_NAME = "Qwen/Qwen2-0.5B-Instruct"
# Math reasoning benchmark (Grade School Math 8K).
TASK = "gsm8k"
# Answer-validation filter requiring strict format consistency.
FILTER = "exact_match,strict-match"
# Accuracy tolerance: the measured score must fall within EXPECTED_VALUE ± RTOL.
RTOL = 0.03
# Expected baseline accuracy for this model under vLLM.
EXPECTED_VALUE = 0.37
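# With EXPECTED_VALUE = 0.37 and RTOL = 0.03, a run passes only when the
# measured exact_match score lies strictly inside (0.34, 0.40).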


def run_test(queue, more_args=None):
    model_args = f"pretrained={MODEL_NAME},max_model_len=4096"
    if more_args is not None:
        model_args = f"{model_args},{more_args}"
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=TASK,
        batch_size="auto",
    )
    result = results["results"][TASK][FILTER]
    print("result:", result)
    queue.put(result)
    # Release engine memory before the child process exits.
    del results
    torch.npu.empty_cache()
    gc.collect()
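
# Example (illustrative, not used by this test): extra vLLM engine arguments
# can be threaded in via more_args, e.g.
#   run_test(queue, more_args="tensor_parallel_size=2")
# which makes model_args
#   "pretrained=Qwen/Qwen2-0.5B-Instruct,max_model_len=4096,tensor_parallel_size=2"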
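

# The test below runs lm_eval in a child process so that the NPU memory held
# by the vLLM engine is fully reclaimed when that process exits; the
# in-process empty_cache/gc calls alone may not release all of it.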
@pytest.mark.skipif(
    os.getenv('VLLM_USE_V1') == '1',
    reason="V1 engine is only fully supported from the 0.8.X release; skipping this test.")
def test_lm_eval_accuracy(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context():
        result_queue: Queue[float] = multiprocessing.Queue()
        p = multiprocessing.Process(target=run_test, args=(result_queue,))
        p.start()
        p.join()
        result = result_queue.get()
        assert (EXPECTED_VALUE - RTOL < result < EXPECTED_VALUE + RTOL), \
            f"Expected: {EXPECTED_VALUE}±{RTOL} | Measured: {result}"