Commit 2c38964

[Bugfix] Fix deepseek V0 precision issue and add acc CI for it

Signed-off-by: MengqingCao <cmq0113@163.com>
1 parent e2a0c19 commit 2c38964

File tree

7 files changed (+185, -38 lines)

tests/conftest.py
tests/long_term/spec_decode/e2e/test_mtp_correctness.py
tests/long_term/spec_decode/e2e/test_v1_spec_decode.py
tests/multicard/test_accuracy.py
tests/singlecard/test_offline_inference.py
vllm_ascend/ops/fused_moe.py
vllm_ascend/quantization/w8a8_dynamic.py

tests/conftest.py

Lines changed: 9 additions & 1 deletion
@@ -354,4 +354,12 @@ def prompt_template(request):
 
 @pytest.fixture(scope="session")
 def ilama_lora_files():
-    return snapshot_download(repo_id="jeeejeee/ilama-text2sql-spider")
+    return snapshot_download(repo_id="jeeejeee/ilama-text2sql-spider")
+
+
+@pytest.fixture
+def enable_modelscope_env():
+    import os
+    from unittest.mock import patch
+    with patch.dict(os.environ, {"VLLM_USE_MODELSCOPE": "TRUE"}):
+        yield
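For context, a minimal sketch (not part of the commit) of what `patch.dict` gives the new `enable_modelscope_env` fixture: the variable is set only while the fixture is active, and the previous environment is restored on exit, so the ModelScope flag cannot leak between tests.

```python
import os
from unittest.mock import patch

previous = os.environ.get("VLLM_USE_MODELSCOPE")
with patch.dict(os.environ, {"VLLM_USE_MODELSCOPE": "TRUE"}):
    # While the block is active (i.e. while a test that requests the
    # fixture is running), vLLM sees the flag and resolves model IDs
    # through ModelScope instead of Hugging Face.
    print(os.environ["VLLM_USE_MODELSCOPE"])  # -> TRUE
# On exit, patch.dict restores the prior value (or removes the key).
print(os.environ.get("VLLM_USE_MODELSCOPE") == previous)  # -> True
```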

tests/long_term/spec_decode/e2e/test_mtp_correctness.py

Lines changed: 86 additions & 34 deletions
@@ -93,11 +93,17 @@
 ])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
-def test_mtp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
-                                    per_test_common_llm_kwargs,
-                                    baseline_llm_kwargs, test_llm_kwargs,
-                                    batch_size: int, output_len: int,
-                                    seed: int):
+def test_mtp_e2e_greedy_correctness(
+    enable_modelscope_env,
+    vllm_runner,
+    common_llm_kwargs,
+    per_test_common_llm_kwargs,
+    baseline_llm_kwargs,
+    test_llm_kwargs,
+    batch_size: int,
+    output_len: int,
+    seed: int,
+):
 
     run_equality_correctness_test(vllm_runner, common_llm_kwargs,
                                   per_test_common_llm_kwargs,
@@ -138,12 +144,17 @@ def test_mtp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
 ])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
-def test_mtp_e2e_quant_greedy_correctness(vllm_runner, common_llm_kwargs,
-                                          per_test_common_llm_kwargs,
-                                          baseline_llm_kwargs, test_llm_kwargs,
-                                          batch_size: int, output_len: int,
-                                          seed: int):
-
+def test_mtp_e2e_quant_greedy_correctness(
+    enable_modelscope_env,
+    vllm_runner,
+    common_llm_kwargs,
+    per_test_common_llm_kwargs,
+    baseline_llm_kwargs,
+    test_llm_kwargs,
+    batch_size: int,
+    output_len: int,
+    seed: int,
+):
     run_equality_correctness_test(vllm_runner, common_llm_kwargs,
                                   per_test_common_llm_kwargs,
                                   baseline_llm_kwargs, test_llm_kwargs,
@@ -192,12 +203,18 @@ def test_mtp_e2e_quant_greedy_correctness(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("seed", [1])
 @pytest.mark.parametrize("logprobs", [1, 6])
-def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
-                                 per_test_common_llm_kwargs,
-                                 baseline_llm_kwargs, test_llm_kwargs,
-                                 batch_size: int, output_len: int, seed: int,
-                                 logprobs: int):
-
+def test_mtp_e2e_greedy_logprobs(
+    enable_modelscope_env,
+    vllm_runner,
+    common_llm_kwargs,
+    per_test_common_llm_kwargs,
+    baseline_llm_kwargs,
+    test_llm_kwargs,
+    batch_size: int,
+    output_len: int,
+    seed: int,
+    logprobs: int,
+):
     run_equality_correctness_test(
         vllm_runner,
         common_llm_kwargs,
@@ -246,9 +263,16 @@ def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
 def test_mtp_e2e_greedy_correctness_torchair_graph(
-        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int):
+    enable_modelscope_env,
+    vllm_runner,
+    common_llm_kwargs,
+    per_test_common_llm_kwargs,
+    baseline_llm_kwargs,
+    test_llm_kwargs,
+    batch_size: int,
+    output_len: int,
+    seed: int,
+):
     """Verify greedy equality with torchair graph enabled and different
     batch sizes using bfloat16 weights."""
     run_equality_correctness_test(vllm_runner, common_llm_kwargs,
@@ -290,9 +314,16 @@ def test_mtp_e2e_greedy_correctness_torchair_graph(
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
 def test_mtp_e2e_quant_greedy_correctness_torchair_graph(
-        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int):
+    enable_modelscope_env,
+    vllm_runner,
+    common_llm_kwargs,
+    per_test_common_llm_kwargs,
+    baseline_llm_kwargs,
+    test_llm_kwargs,
+    batch_size: int,
+    output_len: int,
+    seed: int,
+):
     """Verify greedy equality with torchair graph enabled and different
     batch sizes using quant weights."""
     run_equality_correctness_test(vllm_runner, common_llm_kwargs,
@@ -341,9 +372,16 @@ def test_mtp_e2e_quant_greedy_correctness_torchair_graph(
 @pytest.mark.parametrize("batch_size", [4])
 @pytest.mark.parametrize("seed", [1])
 def test_mtp_e2e_greedy_correctness_with_preemption(
-        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int):
+    enable_modelscope_env,
+    vllm_runner,
+    common_llm_kwargs,
+    per_test_common_llm_kwargs,
+    baseline_llm_kwargs,
+    test_llm_kwargs,
+    batch_size: int,
+    output_len: int,
+    seed: int,
+):
     """Verify greedy equality, even when some sequences are preempted mid-
     generation.
     """
@@ -391,10 +429,17 @@ def test_mtp_e2e_greedy_correctness_with_preemption(
     32,
 ])
 @pytest.mark.parametrize("seed", [1])
-def test_mtp_different_k(vllm_runner, common_llm_kwargs,
-                         per_test_common_llm_kwargs, baseline_llm_kwargs,
-                         test_llm_kwargs, batch_size: int, output_len: int,
-                         seed: int):
+def test_mtp_different_k(
+    enable_modelscope_env,
+    vllm_runner,
+    common_llm_kwargs,
+    per_test_common_llm_kwargs,
+    baseline_llm_kwargs,
+    test_llm_kwargs,
+    batch_size: int,
+    output_len: int,
+    seed: int,
+):
     """Verify that mtp speculative decoding produces exact equality
     to without spec decode with different values of num_speculative_tokens.
     """
@@ -437,10 +482,17 @@ def test_mtp_different_k(vllm_runner, common_llm_kwargs,
     32,
 ])
 @pytest.mark.parametrize("seed", [1])
-def test_mtp_disable_queue(vllm_runner, common_llm_kwargs,
-                           per_test_common_llm_kwargs, baseline_llm_kwargs,
-                           test_llm_kwargs, batch_size: int, output_len: int,
-                           seed: int):
+def test_mtp_disable_queue(
+    enable_modelscope_env,
+    vllm_runner,
+    common_llm_kwargs,
+    per_test_common_llm_kwargs,
+    baseline_llm_kwargs,
+    test_llm_kwargs,
+    batch_size: int,
+    output_len: int,
+    seed: int,
+):
     """Verify that mtp speculative decoding produces exact equality
     to without spec decode when speculation is disabled for large
     batch sizes.
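Every test in this file now picks up ModelScope the same way: it simply declares `enable_modelscope_env` as a parameter, and pytest injects fixtures by name, running the fixture's setup and teardown around each parametrized case; the argument value itself is never read. A standalone sketch of that mechanism (illustrative only, reusing the fixture name from conftest.py):

```python
import os
from unittest.mock import patch

import pytest


@pytest.fixture
def enable_modelscope_env():
    # Same shape as the conftest fixture: set the flag, undo it on teardown.
    with patch.dict(os.environ, {"VLLM_USE_MODELSCOPE": "TRUE"}):
        yield


@pytest.mark.parametrize("batch_size", [1, 32])
def test_example(enable_modelscope_env, batch_size):
    # Requesting the fixture by name is what flips the environment for the
    # duration of this test; the argument is otherwise unused.
    assert os.environ["VLLM_USE_MODELSCOPE"] == "TRUE"
```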

tests/long_term/spec_decode/e2e/test_v1_spec_decode.py

Lines changed: 5 additions & 1 deletion
@@ -60,6 +60,7 @@ def eagle3_model_name():
 
 
 def test_ngram_correctness(
+    enable_modelscope_env,
     monkeypatch: pytest.MonkeyPatch,
     test_prompts: list[list[dict[str, Any]]],
     sampling_config: SamplingParams,
@@ -71,8 +72,10 @@ def test_ngram_correctness(
     '''
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")
+        m.setenv("VLLM_USE_MODELSCOPE", "True")
 
-        ref_llm = LLM(model=model_name, max_model_len=1024)
+        ref_llm = LLM(model="LLM-Research/Meta-Llama-3.1-8B-Instruct",
+                      max_model_len=1024)
         ref_outputs = ref_llm.chat(test_prompts, sampling_config)
         del ref_llm
 
@@ -105,6 +108,7 @@ def test_ngram_correctness(
 
 @pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
 def test_eagle_correctness(
+    enable_modelscope_env,
     monkeypatch: pytest.MonkeyPatch,
     test_prompts: list[list[dict[str, Any]]],
     sampling_config: SamplingParams,
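Note that the reference model is now pulled by its ModelScope ID (`LLM-Research/Meta-Llama-3.1-8B-Instruct`) instead of the Hugging Face `model_name`. If you want to confirm the ID resolves before launching the expensive end-to-end run, a small pre-flight check is possible (assuming the `modelscope` package is installed; not part of this commit):

```python
from modelscope import snapshot_download

# Downloads (or reuses) the cached checkpoint and prints its local path.
local_dir = snapshot_download("LLM-Research/Meta-Llama-3.1-8B-Instruct")
print("weights cached at:", local_dir)
```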

tests/multicard/test_accuracy.py

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm-project/blob/main/tests/entrypoints/llm/test_accuracy.py
+#
+
+import gc
+import multiprocessing
+from multiprocessing import Queue
+
+import lm_eval
+import pytest
+import torch
+
+# pre-trained model path on Hugging Face.
+MODELS = ["deepseek-ai/DeepSeek-V2-Lite"]
+# Math reasoning benchmark (Grade School Math 8K).
+TASK = "gsm8k"
+# Answer validation requiring format consistency.
+FILTER = "exact_match,strict-match"
+# 3% relative tolerance for numerical accuracy.
+RTOL = 0.03
+# Baseline accuracy after VLLM optimization.
+EXPECTED_VALUE = 0.316
+
+
+def run_test(model_name, queue, more_args=None):
+    model_args = f"pretrained={model_name},max_model_len=4096,trust_remote_code=True,tensor_parallel_size=4"
+    if more_args is not None:
+        model_args = f"{model_args},{more_args}"
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=model_args,
+        tasks=TASK,
+        batch_size="auto",
+    )
+    result = results["results"][TASK][FILTER]
+    print(100 * "*", "\nThe accuracy test result:", result)
+    queue.put(result)
+    del results
+    torch.npu.empty_cache()
+    gc.collect()
+
+
+@pytest.mark.parametrize("model", MODELS)
+def test_lm_eval_accuracy(model, monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context():
+        result_queue: Queue[float] = multiprocessing.Queue()
+        p = multiprocessing.Process(target=run_test,
+                                    args=(
+                                        model,
+                                        result_queue,
+                                    ))
+        p.start()
+        p.join()
+        result = result_queue.get()
+        assert (EXPECTED_VALUE - RTOL < result < EXPECTED_VALUE + RTOL), \
+            f"Expected: {EXPECTED_VALUE}±{RTOL} | Measured: {result}"
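The new accuracy test asserts an absolute band of EXPECTED_VALUE ± RTOL (0.286 to 0.346 exact-match on GSM8K) and runs the evaluation in a child process, presumably so that the engine and NPU state are released when the child exits rather than lingering into the next parametrized model. A reduced sketch of that pattern (illustrative only; the score is a placeholder):

```python
import multiprocessing


def evaluate(queue):
    # ... build the vLLM engine, run lm_eval, and report the score ...
    queue.put(0.316)  # placeholder value for illustration


if __name__ == "__main__":
    q: multiprocessing.Queue = multiprocessing.Queue()
    p = multiprocessing.Process(target=evaluate, args=(q,))
    p.start()
    p.join()  # engine and device memory die with the child process
    print("gsm8k exact_match:", q.get())
```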

tests/singlecard/test_offline_inference.py

Lines changed: 12 additions & 2 deletions
@@ -41,7 +41,12 @@
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half", "float16"])
 @pytest.mark.parametrize("max_tokens", [5])
-def test_models(model: str, dtype: str, max_tokens: int) -> None:
+def test_models(
+    enable_modelscope_env,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
     # 5042 tokens for gemma2
     # gemma2 has alternating sliding window size of 4096
     # we need a prompt with more than 4096 tokens to test the sliding window
@@ -60,7 +65,12 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
 @pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
 @pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "1",
                     reason="qwen2.5_vl is not supported on v1")
-def test_multimodal(model, prompt_template, vllm_runner):
+def test_multimodal(
+    enable_modelscope_env,
+    model,
+    prompt_template,
+    vllm_runner,
+):
     image = ImageAsset("cherry_blossom") \
         .pil_image.convert("RGB")
     img_questions = [

vllm_ascend/ops/fused_moe.py

Lines changed: 1 addition & 0 deletions
@@ -337,6 +337,7 @@ def fused_experts(
     num_experts = w1.shape[0]
     dtype = hidden_states.dtype
     device = hidden_states.device
+    topk_weights = topk_weights.to(dtype)
     # assert dtype in [torch.float32, torch.float16, torch.bfloat16
     # ], "Only float32, float16, and bfloat16 are supported"
 
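The functional fix is this single cast: MoE routing weights produced by a float32 softmax no longer carry a different dtype than the (b)float16 hidden states into the expert-combine step. One plausible illustration of the mismatch (plain PyTorch, not the vllm-ascend kernel): multiplying fp16 activations by fp32 weights silently promotes the result to float32, whereas casting first keeps everything in the activation dtype.

```python
import torch

hidden = torch.randn(4, 8, dtype=torch.float16)       # per-expert activations
topk_weights = torch.softmax(torch.randn(4), dim=0)   # router weights, float32

mixed = hidden * topk_weights.unsqueeze(-1)                    # promoted to fp32
casted = hidden * topk_weights.to(hidden.dtype).unsqueeze(-1)  # stays fp16

print(mixed.dtype, casted.dtype)  # torch.float32 torch.float16
```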
vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 1 addition & 0 deletions
@@ -342,6 +342,7 @@ def fused_experts(hidden_states: torch.Tensor,
     num_experts = w1.shape[0]
     dtype = hidden_states.dtype
     device = hidden_states.device
+    topk_weights = topk_weights.to(dtype)
 
     if expert_map is not None:
         # Generate token indices and flatten
