Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/vllm_ascend_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ jobs:
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_w4a8_deepseek.py::test_deepseek_W4A8
fi

- name: Run vllm-project/vllm-ascend test on V0 engine
Expand Down
2 changes: 2 additions & 0 deletions tests/multicard/test_offline_inference_distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import os
from unittest.mock import patch

import pytest
from modelscope import snapshot_download # type: ignore
from vllm import SamplingParams
from vllm.model_executor.models.registry import ModelRegistry
Expand Down Expand Up @@ -104,6 +105,7 @@ def test_models_distributed_DeepSeek_dbo():
vllm_model.generate(example_prompts, sampling_params)


@pytest.mark.skip(reason="Due to OOM,waiting for 1311pr to merge in")
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
def test_models_distributed_DeepSeekV3_dbo():
example_prompts = ["The president of the United States is"] * 41
Expand Down
67 changes: 67 additions & 0 deletions tests/multicard/test_w4a8_deepseek.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import pytest

from tests.conftest import VllmRunner


@pytest.mark.skip(reason="Due to OOM,waiting for 1311pr to merge in")
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
reason="w4a8_dynamic is not supported on v0")
def test_deepseek_W4A8(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")

prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
dtype = "bfloat16"
max_tokens = 5
with VllmRunner(
"vllm-ascend/DeepSeek-R1-w4a8-pruning",
dtype=dtype,
tensor_parallel_size=2,
enforce_eager=True,
quantization="ascend",
enable_expert_parallel=True,
additional_config={
"torchair_graph_config": {
"enabled": False,
},
"ascend_scheduler_config": {
"enabled": True,
}
},
) as vllm_model:
# use greedy sampler to make sure the generated results are fix
vllm_output = vllm_model.generate_greedy(prompts, max_tokens)

golden_results = [
'Hello, my name is逸研究发现IPPudsimentary',
'The president of the United States is逸 Ban Corporealistically',
'The capital of France is逸 Ban Corporealistically',
'The future of AI is逸 Ban Corporealistically',
]
assert len(golden_results) == len(vllm_output)
for i in range(len(vllm_output)):
assert golden_results[i] == vllm_output[i][1]
print(f"Generated text: {vllm_output[i][1]!r}")
10 changes: 9 additions & 1 deletion vllm_ascend/models/deepseek_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# # vllm-project/vllm/vllm/model_executor/models/deepseek_v2.py
# """Inference-only DeepseekV2/DeepseekV3 model."""

from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

import torch
import torch_npu
Expand Down Expand Up @@ -765,6 +765,14 @@ def forward(
inputs_embeds)
return hidden_states

def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]:
weights = filter(lambda x: ".module." not in x[0], weights)
# weights = ((name, data) for name, data in weights if ".module." not in name)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove the comment code

loaded_params = super().load_weights(weights)

return loaded_params


class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM):
pass
4 changes: 4 additions & 0 deletions vllm_ascend/quantization/quant_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,10 @@ def create_weights(
param = torch.nn.Parameter(param_value, requires_grad=False)
layer.register_parameter(param_key, param)
set_weight_attrs(param, extra_weight_attrs)
if "weight_scale_second" in param_key or "weight_offset_second" in param_key:
setattr(param, "quant_method",
FusedMoeWeightScaleSupported.GROUP.value)
param.quant_method = FusedMoeWeightScaleSupported.GROUP.value

def apply(
self,
Expand Down
9 changes: 9 additions & 0 deletions vllm_ascend/quantization/quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

from .func_wrapper import (wrapper_load_model, wrapper_rmsnorm_forward_oot,
wrapper_rmsnorm_init)
from .w4a8_dynamic import AscendW4A8DynamicFusedMoEMethod
from .w8a8 import AscendW8A8LinearMethod
from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
AscendW8A8DynamicLinearMethod)
Expand Down Expand Up @@ -281,7 +282,15 @@ def build_moe_method():
return AscendW8A8DynamicFusedMoEMethod()


class W4A8DYNAMICQuantizer(VLLMAscendQuantizer):

@staticmethod
def build_moe_method():
return AscendW4A8DynamicFusedMoEMethod()


SUPPORT_ASCEND_QUANTIZER_TYPE = {
"W8A8": W8A8Quantizer,
"W8A8_DYNAMIC": W8A8DYNAMICQuantizer,
"W4A8_DYNAMIC": W4A8DYNAMICQuantizer
}
Loading