vllm-project · wangxiyuan · Jun 20, 2025 · Jun 20, 2025 · Jun 20, 2025 · Jun 20, 2025
diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
@@ -204,6 +204,7 @@ jobs:
             VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo
             VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
             VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_w4a8_deepseek.py::test_deepseek_W4A8
           fi
 
       - name: Run vllm-project/vllm-ascend test on V0 engine

diff --git a/tests/multicard/test_offline_inference_distributed.py b/tests/multicard/test_offline_inference_distributed.py
@@ -23,6 +23,7 @@
 import os
 from unittest.mock import patch
 
+import pytest
 from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams
 from vllm.model_executor.models.registry import ModelRegistry
@@ -104,6 +105,7 @@ def test_models_distributed_DeepSeek_dbo():
         vllm_model.generate(example_prompts, sampling_params)
 
 
+@pytest.mark.skip(reason="Due to OOM,waiting for 1311pr to merge in")
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
 def test_models_distributed_DeepSeekV3_dbo():
     example_prompts = ["The president of the United States is"] * 41

diff --git a/tests/multicard/test_w4a8_deepseek.py b/tests/multicard/test_w4a8_deepseek.py
@@ -0,0 +1,67 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+
+from tests.conftest import VllmRunner
+
+
+@pytest.mark.skip(reason="Due to OOM,waiting for 1311pr to merge in")
+@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
+                    reason="w4a8_dynamic is not supported on v0")
+def test_deepseek_W4A8(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        dtype = "bfloat16"
+        max_tokens = 5
+        with VllmRunner(
+                "vllm-ascend/DeepSeek-R1-w4a8-pruning",
+                dtype=dtype,
+                tensor_parallel_size=2,
+                enforce_eager=True,
+                quantization="ascend",
+                enable_expert_parallel=True,
+                additional_config={
+                    "torchair_graph_config": {
+                        "enabled": False,
+                    },
+                    "ascend_scheduler_config": {
+                        "enabled": True,
+                    }
+                },
+        ) as vllm_model:
+            # use greedy sampler to make sure the generated results are fix
+            vllm_output = vllm_model.generate_greedy(prompts, max_tokens)
+
+        golden_results = [
+            'Hello, my name is逸研究发现IPPudsimentary',
+            'The president of the United States is逸 Ban Corporealistically',
+            'The capital of France is逸 Ban Corporealistically',
+            'The future of AI is逸 Ban Corporealistically',
+        ]
+        assert len(golden_results) == len(vllm_output)
+        for i in range(len(vllm_output)):
+            assert golden_results[i] == vllm_output[i][1]
+            print(f"Generated text: {vllm_output[i][1]!r}")
diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py
@@ -25,7 +25,7 @@
 # # vllm-project/vllm/vllm/model_executor/models/deepseek_v2.py
 # """Inference-only DeepseekV2/DeepseekV3 model."""
 
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import torch
 import torch_npu
@@ -765,6 +765,14 @@ def forward(
                                    inputs_embeds)
         return hidden_states
 
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        weights = filter(lambda x: ".module." not in x[0], weights)
+        # weights = ((name, data) for name, data in weights if ".module." not in name)
+        loaded_params = super().load_weights(weights)
+
+        return loaded_params
+
 
 class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM):
     pass
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
@@ -305,6 +305,10 @@ def create_weights(
             param = torch.nn.Parameter(param_value, requires_grad=False)
             layer.register_parameter(param_key, param)
             set_weight_attrs(param, extra_weight_attrs)
+            if "weight_scale_second" in param_key or "weight_offset_second" in param_key:
+                setattr(param, "quant_method",
+                        FusedMoeWeightScaleSupported.GROUP.value)
+                param.quant_method = FusedMoeWeightScaleSupported.GROUP.value
 
     def apply(
         self,

diff --git a/vllm_ascend/quantization/quantizer.py b/vllm_ascend/quantization/quantizer.py
@@ -24,6 +24,7 @@
 
 from .func_wrapper import (wrapper_load_model, wrapper_rmsnorm_forward_oot,
                            wrapper_rmsnorm_init)
+from .w4a8_dynamic import AscendW4A8DynamicFusedMoEMethod
 from .w8a8 import AscendW8A8LinearMethod
 from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
                            AscendW8A8DynamicLinearMethod)
@@ -281,7 +282,15 @@ def build_moe_method():
         return AscendW8A8DynamicFusedMoEMethod()
 
 
+class W4A8DYNAMICQuantizer(VLLMAscendQuantizer):
+
+    @staticmethod
+    def build_moe_method():
+        return AscendW4A8DynamicFusedMoEMethod()
+
+
 SUPPORT_ASCEND_QUANTIZER_TYPE = {
     "W8A8": W8A8Quantizer,
     "W8A8_DYNAMIC": W8A8DYNAMICQuantizer,
+    "W4A8_DYNAMIC": W4A8DYNAMICQuantizer
 }