
Commit 53ce4a0

[0.9.1]support deepseek w4a8 quantization (#1320)
### What this PR does / why we need it?
Supports DeepSeek-R1 w4a8 quantization. Since R1 w4a8 uses mixed quantization, only the MoE layers use w4a8_dynamic quantization, so we added the w4a8_dynamic.py file, which includes the AscendW4A8DynamicFusedMoEMethod class.

### Does this PR introduce _any_ user-facing change?
No, using `--quantization=ascend` is enough.

### How was this patch tested?

#### 1. How to get weights using Modelslim

##### Installation steps
Use the master branch at commit id 298e175d69b3b855111a1e09bbe2fcd12fdb4e24:

git clone https://gitee.com/ascend/msit.git
cd msit/msmodelslim
bash install.sh

##### Required transformers environment
pip install transformers==4.48.2

##### Generate w4a8 weights
cd /example/DeepSeek
Command reference: msmodelslim/example/DeepSeek/README.md
Execute the [pre-check](https://gitee.com/ascend/msit/blob/master/msmodelslim/example/DeepSeek/README.md#运行前必检) and [DeepSeek-R1 w4a8 mix quantization](https://gitee.com/ascend/msit/blob/master/msmodelslim/example/DeepSeek/README.md#deepseek-r1-w4a8-混合量化前三层-mlpw8a8-dynamic-量化mla共享专家w8a8量化路由专家w4a8-dynamic量化) chapters.
Reference command:

python3 quant_deepseek_w4a8.py --model_path {original weight path} --save_path {generated weight path} --mindie_format

##### Adapt to vllm-ascend
Since `--mindie_format` produces MindIE-format output, some adaptations are needed before vllm-ascend can use it:
- Rename `quant_model_description_w8a8_dynamic.json` to `quant_model_description.json`, and change `"group_size": 0` to `"group_size": 256`.
- In `config.json`, change `"model_type": "deepseekv2"` to `"model_type": "deepseek_v3"` and remove `quantization_config`.

#### 2. How to run w4a8
TP + EP:
python -m vllm.entrypoints.openai.api_server --model=$1 --trust-remote-code -tp $2 --enable_expert_parallel --quantization ascend --port $3 --max-model-len $4 --max-num-seqs $5 --enforce-eager
e.g.:
python -m vllm.entrypoints.openai.api_server --model=/weightpath/w4a8_4_layer --trust-remote-code -tp 4 --enable_expert_parallel --quantization ascend --port 8002 --max-model-len 2048 --max-num-seqs 128 --enforce-eager

DP + TP + EP:
python -m vllm.entrypoints.openai.api_server --model=$1 --trust-remote-code -tp $2 -dp $3 --enable_expert_parallel --quantization ascend --port $4 --max-model-len $5 --max-num-seqs $6 --enforce-eager
e.g.:
python -m vllm.entrypoints.openai.api_server --model=/weightpath/w4a8_4_layer --trust-remote-code -tp 2 -dp 2 --enable_expert_parallel --quantization ascend --port 8002 --max-model-len 2048 --max-num-seqs 128 --enforce-eager

#### 3. Usage constraints
export VLLM_USE_V1=1  # v1

---------

Signed-off-by: pichangping <1337510399@qq.com>
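The adaptation steps above can be scripted instead of edited by hand. A minimal sketch, assuming the quantized weights landed in a hypothetical `/path/to/w4a8_weights` directory and that `group_size` is a top-level key of the description file, as the instructions above imply:

```python
import json
import os

weight_dir = "/path/to/w4a8_weights"  # hypothetical output dir of quant_deepseek_w4a8.py

# Rename the quant description file and set the MoE group size to 256.
old_desc = os.path.join(weight_dir, "quant_model_description_w8a8_dynamic.json")
new_desc = os.path.join(weight_dir, "quant_model_description.json")
os.rename(old_desc, new_desc)
with open(new_desc) as f:
    desc = json.load(f)
desc["group_size"] = 256  # generated file carries "group_size": 0
with open(new_desc, "w") as f:
    json.dump(desc, f, indent=2, ensure_ascii=False)

# Patch config.json: deepseek_v3 model type, and drop quantization_config.
cfg_path = os.path.join(weight_dir, "config.json")
with open(cfg_path) as f:
    cfg = json.load(f)
cfg["model_type"] = "deepseek_v3"  # was "deepseekv2" in the MindIE-format output
cfg.pop("quantization_config", None)
with open(cfg_path, "w") as f:
    json.dump(cfg, f, indent=2, ensure_ascii=False)
```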
1 parent f1353d5 commit 53ce4a0

7 files changed, +571 -1 lines changed


.github/workflows/vllm_ascend_test.yaml

Lines changed: 1 addition & 0 deletions
@@ -204,6 +204,7 @@ jobs:
           VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo
           VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
           VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py
+          VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_w4a8_deepseek.py::test_deepseek_W4A8
           fi

       - name: Run vllm-project/vllm-ascend test on V0 engine

tests/multicard/test_offline_inference_distributed.py

Lines changed: 2 additions & 0 deletions
@@ -23,6 +23,7 @@
 import os
 from unittest.mock import patch

+import pytest
 from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams
 from vllm.model_executor.models.registry import ModelRegistry
@@ -104,6 +105,7 @@ def test_models_distributed_DeepSeek_dbo():
         vllm_model.generate(example_prompts, sampling_params)


+@pytest.mark.skip(reason="Due to OOM,waiting for 1311pr to merge in")
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
 def test_models_distributed_DeepSeekV3_dbo():
     example_prompts = ["The president of the United States is"] * 41

tests/multicard/test_w4a8_deepseek.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+
+from tests.conftest import VllmRunner
+
+
+@pytest.mark.skip(reason="Due to OOM,waiting for 1311pr to merge in")
+@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
+                    reason="w4a8_dynamic is not supported on v0")
+def test_deepseek_W4A8(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        dtype = "bfloat16"
+        max_tokens = 5
+        with VllmRunner(
+                "vllm-ascend/DeepSeek-R1-w4a8-pruning",
+                dtype=dtype,
+                tensor_parallel_size=2,
+                enforce_eager=True,
+                quantization="ascend",
+                enable_expert_parallel=True,
+                additional_config={
+                    "torchair_graph_config": {
+                        "enabled": False,
+                    },
+                    "ascend_scheduler_config": {
+                        "enabled": True,
+                    }
+                },
+        ) as vllm_model:
+            # use greedy sampler to make sure the generated results are fix
+            vllm_output = vllm_model.generate_greedy(prompts, max_tokens)
+
+        golden_results = [
+            'Hello, my name is逸研究发现IPPudsimentary',
+            'The president of the United States is逸 Ban Corporealistically',
+            'The capital of France is逸 Ban Corporealistically',
+            'The future of AI is逸 Ban Corporealistically',
+        ]
+        assert len(golden_results) == len(vllm_output)
+        for i in range(len(vllm_output)):
+            assert golden_results[i] == vllm_output[i][1]
+            print(f"Generated text: {vllm_output[i][1]!r}")

vllm_ascend/models/deepseek_v2.py

Lines changed: 9 additions & 1 deletion
@@ -25,7 +25,7 @@
 # # vllm-project/vllm/vllm/model_executor/models/deepseek_v2.py
 # """Inference-only DeepseekV2/DeepseekV3 model."""

-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

 import torch
 import torch_npu
@@ -765,6 +765,14 @@ def forward(
                                       inputs_embeds)
         return hidden_states

+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        weights = filter(lambda x: ".module." not in x[0], weights)
+        # weights = ((name, data) for name, data in weights if ".module." not in name)
+        loaded_params = super().load_weights(weights)
+
+        return loaded_params
+

 class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM):
     pass
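A toy illustration of what the new load_weights override filters out before delegating to the stock loader. The tensor names below are hypothetical and only show which entries get dropped:

```python
# Hypothetical checkpoint entries; real MindIE-format weights carry their own names.
weights = [
    ("model.layers.3.mlp.experts.w13_weight", "tensor A"),
    ("model.layers.3.mlp.experts.w13_weight.module.weight_scale", "tensor B"),
]

# Same predicate as the override: skip any entry whose name contains ".module.".
kept = [name for name, _ in weights if ".module." not in name]
print(kept)  # ['model.layers.3.mlp.experts.w13_weight']
```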

vllm_ascend/quantization/quant_config.py

Lines changed: 4 additions & 0 deletions
@@ -305,6 +305,10 @@ def create_weights(
             param = torch.nn.Parameter(param_value, requires_grad=False)
             layer.register_parameter(param_key, param)
             set_weight_attrs(param, extra_weight_attrs)
+            if "weight_scale_second" in param_key or "weight_offset_second" in param_key:
+                setattr(param, "quant_method",
+                        FusedMoeWeightScaleSupported.GROUP.value)
+                param.quant_method = FusedMoeWeightScaleSupported.GROUP.value

     def apply(
         self,
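Note: `FusedMoeWeightScaleSupported` comes from vLLM's fused MoE layer; tagging the second-level scale/offset parameters with its GROUP value is presumably what makes the MoE weight loader treat them as group-wise quantization metadata, consistent with the `"group_size": 256` adaptation described in the PR message above.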

vllm_ascend/quantization/quantizer.py

Lines changed: 9 additions & 0 deletions
@@ -24,6 +24,7 @@

 from .func_wrapper import (wrapper_load_model, wrapper_rmsnorm_forward_oot,
                            wrapper_rmsnorm_init)
+from .w4a8_dynamic import AscendW4A8DynamicFusedMoEMethod
 from .w8a8 import AscendW8A8LinearMethod
 from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
                            AscendW8A8DynamicLinearMethod)
@@ -281,7 +282,15 @@ def build_moe_method():
         return AscendW8A8DynamicFusedMoEMethod()


+class W4A8DYNAMICQuantizer(VLLMAscendQuantizer):
+
+    @staticmethod
+    def build_moe_method():
+        return AscendW4A8DynamicFusedMoEMethod()
+
+
 SUPPORT_ASCEND_QUANTIZER_TYPE = {
     "W8A8": W8A8Quantizer,
     "W8A8_DYNAMIC": W8A8DYNAMICQuantizer,
+    "W4A8_DYNAMIC": W4A8DYNAMICQuantizer
 }
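With the registry entry above, a checkpoint whose quantization description reports `W4A8_DYNAMIC` resolves to the new MoE method. A minimal lookup sketch (assumes vllm-ascend and its NPU dependencies are importable; in a real run the selection happens inside the quantization machinery, not in user code):

```python
from vllm_ascend.quantization.quantizer import SUPPORT_ASCEND_QUANTIZER_TYPE

# "W4A8_DYNAMIC" now maps to W4A8DYNAMICQuantizer, whose build_moe_method()
# returns AscendW4A8DynamicFusedMoEMethod for the MoE layers.
quantizer_cls = SUPPORT_ASCEND_QUANTIZER_TYPE["W4A8_DYNAMIC"]
print(quantizer_cls.__name__)  # W4A8DYNAMICQuantizer
```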
