
Commit 251b608

fix lint
Signed-off-by: whx-sjtu <2952154980@qq.com>
1 parent: e9bb687

2 files changed: +24, −15 lines

vllm_ascend/ops/fused_moe.py

Lines changed: 7 additions & 3 deletions
@@ -1139,13 +1139,17 @@ def forward(self,
         fused_moe_state = get_forward_context().fused_moe_state
         # For w8a8 dynamic we can do npu_dynamic_quant and gate in parallel.
         quantized_x_for_share, dynamic_scale_for_share = None, None
-        from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicFusedMoEMethod
+        from vllm_ascend.quantization.w8a8_dynamic import \
+            AscendW8A8DynamicFusedMoEMethod
         if self.enable_multistream_moe:
             assert gate is not None
             router_logits, _ = gate(hidden_states)
-            if isinstance(self.quant_method.quant_method, AscendW8A8DynamicFusedMoEMethod) and fused_moe_state == FusedMoEState.MC2:
+            if isinstance(self.quant_method.quant_method,
+                          AscendW8A8DynamicFusedMoEMethod
+                          ) and fused_moe_state == FusedMoEState.MC2:
                 with npu_stream_switch("moe_secondary", 0):
-                    quantized_x_for_share, dynamic_scale_for_share = torch_npu.npu_dynamic_quant(hidden_states)
+                    quantized_x_for_share, dynamic_scale_for_share = torch_npu.npu_dynamic_quant(
+                        hidden_states)
 
         if shared_experts:
             if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2:
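
The comment in this hunk notes that, for w8a8 dynamic quantization, npu_dynamic_quant can run in parallel with the gate projection. Below is a minimal sketch of that overlap pattern, assuming a torch_npu environment; npu_stream_switch is the helper used in the diff, assumed here to come from vllm_ascend.utils, and gate_and_quantize is a hypothetical wrapper, not a function from the repository.

import torch
import torch_npu

from vllm_ascend.utils import npu_stream_switch  # module path assumed


def gate_and_quantize(gate: torch.nn.Module, hidden_states: torch.Tensor):
    # Gate projection is issued on the default stream.
    router_logits, _ = gate(hidden_states)
    # Per-token dynamic quantization is issued on the secondary stream so
    # the two kernels can overlap.
    with npu_stream_switch("moe_secondary", 0):
        quantized_x, dynamic_scale = torch_npu.npu_dynamic_quant(hidden_states)
    return router_logits, quantized_x, dynamic_scale

In the diff itself this overlap is only taken when the quant method is AscendW8A8DynamicFusedMoEMethod and the fused MoE state is MC2.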

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 17 additions & 12 deletions
@@ -16,7 +16,7 @@
 #
 
 import math
-from typing import Any, Callable, Dict, Optional, Tuple, Union, List
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 import torch.distributed as dist
@@ -31,6 +31,7 @@
                               dispose_tensor, get_ascend_soc_version,
                               npu_stream_switch, npu_wait_tensor)
 
+
 def apply_mlp_decode(hidden_states_wrapper: List[torch.Tensor],
                      w1: torch.Tensor,
                      w1_scale: torch.Tensor,
@@ -80,7 +81,7 @@ def apply_mlp_decode(hidden_states_wrapper: List[torch.Tensor],
 
     # act_fn: swiglu
     hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant(
-            x=hidden_states,
+        x=hidden_states,
         weight_scale=w1_scale,
         activation_scale=pertoken_scale,
         bias=None,
@@ -269,17 +270,18 @@ def fused_experts_with_mc2(
     if shared_experts is not None:
         with npu_stream_switch("moe_secondary", 0):
             npu_wait_tensor(quantized_x_for_share, expand_x)
-            shared_act_out = shared_experts.act_fn((quantized_x_for_share, dynamic_scale_for_share))
+            shared_act_out = shared_experts.act_fn(
+                (quantized_x_for_share, dynamic_scale_for_share))
             shared_act, swiglu_out_scale = shared_act_out[0], shared_act_out[1]
 
     # `expand_x` will be disposed in the `apply_mlp` function
     down_out_list = apply_mlp_decode([expand_x],
-                                      w1,
-                                      w1_scale,
-                                      w2,
-                                      w2_scale,
-                                      expert_token_nums,
-                                      dynamic_scale=dynamic_scale)
+                                     w1,
+                                     w1_scale,
+                                     w2,
+                                     w2_scale,
+                                     expert_token_nums,
+                                     dynamic_scale=dynamic_scale)
 
     # moeCombine
     kwargs_mc2 = {
@@ -317,7 +319,8 @@ def fused_experts_with_mc2(
     else:
         with npu_stream_switch("moe_secondary", 0):
             npu_wait_tensor(shared_act, down_out_list)
-            shared_output, _ = shared_experts.down_proj((shared_act, swiglu_out_scale))
+            shared_output, _ = shared_experts.down_proj(
+                (shared_act, swiglu_out_scale))
     return hidden_states, shared_output
 
 
@@ -774,8 +777,10 @@ def apply(
         if shared_experts is not None and fused_moe_state == FusedMoEState.MC2:
             with npu_stream_switch("moe_secondary", 0):
                 npu_wait_tensor(quantized_x_for_share, router_logits)
-                share_up_out, _ = shared_experts.gate_up_proj((quantized_x_for_share, dynamic_scale_for_share))
-                shared_gate_up, shared_dequant_scale = share_up_out[0], share_up_out[1]
+                share_up_out, _ = shared_experts.gate_up_proj(
+                    (quantized_x_for_share, dynamic_scale_for_share))
+                shared_gate_up, shared_dequant_scale = share_up_out[
+                    0], share_up_out[1]
 
         # this is a naive implementation for experts load balance so as
         # to avoid accumulating too much tokens on a single rank.
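
Several hunks in this file reflow the same secondary-stream pattern: shared-expert work issued on the "moe_secondary" stream waits on a tensor produced by the default stream before consuming it. A minimal sketch of that synchronization follows, assuming a torch_npu environment; npu_stream_switch and npu_wait_tensor are the helpers imported at the top of this file (assumed to live in vllm_ascend.utils), and shared_expert_act is a hypothetical wrapper, not repository code.

import torch

# Module path assumed; these helpers appear in the import hunk above.
from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor


def shared_expert_act(shared_experts, quantized_x: torch.Tensor,
                      dynamic_scale: torch.Tensor, expand_x: torch.Tensor):
    with npu_stream_switch("moe_secondary", 0):
        # Block the secondary stream until expand_x (produced by the MC2
        # dispatch on the default stream) is ready.
        npu_wait_tensor(quantized_x, expand_x)
        # act_fn consumes the pre-quantized tensor together with its dynamic
        # scale, as in fused_experts_with_mc2 above.
        shared_act_out = shared_experts.act_fn((quantized_x, dynamic_scale))
        shared_act, swiglu_out_scale = shared_act_out[0], shared_act_out[1]
    return shared_act, swiglu_out_scale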
