1 parent afe1767 commit e417997
vllm_ascend/quantization/w8a8_dynamic.py
@@ -285,8 +285,9 @@ def fused_experts(hidden_states: torch.Tensor,
         valid_token_mask = torch.arange(
             0, sorted_token_indices.shape[0],
             device=device).unsqueeze(1) < num_valid_tokens
-        down_out_list.mul_(valid_token_mask)
-        final_hidden_states.index_add_(0, sorted_token_indices, down_out_list)
+        valid_output = torch.where(valid_token_mask, down_out_list,
+                                   torch.zeros_like(down_out_list)).to(dtype)
+        final_hidden_states.index_add_(0, sorted_token_indices, valid_output)
     else:
         # TODO: Reorder device memory 2 times here, replace the current
         # implementation here when suitable operators become available.
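The masking pattern introduced by this change can be reproduced in isolation as a minimal sketch, shown below. The shapes, dtype, device, and sample tensors are illustrative assumptions, not the actual fused_experts inputs; the point is that torch.where zeroes the padded rows and the .to(dtype) cast makes the source dtype match final_hidden_states before index_add_, instead of mutating down_out_list in place with mul_.

# Minimal standalone sketch of the masking pattern above (hypothetical
# shapes and values; not the real fused_experts code path).
import torch

dtype = torch.float16
device = "cpu"
hidden_size = 4
num_valid_tokens = 3  # only the first 3 sorted rows are real tokens

# Expert outputs for 5 sorted rows; the last 2 are padding and must not
# contribute to the final result.
down_out_list = torch.randn(5, hidden_size, dtype=torch.float32, device=device)
sorted_token_indices = torch.tensor([0, 2, 1, 0, 0], device=device)
final_hidden_states = torch.zeros(3, hidden_size, dtype=dtype, device=device)

# (5, 1) boolean column mask that is True only for the valid rows.
valid_token_mask = torch.arange(
    0, sorted_token_indices.shape[0],
    device=device).unsqueeze(1) < num_valid_tokens

# Zero the padded rows and cast to the accumulation dtype, so the
# scatter-add receives a tensor whose dtype matches final_hidden_states.
valid_output = torch.where(valid_token_mask, down_out_list,
                           torch.zeros_like(down_out_list)).to(dtype)
final_hidden_states.index_add_(0, sorted_token_indices, valid_output)
print(final_hidden_states)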