13 changes: 9 additions & 4 deletions vllm_ascend/ops/common_fused_moe.py
@@ -169,11 +169,15 @@ def __init__(self, *args, **kwargs):
self.expert_map_path = ascend_config.expert_map_path
self.global_redundant_expert_num = ascend_config.init_redundancy_expert
self.global_num_experts = num_experts + self.global_redundant_expert_num
# TODO: Flag for static expert placement. This is a temporary workaround
# to allow dynamic EPLB with float weights by skipping quantization checks.
self.static_eplb_enabled = False
if self.custom_routing_function is None and self.e_score_correction_bias is not None:
vllm_config = get_current_vllm_config()
self.e_score_correction_bias.data = self.e_score_correction_bias.data.to(
dtype=vllm_config.model_config.dtype)
        # Static EPLB: initialize expert placement from expert_map_path
init_eplb_enable = False
if self.expert_map_path and os.path.exists(
self.expert_map_path) and os.access(self.expert_map_path,
os.R_OK):
Expand All @@ -189,6 +193,7 @@ def __init__(self, *args, **kwargs):
self.moe_instance_id, self.ep_rank))
self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
self.moe_instance_id, self.ep_rank).npu()
init_eplb_enable = True
Severity: high

To be consistent with the suggested change of using an instance attribute, this should be updated to set self.static_eplb_enabled.

Suggested change
init_eplb_enable = True
self.static_eplb_enabled = True

except Exception as e:
logger.warning(
f"Init expert map of mtp/eagle when using sample.{e}")
@@ -219,10 +224,10 @@ def __init__(self, *args, **kwargs):
self.moe_load = torch.zeros(local_num_experts,
dtype=torch.int64).npu()

eplb_enable = self.dynamic_eplb or (self.expert_map_path is not None)
if eplb_enable and (not hasattr(self.quant_method, "quant_method") or
not isinstance(self.quant_method.quant_method,
AscendW8A8DynamicFusedMoEMethod)):
if init_eplb_enable and (
Severity: high

To be consistent with the suggested change, this condition should use the self.static_eplb_enabled instance attribute.

Suggested change
if init_eplb_enable and (
if self.static_eplb_enabled and (

not hasattr(self.quant_method, "quant_method")
or not isinstance(self.quant_method.quant_method,
AscendW8A8DynamicFusedMoEMethod)):
raise ValueError("Eplb supports only w8a8_dynamic quantization.")

self.moe_config.num_experts = self.global_num_experts
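
For reference, a minimal sketch (not the PR's actual diff) of how __init__ reads once both suggestions are applied, i.e. the local init_eplb_enable flag is replaced by the self.static_eplb_enabled instance attribute. The class name and the _load_static_expert_map / _uses_w8a8_dynamic helpers are hypothetical placeholders for the real expert-map loading and the isinstance check against AscendW8A8DynamicFusedMoEMethod.

import os
import logging

logger = logging.getLogger(__name__)


class FusedMoESketch:
    """Hypothetical stand-in for the FusedMoE class in common_fused_moe.py."""

    def __init__(self, expert_map_path=None, dynamic_eplb=False):
        self.expert_map_path = expert_map_path
        self.dynamic_eplb = dynamic_eplb

        # Static expert placement flag; stays False unless a readable
        # expert map file is loaded successfully below.
        self.static_eplb_enabled = False

        if self.expert_map_path and os.path.exists(
                self.expert_map_path) and os.access(self.expert_map_path,
                                                    os.R_OK):
            try:
                self._load_static_expert_map()  # placeholder for the real loader
                self.static_eplb_enabled = True
            except Exception as e:
                logger.warning("Failed to init static expert map: %s", e)

        # Only static EPLB is restricted to w8a8_dynamic quantization, so
        # dynamic EPLB with float weights no longer trips this check.
        if self.static_eplb_enabled and not self._uses_w8a8_dynamic():
            raise ValueError("Eplb supports only w8a8_dynamic quantization.")

    def _load_static_expert_map(self):
        # Hypothetical stand-in for building expert_map/log2phy from the file.
        pass

    def _uses_w8a8_dynamic(self):
        # Hypothetical stand-in for the isinstance check against
        # AscendW8A8DynamicFusedMoEMethod.
        return True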