 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
                                AscendSocVersion, ProfileExecuteDuration,
                                enable_sp, get_ascend_soc_version, is_310p,
-                               is_enable_nz, lmhead_tp_enable,
+                               is_enable_nz, is_moe_model, lmhead_tp_enable,
                                prefill_context_parallel_enable,
                                vllm_version_is)
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
@@ -515,11 +515,14 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.in_profile_run = False
 
         self._init_mc2_tokens_capacity()
-        self.reserved_mc2_mask = torch.zeros(
-            self.mc2_tokens_capacity,
-            dtype=torch.bool,
-            device=self.device,
-        )
+        if is_moe_model(vllm_config):
+            self.reserved_mc2_mask = torch.zeros(
+                self.mc2_tokens_capacity,
+                dtype=torch.bool,
+                device=self.device,
+            )
+        else:
+            self.reserved_mc2_mask = None
         self.dynamic_eplb = self.ascend_config.dynamic_eplb or self.ascend_config.expert_map_record_path
         if self.dynamic_eplb:
             EPLBParamUtils.check_dynamic_eplb(self.ascend_config.dynamic_eplb)
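The MC2 token mask is now allocated only for MoE models; dense models keep `reserved_mc2_mask = None` and skip the device-side allocation entirely. As a rough illustration of the gating (not the actual `vllm_ascend.utils` implementation), a helper like `is_moe_model` can be thought of as a config probe along these lines:

```python
# Illustrative sketch only: the real is_moe_model in vllm_ascend.utils may use
# different heuristics. The point is to detect expert routing from the HF config
# so that dense models never pay for the MC2 mask.
def _looks_like_moe(vllm_config) -> bool:
    hf_config = vllm_config.model_config.hf_config
    # Attribute names below are common MoE markers, assumed for illustration.
    return any(
        getattr(hf_config, attr, None)
        for attr in ("num_experts", "n_routed_experts", "num_local_experts"))
```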
@@ -1497,9 +1500,7 @@ def _prepare_inputs(
         self.query_lens = torch.from_numpy(num_scheduled_tokens)
 
         # Copy the tensors to the NPU.
-        self.input_ids[:total_num_scheduled_tokens].copy_(
-            self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)
-
+        self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens)
         self.positions_cpu[total_num_scheduled_tokens:num_input_tokens].zero_()
         self.positions[:num_input_tokens].copy_(
             self.positions_cpu[:num_input_tokens], non_blocking=True)
@@ -1521,16 +1522,6 @@ def _prepare_inputs(
         self._update_graph_pad_size(with_prefill, maybe_padded_num_tokens)
         attn_metadata: dict[str, Any] = {}
 
-        # Prepare input_ids
-        token_indices = (positions_np +
-                         req_indices * self.input_batch.token_ids_cpu.shape[1])
-        torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(),
-                           0,
-                           torch.from_numpy(token_indices),
-                           out=self.input_ids_cpu[:total_num_scheduled_tokens])
-        # Copy the tensors to the NPU.
-        self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens)
-
         # _prepare_inputs may reorder the batch, so we must gather
         # multi-modal outputs after that to ensure the correct order
         if self.is_multimodal_model:
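Both the `index_select` gather and the host-to-device copy of the input ids now go through `_prepare_input_ids`, which is called earlier in `_prepare_inputs` (before attention metadata is built) instead of being split across two places. Pieced together from the removed lines, the pattern being consolidated looks roughly like the sketch below; the actual helper may differ, for example in how it uses `cu_num_tokens`:

```python
# Sketch assembled from the removed inline code; illustrative only, not the
# real _prepare_input_ids implementation.
def _gather_and_copy_input_ids(self, total_num_scheduled_tokens, positions_np,
                               req_indices):
    # Flatten the (num_reqs, max_model_len) token table and pick one token per
    # scheduled position: row_index * row_length + position.
    token_indices = (positions_np +
                     req_indices * self.input_batch.token_ids_cpu.shape[1])
    torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(),
                       0,
                       torch.from_numpy(token_indices),
                       out=self.input_ids_cpu[:total_num_scheduled_tokens])
    # Stage on the pinned CPU buffer, then copy asynchronously to the NPU.
    self.input_ids[:total_num_scheduled_tokens].copy_(
        self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)
```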
@@ -2075,7 +2066,7 @@ def _pool(
         )
 
     def _select_moe_comm_method(self, num_tokens: int,
-                                with_prefill: bool) -> MoECommType:
+                                with_prefill: bool) -> Optional[MoECommType]:
         """1. If expert parallel is not enabled, we use all-gather since MC2 and all-to-all
         are designed for expert parallelism.
         2. If expert parallel is enabled, we need to consider the soc version and the
@@ -2098,6 +2089,9 @@ def _select_moe_comm_method(self, num_tokens: int,
         Returns:
             MoECommType: The selected MoE communication method.
         """
+        if not is_moe_model(self.vllm_config):
+            return None
+
         soc_version = get_ascend_soc_version()
         quant_type = getattr(self.vllm_config.model_config.hf_config,
                              'moe_quantize', None)
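With the early return, `_select_moe_comm_method` yields `None` for dense models instead of picking a communication method that would never be used, which is why the return type widens to `Optional[MoECommType]`. Call sites therefore need a small guard; a hypothetical example (names other than `_select_moe_comm_method` are placeholders, not code from this commit):

```python
# Hypothetical call-site guard; configure_moe_dispatch is a placeholder helper.
moe_comm_type = self._select_moe_comm_method(num_tokens, with_prefill)
if moe_comm_type is not None:
    # Only MoE models set up a dispatch path (MC2 / all-gather / all-to-all).
    configure_moe_dispatch(moe_comm_type)
```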