File tree Expand file tree Collapse file tree 2 files changed +8
-6
lines changed
vllm/model_executor/layers Expand file tree Collapse file tree 2 files changed +8
-6
lines changed Original file line number Diff line number Diff line change @@ -21,14 +21,14 @@ def __init__(
2121 gemm1_alpha ,
2222 gemm1_beta ,
2323 gemm1_clamp_limit ,
24- max_capture_size ,
24+ tune_max_num_tokens ,
2525 ):
2626 super ().__init__ (quant_config )
2727 self .moe = moe
2828 self .gemm1_alpha = gemm1_alpha
2929 self .gemm1_beta = gemm1_beta
3030 self .gemm1_clamp_limit = gemm1_clamp_limit
31- self .max_capture_size = max_capture_size
31+ self .tune_max_num_tokens = tune_max_num_tokens
3232
3333 @property
3434 def activation_formats (
@@ -127,7 +127,7 @@ def apply(
127127 "routing_method_type" : 1 ,
128128 "do_finalize" : True ,
129129 "output" : output ,
130- "tune_max_num_tokens" : max ( self .max_capture_size , 1 ) ,
130+ "tune_max_num_tokens" : self .tune_max_num_tokens ,
131131 }
132132
133133 from flashinfer import trtllm_fp4_block_scale_routed_moe
Original file line number Diff line number Diff line change @@ -204,8 +204,10 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
204204 def __init__ (self , moe : FusedMoEConfig ):
205205 super ().__init__ (moe )
206206 self .mxfp4_backend = get_mxfp4_backend (moe .is_lora_enabled )
207- self .max_capture_size = (
208- get_current_vllm_config ().compilation_config .max_cudagraph_capture_size
207+ # Be conservative and tune for the most extreme imbalance for MoE,
208+ # i.e., one expert receives all the tokens.
209+ self .tune_max_num_tokens = (
210+ get_current_vllm_config ().scheduler_config .max_num_batched_tokens
209211 )
210212
211213 assert self .mxfp4_backend != Mxfp4Backend .NONE , (
@@ -845,7 +847,7 @@ def select_gemm_impl(
845847 "gemm1_beta" : layer .gemm1_beta ,
846848 "gemm1_clamp_limit" : layer .gemm1_clamp_limit ,
847849 # TODO(bnell): part of quant_config
848- "max_capture_size " : self .max_capture_size ,
850+ "tune_max_num_tokens " : self .tune_max_num_tokens ,
849851 }
850852 return TrtLlmGenExperts (self .moe , self .moe_quant_config , ** kwargs )
851853 elif self .mxfp4_backend == Mxfp4Backend .MARLIN :
You can’t perform that action at this time.
0 commit comments