vllm/model_executor/warmup (1 file changed: +13 -7)

@@ -11,7 +11,7 @@
 import torch

 import vllm.envs as envs
-from vllm.config import VllmConfig
+from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup
 from vllm.platforms import current_platform
@@ -30,13 +30,19 @@ def flashinfer_autotune_supported(vllm_config: VllmConfig) -> bool:
     Record known issues with vllm + flashinfer autotune here. Return True if
     and only if flashinfer autotune will run through without issues.
     """
-    return not (
-        vllm_config.parallel_config.data_parallel_size > 1
-        and (
-            envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
-            or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
-        )
+    is_tp_or_dp = (vllm_config.parallel_config.data_parallel_size > 1) or (
+        vllm_config.parallel_config.tensor_parallel_size > 1
     )
+    is_fi_mxfp4_backend = (
+        envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
+        or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
+        or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS
+    ) or (
+        current_platform.is_cuda() and current_platform.is_device_capability(100)
+    )  # on >=sm100, default mxfp4 backend is flashinfer
+    is_eager = vllm_config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
+
+    return not (is_tp_or_dp and is_fi_mxfp4_backend and is_eager)


 def kernel_warmup(worker: "Worker"):
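To make the new gating easier to follow, here is a minimal standalone sketch of the predicate's decision logic. FakeSetup and its fields are hypothetical stand-ins for vLLM's real VllmConfig, the VLLM_USE_FLASHINFER_MOE_MXFP4_* env flags, and the current_platform capability check; only the boolean structure mirrors the diff.

# Minimal sketch of the new flashinfer_autotune_supported() logic, using
# hypothetical stub objects instead of vLLM's VllmConfig/envs/current_platform.
from dataclasses import dataclass

@dataclass
class FakeSetup:
    data_parallel_size: int = 1
    tensor_parallel_size: int = 1
    use_fi_mxfp4_env: bool = False        # any VLLM_USE_FLASHINFER_MOE_MXFP4_* flag set
    is_cuda_sm100_or_newer: bool = False  # current_platform.is_device_capability(100)
    cudagraph_mode_none: bool = False     # CUDAGraphMode.NONE, i.e. eager mode

def flashinfer_autotune_supported(s: FakeSetup) -> bool:
    is_tp_or_dp = s.data_parallel_size > 1 or s.tensor_parallel_size > 1
    # On >= sm100 the default mxfp4 backend is flashinfer, so the capability
    # check counts as a flashinfer mxfp4 backend even without the env flags.
    is_fi_mxfp4_backend = s.use_fi_mxfp4_env or s.is_cuda_sm100_or_newer
    is_eager = s.cudagraph_mode_none
    # Autotune is reported unsupported only when all three conditions hold.
    return not (is_tp_or_dp and is_fi_mxfp4_backend and is_eager)

# TP=2, flashinfer mxfp4 via the sm100 default, eager mode -> skip autotune.
assert not flashinfer_autotune_supported(
    FakeSetup(tensor_parallel_size=2, is_cuda_sm100_or_newer=True, cudagraph_mode_none=True)
)
# Same setup but with CUDA graphs enabled -> autotune runs.
assert flashinfer_autotune_supported(
    FakeSetup(tensor_parallel_size=2, is_cuda_sm100_or_newer=True)
)

Net effect of the change, as visible in the diff: the old code skipped autotune whenever DP > 1 was combined with either of two mxfp4 env flags; the new code narrows the skip to eager mode (CUDAGraphMode.NONE) while widening the trigger set to TP > 1, the CUTLASS mxfp4 flag, and the sm100 flashinfer default.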