From 5e54e7ee01a40bf5e9fafe81b98e60efd3f82e07 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Wed, 29 Oct 2025 10:56:50 -0400 Subject: [PATCH 1/4] relax deepep_ht skip Signed-off-by: Varun Sundar Rabindranath --- vllm/model_executor/warmup/kernel_warmup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py index 79d1927d3210..54ba78fcb7e2 100644 --- a/vllm/model_executor/warmup/kernel_warmup.py +++ b/vllm/model_executor/warmup/kernel_warmup.py @@ -32,6 +32,7 @@ def flashinfer_autotune_supported(vllm_config: VllmConfig) -> bool: """ return not ( vllm_config.parallel_config.data_parallel_size > 1 + and envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" and ( envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 From 67718f19bae9addfeff5a01c3a6c04789cc1fbf9 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Wed, 29 Oct 2025 13:21:38 -0400 Subject: [PATCH 2/4] better conditions Signed-off-by: Varun Sundar Rabindranath --- vllm/model_executor/warmup/kernel_warmup.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py index 54ba78fcb7e2..6ed5be957b30 100644 --- a/vllm/model_executor/warmup/kernel_warmup.py +++ b/vllm/model_executor/warmup/kernel_warmup.py @@ -11,7 +11,7 @@ import torch import vllm.envs as envs -from vllm.config import VllmConfig +from vllm.config import CUDAGraphMode, VllmConfig from vllm.logger import init_logger from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup from vllm.platforms import current_platform @@ -30,14 +30,17 @@ def flashinfer_autotune_supported(vllm_config: VllmConfig) -> bool: Record known issues with vllm + flashinfer autotune here. Return True if and only if flashinfer autotune will run through without issues. 
""" - return not ( - vllm_config.parallel_config.data_parallel_size > 1 - and envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" - and ( - envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 - or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 - ) + is_tp_or_dp = (vllm_config.parallel_config.data_parallel_size > 1) or ( + vllm_config.parallel_config.tensor_parallel_size > 1 + ) + is_fi_mxfp4_backend = ( + envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS ) + is_eager = vllm_config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE + + return not (is_tp_or_dp and is_fi_mxfp4_backend and is_eager) def kernel_warmup(worker: "Worker"): From c95b2d75dede9fcee754b1aef86edbfaa39a22af Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Wed, 29 Oct 2025 14:20:14 -0400 Subject: [PATCH 3/4] fix b200 Signed-off-by: Varun Sundar Rabindranath --- vllm/model_executor/warmup/kernel_warmup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py index 6ed5be957b30..8c0063c58ed6 100644 --- a/vllm/model_executor/warmup/kernel_warmup.py +++ b/vllm/model_executor/warmup/kernel_warmup.py @@ -37,7 +37,9 @@ def flashinfer_autotune_supported(vllm_config: VllmConfig) -> bool: envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS - ) + ) or current_platform.is_device_capability( + 100 + ) # on >sm100, default mxfp4 backend is flashinfer is_eager = vllm_config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE return not (is_tp_or_dp and is_fi_mxfp4_backend and is_eager) From 5ca0376fdaa162710057defe21aa0f8200a18630 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Wed, 29 Oct 2025 14:25:56 -0400 Subject: [PATCH 4/4] fixes Signed-off-by: Varun Sundar Rabindranath --- 
vllm/model_executor/warmup/kernel_warmup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py index 8c0063c58ed6..ffa3bc8f021e 100644 --- a/vllm/model_executor/warmup/kernel_warmup.py +++ b/vllm/model_executor/warmup/kernel_warmup.py @@ -37,9 +37,9 @@ def flashinfer_autotune_supported(vllm_config: VllmConfig) -> bool: envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS - ) or current_platform.is_device_capability( - 100 - ) # on >sm100, default mxfp4 backend is flashinfer + ) or ( + current_platform.is_cuda() and current_platform.is_device_capability(100) + ) # on sm100, default mxfp4 backend is flashinfer is_eager = vllm_config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE return not (is_tp_or_dp and is_fi_mxfp4_backend and is_eager)