From 506a52fbc7572c24517f379352e15bfa862cf3be Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Sun, 12 Oct 2025 22:03:42 -0700
Subject: [PATCH 1/2] use combo kernel

Signed-off-by: Boyuan Feng
---
 vllm/config/compilation.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 657c430049f8..6e4568ed4c8d 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -367,6 +367,11 @@ class CompilationConfig:
     since we know all keys are in a range [0, max_capture_size],
     we can optimize it to list[int] for better lookup performance."""
 
+    use_horizontal_fusion = True
+    """Whether use horizontal fusion. This is
+    useful for fusing qk-norm and qk-rope when query and key have
+    different shapes."""
+
     # keep track of enabled and disabled custom ops
     enabled_custom_ops: Counter[str] = field(default_factory=Counter, init=False)
     """custom ops that are enabled"""
@@ -498,6 +503,10 @@ def __post_init__(self) -> None:
         if isinstance(self.pass_config, dict):
             self.pass_config = PassConfig(**self.pass_config)
 
+        if self.use_horizontal_fusion and is_torch_equal_or_newer("2.9.0.dev"):
+            self.inductor_compile_config["combo_kernels"] = True
+            self.inductor_compile_config["benchmark_combo_kernel"] = True
+
         # migrate the deprecated flags
         if not self.use_cudagraph:
             logger.warning(

From 00f4fcab217536bf08eacf352076eb860197b6ab Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Mon, 13 Oct 2025 09:57:06 -0700
Subject: [PATCH 2/2] respect user inductor config

Signed-off-by: Boyuan Feng
---
 vllm/config/compilation.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 6e4568ed4c8d..17a063d0c233 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -367,11 +367,6 @@ class CompilationConfig:
     since we know all keys are in a range [0, max_capture_size],
     we can optimize it to list[int] for better lookup performance."""
 
-    use_horizontal_fusion = True
-    """Whether use horizontal fusion. This is
-    useful for fusing qk-norm and qk-rope when query and key have
-    different shapes."""
-
     # keep track of enabled and disabled custom ops
     enabled_custom_ops: Counter[str] = field(default_factory=Counter, init=False)
     """custom ops that are enabled"""
@@ -503,7 +498,13 @@ def __post_init__(self) -> None:
         if isinstance(self.pass_config, dict):
             self.pass_config = PassConfig(**self.pass_config)
 
-        if self.use_horizontal_fusion and is_torch_equal_or_newer("2.9.0.dev"):
+        if (
+            is_torch_equal_or_newer("2.9.0.dev")
+            and "combo_kernels" not in self.inductor_compile_config
+            and "benchmark_combo_kernel" not in self.inductor_compile_config
+        ):
+            # use horizontal fusion, which is useful for fusing qk-norm and
+            # qk-rope when query and key have different shapes.
             self.inductor_compile_config["combo_kernels"] = True
             self.inductor_compile_config["benchmark_combo_kernel"] = True
 