From 36315f1787447a997db0d36592a6962cf415b782 Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Tue, 28 Oct 2025 16:58:51 -0700
Subject: [PATCH 01/10] use_aot_compile should respect VLLM_DISABLE_COMPILE_CACHE

Signed-off-by: Boyuan Feng

---
 vllm/envs.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index 73bb2678ea85..b1fb4e5f6804 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -246,10 +246,19 @@ def maybe_convert_bool(value: str | None) -> bool | None:
     return bool(int(value))
 
 
+def disable_compile_cache() -> bool:
+    return bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0")))
+
+
 def use_aot_compile() -> bool:
     from vllm.utils.torch_utils import is_torch_equal_or_newer
 
-    default_value = "1" if is_torch_equal_or_newer("2.10.0.dev") else "0"
+    default_value = (
+        "1"
+        if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache()
+        else "0"
+    )
+
     return os.environ.get("VLLM_USE_AOT_COMPILE", default_value) == "1"
 
 
@@ -944,9 +953,7 @@ def get_vllm_port() -> int | None:
     "VLLM_LOG_BATCHSIZE_INTERVAL": lambda: float(
         os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")
     ),
-    "VLLM_DISABLE_COMPILE_CACHE": lambda: bool(
-        int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))
-    ),
+    "VLLM_DISABLE_COMPILE_CACHE": disable_compile_cache,
     # If set, vllm will run in development mode, which will enable
     # some additional endpoints for developing and debugging,
     # e.g. `/reset_prefix_cache`
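Note: a minimal sketch of the gating behavior PATCH 01 introduces. The
`is_torch_equal_or_newer` check is stubbed out as a plain boolean parameter
(invented for illustration) so only the two environment variables are
exercised; the helper names mirror the patch.

    import os

    def disable_compile_cache() -> bool:
        # "1" disables the torch.compile cache, as in the patch.
        return bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0")))

    def use_aot_compile(torch_new_enough: bool) -> bool:
        # AOT compile defaults on only when torch is new enough AND the
        # compile cache is enabled; VLLM_USE_AOT_COMPILE, if set, still
        # overrides the default in either direction.
        default = "1" if torch_new_enough and not disable_compile_cache() else "0"
        return os.environ.get("VLLM_USE_AOT_COMPILE", default) == "1"

    os.environ["VLLM_DISABLE_COMPILE_CACHE"] = "1"
    assert not use_aot_compile(torch_new_enough=True)  # cache disabled -> default off
    os.environ["VLLM_USE_AOT_COMPILE"] = "1"
    assert use_aot_compile(torch_new_enough=True)      # explicit opt-in still wins
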
From 49f93fbeeca399c793040a24c2221f76318f0348 Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Tue, 28 Oct 2025 17:52:36 -0700
Subject: [PATCH 02/10] fix fx graph cache

Signed-off-by: Boyuan Feng

---
 vllm/compilation/backends.py        |  5 ++---
 vllm/compilation/partition_rules.py | 34 ++++++++++-------------------
 vllm/compilation/pass_manager.py    | 30 +-------------------------
 vllm/env_override.py                | 11 +++++++---
 4 files changed, 22 insertions(+), 58 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 53fd5e74dc0a..803146344f90 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -95,10 +95,9 @@ def compile_context(self, runtime_shape: int | None = None):
         compilation (e.g. partition rules, pass context)."""
         with pass_context(runtime_shape):
             if self.compilation_config.use_inductor_graph_partition:
-                inductor_partition_ops = resolve_defined_ops(
+                with inductor_partition_rule_context(
                     self.compilation_config.splitting_ops
-                )
-                with inductor_partition_rule_context(inductor_partition_ops):
+                ):
                     yield
             else:
                 yield
diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py
index cea4f9a81637..fd48a00f9977 100644
--- a/vllm/compilation/partition_rules.py
+++ b/vllm/compilation/partition_rules.py
@@ -3,15 +3,12 @@
 
 import contextlib
 import logging
-from typing import TYPE_CHECKING
 
+import torch
 from torch._library.utils import lookup_op
 
 from vllm.logger import init_logger
 
-if TYPE_CHECKING:
-    import torch
-
 logger = init_logger(__name__)
 
 
@@ -56,7 +53,7 @@ def resolve_defined_ops(op_names: list[str]) -> list["torch._ops.OpOverload"]:
 
 
 @contextlib.contextmanager
-def inductor_partition_rule_context(overloads: list["torch._ops.OpOverload"]):
+def inductor_partition_rule_context(splitting_ops: list[str]):
     """Context manager to temporarily register Inductor partition rules.
 
     Registers custom partition rules for specified operators, forcing the
@@ -69,34 +66,25 @@ def inductor_partition_rule_context(overloads: list["torch._ops.OpOverload"]):
     Args:
         overloads: List of resolved operator overload objects.
     """
-    if not overloads:
+    if not splitting_ops:
        logger.debug("No partition ops provided; skipping rule registration.")
         yield
         return
 
-    from torch._inductor.scheduler import (  # type: ignore
-        _custom_should_partition_fns,
-        register_should_partition_rule,
-    )
-
-    def _always_partition(*_args, **_kwargs):
-        return True
-
     # Save current state before registering
-    saved_rules = _custom_should_partition_fns.copy()
-    for overload in overloads:
-        register_should_partition_rule(
-            overload,
-            _always_partition,
-        )
+    saved_splitting_ops: list[str] = list(
+        torch._inductor.config.custom_should_partition_ops
+    )
+    torch._inductor.config.custom_should_partition_ops = splitting_ops
 
-    logger.debug("Registered inductor partition rules for %d operators", len(overloads))
+    logger.debug(
+        "Registered inductor partition rules for %d operators", len(splitting_ops)
+    )
 
     try:
         yield
     finally:
         # Clear and restore previous state
-        _custom_should_partition_fns.clear()
-        _custom_should_partition_fns.update(saved_rules)
+        torch._inductor.config.custom_should_partition_ops = saved_splitting_ops
         logger.debug("Restored previous partition rules state.")
 
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index 3bc35a8f7198..dfda2adf1d3b 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -113,27 +113,6 @@ def configure(self, config: VllmConfig):
         self.post_cleanup = PostCleanupPass(config)
         self.fix_functionalization = FixFunctionalizationPass(config)
 
-        # [HACK: Bug with Inductor graph partition and torch.compile cache]
-        # In PyTorch 2.9, torch.compile has a bug where the graph
-        # partition is not taken into account during caching.
-        # Because vLLM's Mode.VLLM_COMPILE is the only mode that uses
-        # Inductor graph partition, and VLLM_COMPILE implies there
-        # is a PostGradPassManager, we put the list of operators to graph
-        # partition into the PostGradPassManager's uuid (which
-        # then gets incorporated into Inductor's FX graph cache key).
-        # Remove this hack whenever torch.compile fixes it.
-
-        # This is the list of operators that vLLM asks Inductor to split.
-        self.inductor_splitting_ops = []
-        if (
-            config.compilation_config.use_inductor_graph_partition
-            and config.compilation_config.splitting_ops is not None
-        ):
-            # Sort them so we're not dependent on the ordering.
-            self.inductor_splitting_ops = sorted(
-                config.compilation_config.splitting_ops
-            )
-
     def add(self, pass_: InductorPass):
         assert isinstance(pass_, InductorPass)
         self.passes.append(pass_)
@@ -144,16 +123,9 @@ def uuid(self):
         affects compilation caching. Its uuid depends on the UUIDs of all
         dependent passes and the pass config. See InductorPass for more info.
         """
-        state = {
-            "pass_config": self.pass_config.uuid(),
-            "passes": [],
-            "inductor_splitting_ops": [],
-        }
+        state = {"pass_config": self.pass_config.uuid(), "passes": []}
         for pass_ in self.passes:
             state["passes"].append(pass_.uuid())
 
         state["passes"].append(self.fix_functionalization.uuid())
-        # See [HACK: Bug with Inductor graph partition and torch.compile cache]
-        state["inductor_splitting_ops"].extend(self.inductor_splitting_ops)
-
         return InductorPass.hash_dict(state)
diff --git a/vllm/env_override.py b/vllm/env_override.py
index ae3e4e751bd9..211c2700ecfb 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -272,7 +272,6 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool:
     from torch._inductor.scheduler import (
         BaseSchedulerNode,
         FusedSchedulerNode,
-        _custom_should_partition_fns,
     )
     from torch._inductor.utils import (
         _unstable_customized_partition_wrapper,
@@ -283,9 +282,13 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool:
     # Allow users to manually specify if a node should be partitioned
     # Can only do this for FallbackKernels
     ir_node = node.node
-    if isinstance(ir_node, ir.FallbackKernel):
+    if isinstance(ir_node, torch._inductor.ir.FallbackKernel):
         operator = ir_node.op_overload
-        if operator is not None and operator in _custom_should_partition_fns:
+        if (
+            operator is not None
+            and operator.name() in torch._inductor.config.custom_should_partition_ops
+        ):
+            assert isinstance(operator, torch._ops.OpOverload)
             return True
 
     # When not using cudagraphs, keep all kernels in the `call` function
@@ -356,5 +359,7 @@ def _update_scheduler_patched(self) -> None:
     from torch._inductor.codegen.wrapper import PythonWrapperCodegen
     from torch._inductor.graph import GraphLowering
 
+    torch._inductor.config.custom_should_partition_ops: list[str] = []
+
     PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched
     GraphLowering._update_scheduler = _update_scheduler_patched
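Note: the substance of PATCH 02 is swapping torch's private
`register_should_partition_rule` hook for the `custom_should_partition_ops`
config entry; the commit title ("fix fx graph cache") suggests the config
entry, unlike the hook, is reflected in Inductor's FX graph cache key, which
is why the pass-manager uuid workaround can be deleted. The context manager
itself follows a plain save/set/restore shape; a self-contained sketch with a
stand-in config object (not vLLM or torch code):

    import contextlib

    class StubInductorConfig:
        # Stand-in for torch._inductor.config in this sketch.
        def __init__(self) -> None:
            self.custom_should_partition_ops: list[str] = []

    config = StubInductorConfig()

    @contextlib.contextmanager
    def partition_rule_context(splitting_ops: list[str]):
        # Save the current list, install the new one, and restore the saved
        # list on exit -- even if the body raises.
        saved = list(config.custom_should_partition_ops)
        config.custom_should_partition_ops = splitting_ops
        try:
            yield
        finally:
            config.custom_should_partition_ops = saved

    with partition_rule_context(["silly::attention"]):
        assert config.custom_should_partition_ops == ["silly::attention"]
    assert config.custom_should_partition_ops == []
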
""" - state = { - "pass_config": self.pass_config.uuid(), - "passes": [], - "inductor_splitting_ops": [], - } + state = {"pass_config": self.pass_config.uuid(), "passes": []} for pass_ in self.passes: state["passes"].append(pass_.uuid()) state["passes"].append(self.fix_functionalization.uuid()) - # See [HACK: Bug with Inductor graph partition and torch.compile cache] - state["inductor_splitting_ops"].extend(self.inductor_splitting_ops) - return InductorPass.hash_dict(state) diff --git a/vllm/env_override.py b/vllm/env_override.py index ae3e4e751bd9..211c2700ecfb 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -272,7 +272,6 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool: from torch._inductor.scheduler import ( BaseSchedulerNode, FusedSchedulerNode, - _custom_should_partition_fns, ) from torch._inductor.utils import ( _unstable_customized_partition_wrapper, @@ -283,9 +282,13 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool: # Allow users to manually specify if a node should be partitioned # Can only do this for FallbackKernels ir_node = node.node - if isinstance(ir_node, ir.FallbackKernel): + if isinstance(ir_node, torch._inductor.ir.FallbackKernel): operator = ir_node.op_overload - if operator is not None and operator in _custom_should_partition_fns: + if ( + operator is not None + and operator.name() in torch._inductor.config.custom_should_partition_ops + ): + assert isinstance(operator, torch._ops.OpOverload) return True # When not using cudagraphs, keep all kernels in the `call` function @@ -356,5 +359,7 @@ def _update_scheduler_patched(self) -> None: from torch._inductor.codegen.wrapper import PythonWrapperCodegen from torch._inductor.graph import GraphLowering + torch._inductor.config.custom_should_partition_ops: list[str] = [] + PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched GraphLowering._update_scheduler = _update_scheduler_patched From 9f1944d3c0c14af90fb72f781cfd695aed82852c Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 28 Oct 2025 21:33:38 -0700 Subject: [PATCH 03/10] nit Signed-off-by: Boyuan Feng --- vllm/env_override.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/env_override.py b/vllm/env_override.py index 211c2700ecfb..d67816d0260a 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -282,13 +282,13 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool: # Allow users to manually specify if a node should be partitioned # Can only do this for FallbackKernels ir_node = node.node - if isinstance(ir_node, torch._inductor.ir.FallbackKernel): - operator = ir_node.op_overload + if isinstance(ir_node, ir.FallbackKernel): + op = ir_node.op_overload if ( - operator is not None - and operator.name() in torch._inductor.config.custom_should_partition_ops + op is not None + and op.name() in torch._inductor.config.custom_should_partition_ops ): - assert isinstance(operator, torch._ops.OpOverload) + assert isinstance(op, torch._ops.OpOverload) return True # When not using cudagraphs, keep all kernels in the `call` function @@ -359,7 +359,8 @@ def _update_scheduler_patched(self) -> None: from torch._inductor.codegen.wrapper import PythonWrapperCodegen from torch._inductor.graph import GraphLowering - torch._inductor.config.custom_should_partition_ops: list[str] = [] + if not hasattr(torch._inductor.config, "custom_should_partition_ops"): + torch._inductor.config.custom_should_partition_ops: list[str] = [] 
PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched GraphLowering._update_scheduler = _update_scheduler_patched From cd9b0b6478f4fcc9c94c22ec738b6ac08a7b9d57 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 28 Oct 2025 21:44:14 -0700 Subject: [PATCH 04/10] update for doc Signed-off-by: Boyuan Feng --- vllm/compilation/partition_rules.py | 2 +- vllm/env_override.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py index fd48a00f9977..df1bafbefee5 100644 --- a/vllm/compilation/partition_rules.py +++ b/vllm/compilation/partition_rules.py @@ -74,7 +74,7 @@ def inductor_partition_rule_context(splitting_ops: list[str]): # Save current state before registering saved_splitting_ops: list[str] = list( - torch._inductor.config.custom_should_partition_ops + getattr(torch._inductor.config, "custom_should_partition_ops", []) ) torch._inductor.config.custom_should_partition_ops = splitting_ops diff --git a/vllm/env_override.py b/vllm/env_override.py index d67816d0260a..dd87e127a702 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -284,9 +284,8 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool: ir_node = node.node if isinstance(ir_node, ir.FallbackKernel): op = ir_node.op_overload - if ( - op is not None - and op.name() in torch._inductor.config.custom_should_partition_ops + if op is not None and op.name() in getattr( + torch._inductor.config, "custom_should_partition_ops", [] ): assert isinstance(op, torch._ops.OpOverload) return True @@ -360,7 +359,7 @@ def _update_scheduler_patched(self) -> None: from torch._inductor.graph import GraphLowering if not hasattr(torch._inductor.config, "custom_should_partition_ops"): - torch._inductor.config.custom_should_partition_ops: list[str] = [] + torch._inductor.config.custom_should_partition_ops = [] PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched GraphLowering._update_scheduler = _update_scheduler_patched From 0e5a38b9c68f65f0f1d44e246b200e11f73b3eee Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Wed, 29 Oct 2025 10:38:10 -0700 Subject: [PATCH 05/10] patch 2.9 Signed-off-by: Boyuan Feng --- vllm/compilation/partition_rules.py | 2 +- vllm/env_override.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py index df1bafbefee5..fd48a00f9977 100644 --- a/vllm/compilation/partition_rules.py +++ b/vllm/compilation/partition_rules.py @@ -74,7 +74,7 @@ def inductor_partition_rule_context(splitting_ops: list[str]): # Save current state before registering saved_splitting_ops: list[str] = list( - getattr(torch._inductor.config, "custom_should_partition_ops", []) + torch._inductor.config.custom_should_partition_ops ) torch._inductor.config.custom_should_partition_ops = splitting_ops diff --git a/vllm/env_override.py b/vllm/env_override.py index dd87e127a702..71027fcc5a39 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -284,8 +284,9 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool: ir_node = node.node if isinstance(ir_node, ir.FallbackKernel): op = ir_node.op_overload - if op is not None and op.name() in getattr( - torch._inductor.config, "custom_should_partition_ops", [] + if ( + op is not None + and op.name() in torch._inductor.config.custom_should_partition_ops ): assert isinstance(op, torch._ops.OpOverload) return True @@ -357,9 +358,14 @@ def 
From 0e5a38b9c68f65f0f1d44e246b200e11f73b3eee Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Wed, 29 Oct 2025 10:38:10 -0700
Subject: [PATCH 05/10] patch 2.9

Signed-off-by: Boyuan Feng

---
 vllm/compilation/partition_rules.py |  2 +-
 vllm/env_override.py                | 14 ++++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py
index df1bafbefee5..fd48a00f9977 100644
--- a/vllm/compilation/partition_rules.py
+++ b/vllm/compilation/partition_rules.py
@@ -74,7 +74,7 @@ def inductor_partition_rule_context(splitting_ops: list[str]):
 
     # Save current state before registering
     saved_splitting_ops: list[str] = list(
-        getattr(torch._inductor.config, "custom_should_partition_ops", [])
+        torch._inductor.config.custom_should_partition_ops
     )
     torch._inductor.config.custom_should_partition_ops = splitting_ops
 
diff --git a/vllm/env_override.py b/vllm/env_override.py
index dd87e127a702..71027fcc5a39 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -284,8 +284,9 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool:
     ir_node = node.node
     if isinstance(ir_node, ir.FallbackKernel):
         op = ir_node.op_overload
-        if op is not None and op.name() in getattr(
-            torch._inductor.config, "custom_should_partition_ops", []
+        if (
+            op is not None
+            and op.name() in torch._inductor.config.custom_should_partition_ops
         ):
             assert isinstance(op, torch._ops.OpOverload)
             return True
@@ -357,9 +358,14 @@ def _update_scheduler_patched(self) -> None:
 if is_torch_equal("2.9.0"):
     from torch._inductor.codegen.wrapper import PythonWrapperCodegen
     from torch._inductor.graph import GraphLowering
+    from torch.utils._config_module import _Config, _ConfigEntry
 
-    if not hasattr(torch._inductor.config, "custom_should_partition_ops"):
-        torch._inductor.config.custom_should_partition_ops = []
+    # `custom_should_partition_ops` is a new config added after 2.9.0, so this
+    # does not overwrite any user config.
+    dummy_config = _Config(default=[])
+    torch._inductor.config._config["custom_should_partition_ops"] = _ConfigEntry(
+        dummy_config
+    )
 
     PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched
     GraphLowering._update_scheduler = _update_scheduler_patched

From 2f7ae24e47afefc02939568c79ca116b6f7f1490 Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Wed, 29 Oct 2025 10:45:39 -0700
Subject: [PATCH 06/10] nit

Signed-off-by: Boyuan Feng

---
 vllm/env_override.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/vllm/env_override.py b/vllm/env_override.py
index 71027fcc5a39..646c16c8aeab 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -282,11 +282,19 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool:
     # Allow users to manually specify if a node should be partitioned
     # Can only do this for FallbackKernels
     ir_node = node.node
-    if isinstance(ir_node, ir.FallbackKernel):
-        op = ir_node.op_overload
+    if isinstance(ir_node, torch._inductor.ir.FallbackKernel) and (
+        op := ir_node.op_overload
+    ):
+        op_overload_packet_name = op.name()
+        op_overload_name = (
+            f"{op_overload_packet_name}.{op._overloadname}"
+            if isinstance(op, torch._ops.OpOverload)
+            else op_overload_packet_name
+        )
         if (
-            op is not None
-            and op.name() in torch._inductor.config.custom_should_partition_ops
+            op_overload_packet_name
+            in torch._inductor.config.custom_should_partition_ops
+            or op_overload_name in torch._inductor.config.custom_should_partition_ops
         ):
             assert isinstance(op, torch._ops.OpOverload)
             return True
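Note: after PATCH 06 an entry in `custom_should_partition_ops` can name either
a whole op ("ns::op", matching every overload) or a single overload
("ns::op.overload"). The naming convention assumed here follows the patch,
where `op.name()` yields the packet-qualified name and `op._overloadname` the
overload suffix. A torch-free sketch of the matching rule:

    def should_partition(packet: str, overload: str | None, allowed: list[str]) -> bool:
        # "ns::op" in `allowed` matches all overloads of the op;
        # "ns::op.overload" matches exactly one overload.
        qualified = f"{packet}.{overload}" if overload else packet
        return packet in allowed or qualified in allowed

    assert should_partition("silly::attention", "default", ["silly::attention"])
    assert should_partition("silly::attention", "default", ["silly::attention.default"])
    assert not should_partition("silly::attention", "out", ["silly::attention.default"])
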
""" if not splitting_ops: logger.debug("No partition ops provided; skipping rule registration.") From 3e374a0ebb54e85a5617987dfccf2672ed39e4a2 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Wed, 29 Oct 2025 14:35:26 -0700 Subject: [PATCH 08/10] nit: doc format Signed-off-by: Boyuan Feng --- vllm/compilation/partition_rules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py index fb3c7451a7f8..fbf8eeb85f16 100644 --- a/vllm/compilation/partition_rules.py +++ b/vllm/compilation/partition_rules.py @@ -62,7 +62,7 @@ def inductor_partition_rule_context(splitting_ops: list[str]): Args: splitting_ops: List of operator names to partition on. The format should - be `namespace::op_name` (e.g., `silly::attention`). + be "namespace::op_name" (e.g., "silly::attention"). """ if not splitting_ops: logger.debug("No partition ops provided; skipping rule registration.") From e069c2eaf87959176d9373804409575df0892634 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Wed, 29 Oct 2025 15:51:33 -0700 Subject: [PATCH 09/10] nit: doc format Signed-off-by: Boyuan Feng --- vllm/compilation/partition_rules.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py index fbf8eeb85f16..094b86dcb4aa 100644 --- a/vllm/compilation/partition_rules.py +++ b/vllm/compilation/partition_rules.py @@ -61,8 +61,7 @@ def inductor_partition_rule_context(splitting_ops: list[str]): are automatically restored to their previous state on exit. Args: - splitting_ops: List of operator names to partition on. The format should - be "namespace::op_name" (e.g., "silly::attention"). + splitting_ops: List of operator names to partition on. """ if not splitting_ops: logger.debug("No partition ops provided; skipping rule registration.") From c942ce2f41226cc09a80d6791e2e9b75b8a12ba9 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 4 Nov 2025 14:22:11 -0800 Subject: [PATCH 10/10] nit Signed-off-by: Boyuan Feng --- vllm/env_override.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/env_override.py b/vllm/env_override.py index 646c16c8aeab..14dae2850c35 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -370,9 +370,8 @@ def _update_scheduler_patched(self) -> None: # `custom_should_partition_ops` is a new config after 2.9.0. So this would # not overwrite any user configs. - dummy_config = _Config(default=[]) torch._inductor.config._config["custom_should_partition_ops"] = _ConfigEntry( - dummy_config + _Config(default=[]) ) PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched