From 36315f1787447a997db0d36592a6962cf415b782 Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Tue, 28 Oct 2025 16:58:51 -0700
Subject: [PATCH 01/10] use_aot_compile should respect VLLM_DISABLE_COMPILE_CACHE

Signed-off-by: Boyuan Feng

---
 vllm/envs.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index 73bb2678ea85..b1fb4e5f6804 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -246,10 +246,19 @@ def maybe_convert_bool(value: str | None) -> bool | None:
     return bool(int(value))
 
 
+def disable_compile_cache() -> bool:
+    return bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0")))
+
+
 def use_aot_compile() -> bool:
     from vllm.utils.torch_utils import is_torch_equal_or_newer
 
-    default_value = "1" if is_torch_equal_or_newer("2.10.0.dev") else "0"
+    default_value = (
+        "1"
+        if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache()
+        else "0"
+    )
+
     return os.environ.get("VLLM_USE_AOT_COMPILE", default_value) == "1"
 
 
@@ -944,9 +953,7 @@ def get_vllm_port() -> int | None:
     "VLLM_LOG_BATCHSIZE_INTERVAL": lambda: float(
         os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")
     ),
-    "VLLM_DISABLE_COMPILE_CACHE": lambda: bool(
-        int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))
-    ),
+    "VLLM_DISABLE_COMPILE_CACHE": disable_compile_cache,
     # If set, vllm will run in development mode, which will enable
     # some additional endpoints for developing and debugging,
     # e.g. `/reset_prefix_cache`
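Note: a minimal sketch of the gating behavior PATCH 01 introduces. The
`is_torch_equal_or_newer` check is stubbed out as a plain boolean parameter
(invented for illustration) so only the two environment variables are
exercised; the helper names mirror the patch.

    import os

    def disable_compile_cache() -> bool:
        # "1" disables the torch.compile cache, as in the patch.
        return bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0")))

    def use_aot_compile(torch_new_enough: bool) -> bool:
        # AOT compile defaults on only when torch is new enough AND the
        # compile cache is enabled; VLLM_USE_AOT_COMPILE, if set, still
        # overrides the default in either direction.
        default = "1" if torch_new_enough and not disable_compile_cache() else "0"
        return os.environ.get("VLLM_USE_AOT_COMPILE", default) == "1"

    os.environ["VLLM_DISABLE_COMPILE_CACHE"] = "1"
    assert not use_aot_compile(torch_new_enough=True)  # cache disabled -> default off
    os.environ["VLLM_USE_AOT_COMPILE"] = "1"
    assert use_aot_compile(torch_new_enough=True)      # explicit opt-in still wins
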
From 49f93fbeeca399c793040a24c2221f76318f0348 Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Tue, 28 Oct 2025 17:52:36 -0700
Subject: [PATCH 02/10] fix fx graph cache

Signed-off-by: Boyuan Feng

---
 vllm/compilation/backends.py        |  5 ++---
 vllm/compilation/partition_rules.py | 34 ++++++++++-------------------
 vllm/compilation/pass_manager.py    | 30 +-------------------------
 vllm/env_override.py                | 11 +++++++---
 4 files changed, 22 insertions(+), 58 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 53fd5e74dc0a..803146344f90 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -95,10 +95,9 @@ def compile_context(self, runtime_shape: int | None = None):
         compilation (e.g. partition rules, pass context)."""
         with pass_context(runtime_shape):
             if self.compilation_config.use_inductor_graph_partition:
-                inductor_partition_ops = resolve_defined_ops(
+                with inductor_partition_rule_context(
                     self.compilation_config.splitting_ops
-                )
-                with inductor_partition_rule_context(inductor_partition_ops):
+                ):
                     yield
             else:
                 yield
diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py
index cea4f9a81637..fd48a00f9977 100644
--- a/vllm/compilation/partition_rules.py
+++ b/vllm/compilation/partition_rules.py
@@ -3,15 +3,12 @@
 
 import contextlib
 import logging
-from typing import TYPE_CHECKING
 
+import torch
 from torch._library.utils import lookup_op
 
 from vllm.logger import init_logger
 
-if TYPE_CHECKING:
-    import torch
-
 logger = init_logger(__name__)
 
 
@@ -56,7 +53,7 @@ def resolve_defined_ops(op_names: list[str]) -> list["torch._ops.OpOverload"]:
 
 
 @contextlib.contextmanager
-def inductor_partition_rule_context(overloads: list["torch._ops.OpOverload"]):
+def inductor_partition_rule_context(splitting_ops: list[str]):
     """Context manager to temporarily register Inductor partition rules.
 
     Registers custom partition rules for specified operators, forcing the
@@ -69,34 +66,25 @@ def inductor_partition_rule_context(overloads: list["torch._ops.OpOverload"]):
     Args:
         overloads: List of resolved operator overload objects.
     """
-    if not overloads:
+    if not splitting_ops:
        logger.debug("No partition ops provided; skipping rule registration.")
         yield
         return
 
-    from torch._inductor.scheduler import (  # type: ignore
-        _custom_should_partition_fns,
-        register_should_partition_rule,
-    )
-
-    def _always_partition(*_args, **_kwargs):
-        return True
-
     # Save current state before registering
-    saved_rules = _custom_should_partition_fns.copy()
-    for overload in overloads:
-        register_should_partition_rule(
-            overload,
-            _always_partition,
-        )
+    saved_splitting_ops: list[str] = list(
+        torch._inductor.config.custom_should_partition_ops
+    )
+    torch._inductor.config.custom_should_partition_ops = splitting_ops
 
-    logger.debug("Registered inductor partition rules for %d operators", len(overloads))
+    logger.debug(
+        "Registered inductor partition rules for %d operators", len(splitting_ops)
+    )
 
     try:
         yield
     finally:
         # Clear and restore previous state
-        _custom_should_partition_fns.clear()
-        _custom_should_partition_fns.update(saved_rules)
+        torch._inductor.config.custom_should_partition_ops = saved_splitting_ops
         logger.debug("Restored previous partition rules state.")
 
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index 3bc35a8f7198..dfda2adf1d3b 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -113,27 +113,6 @@ def configure(self, config: VllmConfig):
         self.post_cleanup = PostCleanupPass(config)
         self.fix_functionalization = FixFunctionalizationPass(config)
 
-        # [HACK: Bug with Inductor graph partition and torch.compile cache]
-        # In PyTorch 2.9, torch.compile has a bug where the graph
-        # partition is not taken into account during caching.
-        # Because vLLM's Mode.VLLM_COMPILE is the only mode that uses
-        # Inductor graph partition, and VLLM_COMPILE implies there
-        # is a PostGradPassManager, we put the list of operators to graph
-        # partition into the PostGradPassManager's uuid (which
-        # then gets incorporated into Inductor's FX graph cache key).
-        # Remove this hack whenever torch.compile fixes it.
-
-        # This is the list of operators that vLLM asks Inductor to split.
-        self.inductor_splitting_ops = []
-        if (
-            config.compilation_config.use_inductor_graph_partition
-            and config.compilation_config.splitting_ops is not None
-        ):
-            # Sort them so we're not dependent on the ordering.
-            self.inductor_splitting_ops = sorted(
-                config.compilation_config.splitting_ops
-            )
-
     def add(self, pass_: InductorPass):
         assert isinstance(pass_, InductorPass)
         self.passes.append(pass_)
@@ -144,16 +123,9 @@ def uuid(self):
         affects compilation caching. Its uuid depends on the UUIDs of all
         dependent passes and the pass config. See InductorPass for more info.
         """
-        state = {
-            "pass_config": self.pass_config.uuid(),
-            "passes": [],
-            "inductor_splitting_ops": [],
-        }
+        state = {"pass_config": self.pass_config.uuid(), "passes": []}
         for pass_ in self.passes:
             state["passes"].append(pass_.uuid())
 
         state["passes"].append(self.fix_functionalization.uuid())
-        # See [HACK: Bug with Inductor graph partition and torch.compile cache]
-        state["inductor_splitting_ops"].extend(self.inductor_splitting_ops)
-
         return InductorPass.hash_dict(state)
diff --git a/vllm/env_override.py b/vllm/env_override.py
index ae3e4e751bd9..211c2700ecfb 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -272,7 +272,6 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool:
     from torch._inductor.scheduler import (
         BaseSchedulerNode,
         FusedSchedulerNode,
-        _custom_should_partition_fns,
     )
     from torch._inductor.utils import (
         _unstable_customized_partition_wrapper,
@@ -283,9 +282,13 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool:
     # Allow users to manually specify if a node should be partitioned
     # Can only do this for FallbackKernels
     ir_node = node.node
-    if isinstance(ir_node, ir.FallbackKernel):
+    if isinstance(ir_node, torch._inductor.ir.FallbackKernel):
         operator = ir_node.op_overload
-        if operator is not None and operator in _custom_should_partition_fns:
+        if (
+            operator is not None
+            and operator.name() in torch._inductor.config.custom_should_partition_ops
+        ):
+            assert isinstance(operator, torch._ops.OpOverload)
             return True
 
     # When not using cudagraphs, keep all kernels in the `call` function
@@ -356,5 +359,7 @@ def _update_scheduler_patched(self) -> None:
     from torch._inductor.codegen.wrapper import PythonWrapperCodegen
     from torch._inductor.graph import GraphLowering
 
+    torch._inductor.config.custom_should_partition_ops: list[str] = []
+
     PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched
     GraphLowering._update_scheduler = _update_scheduler_patched
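Note: the substance of PATCH 02 is swapping torch's private
`register_should_partition_rule` hook for the `custom_should_partition_ops`
config entry; the commit title ("fix fx graph cache") suggests the config
entry, unlike the hook, is reflected in Inductor's FX graph cache key, which
is why the pass-manager uuid workaround can be deleted. The context manager
itself follows a plain save/set/restore shape; a self-contained sketch with a
stand-in config object (not vLLM or torch code):

    import contextlib

    class StubInductorConfig:
        # Stand-in for torch._inductor.config in this sketch.
        def __init__(self) -> None:
            self.custom_should_partition_ops: list[str] = []

    config = StubInductorConfig()

    @contextlib.contextmanager
    def partition_rule_context(splitting_ops: list[str]):
        # Save the current list, install the new one, and restore the saved
        # list on exit -- even if the body raises.
        saved = list(config.custom_should_partition_ops)
        config.custom_should_partition_ops = splitting_ops
        try:
            yield
        finally:
            config.custom_should_partition_ops = saved

    with partition_rule_context(["silly::attention"]):
        assert config.custom_should_partition_ops == ["silly::attention"]
    assert config.custom_should_partition_ops == []
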
""" - state = { - "pass_config": self.pass_config.uuid(), - "passes": [], - "inductor_splitting_ops": [], - } + state = {"pass_config": self.pass_config.uuid(), "passes": []} for pass_ in self.passes: state["passes"].append(pass_.uuid()) state["passes"].append(self.fix_functionalization.uuid()) - # See [HACK: Bug with Inductor graph partition and torch.compile cache] - state["inductor_splitting_ops"].extend(self.inductor_splitting_ops) - return InductorPass.hash_dict(state) diff --git a/vllm/env_override.py b/vllm/env_override.py index ae3e4e751bd9..211c2700ecfb 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -272,7 +272,6 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool: from torch._inductor.scheduler import ( BaseSchedulerNode, FusedSchedulerNode, - _custom_should_partition_fns, ) from torch._inductor.utils import ( _unstable_customized_partition_wrapper, @@ -283,9 +282,13 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool: # Allow users to manually specify if a node should be partitioned # Can only do this for FallbackKernels ir_node = node.node - if isinstance(ir_node, ir.FallbackKernel): + if isinstance(ir_node, torch._inductor.ir.FallbackKernel): operator = ir_node.op_overload - if operator is not None and operator in _custom_should_partition_fns: + if ( + operator is not None + and operator.name() in torch._inductor.config.custom_should_partition_ops + ): + assert isinstance(operator, torch._ops.OpOverload) return True # When not using cudagraphs, keep all kernels in the `call` function @@ -356,5 +359,7 @@ def _update_scheduler_patched(self) -> None: from torch._inductor.codegen.wrapper import PythonWrapperCodegen from torch._inductor.graph import GraphLowering + torch._inductor.config.custom_should_partition_ops: list[str] = [] + PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched GraphLowering._update_scheduler = _update_scheduler_patched From 9f1944d3c0c14af90fb72f781cfd695aed82852c Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 28 Oct 2025 21:33:38 -0700 Subject: [PATCH 03/10] nit Signed-off-by: Boyuan Feng --- vllm/env_override.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/env_override.py b/vllm/env_override.py index 211c2700ecfb..d67816d0260a 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -282,13 +282,13 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool: # Allow users to manually specify if a node should be partitioned # Can only do this for FallbackKernels ir_node = node.node - if isinstance(ir_node, torch._inductor.ir.FallbackKernel): - operator = ir_node.op_overload + if isinstance(ir_node, ir.FallbackKernel): + op = ir_node.op_overload if ( - operator is not None - and operator.name() in torch._inductor.config.custom_should_partition_ops + op is not None + and op.name() in torch._inductor.config.custom_should_partition_ops ): - assert isinstance(operator, torch._ops.OpOverload) + assert isinstance(op, torch._ops.OpOverload) return True # When not using cudagraphs, keep all kernels in the `call` function @@ -359,7 +359,8 @@ def _update_scheduler_patched(self) -> None: from torch._inductor.codegen.wrapper import PythonWrapperCodegen from torch._inductor.graph import GraphLowering - torch._inductor.config.custom_should_partition_ops: list[str] = [] + if not hasattr(torch._inductor.config, "custom_should_partition_ops"): + torch._inductor.config.custom_should_partition_ops: list[str] = [] 
PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched GraphLowering._update_scheduler = _update_scheduler_patched From cd9b0b6478f4fcc9c94c22ec738b6ac08a7b9d57 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 28 Oct 2025 21:44:14 -0700 Subject: [PATCH 04/10] update for doc Signed-off-by: Boyuan Feng --- vllm/compilation/partition_rules.py | 2 +- vllm/env_override.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py index fd48a00f9977..df1bafbefee5 100644 --- a/vllm/compilation/partition_rules.py +++ b/vllm/compilation/partition_rules.py @@ -74,7 +74,7 @@ def inductor_partition_rule_context(splitting_ops: list[str]): # Save current state before registering saved_splitting_ops: list[str] = list( - torch._inductor.config.custom_should_partition_ops + getattr(torch._inductor.config, "custom_should_partition_ops", []) ) torch._inductor.config.custom_should_partition_ops = splitting_ops diff --git a/vllm/env_override.py b/vllm/env_override.py index d67816d0260a..dd87e127a702 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -284,9 +284,8 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool: ir_node = node.node if isinstance(ir_node, ir.FallbackKernel): op = ir_node.op_overload - if ( - op is not None - and op.name() in torch._inductor.config.custom_should_partition_ops + if op is not None and op.name() in getattr( + torch._inductor.config, "custom_should_partition_ops", [] ): assert isinstance(op, torch._ops.OpOverload) return True @@ -360,7 +359,7 @@ def _update_scheduler_patched(self) -> None: from torch._inductor.graph import GraphLowering if not hasattr(torch._inductor.config, "custom_should_partition_ops"): - torch._inductor.config.custom_should_partition_ops: list[str] = [] + torch._inductor.config.custom_should_partition_ops = [] PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched GraphLowering._update_scheduler = _update_scheduler_patched From 0e5a38b9c68f65f0f1d44e246b200e11f73b3eee Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Wed, 29 Oct 2025 10:38:10 -0700 Subject: [PATCH 05/10] patch 2.9 Signed-off-by: Boyuan Feng --- vllm/compilation/partition_rules.py | 2 +- vllm/env_override.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py index df1bafbefee5..fd48a00f9977 100644 --- a/vllm/compilation/partition_rules.py +++ b/vllm/compilation/partition_rules.py @@ -74,7 +74,7 @@ def inductor_partition_rule_context(splitting_ops: list[str]): # Save current state before registering saved_splitting_ops: list[str] = list( - getattr(torch._inductor.config, "custom_should_partition_ops", []) + torch._inductor.config.custom_should_partition_ops ) torch._inductor.config.custom_should_partition_ops = splitting_ops diff --git a/vllm/env_override.py b/vllm/env_override.py index dd87e127a702..71027fcc5a39 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -284,8 +284,9 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool: ir_node = node.node if isinstance(ir_node, ir.FallbackKernel): op = ir_node.op_overload - if op is not None and op.name() in getattr( - torch._inductor.config, "custom_should_partition_ops", [] + if ( + op is not None + and op.name() in torch._inductor.config.custom_should_partition_ops ): assert isinstance(op, torch._ops.OpOverload) return True @@ -357,9 +358,14 @@ def 
From 0e5a38b9c68f65f0f1d44e246b200e11f73b3eee Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Wed, 29 Oct 2025 10:38:10 -0700
Subject: [PATCH 05/10] patch 2.9

Signed-off-by: Boyuan Feng

---
 vllm/compilation/partition_rules.py |  2 +-
 vllm/env_override.py                | 14 ++++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py
index df1bafbefee5..fd48a00f9977 100644
--- a/vllm/compilation/partition_rules.py
+++ b/vllm/compilation/partition_rules.py
@@ -74,7 +74,7 @@ def inductor_partition_rule_context(splitting_ops: list[str]):
 
     # Save current state before registering
     saved_splitting_ops: list[str] = list(
-        getattr(torch._inductor.config, "custom_should_partition_ops", [])
+        torch._inductor.config.custom_should_partition_ops
     )
     torch._inductor.config.custom_should_partition_ops = splitting_ops
 
diff --git a/vllm/env_override.py b/vllm/env_override.py
index dd87e127a702..71027fcc5a39 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -284,8 +284,9 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool:
     ir_node = node.node
     if isinstance(ir_node, ir.FallbackKernel):
         op = ir_node.op_overload
-        if op is not None and op.name() in getattr(
-            torch._inductor.config, "custom_should_partition_ops", []
+        if (
+            op is not None
+            and op.name() in torch._inductor.config.custom_should_partition_ops
         ):
             assert isinstance(op, torch._ops.OpOverload)
             return True
@@ -357,9 +358,14 @@ def _update_scheduler_patched(self) -> None:
 if is_torch_equal("2.9.0"):
     from torch._inductor.codegen.wrapper import PythonWrapperCodegen
     from torch._inductor.graph import GraphLowering
+    from torch.utils._config_module import _Config, _ConfigEntry
 
-    if not hasattr(torch._inductor.config, "custom_should_partition_ops"):
-        torch._inductor.config.custom_should_partition_ops = []
+    # `custom_should_partition_ops` is a new config added after 2.9.0, so this
+    # does not overwrite any user config.
+    dummy_config = _Config(default=[])
+    torch._inductor.config._config["custom_should_partition_ops"] = _ConfigEntry(
+        dummy_config
+    )
 
     PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched
     GraphLowering._update_scheduler = _update_scheduler_patched

From 2f7ae24e47afefc02939568c79ca116b6f7f1490 Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Wed, 29 Oct 2025 10:45:39 -0700
Subject: [PATCH 06/10] nit

Signed-off-by: Boyuan Feng

---
 vllm/env_override.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/vllm/env_override.py b/vllm/env_override.py
index 71027fcc5a39..646c16c8aeab 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -282,11 +282,19 @@ def should_partition_patched(self, node, should_log: bool = False) -> bool:
     # Allow users to manually specify if a node should be partitioned
     # Can only do this for FallbackKernels
     ir_node = node.node
-    if isinstance(ir_node, ir.FallbackKernel):
-        op = ir_node.op_overload
+    if isinstance(ir_node, torch._inductor.ir.FallbackKernel) and (
+        op := ir_node.op_overload
+    ):
+        op_overload_packet_name = op.name()
+        op_overload_name = (
+            f"{op_overload_packet_name}.{op._overloadname}"
+            if isinstance(op, torch._ops.OpOverload)
+            else op_overload_packet_name
+        )
         if (
-            op is not None
-            and op.name() in torch._inductor.config.custom_should_partition_ops
+            op_overload_packet_name
+            in torch._inductor.config.custom_should_partition_ops
+            or op_overload_name in torch._inductor.config.custom_should_partition_ops
         ):
             assert isinstance(op, torch._ops.OpOverload)
             return True
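Note: after PATCH 06 an entry in `custom_should_partition_ops` can name either
a whole op ("ns::op", matching every overload) or a single overload
("ns::op.overload"). The naming convention assumed here follows the patch,
where `op.name()` yields the packet-qualified name and `op._overloadname` the
overload suffix. A torch-free sketch of the matching rule:

    def should_partition(packet: str, overload: str | None, allowed: list[str]) -> bool:
        # "ns::op" in `allowed` matches all overloads of the op;
        # "ns::op.overload" matches exactly one overload.
        qualified = f"{packet}.{overload}" if overload else packet
        return packet in allowed or qualified in allowed

    assert should_partition("silly::attention", "default", ["silly::attention"])
    assert should_partition("silly::attention", "default", ["silly::attention.default"])
    assert not should_partition("silly::attention", "out", ["silly::attention.default"])
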
""" if not splitting_ops: logger.debug("No partition ops provided; skipping rule registration.") From 3e374a0ebb54e85a5617987dfccf2672ed39e4a2 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Wed, 29 Oct 2025 14:35:26 -0700 Subject: [PATCH 08/10] nit: doc format Signed-off-by: Boyuan Feng --- vllm/compilation/partition_rules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py index fb3c7451a7f8..fbf8eeb85f16 100644 --- a/vllm/compilation/partition_rules.py +++ b/vllm/compilation/partition_rules.py @@ -62,7 +62,7 @@ def inductor_partition_rule_context(splitting_ops: list[str]): Args: splitting_ops: List of operator names to partition on. The format should - be `namespace::op_name` (e.g., `silly::attention`). + be "namespace::op_name" (e.g., "silly::attention"). """ if not splitting_ops: logger.debug("No partition ops provided; skipping rule registration.") From e069c2eaf87959176d9373804409575df0892634 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Wed, 29 Oct 2025 15:51:33 -0700 Subject: [PATCH 09/10] nit: doc format Signed-off-by: Boyuan Feng --- vllm/compilation/partition_rules.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py index fbf8eeb85f16..094b86dcb4aa 100644 --- a/vllm/compilation/partition_rules.py +++ b/vllm/compilation/partition_rules.py @@ -61,8 +61,7 @@ def inductor_partition_rule_context(splitting_ops: list[str]): are automatically restored to their previous state on exit. Args: - splitting_ops: List of operator names to partition on. The format should - be "namespace::op_name" (e.g., "silly::attention"). + splitting_ops: List of operator names to partition on. """ if not splitting_ops: logger.debug("No partition ops provided; skipping rule registration.") From c942ce2f41226cc09a80d6791e2e9b75b8a12ba9 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 4 Nov 2025 14:22:11 -0800 Subject: [PATCH 10/10] nit Signed-off-by: Boyuan Feng --- vllm/env_override.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/env_override.py b/vllm/env_override.py index 646c16c8aeab..14dae2850c35 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -370,9 +370,8 @@ def _update_scheduler_patched(self) -> None: # `custom_should_partition_ops` is a new config after 2.9.0. So this would # not overwrite any user configs. - dummy_config = _Config(default=[]) torch._inductor.config._config["custom_should_partition_ops"] = _ConfigEntry( - dummy_config + _Config(default=[]) ) PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched