@@
 import os

 import torch
+from torch._inductor.graph import GraphLowering

 from vllm.logger import init_logger
+from vllm.utils import is_torch_equal_or_newer

 logger = init_logger(__name__)

@@
 os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 # see https://github.com/vllm-project/vllm/issues/10619
 torch._inductor.config.compile_threads = 1
+
+
+def should_partition_patched(self, node, should_log: bool = False) -> bool:
+    # Copied from torch._inductor.scheduler.Scheduler.should_partition. Patches
+    # [this code](https://github.com/pytorch/pytorch/blob/ecb53078faf86ca1b33277df33b82985675bb011/torch/_inductor/scheduler.py#L4712-L4724)
+    # so that we always return True for ops with a registered custom
+    # should-partition function, rather than calling that function.
+    """Return True if we should partition the inductor graph on this node"""
+
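+    # These torch._inductor imports are done lazily inside the function: the
+    # private symbols used below may not exist on older torch builds, and this
+    # patch is only installed on torch >= 2.9.0.dev (see the version gate at
+    # the end of this module).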
+    import torch._inductor.ir as ir
+    from torch._inductor.scheduler import (
+        BaseSchedulerNode,
+        FusedSchedulerNode,
+        _custom_should_partition_fns,
+    )
+    from torch._inductor.utils import (
+        _unstable_customized_partition_wrapper,
+        is_cudagraph_unsafe_op,
+        maybe_log_cudagraph_partition,
+    )
+
+    # Allow users to manually specify if a node should be partitioned
+    # Can only do this for FallbackKernels
+    ir_node = node.node
+    if isinstance(ir_node, ir.FallbackKernel):
+        operator = ir_node.op_overload
+        if operator is not None and operator in _custom_should_partition_fns:
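+            # This is where the patch diverges from upstream torch, which
+            # calls the registered function and returns its verdict; here we
+            # partition unconditionally for any op that registered one.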
+            return True
+
+    # When not using cudagraphs, keep all kernels in the `call` function
+    # instead of graph partition functions, since graph partition only brings
+    # benefit to cudagraph
+    if (
+        not torch._inductor.config.triton.cudagraphs
+        and _unstable_customized_partition_wrapper.wrapper is None
+    ):
+        return True
+
+    # avoid duplicating logs when should_partition is called multiple times
+    # on the same node
+    def noop_log(msg: str, node: BaseSchedulerNode | None) -> None:
+        return
+
+    log_partition_reason = maybe_log_cudagraph_partition if should_log else noop_log
+
+    if isinstance(node, FusedSchedulerNode):
+        return any(self.should_partition(snode) for snode in node.snodes)
+
+    assert node.node is not None
+
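+    # Each check below marks a kind of node that cannot live inside a CUDA
+    # graph partition, so the graph is split around it.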
+    if not node.is_gpu():
+        log_partition_reason("non gpu ops", node=node)
+
+        return True
+
+    if isinstance(node.node, ir.DeviceCopy):
+        log_partition_reason("DeviceCopy ops", node=node)
+        return True
+
+    if isinstance(node.node, ir.Conditional):
+        log_partition_reason("Conditional ops", node=node)
+        return True
+
+    if getattr(node.node, "unbacked_bindings", None):
+        log_partition_reason("unbacked binding ops", node=node)
+        return True
+
+    if is_cudagraph_unsafe_op(node.node):
+        log_partition_reason("CUDAGraph-unsafe custom ops", node=node)
+        return True
+
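+    # Otherwise the node can stay inside the current graph partition.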
+    return False
+
+
+def _update_scheduler_patched(self) -> None:
+    # Copied from torch._inductor.graph.GraphLowering._update_scheduler. Patches
+    # this method so that we can patch Scheduler.should_partition with the
+    # function above.
+    """
+    (Re)initializes the scheduler member. When initializing the scheduler, no CUBIN
+    files should be generated (to avoid biasing any benchmarks and pessimizing
+    fusion decisions).
+    """
+    import torch._inductor.config as config
+    from torch._inductor.scheduler import Scheduler
+
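+    # Patch the class before the instance below is constructed, so the new
+    # should_partition governs this scheduling pass.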
+    Scheduler.should_partition = should_partition_patched
+
+    with config.patch("triton.store_cubin", False):
+        self.scheduler = Scheduler(self.operations)
+
+
+# see https://github.com/vllm-project/vllm/issues/26678
+if is_torch_equal_or_newer("2.9.0.dev"):
+    GraphLowering._update_scheduler = _update_scheduler_patched
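+    # Older torch releases do not have the exact scheduler code paths patched
+    # above, so they are deliberately left untouched.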