Commit 19ca497

[BugFix] Work around graph partition x torch.compile cache issue
In PyTorch 2.9, torch.compile has a bug where the graph partition is not taken into account during caching. Because vLLM's Mode.VLLM_COMPILE is the only mode that uses Inductor graph partition, and VLLM_COMPILE implies there is a PostGradPassManager, we put the list of operators to graph partition into the PostGradPassManager's uuid (which then gets incorporated into Inductor's FX graph cache key). Remove this hack whenever torch.compile fixes it.

Signed-off-by: Richard Zou <zou3519@gmail.com>
1 parent 87efc68 commit 19ca497
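
The sketch below is a standalone illustration of the caching idea behind this workaround, not vLLM's actual implementation: once the (sorted) list of graph-partition operators is folded into the state that gets hashed into the compilation cache key, configurations that differ only in their splitting ops can no longer collide on the same Inductor FX graph cache entry. The make_cache_key helper and the placeholder op/pass names below are hypothetical.

# Hypothetical sketch of the cache-key idea; not vLLM's real uuid/hash_dict code.
import hashlib
import json


def make_cache_key(pass_config_uuid, pass_uuids, splitting_ops=None):
    """Hash a state dict the way a pass-manager uuid might be built."""
    state = {
        "pass_config": pass_config_uuid,
        "passes": list(pass_uuids),
        # Sorting makes the key independent of the order the ops were listed in.
        "splitting_ops": sorted(splitting_ops) if splitting_ops is not None else [],
    }
    return hashlib.sha256(json.dumps(state, sort_keys=True).encode()).hexdigest()


# Same passes, different partition ops -> different keys, so a graph compiled
# for one partitioning is not served from cache for the other.
key_a = make_cache_key("cfg-v1", ["noop-pass"], ["op_a"])
key_b = make_cache_key("cfg-v1", ["noop-pass"], ["op_a", "op_b"])
assert key_a != key_b

# Same ops listed in a different order -> same key, thanks to the sort.
key_c = make_cache_key("cfg-v1", ["noop-pass"], ["op_b", "op_a"])
assert key_b == key_c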

File tree: 1 file changed (+27, -1 lines)
vllm/compilation/pass_manager.py

Lines changed: 27 additions & 1 deletion
@@ -110,6 +110,23 @@ def configure(self, config: VllmConfig):
         self.post_cleanup = PostCleanupPass(config)
         self.fix_functionalization = FixFunctionalizationPass(config)
 
+        # [HACK: Bug with Inductor graph partition and torch.compile cache]
+        # In PyTorch 2.9, torch.compile has a bug where the graph
+        # partition is not taken into account during caching.
+        # Because vLLM's Mode.VLLM_COMPILE is the only mode that uses
+        # Inductor graph partition, and VLLM_COMPILE implies there
+        # is a PostGradPassManager, we put the list of operators to graph
+        # partition into the PostGradPassManager's uuid (which
+        # then gets incorporated into Inductor's FX graph cache key).
+        # Remove this hack whenever torch.compile fixes it.
+        self.splitting_ops = None
+        if config.compilation_config.use_inductor_graph_partition:
+            if config.compilation_config.splitting_ops is None:
+                self.splitting_ops = []
+            else:
+                # Sort them so we're not dependent on the ordering.
+                self.splitting_ops = sorted(config.compilation_config.splitting_ops)
+
     def add(self, pass_: InductorPass):
         assert isinstance(pass_, InductorPass)
         self.passes.append(pass_)
@@ -120,8 +137,17 @@ def uuid(self):
         affects compilation caching. Its uuid depends on the UUIDs of all
         dependent passes and the pass config. See InductorPass for more info.
         """
-        state = {"pass_config": self.pass_config.uuid(), "passes": []}
+        state = {
+            "pass_config": self.pass_config.uuid(),
+            "passes": [],
+            "splitting_ops": [],
+        }
         for pass_ in self.passes:
             state["passes"].append(pass_.uuid())
         state["passes"].append(self.fix_functionalization.uuid())
+
+        # See [HACK: Bug with Inductor graph partition and torch.compile cache]
+        if self.splitting_ops is not None:
+            state["splitting_ops"].extend(self.splitting_ops)
+
         return InductorPass.hash_dict(state)
