Honor force_disable_caches when checking vLLM compile cache

gmagogsfm · gmagogsfm · commit 60b709e66642 · 2025-11-01T00:20:03.000-07:00
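Summary:
Add is_compile_cache_enabled() to vllm/compilation/compiler_interface.py.
It reports the compile cache as enabled only when none of the following
is set: the VLLM_DISABLE_COMPILE_CACHE environment variable,
torch._inductor.config.force_disable_caches, and force_disable_caches in
the current vLLM config's inductor_compile_config.

Replace the direct VLLM_DISABLE_COMPILE_CACHE checks in backends.py and
compiler_interface.py with this helper, and enter
set_current_vllm_config(self.vllm_config) alongside the other compile-time
context managers in decorators.py.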

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
@@ -33,6 +33,7 @@
     EagerAdaptor,
     InductorAdaptor,
     InductorStandaloneAdaptor,
+    is_compile_cache_enabled,
 )
 from .counter import compilation_counter
 from .inductor_pass import InductorPass
@@ -238,7 +239,7 @@ def compile(
         assert compiled_graph is not None, "Failed to compile the graph"
 
         # store the artifact in the cache
-        if not envs.VLLM_DISABLE_COMPILE_CACHE and handle is not None:
+        if is_compile_cache_enabled() and handle is not None:
             self.cache[(runtime_shape, graph_index, self.compiler.name)] = handle
             compilation_counter.num_cache_entries_updated += 1
             self.is_cache_updated = True
@@ -610,7 +611,7 @@ def __call__(
         os.makedirs(local_cache_dir, exist_ok=True)
         self.compilation_config.local_cache_dir = local_cache_dir
 
-        disable_cache = envs.VLLM_DISABLE_COMPILE_CACHE
+        disable_cache = not is_compile_cache_enabled()
 
         if disable_cache:
             logger.info_once("vLLM's torch.compile cache is disabled.", scope="local")
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
@@ -15,7 +15,7 @@
 
 import vllm.envs as envs
 from vllm.compilation.counter import compilation_counter
-from vllm.config import VllmConfig
+from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 
 
@@ -163,6 +163,19 @@ def get_inductor_factors() -> list[Any]:
     return factors
 
 
+def is_compile_cache_enabled() -> bool:
+    """Return True unless env, torch, or vLLM inductor config disables caching."""
+    # TODO(gmagogsfm): Use torch.compiler.config.force_disable_caches instead of
+    # torch._inductor.config's once the minimum PyTorch version reaches 2.10.
+    if envs.VLLM_DISABLE_COMPILE_CACHE or torch._inductor.config.force_disable_caches:
+        return False
+    compilation_config = get_current_vllm_config().compilation_config
+    # inductor_compile_config is a plain dict, so read the key with .get().
+    return not compilation_config.inductor_compile_config.get(
+        "force_disable_caches", False
+    )
+
+
 class InductorStandaloneAdaptor(CompilerInterface):
     """
     The adaptor for the Inductor compiler.
@@ -219,7 +232,8 @@ def compile(
         # Save the compiled artifact to disk in the specified path
         assert key is not None
         path = os.path.join(self.cache_dir, key)
-        if not envs.VLLM_DISABLE_COMPILE_CACHE:
+
+        if is_compile_cache_enabled():
             compiled_graph.save(path=path, format="unpacked")
             compilation_counter.num_compiled_artifacts_saved += 1
         return compiled_graph, (key, path)
@@ -469,10 +483,8 @@ def _get_shape_env() -> AlwaysHitShapeEnv:
                 config_patches=current_config,
             )
 
-        # We treat VLLM_DISABLE_COMPILE_CACHE as the overall switch for torch
-        # compilation cache. So turn off the checks if we disable the
-        # compilation cache.
-        if not envs.VLLM_DISABLE_COMPILE_CACHE:
+        # Turn off the checks if we disable the compilation cache.
+        if is_compile_cache_enabled():
             if hash_str is None:
                 raise RuntimeError(
                     "vLLM failed to compile the model. The most "
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
@@ -447,6 +447,7 @@ def patched_inline_call(self_):
                     InliningInstructionTranslator, "inline_call_", patched_inline_call
                 ),
                 torch._dynamo.config.patch(**dynamo_config_patches),
+                set_current_vllm_config(self.vllm_config),
                 maybe_use_cudagraph_partition_wrapper(self.vllm_config),
                 _torch27_patch_tensor_subclasses(),
             ):