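Address review comments: count weak-ref outputs in the CUDA graph wrapper's __call__ instead of VllmBackend.__init__, and set the decorator's class attributes before the repeated-decoration early return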

sarckk · sarckk · commit 0396bffa175c · 2025-09-19T12:49:26.000-07:00
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py
@@ -257,6 +257,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             return x
 
     @support_torch_compile(no_weak_ref_output=True)
+    @support_torch_compile(no_weak_ref_output=False)
     class B(A):
         ...
 
@@ -283,12 +284,8 @@ class C(B):
     ):
         run_model(vllm_config, mod_A, cudagraph_runtime_mode)
 
-    with compilation_counter.expect(
-            num_weakref_output_graphs=1,
-            # This is 1 instead of 0 because B inherits from A
-            # and A's __init__ is called which initializes the VllmBackend
-            # If no_weak_ref_output=False, this value would be 2
-    ) and set_current_vllm_config(vllm_config):
+    with compilation_counter.expect(num_weakref_output_graphs=0,
+                                    ) and set_current_vllm_config(vllm_config):
         mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda()
 
     # B also has support_torch_compile
@@ -301,12 +298,8 @@ class C(B):
     ):
         run_model(vllm_config, mod_B, cudagraph_runtime_mode)
 
-    with compilation_counter.expect(
-            num_weakref_output_graphs=2,
-            # C inherits from B which inherits from A
-            # both B and A's __init__ are called, incrementing the count by 2
-            # as A has no_weak_ref_output=False
-    ) and set_current_vllm_config(vllm_config):
+    with compilation_counter.expect(num_weakref_output_graphs=1,
+                                    ) and set_current_vllm_config(vllm_config):
         mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda()
 
     # C has support_torch_compile
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
@@ -454,9 +454,6 @@ def __init__(
             self.compilation_config)
 
         self.no_weak_ref_output = no_weak_ref_output
-        if not self.no_weak_ref_output:
-            # used for testing purposes
-            compilation_counter.num_weakref_output_graphs += 1
 
         # `torch.compile` is JIT compiled, so we don't need to
         # do anything here
diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py
@@ -167,6 +167,8 @@ def __call__(self, *args, **kwargs):
                         # any other cuda graph.
                         output = weak_ref_tensors(output)
 
+                        compilation_counter.num_weakref_output_graphs += 1
+
             # here we always use weak ref for the output
             # to save memory
             entry.output = weak_ref_tensors(output)
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
@@ -219,6 +219,11 @@ def _support_torch_compile(
     """
     A decorator to add support for compiling the forward method of a class.
     """
+    setattr(cls, IGNORE_COMPILE_KEY, False)
+
+    # setting as attribute on cls ensures child class will override parent class
+    setattr(cls, LAST_PIECEWISE_GRAPH_WEAKREF_KEY, no_weak_ref_output)
+
     if TorchCompileWrapperWithCustomDispatcher in cls.__bases__:
         # support decorating multiple times
         return cls
@@ -230,11 +235,6 @@ def _support_torch_compile(
 
     old_init = cls.__init__
 
-    setattr(cls, IGNORE_COMPILE_KEY, False)
-
-    # setting as attribute on cls ensures child class will override parent class
-    setattr(cls, LAST_PIECEWISE_GRAPH_WEAKREF_KEY, no_weak_ref_output)
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs):
         old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
         self.vllm_config = vllm_config