
Commit 62d3689

Author: Cui, Yuxin
Fix FX Graph Cache issue in register_da8w4_concat_linear_cpu_pass
Fix the bug where the FX Graph Cache was bypassed when using register_da8w4_concat_linear_cpu_pass, preventing cache hits on subsequent model runs. Implement DA8W4ConcatLinearCPUPass, which inherits from CustomGraphPass so the pass can be serialized and the compiled FX graph saved properly. Add a unit test: when the FX graph is saved, the fxgraph_cache_bypass counter should remain at 0, confirming that the custom pass is no longer rejected by the cache system.

Signed-off-by: Cui, Yuxin <yuxin.cui@intel.com>
1 parent bc2c83e commit 62d3689
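
For context: Inductor's FX graph cache hashes every post-grad custom pass into its cache key, and a bare function assigned to post_grad_custom_post_pass cannot be hashed, so Inductor bypasses the cache entirely. A minimal sketch of the fix pattern this commit uses (the pass name here is illustrative, not from the commit):

import torch
from torch._inductor import config as inductor_config
from torch._inductor.custom_graph_pass import CustomGraphPass, get_hash_for_files

class MyCacheablePass(CustomGraphPass):  # illustrative name
    def __call__(self, graph: torch.fx.Graph) -> None:
        # A real pass would rewrite graph nodes in place here.
        pass

    def uuid(self):
        # Hash this source file so the cache key changes whenever
        # the pass implementation changes.
        return get_hash_for_files((__file__,))

# An instance with a stable uuid() can be hashed into the cache key,
# unlike a bare function.
inductor_config.post_grad_custom_post_pass = MyCacheablePass()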

File tree

3 files changed: +19 −4 lines

test/quantization/test_da8w4_cpu.py (9 additions, 1 deletion)

@@ -8,6 +8,7 @@
 import unittest
 
 import torch
+from torch._dynamo.utils import counters
 from torch.testing._internal import common_utils
 from torch.testing._internal.common_utils import (
     TestCase,
@@ -120,7 +121,6 @@ def test_8da4w_cpu(self, dtype, x_dim, bias, bs, sym_quant_a):
     @common_utils.parametrize("x_dim", [2, 3])
     @common_utils.parametrize("bias", [True, False])
     def test_8da4w_concat_linear_cpu(self, x_dim, bias):
-        self.skipTest("Disabled for now")
         N, K = 64, 128
 
         class Mod(torch.nn.Module):
@@ -163,6 +163,11 @@ def forward(self, x):
             # ensure the expected op occurs only once in the code after fusion
             # The trailing "(" is to avoid matching the op in the comment
             assert code[0].count("torch.ops.torchao.da8w4_linear_cpu.default(") == 1
+
+            # ensure the custom DA8W4ConcatLinearCPUPass is properly cached as fxgraph
+            enable_fxgraph_cache_bypass = counters["inductor"]["fxgraph_cache_bypass"]
+            assert enable_fxgraph_cache_bypass == 0
+
         with torch._inductor.config.patch(
             {"freezing": True, "cpp.enable_concat_linear": False}
         ):
@@ -172,6 +177,9 @@ def forward(self, x):
             )
             assert torch.allclose(y, y_ref)
 
+            disable_fxgraph_cache_bypass = counters["inductor"]["fxgraph_cache_bypass"]
+            assert disable_fxgraph_cache_bypass == 0
+
 
 common_utils.instantiate_parametrized_tests(TestDa8w4Cpu)
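
The counter the test reads is a plain torch._dynamo counter, so the same check works in any harness. A minimal sketch with a generic model (not the quantized module from the test above):

import torch
from torch._dynamo.utils import counters

counters.clear()
compiled = torch.compile(torch.nn.Linear(8, 8))
compiled(torch.randn(2, 8))
# 0 means the FX graph cache was never bypassed; an unhashable custom
# pass would have incremented this counter during compilation.
assert counters["inductor"]["fxgraph_cache_bypass"] == 0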

torchao/dtypes/uintx/dyn_int8_act_int4_wei_cpu_layout.py (2 additions, 2 deletions)

@@ -314,6 +314,6 @@ def _linear_int8_act_int4_weight_cpu_impl(input_tensor, weight_tensor, bias):
 
 
 # Register the concat linear fusion pass
-# from ...prototype.inductor.fx_passes import register_da8w4_concat_linear_cpu_pass
+from ...prototype.inductor.fx_passes import register_da8w4_concat_linear_cpu_pass
 
-# register_da8w4_concat_linear_cpu_pass()
+register_da8w4_concat_linear_cpu_pass()

torchao/prototype/inductor/fx_passes/da8w4_concat_linear_fusion_cpu.py (8 additions, 1 deletion)

@@ -7,7 +7,13 @@
 import operator
 
 import torch
+from torch._inductor.custom_graph_pass import CustomGraphPass, get_hash_for_files
 
+class DA8W4ConcatLinearCPUPass(CustomGraphPass):
+    def __call__(self, graph: torch.fx.Graph):
+        _concat_linear_dq8w4_cpu(graph)
+    def uuid(self):
+        return get_hash_for_files((__file__,))
 
 # Inductor FX passes for concat linear for DA8W4
 def _is_valid_concat_linear_da8w4_fusion(computation_nodes):
@@ -213,4 +219,5 @@ def ...
 def register_da8w4_concat_linear_cpu_pass():
     from torch._inductor import config as inductor_config
 
-    inductor_config.post_grad_custom_post_pass = _concat_linear_dq8w4_cpu
+    da8w4_concat_linear_cpu_pass = DA8W4ConcatLinearCPUPass()
+    inductor_config.post_grad_custom_post_pass = da8w4_concat_linear_cpu_pass
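
End to end, the registration runs when the layout module above is imported. A hedged sketch of what a cache-friendly compile then looks like (the Linear module is a placeholder, not a DA8W4-quantized model):

import torch
from torchao.prototype.inductor.fx_passes import register_da8w4_concat_linear_cpu_pass

# Install the custom post-grad pass on Inductor's config.
register_da8w4_concat_linear_cpu_pass()

m = torch.nn.Linear(128, 64)  # placeholder module for illustration
compiled = torch.compile(m)
_ = compiled(torch.randn(2, 128))
# A second run in a fresh process can now hit the FX graph cache, since
# DA8W4ConcatLinearCPUPass.uuid() gives the pass a stable hash.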
