Register codebook quant ops

jerryzh168 · jerryzh168 · commit 0f7fa57f623e · 2025-04-01T10:54:06.000-07:00
Summary:
Register the codebook quant / dequant ops as custom ops so they can be recognized after export

Test Plan:
python test/prototype/test_codebook_quant.py -k test_export

Reviewers:

Subscribers:

Tasks:

Tags:
diff --git a/test/prototype/test_codebook_quant.py b/test/prototype/test_codebook_quant.py
@@ -69,6 +69,20 @@ def test_quantize_api(self):
         quantize_(m, codebook_weight_only())
         assert type(m[0].weight) == CodebookQuantizedTensor
 
+    def test_export(self):
+        m = torch.nn.Sequential(torch.nn.Linear(128, 64)).to(
+            dtype=torch.bfloat16, device="cuda"
+        )
+        quantize_(m, codebook_weight_only())
+        # quantize_(m, int4_weight_only(group_size=16))
+        example_inputs = (torch.randn(1, 128, dtype=torch.bfloat16, device="cuda"),)
+        print("m:", m)
+        # torchao.utils.unwrap_tensor_subclass(m)
+        m = torch.export.export_for_training(m, example_inputs).module()
+        print("m:", m)
+        targets = [n.target for n in m.graph.nodes]
+        self.assertTrue(torch.ops.quant.quantize_codebook.default in targets)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/torchao/prototype/quantization/codebook/codebook_ops.py b/torchao/prototype/quantization/codebook/codebook_ops.py
@@ -6,8 +6,13 @@
     _DTYPE_TO_QVALUE_BOUNDS,
     _SUB_BYTE_UINT_BOUNDS,
 )
+from torchao.utils import _register_custom_op
 
+quant_lib = torch.library.Library("quant", "FRAGMENT")
+register_custom_op = _register_custom_op(quant_lib)
 
+
+@register_custom_op
 def quantize_codebook(
     input: torch.Tensor,
     codebook: torch.Tensor,
@@ -90,20 +95,24 @@ def quantize_codebook(
     return codes.to(code_dtype)
 
 
+@register_custom_op
 def dequantize_codebook(
     codes: torch.Tensor,
     codebook: torch.Tensor,
+    input_dtype: torch.dtype,
     scales: torch.Tensor,
     output_dtype: torch.dtype = torch.float32,
 ) -> torch.Tensor:
     """
     Reconstructs the original tensor from codes and the codebook.
 
     Args:
-        codes (torch.Tensor): Indices of codebook entries for each block,
-                                          shape (d1//b1, d2//b2, ..., dN//bN).
+        codes (torch.Tensor): torch.int32 dtype, indices of codebook entries for each block,
+                              shape (d1//b1, d2//b2, ..., dN//bN).
         codebook (torch.Tensor): Codebook tensor used for quantization,
                                  shape (k, b1, b2, ..., bN) where b_i are block sizes.
+        input_dtype (torch.dtype): Input dtype for `codes`, used for downstream pattern matching
+                             and not enforced in `codes`. can be sub byte dtype like torch.uint4
         scales (torch.Tensor): Scales, shape (d1, d2, ..., dN // scale_block_size, 1).
         output_dtype (torch.dtype): dtype for the output tensor.
 
@@ -137,7 +146,7 @@ def dequantize_codebook(
     dequant = dequant.view(
         *new_shape
     )  # (d1, d2, ..., num_scale_blocks, scale_block_size)
-    dequant.mul_(scales)
+    dequant = dequant * scales
 
     dequant = dequant.view(*original_shape)
 
diff --git a/torchao/prototype/quantization/codebook/codebook_quantized_tensor.py b/torchao/prototype/quantization/codebook/codebook_quantized_tensor.py
@@ -91,12 +91,15 @@ def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor
             codes = self.codes.get_plain()
         else:
             codes = self.codes
+
         if codes.dtype != torch.int32:
             # TODO: Investigate and support not casting to torch.int32 for indexing to improve performance
             codes = codes.to(torch.int32)
+
         return dequantize_codebook(
             codes,
             self.codebook,
+            self.codes.dtype,
             self.scales,
             output_dtype=output_dtype,
         )
diff --git a/torchao/utils.py b/torchao/utils.py
@@ -205,13 +205,13 @@ def decorator(fn):
 
             # expecting fn.__name__ starts with `_` and we want to take the rest
             # to be the name of the custom op
-            assert (
-                fn.__name__[0] == "_"
-            ), f"Expecting function name starts with `_`, got {fn.__name__}"
             assert not any(
                 c in fn.__name__ for c in ".<>"
             ), f"Expecting op to be defined in normal functions, not lambda or local: {fn.__name__}"
-            op_name = fn.__name__[1:]
+            op_name = fn.__name__
+            if op_name[0] == "_":
+                op_name = op_name[1:]
+
             schema = op_name + infer_schema(fn, mutates_args={})
             lib.define(schema)
             lib.impl(op_name, fn, "CompositeImplicitAutograd")