
Commit 63cb06a

add int4tensor support for safetensors

ghstack-source-id: cec24fc
Pull Request resolved: #3056

1 parent: f92b898

3 files changed: +25 -20 lines

test/prototype/safetensors/test_safetensors_support.py

Lines changed: 9 additions & 6 deletions
@@ -7,6 +7,8 @@
 from torch.testing._internal.common_utils import (
     TestCase,
     run_tests,
+    instantiate_parametrized_tests,
+    parametrize,
 )
 
 from torchao import quantize_
@@ -15,7 +17,7 @@
     unflatten_tensor_state_dict,
 )
 from torchao.quantization.granularity import PerRow
-from torchao.quantization.quant_api import Float8DynamicActivationFloat8WeightConfig
+from torchao.quantization.quant_api import Float8DynamicActivationFloat8WeightConfig, Int4WeightOnlyConfig
 from torchao.utils import (
     is_sm_at_least_89,
 )
@@ -36,13 +38,13 @@ def load_data(file_path: str, device: str):
 @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
 @unittest.skipIf(not is_sm_at_least_89(), "Need sm89+")
 class TestSafeTensors(TestCase):
-    def test_safetensors(self):
-        config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+    @parametrize("config", [Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()), Int4WeightOnlyConfig()])
+    def test_safetensors(self, config):
         model = torch.nn.Sequential(
-            torch.nn.Linear(32, 256, dtype=torch.bfloat16, device="cuda")
+            torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
         )
         quantize_(model, config)
-        example_inputs = (torch.randn(2, 32, dtype=torch.bfloat16, device="cuda"),)
+        example_inputs = (torch.randn(2, 128, dtype=torch.bfloat16, device="cuda"),)
         ref_output = model(*example_inputs)
 
         with tempfile.NamedTemporaryFile() as f:
@@ -54,12 +56,13 @@ def test_safetensors(self):
             )
 
         model = torch.nn.Sequential(
-            torch.nn.Linear(32, 256, dtype=torch.bfloat16, device="cuda")
+            torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
         )
         model.load_state_dict(reconstructed_dict, assign=True)
         output = model(*example_inputs)
         assert torch.equal(output, ref_output)
 
+instantiate_parametrized_tests(TestSafeTensors)
 
 if __name__ == "__main__":
     run_tests()
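For orientation, a minimal sketch of the round trip this test exercises with the new Int4 path. The exact shape of flatten_tensor_state_dict's return value (assumed here to be a (tensors, metadata) pair, mirroring how the test consumes it) is inferred from the diff fragments above, not a confirmed signature.

import tempfile

import torch
from safetensors import safe_open
from safetensors.torch import save_file

from torchao import quantize_
from torchao.prototype.safetensors.safetensors_support import (
    flatten_tensor_state_dict,
    unflatten_tensor_state_dict,
)
from torchao.quantization.quant_api import Int4WeightOnlyConfig

model = torch.nn.Sequential(
    torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
)
quantize_(model, Int4WeightOnlyConfig())

# Flatten the quantized state dict into plain tensors plus JSON metadata,
# since safetensors itself only serializes plain tensors.
tensors, metadata = flatten_tensor_state_dict(model.state_dict())

with tempfile.NamedTemporaryFile() as f:
    save_file(tensors, f.name, metadata=metadata)
    with safe_open(f.name, framework="pt", device="cuda") as g:
        loaded = {k: g.get_tensor(k) for k in g.keys()}
        loaded_metadata = g.metadata()

# Rebuild the Int4Tensor (or Float8Tensor) weights from the flat tensors.
reconstructed = unflatten_tensor_state_dict(loaded, loaded_metadata)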

torchao/prototype/safetensors/safetensors_support.py

Lines changed: 9 additions & 9 deletions
@@ -7,8 +7,9 @@
 from torchao.prototype.safetensors.safetensors_utils import (
     Float8TensorAttributeJSONEncoder,
     object_from_dict,
+    ALLOWED_TENSORS,
 )
-from torchao.quantization import Float8Tensor
+from torchao.quantization import Float8Tensor, Int4Tensor
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -76,12 +77,11 @@ def unflatten_tensor_state_dict(
 
         tensor_metadata = json.loads(metadata.get(tensor_name))
         tensor_type = tensor_metadata.get("_type")
-
-        if tensor_type == Float8Tensor.__name__:
+        if tensor_type == torch.Tensor.__name__:
+            result[tensor_name] = tensor_tensors["_data"]
+        elif tensor_type in ALLOWED_TENSORS:
             tensor_metadata["_data"].update(tensor_tensors)
             result[tensor_name] = object_from_dict(tensor_metadata)
-        elif tensor_type == torch.Tensor.__name__:
-            result[tensor_name] = tensor_tensors["_data"]
         else:
             raise ValueError(f"Unsupported tensor type: {tensor_type}")
 
@@ -140,15 +140,15 @@ def flatten_tensor_state_dict(
     tensors_data_dict = {}
 
     for tensor_name, tensor in tensors_dict.items():
-        if isinstance(tensor, Float8Tensor):
+        if type(tensor) is torch.Tensor:
+            tensor_dict = {"_data": tensor}
+            tensor_metadata = json.dumps({"_type": torch.Tensor.__name__})
+        elif tensor.__class__.__name__ in ALLOWED_TENSORS:
             tensor_dict = {}
             for tensor_data_name in tensor.tensor_data_names:
                 tensor_dict[tensor_data_name] = getattr(tensor, tensor_data_name)
 
             tensor_metadata = json.dumps(tensor, cls=Float8TensorAttributeJSONEncoder)
-        elif type(tensor) is torch.Tensor:
-            tensor_dict = {"_data": tensor}
-            tensor_metadata = json.dumps({"_type": torch.Tensor.__name__})
         else:
             raise ValueError(f"Unsupported tensor type: {type(tensor)}")
 
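One detail worth noting in the reordered flatten_tensor_state_dict branches: the plain-tensor case uses an exact type() check rather than isinstance(), because torchao's quantized tensors are themselves torch.Tensor subclasses and an isinstance() test would swallow them before the ALLOWED_TENSORS branch could run. A small illustration:

import torch
from torchao.quantization import Float8Tensor

t = torch.randn(4)
assert type(t) is torch.Tensor  # plain tensor takes the "_data" fast path

# Float8Tensor (and Int4Tensor) subclass torch.Tensor, so they are
# dispatched by class name through ALLOWED_TENSORS instead.
assert issubclass(Float8Tensor, torch.Tensor)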

torchao/prototype/safetensors/safetensors_utils.py

Lines changed: 7 additions & 5 deletions
@@ -6,34 +6,36 @@
 import torch
 
 import torchao
-from torchao.quantization import Float8Tensor
+from torchao.quantization import Float8Tensor, Int4Tensor
 from torchao.quantization.quantize_.common import KernelPreference
 from torchao.quantization.quantize_.workflows import QuantizeTensorToFloat8Kwargs
 
 ALLOWED_CLASSES = {
     "Float8Tensor": Float8Tensor,
+    "Int4Tensor": Int4Tensor,
     "Float8MMConfig": torchao.float8.inference.Float8MMConfig,
     "QuantizeTensorToFloat8Kwargs": QuantizeTensorToFloat8Kwargs,
     "PerRow": torchao.quantization.PerRow,
     "PerTensor": torchao.quantization.PerTensor,
     "KernelPreference": KernelPreference,
 }
 
-ALLOWED_TENSORS = ["Float8Tensor", "Tensor"]
+ALLOWED_TENSORS = ["Float8Tensor", "Int4Tensor", "Tensor"]
 
 __all__ = [
     "Float8TensorAttributeJSONEncoder",
     "object_from_dict",
     "is_metadata_torchao",
 ]
 
-
 class Float8TensorAttributeJSONEncoder(json.JSONEncoder):
     def default(self, o):
-        if isinstance(o, Float8Tensor):
+        if o.__class__.__name__ in ALLOWED_TENSORS:
             tensor_attr_dict = {}
+            optional_tensor_attributes = o.optional_tensor_attribute_names if hasattr(o, "optional_tensor_attribute_names") else []
+
             all_tensor_attributes = (
-                o.optional_tensor_attribute_names + o.tensor_attribute_names
+                optional_tensor_attributes + o.tensor_attribute_names
             )
 
             for tensor_attribute_name in all_tensor_attributes:
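The hasattr guard added to the encoder covers tensor classes that do not declare optional_tensor_attribute_names, falling back to an empty list. A minimal sketch of that fallback pattern, using a made-up stand-in class (TensorLike is hypothetical, not a torchao type):

class TensorLike:  # hypothetical stand-in for a tensor subclass
    tensor_attribute_names = ["block_size"]
    block_size = [1, 128]

obj = TensorLike()
# Same guard as in the encoder above: [] when the attribute is absent.
optional = obj.optional_tensor_attribute_names if hasattr(obj, "optional_tensor_attribute_names") else []
assert optional + obj.tensor_attribute_names == ["block_size"]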
