
Commit 392a504

enable torchao safetensors support
1 parent 9643eaf commit 392a504

File tree: 3 files changed (+36, -44 lines)

src/transformers/modeling_utils.py

Lines changed: 18 additions & 21 deletions
@@ -166,6 +166,8 @@
 else:
     IS_SAGEMAKER_MP_POST_1_10 = False
 
+from torchao.prototype.safetensors.safetensors_utils import is_metadata_dict_torchao
+
 
 logger = logging.get_logger(__name__)
 

@@ -496,10 +498,9 @@ def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True):
 
 def load_state_dict(
     checkpoint_file: Union[str, os.PathLike],
-    is_quantized: bool = False, #change to hf_quantizer (default is none)
+    is_quantized: bool = False,
     map_location: Optional[Union[str, torch.device]] = "cpu",
     weights_only: bool = True,
-    hf_quantizer: Optional[HfQuantizer] = None,
 ):
     """
     Reads a `safetensor` or a `.bin` checkpoint file. We load the checkpoint on "cpu" by default.

@@ -596,7 +597,7 @@ def set_initialized_submodules(model, state_dict_keys):
     return not_initialized_submodules
 
 
-def _end_ptr(tensor: torch.Tensor) -> int:
+def _end_ptr(tensor: torch.Tensor) -> int:
     # extract the end of the pointer if the tensor is a slice of a bigger tensor
     if tensor.nelement():
         stop = tensor.view(-1)[-1].data_ptr() + tensor.element_size()

@@ -728,7 +729,7 @@ def _load_state_dict_into_meta_model(
     keep_in_fp32_regex: Optional[re.Pattern] = None,
     unexpected_keys: Optional[list[str]] = None,  # passing `unexpected` for cleanup from quantization items
     device_mesh: Optional["torch.distributed.device_mesh.DeviceMesh"] = None,
-    metadata: Optional[dict] = None
+    metadata: Optional[dict] = None,
 ) -> tuple[Optional[dict], Optional[dict]]:
     """Load parameters from `meta_state_dict` into the model. The parameters of the `meta_state_dict` are on the meta
     device in order to easily infer the shapes and dtypes that they will have. Then proper parameters are then loaded

@@ -746,14 +747,13 @@ def _load_state_dict_into_meta_model(
     is_hqq_or_bnb_or_ao = is_quantized and hf_quantizer.quantization_config.quant_method in {
         QuantizationMethod.HQQ,
         QuantizationMethod.BITS_AND_BYTES,
-        QuantizationMethod.TORCHAO
+        QuantizationMethod.TORCHAO,
     }
     is_meta_state_dict = shard_file.endswith(".safetensors") and not is_hqq_or_bnb_or_ao
     file_pointer = None
     if is_meta_state_dict:
         file_pointer = safe_open(shard_file, framework="pt", device=tensor_device)
-
-    if hf_quantizer and hasattr(hf_quantizer, "transform_state_dict") and metadata:
+    if hf_quantizer and hasattr(hf_quantizer, "transform_state_dict") and is_metadata_dict_torchao(metadata):
         state_dict = hf_quantizer.transform_state_dict(state_dict, metadata)
 
     for param_name, empty_param in state_dict.items():
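The `is_metadata_dict_torchao(metadata)` gate above is what keeps non-torchao checkpoints on the existing fast path: only shards whose safetensors header carries torchao-style metadata go through `transform_state_dict`. A minimal sketch of that intent, outside the diff (the helper name `maybe_unflatten_torchao` is hypothetical, and in the real function the metadata arrives as the `metadata` argument rather than being re-read from the file):

from safetensors import safe_open
from torchao.prototype.safetensors.safetensors_utils import is_metadata_dict_torchao

def maybe_unflatten_torchao(shard_file, state_dict, hf_quantizer):
    # Read the user metadata stored in the safetensors header (assumption: this
    # is where torchao metadata lives when a shard was saved with it).
    with safe_open(shard_file, framework="pt", device="cpu") as f:
        metadata = f.metadata()
    # Only torchao-produced metadata triggers the reconstruction step.
    if hf_quantizer is not None and hasattr(hf_quantizer, "transform_state_dict") and is_metadata_dict_torchao(metadata):
        state_dict = hf_quantizer.transform_state_dict(state_dict, metadata)
    return state_dict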
@@ -787,8 +787,7 @@ def _load_state_dict_into_meta_model(
                     device_map=device_map,
                 )
             )
-        ):
-            # In this case, the param is already on the correct device!
+        ):  # In this case, the param is already on the correct device!
             shard_and_distribute_module(
                 model,
                 param,

@@ -938,7 +937,7 @@ def load_shard_file(args):
     # If shard_file is "", we use the existing state_dict instead of loading it
     if shard_file != "":
         state_dict = load_state_dict(
-            shard_file, is_quantized=is_quantized, map_location=map_location, weights_only=weights_only, hf_quantizer=hf_quantizer
+            shard_file, is_quantized=is_quantized, map_location=map_location, weights_only=weights_only
         )
 
     # Fix the key names

@@ -3987,11 +3986,11 @@ def save_pretrained(
             and hf_quantizer.is_serializable(safe_serialization=safe_serialization)
         )
 
-        # if hf_quantizer is not None and not _hf_peft_config_loaded and not quantization_serializable:
-        #     raise ValueError(
-        #         f"The model is quantized with {hf_quantizer.quantization_config.quant_method} and is not serializable - check out the warnings from"
-        #         " the logger on the traceback to understand the reason why the quantized model is not serializable."
-        #     )
+        if hf_quantizer is not None and not _hf_peft_config_loaded and not quantization_serializable:
+            raise ValueError(
+                f"The model is quantized with {hf_quantizer.quantization_config.quant_method} and is not serializable - check out the warnings from"
+                " the logger on the traceback to understand the reason why the quantized model is not serializable."
+            )
 
         if "save_config" in kwargs:
             warnings.warn(

@@ -4020,9 +4019,9 @@ def save_pretrained(
             repo_id = self._create_repo(repo_id, **kwargs)
             files_timestamps = self._get_files_timestamps(save_directory)
 
+        metadata = {}
         if hf_quantizer is not None:
             state_dict = hf_quantizer.get_state_dict(self)
-            metadata = {}
             if isinstance(state_dict, tuple):
                 state_dict, metadata = state_dict
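Initializing `metadata = {}` before the quantizer branch means every save path now has a metadata dict to hand to the safetensors writer, and a quantizer whose `get_state_dict` returns a `(state_dict, metadata)` tuple, as the torchao quantizer now does, gets its metadata unpacked. A sketch of how such metadata would typically be forwarded when a shard is written (the exact call site in `save_pretrained` is outside this hunk; safetensors metadata must be string-to-string):

from safetensors.torch import save_file

def save_shard(state_dict, filename, extra_metadata=None):
    # transformers normally records {"format": "pt"}; quantizer metadata is merged in.
    metadata = {"format": "pt"}
    if extra_metadata:
        metadata.update({k: str(v) for k, v in extra_metadata.items()})
    save_file(state_dict, filename, metadata=metadata)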

@@ -4171,8 +4170,7 @@ def save_pretrained(
             else:
                 ptrs[id_tensor_storage(tensor)].append(name)
 
-        # shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1}
-        shared_ptrs = {}
+        shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1}
 
         # Recursively descend to find tied weight keys
         _tied_weights_keys = _get_tied_weight_keys(self)

@@ -5095,6 +5093,7 @@ def from_pretrained(
         )
 
         from_pt = not (from_tf | from_flax)
+
         if from_pt:
             if gguf_file:
                 from .modeling_gguf_pytorch_utils import load_gguf_checkpoint

@@ -5113,7 +5112,6 @@ def from_pretrained(
             )
 
         config.name_or_path = pretrained_model_name_or_path
-
         model_init_context = cls.get_init_context(is_quantized, _is_ds_init_called)
         config = copy.deepcopy(config)  # We do not want to modify the config inplace in from_pretrained.
         with ContextManagers(model_init_context):

@@ -5448,7 +5446,7 @@ def _load_pretrained_model(
         is_hqq_or_bnb_or_ao = is_quantized and hf_quantizer.quantization_config.quant_method in {
             QuantizationMethod.HQQ,
             QuantizationMethod.BITS_AND_BYTES,
-            QuantizationMethod.TORCHAO
+            QuantizationMethod.TORCHAO,
         }
 
         # Get all the keys of the state dicts that we have to initialize the model

@@ -5567,7 +5565,6 @@ def _load_pretrained_model(
         if sharded_metadata is None:
             weight_map = dict.fromkeys(checkpoint_keys, checkpoint_files[0])
         else:
-            # weight file full path
             folder = os.path.sep.join(checkpoint_files[0].split(os.path.sep)[:-1])
             # Fix the weight map keys according to the key mapping
             weight_map = {

src/transformers/quantizers/quantizer_torchao.py

Lines changed: 9 additions & 8 deletions
@@ -35,9 +35,9 @@
     import torch
     import torch.nn as nn
 
-    from torchao.quantization import Float8Tensor
 
-    from torchao.prototype.safetensors.safetensors_support import save_tensor_state_dict, load_tensor_state_dict
+    from torchao.prototype.safetensors.safetensors_support import flatten_tensor_state_dict, unflatten_tensor_state_dict
+
 
 logger = logging.get_logger(__name__)
 

@@ -142,8 +142,7 @@ def update_dtype(self, dtype):
         return dtype
 
     def get_state_dict(self, model):
-        return save_tensor_state_dict(model.state_dict())
-
+        return flatten_tensor_state_dict(model.state_dict())
 
     def adjust_target_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
         if version.parse(importlib.metadata.version("accelerate")) > version.parse("0.19.0"):

@@ -228,7 +227,6 @@ def check_quantized_param(
             _QUANTIZABLE.append(torch.nn.Embedding)
         return isinstance(module, tuple(_QUANTIZABLE)) and (tensor_name == "weight")
 
-
     def create_quantized_param(
         self,
         model: "PreTrainedModel",

@@ -289,7 +287,7 @@ def create_quantized_param(
         quantize_(module, self.quantization_config.get_apply_tensor_subclass())
 
     def transform_state_dict(self, tensor_data, metadata):
-        return load_tensor_state_dict(tensor_data=tensor_data, provided_metadata=metadata)
+        return unflatten_tensor_state_dict(tensor_data, metadata)
 
     def _process_model_after_weight_loading(self, model, **kwargs):
         """No process required for torchao quantized model"""
@@ -309,10 +307,13 @@ def _process_model_after_weight_loading(self, model, **kwargs):
 
     def is_serializable(self, safe_serialization=None) -> bool:
         if safe_serialization:
+            from torchao.quantization import Float8WeightOnlyConfig
+
             logger.warning(
-                "torchao quantized model does not support safe serialization, please set `safe_serialization` to False"
+                "torchao quantized model only supports safe serialization for Float8WeightOnlyConfig, please set `safe_serialization` to False if you are using a different config"
             )
-            return False
+
+            return isinstance(self.quantization_config.quant_type, Float8WeightOnlyConfig)
         _is_torchao_serializable = version.parse(importlib.metadata.version("huggingface_hub")) >= version.parse(
             "0.25.0"
         )
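With this change, `is_serializable(safe_serialization=True)` returns True only when the config's `quant_type` is a `Float8WeightOnlyConfig`; any other torchao config still returns False (with the updated warning), and the re-enabled ValueError in `save_pretrained` then fails fast instead of silently writing a checkpoint that cannot be reloaded. Roughly (a sketch of the gate, not the exact quantizer code):

from torchao.quantization import Float8WeightOnlyConfig

def torchao_safe_serializable(quantization_config):
    # Sketch of the new gate: with safe_serialization=True, only
    # Float8WeightOnlyConfig-quantized models report themselves as serializable;
    # the non-safetensors path keeps its existing huggingface_hub version check.
    return isinstance(quantization_config.quant_type, Float8WeightOnlyConfig)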

tests/quantization/torchao_integration/test_torchao.py

Lines changed: 9 additions & 15 deletions
@@ -399,7 +399,7 @@ def test_autoquant(self):
 
         check_autoquantized(self, quantized_model.model.layers[0].self_attn.v_proj)
 
-        EXPECTED_OUTPUT = "What are we having for dinner?\n\nJane: (sighs)"
+        EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
         output = quantized_model.generate(
             **input_ids, max_new_tokens=self.max_new_tokens, cache_implementation="static"
         )

@@ -412,26 +412,21 @@ class TorchAoSerializationTest(unittest.TestCase):
     input_text = "What are we having for dinner?"
     max_new_tokens = 10
     model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-    quant_scheme = "int4_weight_only"
-    quant_scheme_kwargs = (
-        {"group_size": 32, "layout": Int4CPULayout()}
-        if is_torchao_available() and version.parse(importlib.metadata.version("torchao")) >= version.parse("0.8.0")
-        else {"group_size": 32}
-    )
     device = "cpu"
 
     # called only once for all test in this class
     @classmethod
     def setUpClass(cls):
         cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
-        cls.EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside"
+        cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
 
     def setUp(self):
-        self.quant_config = TorchAoConfig(self.quant_scheme, **self.quant_scheme_kwargs)
-        dtype = torch.bfloat16 if self.quant_scheme == "int4_weight_only" else "auto"
+        from torchao.quantization import Float8WeightOnlyConfig
+
+        self.quant_config = TorchAoConfig(Float8WeightOnlyConfig())
         self.quantized_model = AutoModelForCausalLM.from_pretrained(
             self.model_name,
-            dtype=dtype,
+            dtype=torch.bfloat16,
             device_map=self.device,
             quantization_config=self.quant_config,
         )

@@ -451,12 +446,11 @@ def check_serialization_expected_output(self, device, expected_output):
         """
         Test if we can serialize and load/infer the model again on the same device
         """
-        dtype = torch.bfloat16 if self.quant_scheme == "int4_weight_only" else "auto"
+        dtype = torch.bfloat16
        with tempfile.TemporaryDirectory() as tmpdirname:
-            self.quantized_model.save_pretrained(tmpdirname, safe_serialization=False)
+            self.quantized_model.save_pretrained(tmpdirname, safe_serialization=True)
             loaded_quantized_model = AutoModelForCausalLM.from_pretrained(tmpdirname, dtype=dtype, device_map=device)
             input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(device)
-
             output = loaded_quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
             self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), expected_output)

@@ -511,7 +505,7 @@ def setUpClass(cls):
         EXPECTED_OUTPUTS = Expectations(
             {
                 ("xpu", 3): "What are we having for dinner?\n\nJessica: (smiling)",
-                ("cuda", 7): "What are we having for dinner?\n- 1. What is the temperature outside",
+                ("cuda", 7): "What are we having for dinner?\n\nJessica: (smiling)",
             }
         )
         # fmt: on
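Taken together, the updated test exercises the feature end to end: quantize with `Float8WeightOnlyConfig`, save with `safe_serialization=True`, reload, and generate. A standalone sketch of the same flow (model name and generation settings taken from the test above):

import tempfile

import torch
from torchao.quantization import Float8WeightOnlyConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
quant_config = TorchAoConfig(Float8WeightOnlyConfig())
model = AutoModelForCausalLM.from_pretrained(
    model_name, dtype=torch.bfloat16, device_map="cpu", quantization_config=quant_config
)

with tempfile.TemporaryDirectory() as tmpdir:
    # Before this commit, torchao models had to be saved with safe_serialization=False.
    model.save_pretrained(tmpdir, safe_serialization=True)
    reloaded = AutoModelForCausalLM.from_pretrained(tmpdir, dtype=torch.bfloat16, device_map="cpu")
    inputs = tokenizer("What are we having for dinner?", return_tensors="pt")
    output = reloaded.generate(**inputs, max_new_tokens=10)
    print(tokenizer.decode(output[0], skip_special_tokens=True))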
