Commit 6c79b56

enable torchao safetensors support
1 parent 75976a6 commit 6c79b56

4 files changed, +121 -52 lines changed

src/transformers/modeling_utils.py

Lines changed: 18 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -496,10 +496,9 @@ def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True):
496496

497497
def load_state_dict(
498498
checkpoint_file: Union[str, os.PathLike],
499-
is_quantized: bool = False, #change to hf_quantizer (default is none)
499+
is_quantized: bool = False,
500500
map_location: Optional[Union[str, torch.device]] = "cpu",
501501
weights_only: bool = True,
502-
hf_quantizer: Optional[HfQuantizer] = None,
503502
):
504503
"""
505504
Reads a `safetensor` or a `.bin` checkpoint file. We load the checkpoint on "cpu" by default.
@@ -596,7 +595,7 @@ def set_initialized_submodules(model, state_dict_keys):
596595
return not_initialized_submodules
597596

598597

599-
def _end_ptr(tensor: torch.Tensor) -> int:
598+
def _end_ptr(tensor: torch.Tensor) -> int:
600599
# extract the end of the pointer if the tensor is a slice of a bigger tensor
601600
if tensor.nelement():
602601
stop = tensor.view(-1)[-1].data_ptr() + tensor.element_size()
@@ -728,7 +727,6 @@ def _load_state_dict_into_meta_model(
728727
keep_in_fp32_regex: Optional[re.Pattern] = None,
729728
unexpected_keys: Optional[list[str]] = None, # passing `unexpected` for cleanup from quantization items
730729
device_mesh: Optional["torch.distributed.device_mesh.DeviceMesh"] = None,
731-
metadata: Optional[dict] = None
732730
) -> tuple[Optional[dict], Optional[dict]]:
733731
"""Load parameters from `meta_state_dict` into the model. The parameters of the `meta_state_dict` are on the meta
734732
device in order to easily infer the shapes and dtypes that they will have. Then proper parameters are then loaded
@@ -746,16 +744,13 @@ def _load_state_dict_into_meta_model(
746744
is_hqq_or_bnb_or_ao = is_quantized and hf_quantizer.quantization_config.quant_method in {
747745
QuantizationMethod.HQQ,
748746
QuantizationMethod.BITS_AND_BYTES,
749-
QuantizationMethod.TORCHAO
747+
QuantizationMethod.TORCHAO,
750748
}
751749
is_meta_state_dict = shard_file.endswith(".safetensors") and not is_hqq_or_bnb_or_ao
752750
file_pointer = None
753751
if is_meta_state_dict:
754752
file_pointer = safe_open(shard_file, framework="pt", device=tensor_device)
755753

756-
if hf_quantizer and hasattr(hf_quantizer, "transform_state_dict") and metadata:
757-
state_dict = hf_quantizer.transform_state_dict(state_dict, metadata)
758-
759754
for param_name, empty_param in state_dict.items():
760755
if param_name not in expected_keys: # when loading from ckpt, we skip param if doesnt exist in modeling
761756
continue
@@ -787,8 +782,7 @@ def _load_state_dict_into_meta_model(
787782
device_map=device_map,
788783
)
789784
)
790-
):
791-
# In this case, the param is already on the correct device!
785+
): # In this case, the param is already on the correct device!
792786
shard_and_distribute_module(
793787
model,
794788
param,
@@ -938,7 +932,7 @@ def load_shard_file(args):
938932
# If shard_file is "", we use the existing state_dict instead of loading it
939933
if shard_file != "":
940934
state_dict = load_state_dict(
941-
shard_file, is_quantized=is_quantized, map_location=map_location, weights_only=weights_only, hf_quantizer=hf_quantizer
935+
shard_file, is_quantized=is_quantized, map_location=map_location, weights_only=weights_only
942936
)
943937

944938
# Fix the key names
@@ -948,6 +942,9 @@ def load_shard_file(args):
948942
with safe_open(shard_file, framework="pt") as f:
949943
metadata = f.metadata()
950944

945+
if hf_quantizer:
946+
state_dict = hf_quantizer.update_state_dict_with_metadata(state_dict, metadata)
947+
951948
error_msgs = []
952949

953950
if is_deepspeed_zero3_enabled() and not is_quantized:
@@ -970,7 +967,6 @@ def load_shard_file(args):
970967
keep_in_fp32_regex=keep_in_fp32_regex,
971968
unexpected_keys=unexpected_keys,
972969
device_mesh=device_mesh,
973-
metadata=metadata,
974970
)
975971

976972
return error_msgs, disk_offload_index, cpu_offload_index
@@ -3994,11 +3990,11 @@ def save_pretrained(
39943990
and hf_quantizer.is_serializable(safe_serialization=safe_serialization)
39953991
)
39963992

3997-
# if hf_quantizer is not None and not _hf_peft_config_loaded and not quantization_serializable:
3998-
# raise ValueError(
3999-
# f"The model is quantized with {hf_quantizer.quantization_config.quant_method} and is not serializable - check out the warnings from"
4000-
# " the logger on the traceback to understand the reason why the quantized model is not serializable."
4001-
# )
3993+
if hf_quantizer is not None and not _hf_peft_config_loaded and not quantization_serializable:
3994+
raise ValueError(
3995+
f"The model is quantized with {hf_quantizer.quantization_config.quant_method} and is not serializable - check out the warnings from"
3996+
" the logger on the traceback to understand the reason why the quantized model is not serializable."
3997+
)
40023998

40033999
if "save_config" in kwargs:
40044000
warnings.warn(
@@ -4029,10 +4025,8 @@ def save_pretrained(
40294025

40304026
metadata = {}
40314027
if hf_quantizer is not None:
4032-
state_dict = hf_quantizer.get_state_dict(self)
4033-
metadata = {}
4034-
if isinstance(state_dict, tuple):
4035-
state_dict, metadata = state_dict
4028+
state_dict, metadata = hf_quantizer.get_state_dict_and_metadata(self, safe_serialization)
4029+
metadata["format"] = "pt"
40364030

40374031
# Only save the model itself if we are using distributed training
40384032
model_to_save = unwrap_model(self)
@@ -4180,8 +4174,7 @@ def save_pretrained(
41804174
else:
41814175
ptrs[id_tensor_storage(tensor)].append(name)
41824176

4183-
# shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1}
4184-
shared_ptrs = {}
4177+
shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1}
41854178

41864179
# Recursively descend to find tied weight keys
41874180
_tied_weights_keys = _get_tied_weight_keys(self)
@@ -4312,7 +4305,6 @@ def save_pretrained(
43124305
if safe_serialization:
43134306
# At some point we will need to deal better with save_function (used for TPU and other distributed
43144307
# joyfulness), but for now this enough.
4315-
metadata["format"] = "pt"
43164308
safe_save_file(shard, os.path.join(save_directory, shard_file), metadata=metadata)
43174309
else:
43184310
save_function(shard, os.path.join(save_directory, shard_file))
@@ -4808,7 +4800,6 @@ def from_pretrained(
48084800

48094801
if distributed_config is not None:
48104802
tp_plan = "auto"
4811-
48124803
# Not used anymore -- remove them from the kwargs
48134804
_ = kwargs.pop("resume_download", None)
48144805
_ = kwargs.pop("mirror", None)
@@ -4960,7 +4951,6 @@ def from_pretrained(
49604951
"Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` "
49614952
"requires `accelerate`. You can install it with `pip install accelerate`"
49624953
)
4963-
49644954
# handling bnb config from kwargs, remove after `load_in_{4/8}bit` deprecation.
49654955
if load_in_4bit or load_in_8bit:
49664956
if quantization_config is not None:
@@ -5030,7 +5020,6 @@ def from_pretrained(
50305020
"(*.safetensors) nor a safetensors index file (*.safetensors.index.json): "
50315021
f"{transformers_explicit_filename}"
50325022
)
5033-
50345023
hf_quantizer, config, dtype, device_map = get_hf_quantizer(
50355024
config, quantization_config, dtype, from_tf, from_flax, device_map, weights_only, user_agent
50365025
)
@@ -5103,6 +5092,7 @@ def from_pretrained(
51035092
)
51045093

51055094
from_pt = not (from_tf | from_flax)
5095+
51065096
if from_pt:
51075097
if gguf_file:
51085098
from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
@@ -5121,7 +5111,6 @@ def from_pretrained(
51215111
)
51225112

51235113
config.name_or_path = pretrained_model_name_or_path
5124-
51255114
model_init_context = cls.get_init_context(is_quantized, _is_ds_init_called)
51265115
config = copy.deepcopy(config) # We do not want to modify the config inplace in from_pretrained.
51275116
with ContextManagers(model_init_context):
@@ -5449,7 +5438,7 @@ def _load_pretrained_model(
54495438
is_hqq_or_bnb_or_ao = is_quantized and hf_quantizer.quantization_config.quant_method in {
54505439
QuantizationMethod.HQQ,
54515440
QuantizationMethod.BITS_AND_BYTES,
5452-
QuantizationMethod.TORCHAO
5441+
QuantizationMethod.TORCHAO,
54535442
}
54545443

54555444
# Get all the keys of the state dicts that we have to initialize the model
@@ -5568,7 +5557,6 @@ def _load_pretrained_model(
55685557
if sharded_metadata is None:
55695558
weight_map = dict.fromkeys(checkpoint_keys, checkpoint_files[0])
55705559
else:
5571-
# weight file full path
55725560
folder = os.path.sep.join(checkpoint_files[0].split(os.path.sep)[:-1])
55735561
# Fix the weight map keys according to the key mapping
55745562
weight_map = {
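The save and load paths above communicate only through the safetensors per-file metadata: save_pretrained stamps metadata["format"] = "pt" plus whatever the quantizer returns, and load_shard_file reads the metadata back and hands it to update_state_dict_with_metadata. A standalone sketch of that round-trip using the plain safetensors API (the tensor name and the extra metadata key are made up for illustration; safetensors metadata is a flat str-to-str dict):

import torch
from safetensors import safe_open
from safetensors.torch import save_file

state_dict = {"linear.weight": torch.randn(4, 4)}
metadata = {"format": "pt", "quantizer_hint": "flattened"}  # "quantizer_hint" is a hypothetical key

# Save tensors with metadata, the same channel save_pretrained uses via safe_save_file
save_file(state_dict, "shard.safetensors", metadata=metadata)

with safe_open("shard.safetensors", framework="pt") as f:
    loaded_metadata = f.metadata()  # what load_shard_file passes to the quantizer
    loaded = {key: f.get_tensor(key) for key in f.keys()}

assert loaded_metadata["format"] == "pt"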

src/transformers/quantizers/base.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,9 @@ def get_state_dict_and_metadata(self, model, safe_serialization=False):
342342
"""Get state dict and metadata. Useful when we need to modify a bit the state dict due to quantization"""
343343
return None, {}
344344

345+
def update_state_dict_with_metadata(self, state_dict, metadata):
346+
return state_dict
347+
345348
@abstractmethod
346349
def _process_model_before_weight_loading(self, model, **kwargs): ...
347350
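The new default makes the hook a no-op, so only quantizers that rewrite their state dict at save time need to override it. The two methods form a symmetric extension point: get_state_dict_and_metadata may transform the state dict when saving and return metadata describing the transformation, and update_state_dict_with_metadata undoes it when loading. A hypothetical subclass (illustration only, not part of this commit) might pair them like this:

class MyQuantizer(HfQuantizer):
    # Hypothetical quantizer sketch: flatten at save time, restore at load time.
    MARKER = {"my_quantizer": "v1"}  # metadata values must be strings

    def get_state_dict_and_metadata(self, model, safe_serialization=False):
        state_dict = dict(model.state_dict())
        # ... replace tensor subclasses with plain tensors here ...
        return state_dict, dict(self.MARKER)

    def update_state_dict_with_metadata(self, state_dict, metadata):
        if metadata and metadata.get("my_quantizer") == "v1":
            # ... rebuild the tensor subclasses from the plain tensors ...
            pass
        return state_dict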

src/transformers/quantizers/quantizer_torchao.py

Lines changed: 48 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,16 @@
3535
import torch
3636
import torch.nn as nn
3737

38-
from torchao.quantization import Float8Tensor
38+
if is_torchao_available():
39+
import torchao
40+
41+
if version.parse(importlib.metadata.version("torchao")) >= version.parse("0.14.0"):
42+
from torchao.prototype.safetensors.safetensors_support import (
43+
flatten_tensor_state_dict,
44+
unflatten_tensor_state_dict,
45+
)
46+
from torchao.prototype.safetensors.safetensors_utils import is_metadata_torchao
3947

40-
from torchao.prototype.safetensors.safetensors_support import save_tensor_state_dict, load_tensor_state_dict
4148

4249
logger = logging.get_logger(__name__)
4350

@@ -85,6 +92,13 @@ def _linear_extra_repr(self):
8592
return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight={weight}"
8693

8794

95+
if is_torchao_available():
96+
SUPPORTED_SAFE_SERIALIZATION_CONFIGS = [
97+
torchao.quantization.Float8WeightOnlyConfig,
98+
torchao.quantization.Float8DynamicActivationFloat8WeightConfig,
99+
]
100+
101+
88102
class TorchAoHfQuantizer(HfQuantizer):
89103
"""
90104
Quantizer for torchao: https://github.com/pytorch/ao/
@@ -141,9 +155,19 @@ def update_dtype(self, dtype):
141155
dtype = torch.float32
142156
return dtype
143157

144-
def get_state_dict(self, model):
145-
return save_tensor_state_dict(model.state_dict())
146-
158+
def get_state_dict_and_metadata(self, model, safe_serialization: Optional[bool] = False):
159+
'''
160+
If the model is safe serializable, we flatten the state dict of tensor subclasses so that it is compatible with
161+
the safetensors format.
162+
'''
163+
if (
164+
type(self.quantization_config.quant_type) in SUPPORTED_SAFE_SERIALIZATION_CONFIGS
165+
and safe_serialization
166+
and version.parse(importlib.metadata.version("torchao")) >= version.parse("0.14.0")
167+
):
168+
return flatten_tensor_state_dict(model.state_dict())
169+
else:
170+
return super().get_state_dict_and_metadata(model)
147171

148172
def adjust_target_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
149173
if version.parse(importlib.metadata.version("accelerate")) > version.parse("0.19.0"):
@@ -228,7 +252,6 @@ def check_quantized_param(
228252
_QUANTIZABLE.append(torch.nn.Embedding)
229253
return isinstance(module, tuple(_QUANTIZABLE)) and (tensor_name == "weight")
230254

231-
232255
def create_quantized_param(
233256
self,
234257
model: "PreTrainedModel",
@@ -288,8 +311,17 @@ def create_quantized_param(
288311

289312
quantize_(module, self.quantization_config.get_apply_tensor_subclass())
290313

291-
def transform_state_dict(self, tensor_data, metadata):
292-
return load_tensor_state_dict(tensor_data=tensor_data, provided_metadata=metadata)
314+
def update_state_dict_with_metadata(self, state_dict, metadata):
315+
'''
316+
If the metadata contains torchao tensor subclass information, we reconstruct the tensor subclass state dict
317+
from the provided state_dict and metadata.
318+
'''
319+
if version.parse(importlib.metadata.version("torchao")) >= version.parse("0.14.0") and is_metadata_torchao(
320+
metadata
321+
):
322+
return unflatten_tensor_state_dict(state_dict, metadata)
323+
else:
324+
return state_dict
293325

294326
def _process_model_after_weight_loading(self, model, **kwargs):
295327
"""No process required for torchao quantized model"""
@@ -309,10 +341,15 @@ def _process_model_after_weight_loading(self, model, **kwargs):
309341

310342
def is_serializable(self, safe_serialization=None) -> bool:
311343
if safe_serialization:
312-
logger.warning(
313-
"torchao quantized model does not support safe serialization, please set `safe_serialization` to False"
344+
_is_torchao_serializable = (
345+
type(self.quantization_config.quant_type) in SUPPORTED_SAFE_SERIALIZATION_CONFIGS
314346
)
315-
return False
347+
if not _is_torchao_serializable:
348+
logger.warning(
349+
f"torchao quantized model only supports safe serialization for {SUPPORTED_SAFE_SERIALIZATION_CONFIGS}, please set `safe_serialization` to False if you are using a different config"
350+
)
351+
return _is_torchao_serializable
352+
316353
_is_torchao_serializable = version.parse(importlib.metadata.version("huggingface_hub")) >= version.parse(
317354
"0.25.0"
318355
)
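End to end, this commit makes save_pretrained(..., safe_serialization=True) work for the two supported torchao float8 configs: the quantizer flattens the tensor subclasses into plain tensors plus metadata on save, and unflattens them on load. A usage sketch (the model id and output path are placeholders; assumes torchao >= 0.14.0 is installed):

import torch
from transformers import AutoModelForCausalLM, TorchAoConfig
from torchao.quantization import Float8WeightOnlyConfig

quant_config = TorchAoConfig(quant_type=Float8WeightOnlyConfig())
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",  # placeholder model id
    dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quant_config,
)

# Previously is_serializable() always returned False for safe serialization of
# torchao models; with this commit the float8 configs above save to safetensors.
model.save_pretrained("./model-fp8", safe_serialization=True)
reloaded = AutoModelForCausalLM.from_pretrained("./model-fp8", device_map="auto")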
