Commit 9643eaf

enable torchao safetensors
1 parent 34595cf commit 9643eaf

File tree: 2 files changed (+50, -18 lines)

src/transformers/modeling_utils.py

Lines changed: 38 additions & 18 deletions
@@ -496,9 +496,10 @@ def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True):
 
 def load_state_dict(
     checkpoint_file: Union[str, os.PathLike],
-    is_quantized: bool = False,
+    is_quantized: bool = False,  # change to hf_quantizer (default is none)
     map_location: Optional[Union[str, torch.device]] = "cpu",
     weights_only: bool = True,
+    hf_quantizer: Optional[HfQuantizer] = None,
 ):
     """
     Reads a `safetensor` or a `.bin` checkpoint file. We load the checkpoint on "cpu" by default.
@@ -595,7 +596,7 @@ def set_initialized_submodules(model, state_dict_keys):
     return not_initialized_submodules
 
 
-def _end_ptr(tensor: torch.Tensor) -> int:
+def _end_ptr(tensor: torch.Tensor) -> int:
     # extract the end of the pointer if the tensor is a slice of a bigger tensor
     if tensor.nelement():
         stop = tensor.view(-1)[-1].data_ptr() + tensor.element_size()
@@ -727,6 +728,7 @@ def _load_state_dict_into_meta_model(
     keep_in_fp32_regex: Optional[re.Pattern] = None,
     unexpected_keys: Optional[list[str]] = None,  # passing `unexpected` for cleanup from quantization items
     device_mesh: Optional["torch.distributed.device_mesh.DeviceMesh"] = None,
+    metadata: Optional[dict] = None
 ) -> tuple[Optional[dict], Optional[dict]]:
     """Load parameters from `meta_state_dict` into the model. The parameters of the `meta_state_dict` are on the meta
     device in order to easily infer the shapes and dtypes that they will have. Then proper parameters are then loaded
@@ -741,15 +743,19 @@ def _load_state_dict_into_meta_model(
     device_map_regex = "|".join([re.escape(k) for k in sorted(device_map.keys(), reverse=True)])
 
     is_quantized = hf_quantizer is not None
-    is_hqq_or_bnb = is_quantized and hf_quantizer.quantization_config.quant_method in {
+    is_hqq_or_bnb_or_ao = is_quantized and hf_quantizer.quantization_config.quant_method in {
         QuantizationMethod.HQQ,
         QuantizationMethod.BITS_AND_BYTES,
+        QuantizationMethod.TORCHAO
     }
-    is_meta_state_dict = shard_file.endswith(".safetensors") and not is_hqq_or_bnb
+    is_meta_state_dict = shard_file.endswith(".safetensors") and not is_hqq_or_bnb_or_ao
     file_pointer = None
     if is_meta_state_dict:
         file_pointer = safe_open(shard_file, framework="pt", device=tensor_device)
 
+    if hf_quantizer and hasattr(hf_quantizer, "transform_state_dict") and metadata:
+        state_dict = hf_quantizer.transform_state_dict(state_dict, metadata)
+
     for param_name, empty_param in state_dict.items():
         if param_name not in expected_keys:  # when loading from ckpt, we skip param if doesnt exist in modeling
             continue
@@ -781,7 +787,8 @@ def _load_state_dict_into_meta_model(
                     device_map=device_map,
                 )
             )
-        ):  # In this case, the param is already on the correct device!
+        ):
+            # In this case, the param is already on the correct device!
             shard_and_distribute_module(
                 model,
                 param,
@@ -887,7 +894,7 @@ def load_shard_file(args):
         shard_file,
         state_dict,
         disk_only_shard_files,
-        is_hqq_or_bnb,
+        is_hqq_or_bnb_or_ao,
         is_quantized,
         device_map,
         hf_quantizer,
@@ -913,7 +920,7 @@ def load_shard_file(args):
     map_location = "cpu"
     if (
         shard_file.endswith(".safetensors")
-        and not is_hqq_or_bnb
+        and not is_hqq_or_bnb_or_ao
         and not (is_deepspeed_zero3_enabled() and not is_quantized)
     ):
         map_location = "meta"
@@ -931,11 +938,15 @@ def load_shard_file(args):
     # If shard_file is "", we use the existing state_dict instead of loading it
     if shard_file != "":
        state_dict = load_state_dict(
-            shard_file, is_quantized=is_quantized, map_location=map_location, weights_only=weights_only
+            shard_file, is_quantized=is_quantized, map_location=map_location, weights_only=weights_only, hf_quantizer=hf_quantizer
        )
 
     # Fix the key names
     state_dict = {key_renaming_mapping[k]: v for k, v in state_dict.items() if k in key_renaming_mapping}
+    metadata = None
+    if shard_file.endswith(".safetensors") and is_safetensors_available():
+        with safe_open(shard_file, framework="pt") as f:
+            metadata = f.metadata()
 
     error_msgs = []
 
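The loading side of this change hinges on the safetensors header metadata read in the hunk above. A minimal standalone sketch of that read (the shard file name is a placeholder):

# Sketch: safetensors keeps an optional str -> str metadata dict in the file
# header; torchao's serialized layout information travels there.
from safetensors import safe_open

with safe_open("model-00001-of-00002.safetensors", framework="pt") as f:  # hypothetical shard name
    metadata = f.metadata()  # dict[str, str] or None
print(metadata)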
@@ -959,6 +970,7 @@ def load_shard_file(args):
         keep_in_fp32_regex=keep_in_fp32_regex,
         unexpected_keys=unexpected_keys,
         device_mesh=device_mesh,
+        metadata=metadata,
     )
 
     return error_msgs, disk_offload_index, cpu_offload_index
@@ -3975,11 +3987,11 @@ def save_pretrained(
             and hf_quantizer.is_serializable(safe_serialization=safe_serialization)
         )
 
-        if hf_quantizer is not None and not _hf_peft_config_loaded and not quantization_serializable:
-            raise ValueError(
-                f"The model is quantized with {hf_quantizer.quantization_config.quant_method} and is not serializable - check out the warnings from"
-                " the logger on the traceback to understand the reason why the quantized model is not serializable."
-            )
+        # if hf_quantizer is not None and not _hf_peft_config_loaded and not quantization_serializable:
+        #     raise ValueError(
+        #         f"The model is quantized with {hf_quantizer.quantization_config.quant_method} and is not serializable - check out the warnings from"
+        #         " the logger on the traceback to understand the reason why the quantized model is not serializable."
+        #     )
 
         if "save_config" in kwargs:
             warnings.warn(
@@ -4010,6 +4022,10 @@ def save_pretrained(
 
         if hf_quantizer is not None:
             state_dict = hf_quantizer.get_state_dict(self)
+            metadata = {}
+            if isinstance(state_dict, tuple):
+                state_dict, metadata = state_dict
+
         # Only save the model itself if we are using distributed training
         model_to_save = unwrap_model(self)
         # save the string version of dtype to the config, e.g. convert torch.float32 => "float32"
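This hunk quietly widens the `get_state_dict` contract: a quantizer may now return either a plain state dict or a `(state_dict, metadata)` pair. A defensive-unpacking sketch of that convention (`hf_quantizer` and `model` are assumed to exist in scope):

# Sketch of the convention save_pretrained now assumes: get_state_dict may
# return a bare dict or a (state_dict, metadata) tuple.
result = hf_quantizer.get_state_dict(model)
if isinstance(result, tuple):
    state_dict, metadata = result
else:
    state_dict, metadata = result, {}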
@@ -4155,7 +4171,8 @@ def save_pretrained(
                 else:
                     ptrs[id_tensor_storage(tensor)].append(name)
 
-            shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1}
+            # shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1}
+            shared_ptrs = {}
 
             # Recursively descend to find tied weight keys
             _tied_weights_keys = _get_tied_weight_keys(self)
@@ -4286,7 +4303,8 @@ def save_pretrained(
             if safe_serialization:
                 # At some point we will need to deal better with save_function (used for TPU and other distributed
                 # joyfulness), but for now this enough.
-                safe_save_file(shard, os.path.join(save_directory, shard_file), metadata={"format": "pt"})
+                metadata["format"] = "pt"
+                safe_save_file(shard, os.path.join(save_directory, shard_file), metadata=metadata)
             else:
                 save_function(shard, os.path.join(save_directory, shard_file))
 
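`safe_save_file` here is `safetensors.torch.save_file`, whose metadata argument only accepts string keys and string values, so whatever the quantizer returns must already be serialized (e.g. to JSON strings) before it is merged with `{"format": "pt"}`. A self-contained sketch with a made-up metadata key:

import torch
from safetensors.torch import save_file

tensors = {"linear.weight": torch.zeros(4, 4)}
# Values must be strings; "tensor_layout" is a hypothetical key standing in for
# whatever the torchao quantizer writes into the header.
metadata = {"format": "pt", "tensor_layout": "{}"}
save_file(tensors, "model.safetensors", metadata=metadata)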
@@ -5077,7 +5095,6 @@ def from_pretrained(
         )
 
         from_pt = not (from_tf | from_flax)
-
         if from_pt:
             if gguf_file:
                 from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
@@ -5096,6 +5113,7 @@ def from_pretrained(
         )
 
         config.name_or_path = pretrained_model_name_or_path
+
         model_init_context = cls.get_init_context(is_quantized, _is_ds_init_called)
         config = copy.deepcopy(config)  # We do not want to modify the config inplace in from_pretrained.
         with ContextManagers(model_init_context):
@@ -5427,9 +5445,10 @@ def _load_pretrained_model(
         QuantizationMethod.HQQ,
         QuantizationMethod.QUARK,
     }
-    is_hqq_or_bnb = is_quantized and hf_quantizer.quantization_config.quant_method in {
+    is_hqq_or_bnb_or_ao = is_quantized and hf_quantizer.quantization_config.quant_method in {
         QuantizationMethod.HQQ,
         QuantizationMethod.BITS_AND_BYTES,
+        QuantizationMethod.TORCHAO
     }
 
     # Get all the keys of the state dicts that we have to initialize the model
@@ -5548,6 +5567,7 @@ def _load_pretrained_model(
     if sharded_metadata is None:
         weight_map = dict.fromkeys(checkpoint_keys, checkpoint_files[0])
     else:
+        # weight file full path
         folder = os.path.sep.join(checkpoint_files[0].split(os.path.sep)[:-1])
         # Fix the weight map keys according to the key mapping
         weight_map = {
@@ -5602,7 +5622,7 @@ def _load_pretrained_model(
             shard_file,
             state_dict,
             disk_only_shard_files,
-            is_hqq_or_bnb,
+            is_hqq_or_bnb_or_ao,
             is_quantized,
             device_map,
             hf_quantizer,

src/transformers/quantizers/quantizer_torchao.py

Lines changed: 12 additions & 0 deletions
@@ -35,6 +35,10 @@
     import torch
     import torch.nn as nn
 
+    from torchao.quantization import Float8Tensor
+
+    from torchao.prototype.safetensors.safetensors_support import save_tensor_state_dict, load_tensor_state_dict
+
 logger = logging.get_logger(__name__)
 
 
@@ -137,6 +141,10 @@ def update_dtype(self, dtype):
             dtype = torch.float32
         return dtype
 
+    def get_state_dict(self, model):
+        return save_tensor_state_dict(model.state_dict())
+
+
     def adjust_target_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
         if version.parse(importlib.metadata.version("accelerate")) > version.parse("0.19.0"):
             from accelerate.utils import CustomDtype
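Based only on how these helpers are called in this commit, the torchao prototype pair appears to round-trip a quantized state dict through plain tensors plus string metadata. A hedged sketch; the tuple return shape is an assumption inferred from the `save_pretrained` unpacking above:

from torchao.prototype.safetensors.safetensors_support import (
    load_tensor_state_dict,
    save_tensor_state_dict,
)

# `model` is assumed to be a torchao-quantized nn.Module (e.g. Float8Tensor weights).
tensor_data, metadata = save_tensor_state_dict(model.state_dict())
restored = load_tensor_state_dict(tensor_data=tensor_data, provided_metadata=metadata)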
@@ -220,6 +228,7 @@ def check_quantized_param(
             _QUANTIZABLE.append(torch.nn.Embedding)
         return isinstance(module, tuple(_QUANTIZABLE)) and (tensor_name == "weight")
 
+
     def create_quantized_param(
         self,
         model: "PreTrainedModel",
@@ -279,6 +288,9 @@ def create_quantized_param(
 
         quantize_(module, self.quantization_config.get_apply_tensor_subclass())
 
+    def transform_state_dict(self, tensor_data, metadata):
+        return load_tensor_state_dict(tensor_data=tensor_data, provided_metadata=metadata)
+
     def _process_model_after_weight_loading(self, model, **kwargs):
         """No process required for torchao quantized model"""
         if self.quantization_config.quant_type == "autoquant":
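Taken together, the two files enable a safetensors round trip for torchao-quantized checkpoints. An end-to-end sketch of the intended flow; the model name and quantization config are placeholders, and the exact torchao config class depends on the installed torchao version:

from torchao.quantization import Float8WeightOnlyConfig  # assumed torchao config class
from transformers import AutoModelForCausalLM, TorchAoConfig

quant_config = TorchAoConfig(quant_type=Float8WeightOnlyConfig())
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",  # placeholder model
    quantization_config=quant_config,
    torch_dtype="bfloat16",
)

# save_pretrained now calls TorchAoHfQuantizer.get_state_dict(), which returns
# (state_dict, metadata); the metadata lands in the safetensors header.
model.save_pretrained("opt-125m-torchao-fp8", safe_serialization=True)

# On reload, load_shard_file reads that header metadata and hands it to
# transform_state_dict(), which rebuilds the quantized tensor subclasses.
reloaded = AutoModelForCausalLM.from_pretrained("opt-125m-torchao-fp8")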
