fix AutoModel.from_pretrained(..., torch_dtype=...) #13209

Merged, 5 commits, Aug 24, 2021
28 changes: 27 additions & 1 deletion src/transformers/configuration_utils.py
@@ -30,6 +30,7 @@
hf_bucket_url,
is_offline_mode,
is_remote_url,
is_torch_available,
)
from .utils import logging

@@ -207,6 +208,9 @@ class PretrainedConfig(PushToHubMixin):
this attribute contains just the floating type string without the ``torch.`` prefix. For example, for
``torch.float16`` ``torch_dtype`` is the ``"float16"`` string.

This attribute is currently not being used during model loading time, but this may change in future
versions. We can already start preparing for the future by saving the dtype with save_pretrained.

TensorFlow specific parameters

- **use_bfloat16** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should use
@@ -270,6 +274,14 @@ def __init__(self, **kwargs):
else:
self.num_labels = kwargs.pop("num_labels", 2)

if self.torch_dtype is not None and isinstance(self.torch_dtype, str):
# we will start using self.torch_dtype in v5, but to be consistent with
# from_pretrained's torch_dtype arg convert it to an actual torch.dtype object
if is_torch_available():
import torch

self.torch_dtype = getattr(torch, self.torch_dtype)
Comment on lines +280 to +283
@stas00 (Contributor Author), Aug 21, 2021:
OK, so torch is not always available when config.torch_dtype is not None - so now config.torch_dtype isn't always a torch.dtype.

I'm thinking perhaps this whole approach needs to be redone and only use the "float32", "float16", etc. strings everywhere, including the torch_dtype arg in from_pretrained and from_config args. And only convert to torch.dtype at the point it's used when the model is loaded.

That way torch_dtype doesn't need to have a special handling at config level.

I hope this is recent/experimental enough that it's ok that we break the API.

Actually, if we do that, why even bother with torch_ in torch_dtype and not just rename it to dtype - perhaps non-pt frameworks could tap into it as well? After all, fp16 data saved by torch isn't any different from Flax or TF, no?
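
The late-conversion idea sketched in this comment might look roughly like the following (illustrative only; `resolve_dtype` is a hypothetical helper, not part of the PR):

```python
import torch

def resolve_dtype(dtype_str):
    # Keep only plain strings such as "float16" in the config, and convert to
    # an actual torch.dtype only at the point where the model is loaded.
    if dtype_str is None or dtype_str == "auto":
        return dtype_str  # "auto" is resolved later, e.g. from the checkpoint weights
    return getattr(torch, dtype_str)  # "float16" -> torch.float16
```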

Member:

I can't think of a scenario where one would want one dtype for one framework and another dtype for another - so changing it to dtype sounds good to me.

@stas00 (Contributor Author), Aug 23, 2021:

So the main concern is the back-compat of the API arg torch_dtype - if it's OK to break it, then I propose that both the config and the arg in from_pretrained and from_config be just dtype as a string: "auto", "float32", "float16", etc.

And then in the case of torch we convert it to the right torch.dtype on the fly. Perhaps Flax/TF could use this too down the road.

Sylvain is not here for another week. Do you both support this breaking API change, @LysandreJik and @patrickvonplaten?

So instead of:

            torch_dtype (:obj:`str` or :obj:`torch.dtype`, `optional`):
                Override the default ``torch.dtype`` and load the model under this dtype. If ``"auto"`` is passed the
                dtype will be automatically derived from the model's weights.

It will be:

            dtype (:obj:`str`, `optional`):
                Override the default ``dtype`` and load the model under this dtype (e.g., ``"float16"``). If
                ``"auto"`` is passed the dtype will be automatically derived from the model's weights.
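
The framework-neutral appeal of plain strings is that each backend can resolve them on its own; a small illustrative sketch (not code from the PR):

```python
import numpy as np
import torch

dtype_str = "float16"  # what the config would store under this proposal

torch_dtype = getattr(torch, dtype_str)  # torch.float16
numpy_dtype = np.dtype(dtype_str)        # dtype('float16'), also consumable by TF/Flax

print(torch_dtype, numpy_dtype)
```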

@stas00 (Contributor Author), Aug 24, 2021:

On the other hand we already have the dtype attribute in modeling_utils, which returns torch.dtype

https://github.com/huggingface/transformers/blob/master/src/transformers/modeling_utils.py#L205

So my proposal might be confusing.

Should I call it dtype_str perhaps?
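
For context, the existing modeling_utils attribute behaves roughly like the following (a loose sketch of the idea, not the actual implementation):

```python
import torch
from torch import nn

def parameter_dtype(model: nn.Module) -> torch.dtype:
    # Roughly what `model.dtype` reports: the dtype of the model's
    # floating-point parameters (falling back to any parameter's dtype).
    last_dtype = None
    for param in model.parameters():
        last_dtype = param.dtype
        if param.is_floating_point():
            return param.dtype
    return last_dtype
```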

@patrickvonplaten (Contributor), Aug 24, 2021:

If no version with torch_dtype has been released yet, I'm fine with changing it to dtype. However, note that in Flax we already have a dtype variable that defines the dtype the matmul operations are run in, rather than the dtype of the actual weights. In Flax we would like to adopt the design in #13098, as outlined by @patil-suraj.

Contributor:

@stas00 regarding the PR #13098: the idea of that PR is exactly to disentangle parameter dtype from matmul/computation dtype. In Flax, it's common practice that the dtype parameter defines the matmul/computation dtype (see https://flax.readthedocs.io/en/latest/_autosummary/flax.linen.Dense.html#flax.linen.Dense.dtype), not the parameter dtype.

So for Flax, I don't really think it would make sense to use a config.dtype to define weights dtype as it would be quite confusing with Flax's computation dtype parameter.
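
For reference, a minimal Flax sketch of that distinction (assuming current flax.linen defaults, where parameters are initialized in float32):

```python
import jax
import jax.numpy as jnp
import flax.linen as nn

# `dtype` controls the computation (matmul) dtype, not the parameter dtype.
layer = nn.Dense(features=4, dtype=jnp.bfloat16)
params = layer.init(jax.random.PRNGKey(0), jnp.ones((1, 8)))
print(jax.tree_util.tree_map(lambda p: p.dtype, params))  # float32 parameters
out = layer.apply(params, jnp.ones((1, 8)))
print(out.dtype)  # bfloat16 activations
```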

@stas00 (Contributor Author), Aug 24, 2021:

> I would feel more comfortable to merge this PR without breaking changes and keeping the parameter called torch_dtype.

Works for me. Although this API is somewhat awkward at the moment due to the inconsistent type of values in the config file and the function: the former is a string, the latter a torch.dtype. Perhaps I can change these to support both the string "float32" and torch.dtype in the same function parameter.

> Think it's worth to have a separate discussion here regarding a framework-agnostic dtype parameter for all PyTorch, Tensorflow, and Flax once @sgugger is back.

Agreed!
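
Supporting both forms in the same parameter could look something like this (a sketch of the idea; `normalize_torch_dtype` is a hypothetical helper, not part of the PR):

```python
from typing import Union

import torch

def normalize_torch_dtype(value: Union[str, torch.dtype, None]) -> Union[str, torch.dtype, None]:
    # Accept an actual torch.dtype, a string like "float16", or the special
    # values None / "auto", which are passed through untouched.
    if value is None or value == "auto" or isinstance(value, torch.dtype):
        return value
    if isinstance(value, str) and isinstance(getattr(torch, value, None), torch.dtype):
        return getattr(torch, value)  # "float16" -> torch.float16
    raise ValueError(f"Unexpected torch_dtype value: {value!r}")
```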

@stas00 (Contributor Author), Aug 24, 2021:

> In Flax, it's common practice that the dtype parameter defines the matmul/computation dtype (see https://flax.readthedocs.io/en/latest/_autosummary/flax.linen.Dense.html#flax.linen.Dense.dtype), not the parameter dtype.

So it's somewhat similar to the dtype arg in the new PyTorch autocast feature then, correct? (Before, it was hardcoded to fp16, but now it has a dtype arg to support bf16 too.)

P.S. It's currently called fast_dtype but will be renamed shortly to dtype in pt-nightly.
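
For comparison, the autocast API that eventually shipped takes a dtype argument; this sketch assumes a CUDA device and PyTorch >= 1.10, which postdates this discussion:

```python
import torch

model = torch.nn.Linear(8, 8).cuda()
x = torch.randn(2, 8, device="cuda")

# The computation dtype is controlled per region; the weights stay float32.
with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
    y = model(x)

print(model.weight.dtype)  # torch.float32
print(y.dtype)             # torch.bfloat16
```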

Member:

Sounds good to me to keep torch_dtype and have a separate discussion for the framework-agnostic dtype parameter, to which torch_dtype could be an alias to prevent breaking changes to the existing API.

Contributor Author:

Sounds like a plan, @LysandreJik

Issue created: #13246


# Tokenizer arguments TODO: eventually tokenizer and models should share the same config
self.tokenizer_class = kwargs.pop("tokenizer_class", None)
self.prefix = kwargs.pop("prefix", None)
@@ -574,7 +586,8 @@ def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PretrainedConfig":
for key, value in kwargs.items():
if hasattr(config, key):
setattr(config, key, value)
to_remove.append(key)
if key != "torch_dtype":
to_remove.append(key)
for key in to_remove:
kwargs.pop(key, None)

@@ -640,6 +653,8 @@ def to_diff_dict(self) -> Dict[str, Any]:
):
serializable_config_dict[key] = value

self.dict_torch_dtype_to_str(serializable_config_dict)

return serializable_config_dict

def to_dict(self) -> Dict[str, Any]:
@@ -656,6 +671,8 @@ def to_dict(self) -> Dict[str, Any]:
# Transformers version when serializing the model
output["transformers_version"] = __version__

self.dict_torch_dtype_to_str(output)

return output

def to_json_string(self, use_diff: bool = True) -> str:
@@ -738,6 +755,15 @@ def update_from_string(self, update_str: str):

setattr(self, k, v)

def dict_torch_dtype_to_str(self, d: Dict[str, Any]) -> None:
"""
Checks whether the passed dictionary has a `torch_dtype` key and if it's not None, converts torch.dtype to a
string of just the type. For example, :obj:`torch.float32` gets converted into the `"float32"` string, which
can then be stored in the JSON format.
"""
if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str):
d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1]


PretrainedConfig.push_to_hub = copy_func(PretrainedConfig.push_to_hub)
PretrainedConfig.push_to_hub.__doc__ = PretrainedConfig.push_to_hub.__doc__.format(
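Taken together, the config changes above make the dtype round-trip through save/load roughly as follows (a sketch, not code from the PR; it assumes a local save path and access to the t5-small checkpoint):

```python
import torch
from transformers import T5ForConditionalGeneration

# Save an fp16 model; the config's torch_dtype is serialized as the string "float16".
model = T5ForConditionalGeneration.from_pretrained("t5-small").half()
model.save_pretrained("t5-small-fp16")

# Reload and let the dtype be picked up automatically from the weights.
reloaded = T5ForConditionalGeneration.from_pretrained("t5-small-fp16", torch_dtype="auto")
print(reloaded.dtype)               # torch.float16
print(reloaded.config.torch_dtype)  # torch.float16 in memory, "float16" in config.json
```
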
22 changes: 19 additions & 3 deletions tests/test_modeling_common.py
@@ -16,6 +16,7 @@
import copy
import gc
import inspect
import json
import os.path
import random
import tempfile
@@ -1663,9 +1664,11 @@ def test_model_from_config_torch_dtype(self):
@require_torch
def test_model_from_pretrained_torch_dtype(self):
# test that the model can be instantiated with dtype of either
# 1. config.torch_dtype setting in the saved model (priority)
# 2. via autodiscovery by looking at model weights
# 1. explicit from_pretrained's torch_dtype argument
# 2. via autodiscovery by looking at model weights (torch_dtype="auto")
# so if a model.half() was saved, we want it to be instantiated as such.
#
# test an explicit model class, but also AutoModel separately as the latter goes through a different code path
model_path = self.get_auto_remove_tmp_dir()

# baseline - we know TINY_T5 is fp32 model
@@ -1688,13 +1691,26 @@ def test_model_from_pretrained_torch_dtype(self):
model = model.half()
model.save_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto")
self.assertEqual(model.config.torch_dtype, "float16") # tests `config.torch_dtype` saving
self.assertEqual(model.config.torch_dtype, torch.float16)
self.assertEqual(model.dtype, torch.float16)

# tests `config.torch_dtype` saving
with open(f"{model_path}/config.json") as f:
config_dict = json.load(f)
self.assertEqual(config_dict["torch_dtype"], "float16")

# test fp16 save_pretrained, loaded with the explicit fp16
model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
self.assertEqual(model.dtype, torch.float16)

# test AutoModel separately as it goes through a different path
# test auto-detection
model = AutoModel.from_pretrained(TINY_T5, torch_dtype="auto")
self.assertEqual(model.dtype, torch.float32)
# test forcing an explicit dtype
model = AutoModel.from_pretrained(TINY_T5, torch_dtype=torch.float16)
self.assertEqual(model.dtype, torch.float16)


@require_torch
@is_staging_test
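
In short, the behavior the new tests lock in can be summarized as follows (a sketch based on the test code above; the checkpoint name is an assumption standing in for the tests' TINY_T5, a tiny fp32 T5 model):

```python
import torch
from transformers import AutoModel

tiny_t5 = "patrickvonplaten/t5-tiny-random"  # assumption: tiny fp32 T5 checkpoint like TINY_T5

# Explicit dtype override through AutoModel, the path this PR fixes.
model = AutoModel.from_pretrained(tiny_t5, torch_dtype=torch.float16)
assert model.dtype == torch.float16

# Auto-detection of the dtype from the saved weights.
model = AutoModel.from_pretrained(tiny_t5, torch_dtype="auto")
assert model.dtype == torch.float32
```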