Implement deepcopy to DTypeCast module (#98)

fmassa · web-flow · commit ba73b2e6ba69 · 2025-08-19T13:52:03.000-07:00
diff --git a/autoparallel/cast_parametrization.py b/autoparallel/cast_parametrization.py
@@ -3,17 +3,68 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.
 
+import copy
+import copyreg
 from contextlib import contextmanager
-from typing import Any, Type
+from typing import Type
 
 import torch
 from torch.distributed.fsdp import MixedPrecisionPolicy
 from torch.utils._pytree import tree_map
 
 
-def _unimplemented_deepcopy(*args: Any, **kwargs: Any):
+def make_getter(self, p_name, mp_policy):
+    def getter(
+        self_mod=self,
+        _param_name=p_name,
+        _dtype=mp_policy.param_dtype,
+    ):
+        _param = self_mod._parameters[_param_name]
+        if not active_param():
+            return _param
+        return torch.ops.autoparallel.dtype_cast(_param, _dtype)
+
+    return getter
+
+
+# Adapted from the deepcopy helper in PyTorch's parametrize module:
+# https://github.com/pytorch/pytorch/blob/5d9653d90ee003173dd03f93e09fed236500ef06/torch/nn/utils/parametrize.py#L324-L351
+# with additions such as re-pointing the dtype-cast getters at the replica.
+def default_deepcopy(self, memo):
+    # Just emulate a standard deepcopy procedure when __deepcopy__ doesn't exist in the current class.
+    obj = memo.get(id(self), None)
+    if obj is not None:
+        return obj
+    replica = self.__new__(self.__class__)
+    memo[id(self)] = replica
+    replica.__dict__ = copy.deepcopy(self.__dict__, memo)
+
+    # Fix the parametrization getters to point to the replica instead of the original
+    if hasattr(replica, "_name_to_dtype_cast_managed_attr_getter") and hasattr(
+        replica, "_mp_policy"
+    ):
+        # Recreate the getter functions to point to the replica
+        param_properties = {}
+        for p_name in list(replica._name_to_dtype_cast_managed_attr_getter.keys()):
+            # make_getter is a factory: each getter it returns binds its own
+            # p_name and reads parameters from the replica, not the original.
+            param_properties[p_name] = make_getter(replica, p_name, replica._mp_policy)
+        replica._name_to_dtype_cast_managed_attr_getter = param_properties
+
+    # Also save all slots if they exist.
+    slots_to_save = copyreg._slotnames(self.__class__)  # type: ignore[attr-defined]
+    for slot in slots_to_save:
+        if hasattr(self, slot):
+            setattr(replica, slot, copy.deepcopy(getattr(self, slot), memo))
+    return replica
+
+
+def getstate(self):
     raise RuntimeError(
-        "DTypeCast does not support deepcopy. Please use state dict for serialization.",
+        "Serialization of parametrized modules is only "
+        "supported through state_dict(). See:\n"
+        "https://pytorch.org/tutorials/beginner/saving_loading_models.html"
+        "#saving-loading-a-general-checkpoint-for-inference-and-or-resuming-training"
     )
 
 
@@ -103,27 +154,17 @@ def apply_dtype_cast(model, mp_policy: MixedPrecisionPolicy):
         params_dict = dict(mod.named_parameters(recurse=False))
 
         # Create new class for this module with all parametrized parameters
-        param_properties = {}
-        for p_name, p in params_dict.items():
-
-            def getter(
-                self_mod=mod,
-                _param_name=p_name,
-                _dtype=mp_policy.param_dtype,
-            ):
-                _param = self_mod._parameters[_param_name]
-                if not active_param():
-                    return _param
-                return torch.ops.autoparallel.dtype_cast(_param, _dtype)
-
-            param_properties[p_name] = getter
-
         cls = mod.__class__
-        param_properties_key = "#".join(sorted(param_properties.keys()))
+        param_properties_key = "#".join(sorted(params_dict.keys()))
         new_cls = cls_key_to_dtype_cast_cls.get((cls, param_properties_key), None)
         if not new_cls:
-            namespace = {"__deepcopy__": _unimplemented_deepcopy}
-            for p_name in param_properties:
+            namespace = {"__getstate__": getstate}
+            # Serialization of parametrized modules is blocked via __getstate__, but deepcopy must keep working.
+            # copy.deepcopy() calls __deepcopy__ when the class defines it, so __getstate__ is never reached.
+            if not hasattr(cls, "__deepcopy__"):
+                namespace["__deepcopy__"] = default_deepcopy  # type: ignore[assignment]
+
+            for p_name in params_dict.keys():
                 # NOTE: it's important to have this indirection, to make sure that:
                 # Different instances of the same class can resolve their parameter access to instance-specific getters
                 # (which contains unique objects used in that instance-specific parameter's unshard operation).
@@ -132,6 +173,11 @@ def getter(
             new_cls = type(f"DTypeCast{cls.__name__}", cls_t, namespace)
             cls_key_to_dtype_cast_cls[(cls, param_properties_key)] = new_cls
         mod.__class__ = new_cls
+
+        param_properties = {}
+        for p_name in params_dict.keys():
+            param_properties[p_name] = make_getter(mod, p_name, mp_policy)
+
         mod._name_to_dtype_cast_managed_attr_getter = param_properties
         mod._mp_policy = mp_policy