AlibabaPAI · lausannel · Sep 3, 2024 · Sep 2, 2024 · Sep 2, 2024
diff --git a/torch_xla/experimental/spmd_fully_sharded_data_parallel.py b/torch_xla/experimental/spmd_fully_sharded_data_parallel.py
@@ -11,6 +11,7 @@
 import torch_xla.core.xla_model as xm
 import torch_xla.distributed.spmd as spmd
 from torch_xla.distributed.fsdp.wrap import recursive_wrap
+from torch_xla.distributed.fsdp._init_utils import _materialize_module
 
 
 def _prepare_spmd_partition_spec(param):
@@ -95,6 +96,12 @@ def __init__(
       )
       self._auto_wrap(auto_wrap_kwargs, fsdp_kwargs)
 
+    _materialize_module(
+        module,
+        None, [],
+        deferred_init_check_fn=lambda k: not isinstance(
+            k, SpmdFullyShardedDataParallel))
+
     # Let's move the module to xla device in case it's not moved
     # by the caller already.
     self._orig_module = module.to(xm.xla_device())