1 file changed: +3 −0 lines changed

@@ -2481,6 +2481,9 @@ def initialize_weights(self):
         to the `_init_weights` of the outer-most model instead of the given sub-model.
         This function dynamically dispatches the correct `init_weights` function to the modules as we advance in the
         module graph along the recursion. It can handle an arbitrary number of sub-models.
+
+        Note that the `torch.no_grad()` decorator is very important as well, as most of our `_init_weights` do not use
+        `torch.nn.init` functions (which are all no_grad by default), but simply do in-place ops such as `module.weight.data.zero_()`.
         """
         if not hasattr(torch.nn.Module, "smart_apply"):
             # This function is equivalent to `torch.nn.Module.apply`, except that it dynamically adjusts the function
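
For readers skimming the diff, here is a minimal sketch of the pattern the docstring describes. The module names (`MyOuterModel`, `MySubModel`) and the flattened loop over `self.modules()` are hypothetical simplifications, not the actual `smart_apply` recursion from this PR; the sketch only illustrates applying the outer-most model's `_init_weights` to every submodule under a `torch.no_grad()` decorator, with the in-place `.data` ops the added note mentions.

```python
import torch
import torch.nn as nn


class MySubModel(nn.Module):
    """Hypothetical sub-model standing in for a nested model."""

    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(8, 8)


class MyOuterModel(nn.Module):
    """Hypothetical outer model whose `_init_weights` is dispatched to all submodules."""

    def __init__(self):
        super().__init__()
        self.encoder = MySubModel()
        self.head = nn.Linear(8, 2)

    def _init_weights(self, module):
        # In-place ops on `.data` rather than `torch.nn.init` functions,
        # hence the explicit `torch.no_grad()` on the entry point below.
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()

    @torch.no_grad()
    def initialize_weights(self):
        # Simplified stand-in for the recursive `smart_apply` walk: every
        # submodule gets the outer-most model's `_init_weights`, not its own.
        for module in self.modules():
            self._init_weights(module)


model = MyOuterModel()
model.initialize_weights()
print(model.head.bias)  # zero-initialized by the outer model's `_init_weights`
```

The real code instead guards on `hasattr(torch.nn.Module, "smart_apply")` and dispatches recursively along the module graph, so an arbitrary number of nested sub-models is handled.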