@@ -189,6 +189,15 @@ def __init__(self, cfg: DictConfig) -> None:
         self._resume_from_checkpoint = cfg.resume_from_checkpoint
         self._gradient_accumulation_steps = cfg.gradient_accumulation_steps
         self._optimizer_in_bwd = cfg.get("optimizer_in_bwd", False)
+
+        # Should we raise an error rather than performing these silent checks, as with `_optimizer_in_bwd` and `_clip_grad_norm`?
+        self._minimize_all_reduces = (
+            cfg.get("minimize_all_reduces", False)
+            and self._gradient_accumulation_steps > 1
+            and not self._optimizer_in_bwd
+            and self.parallel_dims.dp_enabled
+        )
+
         self._clip_grad_norm = cfg.get("clip_grad_norm", None)
         self._checkpoint_client = CheckpointClient(cfg)
 
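For reference, `_minimize_all_reduces` only activates when all four conditions hold; otherwise it is silently dropped. A minimal standalone check of that gating, using omegaconf (the recipe's `DictConfig` type) with a plain boolean standing in for `self.parallel_dims.dp_enabled`:

```python
from omegaconf import OmegaConf

# Stand-ins for recipe state; `dp_enabled` mimics self.parallel_dims.dp_enabled.
cfg = OmegaConf.create(
    {
        "minimize_all_reduces": True,
        "gradient_accumulation_steps": 4,
        "optimizer_in_bwd": False,
    }
)
dp_enabled = True

# Same gating as the diff: the flag only takes effect with gradient
# accumulation on, the optimizer outside backward, and data parallelism enabled.
minimize_all_reduces = (
    cfg.get("minimize_all_reduces", False)
    and cfg.gradient_accumulation_steps > 1
    and not cfg.get("optimizer_in_bwd", False)
    and dp_enabled
)
print(minimize_all_reduces)  # True; flip any one condition and this prints False
```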
@@ -834,6 +843,12 @@ def train(self) -> None:
                 # We multiply by world_size to undo FSDP2 gradient normalization.
                 current_loss = current_loss * (self.world_size / num_tokens)
 
+                if self._minimize_all_reduces and (
+                    (idx + 1) % self._gradient_accumulation_steps == 0
+                ):
+                    self._model.set_is_last_backward(True)
+                    self._model.set_requires_all_reduce(True)
+
                 current_loss.backward()
 
                 # Step with optimizer
@@ -857,6 +872,10 @@ def train(self) -> None:
                     self._optimizer.step()
                     self._optimizer.zero_grad(set_to_none=True)
 
+                    if self._minimize_all_reduces:
+                        self._model.set_is_last_backward(False)
+                        self._model.set_requires_all_reduce(False)
+
                     # Update the number of steps when the weights are updated
                     self.global_step += 1
 
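Taken together, the hunks arm the FSDP2 flags just before the backward that closes an accumulation window and disarm them right after the optimizer step, so intermediate microbatches skip the cross-rank gradient reduction. Below is a simplified sketch of the same pattern, not the recipe itself: it toggles the flags on every microbatch instead of the arm/disarm sequencing above, the tiny `Linear` model, batch shapes, and step counts are placeholders, and the recipe's two `FSDPModule` setter calls are mirrored verbatim (for plain FSDP, the reduce-scatter is governed by the related `set_requires_gradient_sync`).

```python
"""Sketch of the delayed-sync pattern; run with `torchrun --nproc_per_node=2`.
Assumes a PyTorch build where FSDP2's `fully_shard` is public."""
import torch
import torch.distributed as dist
from torch.distributed.fsdp import fully_shard

dist.init_process_group("nccl")
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

model = torch.nn.Linear(16, 16, device="cuda")
fully_shard(model)  # shards `model` in place; it now exposes the FSDPModule API
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
grad_accum_steps = 4

for idx in range(2 * grad_accum_steps):
    # Only the microbatch that completes an accumulation window triggers
    # the cross-rank reduction and FSDP's post-backward finalization.
    is_last = (idx + 1) % grad_accum_steps == 0
    model.set_is_last_backward(is_last)
    model.set_requires_all_reduce(is_last)

    loss = model(torch.randn(4, 16, device="cuda")).sum()
    loss.backward()

    if is_last:
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)

dist.destroy_process_group()
```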