Commit b042f26

Fixing backward not being called, gradient scaling and enabling backward with torch.no_grad() (#237)
1 parent 36535ef commit b042f26

File tree

3 files changed: +91, -225 lines

autoparallel/graph_pp_runner.py

Lines changed: 75 additions & 19 deletions
@@ -3,8 +3,9 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.
 
+import logging
 from dataclasses import dataclass
-from typing import Any, Callable, Optional, Union, cast
+from typing import Any, Callable, cast, Optional, Union
 
 import torch
 import torch.fx as fx
@@ -15,11 +16,14 @@
     _wait_batch_p2p,
 )
 from torch.distributed.pipelining.stage import (
-    PipelineStage,
     _normalize_model_output_as_tuple,
+    PipelineStage,
 )
 from torch.distributed.tensor import DTensor
 
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
 
 @dataclass
 class GraphCallables:
@@ -75,6 +79,22 @@ def __init__(
             "unsharded_grads": [],
         }
 
+    def scale_grads(self, grad_scale_factor: int) -> None:
+        """Scale stage's gradients by `grad_scale_factor`, which should be specified in coordination with the
+        loss function used with pipelining. For loss functions which perform 'mean' loss reduction, `grad_scale_factor`
+        should be set to num_microbatches. For loss functions that use `sum` reduction, `grad_scale_factor` should
+        be set to 1.
+
+        Should only be called once per pipeline schedule step, after all backwards passes have completed.
+        """
+
+        # PP scales only for its own contribution (microbatches), but relies on DP to scale further
+        # for DP degree.
+        if grad_scale_factor != 1:
+            for grad in self.state["unsharded_grads"]:
+                if grad is not None:
+                    grad.div_(grad_scale_factor)
+
 
 def _run_fw_module(
     fw_module: fx.GraphModule, graph_meta: GraphMeta, fw_args: list[Any]
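Why dividing by the microbatch count is the right scaling for a 'mean'-reduction loss: each microbatch loss is a mean over its own samples, so summing per-microbatch gradients and dividing once by num_microbatches reproduces the full-batch mean-loss gradient (DP scaling is applied separately, as the comment notes). A minimal standalone sketch of that arithmetic, using plain tensors rather than the pipeline classes (all names below are illustrative):

import torch

torch.manual_seed(0)
n_microbatches, mb_size, dim = 4, 2, 3
w = torch.randn(dim, requires_grad=True)
x = torch.randn(n_microbatches * mb_size, dim)
y = torch.randn(n_microbatches * mb_size)

def mse(pred, target):
    return ((pred - target) ** 2).mean()  # 'mean' reduction

# Full-batch reference gradient.
mse(x @ w, y).backward()
ref = w.grad.clone()
w.grad = None

# Microbatched run: accumulate per-microbatch grads, then scale once,
# mirroring scale_grads(grad_scale_factor=n_microbatches).
acc = torch.zeros_like(w)
for xb, yb in zip(x.chunk(n_microbatches), y.chunk(n_microbatches)):
    loss = mse(xb @ w, yb)                  # per-microbatch 'mean' loss
    acc += torch.autograd.grad(loss, w)[0]  # accumulated like unsharded_grads
acc.div_(n_microbatches)                    # the division scale_grads applies

torch.testing.assert_close(acc, ref)        # matches the full-batch gradient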
@@ -243,7 +263,10 @@ def stage_forward(
     composite_args = stage._retrieve_recv_activations(mb_index)
 
     # stage._validate_fwd_input(args, kwargs) Maybe need to validate composite args?
-
+    logger.debug(
+        "GraphPPRunner running action %s",
+        action,
+    )
     output, saved_intermediates = _run_forward_microbatch(stage, *composite_args)
 
     # See [Note: pipeline model output type]
@@ -306,6 +329,7 @@ def stage_full_backward(
     grad_scale_factor = schedule._n_microbatches if schedule.scale_grads else 1
 
     if not backward_stage.has_backward:
+        logger.debug("Returning early for backward stage")
         return
     (
         stage_output,
@@ -320,7 +344,7 @@
         # HACK till we have loss function, we populate the tangents here manually
         bwd_kwargs = {
             "stage_output": loss,
-            "tangents": [torch.randn_like(stage_output)],
+            "tangents": [torch.randn_like(stage_output[0])],
             "saved_intermediates": saved_intermediates,
         }
     else:
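The tangents change in this hunk swaps stage_output for stage_output[0]: the runner normalizes model outputs to tuples (hence the _normalize_model_output_as_tuple import), and torch.randn_like only accepts a tensor, so passing the tuple itself fails. A tiny sketch of the failure mode and the fix, assuming a single-tensor stage output (illustrative shapes):

import torch

out = torch.randn(2, 3)
stage_output = (out,)  # tuple-normalized output

try:
    torch.randn_like(stage_output)  # old code path: a tuple is not a Tensor
except TypeError as err:
    print("randn_like(tuple) raises", type(err).__name__)

tangents = [torch.randn_like(stage_output[0])]  # fixed code path
print(tangents[0].shape)  # torch.Size([2, 3])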
@@ -334,10 +358,14 @@
             "tangents": output_grads,
             "saved_intermediates": saved_intermediates,
         }
-
+    logger.debug(
+        "GraphPPRunner running action %s",
+        action,
+    )
     input_grads = _run_backward_microbatch(backward_stage, bwd_kwargs)
-
-    backward_stage.bwd_cache[backward_mb_index] = input_grads
+    backward_stage.bwd_cache[backward_mb_index] = (
+        tuple(input_grads) if not isinstance(input_grads, tuple) else input_grads
+    )
 
     # skipping detach logic
 
@@ -362,9 +390,20 @@ def stage_unshard(
         stage.stage_index: cast(GraphPipelineStage, stage) for stage in schedule._stages
     }
     stage = stage_index_to_stage[action.stage_index]
+    logger.debug(
+        "GraphPPRunner running action %s",
+        action,
+    )
     if stage.graph_callables.unshard is None:
         stage.state["unsharded_params"] = stage.state["sharded_params"]
-        # TODO (sanketpurandare): Add the fw_fsdp_all_gather graph call here
+    else:
+        sharded_params = list(stage.state["sharded_params"])
+        unsharded_params = _run_unshard_module(
+            stage.graph_callables.unshard,
+            stage.graph_meta,
+            sharded_params,
+        )
+        stage.state["unsharded_params"] = unsharded_params
 
 
 def stage_reshard(
@@ -377,6 +416,10 @@
         stage.stage_index: cast(GraphPipelineStage, stage) for stage in schedule._stages
     }
     stage = stage_index_to_stage[action.stage_index]
+    logger.debug(
+        "GraphPPRunner running action %s",
+        action,
+    )
     stage.state["unsharded_params"].clear()
 
 
@@ -390,8 +433,19 @@ def stage_reduce_grad(
         stage.stage_index: cast(GraphPipelineStage, stage) for stage in schedule._stages
    }
     stage = stage_index_to_stage[action.stage_index]
+    logger.debug(
+        "GraphPPRunner running action %s",
+        action,
+    )
     if stage.graph_callables.reduce_grad is None:
         stage.state["sharded_grads"] = stage.state["unsharded_grads"]
+    else:
+        sharded_grads = _run_reduce_grad_module(
+            stage.graph_callables.reduce_grad,
+            stage.graph_meta,
+            stage.state["unsharded_grads"],
+        )
+        stage.state["sharded_grads"] = sharded_grads
 
 
 class GraphPPRunner:
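Taken together, the three handlers above drive a per-stage state cycle: UNSHARD fills state["unsharded_params"] (through the unshard graph when present, otherwise by aliasing the sharded params), the backward pass accumulates into state["unsharded_grads"], REDUCE_GRAD produces state["sharded_grads"], and RESHARD clears the unsharded params. A single-process stand-in for that cycle, with plain functions in place of the compiled graph modules (illustrative only, not the repo's API):

import torch

def fake_unshard(sharded_params):
    # A real unshard graph would all-gather DTensor shards; cloning stands in here.
    return [p.clone() for p in sharded_params]

def fake_reduce_grad(unsharded_grads):
    # A real reduce_grad graph would reduce-scatter / all-reduce across DP ranks.
    return [g.clone() if g is not None else None for g in unsharded_grads]

state = {
    "sharded_params": [torch.randn(4), torch.randn(4)],
    "unsharded_params": [],
    "unsharded_grads": [None, None],
    "sharded_grads": [],
}

state["unsharded_params"] = fake_unshard(state["sharded_params"])    # UNSHARD
state["unsharded_grads"] = [torch.ones(4), torch.ones(4)]            # filled by FULL_BACKWARD
state["sharded_grads"] = fake_reduce_grad(state["unsharded_grads"])  # REDUCE_GRAD
state["unsharded_params"].clear()                                    # RESHARD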
@@ -400,6 +454,19 @@ def __init__(
         schedule: _PipelineScheduleRuntime,
     ):
         self.schedule = schedule
+        if not schedule._backward_requires_autograd:
+            assert all(
+                isinstance(stage, GraphPipelineStage)
+                and (
+                    stage.graph_callables.full_bw is not None
+                    or (
+                        stage.graph_callables.bw_dI is not None
+                        and stage.graph_callables.bw_dW is not None
+                    )
+                )
+                for stage in schedule._stages
+            )
+            self.schedule._has_backward = True
 
     def _populate_stage_states(self, stage: GraphPipelineStage) -> None:
         sharded_params = [
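This constructor check backs the torch.no_grad() part of the commit: when the schedule is built with backward_requires_autograd=False, every stage must supply a compiled backward graph (full_bw, or both bw_dI and bw_dW), and the schedule is then marked as having a backward pass so FULL_BACKWARD actions are actually executed. Because backward is an explicit graph call rather than an autograd traversal, the step can run under torch.no_grad(). A small standalone sketch of that idea, with a hand-written backward standing in for the compiled graph (illustrative only):

import torch

def fw(x, w):
    return x @ w

def full_bw(x, w, tangent):
    # Explicit gradient formulas: no autograd tape required.
    return tangent @ w.T, x.T @ tangent  # dL/dx, dL/dw

x, w = torch.randn(2, 3), torch.randn(3, 4)
tangent = torch.ones(2, 4)

with torch.no_grad():
    y = fw(x, w)
    dx, dw = full_bw(x, w, tangent)  # fine: just more forward-style ops

# By contrast, autograd-based backward needs grad mode enabled:
x_ag = torch.randn(2, 3, requires_grad=True)
with torch.no_grad():
    out = fw(x_ag, w)                # no graph recorded under no_grad
print(out.requires_grad)             # False -> out.sum().backward() would fail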
@@ -415,21 +482,10 @@ def _populate_stage_states(self, stage: GraphPipelineStage) -> None:
         stage.state["sharded_params"] = sharded_params
         stage.state["buffers"] = buffers
         stage.state["unsharded_grads"] = [None] * len(sharded_params)
-        # TODO (sanketpurandare)
-        # pipeline schedule runtime does not allow us to register a custom function
-        # for UNSHARD/RESHARD/REDUCE_GRAD action types yet
-        # HACK remove this once we support this
-        if stage.graph_callables.unshard is None:
-            stage.state["unsharded_params"] = stage.state["sharded_params"]
 
     def _accumulate_stage_grads_and_clear_states(
         self, stage: GraphPipelineStage
     ) -> None:
-        # TODO (sanketpurandare)
-        # We don't have a REDUCE_GRAD action yet in the ScheduleIR yet
-        # HACK remove this once Ivan's PR lands
-        if stage.graph_callables.reduce_grad is None:
-            stage.state["sharded_grads"] = stage.state["unsharded_grads"]
         grads = stage.state["sharded_grads"]
         params = list(stage.submod.parameters())
         for param, grad in zip(params, grads):

examples/example_ds3_pp.py

Lines changed: 16 additions & 0 deletions
@@ -16,6 +16,9 @@
 from torch.distributed.pipelining.schedules import (
     FORWARD,
     FULL_BACKWARD,
+    REDUCE_GRAD,
+    RESHARD,
+    UNSHARD,
     PipelineScheduleMulti,
     _PipelineSchedule,
     _PipelineScheduleRuntime,
@@ -42,8 +45,15 @@
     GraphPPRunner,
     stage_forward,
     stage_full_backward,
+    stage_reduce_grad,
+    stage_reshard,
+    stage_unshard,
 )
 
+# Configure logging to show DEBUG messages
+logging.basicConfig(
+    level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
 logger = logging.getLogger(__name__)
 
 
@@ -54,6 +64,7 @@ def build_pipeline_schedule(
     microbatch_size: int,
     local_batch_size: int,
     pipeline_parallel_degree: int,
+    backward_requires_autograd: bool = False,
 ) -> _PipelineSchedule:
     """Builds a pipeline schedule for the given configuration and stages."""
     schedule_class = get_schedule_class(pipeline_parallel_schedule)
@@ -78,6 +89,7 @@
         stages if looped_schedule else stages[0],
         n_microbatches=n_microbatches,
         loss_fn=loss_fn,
+        backward_requires_autograd=backward_requires_autograd,
     )
     logger.info(
         f"Using pipeline schedule {pipeline_parallel_schedule} "
@@ -427,11 +439,15 @@ def shape_inference_output_fn_last_stage():
         microbatch_size=microbatch_size,
         local_batch_size=local_batch_size,
         pipeline_parallel_degree=pp_degree,
+        backward_requires_autograd=False,
     )
     assert isinstance(schedule, _PipelineScheduleRuntime)
     # Step 6. Override the pipeline runner's action implementations
     schedule.register_custom_function(FORWARD, stage_forward)
     schedule.register_custom_function(FULL_BACKWARD, stage_full_backward)
+    schedule.register_custom_function(REDUCE_GRAD, stage_reduce_grad)
+    schedule.register_custom_function(RESHARD, stage_reshard)
+    schedule.register_custom_function(UNSHARD, stage_unshard)
 
     # Step 7. Register the schedule with the graph runner
 
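For reference, a hedged wiring sketch in the spirit of examples/example_ds3_pp.py: the five register_custom_function calls and the backward_requires_autograd=False flag come from this diff, while the module path autoparallel.graph_pp_runner, the helper function, and the GraphPPRunner(schedule) construction are assumptions based on the file layout and the __init__ shown above.

from torch.distributed.pipelining.schedules import (
    FORWARD,
    FULL_BACKWARD,
    REDUCE_GRAD,
    RESHARD,
    UNSHARD,
)

from autoparallel.graph_pp_runner import (  # assumed module path
    GraphPPRunner,
    stage_forward,
    stage_full_backward,
    stage_reduce_grad,
    stage_reshard,
    stage_unshard,
)

def wire_graph_pp(schedule):
    """Attach the graph-based action handlers to a schedule built with
    backward_requires_autograd=False, then hand it to the graph runner."""
    # Override the runtime's default action implementations (per this diff).
    schedule.register_custom_function(FORWARD, stage_forward)
    schedule.register_custom_function(FULL_BACKWARD, stage_full_backward)
    schedule.register_custom_function(REDUCE_GRAD, stage_reduce_grad)
    schedule.register_custom_function(RESHARD, stage_reshard)
    schedule.register_custom_function(UNSHARD, stage_unshard)
    # "Step 7. Register the schedule with the graph runner"
    return GraphPPRunner(schedule)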