@@ -717,6 +717,7 @@ def _dygraph_clip(self, params_grads):
         sum_square_list = []
         sum_square_list_fp16 = []
         sum_square_list_fp32 = []
+        flag_auto_hybrid_pp = True  # Determine whether the new dygraph semi-auto parallel PP framework is used
         if len(params_grads) > 0 and len(params_grads[0]) > 0:
             src_mesh = params_grads[0][0].process_mesh
         else:
@@ -742,8 +743,10 @@ def _dygraph_clip(self, params_grads):
             # if the gradient mesh is not equal to src mesh
             # do reshard to get the result of squared_l2 from other pp stage mesh
             if src_mesh is not None and g.process_mesh != src_mesh:
+                flag_auto_hybrid_pp = False
                 pp_mesh = get_complete_pp_mesh(g.process_mesh)
                 if set(g.process_mesh.process_ids) < set(pp_mesh.process_ids):
+                    flag_auto_hybrid_pp = True
                     sum_square = dist.reshard(
                         sum_square, pp_mesh, sum_square.placements
                     )
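The strict-subset test above decides whether a per-stage `sum_square` has to be resharded onto the complete pipeline mesh before the global norm is assembled, and whether `flag_auto_hybrid_pp` stays `True`. A minimal, plain-Python sketch of that decision; the helper name and process ids below are illustrative, not part of the patch:

```python
def needs_pp_reshard(grad_process_ids, complete_pp_process_ids):
    # True only when the gradient lives on a strict subset of the complete
    # pp mesh, i.e. on a single pipeline stage, so its squared L2 norm must
    # be resharded onto the full pp mesh before the global reduction.
    return set(grad_process_ids) < set(complete_pp_process_ids)

# A 2-stage pipeline: stage 0 owns ranks {0, 1}, the complete pp mesh owns {0, 1, 2, 3}.
print(needs_pp_reshard([0, 1], [0, 1, 2, 3]))        # True  -> reshard to pp_mesh
print(needs_pp_reshard([0, 1, 2, 3], [0, 1, 2, 3]))  # False -> already on the full mesh
```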
@@ -798,7 +801,7 @@ def async_add_n(var_list):
         # then performs pp group communication reduce(sum) to get correct global_norm_var.
         # For complete alignment with old dygraph semi-auto parallel PP logic,
         # refer to NOTE: align ClipGradByGlobalNorm in auto_parallel_align_mode
-        if src_mesh is not None:
+        if flag_auto_hybrid_pp and src_mesh is not None:
             g_mesh = dist.get_mesh()
             if (
                 g_mesh
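The comment block above describes the pp-group `reduce(sum)` that this change now gates on `flag_auto_hybrid_pp`. A small numeric sketch of what that reduction achieves; the per-stage values are made up for illustration:

```python
import math

# Each pp stage only sees its own parameters, so it holds a partial sum of
# squared gradients; reduce(sum) over the pp group adds those partials so
# that every stage ends up with the same global_norm_var.
stage_partial_sum_square = {0: 4.0, 1: 9.0}           # per-stage sum of g^2
global_sum = sum(stage_partial_sum_square.values())   # result of reduce(sum)
global_norm_var = math.sqrt(global_sum)
print(global_norm_var)  # sqrt(13.0) on every pipeline stage after the reduce
```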
@@ -882,6 +885,15 @@ def async_add_n(var_list):
882885 "Reshard a sharded tensor from a local mesh to a global mesh is not supported"
883886 )
884887 else :
888+ pp_mesh = get_complete_pp_mesh (g .process_mesh )
889+
890+ if set (g .process_mesh .process_ids ) < set (
891+ pp_mesh .process_ids
892+ ):
893+ clip_input = dist .reshard (
894+ clip_input , pp_mesh , clip_input .placements
895+ )
896+
885897 clip_input = paddle .distributed .reshard (
886898 clip_input , g .process_mesh , clip_input .placements
887899 )
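The new branch above mirrors the earlier hunk: `clip_input` is first resharded onto the complete pp mesh and only then onto the gradient's own mesh. A rough sketch of that two-step reshard pattern, not the patched code; the mesh shapes, rank ids, and script name are assumptions, and it would need to be launched on multiple devices, e.g. with `python -m paddle.distributed.launch --devices 0,1 reshard_sketch.py`:

```python
import paddle
import paddle.distributed as dist

stage0_mesh = dist.ProcessMesh([0], dim_names=["pp"])      # mesh of pp stage 0 only
full_pp_mesh = dist.ProcessMesh([0, 1], dim_names=["pp"])  # complete pipeline mesh

# clip_input starts out replicated on stage 0's mesh.
clip_input = dist.shard_tensor(paddle.to_tensor(0.5), stage0_mesh, [dist.Replicate()])

# Step 1: broadcast it onto the complete pp mesh with unchanged placements,
# mirroring `dist.reshard(clip_input, pp_mesh, clip_input.placements)`.
clip_input = dist.reshard(clip_input, full_pp_mesh, clip_input.placements)

# Step 2: reshard from the full pp mesh onto a gradient's own stage mesh,
# mirroring `paddle.distributed.reshard(clip_input, g.process_mesh, ...)`.
stage1_mesh = dist.ProcessMesh([1], dim_names=["pp"])
clip_input = dist.reshard(clip_input, stage1_mesh, clip_input.placements)
```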