diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py
index c3237c71353cbf..8c5342246bdf53 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py
@@ -348,6 +348,13 @@ def reduce_gradients(self, parameter_list, hcg):
         with framework.no_grad():
             for param in parameter_list:
                 g_var = self._get_param_grad(param)
+                if g_var is None:
+                    if hasattr(param, "main_grad"):
+                        g_var = paddle.zeros_like(param, dtype=paddle.float32)
+                        param.main_grad = g_var
+                    else:
+                        g_var = paddle.zeros_like(param, dtype=param.dtype)
+                        param.grad = g_var
                 if g_var is not None:
                     reduce_op = ReduceOp.AVG
                     if not self.use_reduce_avg:
diff --git a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py
index bdbf6b2fa3f9f4..2a32948611b3cc 100644
--- a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py
+++ b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py
@@ -619,22 +619,19 @@ def _copy_grad_to_buffer(self, param):
         )
 
         grad_var = param.main_grad if self.use_main_grad else param.grad
-        assert grad_var is not None, (
-            f"The current parameter[{param.name}] has no gradient, its stop_grdient is {param.stop_gradient}"
-        )
-        grad_var.stop_gradient = True
-        grad_var.flatten_()
-        tmp_var.add_(grad_var)
-        tmp_var.get_tensor()._set_dims(param.shape)
+        if grad_var is not None:
+            grad_var.stop_gradient = True
+            grad_var.flatten_()
+            tmp_var.add_(grad_var)
+            grad_var._clear()
+            tmp_var.get_tensor()._set_dims(param.shape)
 
         if self.use_main_grad:
-            param.main_grad._clear()
             if not self._free_grads_in_comm:
                 param.main_grad = tmp_var
                 param.main_grad.name = "main_grad@" + param.name
         else:
-            param.grad._clear()
             if not self._free_grads_in_comm:
                 param._copy_gradient_from(tmp_var)
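
Note (not part of the patch): the first hunk makes reduce_gradients tolerate parameters whose gradient is missing by materializing a zero-filled placeholder before the sharding reduce, and the second hunk lets _copy_grad_to_buffer skip the copy when no gradient exists, clearing the source gradient right after it is accumulated into the fused buffer. The sketch below is a minimal, hypothetical illustration of the fallback pattern from the first hunk; the helper name ensure_grad is invented for illustration and assumes param behaves like a Paddle parameter that accepts grad/main_grad assignment, as it does inside the patched optimizer.

import paddle

def ensure_grad(param):
    # Hypothetical helper mirroring the PR's fallback: if a parameter produced
    # no gradient, give it a zero-filled placeholder so it can still take part
    # in the collective reduce. The main_grad (FP32 accumulation) path takes
    # priority over the regular grad path, matching the patched code.
    if hasattr(param, "main_grad"):
        if param.main_grad is None:
            param.main_grad = paddle.zeros_like(param, dtype=paddle.float32)
        return param.main_grad
    if param.grad is None:
        param.grad = paddle.zeros_like(param, dtype=param.dtype)
    return param.grad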