diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index 16b9c3c18919..226e7432305f 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -1307,7 +1307,8 @@ def complete_grad_norm_calculation_for_cpu_offload(self, params): total_norm = total_norm_cuda[0].item()**(1. / norm_type) if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 + logger.info(f"Warning: invalid gradient detected. Please check your model implementation/configuration to improve the numerical stability.") + total_norm = -1. return total_norm