From ac2794046662be4e1bc53624ab36a3161704fbbf Mon Sep 17 00:00:00 2001 From: Steve Lu <15214696+desire2020@users.noreply.github.com> Date: Mon, 11 Mar 2024 01:38:38 -0700 Subject: [PATCH] Addressing Issue #5241: Updating deepspeed/runtime/zero/stage_1_and_2.py A soft exception case in stage_1_and_2.py/complete_grad_norm_calculation_for_cpu_offload() is not properly handled. Creating a soft logger Warning to better inform the users for further debugging. --- deepspeed/runtime/zero/stage_1_and_2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index b1d94a4459d9..05257c10af54 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -1308,7 +1308,8 @@ def complete_grad_norm_calculation_for_cpu_offload(self, params): total_norm = total_norm_cuda[0].item()**(1. / norm_type) if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 + logger.info(f"Warning: invalid gradient detected. Please check your model implementation/configuration to improve the numerical stability.") + total_norm = -1. return total_norm