diff --git a/smdebug/rules/action/stop_training_action.py b/smdebug/rules/action/stop_training_action.py index e665ed041..230096637 100644 --- a/smdebug/rules/action/stop_training_action.py +++ b/smdebug/rules/action/stop_training_action.py @@ -26,6 +26,7 @@ def _get_sm_tj_jobs_with_prefix(self): next_token = None name = self._training_job_prefix i = 0 + exception_caught_times = 0 while i < 50: try: if next_token is None: @@ -49,7 +50,7 @@ def _get_sm_tj_jobs_with_prefix(self): self._logger.info( f"No TrainingJob summaries found: list_training_jobs output is : {res}" ) - return + return [] for job in jobs: tj_status = job["TrainingJobStatus"] tj_name = job["TrainingJobName"] @@ -61,6 +62,10 @@ def _get_sm_tj_jobs_with_prefix(self): self._logger.info( f"Caught exception while getting list_training_job exception is: \n {e}. Attempt:{i}" ) + exception_caught_times += 1 + if exception_caught_times > 5: + print("Got exception more than 5 times while finding training job. Giving up.") + break if "NextToken" not in res: break else: @@ -68,6 +73,11 @@ def _get_sm_tj_jobs_with_prefix(self): res = {} jobs = {} i += 1 + if len(found_job_dict) > 0: + print( + f"Found training jobs matching prefix:{name}. Exiting even if next_token:{next_token} was present." + ) + break return found_job_dict.keys()