# Patch for ax/service/scheduler.py touching the trial failure-rate check.
# Summary of the two hunks:
#
# 1. FAILURE_EXCEEDED_MSG: the message now says trials "failed or abandoned"
#    and states that the {min_failed} trigger counts trials that either failed
#    or were abandoned.
# 2. error_if_failure_rate_exceeded: num_ran_in_scheduler changes from
#    len(self.experiment.trials) - self._num_preexisting_trials to a count of
#    the trials whose index is >= self._num_preexisting_trials and whose
#    status.is_terminal is true, so non-terminal trials no longer enlarge the
#    denominator of the failure rate. The failure_rate_exceeded comparison is
#    only re-parenthesized; its operands are unchanged.
#
# NOTE(review): the second hunk removes the blank line that preceded
#   failure_rate_exceeded and adds an empty line just before the closing ")"
#   of that expression -- looks unintentional; confirm against the formatter.
# NOTE(review): num_ran_in_scheduler is used as a divisor and now counts only
#   terminal trials; whether it can be 0 at the division depends on guards
#   that are outside this hunk -- TODO confirm.
# NOTE(review): "a optimization" in the message should presumably read
#   "an optimization" (present both before and after this patch).
# NOTE(review): the first added message line is over 90 characters including
#   its quotes, before any indentation -- confirm against the project's
#   line-length limit.
# NOTE(review): this patch text appears with its line breaks collapsed onto a
#   single line; presumably the newlines must be restored before it can be
#   applied.
diff --git a/ax/service/scheduler.py b/ax/service/scheduler.py index 4faf63b3f21..fd9ed0d0922 100644 --- a/ax/service/scheduler.py +++ b/ax/service/scheduler.py @@ -77,8 +77,9 @@ """ FAILURE_EXCEEDED_MSG = ( "Failure rate exceeds the tolerated trial failure rate of {f_rate} (at least " - "{n_failed} out of first {n_ran} trials failed). Checks are triggered both at " - "the end of a optimization and if at least {min_failed} trials have failed." + "{n_failed} out of first {n_ran} trials failed or abandoned). Checks are triggered both at " + "the end of a optimization and if at least {min_failed} trials have either failed, " + "or have been abandoned, potentially automatically due to issues with the trial." ) @@ -850,13 +851,16 @@ def error_if_failure_rate_exceeded(self, force_check: bool = False) -> None: ): return - num_ran_in_scheduler = ( - len(self.experiment.trials) - self._num_preexisting_trials + num_ran_in_scheduler = sum( + 1 + for idx, t in self.experiment.trials.items() + if idx >= self._num_preexisting_trials and t.status.is_terminal ) - failure_rate_exceeded = ( - num_bad_in_scheduler / num_ran_in_scheduler - ) > self.options.tolerated_trial_failure_rate + (num_bad_in_scheduler / num_ran_in_scheduler) + > self.options.tolerated_trial_failure_rate + + ) if failure_rate_exceeded: if self._num_trials_bad_due_to_err > num_bad_in_scheduler / 2: