From 8461e5c60bfe157001b799c2a1059811fbc99af4 Mon Sep 17 00:00:00 2001
From: Sebastian Ament <sebastianament@meta.com>
Date: Wed, 28 Aug 2024 12:30:58 -0700
Subject: [PATCH] Updating failure rate message and accounting (#2723)

Summary:
Pull Request resolved: https://github.com/facebook/Ax/pull/2723

This diff clarifies the failure-rate-exceeded error message to state that "abandoned" trials are also counted in the failure-rate accounting, which helps users diagnose the error on their own.

In addition, the diff changes the denominator of the failure-rate computation to only consider trials with a `terminal` status.

Differential Revision: D61914570
---
 ax/service/scheduler.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/ax/service/scheduler.py b/ax/service/scheduler.py
index 4faf63b3f21..7b643b19a94 100644
--- a/ax/service/scheduler.py
+++ b/ax/service/scheduler.py
@@ -77,8 +77,10 @@
 """
 FAILURE_EXCEEDED_MSG = (
     "Failure rate exceeds the tolerated trial failure rate of {f_rate} (at least "
-    "{n_failed} out of first {n_ran} trials failed). Checks are triggered both at "
-    "the end of a optimization and if at least {min_failed} trials have failed."
+    "{n_failed} out of first {n_ran} trials failed or were abandoned). Checks are "
+    "triggered both at the end of a optimization and if at least {min_failed} trials "
+    "have either failed, or have been abandoned, potentially automatically due to "
+    "issues with the trial."
 )
 
 
@@ -850,10 +852,11 @@ def error_if_failure_rate_exceeded(self, force_check: bool = False) -> None:
         ):
             return
 
-        num_ran_in_scheduler = (
-            len(self.experiment.trials) - self._num_preexisting_trials
+        num_ran_in_scheduler = sum(
+            1
+            for idx, t in self.experiment.trials.items()
+            if idx >= self._num_preexisting_trials and t.status.is_terminal
         )
-
         failure_rate_exceeded = (
             num_bad_in_scheduler / num_ran_in_scheduler
         ) > self.options.tolerated_trial_failure_rate