Improve handling of failed evaluations #154

Status: Merged (46 commits, Feb 12, 2024)

Commits (46)
2307a10  Remove unnecessary sim_specs out (AngelFP, Nov 10, 2023)
7852c13  Prefill sim output with NaNs (AngelFP, Nov 10, 2023)
c5bd71c  Merge branch 'main' into feature/failed_trials (AngelFP, Dec 11, 2023)
a6c7142  Implement `TrialStatus` (AngelFP, Dec 11, 2023)
d192de7  Mark failed evaluations (AngelFP, Dec 11, 2023)
2f41ecf  Handle failed trials in Ax Service generators (AngelFP, Dec 11, 2023)
a38cff0  Merge branch 'feature/failed_trials' of https://github.com/optimas-or… (AngelFP, Dec 11, 2023)
78280ab  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Dec 11, 2023)
b734498  Fix bug (AngelFP, Dec 11, 2023)
5ae53a3  Abandon failed trials by default (AngelFP, Dec 11, 2023)
e060250  Handle failed trials in multitask gen (AngelFP, Dec 11, 2023)
0bd1cef  Expose `abandon_failed_trials` (AngelFP, Dec 11, 2023)
6689547  Distinguish completed, failed and evaluated trials (AngelFP, Dec 11, 2023)
8307b19  Fix bug (AngelFP, Dec 12, 2023)
cb6dd4c  Fix bug (AngelFP, Dec 12, 2023)
b59bb3d  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Dec 12, 2023)
b32d8fe  Add tests for failed trials (AngelFP, Dec 12, 2023)
4bbee18  Merge branch 'feature/failed_trials' of https://github.com/optimas-or… (AngelFP, Dec 12, 2023)
e0c7719  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Dec 12, 2023)
86a07f1  Workaround to fix `cwd` being changed by `libE` (AngelFP, Dec 12, 2023)
f026646  Fix test (AngelFP, Dec 12, 2023)
70b98c2  Set trial status in sim function (AngelFP, Dec 12, 2023)
1cb339c  Fail evaluations with NaNs (AngelFP, Dec 12, 2023)
7f7ec41  Update tests (AngelFP, Dec 12, 2023)
6ae8a42  Remove unused import (AngelFP, Dec 12, 2023)
ede5624  Revert change (AngelFP, Dec 12, 2023)
480cfeb  Merge branch 'main' into feature/failed_trials (AngelFP, Dec 12, 2023)
31128fb  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Dec 12, 2023)
f8cc666  Formatting (AngelFP, Dec 12, 2023)
deca2c3  Improve sorting of `trial_status` in history df (AngelFP, Dec 12, 2023)
d08cdf2  Merge branch 'main' into feature/failed_trials (AngelFP, Dec 15, 2023)
1f95c4c  Execute `analysis_func` safely (AngelFP, Dec 15, 2023)
cf4b2b2  Avoid generic exception catch-all (AngelFP, Dec 15, 2023)
6235217  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Dec 15, 2023)
0e86e73  Extend test (AngelFP, Dec 15, 2023)
002b0ba  Merge branch 'feature/failed_trials' of https://github.com/optimas-or… (AngelFP, Dec 15, 2023)
b19de19  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Dec 15, 2023)
272c891  Merge branch 'main' into feature/failed_trials (AngelFP, Jan 24, 2024)
847b7b3  Merge branch 'main' into feature/failed_trials (AngelFP, Jan 25, 2024)
6b29a80  Merge branch 'main' into feature/failed_trials (AngelFP, Jan 26, 2024)
d041560  Enable marking trials as failed after completion (AngelFP, Jan 27, 2024)
7178404  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jan 27, 2024)
fccd950  Update test (AngelFP, Jan 27, 2024)
e2980bf  Merge branch 'feature/failed_trials' of https://github.com/optimas-or… (AngelFP, Jan 27, 2024)
08f1bf7  Merge branch 'main' into feature/failed_trials (AngelFP, Feb 8, 2024)
43b7795  Update test (AngelFP, Feb 8, 2024)
Files changed

optimas/core/__init__.py (2 additions, 1 deletion)

@@ -1,7 +1,7 @@
 from .evaluation import Evaluation
 from .parameter import Parameter, VaryingParameter, TrialParameter, Objective
 from .task import Task
-from .trial import Trial
+from .trial import Trial, TrialStatus


 __all__ = [
@@ -12,4 +12,5 @@
     "Objective",
     "Task",
     "Trial",
+    "TrialStatus",
 ]
optimas/core/trial.py (42 additions, 7 deletions)

@@ -1,13 +1,23 @@
 """Contains the definition of the Trial class."""

 from typing import List, Dict, Optional
+from enum import Enum

 import numpy as np

 from .parameter import VaryingParameter, Objective, Parameter, TrialParameter
 from .evaluation import Evaluation


+class TrialStatus(int, Enum):
+    """Enum of trial status, based on the Ax implementation."""
+
+    CANDIDATE = 0
+    RUNNING = 1
+    COMPLETED = 2
+    FAILED = 3
+
+
 class Trial:
     """Defines a trial to be evaluated.

@@ -67,6 +77,7 @@ def __init__(
             self._mapped_evaluations[par.name] = None
         for ev in evaluations:
             self._mapped_evaluations[ev.parameter.name] = ev
+        self.mark_as(TrialStatus.CANDIDATE)

     @property
     def varying_parameters(self) -> List[VaryingParameter]:
@@ -121,6 +132,36 @@ def custom_parameters(self) -> List[TrialParameter]:
         """Get the list of custom trial parameters."""
         return self._custom_parameters

+    @property
+    def status(self) -> TrialStatus:
+        """Get current trial status."""
+        return self._status
+
+    @property
+    def completed(self) -> bool:
+        """Determine whether the trial has been successfully evaluated."""
+        return self._status == TrialStatus.COMPLETED
+
+    @property
+    def failed(self) -> bool:
+        """Determine whether the trial evaluation has failed."""
+        return self._status == TrialStatus.FAILED
+
+    @property
+    def evaluated(self) -> bool:
+        """Determine whether the trial has been evaluated."""
+        return self.completed or self.failed
+
+    def mark_as(self, status) -> None:
+        """Set trial status.
+
+        Parameters
+        ----------
+        status : int
+            A valid trial status (use ``TrialStatus`` enum).
+        """
+        self._status = status
+
     def complete_evaluation(self, evaluation: Evaluation) -> None:
         """Complete the evaluation of an objective or analyzed parameter.

@@ -134,6 +175,7 @@ def complete_evaluation(self, evaluation: Evaluation) -> None:
         assert evaluated_parameter in self._mapped_evaluations
         if self._mapped_evaluations[evaluated_parameter] is None:
             self._mapped_evaluations[evaluated_parameter] = evaluation
+        self.mark_as(TrialStatus.COMPLETED)

     def parameters_as_dict(self) -> Dict:
         """Get a mapping between names and values of the varying parameters."""
@@ -165,10 +207,3 @@ def analyzed_parameters_as_dict(self) -> Dict:
             ev = self._mapped_evaluations[par.name]
             params[par.name] = (ev.value, ev.sem)
         return params
-
-    def completed(self) -> bool:
-        """Determine whether the trial has been completed."""
-        for par, ev in self._mapped_evaluations.items():
-            if ev is None:
-                return False
-        return True
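
A minimal sketch of how the new status API behaves (not part of the diff; it assumes only what `trial.py` and `__init__.py` define above):

```python
from optimas.core import TrialStatus

# TrialStatus mixes int into Enum, so members compare and sort as plain
# integers; sorting trial_status in the history dataframe relies on this.
assert TrialStatus.CANDIDATE < TrialStatus.RUNNING < TrialStatus.COMPLETED
assert TrialStatus.FAILED == 3

# The member name, not the value, is what travels through libEnsemble's
# "trial_status" field (declared in `evaluators/base.py` below as a
# 10-character string).
assert TrialStatus.FAILED.name == "FAILED"
```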
optimas/evaluators/base.py (8 additions, 3 deletions)

@@ -21,6 +21,9 @@ class Evaluator:
     n_gpus : int, optional
         The number of GPUs that will be made available for each evaluation. By
         default, 0.
+    fail_on_nan : bool, optional
+        Whether to mark an evaluation as failed if the value of any of the
+        objectives is NaN. By default, ``True``.

     """

@@ -29,6 +32,7 @@ def __init__(
         sim_function: Callable,
         n_procs: Optional[int] = None,
         n_gpus: Optional[int] = None,
+        fail_on_nan: Optional[bool] = True,
     ) -> None:
         self.sim_function = sim_function
         # If no resources are specified, use 1 CPU an 0 GPUs.
@@ -44,6 +48,7 @@ def __init__(
             n_gpus = 0
         self._n_procs = n_procs
         self._n_gpus = n_gpus
+        self._fail_on_nan = fail_on_nan
         self._initialized = False

     def get_sim_specs(
@@ -68,14 +73,14 @@ def get_sim_specs(
             "in": [var.name for var in varying_parameters],
             "out": (
                 [(obj.name, obj.dtype) for obj in objectives]
-                # f is the single float output that LibEnsemble minimizes.
                 + [(par.name, par.dtype) for par in analyzed_parameters]
-                # input parameters
                 + [(var.name, var.dtype) for var in varying_parameters]
+                + [("trial_status", str, 10)]
             ),
             "user": {
                 "n_procs": self._n_procs,
                 "n_gpus": self._n_gpus,
+                "fail_on_nan": self._fail_on_nan,
                 "objectives": [obj.name for obj in objectives],
             },
         }
         return sim_specs
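
A usage sketch for the new flag (not part of the diff; it assumes `FunctionEvaluator`, imported in `explorations/base.py` below, forwards `fail_on_nan` to this base class, and that the evaluation function follows the optimas `(input_params, output_params)` convention):

```python
import numpy as np
from optimas.evaluators.function_evaluator import FunctionEvaluator


def eval_func(input_params, output_params):
    """Toy objective that is undefined on half of the domain."""
    x = input_params["x"]
    # With fail_on_nan=True (the default), returning NaN for an objective
    # marks the trial as FAILED instead of feeding NaN to the generator.
    output_params["f"] = np.nan if x < 0 else (x - 2.0) ** 2


# Opt out to recover the previous behaviour, where NaNs are passed through.
evaluator = FunctionEvaluator(function=eval_func, fail_on_nan=False)
```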
optimas/evaluators/multitask_evaluator.py (1 addition, 3 deletions)

@@ -56,10 +56,8 @@ def get_sim_specs(
             self.tasks[0].name: sim_specs_1["user"],
             self.tasks[1].name: sim_specs_2["user"],
         }
-        # Add task name to sim_specs in and out.
-        task_len = max([len(self.tasks[0].name), len(self.tasks[1].name)])
+        # Add task name to sim_specs in.
         sim_specs["in"].append("task")
-        sim_specs["out"].append(("task", str, task_len))
         return sim_specs

     def get_libe_specs(self) -> Dict:
optimas/explorations/base.py (29 additions, 4 deletions)

@@ -15,6 +15,7 @@
 from libensemble.alloc_funcs.start_only_persistent import only_persistent_gens
 from libensemble.executors.mpi_executor import MPIExecutor

+from optimas.core.trial import TrialStatus
 from optimas.generators.base import Generator
 from optimas.evaluators.base import Evaluator
 from optimas.evaluators.function_evaluator import FunctionEvaluator
@@ -128,7 +129,7 @@ def __init__(
     def history(self) -> pd.DataFrame:
         """Get the exploration history."""
         history = convert_to_dataframe(self._libe_history.H)
-        ordered_columns = ["trial_index"]
+        ordered_columns = ["trial_index", "trial_status"]
         ordered_columns += [p.name for p in self.generator.varying_parameters]
         ordered_columns += [p.name for p in self.generator.objectives]
         ordered_columns += [p.name for p in self.generator.analyzed_parameters]
@@ -147,6 +148,13 @@ def run(self, n_evals: Optional[int] = None) -> None:
         run until the number of evaluations reaches `max_evals`.

         """
+        # Store current working directory. It has been observed that sometimes
+        # (especially when using `local_threading`) the working directory
+        # is changed to the exploration directory after the call to `libE`.
+        # As a workaround, the cwd is stored and then set again at the end of
+        # `run`.
+        cwd = os.getcwd()
+

[Review thread anchored on `cwd = os.getcwd()`]
Collaborator: Should we bring this up to the libEnsemble team?
Author (AngelFP): Good point. I opened a new issue Libensemble/libensemble#1244

         # Set exit criteria to maximum number of evaluations.
         remaining_evals = self.max_evals - self._n_evals
         if remaining_evals < 1:
@@ -162,7 +170,7 @@ def run(self, n_evals: Optional[int] = None) -> None:
             exit_criteria["sim_max"] = sim_max

         # Get initial number of generator trials.
-        n_evals_initial = self.generator.n_completed_trials
+        n_evals_initial = self.generator.n_evaluated_trials

         # Create persis_info.
         persis_info = add_unique_random_streams({}, self.sim_workers + 2)
@@ -209,8 +217,11 @@ def run(self, n_evals: Optional[int] = None) -> None:
             self.generator._update(persis_info[1]["generator"])

         # Update number of evaluation in this exploration.
-        n_trials_final = self.generator.n_completed_trials
-        self._n_evals += n_trials_final - n_evals_initial
+        n_evals_final = self.generator.n_evaluated_trials
+        self._n_evals += n_evals_final - n_evals_initial
+
+        # Reset `cwd` to initial value before `libE` was called.
+        os.chdir(cwd)

     def attach_trials(
         self,
@@ -420,10 +431,24 @@ def attach_evaluations(
             self.generator._trial_count + n_evals,
             dtype=int,
         )
+        if "trial_status" not in fields:
+            history_new["trial_status"] = TrialStatus.COMPLETED.name

         # Incorporate new history into generator.
         self.generator.incorporate_history(history_new)

+    def mark_evaluation_as_failed(self, trial_index):
+        """Mark an already evaluated trial as failed.
+
+        Parameters
+        ----------
+        trial_index : int
+            The index of the trial.
+        """
+        self.generator.mark_trial_as_failed(trial_index)
+        i = np.where(self._libe_history.H["trial_index"] == trial_index)[0][0]
+        self._libe_history.H[i]["trial_status"] = TrialStatus.FAILED.name
+
     def _create_executor(self) -> None:
         """Create libEnsemble executor."""
         self.executor = MPIExecutor()
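
A sketch of the new post-hoc workflow (not part of the diff; `exploration` stands for an already-run `Exploration` instance, and the trial index is arbitrary):

```python
# Flag an evaluation that turned out to be bad after the fact.
exploration.mark_evaluation_as_failed(trial_index=7)

# The history dataframe now leads with "trial_index" and "trial_status",
# and statuses are stored by name, so plain string comparison works.
df = exploration.history
n_failed = (df["trial_status"] == "FAILED").sum()
n_completed = (df["trial_status"] == "COMPLETED").sum()
print(f"{n_failed} failed / {n_completed} completed evaluations")
```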
optimas/gen_functions.py (9 additions, 4 deletions)

@@ -13,6 +13,7 @@
 from libensemble.resources.resources import Resources

 from optimas.core import Evaluation
+from optimas.core.trial import TrialStatus


 def persistent_generator(H, persis_info, gen_specs, libE_info):
@@ -99,11 +100,15 @@ def persistent_generator(H, persis_info, gen_specs, libE_info):
         # Update the GP with latest simulation results
         for i in range(n):
             trial_index = int(calc_in["trial_index"][i])
+            trial_status = calc_in["trial_status"][i]
             trial = generator.get_trial(trial_index)
-            for par in objectives + analyzed_parameters:
-                y = calc_in[par.name][i]
-                ev = Evaluation(parameter=par, value=y)
-                trial.complete_evaluation(ev)
+            if trial_status == TrialStatus.FAILED.name:
+                trial.mark_as(TrialStatus.FAILED)
+            else:
+                for par in objectives + analyzed_parameters:
+                    y = calc_in[par.name][i]
+                    ev = Evaluation(parameter=par, value=y)
+                    trial.complete_evaluation(ev)
             # Register trial with unknown SEM
             generator.tell([trial])
         # Set the number of points to generate to that number:
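
The sim-side counterpart is not shown in this file; below is a hypothetical sketch of how an evaluation could fill the `trial_status` field that the generator function above reads back (`report_status` and its arguments are illustrative, not optimas API):

```python
import numpy as np

from optimas.core.trial import TrialStatus


def report_status(libE_output, objective_values, fail_on_nan=True):
    """Hypothetical helper: write a trial's status into the libEnsemble
    output row, mirroring the field declared in ``get_sim_specs``."""
    failed = fail_on_nan and any(
        np.isnan(v) for v in objective_values.values()
    )
    libE_output["trial_status"] = (
        TrialStatus.FAILED.name if failed else TrialStatus.COMPLETED.name
    )
```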
optimas/generators/ax/developer/multitask.py (16 additions, 9 deletions)

@@ -38,6 +38,7 @@
     Parameter,
     Task,
     Trial,
+    TrialStatus,
 )
 from .ax_metric import AxMetric

@@ -225,10 +226,13 @@ def _incorporate_external_data(self, trials: List[Trial]) -> None:
             ax_trial.run()
             # Incorporate observations.
             for trial in trials_i:
-                objective_eval = {}
-                oe = trial.objective_evaluations[0]
-                objective_eval["f"] = (oe.value, oe.sem)
-                ax_trial.run_metadata[trial.arm_name] = objective_eval
+                if trial.status != TrialStatus.FAILED:
+                    objective_eval = {}
+                    oe = trial.objective_evaluations[0]
+                    objective_eval["f"] = (oe.value, oe.sem)
+                    ax_trial.run_metadata[trial.arm_name] = objective_eval
+                else:
+                    ax_trial.mark_arm_abandoned(trial.arm_name)
             # Mark batch trial as completed.
             ax_trial.mark_completed()
             # Keep track of high-fidelity trials.
@@ -245,10 +249,13 @@ def _complete_evaluations(self, trials: List[Trial]) -> None:
                 "External data can only be loaded into generator before "
                 "initialization."
             )
-            objective_eval = {}
-            oe = trial.objective_evaluations[0]
-            objective_eval["f"] = (oe.value, oe.sem)
-            self.current_trial.run_metadata[trial.arm_name] = objective_eval
+            if trial.status != TrialStatus.FAILED:
+                objective_eval = {}
+                oe = trial.objective_evaluations[0]
+                objective_eval["f"] = (oe.value, oe.sem)
+                self.current_trial.run_metadata[trial.arm_name] = objective_eval
+            else:
+                self.current_trial.mark_arm_abandoned(trial.arm_name)
             if trial.trial_type == self.lofi_task.name:
                 self.returned_lofi_trials += 1
                 if self.returned_lofi_trials == self.n_gen_lofi:
@@ -447,7 +454,7 @@ def _save_model_to_file(self) -> None:
         file_path = os.path.join(
             self._model_history_dir,
             "ax_experiment_at_eval_{}.json".format(
-                self._n_completed_trials_last_saved
+                self._n_evaluated_trials_last_saved
            ),
         )
         save_experiment(
optimas/generators/ax/service/ax_client.py (5 additions)

@@ -26,6 +26,9 @@ class AxClientGenerator(AxServiceGenerator):
     analyzed_parameters : list of Parameter, optional
         List of parameters to analyze at each trial, but which are not
         optimization objectives. By default ``None``.
+    abandon_failed_trials : bool, optional
+        Whether failed trials should be abandoned (i.e., not suggested again).
+        By default, ``True``.
     gpu_id : int, optional
         The ID of the GPU in which to run the generator. By default, ``0``.
         This parameter will only have an effect if any ``GenerationStep`` in
@@ -61,6 +64,7 @@ def __init__(
         self,
         ax_client: AxClient,
         analyzed_parameters: Optional[List[Parameter]] = None,
+        abandon_failed_trials: Optional[bool] = True,
         gpu_id: Optional[int] = 0,
         dedicated_resources: Optional[bool] = False,
         save_model: Optional[bool] = True,
@@ -79,6 +83,7 @@ def __init__(
             objectives=objectives,
             analyzed_parameters=analyzed_parameters,
             enforce_n_init=True,
+            abandon_failed_trials=abandon_failed_trials,
             use_cuda=use_cuda,
             gpu_id=gpu_id,
             dedicated_resources=dedicated_resources,
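
A usage sketch for the exposed option (not part of the diff; the import path is taken from the file location above, and the experiment definition uses the standard `AxClient` API):

```python
from ax.service.ax_client import AxClient
from ax.service.utils.instantiation import ObjectiveProperties

from optimas.generators.ax.service.ax_client import AxClientGenerator

# A single-objective AxClient experiment with objective "f".
ax_client = AxClient()
ax_client.create_experiment(
    name="failed_trials_demo",
    parameters=[{"name": "x", "type": "range", "bounds": [-5.0, 5.0]}],
    objectives={"f": ObjectiveProperties(minimize=True)},
)

# With abandon_failed_trials=False, failed trials are not abandoned and
# their parameter sets remain eligible to be suggested again.
gen = AxClientGenerator(ax_client=ax_client, abandon_failed_trials=False)
```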