Basic error checks and exit zero on success (#850)
The `mlos_bench` CLI wrapper currently exits non-zero even on success.

This PR adds some basic sanity checks and makes sure we exit 0 when the
process looks roughly OK.

Further work is tracked in #523.
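
For reference, the fixed behavior can be checked end-to-end by inspecting the process exit code. A minimal sketch (assuming `mlos_bench` is installed on the `PATH`; `config.jsonc` is a hypothetical placeholder for a real benchmark config):

```python
import subprocess

# Run the CLI wrapper and confirm it now exits 0 on a successful run.
result = subprocess.run(
    ["mlos_bench", "--config", "config.jsonc"],  # hypothetical config file
    capture_output=True,
    text=True,
    check=False,
)
assert result.returncode == 0, f"mlos_bench exited non-zero:\n{result.stderr}"
```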

---------

Co-authored-by: Sergiy Matusevych <sergiym@microsoft.com>
Co-authored-by: Sergiy Matusevych <sergiy.matusevych@gmail.com>
3 people authored Aug 20, 2024
1 parent 85fdec3 commit e514908
Showing 4 changed files with 74 additions and 4 deletions.
55 changes: 53 additions & 2 deletions mlos_bench/mlos_bench/run.py
@@ -12,29 +12,80 @@
"""

import logging
import sys
from typing import Dict, List, Optional, Tuple

import numpy as np

from mlos_bench.launcher import Launcher
from mlos_bench.tunables.tunable_groups import TunableGroups

_LOG = logging.getLogger(__name__)


def _sanity_check_results(launcher: Launcher) -> None:
"""Do some sanity checking on the results and throw an exception if it looks like
something went wrong.
"""
basic_err_msg = "Check configuration, scripts, and logs for details."

# Check if the scheduler has any trials.
if not launcher.scheduler.trial_count:
raise RuntimeError(f"No trials were run. {basic_err_msg}")

# Check if the scheduler ran the expected number of trials.
expected_trial_count = min(
launcher.scheduler.max_trials if launcher.scheduler.max_trials > 0 else np.inf,
launcher.scheduler.trial_config_repeat_count * launcher.optimizer.max_suggestions,
)
if launcher.scheduler.trial_count < expected_trial_count:
raise RuntimeError(
f"Expected {expected_trial_count} trials, "
f"but only {launcher.scheduler.trial_count} were run. {basic_err_msg}"
)

# Check to see if "too many" trials seem to have failed (#523).
unsuccessful_trials = [t for t in launcher.scheduler.ran_trials if not t.status.is_succeeded()]
if len(unsuccessful_trials) > 0.2 * launcher.scheduler.trial_count:
raise RuntimeWarning(
"Too many trials failed: "
f"{len(unsuccessful_trials)} out of {launcher.scheduler.trial_count}. "
f"{basic_err_msg}"
)


def _main(
argv: Optional[List[str]] = None,
) -> Tuple[Optional[Dict[str, float]], Optional[TunableGroups]]:

launcher = Launcher("mlos_bench", "Systems autotuning and benchmarking tool", argv=argv)

with launcher.scheduler as scheduler_context:
scheduler_context.start()
scheduler_context.teardown()

_sanity_check_results(launcher)

(score, _config) = result = launcher.scheduler.get_best_observation()
# NOTE: This log line is used in test_launch_main_app_* unit tests:
_LOG.info("Final score: %s", score)
return result


def _shell_main(
argv: Optional[List[str]] = None,
) -> int:
(best_score, best_config) = _main(argv)
# Exit zero if it looks like the overall operation was successful.
# TODO: Improve this sanity check to be more robust.
if (
best_score
and best_config
and all(isinstance(score_value, float) for score_value in best_score.values())
):
return 0
else:
raise ValueError(f"Unexpected result: {best_score=}, {best_config=}")


if __name__ == "__main__":
_main()
sys.exit(_shell_main())
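
To make the thresholds in `_sanity_check_results()` concrete, here is a worked example of the expected-trial-count and failure-rate math (a sketch with assumed numbers, not values taken from this PR):

```python
import numpy as np

# Assumed settings: no trial limit, 3 repeats per config, 10 optimizer suggestions.
max_trials = -1  # <= 0 is treated as "no limit"
trial_config_repeat_count = 3
max_suggestions = 10

expected_trial_count = min(
    max_trials if max_trials > 0 else np.inf,
    trial_config_repeat_count * max_suggestions,
)
assert expected_trial_count == 30

# The failure-rate check tolerates up to 20% unsuccessful trials,
# so 7 failures out of 30 would raise the RuntimeWarning above.
unsuccessful_trial_count = 7
assert unsuccessful_trial_count > 0.2 * expected_trial_count  # 7 > 6
```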
14 changes: 13 additions & 1 deletion mlos_bench/mlos_bench/schedulers/base_scheduler.py
@@ -9,7 +9,7 @@
from abc import ABCMeta, abstractmethod
from datetime import datetime
from types import TracebackType
from typing import Any, Dict, Optional, Tuple, Type
from typing import Any, Dict, List, Optional, Tuple, Type

from pytz import UTC
from typing_extensions import Literal
@@ -87,6 +87,7 @@ def __init__( # pylint: disable=too-many-arguments
self.storage = storage
self._root_env_config = root_env_config
self._last_trial_id = -1
self._ran_trials: List[Storage.Trial] = []

_LOG.debug("Scheduler instantiated: %s :: %s", self, config)

@@ -113,6 +114,11 @@ def trial_config_repeat_count(self) -> int:
"""Gets the number of trials to run for a given config."""
return self._trial_config_repeat_count

@property
def trial_count(self) -> int:
"""Gets the current number of trials run for the experiment."""
return self._trial_count

@property
def max_trials(self) -> int:
"""Gets the maximum number of trials to run for a given experiment, or -1 for no
@@ -302,4 +308,10 @@ def run_trial(self, trial: Storage.Trial) -> None:
"""
assert self.experiment is not None
self._trial_count += 1
self._ran_trials.append(trial)
_LOG.info("QUEUE: Execute trial # %d/%d :: %s", self._trial_count, self._max_trials, trial)

@property
def ran_trials(self) -> List[Storage.Trial]:
"""Get the list of trials that were run."""
return self._ran_trials
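
The new `trial_count` and `ran_trials` accessors are what `_sanity_check_results()` in `run.py` consumes. A small sketch of the same usage pattern (the `summarize_trials` helper is hypothetical, not part of this PR):

```python
from mlos_bench.schedulers.base_scheduler import Scheduler


def summarize_trials(scheduler: Scheduler) -> None:
    """Report how many of a scheduler's trials were unsuccessful, using the
    trial_count and ran_trials accessors added in this commit."""
    failed = [t for t in scheduler.ran_trials if not t.status.is_succeeded()]
    print(f"{len(failed)} of {scheduler.trial_count} trials were unsuccessful")
```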
7 changes: 7 additions & 0 deletions mlos_bench/mlos_bench/storage/base_storage.py
@@ -391,6 +391,7 @@ def __init__( # pylint: disable=too-many-arguments
self._tunable_config_id = tunable_config_id
self._opt_targets = opt_targets
self._config = config or {}
self._status = Status.UNKNOWN

def __repr__(self) -> str:
return f"{self._experiment_id}:{self._trial_id}:{self._tunable_config_id}"
@@ -435,6 +436,11 @@ def config(self, global_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
config["trial_id"] = self._trial_id
return config

@property
def status(self) -> Status:
"""Get the status of the current trial."""
return self._status

@abstractmethod
def update(
self,
@@ -471,6 +477,7 @@ def update(
opt_targets.difference(metrics.keys()),
)
# raise ValueError()
self._status = status
return metrics

@abstractmethod
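
With the new `_status` field, a trial starts out as `Status.UNKNOWN` and only records a real status once `update()` is called. A rough illustration (the `finish_trial` helper is hypothetical, and assumes `update()` takes `(status, timestamp, metrics)` as in this file):

```python
from datetime import datetime

from pytz import UTC

from mlos_bench.environments.status import Status
from mlos_bench.storage.base_storage import Storage


def finish_trial(trial: Storage.Trial) -> None:
    """Illustrate the trial status lifecycle added in this commit."""
    assert trial.status == Status.UNKNOWN  # no update() has happened yet
    trial.update(Status.SUCCEEDED, datetime.now(UTC), metrics={"score": 0.9})
    assert trial.status.is_succeeded()
```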
2 changes: 1 addition & 1 deletion mlos_bench/pyproject.toml
@@ -41,7 +41,7 @@ dynamic = [
]

[project.scripts]
mlos_bench = "mlos_bench.run:_main"
mlos_bench = "mlos_bench.run:_shell_main"

[project.urls]
Documentation = "https://microsoft.github.io/MLOS/source_tree_docs/mlos_bench/"
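
Pointing the console script at `_shell_main` matters because setuptools-generated entry-point wrappers pass the function's return value to `sys.exit()`. Roughly, the generated `mlos_bench` script behaves like this sketch (not code from this repo):

```python
import sys

from mlos_bench.run import _shell_main

if __name__ == "__main__":
    # The int returned by _shell_main() becomes the process exit code.
    sys.exit(_shell_main())
```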