[Open for review] Store results after each job completion #526

Merged: 14 commits, Jun 9, 2023
80 changes: 44 additions & 36 deletions amlb/benchmark.py
@@ -7,6 +7,7 @@
- run the jobs.
- collect and save results.
"""
import time
from copy import copy
from enum import Enum
from importlib import import_module, invalidate_caches
@@ -24,8 +25,9 @@
from .datautils import read_csv
from .resources import get as rget, config as rconfig, output_dirs as routput_dirs
from .results import ErrorResult, Scoreboard, TaskResult
from .utils import Namespace as ns, OSMonitoring, as_list, datetime_iso, flatten, json_dump, lazy_property, profile, repr_def, \
run_cmd, run_script, signal_handler, str2bool, str_sanitize, system_cores, system_memory_mb, system_volume_mb, touch
from .utils import Namespace as ns, OSMonitoring, as_list, datetime_iso, file_lock, flatten, json_dump, \
lazy_property, profile, repr_def, run_cmd, run_script, signal_handler, str2bool, str_sanitize, \
system_cores, system_memory_mb, system_volume_mb, touch


log = logging.getLogger(__name__)
@@ -209,22 +211,21 @@ def run(self, tasks: Union[str, List[str]] = None, folds: Union[int, List[int]]
results = self._run_jobs(jobs)
log.info(f"Processing results for {self.sid}")
log.debug(results)
if tasks is None:
scoreboard = self._process_results(results)
else:
for task_def in task_defs:
task_results = filter(lambda res: res.result is not None and res.result.task == task_def.name, results)
scoreboard = self._process_results(task_results, task_name=task_def.name)
return scoreboard

if not rconfig().results.incremental_save:
self._process_results(results)
return self._results_summary()
finally:
self.cleanup()

def _create_job_runner(self, jobs):
on_new_result = self._process_results if rconfig().results.incremental_save else None
if self.parallel_jobs == 1:
return SimpleJobRunner(jobs)
return SimpleJobRunner(jobs, on_new_result=on_new_result)
else:
# return ThreadPoolExecutorJobRunner(jobs, self.parallel_jobs)
return MultiThreadingJobRunner(jobs, self.parallel_jobs,
return MultiThreadingJobRunner(jobs,
on_new_result=on_new_result,
parallel_jobs=self.parallel_jobs,
delay_secs=rconfig().job_scheduler.delay_between_jobs,
done_async=True)
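
The heart of the change is wired up here: instead of collecting all results and processing them in one batch after the run, the benchmark now passes an `on_new_result` callback into the job runner whenever `results.incremental_save` is enabled, so each result is scored and saved the moment its job completes. A minimal sketch of the callback pattern, with illustrative names (only `on_new_result` comes from this PR):

```python
from typing import Callable, List, Optional

class MiniRunner:
    """Illustrative stand-in for JobRunner: one callback per finished job."""

    def __init__(self, jobs: List, on_new_result: Optional[Callable] = None):
        self.jobs = jobs
        self.results = []
        self._on_new_result = on_new_result

    def run(self):
        for job in self.jobs:
            result = job()                       # execute the job
            self.results.append(result)
            if self._on_new_result is not None:
                self._on_new_result(result)      # incremental processing hook

# With the hook set, every result is persisted as it arrives;
# without it, the caller processes self.results in one batch at the end.
runner = MiniRunner([lambda: "fold-0", lambda: "fold-1"],
                    on_new_result=lambda res: print("saving", res))
runner.run()
```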

@@ -254,10 +255,6 @@ def on_interrupt(*_):
pass
finally:
results = self.job_runner.results

for res in results:
if res.result is not None and math.isnan(res.result.duration):
res.result.duration = res.duration
return results

def _benchmark_tasks(self):
@@ -327,34 +324,45 @@ def _skip_job(self, task_def, fold):

return False

def _process_results(self, results, task_name=None):
def _process_results(self, results):
if not isinstance(results, list):
results = [results]
scores = list(filter(None, flatten([res.result for res in results])))
if len(scores) == 0:
return None

board = (Scoreboard(scores,
framework_name=self.framework_name,
task_name=task_name,
scores_dir=self.output_dirs.scores) if task_name
else Scoreboard(scores,
framework_name=self.framework_name,
benchmark_name=self.benchmark_name,
scores_dir=self.output_dirs.scores))

if rconfig().results.save:
self._save(board)
for res in results:
if math.isnan(res.result.duration):
res.result.duration = res.duration

log.info("Summing up scores for current run:\n%s",
board.as_printable_data_frame(verbosity=2).dropna(how='all', axis='columns').to_string(index=False))
return board.as_data_frame()
board = Scoreboard(scores, scores_dir=self.output_dirs.scores)
self._save(board)
return board

def _save(self, board):
board.save(append=True)
self._append(board)

def _append(self, board):
Scoreboard.all().append(board).save()
Scoreboard.all(rconfig().output_dir).append(board).save()
self._save_global(board)

def _save_global(self, board):
# Scoreboard.all().append(board).save()
if rconfig().results.global_save:
global_board = Scoreboard.all(rconfig().output_dir, autoload=False)
dest_path = global_board.path
timeout = rconfig().results.global_lock_timeout
try:
with file_lock(dest_path, timeout=timeout):
global_board.load().append(board).save()
except TimeoutError:
log.exception("Failed to acquire the lock on `%s` after %ss: "
"the partial board `%s` could not be appended to `%s`",
dest_path, timeout, board.path, dest_path)
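
Several benchmark processes may append to the same global scoreboard file, so the append is wrapped in `file_lock` (imported from `.utils` above), a context manager that raises `TimeoutError` when the lock cannot be acquired within `timeout` seconds; on timeout the partial board is left out of the global file and the failure is logged rather than aborting the run. The same guarded read-modify-write pattern, sketched with the third-party `filelock` package (an assumed substitute here; amlb ships its own helper):

```python
import logging
from filelock import FileLock, Timeout  # pip install filelock

log = logging.getLogger(__name__)

def append_row_locked(dest_path: str, row: str, timeout: float = 5.0) -> None:
    """Append to a file shared by multiple processes, one writer at a time."""
    lock = FileLock(dest_path + ".lock", timeout=timeout)
    try:
        with lock:  # blocks for up to `timeout` seconds, then raises Timeout
            with open(dest_path, "a") as f:
                f.write(row + "\n")
    except Timeout:
        # mirror the PR's behaviour: log the failure and carry on
        log.exception("Failed to acquire the lock on `%s` after %ss",
                      dest_path, timeout)
```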

def _results_summary(self, scoreboard=None):
board = scoreboard or Scoreboard.all(self.output_dirs.scores)
results = board.as_printable_data_frame(verbosity=2)
log.info("Summing up scores for current run:\n%s",
results.dropna(how='all', axis='columns').to_string(index=False))
return board.as_data_frame()

@lazy_property
def output_dirs(self):
66 changes: 51 additions & 15 deletions amlb/job.py
@@ -14,6 +14,7 @@
import queue
import signal
import threading
from typing import Callable, List, Optional

from .utils import Namespace, Timer, ThreadSafeCounter, InterruptTimeout, is_main_thread, raise_in_thread, signal_handler

@@ -64,7 +65,10 @@ def is_state_transition_ok(cls, old_state: State, new_state: State):

printer = pprint.PrettyPrinter(indent=2, compact=True)

def __init__(self, name="", timeout_secs=None, priority=None, raise_on_failure=False):
def __init__(self, name: str = "",
timeout_secs: int = -1,
priority: Optional[int] = None,
raise_on_failure: bool = False):
"""

:param name:
@@ -195,18 +199,20 @@ class JobRunner:
(State.stopping, [State.stopped]),
(State.stopped, None)
]
END_Q = object()

@classmethod
def is_state_transition_ok(cls, old_state: State, new_state: State):
allowed = next((head for tail, head in cls.state_machine if tail == old_state), None)
return allowed and new_state in allowed

def __init__(self, jobs):
def __init__(self, jobs: List, on_new_result: Optional[Callable] = None):
self.jobs = jobs
self.results = []
self.state = None
self._queue = None
self._last_priority = 0
self._on_new_result = on_new_result
self.set_state(State.created)

def start(self):
@@ -235,7 +241,7 @@ def stop_if_complete(self):
if 0 < len(self.jobs) == len(self.results):
self.stop()

def put(self, job, priority=None):
def put(self, job: Job, priority: Optional[int] = None):
if self.state in [State.stopping, State.stopped]:
return
if priority is None:
@@ -248,7 +254,7 @@ def put(self, job, priority=None):
else:
log.warning("Ignoring job `%s`. Runner state: `%s`", job.name, self.state)

def reschedule(self, job, priority=None):
def reschedule(self, job: Job, priority: Optional[int] = None):
if self.state not in [State.running]:
return
job.reschedule()
@@ -267,6 +273,11 @@ def set_state(self, state: State):
log.exception("Error when handling state change to %s for job runner: %s", state, str(e))
return not skip_default

def _add_result(self, result):
self.results.append(result)
if self._on_new_result is not None:
self._on_new_result(result)

def __iter__(self):
return self

@@ -289,7 +300,7 @@ def _run(self):

def _stop(self):
if self._queue:
self._queue.put((-1, None))
self._queue.put((-1, JobRunner.END_Q))
jobs = self.jobs.copy()
self.jobs.clear()
for job in jobs:
@@ -301,19 +312,19 @@ def _on_state(self, state: State):

class SimpleJobRunner(JobRunner):

def __init__(self, jobs):
super().__init__(jobs)
def __init__(self, jobs: List, on_new_result: Optional[Callable] = None):
super().__init__(jobs, on_new_result=on_new_result)
self._interrupt = threading.Event()

def _run(self):
for job in self:
if job is None or self._interrupt.is_set():
if job is JobRunner.END_Q or self._interrupt.is_set():
break
result = job.start()
if job.state is State.rescheduling:
self.reschedule(job)
else:
self.results.append(result)
self._add_result(result)
job.done()
self.stop_if_complete()
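
Swapping `None` for a dedicated `END_Q` sentinel removes an ambiguity in the queue protocol: `None` could in principle collide with a real queue entry, whereas a module-level `object()` compared with `is` can only ever mean "shut down". The idiom in isolation (stdlib only, names illustrative):

```python
import queue
import threading

END = object()  # unique sentinel: no work item can ever be `is END`

def worker(q: queue.Queue, results: list) -> None:
    while True:
        item = q.get()
        if item is END:        # identity test; never true for real items
            break
        results.append(item)

q, results = queue.Queue(), []
t = threading.Thread(target=worker, args=(q, results))
t.start()
for item in (1, None, 2):      # note: None is a legitimate work item here
    q.put(item)
q.put(END)                     # poison pill: tells the worker to exit
t.join()
assert results == [1, None, 2]
```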

@@ -328,16 +339,32 @@ class QueueingStrategy:
keep_queue_full = 0
enforce_job_priority = 1

def __init__(self, jobs, parallel_jobs=1, done_async=True, delay_secs=0,
def __init__(self, jobs: List,
on_new_result: Optional[Callable] = None,
parallel_jobs: int = 1,
done_async: bool = True,
delay_secs: int = 0,
queueing_strategy: QueueingStrategy = QueueingStrategy.keep_queue_full,
use_daemons=False):
super().__init__(jobs)
use_daemons: bool = False):
super().__init__(jobs, on_new_result=on_new_result)
self.parallel_jobs = parallel_jobs
self._done_async = done_async
self._delay = delay_secs # short sleep between enqueued jobs to make console more readable
self._daemons = use_daemons
self._queueing_strategy = queueing_strategy
self._interrupt = threading.Event()
self._exec = None

def _add_result(self, result):
sup_call = super()._add_result
if self._exec:
self._exec.submit(sup_call, result)
else:
log.warning("Application is submitting a function while the thread executor is not running: executing the function in the calling thread.")
try:
sup_call(result)
except:
pass

def _run(self):
q = queue.Queue()
@@ -349,13 +376,13 @@ def worker():
job = q.get()
available_workers.dec()
try:
if job is None or self._interrupt.is_set():
if job is JobRunner.END_Q or self._interrupt.is_set():
break
result = job.start()
if job.state is State.rescheduling:
self.reschedule(job)
else:
self.results.append(result)
self._add_result(result)
if self._done_async:
job.done()
self.stop_if_complete()
@@ -387,7 +414,7 @@ def worker():
q.maxsize = self.parallel_jobs # resize to ensure that all workers can get a None job
for _ in range(self.parallel_jobs):
try:
q.put_nowait(None) # stopping workers
q.put_nowait(JobRunner.END_Q) # stopping workers
except:
pass
for thread in threads:
@@ -397,8 +424,17 @@ def worker():
job.done()

def _on_state(self, state: State):
if state is State.starting:
self._exec = ThreadPoolExecutor(max_workers=1, thread_name_prefix="job_runner_exec_")
if state is State.stopping:
self._interrupt.set()
if self._exec is not None:
try:
self._exec.shutdown(wait=True)
except:
pass
finally:
self._exec = None
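
Rather than letting every worker thread call `_on_new_result` directly, `MultiThreadingJobRunner` submits each result to a one-worker `ThreadPoolExecutor` created on `starting` and drained on `stopping`, so result processing (scoring and file writes) stays serialized in a single thread however many jobs finish concurrently. A standalone sketch of that design, under illustrative names:

```python
import threading
from concurrent.futures import ThreadPoolExecutor

class SerializedHandler:
    """Funnel calls from many producer threads into one consumer thread."""

    def __init__(self, handler):
        self._handler = handler
        # max_workers=1 => submitted calls run one at a time, in order
        self._exec = ThreadPoolExecutor(max_workers=1,
                                        thread_name_prefix="result_handler_")

    def submit(self, result) -> None:
        self._exec.submit(self._handler, result)

    def close(self) -> None:
        self._exec.shutdown(wait=True)  # drain pending results before exit

handled = []
h = SerializedHandler(handled.append)
threads = [threading.Thread(target=h.submit, args=(i,)) for i in range(8)]
for t in threads:
    t.start()
for t in threads:
    t.join()
h.close()
assert sorted(handled) == list(range(8))  # every result handled exactly once
```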


class MultiProcessingJobRunner(JobRunner):