-
-
Notifications
You must be signed in to change notification settings - Fork 134
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fixes for Adaptive #63
Changes from all commits
7c56b1d
6b35688
507be82
99d0f1f
78a22ff
62e050c
d121180
2329bfe
92eaf4e
9084a35
4776892
ef62f59
a4e007a
c19e4da
5d5fd85
115b0c1
cde3ca4
75f2c6a
66db52d
ea7d56d
a6d31d2
25965c0
ab4363a
604a563
1e0455e
ace37ad
56e2990
914244c
359be59
1441634
0bf53d1
cc2628f
90dd730
9fe2178
18dfe31
d98b141
b303275
13e5dc3
b4877ad
667369e
bf99d29
292b595
627f873
f2b2a92
1f0dc71
a1b102d
c988f1e
619047f
8db65eb
ef16298
3803918
aad58d4
fa1b717
aeea2e5
b93a7c4
0a2e304
1a1fe75
8c32872
9c43f43
a02abc8
ee89f20
4abcea1
0c7425a
5d4552d
8a150c9
ca0c727
ce007df
c23ce7c
7618467
d5e42b3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,5 @@ | ||
#!/usr/bin/env bash | ||
|
||
set -x | ||
|
||
function jobqueue_before_install { | ||
# Install miniconda | ||
./ci/conda_setup.sh | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,22 @@ | ||
from contextlib import contextmanager | ||
from __future__ import absolute_import, division, print_function | ||
|
||
import logging | ||
import math | ||
import shlex | ||
import socket | ||
import subprocess | ||
import sys | ||
import warnings | ||
from collections import OrderedDict | ||
from contextlib import contextmanager | ||
|
||
import dask | ||
import docrep | ||
from distributed import LocalCluster | ||
from distributed.deploy import Cluster | ||
from distributed.utils import (get_ip_interface, ignoring, parse_bytes, tmpfile, | ||
format_bytes) | ||
from distributed.diagnostics.plugin import SchedulerPlugin | ||
from distributed.utils import ( | ||
format_bytes, get_ip_interface, parse_bytes, tmpfile) | ||
|
||
logger = logging.getLogger(__name__) | ||
docstrings = docrep.DocstringProcessor() | ||
|
@@ -28,6 +33,54 @@ | |
""".strip() | ||
|
||
|
||
def _job_id_from_worker_name(name): | ||
''' utility to parse the job ID from the worker name | ||
|
||
template: 'prefix--jobid--suffix' | ||
''' | ||
_, job_id, _ = name.split('--') | ||
return job_id | ||
|
||
|
||
class JobQueuePlugin(SchedulerPlugin): | ||
def __init__(self): | ||
self.pending_jobs = OrderedDict() | ||
self.running_jobs = OrderedDict() | ||
self.finished_jobs = OrderedDict() | ||
self.all_workers = {} | ||
|
||
def add_worker(self, scheduler, worker=None, name=None, **kwargs): | ||
''' Run when a new worker enters the cluster''' | ||
logger.debug("adding worker %s" % worker) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nitpick: generally with logging, you should do: logger.debug("adding worker %s", worker) I think this avoids doing unnecessary formatting work when not logging. |
||
w = scheduler.workers[worker] | ||
job_id = _job_id_from_worker_name(w.name) | ||
logger.debug("job id for new worker: %s" % job_id) | ||
self.all_workers[worker] = (w.name, job_id) | ||
|
||
# if this is the first worker for this job, move job to running | ||
if job_id not in self.running_jobs: | ||
logger.debug("this is a new job") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe add the job_id in this log output? I guess job_id is in the previous logging statement, but sometimes I find it more convenient to have a single logging statement as stand-alone as possible, rather than having to go up a few lines to figure out the information you need. |
||
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | ||
|
||
# add worker to dict of workers in this job | ||
self.running_jobs[job_id][w.name] = w | ||
|
||
def remove_worker(self, scheduler=None, worker=None, **kwargs): | ||
''' Run when a worker leaves the cluster''' | ||
logger.debug("removing worker %s" % worker) | ||
name, job_id = self.all_workers[worker] | ||
logger.debug("removing worker name (%s) and" | ||
"job_id (%s)" % (name, job_id)) | ||
|
||
# remove worker from this job | ||
del self.running_jobs[job_id][name] | ||
|
||
# once there are no more workers, move this job to finished_jobs | ||
if not self.running_jobs[job_id]: | ||
logger.debug("that was the last worker for job %s" % job_id) | ||
self.finished_jobs[job_id] = self.running_jobs.pop(job_id) | ||
|
||
|
||
@docstrings.get_sectionsf('JobQueueCluster') | ||
class JobQueueCluster(Cluster): | ||
""" Base class to launch Dask Clusters for Job queues | ||
|
@@ -87,6 +140,8 @@ class JobQueueCluster(Cluster): | |
submit_command = None | ||
cancel_command = None | ||
scheduler_name = '' | ||
_adaptive_options = { | ||
'worker_key': lambda ws: _job_id_from_worker_name(ws.name)} | ||
|
||
def __init__(self, | ||
name=None, | ||
|
@@ -155,15 +210,17 @@ def __init__(self, | |
|
||
self.local_cluster = LocalCluster(n_workers=0, ip=host, **kwargs) | ||
|
||
# Keep information on process, cores, and memory, for use in subclasses | ||
self.worker_memory = parse_bytes(memory) | ||
|
||
# Keep information on process, threads and memory, for use in | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You probably want to revert the change in this comment |
||
# subclasses | ||
self.worker_memory = parse_bytes(memory) if memory is not None else None | ||
self.worker_processes = processes | ||
self.worker_cores = cores | ||
self.name = name | ||
|
||
self.jobs = dict() | ||
self.n = 0 | ||
# plugin for tracking job status | ||
self._scheduler_plugin = JobQueuePlugin() | ||
self.local_cluster.scheduler.add_plugin(self._scheduler_plugin) | ||
|
||
self._adaptive = None | ||
|
||
self._env_header = '\n'.join(env_extra) | ||
|
@@ -179,47 +236,60 @@ def __init__(self, | |
mem = format_bytes(self.worker_memory / self.worker_processes) | ||
mem = mem.replace(' ', '') | ||
self._command_template += " --memory-limit %s" % mem | ||
self._command_template += " --name %s--${JOB_ID}--" % name | ||
|
||
if name is not None: | ||
self._command_template += " --name %s" % name | ||
self._command_template += "-%(n)d" # Keep %(n) to be replaced later | ||
if death_timeout is not None: | ||
self._command_template += " --death-timeout %s" % death_timeout | ||
if local_directory is not None: | ||
self._command_template += " --local-directory %s" % local_directory | ||
if extra is not None: | ||
self._command_template += extra | ||
|
||
@property | ||
def pending_jobs(self): | ||
""" Jobs pending in the queue """ | ||
return self._scheduler_plugin.pending_jobs | ||
|
||
@property | ||
def running_jobs(self): | ||
""" Jobs with currently active workers """ | ||
return self._scheduler_plugin.running_jobs | ||
|
||
@property | ||
def finished_jobs(self): | ||
""" Jobs that have finished """ | ||
return self._scheduler_plugin.finished_jobs | ||
|
||
@property | ||
def worker_threads(self): | ||
return int(self.worker_cores / self.worker_processes) | ||
|
||
def job_script(self): | ||
""" Construct a job submission script """ | ||
self.n += 1 | ||
template = self._command_template % {'n': self.n} | ||
return self._script_template % {'job_header': self.job_header, | ||
'env_header': self._env_header, | ||
'worker_command': template} | ||
pieces = {'job_header': self.job_header, | ||
'env_header': self._env_header, | ||
'worker_command': self._command_template} | ||
return self._script_template % pieces | ||
|
||
@contextmanager | ||
def job_file(self): | ||
""" Write job submission script to temporary file """ | ||
with tmpfile(extension='sh') as fn: | ||
with open(fn, 'w') as f: | ||
logger.debug("writing job script: \n%s" % self.job_script()) | ||
f.write(self.job_script()) | ||
yield fn | ||
|
||
def start_workers(self, n=1): | ||
""" Start workers and point them to our local scheduler """ | ||
workers = [] | ||
for _ in range(n): | ||
logger.debug('starting %s workers' % n) | ||
num_jobs = math.ceil(n / self.worker_processes) | ||
for _ in range(num_jobs): | ||
with self.job_file() as fn: | ||
out = self._call(shlex.split(self.submit_command) + [fn]) | ||
job = self._job_id_from_submit_output(out.decode()) | ||
self.jobs[self.n] = job | ||
workers.append(self.n) | ||
return workers | ||
logger.debug("started job: %s" % job) | ||
self.pending_jobs[job] = {} | ||
|
||
@property | ||
def scheduler(self): | ||
|
@@ -248,12 +318,12 @@ def _calls(self, cmds): | |
Also logs any stderr information | ||
""" | ||
logger.debug("Submitting the following calls to command line") | ||
procs = [] | ||
for cmd in cmds: | ||
logger.debug(' '.join(cmd)) | ||
procs = [subprocess.Popen(cmd, | ||
stdout=subprocess.PIPE, | ||
stderr=subprocess.PIPE) | ||
for cmd in cmds] | ||
procs.append(subprocess.Popen(cmd, | ||
stdout=subprocess.PIPE, | ||
stderr=subprocess.PIPE)) | ||
|
||
result = [] | ||
for proc in procs: | ||
|
@@ -269,33 +339,60 @@ def _call(self, cmd): | |
|
||
def stop_workers(self, workers): | ||
""" Stop a list of workers""" | ||
logger.debug("Stopping workers: %s" % workers) | ||
if not workers: | ||
return | ||
workers = list(map(int, workers)) | ||
jobs = [self.jobs[w] for w in workers] | ||
self._call([self.cancel_command] + list(jobs)) | ||
jobs = self._stop_pending_jobs() # stop pending jobs too | ||
for w in workers: | ||
with ignoring(KeyError): | ||
del self.jobs[w] | ||
if isinstance(w, dict): | ||
jobs.append(_job_id_from_worker_name(w['name'])) | ||
else: | ||
jobs.append(_job_id_from_worker_name(w.name)) | ||
self.stop_jobs(set(jobs)) | ||
|
||
def stop_jobs(self, jobs): | ||
""" Stop a list of jobs""" | ||
logger.debug("Stopping jobs: %s" % jobs) | ||
if jobs: | ||
jobs = list(jobs) | ||
self._call([self.cancel_command] + list(set(jobs))) | ||
|
||
def scale_up(self, n, **kwargs): | ||
""" Brings total worker count up to ``n`` """ | ||
return self.start_workers(n - len(self.jobs)) | ||
logger.debug("Scaling up to %d workers." % n) | ||
active_and_pending = sum([len(j) for j in self.running_jobs.values()]) | ||
active_and_pending += self.worker_processes * len(self.pending_jobs) | ||
logger.debug("Found %d active/pending workers." % active_and_pending) | ||
self.start_workers(n - active_and_pending) | ||
|
||
def scale_down(self, workers): | ||
''' Close the workers with the given addresses ''' | ||
if isinstance(workers, dict): | ||
names = {v['name'] for v in workers.values()} | ||
job_ids = {name.split('-')[-2] for name in names} | ||
self.stop_workers(job_ids) | ||
logger.debug("Scaling down. Workers: %s" % workers) | ||
worker_states = [] | ||
for w in workers: | ||
try: | ||
# Get the actual WorkerState | ||
worker_states.append(self.scheduler.workers[w]) | ||
except KeyError: | ||
logger.debug('worker %s is already gone' % w) | ||
self.stop_workers(worker_states) | ||
|
||
def __enter__(self): | ||
return self | ||
|
||
def __exit__(self, type, value, traceback): | ||
self.stop_workers(self.jobs) | ||
jobs = self._stop_pending_jobs() | ||
jobs += list(self.running_jobs.keys()) | ||
self.stop_jobs(set(jobs)) | ||
self.local_cluster.__exit__(type, value, traceback) | ||
|
||
def _stop_pending_jobs(self): | ||
jobs = list(self.pending_jobs.keys()) | ||
logger.debug("Stopping pending jobs %s" % jobs) | ||
for job_id in jobs: | ||
del self.pending_jobs[job_id] | ||
return jobs | ||
|
||
def _job_id_from_submit_output(self, out): | ||
raise NotImplementedError('_job_id_from_submit_output must be ' | ||
'implemented when JobQueueCluster is ' | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
from __future__ import absolute_import, division, print_function | ||
|
||
import logging | ||
|
||
import dask | ||
|
@@ -56,8 +58,7 @@ def __init__(self, queue=None, project=None, resource_spec=None, walltime=None, | |
|
||
super(SGECluster, self).__init__(**kwargs) | ||
|
||
header_lines = ['#!/bin/bash'] | ||
|
||
header_lines = ['#!/usr/bin/env bash'] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So you don't have a solution for propagating JOB_ID var in sge script? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good catch. I thought I did but I'll have to sort it out. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I take that back. SGE uses |
||
if self.name is not None: | ||
header_lines.append('#$ -N %(name)s') | ||
if queue is not None: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
I find `finished_jobs` is not such a great name, because those are jobs that have been `qdel`ed. In my mind, `finished_jobs` means the job has finished normally (i.e. was not `qdel`ed). I don't have a very good suggestion for a better name, though — maybe `stopped_jobs` or `canceled_jobs`.