Fixes for Adaptive #63

Merged Jul 16, 2018 · 70 commits

Commits
7c56b1d
add job ids to dask workers
May 18, 2018
6b35688
pad job id
May 18, 2018
507be82
parse PBS/slurm job ids
May 18, 2018
99d0f1f
track workers individually (sort of)
May 21, 2018
78a22ff
add _adaptive_options
May 21, 2018
62e050c
generalize the parsing of the job id
May 22, 2018
d121180
fix typo
May 22, 2018
2329bfe
changes for review
May 22, 2018
92eaf4e
add pluggin (untested)
May 24, 2018
9084a35
a few fixes + tests
May 24, 2018
4776892
respond to first round of comments
May 29, 2018
ef62f59
fix list addition
May 29, 2018
a4e007a
mark test modules (again)
May 29, 2018
c19e4da
fixes while testing on pbs
May 29, 2018
5d5fd85
remove extra if block
May 29, 2018
115b0c1
use for/else
May 29, 2018
cde3ca4
fix two failing tests
May 30, 2018
75f2c6a
Merge branch 'jobids' of github.com:jhamman/dask-jobqueue into jobids
May 30, 2018
66db52d
respond to review comments
Jun 3, 2018
ea7d56d
fix bug in scale down
Jun 4, 2018
a6d31d2
fix marks
Jun 4, 2018
25965c0
Merge branch 'master' of github.com:dask/dask-jobqueue into jobids
Jun 15, 2018
ab4363a
more docs
Jun 21, 2018
604a563
debugging ci
Jun 21, 2018
1e0455e
more ci
Jun 21, 2018
ace37ad
only stop jobs if there are jobs to stop
Jun 21, 2018
56e2990
refactor remove_worker method
Jun 21, 2018
914244c
debug
Jun 21, 2018
359be59
print debug info
Jun 21, 2018
1441634
longer waits in tests
Jun 21, 2018
0bf53d1
refactor use of jobids, add scheduler plugin
May 18, 2018
cc2628f
debug stop workers
Jun 26, 2018
90dd730
fix tests of stop_workers
Jun 26, 2018
9fe2178
Merge branch 'master' into jobids
mrocklin Jun 26, 2018
18dfe31
cleanup
Jun 26, 2018
d98b141
Merge branch 'master' of github.com:dask/dask-jobqueue into jobids
Jun 26, 2018
b303275
more flexible stop workers
Jun 26, 2018
13e5dc3
Merge branch 'jobids' of github.com:jhamman/dask-jobqueue into jobids
Jun 26, 2018
b4877ad
Merge remote-tracking branch 'origin/jobids' into jobids
Jun 26, 2018
667369e
remove Job class
Jun 27, 2018
bf99d29
Merge branch 'master' of github.com:dask/dask-jobqueue into jobids
Jun 27, 2018
292b595
fix for worker name (again)
Jun 27, 2018
627f873
debug
Jun 27, 2018
f2b2a92
perform a backflip
Jun 27, 2018
1f0dc71
Merge branch 'jobids' of github.com:jhamman/dask-jobqueue into jobids
Jun 27, 2018
a1b102d
Merge branch 'master' of github.com:dask/dask-jobqueue into jobids
Jul 3, 2018
c988f1e
isort after merge conflict resolution
Jul 3, 2018
619047f
more docs stuff
Jul 3, 2018
8db65eb
update worker name template to use bracket delims
Jul 3, 2018
ef16298
add a few more comments
Jul 3, 2018
3803918
roll back changes in tests
Jul 3, 2018
aad58d4
fix slurm tests and missing scheduler plugin after merge conflict res…
Jul 3, 2018
fa1b717
fix threads in test
Jul 3, 2018
aeea2e5
debug on travis
Jul 6, 2018
b93a7c4
simplify tests
Jul 6, 2018
0a2e304
more debugging and more robust sleeps
Jul 7, 2018
1a1fe75
unify basic test
Jul 7, 2018
8c32872
unify adaptive tests
Jul 7, 2018
9c43f43
Merge remote-tracking branch 'origin/jobids' into jobids
Jul 9, 2018
a02abc8
Merge branch 'master' of github.com:dask/dask-jobqueue into jobids
Jul 9, 2018
ee89f20
debug statements and some nice fixups
Jul 9, 2018
4abcea1
future div
Jul 13, 2018
0c7425a
add logging stuff
Jul 13, 2018
5d4552d
-s for pytest
Jul 13, 2018
8a150c9
use --job_id-- for name
mrocklin Jul 13, 2018
ca0c727
fix memory in sge tests
mrocklin Jul 13, 2018
ce007df
remove pending jobs when scaling down
Jul 14, 2018
c23ce7c
remove pending jobs
Jul 14, 2018
7618467
cleanup after lots of debugging
Jul 15, 2018
d5e42b3
additional cleanup
Jul 16, 2018
2 changes: 0 additions & 2 deletions ci/none.sh
@@ -1,7 +1,5 @@
#!/usr/bin/env bash

set -x

function jobqueue_before_install {
# Install miniconda
./ci/conda_setup.sh
2 changes: 0 additions & 2 deletions ci/pbs.sh
@@ -1,7 +1,5 @@
#!/usr/bin/env bash

set -x

function jobqueue_before_install {
docker version
docker-compose version
2 changes: 0 additions & 2 deletions ci/sge.sh
@@ -1,7 +1,5 @@
#!/usr/bin/env bash

set -x

function jobqueue_before_install {
docker version
docker-compose version
2 changes: 0 additions & 2 deletions ci/slurm.sh
@@ -1,7 +1,5 @@
#!/usr/bin/env bash

set -x

function jobqueue_before_install {
docker version
docker-compose version
3 changes: 1 addition & 2 deletions dask_jobqueue/config.py
@@ -1,11 +1,10 @@
from __future__ import print_function, division, absolute_import
from __future__ import absolute_import, division, print_function

import os

import dask
import yaml


fn = os.path.join(os.path.dirname(__file__), 'jobqueue.yaml')
dask.config.ensure_file(source=fn)

150 changes: 115 additions & 35 deletions dask_jobqueue/core.py
@@ -1,19 +1,23 @@
from contextlib import contextmanager
import logging
import math
import shlex
import socket
import subprocess
import sys
import warnings
from collections import OrderedDict
from contextlib import contextmanager

import dask
import docrep
from distributed import LocalCluster
from distributed.deploy import Cluster
from distributed.utils import (get_ip_interface, ignoring, parse_bytes, tmpfile,
format_bytes)
from distributed.diagnostics.plugin import SchedulerPlugin
from distributed.utils import (
format_bytes, get_ip_interface, parse_bytes, tmpfile)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
docstrings = docrep.DocstringProcessor()


@@ -28,6 +32,46 @@
""".strip()


def _job_id_from_worker_name(name):
''' utility to parse the job ID from the worker name

template: 'prefix[jobid]suffix'
'''
return name.split('[', 1)[1].split(']')[0]
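
A minimal sketch of the round trip this helper expects, assuming a worker name rendered from the {NAME}[{JOB_ID}] template introduced further down (the job id value is made up):

name = 'dask-worker[1234567]'                      # e.g. from "--name dask-worker[${JOB_ID}]"
assert _job_id_from_worker_name(name) == '1234567'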


class JobQueuePlugin(SchedulerPlugin):
def __init__(self):
self.pending_jobs = OrderedDict()
self.running_jobs = OrderedDict()
self.finished_jobs = OrderedDict()
Member:
I find finished_jobs is not such a great name because those are jobs that have been qdeled. In my mind finished_jobs means the job has finished normally (i.e. was not qdeled). I don't have a very good suggestion for a better name though, maybe stopped_jobs or canceled_jobs.

self.all_workers = {}

def add_worker(self, scheduler, worker=None, name=None, **kwargs):
''' Run when a new worker enters the cluster'''
w = scheduler.workers[worker]
job_id = _job_id_from_worker_name(w.name)
self.all_workers[worker] = (w.name, job_id)

# if this is the first worker for this job, move job to running
if job_id not in self.running_jobs:
self.running_jobs[job_id] = self.pending_jobs.pop(job_id)

# add worker to dict of workers in this job
self.running_jobs[job_id][w.name] = w

def remove_worker(self, scheduler=None, worker=None, **kwargs):
''' Run when a worker leaves the cluster'''
name, job_id = self.all_workers[worker]

# remove worker from this job
del self.running_jobs[job_id][name]

# once there are no more workers, move this job to finished_jobs
if not self.running_jobs[job_id]:
self.finished_jobs[job_id] = self.running_jobs.pop(job_id)


@docstrings.get_sectionsf('JobQueueCluster')
class JobQueueCluster(Cluster):
""" Base class to launch Dask Clusters for Job queues
@@ -87,6 +131,8 @@ class JobQueueCluster(Cluster):
submit_command = None
cancel_command = None
scheduler_name = ''
_adaptive_options = {
'worker_key': lambda ws: _job_id_from_worker_name(ws.name)}
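
A hypothetical illustration of what this worker_key presumably buys Adaptive: workers whose names carry the same job id collapse to a single key, so scale-down decisions operate on whole jobs rather than individual workers (the WorkerState stand-in and the names are made up):

class _FakeWorkerState(object):         # stand-in for distributed's WorkerState
    def __init__(self, name):
        self.name = name

worker_key = lambda ws: _job_id_from_worker_name(ws.name)
workers = [_FakeWorkerState('dask-worker[7]'),
           _FakeWorkerState('dask-worker[7]'),
           _FakeWorkerState('dask-worker[8]')]
groups = {}
for ws in workers:
    groups.setdefault(worker_key(ws), []).append(ws)
assert sorted(groups) == ['7', '8']     # one group per job, not per worker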

def __init__(self,
name=None,
@@ -155,15 +201,17 @@ def __init__(self,

self.local_cluster = LocalCluster(n_workers=0, ip=host, **kwargs)

# Keep information on process, cores, and memory, for use in subclasses
self.worker_memory = parse_bytes(memory)

# Keep information on process, threads and memory, for use in
Member:
You probably want to revert the change in this comment

# subclasses
self.worker_memory = parse_bytes(memory) if memory is not None else None
self.worker_processes = processes
self.worker_cores = cores
self.name = name

self.jobs = dict()
Member Author:
self.jobs was a mapping from n-->job_id. However, we were not really using it and it often was not cleaned up when a job ended (so I've removed it).

Member:
I believe we can get rid of self.n and thus self.jobs as it is right now. However, for adaptive to work correctly, I believe we should keep track of all submitted jobs and their statuses. If not, don't we risk continuously submitting new jobs that just sit in the scheduler's queue?

I'm in favour of keeping a dict mapping job_id --> job-status, e.g. as @mrocklin proposed in #11: 'pending', 'running', 'finished' or equivalent. This way, the scale_up method can take that into account.

A perhaps simpler solution is to keep track only of the number of workers that are pending or running, and use that number in scale_up:
return self.start_workers(n - number_of_pending_or_running_worker)
But it seems difficult to deal with finished, running, and pending workers this way.

Member Author:
@guillaumeeb - I think we're in agreement here.

we should keep track of all submitted jobs and their statuses.

This would be nice but it might be somewhat difficult to do. We have three/four states that a job might be in:

  • Pending - we may be able to combine some form of qstat job_identifier with a dictionary of submitted jobs self.jobs above
  • Running - it is straight forward to determine which workers are attached to the scheduler
  • Finished - jobs can exit normally or be killed (e.g. exceeded wall time). When JobQueue culls a worker, it's easy to remove that worker from the list of jobs. However, when the queuing system kills a worker, we would need a way to remove that job from the list of running jobs.

Generally, I think any use of qstat is going to be a bit ugly just because repeated queries of the queueing system tend to be quite expensive. For example:

$ time qstat 9595055
Job id            Name             User              Time Use S Queue
----------------  ---------------- ----------------  -------- - -----
9595055.chadmin1  dask             jhamman           00:07:22 R premium

real	0m1.030s
user	0m0.012s
sys	0m0.000s

So we probably don't want to do this very often. Do others have ideas as to how one would track the status of these jobs in a tractable way?

Member:
We could add a plugin to the scheduler that watched for when workers started and compared them against a set of known pending workers: http://distributed.readthedocs.io/en/latest/plugins.html

Something like the following:

from distributed.diagnostics.plugin import SchedulerPlugin

class JobQueuePlugin(SchedulerPlugin):
    def add_worker(self, scheduler, worker=None, name=None, **kwargs):
        job_id = parse(name)
        pending_jobs.remove(job_id)

scheduler.add_plugin(JobQueuePlugin())

Member Author:
This sounds like what we need. I can implement this as part of this PR.

Member:
Sounds good! We don't seem to need a watcher on stopped workers this way; we know about started workers through the scheduler.

Member Author:
See 92eaf4e for an initial (untested) implementation using the scheduler plugin approach.

self.n = 0
# plugin for tracking job status
self._scheduler_plugin = JobQueuePlugin()
self.local_cluster.scheduler.add_plugin(self._scheduler_plugin)

self._adaptive = None

self._env_header = '\n'.join(env_extra)
@@ -181,26 +229,42 @@ def __init__(self,
self._command_template += " --memory-limit %s" % mem

if name is not None:
self._command_template += " --name %s" % name
self._command_template += "-%(n)d" # Keep %(n) to be replaced later
# worker names follow this template: {NAME}[{JOB_ID}]
# {JOB_ID} is an environment variable defined by the individual
# job scripts/schedulers
self._command_template += " --name %s[${JOB_ID}]" % name
if death_timeout is not None:
self._command_template += " --death-timeout %s" % death_timeout
if local_directory is not None:
self._command_template += " --local-directory %s" % local_directory
if extra is not None:
self._command_template += extra

@property
def pending_jobs(self):
""" Jobs pending in the queue """
return self._scheduler_plugin.pending_jobs

@property
def running_jobs(self):
""" Jobs with currenly active workers """
return self._scheduler_plugin.running_jobs

@property
def finished_jobs(self):
""" Jobs that have finished """
return self._scheduler_plugin.finished_jobs
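
A hypothetical usage sketch of these status properties from the user's side (cluster arguments, job ids and counts are illustrative):

cluster = PBSCluster(cores=36, memory='100GB', processes=9)
cluster.start_workers(18)     # ceil(18 / 9) == 2 jobs submitted
cluster.pending_jobs          # e.g. OrderedDict([('1234567', {}), ('1234568', {})])
# ...once the queue runs the jobs and their workers connect...
cluster.running_jobs          # job id -> {worker name: WorkerState}
cluster.finished_jobs         # jobs whose workers have all gone away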

@property
def worker_threads(self):
return int(self.worker_cores / self.worker_processes)

def job_script(self):
""" Construct a job submission script """
self.n += 1
template = self._command_template % {'n': self.n}
return self._script_template % {'job_header': self.job_header,
'env_header': self._env_header,
'worker_command': template}
pieces = {'job_header': self.job_header,
'env_header': self._env_header,
'worker_command': self._command_template}
return self._script_template % pieces

@contextmanager
def job_file(self):
@@ -212,14 +276,12 @@ def job_file(self):

def start_workers(self, n=1):
""" Start workers and point them to our local scheduler """
workers = []
for _ in range(n):
num_jobs = math.ceil(n / self.worker_processes)
Member:
Probably need from __future__ import division at the top of this file.

for _ in range(num_jobs):
Member Author:
This is a breaking change I want to make sure everyone is aware of. The current behavior for a hypothetical setup that includes 10 workers per job would be:

cluster.start_workers(1)

...and get 1 job and 10 workers.

I'd like to change this so that start_workers(n) gives us n workers and as many jobs as needed to make that happen.

Member:
Historically start_workers was a semi-convention between a few projects. This has decayed, so I have no strong thoughts here. I do think that we need to be consistent on scale though, which seems a bit more standard today.

Member:
Will this really help adaptive? Wouldn't there still be a problem with starting workers in a grouped manner?

With your example, calling cluster.start_workers(1) will still lead to 1 job and 10 workers!

But this may be handled well by adaptive, I don't know. In that case, maybe this breaking change isn't needed?
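
For concreteness, a worked example of the proposed semantics (the numbers are made up): with 10 worker processes per job, the old behaviour turned start_workers(25) into 25 jobs (250 workers), whereas the new behaviour submits only as many jobs as are needed to cover 25 workers:

import math

worker_processes = 10
num_jobs = math.ceil(25 / worker_processes)   # 3 jobs, i.e. up to 30 workers
                                              # (relies on true division, hence the
                                              # __future__ division note above)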

with self.job_file() as fn:
out = self._call(shlex.split(self.submit_command) + [fn])
job = self._job_id_from_submit_output(out.decode())
self.jobs[self.n] = job
workers.append(self.n)
return workers
self._scheduler_plugin.pending_jobs[job] = {}

@property
def scheduler(self):
@@ -248,12 +310,12 @@ def _calls(self, cmds):
Also logs any stderr information
"""
logger.debug("Submitting the following calls to command line")
procs = []
for cmd in cmds:
logger.debug(' '.join(cmd))
procs = [subprocess.Popen(cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
for cmd in cmds]
procs.append(subprocess.Popen(cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE))

result = []
for proc in procs:
@@ -269,31 +331,49 @@ def _call(self, cmd):

def stop_workers(self, workers):
""" Stop a list of workers"""
logger.debug("Stopping workers: %s" % workers)
if not workers:
return
workers = list(map(int, workers))
jobs = [self.jobs[w] for w in workers]
self._call([self.cancel_command] + list(jobs))
jobs = []
for w in workers:
with ignoring(KeyError):
del self.jobs[w]
if isinstance(w, dict):
jobs.append(_job_id_from_worker_name(w['name']))
else:
jobs.append(_job_id_from_worker_name(w.name))
self.stop_jobs(set(jobs))

def stop_jobs(self, jobs):
""" Stop a list of jobs"""
logger.debug("Stopping jobs: %s" % jobs)
if jobs:
self._call([self.cancel_command] + list(set(jobs)))
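
A hypothetical sketch of how stop_workers collapses a worker list into a single cancel call (the worker names are made up, and qdel is just PBS's cancel_command used as an example):

workers = [{'name': 'dask-worker[100]'},      # dict form; WorkerState objects also work
           {'name': 'dask-worker[100]'},
           {'name': 'dask-worker[101]'}]
jobs = [_job_id_from_worker_name(w['name']) for w in workers]
assert set(jobs) == {'100', '101'}
# -> one call along the lines of: qdel 100 101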

def scale_up(self, n, **kwargs):
""" Brings total worker count up to ``n`` """
return self.start_workers(n - len(self.jobs))
logger.debug("Scaling up to %d workers." % n)
active_and_pending = sum([len(j) for j in self.running_jobs.values()])
active_and_pending += self.worker_processes * len(self.pending_jobs)
logger.debug("Found %d active/pending workers." % active_and_pending)
self.start_workers(n - active_and_pending)
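
An illustrative trace of the arithmetic above (job ids, worker counts and processes-per-job are made up):

running_jobs = {'100': {'w1': None, 'w2': None, 'w3': None, 'w4': None},
                '101': {'w5': None, 'w6': None, 'w7': None, 'w8': None}}
pending_jobs = {'102': {}}
worker_processes = 4

active_and_pending = sum(len(j) for j in running_jobs.values())   # 8 connected workers
active_and_pending += worker_processes * len(pending_jobs)        # plus 4 expected = 12
# scale_up(16) would therefore call start_workers(16 - 12), i.e. ask for 4 more workers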

def scale_down(self, workers):
''' Close the workers with the given addresses '''
if isinstance(workers, dict):
names = {v['name'] for v in workers.values()}
job_ids = {name.split('-')[-2] for name in names}
self.stop_workers(job_ids)
logger.debug("Scaling down. Workers: %s" % workers)
worker_states = []
for w in workers:
try:
# Get the actual WorkerState
worker_states.append(self.scheduler.workers[w])
except KeyError:
logger.debug('worker %s is already gone' % w)
self.stop_workers(worker_states)

def __enter__(self):
return self

def __exit__(self, type, value, traceback):
self.stop_workers(self.jobs)
jobs = list(self.pending_jobs.keys()) + list(self.running_jobs.keys())
self.stop_jobs(set(jobs))
self.local_cluster.__exit__(type, value, traceback)
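
A hypothetical usage note: used as a context manager, the cluster now cancels everything it submitted on exit (the arguments are illustrative):

with PBSCluster(cores=36, memory='100GB') as cluster:
    cluster.start_workers(8)
    ...  # attach a Client and do work
# on exit: stop_jobs(pending + running), then the backing LocalCluster is shut down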

def _job_id_from_submit_output(self, out):
2 changes: 1 addition & 1 deletion dask_jobqueue/moab.py
@@ -30,7 +30,7 @@ class MoabCluster(PBSCluster):
memory='16G', resource_spec='96G',
job_extra=['-d /home/First.Last', '-M none'],
local_directory=os.getenv('TMPDIR', '/tmp'))
>>> cluster.start_workers(10) # this may take a few seconds to launch
>>> cluster.start_workers(10) # submit enough jobs to deploy 10 workers

>>> from dask.distributed import Client
>>> client = Client(cluster)
6 changes: 5 additions & 1 deletion dask_jobqueue/pbs.py
@@ -75,7 +75,10 @@ def __init__(self, queue=None, project=None, resource_spec=None, walltime=None,
# Instantiate args and parameters from parent abstract class
super(PBSCluster, self).__init__(**kwargs)

header_lines = []
# Try to find a project name from environment variable
project = project or os.environ.get('PBS_ACCOUNT')

header_lines = ['#!/usr/bin/env bash']
# PBS header build
if self.name is not None:
header_lines.append('#PBS -N %s' % self.name)
@@ -95,6 +98,7 @@ def __init__(self, queue=None, project=None, resource_spec=None, walltime=None,
if walltime is not None:
header_lines.append('#PBS -l walltime=%s' % walltime)
header_lines.extend(['#PBS %s' % arg for arg in job_extra])
header_lines.append('JOB_ID=${PBS_JOBID%.*}')

# Declare class attribute that shall be overriden
self.job_header = '\n'.join(header_lines)
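
The new JOB_ID line relies on shell suffix stripping; a rough Python equivalent of what ${PBS_JOBID%.*} does (the job id value is illustrative):

pbs_jobid = '1234567.chadmin1'          # what PBS exports
job_id = pbs_jobid.rsplit('.', 1)[0]    # '1234567', matching ${PBS_JOBID%.*}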
5 changes: 3 additions & 2 deletions dask_jobqueue/sge.py
@@ -1,3 +1,5 @@
from __future__ import absolute_import, division, print_function

import logging

import dask
@@ -56,8 +58,7 @@ def __init__(self, queue=None, project=None, resource_spec=None, walltime=None,

super(SGECluster, self).__init__(**kwargs)

header_lines = ['#!/bin/bash']

header_lines = ['#!/usr/bin/env bash']
Member:
So you don't have a solution for propagating the JOB_ID variable in the SGE script?

Member Author:
Good catch. I thought I did, but I'll have to sort it out.

Member Author:
I take that back. SGE uses JOB_ID for its environment variable, so it's already present.

if self.name is not None:
header_lines.append('#$ -N %(name)s')
if queue is not None:
6 changes: 4 additions & 2 deletions dask_jobqueue/slurm.py
@@ -51,7 +51,7 @@ class SLURMCluster(JobQueueCluster):
>>> cluster.adapt()
""", 4)

#Override class variables
# Override class variables
submit_command = 'sbatch --parsable'
cancel_command = 'scancel'
scheduler_name = 'slurm'
@@ -74,7 +74,7 @@ def __init__(self, queue=None, project=None, walltime=None,
super(SLURMCluster, self).__init__(**kwargs)

# Always ask for only one task
header_lines = []
header_lines = ['#!/usr/bin/env bash']
# SLURM header build
if self.name is not None:
header_lines.append('#SBATCH -J %s' % self.name)
@@ -100,6 +100,8 @@ def __init__(self, queue=None, project=None, walltime=None,
header_lines.append('#SBATCH -t %s' % walltime)
header_lines.extend(['#SBATCH %s' % arg for arg in job_extra])

header_lines.append('JOB_ID=${SLURM_JOB_ID%;*}')

# Declare class attribute that shall be overriden
self.job_header = '\n'.join(header_lines)

2 changes: 2 additions & 0 deletions dask_jobqueue/tests/__init__.py
@@ -0,0 +1,2 @@

QUEUE_WAIT = 60 # seconds
Member:
It's great to have a constant that is used consistently in the tests!

Is there a good reason to leave this at 60s? If not, a smaller number like 15s (I think that was the number before) would be good.

5 changes: 0 additions & 5 deletions dask_jobqueue/tests/test_jobqueue_core.py
@@ -3,11 +3,6 @@
from dask_jobqueue import JobQueueCluster


def test_jq_core_placeholder():
# to test that CI is working
pass


def test_errors():
with pytest.raises(NotImplementedError) as info:
JobQueueCluster(cores=4)