From 062f8e5462568dc71c749dee5c1a9a6ad92d4f87 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Thu, 4 Jan 2018 18:49:38 +0000 Subject: [PATCH 01/21] made Control.py more flexible, moved report memory to ini file --- CGATPipelines/Pipeline/Control.py | 225 +++++++++++++++++++++-- CGATPipelines/Pipeline/__init__.py | 4 +- CGATPipelines/configuration/pipeline.ini | 2 + 3 files changed, 217 insertions(+), 14 deletions(-) diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index fabb82a4..e785de16 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -27,6 +27,9 @@ import tempfile import time import io +import glob +import fnmatch +import importlib from multiprocessing.pool import ThreadPool @@ -68,13 +71,64 @@ GLOBAL_OPTIONS, GLOBAL_ARGS = None, None -def writeConfigFiles(pipeline_path, general_path): +def writeConfigFiles(paths): + #pipeline_path, pipeline_path_2, general_path): '''create default configuration files in `path`. ''' - - paths = [pipeline_path, general_path] - config_files = ['pipeline.ini', 'conf.py'] - + # TO DO: I've modified this function with workarounds to make it more + # flexible in order to find an ini file, find a configuration dir and + # copy pre-run sphinx-quickstart files if they exist. + # Other than creating a 'report' dir, it should not change the way it is + # run from CGATPipelines. + # See also bottom of script for changes when calling the 'config' option + # Antonio + #paths = [pipeline_path, pipeline_path_2, general_path] + report_dir = 'pipeline_report' + try: + os.mkdir(report_dir) # Sphinx config files will be copied here + # CGATReport only needs its conf.py to generate the rest + # though + except FileExistsError: + E.warn("directory `%s` already exists" % report_dir) + raise + + # Look for ini file: + f_count = 0 + INI_list = [] + for path in paths: + if os.path.exists(path) and os.path.isdir(path): + for f in os.listdir(os.path.abspath(path)): + if fnmatch.fnmatch(f, 'pipeline*ini'): + f_count += 1 + INI_file = f + INI_list.extend([INI_file]) + + if f_count == 1: + config_files = [INI_file] # This is for the pipeline only + + elif f_count > 1: + # Prioritise the file that contains the command called if more than one + # ini file are found: + for f in INI_list: + if caller_name in f: + INI_file = f + config_files = [INI_file] + else: + if f_count == 0: + print(''' + No configuration (ini) files found in: + {} + '''.format(paths) + ) + else: + print(''' + Found several ini files but could not prioritise based on: + {} + Exiting. + '''.format(caller_name)) + sys.exit() + + # Copy pipeline ini file: for dest in config_files: if os.path.exists(dest): E.warn("file `%s` already exists - skipped" % dest) @@ -87,10 +141,113 @@ def writeConfigFiles(pipeline_path, general_path): E.info("created new configuration file `%s` " % dest) break else: - raise ValueError( - "default config file for `%s` not found in %s" % - (config_files, paths)) + raise ValueError('''default config file for `%s` + not found in + %s + A pipeline cannot be run without this. 
+ ''' % (config_files, paths)) + + # Copy Sphinx configuration files, enforce copy of 'conf.py' in case + # CGATReport is used: + dest = 'conf.py' + if os.path.exists(dest): + E.warn("file `%s` already exists - skipped" % dest) + + for path in paths: + src = os.path.join(path, dest) + if os.path.exists(src): + # Put sphinx files in separate dir: + shutil.copyfile(src, os.path.join(report_dir, dest)) + # Create a softlink outside of report_dir dir for CGATReport: + os.symlink(os.path.join(report_dir, dest), str(dest)) + E.info("created new configuration file `%s` " % dest) + break + else: + # Only warn as pipeline can be run without report: + E.warn('''default config file for `%s` not found in + %s + CGATReport nor Sphinx can be run without this''' % (dest, paths)) + + # If other Sphinx config files are found, copy them if there is a skeleton + # pipeline report to use: + E.info('Looking for additional Sphinx configuration files.') + sphinx_config_files = ['Makefile', + 'make.bat', + '*.rst', + '*.bib', + ] # These are for a sphinx setup, not needed + # with CGATReport + # A 'report_pipeline_*.rst' template is + # searched for below + + # Look for a pipeline report file: + f_count = 0 + for path in paths: + if os.path.exists(path): + for f in os.listdir(os.path.abspath(path)): + # TO DO: + # This pattern matching is particular to + # https://github.com/AntonioJBT/project_quickstart + # Needs to be made more generic + if fnmatch.fnmatch(f, 'report_pipeline_*.rst'): + f_count += 1 + pipeline_report_file = f + + if f_count == 1: + sphinx_config_files.append(pipeline_report_file) + + else: + # Only warn as pipeline can be run without report: + E.warn('''There is no pipeline report file matching + report_pipeline_*.rst + in the directories: + {} + {} + or + {} + Ignore this if you are using CGATReport. + '''.format(pipeline_path, pipeline_path_2, general_path) + ) + + # Copy the files across if they are found: + f_count = 0 + # Check all the paths and their files given above when searching for config files: + for path in paths: + if os.path.exists(path): + for f in os.listdir(path): + # For each file or search term given, match to an existing file: + for dest in sphinx_config_files: + if fnmatch.fnmatch(f, dest): + f_to_copy = f + # If a match is found, walk the cwd to check it's not + # already present: + for root, dirs, files in os.walk('.'): + if f_to_copy in files: + E.warn("file `%s` already exists - skipped" % f_to_copy) + continue + + # If not present, copy the file: + else: + f_count += 1 + src = os.path.join(path, f_to_copy) + if os.path.exists(src): + # Put sphinx files in separate dir: + shutil.copyfile(src, os.path.join(report_dir, + f_to_copy) + ) + E.info("created new configuration file `%s` " + % f_to_copy) + break + if f_count > 0: + pass + else: + E.warn('''No sphinx-quickstart skeleton files such as: + {} + were found + in + {} + Continuing without.'''.format(dest, paths)) def printConfigFiles(): ''' @@ -1016,12 +1173,54 @@ def main(args=sys.argv): printConfigFiles() elif options.pipeline_action == "config": - f = sys._getframe(1) - caller = f.f_globals["__file__"] + # (Antonio) I've modified this section, see explanation and changes in the + # writeConfigFiles function above. 
+ config_paths = [] + try: + f = sys._getframe(1) + caller = inspect.getargvalues(f).locals["__file__"] + # Make it easier to match the name of the command executed so that + # the config file can be searched in case there are more than one + # ini files found in writeConfig(): + # Making it global, check if there's better way: + global caller_name + caller_name = os.path.basename(os.path.normpath(caller)) + except KeyError as e: + # The following code only works if something like this function is + # present in my_pipeline.py script: + # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location + f = sys._getframe(2) + caller = inspect.getargvalues(f).locals["__file__"] + cmd_caller = os.path.basename(os.path.normpath(caller)) + # As above, save the command called in a separate variable: + global caller_name + caller_name = cmd_caller + cmd_caller = importlib.import_module(cmd_caller) + caller = cmd_caller.getDir() + else: + print('''Unable to find path to file being executed. Probably because + CGATPipelines and the pipeline that is being executed + cannot figure out where each other lives. Raise an issue in + GitHub if possible. Exiting.''') + + # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified + # version would only have pipe_XX/ + # so creating an additional pipeline_path + # TO DO: clean this up pipeline_path = os.path.splitext(caller)[0] - general_path = os.path.join(os.path.dirname(pipeline_path), - "configuration") - writeConfigFiles(pipeline_path, general_path) + pipeline_path_2 = os.path.dirname(pipeline_path) + # CGATPipelines have a "configuration" folder + # adding a glob to have a bit more flexibility + general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + + '/**/configuration*'), recursive = True) + + if not general_path: + general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") + + config_paths.extend([pipeline_path, pipeline_path_2]) + # Extend separately in case general_path returns more than one file: + config_paths.extend(general_path) + writeConfigFiles(config_paths) elif options.pipeline_action == "clone": clonePipeline(options.pipeline_targets[0]) diff --git a/CGATPipelines/Pipeline/__init__.py b/CGATPipelines/Pipeline/__init__.py index b396bf43..2c39d7b1 100644 --- a/CGATPipelines/Pipeline/__init__.py +++ b/CGATPipelines/Pipeline/__init__.py @@ -273,7 +273,9 @@ def run_report(clean=True, # warning: memory gets multiplied by threads, so set it not too # high - job_memory = "1G" + job_memory = PARAMS["report_memory"] + #"1G" # This causes problems in outside HPCs + job_threads = PARAMS["report_threads"] # use a fake X display in order to avoid windows popping up diff --git a/CGATPipelines/configuration/pipeline.ini b/CGATPipelines/configuration/pipeline.ini index 6f5f6f61..ec33ed06 100644 --- a/CGATPipelines/configuration/pipeline.ini +++ b/CGATPipelines/configuration/pipeline.ini @@ -89,6 +89,8 @@ priority=-10 # number of threads to use to build the documentation threads=10 +memory=1G + # directory for html documentation html=report/html From 8ae879731bc17340a478c51a4946b37d907310ae Mon Sep 17 00:00:00 2001 From: Antonio Date: Thu, 4 Jan 2018 19:19:14 +0000 Subject: [PATCH 02/21] Update Control.py --- CGATPipelines/Pipeline/Control.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index e785de16..05c9df3b 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -1193,7 
+1193,7 @@ def main(args=sys.argv): caller = inspect.getargvalues(f).locals["__file__"] cmd_caller = os.path.basename(os.path.normpath(caller)) # As above, save the command called in a separate variable: - global caller_name + #global caller_name caller_name = cmd_caller cmd_caller = importlib.import_module(cmd_caller) caller = cmd_caller.getDir() From 267ed99f30d0134f458db88cd9971235f62bdec9 Mon Sep 17 00:00:00 2001 From: Antonio Date: Fri, 5 Jan 2018 12:53:05 +0000 Subject: [PATCH 03/21] Update Control.py --- CGATPipelines/Pipeline/Control.py | 108 ++++++++++++++++-------------- 1 file changed, 56 insertions(+), 52 deletions(-) diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index 05c9df3b..bbaa18ee 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -71,8 +71,60 @@ GLOBAL_OPTIONS, GLOBAL_ARGS = None, None -def writeConfigFiles(paths): - #pipeline_path, pipeline_path_2, general_path): +def getConfigPaths(): + ''' + Search the current and installation paths where the configuration files live. + ''' + # (Antonio) I've modified this section, see explanation and changes in the + # writeConfigFiles function above. + config_paths = [] + try: + f = sys._getframe(1) + caller = inspect.getargvalues(f).locals["__file__"] + # Make it easier to match the name of the command executed so that + # the config file can be searched in case there are more than one + # ini files found in writeConfig(): + caller_name = os.path.basename(os.path.normpath(caller)) + # I think caller_name as separate var is needed for searching as string, can't remember now... + except KeyError as e: + # The following code only works if something like this function is + # present in my_pipeline.py script: + # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location + f = sys._getframe(2) + caller = inspect.getargvalues(f).locals["__file__"] + cmd_caller = os.path.basename(os.path.normpath(caller)) + # As above, save the command called in a separate variable: + caller_name = cmd_caller + cmd_caller = importlib.import_module(cmd_caller) + caller = cmd_caller.getDir() + else: + print('''Unable to find path to file being executed. Probably because + CGATPipelines and the pipeline that is being executed + cannot figure out where each other lives. Raise an issue in + GitHub if possible. Exiting.''') + + # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified + # version would only have pipe_XX/ + # so creating an additional pipeline_path + # TO DO: clean this up + pipeline_path = os.path.splitext(caller)[0] + pipeline_path_2 = os.path.dirname(pipeline_path) + # CGATPipelines have a "configuration" folder + # adding a glob to have a bit more flexibility + general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + + '/**/configuration*'), recursive = True) + if not general_path: + general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") + + # Add paths to search list: + config_paths.extend([pipeline_path, pipeline_path_2]) + # Extend separately in case general_path returns more than one file: + config_paths.extend(general_path) + + return(config_paths, caller_name) + + +def writeConfigFiles(paths, caller_name): '''create default configuration files in `path`. ''' # TO DO: I've modified this function with workarounds to make it more @@ -80,9 +132,8 @@ def writeConfigFiles(paths): # copy pre-run sphinx-quickstart files if they exist. 
# Other than creating a 'report' dir, it should not change the way it is # run from CGATPipelines. - # See also bottom of script for changes when calling the 'config' option + # See also getConfigPaths() above, these run when calling the 'config' option # Antonio - #paths = [pipeline_path, pipeline_path_2, general_path] report_dir = 'pipeline_report' try: os.mkdir(report_dir) # Sphinx config files will be copied here @@ -1173,54 +1224,7 @@ def main(args=sys.argv): printConfigFiles() elif options.pipeline_action == "config": - # (Antonio) I've modified this section, see explanation and changes in the - # writeConfigFiles function above. - config_paths = [] - try: - f = sys._getframe(1) - caller = inspect.getargvalues(f).locals["__file__"] - # Make it easier to match the name of the command executed so that - # the config file can be searched in case there are more than one - # ini files found in writeConfig(): - # Making it global, check if there's better way: - global caller_name - caller_name = os.path.basename(os.path.normpath(caller)) - except KeyError as e: - # The following code only works if something like this function is - # present in my_pipeline.py script: - # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location - f = sys._getframe(2) - caller = inspect.getargvalues(f).locals["__file__"] - cmd_caller = os.path.basename(os.path.normpath(caller)) - # As above, save the command called in a separate variable: - #global caller_name - caller_name = cmd_caller - cmd_caller = importlib.import_module(cmd_caller) - caller = cmd_caller.getDir() - else: - print('''Unable to find path to file being executed. Probably because - CGATPipelines and the pipeline that is being executed - cannot figure out where each other lives. Raise an issue in - GitHub if possible. Exiting.''') - - # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified - # version would only have pipe_XX/ - # so creating an additional pipeline_path - # TO DO: clean this up - pipeline_path = os.path.splitext(caller)[0] - pipeline_path_2 = os.path.dirname(pipeline_path) - # CGATPipelines have a "configuration" folder - # adding a glob to have a bit more flexibility - general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + - '/**/configuration*'), recursive = True) - - if not general_path: - general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") - - config_paths.extend([pipeline_path, pipeline_path_2]) - # Extend separately in case general_path returns more than one file: - config_paths.extend(general_path) - writeConfigFiles(config_paths) + writeConfigFiles(getConfigPaths()) elif options.pipeline_action == "clone": clonePipeline(options.pipeline_targets[0]) From 0f0c9e265dea44e7085a2cb21db59087ee675fcc Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Tue, 9 Jan 2018 13:06:25 +0000 Subject: [PATCH 04/21] tests for control.py --- CGATPipelines/Pipeline/Control.py | 108 +- CGATPipelines/Pipeline/Control.py.core_newest | 1235 +++++++++++++++++ 2 files changed, 1287 insertions(+), 56 deletions(-) create mode 100644 CGATPipelines/Pipeline/Control.py.core_newest diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index bbaa18ee..e785de16 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -71,60 +71,8 @@ GLOBAL_OPTIONS, GLOBAL_ARGS = None, None -def getConfigPaths(): - ''' - Search the current and installation paths where the configuration files live. 
- ''' - # (Antonio) I've modified this section, see explanation and changes in the - # writeConfigFiles function above. - config_paths = [] - try: - f = sys._getframe(1) - caller = inspect.getargvalues(f).locals["__file__"] - # Make it easier to match the name of the command executed so that - # the config file can be searched in case there are more than one - # ini files found in writeConfig(): - caller_name = os.path.basename(os.path.normpath(caller)) - # I think caller_name as separate var is needed for searching as string, can't remember now... - except KeyError as e: - # The following code only works if something like this function is - # present in my_pipeline.py script: - # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location - f = sys._getframe(2) - caller = inspect.getargvalues(f).locals["__file__"] - cmd_caller = os.path.basename(os.path.normpath(caller)) - # As above, save the command called in a separate variable: - caller_name = cmd_caller - cmd_caller = importlib.import_module(cmd_caller) - caller = cmd_caller.getDir() - else: - print('''Unable to find path to file being executed. Probably because - CGATPipelines and the pipeline that is being executed - cannot figure out where each other lives. Raise an issue in - GitHub if possible. Exiting.''') - - # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified - # version would only have pipe_XX/ - # so creating an additional pipeline_path - # TO DO: clean this up - pipeline_path = os.path.splitext(caller)[0] - pipeline_path_2 = os.path.dirname(pipeline_path) - # CGATPipelines have a "configuration" folder - # adding a glob to have a bit more flexibility - general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + - '/**/configuration*'), recursive = True) - if not general_path: - general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") - - # Add paths to search list: - config_paths.extend([pipeline_path, pipeline_path_2]) - # Extend separately in case general_path returns more than one file: - config_paths.extend(general_path) - - return(config_paths, caller_name) - - -def writeConfigFiles(paths, caller_name): +def writeConfigFiles(paths): + #pipeline_path, pipeline_path_2, general_path): '''create default configuration files in `path`. ''' # TO DO: I've modified this function with workarounds to make it more @@ -132,8 +80,9 @@ def writeConfigFiles(paths, caller_name): # copy pre-run sphinx-quickstart files if they exist. # Other than creating a 'report' dir, it should not change the way it is # run from CGATPipelines. - # See also getConfigPaths() above, these run when calling the 'config' option + # See also bottom of script for changes when calling the 'config' option # Antonio + #paths = [pipeline_path, pipeline_path_2, general_path] report_dir = 'pipeline_report' try: os.mkdir(report_dir) # Sphinx config files will be copied here @@ -1224,7 +1173,54 @@ def main(args=sys.argv): printConfigFiles() elif options.pipeline_action == "config": - writeConfigFiles(getConfigPaths()) + # (Antonio) I've modified this section, see explanation and changes in the + # writeConfigFiles function above. 
+ config_paths = [] + try: + f = sys._getframe(1) + caller = inspect.getargvalues(f).locals["__file__"] + # Make it easier to match the name of the command executed so that + # the config file can be searched in case there are more than one + # ini files found in writeConfig(): + # Making it global, check if there's better way: + global caller_name + caller_name = os.path.basename(os.path.normpath(caller)) + except KeyError as e: + # The following code only works if something like this function is + # present in my_pipeline.py script: + # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location + f = sys._getframe(2) + caller = inspect.getargvalues(f).locals["__file__"] + cmd_caller = os.path.basename(os.path.normpath(caller)) + # As above, save the command called in a separate variable: + global caller_name + caller_name = cmd_caller + cmd_caller = importlib.import_module(cmd_caller) + caller = cmd_caller.getDir() + else: + print('''Unable to find path to file being executed. Probably because + CGATPipelines and the pipeline that is being executed + cannot figure out where each other lives. Raise an issue in + GitHub if possible. Exiting.''') + + # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified + # version would only have pipe_XX/ + # so creating an additional pipeline_path + # TO DO: clean this up + pipeline_path = os.path.splitext(caller)[0] + pipeline_path_2 = os.path.dirname(pipeline_path) + # CGATPipelines have a "configuration" folder + # adding a glob to have a bit more flexibility + general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + + '/**/configuration*'), recursive = True) + + if not general_path: + general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") + + config_paths.extend([pipeline_path, pipeline_path_2]) + # Extend separately in case general_path returns more than one file: + config_paths.extend(general_path) + writeConfigFiles(config_paths) elif options.pipeline_action == "clone": clonePipeline(options.pipeline_targets[0]) diff --git a/CGATPipelines/Pipeline/Control.py.core_newest b/CGATPipelines/Pipeline/Control.py.core_newest new file mode 100644 index 00000000..898ceb7c --- /dev/null +++ b/CGATPipelines/Pipeline/Control.py.core_newest @@ -0,0 +1,1235 @@ +"""Control.py - Command line control for ruffus pipelines +========================================================= + +The functions :func:`writeConfigFiles`, :func:`clean`, +:func:`clonePipeline` and :func:`peekParameters` provide the +functionality for particular pipeline commands. + +:class:`MultiLineFormatter` improves the formatting +of long log messages, while +:class:`LoggingFilterRabbitMQ` intercepts ruffus log +messages and sends event information to a rabbitMQ message exchange +for task process monitoring. 
+ +Reference +--------- + +""" + +import inspect +import json +import logging +import os +import re +import shutil +import subprocess +import sys +import tempfile +import time +import io +import glob +import fnmatch +import importlib + +from multiprocessing.pool import ThreadPool + +# talking to RabbitMQ +try: + import pika + HAS_PIKA = True +except ImportError: + HAS_PIKA = False + +# talking to a cluster +try: + import drmaa + HAS_DRMAA = True +except: +# the following does not work on Travis +#except ImportError or RuntimeError: + HAS_DRMAA = False + +from ruffus import pipeline_printout_graph, pipeline_printout, \ + pipeline_run, ruffus_exceptions, task + + +import CGAT.Experiment as E +import CGAT.IOTools as IOTools +from CGAT import Requirements as Requirements + +from CGATPipelines.Pipeline.Utils import isTest, getCaller, getCallerLocals +from CGATPipelines.Pipeline.Execution import execute, startSession,\ + closeSession +from CGATPipelines.Pipeline.Local import getProjectName, getPipelineName +from CGATPipelines.Pipeline.Parameters import inputValidation +# Set from Pipeline.py +PARAMS = {} + +# global options and arguments - set but currently not +# used as relevant sections are entered into the PARAMS +# dictionary. Could be deprecated and removed. +GLOBAL_OPTIONS, GLOBAL_ARGS = None, None + + +def getConfigPaths(): + ''' + Search the current and installation paths where the configuration files live. + ''' + # (Antonio) I've modified this section, see explanation and changes in the + # writeConfigFiles function above. + config_paths = [] + try: + f = sys._getframe(1) + caller = inspect.getargvalues(f).locals["__file__"] + # Make it easier to match the name of the command executed so that + # the config file can be searched in case there are more than one + # ini files found in writeConfig(): + caller_name = os.path.basename(os.path.normpath(caller)) + except KeyError as e: + # The following code only works if something like this function is + # present in my_pipeline.py script: + # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location + f = sys._getframe(2) + caller = inspect.getargvalues(f).locals["__file__"] + cmd_caller = os.path.basename(os.path.normpath(caller)) + # As above, save the command called in a separate variable: + caller_name = cmd_caller + cmd_caller = importlib.import_module(cmd_caller) + caller = cmd_caller.getDir() + else: + print('''Unable to find path to file being executed. Probably because + CGATPipelines and the pipeline that is being executed + cannot figure out where each other lives. Raise an issue in + GitHub if possible. 
Exiting.''') + + # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified + # version would only have pipe_XX/ + # so creating an additional pipeline_path + # TO DO: clean this up + pipeline_path = os.path.splitext(caller)[0] + pipeline_path_2 = os.path.dirname(pipeline_path) + # CGATPipelines have a "configuration" folder + # adding a glob to have a bit more flexibility + general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + + '/**/configuration*'), recursive = True) + if not general_path: + general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") + + # Add paths to search list: + config_paths.extend([pipeline_path, pipeline_path_2]) + # Extend separately in case general_path returns more than one file: + config_paths.extend(general_path) + + return(config_paths, caller_name) + + +def writeConfigFiles(paths, caller_name): + '''create default configuration files in `path`. + ''' + # TO DO: I've modified this function with workarounds to make it more + # flexible in order to find an ini file, find a configuration dir and + # copy pre-run sphinx-quickstart files if they exist. + # Other than creating a 'report' dir, it should not change the way it is + # run from CGATPipelines. + # See also getConfigPaths() above, these run when calling the 'config' option + # Antonio + report_dir = 'pipeline_report' + try: + os.mkdir(report_dir) # Sphinx config files will be copied here + # CGATReport only needs its conf.py to generate the rest + # though + except FileExistsError: + E.warn("directory `%s` already exists" % report_dir) + raise + + # Look for ini file: + f_count = 0 + INI_list = [] + for path in paths: + if os.path.exists(path) and os.path.isdir(path): + for f in os.listdir(os.path.abspath(path)): + if fnmatch.fnmatch(f, 'pipeline*ini'): + f_count += 1 + INI_file = f + INI_list.extend([INI_file]) + + if f_count == 1: + config_files = [INI_file] # This is for the pipeline only + + elif f_count > 1: + # Prioritise the file that contains the command called if more than one + # ini file are found: + for f in INI_list: + if caller_name in f: + INI_file = f + config_files = [INI_file] + else: + if f_count == 0: + print(''' + No configuration (ini) files found in: + {} + '''.format(paths) + ) + else: + print(''' + Found several ini files but could not prioritise based on: + {} + Exiting. + '''.format(caller_name)) + sys.exit() + + # Copy pipeline ini file: + for dest in config_files: + if os.path.exists(dest): + E.warn("file `%s` already exists - skipped" % dest) + continue + + for path in paths: + src = os.path.join(path, dest) + if os.path.exists(src): + shutil.copyfile(src, dest) + E.info("created new configuration file `%s` " % dest) + break + else: + raise ValueError('''default config file for `%s` + not found in + %s + A pipeline cannot be run without this. 
+ ''' % (config_files, paths)) + + # Copy Sphinx configuration files, enforce copy of 'conf.py' in case + # CGATReport is used: + dest = 'conf.py' + if os.path.exists(dest): + E.warn("file `%s` already exists - skipped" % dest) + + for path in paths: + src = os.path.join(path, dest) + if os.path.exists(src): + # Put sphinx files in separate dir: + shutil.copyfile(src, os.path.join(report_dir, dest)) + # Create a softlink outside of report_dir dir for CGATReport: + os.symlink(os.path.join(report_dir, dest), str(dest)) + E.info("created new configuration file `%s` " % dest) + break + + else: + # Only warn as pipeline can be run without report: + E.warn('''default config file for `%s` not found in + %s + CGATReport nor Sphinx can be run without this''' % (dest, paths)) + + # If other Sphinx config files are found, copy them if there is a skeleton + # pipeline report to use: + E.info('Looking for additional Sphinx configuration files.') + sphinx_config_files = ['Makefile', + 'make.bat', + '*.rst', + '*.bib', + ] # These are for a sphinx setup, not needed + # with CGATReport + # A 'report_pipeline_*.rst' template is + # searched for below + + # Look for a pipeline report file: + f_count = 0 + for path in paths: + if os.path.exists(path): + for f in os.listdir(os.path.abspath(path)): + # TO DO: + # This pattern matching is particular to + # https://github.com/AntonioJBT/project_quickstart + # Needs to be made more generic + if fnmatch.fnmatch(f, 'report_pipeline_*.rst'): + f_count += 1 + pipeline_report_file = f + + if f_count == 1: + sphinx_config_files.append(pipeline_report_file) + + else: + # Only warn as pipeline can be run without report: + E.warn('''There is no pipeline report file matching + report_pipeline_*.rst + in the directories: + {} + {} + or + {} + Ignore this if you are using CGATReport. + '''.format(pipeline_path, pipeline_path_2, general_path) + ) + + # Copy the files across if they are found: + f_count = 0 + # Check all the paths and their files given above when searching for config files: + for path in paths: + if os.path.exists(path): + for f in os.listdir(path): + # For each file or search term given, match to an existing file: + for dest in sphinx_config_files: + if fnmatch.fnmatch(f, dest): + f_to_copy = f + # If a match is found, walk the cwd to check it's not + # already present: + for root, dirs, files in os.walk('.'): + if f_to_copy in files: + E.warn("file `%s` already exists - skipped" % f_to_copy) + continue + + # If not present, copy the file: + else: + f_count += 1 + src = os.path.join(path, f_to_copy) + if os.path.exists(src): + # Put sphinx files in separate dir: + shutil.copyfile(src, os.path.join(report_dir, + f_to_copy) + ) + E.info("created new configuration file `%s` " + % f_to_copy) + break + if f_count > 0: + pass + else: + E.warn('''No sphinx-quickstart skeleton files such as: + {} + were found + in + {} + Continuing without.'''.format(dest, paths)) + +def printConfigFiles(): + ''' + Print the list of .ini files used to configure the pipeline + along with their associated priorities. + Priority 1 is the highest. + ''' + + filenames = PARAMS['pipeline_ini'] + print("\n List of .ini files used to configure the pipeline") + s = len(filenames) + if s == 0: + print(" No ini files passed!") + elif s >= 1: + print(" %-11s: %s " % ("Priority", "File")) + for f in filenames: + if s == 1: + print(" (highest) %s: %s\n" % (s, f)) + else: + print(" %-11s: %s " % (s, f)) + s -= 1 + + +def clonePipeline(srcdir, destdir=None): + '''clone a pipeline. 
+ + Cloning entails creating a mirror of the source pipeline. + Generally, data files are mirrored by linking. Configuration + files and the pipeline database will be copied. + + Without modification of any files, building the cloned pipeline in + `destdir` should not re-run any commands. However, on deleting + selected files, the pipeline should run from the appropriate + point. Newly created files will not affect the original pipeline. + + Cloning pipelines permits sharing partial results between + pipelines, for example for parameter optimization. + + Arguments + --------- + scrdir : string + Source directory + destdir : string + Destination directory. If None, use the current directory. + + ''' + + if destdir is None: + destdir = os.path.curdir + + E.info("cloning pipeline from %s to %s" % (srcdir, destdir)) + + copy_files = ("conf.py", "pipeline.ini", "csvdb") + ignore_prefix = ( + "report", "_cache", "export", "tmp", "ctmp", + "_static", "_templates") + + def _ignore(p): + for x in ignore_prefix: + if p.startswith(x): + return True + return False + + for root, dirs, files in os.walk(srcdir): + + relpath = os.path.relpath(root, srcdir) + if _ignore(relpath): + continue + + for d in dirs: + if _ignore(d): + continue + dest = os.path.join(os.path.join(destdir, relpath, d)) + os.mkdir(dest) + # touch + s = os.stat(os.path.join(root, d)) + os.utime(dest, (s.st_atime, s.st_mtime)) + + for f in files: + if _ignore(f): + continue + + fn = os.path.join(root, f) + dest_fn = os.path.join(destdir, relpath, f) + if f in copy_files: + shutil.copyfile(fn, dest_fn) + else: + # realpath resolves links - thus links will be linked to + # the original target + os.symlink(os.path.realpath(fn), + dest_fn) + + +def clean(files, logfile): + '''clean up files given by glob expressions. + + Files are cleaned up by zapping, i.e. the files are set to size + 0. Links to files are replaced with place-holders. + + Information about the original file is written to `logfile`. + + Arguments + --------- + files : list + List of glob expressions of files to clean up. + logfile : string + Filename of logfile. + + ''' + fields = ('st_atime', 'st_blksize', 'st_blocks', + 'st_ctime', 'st_dev', 'st_gid', 'st_ino', + 'st_mode', 'st_mtime', 'st_nlink', + 'st_rdev', 'st_size', 'st_uid') + + dry_run = PARAMS.get("dryrun", False) + + if not dry_run: + if not os.path.exists(logfile): + outfile = IOTools.openFile(logfile, "w") + outfile.write("filename\tzapped\tlinkdest\t%s\n" % + "\t".join(fields)) + else: + outfile = IOTools.openFile(logfile, "a") + + c = E.Counter() + for fn in files: + c.files += 1 + if not dry_run: + stat, linkdest = IOTools.zapFile(fn) + if stat is not None: + c.zapped += 1 + if linkdest is not None: + c.links += 1 + outfile.write("%s\t%s\t%s\t%s\n" % ( + fn, + time.asctime(time.localtime(time.time())), + linkdest, + "\t".join([str(getattr(stat, x)) for x in fields]))) + + E.info("zapped: %s" % (c)) + outfile.close() + + return c + + +def peekParameters(workingdir, + pipeline, + on_error_raise=None, + prefix=None, + update_interface=False, + restrict_interface=False): + '''peek configuration parameters from external pipeline. + + As the paramater dictionary is built at runtime, this method + executes the pipeline in workingdir, dumping its configuration + values and reading them into a dictionary. + + If either `pipeline` or `workingdir` are not found, an error is + raised. This behaviour can be changed by setting `on_error_raise` + to False. In that case, an empty dictionary is returned. 
+ + Arguments + --------- + workingdir : string + Working directory. This is the directory that the pipeline + was executed in. + pipeline : string + Name of the pipeline script. The pipeline is assumed to live + in the same directory as the current pipeline. + on_error_raise : Bool + If set to a boolean, an error will be raised (or not) if there + is an error during parameter peeking, for example if + `workingdir` can not be found. If `on_error_raise` is None, it + will be set to the default, which is to raise an exception + unless the calling script is imported or the option + ``--is-test`` has been passed at the command line. + prefix : string + Add a prefix to all parameters. This is useful if the paramaters + are added to the configuration dictionary of the calling pipeline. + update_interface : bool + If True, this method will prefix any options in the + ``[interface]`` section with `workingdir`. This allows + transparent access to files in the external pipeline. + restrict_interface : bool + If True, only interface parameters will be imported. + + Returns + ------- + config : dict + Dictionary of configuration values. + + ''' + caller_locals = getCallerLocals() + + # check if we should raise errors + if on_error_raise is None: + on_error_raise = not isTest() and \ + "__name__" in caller_locals and \ + caller_locals["__name__"] == "__main__" + + # patch - if --help or -h in command line arguments, + # do not peek as there might be no config file. + if "--help" in sys.argv or "-h" in sys.argv: + return {} + + # Attempt to locate directory with pipeline source code. This is a + # patch as pipelines might be called within the repository + # directory or from an installed location + dirname = PARAMS["pipelinedir"] + + # called without a directory, use current directory + if dirname == "": + dirname = os.path.abspath(".") + else: + # if not exists, assume we want version located + # in directory of calling script. + if not os.path.exists(dirname): + # directory is path of calling script + dirname = os.path.dirname(caller_locals['__file__']) + + pipeline = os.path.join(dirname, pipeline) + if not os.path.exists(pipeline): + if on_error_raise: + raise ValueError( + "can't find pipeline at %s" % (pipeline)) + else: + return {} + + if workingdir == "": + workingdir = os.path.abspath(".") + + # patch for the "config" target - use default + # pipeline directory if directory is not specified + # working dir is set to "?!" + if "config" in sys.argv or "check" in sys.argv or "clone" in sys.argv and workingdir == "?!": + workingdir = os.path.join(PARAMS.get("pipelinedir"), + IOTools.snip(pipeline, ".py")) + + if not os.path.exists(workingdir): + if on_error_raise: + raise ValueError( + "can't find working dir %s" % workingdir) + else: + return {} + + statement = "python %s -f -v 0 dump" % pipeline + process = subprocess.Popen(statement, + cwd=workingdir, + shell=True, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + # process.stdin.close() + stdout, stderr = process.communicate() + if process.returncode != 0: + raise OSError( + ("Child was terminated by signal %i: \n" + "Statement: %s\n" + "The stderr was: \n%s\n" + "Stdout: %s") % + (-process.returncode, statement, stderr, stdout)) + + # subprocess only accepts encoding argument in py >= 3.6 so + # decode here. 
+ stdout = stdout.decode("utf-8").splitlines() + # remove any log messages + stdout = [x for x in stdout if x.startswith("{")] + if len(stdout) > 1: + raise ValueError("received multiple configurations") + dump = json.loads(stdout[0]) + + # update interface + if update_interface: + for key, value in list(dump.items()): + if key.startswith("interface"): + dump[key] = os.path.join(workingdir, value) + + # keep only interface if so required + if restrict_interface: + dump = dict([(k, v) for k, v in dump.items() + if k.startswith("interface")]) + + # prefix all parameters + if prefix is not None: + dump = dict([("%s%s" % (prefix, x), y) for x, y in list(dump.items())]) + + return dump + + +class MultiLineFormatter(logging.Formatter): + """add identation for multi-line entries. + """ + + def format(self, record): + s = logging.Formatter.format(self, record) + if record.message: + header, footer = s.split(record.message) + s = s.replace('\n', '\n' + ' ' * len(header)) + return s + + +class LoggingFilterRabbitMQ(logging.Filter): + """pass event information to a rabbitMQ message queue. + + This is a log filter which detects messages from ruffus_ and sends + them to a rabbitMQ message queue. + + A :term:`task` is a ruffus_ decorated function, which will execute + one or more :term:`jobs`. + + Valid task/job status: + + update + task/job needs updating + completed + task/job completed successfully + failed + task/job failed + running + task/job is running + ignore + ignore task/job (is up-to-date) + + Arguments + --------- + ruffus_text : string + Log messages from ruffus.pipeline_printout. These are used + to collect all tasks that will be executed during pipeline + executation. + project_name : string + Name of the project + pipeline_name : string + Name of the pipeline + host : string + RabbitMQ host name + exchange : string + RabbitMQ exchange name + + """ + + def __init__(self, ruffus_text, + project_name, + pipeline_name, + host="localhost", + exchange="ruffus_pipelines"): + + self.project_name = project_name + self.pipeline_name = pipeline_name + self.exchange = exchange + + # dictionary of jobs to run + self.jobs = {} + self.tasks = {} + + if not HAS_PIKA: + self.connected = False + return + + def split_by_job(text): + text = "".join(text) + job_message = "" + # ignore first entry which is the docstring + for line in text.split(" Job = ")[1:]: + try: + # long file names cause additional wrapping and + # additional white-space characters + job_name = re.search( + "\[.*-> ([^\]]+)\]", line).groups() + except AttributeError: + raise AttributeError("could not parse '%s'" % line) + job_status = "ignore" + if "Job needs update" in line: + job_status = "update" + + yield job_name, job_status, job_message + + def split_by_task(text): + block, task_name = [], None + task_status = None + for line in text.split("\n"): + line = line.strip() + + if line.startswith("Tasks which will be run"): + task_status = "update" + elif line.startswith("Tasks which are up-to-date"): + task_status = "ignore" + + if line.startswith("Task = "): + if task_name: + yield task_name, task_status, list(split_by_job(block)) + block = [] + task_name = re.match("Task = (.*)", line).groups()[0] + continue + if line: + block.append(line) + if task_name: + yield task_name, task_status, list(split_by_job(block)) + + # create connection + try: + connection = pika.BlockingConnection(pika.ConnectionParameters( + host=host)) + self.connected = True + except pika.exceptions.AMQPConnectionError: + self.connected = False + return + + 
self.channel = connection.channel() + self.channel.exchange_declare( + exchange=self.exchange, + type='topic') + + # populate with initial messages + for task_name, task_status, jobs in split_by_task(ruffus_text): + if task_name.startswith("(mkdir"): + continue + + to_run = 0 + for job_name, job_status, job_message in jobs: + self.jobs[job_name] = (task_name, job_name) + if job_status == "update": + to_run += 1 + + self.tasks[task_name] = [task_status, len(jobs), + len(jobs) - to_run] + self.send_task(task_name) + + def send_task(self, task_name): + '''send task status.''' + + if not self.connected: + return + + task_status, task_total, task_completed = self.tasks[task_name] + + data = {} + data['created_at'] = time.time() + data['pipeline'] = self.pipeline_name + data['task_name'] = task_name + data['task_status'] = task_status + data['task_total'] = task_total + data['task_completed'] = task_completed + + key = "%s.%s.%s" % (self.project_name, self.pipeline_name, task_name) + try: + self.channel.basic_publish(exchange=self.exchange, + routing_key=key, + body=json.dumps(data)) + except pika.exceptions.ConnectionClosed: + E.warn("could not send message - connection closed") + except Exception as e: + E.warn("could not send message: %s" % str(e)) + + def send_error(self, task_name, job, error=None, msg=None): + + if not self.connected: + return + + try: + task_status, task_total, task_completed = self.tasks[task_name] + except KeyError: + E.warn("could not get task information for %s, no message sent" % + task_name) + return + + data = {} + data['created_at'] = time.time() + data['pipeline'] = self.pipeline_name + data['task_name'] = task_name + data['task_status'] = 'failed' + data['task_total'] = task_total + data['task_completed'] = task_completed + + key = "%s.%s.%s" % (self.project_name, self.pipeline_name, task_name) + + try: + self.channel.basic_publish(exchange=self.exchange, + routing_key=key, + body=json.dumps(data)) + except pika.exceptions.ConnectionClosed: + E.warn("could not send message - connection closed") + except Exception as e: + E.warn("could not send message: %s" % str(e)) + + def filter(self, record): + + if not self.connected: + return True + + # filter ruffus logging messages + if record.filename.endswith("task.py"): + try: + before, task_name = record.msg.strip().split(" = ") + except ValueError: + return True + + # ignore the mkdir, etc tasks + if task_name not in self.tasks: + return True + + if before == "Task enters queue": + self.tasks[task_name][0] = "running" + elif before == "Completed Task": + self.tasks[task_name][0] = "completed" + elif before == "Uptodate Task": + self.tasks[task_name][0] = "uptodate" + else: + return True + + # send new task status out + self.send_task(task_name) + + return True + + +USAGE = ''' +usage: %prog [OPTIONS] [CMD] [target] + +Execute pipeline %prog. + +Commands can be any of the following + +make + run all tasks required to build *target* + +show + show tasks required to build *target* without executing them + +plot + plot image (using inkscape) of pipeline state for *target* + +debug [args] + debug a method using the supplied arguments. The method + in the pipeline is run without checking any dependencies. 
+ +config + write new configuration files pipeline.ini, sphinxreport.ini and conf.py + with default values + +dump + write pipeline configuration to stdout + +printconfig + write pipeline configuration to stdout in a user-friendly way so + it is easier to debug pipeline parameters + +touch + touch files only, do not run + +regenerate + regenerate the ruffus checkpoint file + +check + check if requirements (external tool dependencies) are satisfied. + +clone + create a clone of a pipeline in in the current + directory. The cloning process aims to use soft linking to files + (not directories) as much as possible. Time stamps are + preserved. Cloning is useful if a pipeline needs to be re-run from + a certain point but the original pipeline should be preserved. + +''' + + +def main(args=sys.argv): + """command line control function for a pipeline. + + This method defines command line options for the pipeline and + updates the global configuration dictionary correspondingly. + + It then provides a command parser to execute particular tasks + using the ruffus pipeline control functions. See the generated + command line help for usage. + + To use it, add:: + + import CGAT.Pipeline as P + + if __name__ == "__main__": + sys.exit(P.main(sys.argv)) + + to your pipeline script. + + Arguments + --------- + args : list + List of command line arguments. + + """ + + global GLOBAL_OPTIONS + global GLOBAL_ARGS + + parser = E.OptionParser(version="%prog version: $Id$", + usage=USAGE) + + parser.add_option("--pipeline-action", dest="pipeline_action", + type="choice", + choices=( + "make", "show", "plot", "dump", "config", "clone", + "check", "regenerate", "printconfig"), + help="action to take [default=%default].") + + parser.add_option("--pipeline-format", dest="pipeline_format", + type="choice", + choices=("dot", "jpg", "svg", "ps", "png"), + help="pipeline format [default=%default].") + + parser.add_option("-n", "--dry-run", dest="dry_run", + action="store_true", + help="perform a dry run (do not execute any shell " + "commands) [default=%default].") + + parser.add_option("-f", "--force-output", dest="force", + action="store_true", + help="force running the pipeline even if there " + "are uncommited changes " + "in the repository [default=%default].") + + parser.add_option("-p", "--multiprocess", dest="multiprocess", type="int", + help="number of parallel processes to use on " + "submit host " + "(different from number of jobs to use for " + "cluster jobs) " + "[default=%default].") + + parser.add_option("-e", "--exceptions", dest="log_exceptions", + action="store_true", + help="echo exceptions immediately as they occur " + "[default=%default].") + + parser.add_option("-i", "--terminate", dest="terminate", + action="store_true", + help="terminate immediately at the first exception " + "[default=%default].") + + parser.add_option("-d", "--debug", dest="debug", + action="store_true", + help="output debugging information on console, " + "and not the logfile " + "[default=%default].") + + parser.add_option("-s", "--set", dest="variables_to_set", + type="string", action="append", + help="explicitly set paramater values " + "[default=%default].") + + parser.add_option("-c", "--checksums", dest="ruffus_checksums_level", + type="int", + help="set the level of ruffus checksums" + "[default=%default].") + + parser.add_option("-t", "--is-test", dest="is_test", + action="store_true", + help="this is a test run" + "[default=%default].") + + parser.add_option("--rabbitmq-exchange", dest="rabbitmq_exchange", + 
type="string", + help="RabbitMQ exchange to send log messages to " + "[default=%default].") + + parser.add_option("--rabbitmq-host", dest="rabbitmq_host", + type="string", + help="RabbitMQ host to send log messages to " + "[default=%default].") + + parser.add_option("--input-validation", dest="input_validation", + action="store_true", + help="perform input validation before starting " + "[default=%default].") + + parser.set_defaults( + pipeline_action=None, + pipeline_format="svg", + pipeline_targets=[], + multiprocess=40, + logfile="pipeline.log", + dry_run=False, + force=False, + log_exceptions=False, + exceptions_terminate_immediately=False, + debug=False, + variables_to_set=[], + is_test=False, + ruffus_checksums_level=0, + rabbitmq_host="saruman", + rabbitmq_exchange="ruffus_pipelines", + input_validation=False) + + (options, args) = E.Start(parser, + add_cluster_options=True) + + GLOBAL_OPTIONS, GLOBAL_ARGS = options, args + E.info("Started in: %s" % PARAMS.get("workingdir")) + # At this point, the PARAMS dictionary has already been + # built. It now needs to be updated with selected command + # line options as these should always take precedence over + # configuration files. + + PARAMS["dryrun"] = options.dry_run + PARAMS["input_validation"] = options.input_validation + + # use cli_cluster_* keys in PARAMS to ensure highest priority + # of cluster_* options passed with the command-line + if options.cluster_memory_default is not None: + PARAMS["cli_cluster_memory_default"] = options.cluster_memory_default + PARAMS["cluster_memory_default"] = options.cluster_memory_default + if options.cluster_memory_resource is not None: + PARAMS["cli_cluster_memory_resource"] = options.cluster_memory_resource + PARAMS["cluster_memory_resource"] = options.cluster_memory_resource + if options.cluster_num_jobs is not None: + PARAMS["cli_cluster_num_jobs"] = options.cluster_num_jobs + PARAMS["cluster_num_jobs"] = options.cluster_num_jobs + if options.cluster_options is not None: + PARAMS["cli_cluster_options"] = options.cluster_options + PARAMS["cluster_options"] = options.cluster_options + if options.cluster_parallel_environment is not None: + PARAMS["cli_cluster_parallel_environment"] = options.cluster_parallel_environment + PARAMS["cluster_parallel_environment"] = options.cluster_parallel_environment + if options.cluster_priority is not None: + PARAMS["cli_cluster_priority"] = options.cluster_priority + PARAMS["cluster_priority"] = options.cluster_priority + if options.cluster_queue is not None: + PARAMS["cli_cluster_queue"] = options.cluster_queue + PARAMS["cluster_queue"] = options.cluster_queue + if options.cluster_queue_manager is not None: + PARAMS["cli_cluster_queue_manager"] = options.cluster_queue_manager + PARAMS["cluster_queue_manager"] = options.cluster_queue_manager + + PARAMS["ruffus_checksums_level"] = options.ruffus_checksums_level + + for variables in options.variables_to_set: + variable, value = variables.split("=") + PARAMS[variable.strip()] = IOTools.str2val(value.strip()) + + if args: + options.pipeline_action = args[0] + if len(args) > 1: + options.pipeline_targets.extend(args[1:]) + + # see inputValidation function in Parameters.py + if options.input_validation: + inputValidation(PARAMS, sys.argv[0]) + + if options.pipeline_action == "check": + counter, requirements = Requirements.checkRequirementsFromAllModules() + for requirement in requirements: + E.info("\t".join(map(str, requirement))) + E.info("version check summary: %s" % str(counter)) + E.Stop() + return + + elif 
options.pipeline_action == "debug": + # create the session proxy + startSession() + + method_name = options.pipeline_targets[0] + caller = getCaller() + method = getattr(caller, method_name) + method(*options.pipeline_targets[1:]) + + elif options.pipeline_action in ("make", "show", "svg", "plot", + "touch", "regenerate"): + + # set up extra file logger + handler = logging.FileHandler(filename=options.logfile, + mode="a") + handler.setFormatter( + MultiLineFormatter( + '%(asctime)s %(levelname)s %(module)s.%(funcName)s.%(lineno)d %(message)s')) + logger = logging.getLogger() + logger.addHandler(handler) + messenger = None + + try: + if options.pipeline_action == "make": + + # get tasks to be done. This essentially replicates + # the state information within ruffus. + stream = io.StringIO() + pipeline_printout( + stream, + options.pipeline_targets, + verbose=5, + checksum_level=options.ruffus_checksums_level) + + messenger = LoggingFilterRabbitMQ( + stream.getvalue(), + project_name=getProjectName(), + pipeline_name=getPipelineName(), + host=options.rabbitmq_host, + exchange=options.rabbitmq_exchange) + + logger.addFilter(messenger) + + if not options.without_cluster and HAS_DRMAA: + global task + # use threading instead of multiprocessing in order to + # limit the number of concurrent jobs by using the + # GIL + # + # Note that threading might cause problems with rpy. + task.Pool = ThreadPool + + # create the session proxy + startSession() + + # + # make sure we are not logging at the same time in + # different processes + # + # session_mutex = manager.Lock() + E.info(E.GetHeader()) + E.info("code location: %s" % PARAMS["pipeline_scriptsdir"]) + E.info("Working directory is: %s" % PARAMS["workingdir"]) + + pipeline_run( + options.pipeline_targets, + multiprocess=options.multiprocess, + logger=logger, + verbose=options.loglevel, + log_exceptions=options.log_exceptions, + exceptions_terminate_immediately=options.exceptions_terminate_immediately, + checksum_level=options.ruffus_checksums_level, + ) + + E.info(E.GetFooter()) + + closeSession() + + elif options.pipeline_action == "show": + pipeline_printout( + options.stdout, + options.pipeline_targets, + verbose=options.loglevel, + checksum_level=options.ruffus_checksums_level) + + elif options.pipeline_action == "touch": + pipeline_run( + options.pipeline_targets, + touch_files_only=True, + verbose=options.loglevel, + checksum_level=options.ruffus_checksums_level) + + elif options.pipeline_action == "regenerate": + pipeline_run( + options.pipeline_targets, + touch_files_only=options.ruffus_checksums_level, + verbose=options.loglevel) + + elif options.pipeline_action == "svg": + pipeline_printout_graph( + options.stdout.buffer, + options.pipeline_format, + options.pipeline_targets, + checksum_level=options.ruffus_checksums_level) + + elif options.pipeline_action == "plot": + outf, filename = tempfile.mkstemp() + pipeline_printout_graph( + os.fdopen(outf, "wb"), + options.pipeline_format, + options.pipeline_targets, + checksum_level=options.ruffus_checksums_level) + execute("inkscape %s" % filename) + os.unlink(filename) + + except ruffus_exceptions.RethrownJobError as value: + + if not options.debug: + E.error("%i tasks with errors, please see summary below:" % + len(value.args)) + for idx, e in enumerate(value.args): + task, job, error, msg, traceback = e + + if task is None: + # this seems to be errors originating within ruffus + # such as a missing dependency + # msg then contains a RethrownJobJerror + msg = str(msg) + pass + else: + task = 
re.sub("__main__.", "", task) + job = re.sub("\s", "", job) + + if messenger: + messenger.send_error(task, job, error, msg) + + # display only single line messages + if len([x for x in msg.split("\n") if x != ""]) > 1: + msg = "" + + E.error("%i: Task=%s Error=%s %s: %s" % + (idx, task, error, job, msg)) + + E.error("full traceback is in %s" % options.logfile) + + # write full traceback to log file only by removing the stdout + # handler + lhStdout = logger.handlers[0] + logger.removeHandler(lhStdout) + logger.error("start of error messages") + logger.error(value) + logger.error("end of error messages") + logger.addHandler(lhStdout) + + # raise error + raise ValueError( + "pipeline failed with %i errors" % len(value.args)) + else: + raise + + elif options.pipeline_action == "dump": + print(json.dumps(PARAMS)) + + elif options.pipeline_action == "printconfig": + print("Printing out pipeline parameters: ") + for k in sorted(PARAMS): + print(k, "=", PARAMS[k]) + printConfigFiles() + + elif options.pipeline_action == "config": + writeConfigFiles(getConfigPaths()) + + elif options.pipeline_action == "clone": + clonePipeline(options.pipeline_targets[0]) + + else: + raise ValueError("unknown pipeline action %s" % + options.pipeline_action) + + E.Stop() From 986fee17d1e7cead3177de5995d9a3aa97f97f23 Mon Sep 17 00:00:00 2001 From: Antonio Date: Tue, 9 Jan 2018 14:28:08 +0000 Subject: [PATCH 05/21] Update pipeline.ini --- CGATPipelines/configuration/pipeline.ini | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CGATPipelines/configuration/pipeline.ini b/CGATPipelines/configuration/pipeline.ini index ec33ed06..07af17bf 100644 --- a/CGATPipelines/configuration/pipeline.ini +++ b/CGATPipelines/configuration/pipeline.ini @@ -6,10 +6,10 @@ ######################################################## ######################################################## # The project name to appear in the report -projectname=CGATProject +projectname= # The copyright statement to appear in the report -copyright=CGAT (2010-2014) +copyright= # The short X.Y version to appear in the report version=0.1 @@ -37,7 +37,7 @@ scratchdir=/tmp web_dir=../web # location of indexed genome -genome_dir=/ifs/mirror/genomes/plain +genome_dir=/full/path/here # The genome to use (UCSC convention) genome=hg19 @@ -75,7 +75,8 @@ port=3306 [cluster] # queue to use -queue=all.q +queue= +#all.q # priority of jobs on cluster priority=-10 From 96b77566f391c6d026cbb26e800a121ee9281bf3 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Tue, 9 Jan 2018 14:35:34 +0000 Subject: [PATCH 06/21] updated cluster.py for pbspro, already in other branch though --- CGATPipelines/Pipeline/Cluster.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/CGATPipelines/Pipeline/Cluster.py b/CGATPipelines/Pipeline/Cluster.py index fc3d0d02..42bc53d4 100644 --- a/CGATPipelines/Pipeline/Cluster.py +++ b/CGATPipelines/Pipeline/Cluster.py @@ -199,13 +199,14 @@ def setupDrmaaJobTemplate(drmaa_session, options, job_name, job_memory): spec = ["-N %s" % job_name[0:15], "-l select=1:ncpus=%s:mem=%s" % (job_threads, job_memory)] - if options["cluster_options"]: - if "mem" not in options["cluster_options"]: - spec.append("%(cluster_options)s") - - elif "mem" in options["cluster_options"]: - raise ValueError('''mem resource specified twice, check ~/.cgat config file, - ini files, command line options, etc.''') + if options["cluster_options"]: + if "mem" not in options["cluster_options"]: + spec.append("%(cluster_options)s") + + elif 
"mem" in options["cluster_options"]: + raise ValueError('''mem resource specified twice, check ~/.cgat config file, + ini files, command line options, etc. + ''') if "cluster_pe_queue" in options and multithread: spec.append( From 1ef89cad5ab4c0a24727fdeb7171efe18a7dea8d Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Tue, 9 Jan 2018 14:38:21 +0000 Subject: [PATCH 07/21] updates/testing --- CGATPipelines/Pipeline/Control.py | 107 +++++++++--------- ...ol.py.core_newest => Control.py.works_mac} | 107 +++++++++--------- 2 files changed, 107 insertions(+), 107 deletions(-) rename CGATPipelines/Pipeline/{Control.py.core_newest => Control.py.works_mac} (94%) diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index e785de16..898ceb7c 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -71,8 +71,59 @@ GLOBAL_OPTIONS, GLOBAL_ARGS = None, None -def writeConfigFiles(paths): - #pipeline_path, pipeline_path_2, general_path): +def getConfigPaths(): + ''' + Search the current and installation paths where the configuration files live. + ''' + # (Antonio) I've modified this section, see explanation and changes in the + # writeConfigFiles function above. + config_paths = [] + try: + f = sys._getframe(1) + caller = inspect.getargvalues(f).locals["__file__"] + # Make it easier to match the name of the command executed so that + # the config file can be searched in case there are more than one + # ini files found in writeConfig(): + caller_name = os.path.basename(os.path.normpath(caller)) + except KeyError as e: + # The following code only works if something like this function is + # present in my_pipeline.py script: + # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location + f = sys._getframe(2) + caller = inspect.getargvalues(f).locals["__file__"] + cmd_caller = os.path.basename(os.path.normpath(caller)) + # As above, save the command called in a separate variable: + caller_name = cmd_caller + cmd_caller = importlib.import_module(cmd_caller) + caller = cmd_caller.getDir() + else: + print('''Unable to find path to file being executed. Probably because + CGATPipelines and the pipeline that is being executed + cannot figure out where each other lives. Raise an issue in + GitHub if possible. Exiting.''') + + # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified + # version would only have pipe_XX/ + # so creating an additional pipeline_path + # TO DO: clean this up + pipeline_path = os.path.splitext(caller)[0] + pipeline_path_2 = os.path.dirname(pipeline_path) + # CGATPipelines have a "configuration" folder + # adding a glob to have a bit more flexibility + general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + + '/**/configuration*'), recursive = True) + if not general_path: + general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") + + # Add paths to search list: + config_paths.extend([pipeline_path, pipeline_path_2]) + # Extend separately in case general_path returns more than one file: + config_paths.extend(general_path) + + return(config_paths, caller_name) + + +def writeConfigFiles(paths, caller_name): '''create default configuration files in `path`. ''' # TO DO: I've modified this function with workarounds to make it more @@ -80,9 +131,8 @@ def writeConfigFiles(paths): # copy pre-run sphinx-quickstart files if they exist. # Other than creating a 'report' dir, it should not change the way it is # run from CGATPipelines. 
- # See also bottom of script for changes when calling the 'config' option + # See also getConfigPaths() above, these run when calling the 'config' option # Antonio - #paths = [pipeline_path, pipeline_path_2, general_path] report_dir = 'pipeline_report' try: os.mkdir(report_dir) # Sphinx config files will be copied here @@ -1173,54 +1223,7 @@ def main(args=sys.argv): printConfigFiles() elif options.pipeline_action == "config": - # (Antonio) I've modified this section, see explanation and changes in the - # writeConfigFiles function above. - config_paths = [] - try: - f = sys._getframe(1) - caller = inspect.getargvalues(f).locals["__file__"] - # Make it easier to match the name of the command executed so that - # the config file can be searched in case there are more than one - # ini files found in writeConfig(): - # Making it global, check if there's better way: - global caller_name - caller_name = os.path.basename(os.path.normpath(caller)) - except KeyError as e: - # The following code only works if something like this function is - # present in my_pipeline.py script: - # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location - f = sys._getframe(2) - caller = inspect.getargvalues(f).locals["__file__"] - cmd_caller = os.path.basename(os.path.normpath(caller)) - # As above, save the command called in a separate variable: - global caller_name - caller_name = cmd_caller - cmd_caller = importlib.import_module(cmd_caller) - caller = cmd_caller.getDir() - else: - print('''Unable to find path to file being executed. Probably because - CGATPipelines and the pipeline that is being executed - cannot figure out where each other lives. Raise an issue in - GitHub if possible. Exiting.''') - - # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified - # version would only have pipe_XX/ - # so creating an additional pipeline_path - # TO DO: clean this up - pipeline_path = os.path.splitext(caller)[0] - pipeline_path_2 = os.path.dirname(pipeline_path) - # CGATPipelines have a "configuration" folder - # adding a glob to have a bit more flexibility - general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + - '/**/configuration*'), recursive = True) - - if not general_path: - general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") - - config_paths.extend([pipeline_path, pipeline_path_2]) - # Extend separately in case general_path returns more than one file: - config_paths.extend(general_path) - writeConfigFiles(config_paths) + writeConfigFiles(getConfigPaths()) elif options.pipeline_action == "clone": clonePipeline(options.pipeline_targets[0]) diff --git a/CGATPipelines/Pipeline/Control.py.core_newest b/CGATPipelines/Pipeline/Control.py.works_mac similarity index 94% rename from CGATPipelines/Pipeline/Control.py.core_newest rename to CGATPipelines/Pipeline/Control.py.works_mac index 898ceb7c..e785de16 100644 --- a/CGATPipelines/Pipeline/Control.py.core_newest +++ b/CGATPipelines/Pipeline/Control.py.works_mac @@ -71,59 +71,8 @@ PARAMS = {} GLOBAL_OPTIONS, GLOBAL_ARGS = None, None -def getConfigPaths(): - ''' - Search the current and installation paths where the configuration files live. - ''' - # (Antonio) I've modified this section, see explanation and changes in the - # writeConfigFiles function above. 
- config_paths = [] - try: - f = sys._getframe(1) - caller = inspect.getargvalues(f).locals["__file__"] - # Make it easier to match the name of the command executed so that - # the config file can be searched in case there are more than one - # ini files found in writeConfig(): - caller_name = os.path.basename(os.path.normpath(caller)) - except KeyError as e: - # The following code only works if something like this function is - # present in my_pipeline.py script: - # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location - f = sys._getframe(2) - caller = inspect.getargvalues(f).locals["__file__"] - cmd_caller = os.path.basename(os.path.normpath(caller)) - # As above, save the command called in a separate variable: - caller_name = cmd_caller - cmd_caller = importlib.import_module(cmd_caller) - caller = cmd_caller.getDir() - else: - print('''Unable to find path to file being executed. Probably because - CGATPipelines and the pipeline that is being executed - cannot figure out where each other lives. Raise an issue in - GitHub if possible. Exiting.''') - - # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified - # version would only have pipe_XX/ - # so creating an additional pipeline_path - # TO DO: clean this up - pipeline_path = os.path.splitext(caller)[0] - pipeline_path_2 = os.path.dirname(pipeline_path) - # CGATPipelines have a "configuration" folder - # adding a glob to have a bit more flexibility - general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + - '/**/configuration*'), recursive = True) - if not general_path: - general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") - - # Add paths to search list: - config_paths.extend([pipeline_path, pipeline_path_2]) - # Extend separately in case general_path returns more than one file: - config_paths.extend(general_path) - - return(config_paths, caller_name) - - -def writeConfigFiles(paths, caller_name): +def writeConfigFiles(paths): + #pipeline_path, pipeline_path_2, general_path): '''create default configuration files in `path`. ''' # TO DO: I've modified this function with workarounds to make it more @@ -131,8 +80,9 @@ def writeConfigFiles(paths, caller_name): # copy pre-run sphinx-quickstart files if they exist. # Other than creating a 'report' dir, it should not change the way it is # run from CGATPipelines. - # See also getConfigPaths() above, these run when calling the 'config' option + # See also bottom of script for changes when calling the 'config' option # Antonio + #paths = [pipeline_path, pipeline_path_2, general_path] report_dir = 'pipeline_report' try: os.mkdir(report_dir) # Sphinx config files will be copied here @@ -1223,7 +1173,54 @@ def main(args=sys.argv): printConfigFiles() elif options.pipeline_action == "config": - writeConfigFiles(getConfigPaths()) + # (Antonio) I've modified this section, see explanation and changes in the + # writeConfigFiles function above. 
+ config_paths = [] + try: + f = sys._getframe(1) + caller = inspect.getargvalues(f).locals["__file__"] + # Make it easier to match the name of the command executed so that + # the config file can be searched in case there are more than one + # ini files found in writeConfig(): + # Making it global, check if there's better way: + global caller_name + caller_name = os.path.basename(os.path.normpath(caller)) + except KeyError as e: + # The following code only works if something like this function is + # present in my_pipeline.py script: + # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location + f = sys._getframe(2) + caller = inspect.getargvalues(f).locals["__file__"] + cmd_caller = os.path.basename(os.path.normpath(caller)) + # As above, save the command called in a separate variable: + global caller_name + caller_name = cmd_caller + cmd_caller = importlib.import_module(cmd_caller) + caller = cmd_caller.getDir() + else: + print('''Unable to find path to file being executed. Probably because + CGATPipelines and the pipeline that is being executed + cannot figure out where each other lives. Raise an issue in + GitHub if possible. Exiting.''') + + # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified + # version would only have pipe_XX/ + # so creating an additional pipeline_path + # TO DO: clean this up + pipeline_path = os.path.splitext(caller)[0] + pipeline_path_2 = os.path.dirname(pipeline_path) + # CGATPipelines have a "configuration" folder + # adding a glob to have a bit more flexibility + general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + + '/**/configuration*'), recursive = True) + + if not general_path: + general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") + + config_paths.extend([pipeline_path, pipeline_path_2]) + # Extend separately in case general_path returns more than one file: + config_paths.extend(general_path) + writeConfigFiles(config_paths) elif options.pipeline_action == "clone": clonePipeline(options.pipeline_targets[0]) From ef798d1ab7c75b14604773cf145d32d22c1554e3 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Tue, 9 Jan 2018 17:55:04 +0000 Subject: [PATCH 08/21] control.py changes --- CGATPipelines/Pipeline/Control.py | 127 ++++++++++-------- ...{Control.py.works_mac => Control.py.works} | 1 - 2 files changed, 73 insertions(+), 55 deletions(-) rename CGATPipelines/Pipeline/{Control.py.works_mac => Control.py.works} (99%) diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index 898ceb7c..884c6217 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -73,35 +73,46 @@ def getConfigPaths(): ''' - Search the current and installation paths where the configuration files live. + Search the current and installation paths where the configuration files + live for the pipeline being called. ''' # (Antonio) I've modified this section, see explanation and changes in the # writeConfigFiles function above. 
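     # Background note: sys._getframe(1) below inspects the caller's stack
     # frame, so "__file__" in that frame points at the pipeline script that
     # invoked this code (e.g. a hypothetical pipeline_QTL.py) rather than
     # at Control.py itself.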
config_paths = [] + # Get the name of the pipeline being called + # This could be: + # cgatflow readqc config + # pipeline_QTL config + # python /YYYY//XXXX/pipeline_XXXX.py config try: f = sys._getframe(1) - caller = inspect.getargvalues(f).locals["__file__"] + caller = f.f_globals["__file__"] # cgatflow config + #caller = inspect.getargvalues(f).locals["__file__"] # Make it easier to match the name of the command executed so that # the config file can be searched in case there are more than one # ini files found in writeConfig(): caller_name = os.path.basename(os.path.normpath(caller)) + print('try 1', f, caller, caller_name) except KeyError as e: # The following code only works if something like this function is # present in my_pipeline.py script: # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location f = sys._getframe(2) - caller = inspect.getargvalues(f).locals["__file__"] + caller = f.f_globals["__file__"] # cgatflow config + #caller = inspect.getargvalues(f).locals["__file__"] cmd_caller = os.path.basename(os.path.normpath(caller)) + print('first defs try 2', f, caller, cmd_caller) # As above, save the command called in a separate variable: caller_name = cmd_caller cmd_caller = importlib.import_module(cmd_caller) caller = cmd_caller.getDir() - else: - print('''Unable to find path to file being executed. Probably because - CGATPipelines and the pipeline that is being executed - cannot figure out where each other lives. Raise an issue in - GitHub if possible. Exiting.''') - + print('2nd defs try 2', caller_name, cmd_caller, caller) + #else: + # print('''Unable to find path to file being executed. Probably because + # CGATPipelines and the pipeline that is being executed + # cannot figure out where each other lives. Raise an issue in + # GitHub if possible. Exiting.''') + # sys.exit() # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified # version would only have pipe_XX/ # so creating an additional pipeline_path @@ -111,7 +122,7 @@ def getConfigPaths(): # CGATPipelines have a "configuration" folder # adding a glob to have a bit more flexibility general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + - '/**/configuration*'), recursive = True) + '/*/configuration*'), recursive = True) if not general_path: general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") @@ -120,10 +131,10 @@ def getConfigPaths(): # Extend separately in case general_path returns more than one file: config_paths.extend(general_path) + print(config_paths, caller_name) return(config_paths, caller_name) - -def writeConfigFiles(paths, caller_name): +def writeConfigFiles(config_paths, caller_name): '''create default configuration files in `path`. 
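     In outline: the supplied paths are searched for a pipeline*ini file,
     which is copied to the working directory; conf.py and any
     sphinx-quickstart skeleton files are then copied into a
     'pipeline_report' directory, with a conf.py symlink left in the
     working directory for CGATReport.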
''' # TO DO: I've modified this function with workarounds to make it more @@ -134,6 +145,10 @@ def writeConfigFiles(paths, caller_name): # See also getConfigPaths() above, these run when calling the 'config' option # Antonio report_dir = 'pipeline_report' + config_files = [] + print(config_paths) + print(caller_name) + try: os.mkdir(report_dir) # Sphinx config files will be copied here # CGATReport only needs its conf.py to generate the rest @@ -145,7 +160,7 @@ def writeConfigFiles(paths, caller_name): # Look for ini file: f_count = 0 INI_list = [] - for path in paths: + for path in config_paths: if os.path.exists(path) and os.path.isdir(path): for f in os.listdir(os.path.abspath(path)): if fnmatch.fnmatch(f, 'pipeline*ini'): @@ -158,44 +173,48 @@ def writeConfigFiles(paths, caller_name): elif f_count > 1: # Prioritise the file that contains the command called if more than one - # ini file are found: + # ini files are found: for f in INI_list: if caller_name in f: + count += 1 INI_file = f config_files = [INI_file] - else: - if f_count == 0: - print(''' - No configuration (ini) files found in: - {} - '''.format(paths) - ) - else: - print(''' - Found several ini files but could not prioritise based on: - {} - Exiting. - '''.format(caller_name)) - sys.exit() + if count == 0: + E.warn(''' + Found several ini files but could not prioritise based on: + {}. + Some pipelines do not require an ini file though, try + without. + '''.format(caller_name)) + + if f_count == 0: + E.warn(''' + No configuration (ini) files found in: + {} + '''.format(config_paths) + ) # Copy pipeline ini file: - for dest in config_files: - if os.path.exists(dest): - E.warn("file `%s` already exists - skipped" % dest) - continue + if not config_files: + E.warn('No configuration files found.') + else: + for dest in config_files: + if os.path.exists(dest): + E.warn("file `%s` already exists - skipped" % dest) + continue - for path in paths: - src = os.path.join(path, dest) - if os.path.exists(src): - shutil.copyfile(src, dest) - E.info("created new configuration file `%s` " % dest) - break - else: - raise ValueError('''default config file for `%s` - not found in - %s - A pipeline cannot be run without this. - ''' % (config_files, paths)) + for path in config_paths: + src = os.path.join(path, dest) + if os.path.exists(src): + shutil.copyfile(src, dest) + E.info("created new configuration file `%s` " % dest) + break + else: + raise ValueError('''default config file for `%s` + not found in + %s + A pipeline cannot be run without this. 
+ ''' % (config_files, config_paths)) # Copy Sphinx configuration files, enforce copy of 'conf.py' in case # CGATReport is used: @@ -203,7 +222,7 @@ def writeConfigFiles(paths, caller_name): if os.path.exists(dest): E.warn("file `%s` already exists - skipped" % dest) - for path in paths: + for path in config_paths: src = os.path.join(path, dest) if os.path.exists(src): # Put sphinx files in separate dir: @@ -217,7 +236,8 @@ def writeConfigFiles(paths, caller_name): # Only warn as pipeline can be run without report: E.warn('''default config file for `%s` not found in %s - CGATReport nor Sphinx can be run without this''' % (dest, paths)) + CGATReport nor Sphinx can be run without this''' % (dest, + config_paths)) # If other Sphinx config files are found, copy them if there is a skeleton # pipeline report to use: @@ -233,7 +253,7 @@ def writeConfigFiles(paths, caller_name): # Look for a pipeline report file: f_count = 0 - for path in paths: + for path in config_paths: if os.path.exists(path): for f in os.listdir(os.path.abspath(path)): # TO DO: @@ -253,17 +273,14 @@ def writeConfigFiles(paths, caller_name): report_pipeline_*.rst in the directories: {} - {} - or - {} Ignore this if you are using CGATReport. - '''.format(pipeline_path, pipeline_path_2, general_path) + '''.format(config_paths) ) # Copy the files across if they are found: f_count = 0 # Check all the paths and their files given above when searching for config files: - for path in paths: + for path in config_paths: if os.path.exists(path): for f in os.listdir(path): # For each file or search term given, match to an existing file: @@ -297,7 +314,7 @@ def writeConfigFiles(paths, caller_name): were found in {} - Continuing without.'''.format(dest, paths)) + Continuing without.'''.format(dest, config_paths)) def printConfigFiles(): ''' @@ -1223,7 +1240,9 @@ def main(args=sys.argv): printConfigFiles() elif options.pipeline_action == "config": - writeConfigFiles(getConfigPaths()) + config_paths = getConfigPaths()[0] + caller_name = getConfigPaths()[1] + writeConfigFiles(config_paths, caller_name) elif options.pipeline_action == "clone": clonePipeline(options.pipeline_targets[0]) diff --git a/CGATPipelines/Pipeline/Control.py.works_mac b/CGATPipelines/Pipeline/Control.py.works similarity index 99% rename from CGATPipelines/Pipeline/Control.py.works_mac rename to CGATPipelines/Pipeline/Control.py.works index e785de16..87dde9b3 100644 --- a/CGATPipelines/Pipeline/Control.py.works_mac +++ b/CGATPipelines/Pipeline/Control.py.works @@ -1193,7 +1193,6 @@ def main(args=sys.argv): caller = inspect.getargvalues(f).locals["__file__"] cmd_caller = os.path.basename(os.path.normpath(caller)) # As above, save the command called in a separate variable: - global caller_name caller_name = cmd_caller cmd_caller = importlib.import_module(cmd_caller) caller = cmd_caller.getDir() From 50173b2c486784919b920e4d66d635f3a5ab4be3 Mon Sep 17 00:00:00 2001 From: Antonio Date: Wed, 17 Jan 2018 15:29:06 +0000 Subject: [PATCH 09/21] added function to search and import external pipeline, untested --- CGATPipelines/cgatflow.py | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/CGATPipelines/cgatflow.py b/CGATPipelines/cgatflow.py index 8d219115..c194eb53 100644 --- a/CGATPipelines/cgatflow.py +++ b/CGATPipelines/cgatflow.py @@ -55,6 +55,28 @@ def printListInColumns(l, ncolumns): # put it all together return '\n'.join([pattern % row for row in rows]) +def getExternalPipeline(): + ''' + Import external pipeline built 
using CGAT approach and installed as python module. + Assumes you have called using cgatflow, e.g. cgatflow externalPipeline CMD + ''' + # Get name of pipeline from command line arguments: + argv = sys.argv + command = argv[1] + command = re.sub("-", "_", command) + pipeline = "pipeline_{}".format(command) + + # Get path to where the pipeline is installed: + path = os.path.join(os.path.abspath(os.path.dirname(command.__file__))) + + # Import it: + (file, pathname, description) = imp.find_module(pipeline, path) + module = imp.load_module(pipeline, file, pathname, description) + # remove 'cgatflow' from sys.argv + del sys.argv[0] + module.main(sys.argv) + + return def main(argv=None): @@ -82,11 +104,15 @@ def main(argv=None): command = re.sub("-", "_", command) pipeline = "pipeline_{}".format(command) - (file, pathname, description) = imp.find_module(pipeline, paths) - module = imp.load_module(pipeline, file, pathname, description) - # remove 'cgatflow' from sys.argv - del sys.argv[0] - module.main(sys.argv) + try: + (file, pathname, description) = imp.find_module(pipeline, paths) + module = imp.load_module(pipeline, file, pathname, description) + # remove 'cgatflow' from sys.argv + del sys.argv[0] + module.main(sys.argv) + + except ImportError: + getExternalPipeline() if __name__ == "__main__": sys.exit(main()) From bf20e7501984486441b59e8ad14fc01e99a0d420 Mon Sep 17 00:00:00 2001 From: Antonio Date: Wed, 17 Jan 2018 15:33:46 +0000 Subject: [PATCH 10/21] docstrings for cgatflow external pipeline import function --- CGATPipelines/cgatflow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CGATPipelines/cgatflow.py b/CGATPipelines/cgatflow.py index c194eb53..0615b88c 100644 --- a/CGATPipelines/cgatflow.py +++ b/CGATPipelines/cgatflow.py @@ -59,6 +59,8 @@ def getExternalPipeline(): ''' Import external pipeline built using CGAT approach and installed as python module. Assumes you have called using cgatflow, e.g. 
cgatflow externalPipeline CMD
+    "externalPipeline" should be the load_entry_point command in setup.py
+    and the main pipeline script needs to be "pipeline_externalPipeline.py"
     '''
     # Get name of pipeline from command line arguments:
     argv = sys.argv

From 74895c8e2e772106e0901fae6c6f18beebd4a059 Mon Sep 17 00:00:00 2001
From: AntonioJBT
Date: Wed, 17 Jan 2018 17:00:53 +0000
Subject: [PATCH 11/21] updates/testing

---
 CGATPipelines/Pipeline/Control.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py
index 884c6217..e98410ea 100644
--- a/CGATPipelines/Pipeline/Control.py
+++ b/CGATPipelines/Pipeline/Control.py
@@ -15,7 +15,6 @@
 ---------
 """
-
 import inspect
 import json
 import logging
@@ -87,26 +86,24 @@ def getConfigPaths():
     try:
         f = sys._getframe(1)
         caller = f.f_globals["__file__"] # cgatflow config
-        #caller = inspect.getargvalues(f).locals["__file__"]
         # Make it easier to match the name of the command executed so that
         # the config file can be searched in case there are more than one
         # ini files found in writeConfig():
         caller_name = os.path.basename(os.path.normpath(caller))
         print('try 1', f, caller, caller_name)
-    except KeyError as e:
+    except KeyError:
         # The following code only works if something like this function is
         # present in my_pipeline.py script:
         # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location
-        f = sys._getframe(2)
-        caller = f.f_globals["__file__"] # cgatflow config
-        #caller = inspect.getargvalues(f).locals["__file__"]
+        f = sys._getframe(0) # if e.g. call is direct for pipeline_QTL config
+        caller = inspect.getargvalues(f).locals["__file__"]
         cmd_caller = os.path.basename(os.path.normpath(caller))
-        print('first defs try 2', f, caller, cmd_caller)
+        print('1st', caller_name, cmd_caller, caller)
         # As above, save the command called in a separate variable:
         caller_name = cmd_caller
         cmd_caller = importlib.import_module(cmd_caller)
         caller = cmd_caller.getDir()
-        print('2nd defs try 2', caller_name, cmd_caller, caller)
+        print('2nd', caller_name, cmd_caller, caller)
     #else:
    #    print('''Unable to find path to file being executed.
Probably because # CGATPipelines and the pipeline that is being executed @@ -129,7 +126,7 @@ def getConfigPaths(): # Add paths to search list: config_paths.extend([pipeline_path, pipeline_path_2]) # Extend separately in case general_path returns more than one file: - config_paths.extend(general_path) + config_paths.append(general_path) print(config_paths, caller_name) return(config_paths, caller_name) @@ -146,8 +143,8 @@ def writeConfigFiles(config_paths, caller_name): # Antonio report_dir = 'pipeline_report' config_files = [] - print(config_paths) - print(caller_name) + #print(config_paths) + #print(caller_name) try: os.mkdir(report_dir) # Sphinx config files will be copied here From d71189592dba8b791bb2d3a84a4d1d3bf24dc9ec Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Wed, 17 Jan 2018 20:57:43 +0000 Subject: [PATCH 12/21] added options for calling external pipelines to cgatflow --- CGATPipelines/cgatflow.py | 108 ++++++++++++++++++++++++++++++++------ 1 file changed, 93 insertions(+), 15 deletions(-) diff --git a/CGATPipelines/cgatflow.py b/CGATPipelines/cgatflow.py index 0615b88c..205b1924 100644 --- a/CGATPipelines/cgatflow.py +++ b/CGATPipelines/cgatflow.py @@ -22,7 +22,9 @@ import re import glob import imp +import importlib # imp is being deprecated in favour of importlib import collections +import shutil import CGATPipelines @@ -58,22 +60,94 @@ def printListInColumns(l, ncolumns): def getExternalPipeline(): ''' Import external pipeline built using CGAT approach and installed as python module. - Assumes you have called using cgatflow, e.g. cgatflow externalPipeline CMD - "externalPipeline" should be the load_entry_point coomand in setup.py - and the main pipeline script needs to be "pipeline_externalPipeline.py" + Assumes you have called using cgatflow, e.g. cgatflow pipeline_external CMD + The main pipeline script needs to be "pipeline_external.py" + with the directory structure as in CGATPipelines + Alternatively, "pipeline_external" could be the load_entry_point command in setup.py + without the need for cgatflow ''' # Get name of pipeline from command line arguments: argv = sys.argv - command = argv[1] - command = re.sub("-", "_", command) - pipeline = "pipeline_{}".format(command) - - # Get path to where the pipeline is installed: - path = os.path.join(os.path.abspath(os.path.dirname(command.__file__))) - - # Import it: - (file, pathname, description) = imp.find_module(pipeline, path) - module = imp.load_module(pipeline, file, pathname, description) + command = argv[1] #e.g. cgatflow pipeline_external CMD + + try: + # System path to the command: + sys_cmd_path = shutil.which(str(command)) + # Get the base command if cgatflow was given a full path to the command: + # e.g. 
cgatflow /path_to/CGATPipelines/CGATPipelines/pipeline_readqc.py -h + # Unnecessary though as can just be called as + # python /path_to/CGATPipelines/CGATPipelines/pipeline_readqc.py -h + command_sanitised = os.path.basename(sys_cmd_path) + if command_sanitised.endswith('.py'): + command_sanitised = command_sanitised.replace('.py', '') + else: + pass + # Set the paths to search for the module: + command_path = os.path.abspath(os.path.dirname(sys_cmd_path)) + command_path_up = os.path.abspath(os.path.join(command_path, '..')) + relpath = os.path.abspath("../src") + relpath_2 = os.path.abspath("../code") + # If dependencies are missing (ImportErrors in main() below, they'll be + # sent here and picked as AttributeError from shutil.which(), coded just above): + except AttributeError: + print(''' + The pipeline provided to cgatflow could not be found. + Are you trying to run an external pipeline? + Is the command line correctly specified? + Are all dependencies installed? + Try cgatflow --help for more info or raise an issue in GitHub + with the following trace please: + ''') + raise + + paths = [command_path, command_path_up, relpath, relpath_2] + print(paths) + + # Import module as given: + try: + module = importlib.import_module(command) + #module_spec = importlib.util.find_spec(str(command)) + #(file, pathname, description) = imp.find_module(str(command), paths) + except (ImportError, ValueError, TypeError): + pass + # Import it as sanitised: + try: + module = importlib.import_module(command_sanitised) + #module_spec = importlib.util.find_spec(str(command_sanitised)) + #(file, pathname, description) = imp.find_module(str(command_sanitised), paths) + command = command_sanitised + except: + print(''' + Error. Tried importing the pipeline provided as + {} + and + {} + but it did not work. + See trace below and also try + cgatflow --help + for more information. + '''.format(command, command_sanitised) + ) + raise + + # If module was found: + #if module_spec: + #print('Found {}, loading ...'.format(command)) + #module = importlib.import_module(command) + #module = importlib.util.module_from_spec(module_spec) + #module_spec.loader.exec_module(module) + #else: + # print(''' + # Error. Module + # {} + # could not be loaded. + # Is your pipeline an importable module? + # All dependencies installed? + # See trace below, try cgatflow --help or raise an issue in GitHub. + # '''.format(command)) + + print(dir(module)) + #module = imp.load_module(str(command), file, pathname, description) # remove 'cgatflow' from sys.argv del sys.argv[0] module.main(sys.argv) @@ -89,7 +163,7 @@ def main(argv=None): relpath = os.path.abspath("../src") paths = [path, relpath] - + if len(argv) == 1 or argv[1] == "--help" or argv[1] == "-h": pipelines = [] for path in paths: @@ -105,7 +179,7 @@ def main(argv=None): command = argv[1] command = re.sub("-", "_", command) pipeline = "pipeline_{}".format(command) - + try: (file, pathname, description) = imp.find_module(pipeline, paths) module = imp.load_module(pipeline, file, pathname, description) @@ -113,6 +187,10 @@ def main(argv=None): del sys.argv[0] module.main(sys.argv) + # If the command is an external pipeline this allows to search for it with + # getExternalPipeline() above. If dependencies are not installed they'll be + # picked up here as well though but should error inside + # getExternalPipeline(). 
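+    # Illustrative flow (pipeline name hypothetical): 'cgatflow my_tool config'
+    # first tries to import pipeline_my_tool from the CGATPipelines paths;
+    # on ImportError, getExternalPipeline() above resolves 'my_tool' on the
+    # system PATH via shutil.which() and imports that module instead.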
except ImportError: getExternalPipeline() From 1aee135a0b4c693a83d35619075c8f2810f66dc7 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Wed, 17 Jan 2018 21:12:16 +0000 Subject: [PATCH 13/21] updates/testing --- CGATPipelines/cgatflow.py | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/CGATPipelines/cgatflow.py b/CGATPipelines/cgatflow.py index 205b1924..9be9f3e4 100644 --- a/CGATPipelines/cgatflow.py +++ b/CGATPipelines/cgatflow.py @@ -101,20 +101,16 @@ def getExternalPipeline(): raise paths = [command_path, command_path_up, relpath, relpath_2] - print(paths) # Import module as given: + # With importlib paths are not needed, leaving for now though. try: module = importlib.import_module(command) - #module_spec = importlib.util.find_spec(str(command)) - #(file, pathname, description) = imp.find_module(str(command), paths) except (ImportError, ValueError, TypeError): pass # Import it as sanitised: try: module = importlib.import_module(command_sanitised) - #module_spec = importlib.util.find_spec(str(command_sanitised)) - #(file, pathname, description) = imp.find_module(str(command_sanitised), paths) command = command_sanitised except: print(''' @@ -130,27 +126,12 @@ def getExternalPipeline(): ) raise - # If module was found: - #if module_spec: - #print('Found {}, loading ...'.format(command)) - #module = importlib.import_module(command) - #module = importlib.util.module_from_spec(module_spec) - #module_spec.loader.exec_module(module) - #else: - # print(''' - # Error. Module - # {} - # could not be loaded. - # Is your pipeline an importable module? - # All dependencies installed? - # See trace below, try cgatflow --help or raise an issue in GitHub. - # '''.format(command)) - - print(dir(module)) - #module = imp.load_module(str(command), file, pathname, description) + #print(dir(module)) # remove 'cgatflow' from sys.argv del sys.argv[0] module.main(sys.argv) + except AttributeError: + return From db7f8a5e5f85f9b9425e9f6e42261b5d9eddeaa1 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Wed, 17 Jan 2018 21:12:56 +0000 Subject: [PATCH 14/21] updates/testing --- CGATPipelines/cgatflow.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/CGATPipelines/cgatflow.py b/CGATPipelines/cgatflow.py index 9be9f3e4..caa30be1 100644 --- a/CGATPipelines/cgatflow.py +++ b/CGATPipelines/cgatflow.py @@ -130,9 +130,6 @@ def getExternalPipeline(): # remove 'cgatflow' from sys.argv del sys.argv[0] module.main(sys.argv) - except AttributeError: - - return def main(argv=None): From a3f1497f194d9665c75d7bbd6166eb6dfa5e1252 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Mon, 22 Jan 2018 13:59:13 +0000 Subject: [PATCH 15/21] returned cgatflow to original code, no changes, easier to call external pipelines through entry_point --- CGATPipelines/cgatflow.py | 98 +++------------------------------------ 1 file changed, 7 insertions(+), 91 deletions(-) diff --git a/CGATPipelines/cgatflow.py b/CGATPipelines/cgatflow.py index caa30be1..8d219115 100644 --- a/CGATPipelines/cgatflow.py +++ b/CGATPipelines/cgatflow.py @@ -22,9 +22,7 @@ import re import glob import imp -import importlib # imp is being deprecated in favour of importlib import collections -import shutil import CGATPipelines @@ -57,80 +55,6 @@ def printListInColumns(l, ncolumns): # put it all together return '\n'.join([pattern % row for row in rows]) -def getExternalPipeline(): - ''' - Import external pipeline built using CGAT approach and installed as python module. - Assumes you have called using cgatflow, e.g. 
cgatflow pipeline_external CMD - The main pipeline script needs to be "pipeline_external.py" - with the directory structure as in CGATPipelines - Alternatively, "pipeline_external" could be the load_entry_point command in setup.py - without the need for cgatflow - ''' - # Get name of pipeline from command line arguments: - argv = sys.argv - command = argv[1] #e.g. cgatflow pipeline_external CMD - - try: - # System path to the command: - sys_cmd_path = shutil.which(str(command)) - # Get the base command if cgatflow was given a full path to the command: - # e.g. cgatflow /path_to/CGATPipelines/CGATPipelines/pipeline_readqc.py -h - # Unnecessary though as can just be called as - # python /path_to/CGATPipelines/CGATPipelines/pipeline_readqc.py -h - command_sanitised = os.path.basename(sys_cmd_path) - if command_sanitised.endswith('.py'): - command_sanitised = command_sanitised.replace('.py', '') - else: - pass - # Set the paths to search for the module: - command_path = os.path.abspath(os.path.dirname(sys_cmd_path)) - command_path_up = os.path.abspath(os.path.join(command_path, '..')) - relpath = os.path.abspath("../src") - relpath_2 = os.path.abspath("../code") - # If dependencies are missing (ImportErrors in main() below, they'll be - # sent here and picked as AttributeError from shutil.which(), coded just above): - except AttributeError: - print(''' - The pipeline provided to cgatflow could not be found. - Are you trying to run an external pipeline? - Is the command line correctly specified? - Are all dependencies installed? - Try cgatflow --help for more info or raise an issue in GitHub - with the following trace please: - ''') - raise - - paths = [command_path, command_path_up, relpath, relpath_2] - - # Import module as given: - # With importlib paths are not needed, leaving for now though. - try: - module = importlib.import_module(command) - except (ImportError, ValueError, TypeError): - pass - # Import it as sanitised: - try: - module = importlib.import_module(command_sanitised) - command = command_sanitised - except: - print(''' - Error. Tried importing the pipeline provided as - {} - and - {} - but it did not work. - See trace below and also try - cgatflow --help - for more information. - '''.format(command, command_sanitised) - ) - raise - - #print(dir(module)) - # remove 'cgatflow' from sys.argv - del sys.argv[0] - module.main(sys.argv) - return def main(argv=None): @@ -141,7 +65,7 @@ def main(argv=None): relpath = os.path.abspath("../src") paths = [path, relpath] - + if len(argv) == 1 or argv[1] == "--help" or argv[1] == "-h": pipelines = [] for path in paths: @@ -157,20 +81,12 @@ def main(argv=None): command = argv[1] command = re.sub("-", "_", command) pipeline = "pipeline_{}".format(command) - - try: - (file, pathname, description) = imp.find_module(pipeline, paths) - module = imp.load_module(pipeline, file, pathname, description) - # remove 'cgatflow' from sys.argv - del sys.argv[0] - module.main(sys.argv) - - # If the command is an external pipeline this allows to search for it with - # getExternalPipeline() above. If dependencies are not installed they'll be - # picked up here as well though but should error inside - # getExternalPipeline(). 
- except ImportError: - getExternalPipeline() + + (file, pathname, description) = imp.find_module(pipeline, paths) + module = imp.load_module(pipeline, file, pathname, description) + # remove 'cgatflow' from sys.argv + del sys.argv[0] + module.main(sys.argv) if __name__ == "__main__": sys.exit(main()) From 21a41724e60587a18737c4581f6b1618c2d51c2f Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Mon, 22 Jan 2018 17:01:24 +0000 Subject: [PATCH 16/21] updates/testing --- CGATPipelines/Pipeline/Control.py | 74 ++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 26 deletions(-) diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index e98410ea..2f3b012d 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -85,25 +85,26 @@ def getConfigPaths(): # python /YYYY//XXXX/pipeline_XXXX.py config try: f = sys._getframe(1) - caller = f.f_globals["__file__"] # cgatflow config + #caller = f.f_globals["__file__"] # cgatflow config + caller = f.f_locals["__file__"] # Make it easier to match the name of the command executed so that # the config file can be searched in case there are more than one # ini files found in writeConfig(): caller_name = os.path.basename(os.path.normpath(caller)) - print('try 1', f, caller, caller_name) except KeyError: # The following code only works if something like this function is # present in my_pipeline.py script: # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location - f = sys._getframe(0) # if e.g. call is direct for pipeline_QTL config - caller = inspect.getargvalues(f).locals["__file__"] - cmd_caller = os.path.basename(os.path.normpath(caller)) - print('1st', caller_name, cmd_caller, caller) + f = sys._getframe(2) # if e.g. call is direct for pipeline_QTL config + caller = f.f_globals["__file__"] + #caller = inspect.getargvalues(f).locals["__file__"] + cmd_caller = str(os.path.basename(os.path.normpath(caller))) # As above, save the command called in a separate variable: + if cmd_caller.endswith('.py'): + cmd_caller = cmd_caller.replace('.py', '') caller_name = cmd_caller cmd_caller = importlib.import_module(cmd_caller) caller = cmd_caller.getDir() - print('2nd', caller_name, cmd_caller, caller) #else: # print('''Unable to find path to file being executed. 
Probably because # CGATPipelines and the pipeline that is being executed @@ -118,17 +119,17 @@ def getConfigPaths(): pipeline_path_2 = os.path.dirname(pipeline_path) # CGATPipelines have a "configuration" folder # adding a glob to have a bit more flexibility - general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + - '/*/configuration*'), recursive = True) + general_path = glob.glob(str(os.path.abspath(pipeline_path) + + '/configuratio*'), recursive = True) if not general_path: - general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") + general_path = [os.path.join(os.path.dirname(pipeline_path), + "configuration")] # Add paths to search list: config_paths.extend([pipeline_path, pipeline_path_2]) # Extend separately in case general_path returns more than one file: - config_paths.append(general_path) + config_paths.extend(general_path) - print(config_paths, caller_name) return(config_paths, caller_name) def writeConfigFiles(config_paths, caller_name): @@ -143,8 +144,6 @@ def writeConfigFiles(config_paths, caller_name): # Antonio report_dir = 'pipeline_report' config_files = [] - #print(config_paths) - #print(caller_name) try: os.mkdir(report_dir) # Sphinx config files will be copied here @@ -158,33 +157,51 @@ def writeConfigFiles(config_paths, caller_name): f_count = 0 INI_list = [] for path in config_paths: + path = str(path) if os.path.exists(path) and os.path.isdir(path): for f in os.listdir(os.path.abspath(path)): if fnmatch.fnmatch(f, 'pipeline*ini'): f_count += 1 INI_file = f - INI_list.extend([INI_file]) + INI_list.append(INI_file) if f_count == 1: - config_files = [INI_file] # This is for the pipeline only + config_files = INI_list # This is for the pipeline only elif f_count > 1: + new_count = 0 # Prioritise the file that contains the command called if more than one # ini files are found: for f in INI_list: if caller_name in f: - count += 1 + new_count += 1 INI_file = f - config_files = [INI_file] - if count == 0: + config_files.append(INI_file) + if new_count > 1: E.warn(''' Found several ini files but could not prioritise based on: - {}. - Some pipelines do not require an ini file though, try - without. - '''.format(caller_name)) + {} + as more than one matched. 
+ Using the first one found: + {} + from + {} + '''.format(caller_name, config_files[0], config_files) + ) + elif new_count == 1: + pass + elif new_count == 0: + print(''' + More than one ini file found but none matched + {} + Using the first one found: + {} + from + {} + '''.format(caller_name, config_files[0], config_files) + ) - if f_count == 0: + elif f_count == 0: E.warn(''' No configuration (ini) files found in: {} @@ -196,11 +213,13 @@ def writeConfigFiles(config_paths, caller_name): E.warn('No configuration files found.') else: for dest in config_files: + dest = str(dest) if os.path.exists(dest): E.warn("file `%s` already exists - skipped" % dest) continue for path in config_paths: + path = str(path) src = os.path.join(path, dest) if os.path.exists(src): shutil.copyfile(src, dest) @@ -220,6 +239,7 @@ def writeConfigFiles(config_paths, caller_name): E.warn("file `%s` already exists - skipped" % dest) for path in config_paths: + path = str(path) src = os.path.join(path, dest) if os.path.exists(src): # Put sphinx files in separate dir: @@ -251,6 +271,7 @@ def writeConfigFiles(config_paths, caller_name): # Look for a pipeline report file: f_count = 0 for path in config_paths: + path = str(path) if os.path.exists(path): for f in os.listdir(os.path.abspath(path)): # TO DO: @@ -278,6 +299,7 @@ def writeConfigFiles(config_paths, caller_name): f_count = 0 # Check all the paths and their files given above when searching for config files: for path in config_paths: + path = str(path) if os.path.exists(path): for f in os.listdir(path): # For each file or search term given, match to an existing file: @@ -1237,8 +1259,8 @@ def main(args=sys.argv): printConfigFiles() elif options.pipeline_action == "config": - config_paths = getConfigPaths()[0] - caller_name = getConfigPaths()[1] + config_paths, caller_name = getConfigPaths() + print(config_paths, caller_name) writeConfigFiles(config_paths, caller_name) elif options.pipeline_action == "clone": From cc7535781d0113c6c6579940b9bed7b669573ec4 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Tue, 23 Jan 2018 12:46:16 +0000 Subject: [PATCH 17/21] control.py --- CGATPipelines/Pipeline/Control.py | 32 ++++++++++++------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index 2f3b012d..706853b0 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -86,7 +86,8 @@ def getConfigPaths(): try: f = sys._getframe(1) #caller = f.f_globals["__file__"] # cgatflow config - caller = f.f_locals["__file__"] + # globals will get Control.py + caller = f.f_locals["__file__"] # TO DO: cgatflow # Make it easier to match the name of the command executed so that # the config file can be searched in case there are more than one # ini files found in writeConfig(): @@ -97,14 +98,9 @@ def getConfigPaths(): # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location f = sys._getframe(2) # if e.g. 
call is direct for pipeline_QTL config caller = f.f_globals["__file__"] - #caller = inspect.getargvalues(f).locals["__file__"] - cmd_caller = str(os.path.basename(os.path.normpath(caller))) - # As above, save the command called in a separate variable: - if cmd_caller.endswith('.py'): - cmd_caller = cmd_caller.replace('.py', '') - caller_name = cmd_caller - cmd_caller = importlib.import_module(cmd_caller) - caller = cmd_caller.getDir() + caller_name = os.path.basename(os.path.normpath(caller)) + if caller_name.endswith('.py'): + caller_name = caller_name.replace('.py', '') #else: # print('''Unable to find path to file being executed. Probably because # CGATPipelines and the pipeline that is being executed @@ -117,10 +113,12 @@ def getConfigPaths(): # TO DO: clean this up pipeline_path = os.path.splitext(caller)[0] pipeline_path_2 = os.path.dirname(pipeline_path) - # CGATPipelines have a "configuration" folder - # adding a glob to have a bit more flexibility - general_path = glob.glob(str(os.path.abspath(pipeline_path) + - '/configuratio*'), recursive = True) + # CGATPipelines have a "configuration" folder + # adding a glob to have a bit more flexibility + # TO DO: add max depth to glob recursion: + general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + + '/**/configuration*'), recursive = True) + if not general_path: general_path = [os.path.join(os.path.dirname(pipeline_path), "configuration")] @@ -129,7 +127,7 @@ def getConfigPaths(): config_paths.extend([pipeline_path, pipeline_path_2]) # Extend separately in case general_path returns more than one file: config_paths.extend(general_path) - + print(config_paths, caller_name) return(config_paths, caller_name) def writeConfigFiles(config_paths, caller_name): @@ -157,7 +155,6 @@ def writeConfigFiles(config_paths, caller_name): f_count = 0 INI_list = [] for path in config_paths: - path = str(path) if os.path.exists(path) and os.path.isdir(path): for f in os.listdir(os.path.abspath(path)): if fnmatch.fnmatch(f, 'pipeline*ini'): @@ -219,7 +216,6 @@ def writeConfigFiles(config_paths, caller_name): continue for path in config_paths: - path = str(path) src = os.path.join(path, dest) if os.path.exists(src): shutil.copyfile(src, dest) @@ -239,7 +235,6 @@ def writeConfigFiles(config_paths, caller_name): E.warn("file `%s` already exists - skipped" % dest) for path in config_paths: - path = str(path) src = os.path.join(path, dest) if os.path.exists(src): # Put sphinx files in separate dir: @@ -271,7 +266,6 @@ def writeConfigFiles(config_paths, caller_name): # Look for a pipeline report file: f_count = 0 for path in config_paths: - path = str(path) if os.path.exists(path): for f in os.listdir(os.path.abspath(path)): # TO DO: @@ -299,7 +293,6 @@ def writeConfigFiles(config_paths, caller_name): f_count = 0 # Check all the paths and their files given above when searching for config files: for path in config_paths: - path = str(path) if os.path.exists(path): for f in os.listdir(path): # For each file or search term given, match to an existing file: @@ -1260,7 +1253,6 @@ def main(args=sys.argv): elif options.pipeline_action == "config": config_paths, caller_name = getConfigPaths() - print(config_paths, caller_name) writeConfigFiles(config_paths, caller_name) elif options.pipeline_action == "clone": From a41c6fcf69b0384eea126e14c8822a90f331eb72 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Tue, 23 Jan 2018 13:17:12 +0000 Subject: [PATCH 18/21] control.py for external pipelines --- CGATPipelines/Pipeline/Control.py | 6 - 
CGATPipelines/Pipeline/Control.py.works | 1231 ----------------------- 2 files changed, 1237 deletions(-) delete mode 100644 CGATPipelines/Pipeline/Control.py.works diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index 706853b0..101d3d54 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -93,9 +93,6 @@ def getConfigPaths(): # ini files found in writeConfig(): caller_name = os.path.basename(os.path.normpath(caller)) except KeyError: - # The following code only works if something like this function is - # present in my_pipeline.py script: - # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location f = sys._getframe(2) # if e.g. call is direct for pipeline_QTL config caller = f.f_globals["__file__"] caller_name = os.path.basename(os.path.normpath(caller)) @@ -109,8 +106,6 @@ def getConfigPaths(): # sys.exit() # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified # version would only have pipe_XX/ - # so creating an additional pipeline_path - # TO DO: clean this up pipeline_path = os.path.splitext(caller)[0] pipeline_path_2 = os.path.dirname(pipeline_path) # CGATPipelines have a "configuration" folder @@ -127,7 +122,6 @@ def getConfigPaths(): config_paths.extend([pipeline_path, pipeline_path_2]) # Extend separately in case general_path returns more than one file: config_paths.extend(general_path) - print(config_paths, caller_name) return(config_paths, caller_name) def writeConfigFiles(config_paths, caller_name): diff --git a/CGATPipelines/Pipeline/Control.py.works b/CGATPipelines/Pipeline/Control.py.works deleted file mode 100644 index 87dde9b3..00000000 --- a/CGATPipelines/Pipeline/Control.py.works +++ /dev/null @@ -1,1231 +0,0 @@ -"""Control.py - Command line control for ruffus pipelines -========================================================= - -The functions :func:`writeConfigFiles`, :func:`clean`, -:func:`clonePipeline` and :func:`peekParameters` provide the -functionality for particular pipeline commands. - -:class:`MultiLineFormatter` improves the formatting -of long log messages, while -:class:`LoggingFilterRabbitMQ` intercepts ruffus log -messages and sends event information to a rabbitMQ message exchange -for task process monitoring. - -Reference ---------- - -""" - -import inspect -import json -import logging -import os -import re -import shutil -import subprocess -import sys -import tempfile -import time -import io -import glob -import fnmatch -import importlib - -from multiprocessing.pool import ThreadPool - -# talking to RabbitMQ -try: - import pika - HAS_PIKA = True -except ImportError: - HAS_PIKA = False - -# talking to a cluster -try: - import drmaa - HAS_DRMAA = True -except: -# the following does not work on Travis -#except ImportError or RuntimeError: - HAS_DRMAA = False - -from ruffus import pipeline_printout_graph, pipeline_printout, \ - pipeline_run, ruffus_exceptions, task - - -import CGAT.Experiment as E -import CGAT.IOTools as IOTools -from CGAT import Requirements as Requirements - -from CGATPipelines.Pipeline.Utils import isTest, getCaller, getCallerLocals -from CGATPipelines.Pipeline.Execution import execute, startSession,\ - closeSession -from CGATPipelines.Pipeline.Local import getProjectName, getPipelineName -from CGATPipelines.Pipeline.Parameters import inputValidation -# Set from Pipeline.py -PARAMS = {} - -# global options and arguments - set but currently not -# used as relevant sections are entered into the PARAMS -# dictionary. 
Could be deprecated and removed. -GLOBAL_OPTIONS, GLOBAL_ARGS = None, None - - -def writeConfigFiles(paths): - #pipeline_path, pipeline_path_2, general_path): - '''create default configuration files in `path`. - ''' - # TO DO: I've modified this function with workarounds to make it more - # flexible in order to find an ini file, find a configuration dir and - # copy pre-run sphinx-quickstart files if they exist. - # Other than creating a 'report' dir, it should not change the way it is - # run from CGATPipelines. - # See also bottom of script for changes when calling the 'config' option - # Antonio - #paths = [pipeline_path, pipeline_path_2, general_path] - report_dir = 'pipeline_report' - try: - os.mkdir(report_dir) # Sphinx config files will be copied here - # CGATReport only needs its conf.py to generate the rest - # though - except FileExistsError: - E.warn("directory `%s` already exists" % report_dir) - raise - - # Look for ini file: - f_count = 0 - INI_list = [] - for path in paths: - if os.path.exists(path) and os.path.isdir(path): - for f in os.listdir(os.path.abspath(path)): - if fnmatch.fnmatch(f, 'pipeline*ini'): - f_count += 1 - INI_file = f - INI_list.extend([INI_file]) - - if f_count == 1: - config_files = [INI_file] # This is for the pipeline only - - elif f_count > 1: - # Prioritise the file that contains the command called if more than one - # ini file are found: - for f in INI_list: - if caller_name in f: - INI_file = f - config_files = [INI_file] - else: - if f_count == 0: - print(''' - No configuration (ini) files found in: - {} - '''.format(paths) - ) - else: - print(''' - Found several ini files but could not prioritise based on: - {} - Exiting. - '''.format(caller_name)) - sys.exit() - - # Copy pipeline ini file: - for dest in config_files: - if os.path.exists(dest): - E.warn("file `%s` already exists - skipped" % dest) - continue - - for path in paths: - src = os.path.join(path, dest) - if os.path.exists(src): - shutil.copyfile(src, dest) - E.info("created new configuration file `%s` " % dest) - break - else: - raise ValueError('''default config file for `%s` - not found in - %s - A pipeline cannot be run without this. 
- ''' % (config_files, paths)) - - # Copy Sphinx configuration files, enforce copy of 'conf.py' in case - # CGATReport is used: - dest = 'conf.py' - if os.path.exists(dest): - E.warn("file `%s` already exists - skipped" % dest) - - for path in paths: - src = os.path.join(path, dest) - if os.path.exists(src): - # Put sphinx files in separate dir: - shutil.copyfile(src, os.path.join(report_dir, dest)) - # Create a softlink outside of report_dir dir for CGATReport: - os.symlink(os.path.join(report_dir, dest), str(dest)) - E.info("created new configuration file `%s` " % dest) - break - - else: - # Only warn as pipeline can be run without report: - E.warn('''default config file for `%s` not found in - %s - CGATReport nor Sphinx can be run without this''' % (dest, paths)) - - # If other Sphinx config files are found, copy them if there is a skeleton - # pipeline report to use: - E.info('Looking for additional Sphinx configuration files.') - sphinx_config_files = ['Makefile', - 'make.bat', - '*.rst', - '*.bib', - ] # These are for a sphinx setup, not needed - # with CGATReport - # A 'report_pipeline_*.rst' template is - # searched for below - - # Look for a pipeline report file: - f_count = 0 - for path in paths: - if os.path.exists(path): - for f in os.listdir(os.path.abspath(path)): - # TO DO: - # This pattern matching is particular to - # https://github.com/AntonioJBT/project_quickstart - # Needs to be made more generic - if fnmatch.fnmatch(f, 'report_pipeline_*.rst'): - f_count += 1 - pipeline_report_file = f - - if f_count == 1: - sphinx_config_files.append(pipeline_report_file) - - else: - # Only warn as pipeline can be run without report: - E.warn('''There is no pipeline report file matching - report_pipeline_*.rst - in the directories: - {} - {} - or - {} - Ignore this if you are using CGATReport. - '''.format(pipeline_path, pipeline_path_2, general_path) - ) - - # Copy the files across if they are found: - f_count = 0 - # Check all the paths and their files given above when searching for config files: - for path in paths: - if os.path.exists(path): - for f in os.listdir(path): - # For each file or search term given, match to an existing file: - for dest in sphinx_config_files: - if fnmatch.fnmatch(f, dest): - f_to_copy = f - # If a match is found, walk the cwd to check it's not - # already present: - for root, dirs, files in os.walk('.'): - if f_to_copy in files: - E.warn("file `%s` already exists - skipped" % f_to_copy) - continue - - # If not present, copy the file: - else: - f_count += 1 - src = os.path.join(path, f_to_copy) - if os.path.exists(src): - # Put sphinx files in separate dir: - shutil.copyfile(src, os.path.join(report_dir, - f_to_copy) - ) - E.info("created new configuration file `%s` " - % f_to_copy) - break - if f_count > 0: - pass - else: - E.warn('''No sphinx-quickstart skeleton files such as: - {} - were found - in - {} - Continuing without.'''.format(dest, paths)) - -def printConfigFiles(): - ''' - Print the list of .ini files used to configure the pipeline - along with their associated priorities. - Priority 1 is the highest. - ''' - - filenames = PARAMS['pipeline_ini'] - print("\n List of .ini files used to configure the pipeline") - s = len(filenames) - if s == 0: - print(" No ini files passed!") - elif s >= 1: - print(" %-11s: %s " % ("Priority", "File")) - for f in filenames: - if s == 1: - print(" (highest) %s: %s\n" % (s, f)) - else: - print(" %-11s: %s " % (s, f)) - s -= 1 - - -def clonePipeline(srcdir, destdir=None): - '''clone a pipeline. 
-
-
-def clonePipeline(srcdir, destdir=None):
-    '''clone a pipeline.
-
-    Cloning entails creating a mirror of the source pipeline.
-    Generally, data files are mirrored by linking. Configuration
-    files and the pipeline database will be copied.
-
-    Without modification of any files, building the cloned pipeline in
-    `destdir` should not re-run any commands. However, on deleting
-    selected files, the pipeline should run from the appropriate
-    point. Newly created files will not affect the original pipeline.
-
-    Cloning pipelines permits sharing partial results between
-    pipelines, for example for parameter optimization.
-
-    Arguments
-    ---------
-    srcdir : string
-        Source directory
-    destdir : string
-        Destination directory. If None, use the current directory.
-
-    '''
-
-    if destdir is None:
-        destdir = os.path.curdir
-
-    E.info("cloning pipeline from %s to %s" % (srcdir, destdir))
-
-    copy_files = ("conf.py", "pipeline.ini", "csvdb")
-    ignore_prefix = (
-        "report", "_cache", "export", "tmp", "ctmp",
-        "_static", "_templates")
-
-    def _ignore(p):
-        for x in ignore_prefix:
-            if p.startswith(x):
-                return True
-        return False
-
-    for root, dirs, files in os.walk(srcdir):
-
-        relpath = os.path.relpath(root, srcdir)
-        if _ignore(relpath):
-            continue
-
-        for d in dirs:
-            if _ignore(d):
-                continue
-            dest = os.path.join(os.path.join(destdir, relpath, d))
-            os.mkdir(dest)
-            # touch
-            s = os.stat(os.path.join(root, d))
-            os.utime(dest, (s.st_atime, s.st_mtime))
-
-        for f in files:
-            if _ignore(f):
-                continue
-
-            fn = os.path.join(root, f)
-            dest_fn = os.path.join(destdir, relpath, f)
-            if f in copy_files:
-                shutil.copyfile(fn, dest_fn)
-            else:
-                # realpath resolves links - thus links will be linked to
-                # the original target
-                os.symlink(os.path.realpath(fn),
-                           dest_fn)
-
-
-def clean(files, logfile):
-    '''clean up files given by glob expressions.
-
-    Files are cleaned up by zapping, i.e. the files are set to size
-    0. Links to files are replaced with place-holders.
-
-    Information about the original file is written to `logfile`.
-
-    Arguments
-    ---------
-    files : list
-        List of glob expressions of files to clean up.
-    logfile : string
-        Filename of logfile.
-
-    '''
-    fields = ('st_atime', 'st_blksize', 'st_blocks',
-              'st_ctime', 'st_dev', 'st_gid', 'st_ino',
-              'st_mode', 'st_mtime', 'st_nlink',
-              'st_rdev', 'st_size', 'st_uid')
-
-    dry_run = PARAMS.get("dryrun", False)
-
-    if not dry_run:
-        if not os.path.exists(logfile):
-            outfile = IOTools.openFile(logfile, "w")
-            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
-                          "\t".join(fields))
-        else:
-            outfile = IOTools.openFile(logfile, "a")
-
-    c = E.Counter()
-    for fn in files:
-        c.files += 1
-        if not dry_run:
-            stat, linkdest = IOTools.zapFile(fn)
-            if stat is not None:
-                c.zapped += 1
-                if linkdest is not None:
-                    c.links += 1
-                outfile.write("%s\t%s\t%s\t%s\n" % (
-                    fn,
-                    time.asctime(time.localtime(time.time())),
-                    linkdest,
-                    "\t".join([str(getattr(stat, x)) for x in fields])))
-
-    E.info("zapped: %s" % (c))
-    outfile.close()
-
-    return c
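
clean() delegates the zapping itself to IOTools.zapFile. Assuming zapFile behaves roughly as the docstring describes (truncate in place, report the old stat), a simplified sketch of the operation:

    import os

    def zap_file(path):
        '''Truncate `path` to zero bytes and return its previous os.stat result.
        Sketch only; the real IOTools.zapFile also replaces symlinks with
        place-holders.'''
        info = os.stat(path)
        with open(path, 'w'):
            pass  # opening with mode 'w' truncates the file to size 0
        return info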
-
-
-def peekParameters(workingdir,
-                   pipeline,
-                   on_error_raise=None,
-                   prefix=None,
-                   update_interface=False,
-                   restrict_interface=False):
-    '''peek configuration parameters from an external pipeline.
-
-    As the parameter dictionary is built at runtime, this method
-    executes the pipeline in `workingdir`, dumping its configuration
-    values and reading them into a dictionary.
-
-    If either `pipeline` or `workingdir` are not found, an error is
-    raised. This behaviour can be changed by setting `on_error_raise`
-    to False. In that case, an empty dictionary is returned.
-
-    Arguments
-    ---------
-    workingdir : string
-        Working directory. This is the directory that the pipeline
-        was executed in.
-    pipeline : string
-        Name of the pipeline script. The pipeline is assumed to live
-        in the same directory as the current pipeline.
-    on_error_raise : Bool
-        If set to a boolean, an error will be raised (or not) if there
-        is an error during parameter peeking, for example if
-        `workingdir` cannot be found. If `on_error_raise` is None, it
-        will be set to the default, which is to raise an exception
-        unless the calling script is imported or the option
-        ``--is-test`` has been passed at the command line.
-    prefix : string
-        Add a prefix to all parameters. This is useful if the parameters
-        are added to the configuration dictionary of the calling pipeline.
-    update_interface : bool
-        If True, this method will prefix any options in the
-        ``[interface]`` section with `workingdir`. This allows
-        transparent access to files in the external pipeline.
-    restrict_interface : bool
-        If True, only interface parameters will be imported.
-
-    Returns
-    -------
-    config : dict
-        Dictionary of configuration values.
-
-    '''
-    caller_locals = getCallerLocals()
-
-    # check if we should raise errors
-    if on_error_raise is None:
-        on_error_raise = not isTest() and \
-            "__name__" in caller_locals and \
-            caller_locals["__name__"] == "__main__"
-
-    # patch - if --help or -h in command line arguments,
-    # do not peek as there might be no config file.
-    if "--help" in sys.argv or "-h" in sys.argv:
-        return {}
-
-    # Attempt to locate directory with pipeline source code. This is a
-    # patch as pipelines might be called within the repository
-    # directory or from an installed location
-    dirname = PARAMS["pipelinedir"]
-
-    # called without a directory, use current directory
-    if dirname == "":
-        dirname = os.path.abspath(".")
-    else:
-        # if not exists, assume we want version located
-        # in directory of calling script.
-        if not os.path.exists(dirname):
-            # directory is path of calling script
-            dirname = os.path.dirname(caller_locals['__file__'])
-
-    pipeline = os.path.join(dirname, pipeline)
-    if not os.path.exists(pipeline):
-        if on_error_raise:
-            raise ValueError(
-                "can't find pipeline at %s" % (pipeline))
-        else:
-            return {}
-
-    if workingdir == "":
-        workingdir = os.path.abspath(".")
-
-    # patch for the "config" target - use the default pipeline directory
-    # if a directory is not specified; the working dir is set to "?!"
-    if ("config" in sys.argv or "check" in sys.argv or "clone" in sys.argv) \
-       and workingdir == "?!":
-        workingdir = os.path.join(PARAMS.get("pipelinedir"),
-                                  IOTools.snip(pipeline, ".py"))
-
-    if not os.path.exists(workingdir):
-        if on_error_raise:
-            raise ValueError(
-                "can't find working dir %s" % workingdir)
-        else:
-            return {}
-
-    statement = "python %s -f -v 0 dump" % pipeline
-    process = subprocess.Popen(statement,
-                               cwd=workingdir,
-                               shell=True,
-                               stdin=subprocess.PIPE,
-                               stdout=subprocess.PIPE,
-                               stderr=subprocess.PIPE)
-
-    # process.stdin.close()
-    stdout, stderr = process.communicate()
-    if process.returncode != 0:
-        raise OSError(
-            ("Child was terminated by signal %i: \n"
-             "Statement: %s\n"
-             "The stderr was: \n%s\n"
-             "Stdout: %s") %
-            (-process.returncode, statement, stderr, stdout))
-
-    # subprocess only accepts an encoding argument in py >= 3.6, so
-    # decode here.
-    stdout = stdout.decode("utf-8").splitlines()
-    # remove any log messages
-    stdout = [x for x in stdout if x.startswith("{")]
-    if len(stdout) > 1:
-        raise ValueError("received multiple configurations")
-    dump = json.loads(stdout[0])
-
-    # update interface
-    if update_interface:
-        for key, value in list(dump.items()):
-            if key.startswith("interface"):
-                dump[key] = os.path.join(workingdir, value)
-
-    # keep only interface if so required
-    if restrict_interface:
-        dump = dict([(k, v) for k, v in dump.items()
-                     if k.startswith("interface")])
-
-    # prefix all parameters
-    if prefix is not None:
-        dump = dict([("%s%s" % (prefix, x), y) for x, y in list(dump.items())])
-
-    return dump
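
The dump handshake used by peekParameters() reduces to: run the external pipeline with the `dump` action and keep the single JSON line it prints. A condensed sketch (Python 3.7+ for subprocess.run with capture_output; the pipeline name is hypothetical):

    import json
    import subprocess

    def peek(pipeline, workingdir="."):
        '''Run `python <pipeline> dump` and return its configuration dict.'''
        out = subprocess.run(
            ["python", pipeline, "-f", "-v", "0", "dump"],
            cwd=workingdir, capture_output=True, text=True,
            check=True).stdout
        # the dump action prints one JSON object; skip stray log lines
        lines = [x for x in out.splitlines() if x.startswith("{")]
        return json.loads(lines[0])

    # e.g. peek("pipeline_example.py", "/path/to/run")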
-
-
-class MultiLineFormatter(logging.Formatter):
-    """add indentation for multi-line entries.
-    """
-
-    def format(self, record):
-        s = logging.Formatter.format(self, record)
-        if record.message:
-            header, footer = s.split(record.message)
-            s = s.replace('\n', '\n' + ' ' * len(header))
-        return s
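
MultiLineFormatter pads continuation lines so that multi-line records stay aligned under their header. A minimal usage sketch of the formatter above (the logger name is arbitrary):

    import logging

    handler = logging.StreamHandler()
    handler.setFormatter(MultiLineFormatter(
        '%(asctime)s %(levelname)s %(message)s'))
    log = logging.getLogger("pipeline_example")
    log.addHandler(handler)
    log.warning("first line\nsecond line lines up under the first")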
-
-
-class LoggingFilterRabbitMQ(logging.Filter):
-    """pass event information to a rabbitMQ message queue.
-
-    This is a log filter which detects messages from ruffus_ and sends
-    them to a rabbitMQ message queue.
-
-    A :term:`task` is a ruffus_ decorated function, which will execute
-    one or more :term:`jobs`.
-
-    Valid task/job status:
-
-    update
-       task/job needs updating
-    completed
-       task/job completed successfully
-    failed
-       task/job failed
-    running
-       task/job is running
-    ignore
-       ignore task/job (is up-to-date)
-
-    Arguments
-    ---------
-    ruffus_text : string
-        Log messages from ruffus.pipeline_printout. These are used
-        to collect all tasks that will be executed during pipeline
-        execution.
-    project_name : string
-        Name of the project
-    pipeline_name : string
-        Name of the pipeline
-    host : string
-        RabbitMQ host name
-    exchange : string
-        RabbitMQ exchange name
-
-    """
-
-    def __init__(self, ruffus_text,
-                 project_name,
-                 pipeline_name,
-                 host="localhost",
-                 exchange="ruffus_pipelines"):
-
-        self.project_name = project_name
-        self.pipeline_name = pipeline_name
-        self.exchange = exchange
-
-        # dictionary of jobs to run
-        self.jobs = {}
-        self.tasks = {}
-
-        if not HAS_PIKA:
-            self.connected = False
-            return
-
-        def split_by_job(text):
-            text = "".join(text)
-            job_message = ""
-            # ignore the first entry, which is the docstring
-            for line in text.split(" Job = ")[1:]:
-                try:
-                    # long file names cause additional wrapping and
-                    # additional white-space characters
-                    job_name = re.search(
-                        r"\[.*-> ([^\]]+)\]", line).groups()
-                except AttributeError:
-                    raise AttributeError("could not parse '%s'" % line)
-                job_status = "ignore"
-                if "Job needs update" in line:
-                    job_status = "update"
-
-                yield job_name, job_status, job_message
-
-        def split_by_task(text):
-            block, task_name = [], None
-            task_status = None
-            for line in text.split("\n"):
-                line = line.strip()
-
-                if line.startswith("Tasks which will be run"):
-                    task_status = "update"
-                elif line.startswith("Tasks which are up-to-date"):
-                    task_status = "ignore"
-
-                if line.startswith("Task = "):
-                    if task_name:
-                        yield task_name, task_status, list(split_by_job(block))
-                    block = []
-                    task_name = re.match("Task = (.*)", line).groups()[0]
-                    continue
-                if line:
-                    block.append(line)
-            if task_name:
-                yield task_name, task_status, list(split_by_job(block))
-
-        # create connection
-        try:
-            connection = pika.BlockingConnection(pika.ConnectionParameters(
-                host=host))
-            self.connected = True
-        except pika.exceptions.AMQPConnectionError:
-            self.connected = False
-            return
-
-        self.channel = connection.channel()
-        self.channel.exchange_declare(
-            exchange=self.exchange,
-            type='topic')
-
-        # populate with initial messages
-        for task_name, task_status, jobs in split_by_task(ruffus_text):
-            if task_name.startswith("(mkdir"):
-                continue
-
-            to_run = 0
-            for job_name, job_status, job_message in jobs:
-                self.jobs[job_name] = (task_name, job_name)
-                if job_status == "update":
-                    to_run += 1
-
-            self.tasks[task_name] = [task_status, len(jobs),
-                                     len(jobs) - to_run]
-            self.send_task(task_name)
-
-    def send_task(self, task_name):
-        '''send task status.'''
-
-        if not self.connected:
-            return
-
-        task_status, task_total, task_completed = self.tasks[task_name]
-
-        data = {}
-        data['created_at'] = time.time()
-        data['pipeline'] = self.pipeline_name
-        data['task_name'] = task_name
-        data['task_status'] = task_status
-        data['task_total'] = task_total
-        data['task_completed'] = task_completed
-
-        key = "%s.%s.%s" % (self.project_name, self.pipeline_name, task_name)
-        try:
-            self.channel.basic_publish(exchange=self.exchange,
-                                       routing_key=key,
-                                       body=json.dumps(data))
-        except pika.exceptions.ConnectionClosed:
-            E.warn("could not send message - connection closed")
-        except Exception as e:
-            E.warn("could not send message: %s" % str(e))
-
-    def send_error(self, task_name, job, error=None, msg=None):
-
-        if not self.connected:
-            return
-
-        try:
-            task_status, task_total, task_completed = self.tasks[task_name]
-        except KeyError:
-            E.warn("could not get task information for %s, no message sent" %
-                   task_name)
-            return
-
-        data = {}
-        data['created_at'] = time.time()
-        data['pipeline'] = self.pipeline_name
-        data['task_name'] = task_name
-        data['task_status'] = 'failed'
-        data['task_total'] = task_total
-        data['task_completed'] = task_completed
-
-        key = "%s.%s.%s" % (self.project_name, self.pipeline_name, task_name)
-
-        try:
-            self.channel.basic_publish(exchange=self.exchange,
-                                       routing_key=key,
-                                       body=json.dumps(data))
-        except pika.exceptions.ConnectionClosed:
-            E.warn("could not send message - connection closed")
-        except Exception as e:
-            E.warn("could not send message: %s" % str(e))
-
-    def filter(self, record):
-
-        if not self.connected:
-            return True
-
-        # filter ruffus logging messages
-        if record.filename.endswith("task.py"):
-            try:
-                before, task_name = record.msg.strip().split(" = ")
-            except ValueError:
-                return True
-
-            # ignore the mkdir, etc. tasks
-            if task_name not in self.tasks:
-                return True
-
-            if before == "Task enters queue":
-                self.tasks[task_name][0] = "running"
-            elif before == "Completed Task":
-                self.tasks[task_name][0] = "completed"
-            elif before == "Uptodate Task":
-                self.tasks[task_name][0] = "uptodate"
-            else:
-                return True
-
-            # send the new task status out
-            self.send_task(task_name)
-
-        return True
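
The publish path used by send_task() above is plain pika. A stripped-down sketch, assuming the older pika API implied by this code (where exchange_declare still takes `type`; it was renamed `exchange_type` in pika 1.0):

    import json
    import time
    import pika  # third-party; only used when HAS_PIKA is True

    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host="localhost"))
    channel = connection.channel()
    channel.exchange_declare(exchange="ruffus_pipelines", type="topic")
    channel.basic_publish(
        exchange="ruffus_pipelines",
        routing_key="project.pipeline_example.full",  # hypothetical names
        body=json.dumps({"created_at": time.time(),
                         "task_name": "full",
                         "task_status": "running"}))
    connection.close()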
-
-
-USAGE = '''
-usage: %prog [OPTIONS] [CMD] [target]
-
-Execute pipeline %prog.
-
-Commands can be any of the following
-
-make
-   run all tasks required to build *target*
-
-show
-   show tasks required to build *target* without executing them
-
-plot
-   plot image (using inkscape) of pipeline state for *target*
-
-debug [args]
-   debug a method using the supplied arguments. The method
-   in the pipeline is run without checking any dependencies.
-
-config
-   write new configuration files pipeline.ini, sphinxreport.ini and conf.py
-   with default values
-
-dump
-   write pipeline configuration to stdout
-
-printconfig
-   write pipeline configuration to stdout in a user-friendly way so
-   it is easier to debug pipeline parameters
-
-touch
-   touch files only, do not run
-
-regenerate
-   regenerate the ruffus checkpoint file
-
-check
-   check if requirements (external tool dependencies) are satisfied.
-
-clone
-   create a clone of a pipeline in the current
-   directory. The cloning process aims to use soft linking to files
-   (not directories) as much as possible. Time stamps are
-   preserved. Cloning is useful if a pipeline needs to be re-run from
-   a certain point but the original pipeline should be preserved.
-
-'''
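
For orientation, a typical session against a hypothetical pipeline_example.py would be:

    python pipeline_example.py config        # write default pipeline.ini / conf.py
    python pipeline_example.py show full     # preview tasks for target 'full'
    python pipeline_example.py make full     # execute them
    python pipeline_example.py printconfig   # inspect the resolved parameters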
type="string", - help="RabbitMQ exchange to send log messages to " - "[default=%default].") - - parser.add_option("--rabbitmq-host", dest="rabbitmq_host", - type="string", - help="RabbitMQ host to send log messages to " - "[default=%default].") - - parser.add_option("--input-validation", dest="input_validation", - action="store_true", - help="perform input validation before starting " - "[default=%default].") - - parser.set_defaults( - pipeline_action=None, - pipeline_format="svg", - pipeline_targets=[], - multiprocess=40, - logfile="pipeline.log", - dry_run=False, - force=False, - log_exceptions=False, - exceptions_terminate_immediately=False, - debug=False, - variables_to_set=[], - is_test=False, - ruffus_checksums_level=0, - rabbitmq_host="saruman", - rabbitmq_exchange="ruffus_pipelines", - input_validation=False) - - (options, args) = E.Start(parser, - add_cluster_options=True) - - GLOBAL_OPTIONS, GLOBAL_ARGS = options, args - E.info("Started in: %s" % PARAMS.get("workingdir")) - # At this point, the PARAMS dictionary has already been - # built. It now needs to be updated with selected command - # line options as these should always take precedence over - # configuration files. - - PARAMS["dryrun"] = options.dry_run - PARAMS["input_validation"] = options.input_validation - - # use cli_cluster_* keys in PARAMS to ensure highest priority - # of cluster_* options passed with the command-line - if options.cluster_memory_default is not None: - PARAMS["cli_cluster_memory_default"] = options.cluster_memory_default - PARAMS["cluster_memory_default"] = options.cluster_memory_default - if options.cluster_memory_resource is not None: - PARAMS["cli_cluster_memory_resource"] = options.cluster_memory_resource - PARAMS["cluster_memory_resource"] = options.cluster_memory_resource - if options.cluster_num_jobs is not None: - PARAMS["cli_cluster_num_jobs"] = options.cluster_num_jobs - PARAMS["cluster_num_jobs"] = options.cluster_num_jobs - if options.cluster_options is not None: - PARAMS["cli_cluster_options"] = options.cluster_options - PARAMS["cluster_options"] = options.cluster_options - if options.cluster_parallel_environment is not None: - PARAMS["cli_cluster_parallel_environment"] = options.cluster_parallel_environment - PARAMS["cluster_parallel_environment"] = options.cluster_parallel_environment - if options.cluster_priority is not None: - PARAMS["cli_cluster_priority"] = options.cluster_priority - PARAMS["cluster_priority"] = options.cluster_priority - if options.cluster_queue is not None: - PARAMS["cli_cluster_queue"] = options.cluster_queue - PARAMS["cluster_queue"] = options.cluster_queue - if options.cluster_queue_manager is not None: - PARAMS["cli_cluster_queue_manager"] = options.cluster_queue_manager - PARAMS["cluster_queue_manager"] = options.cluster_queue_manager - - PARAMS["ruffus_checksums_level"] = options.ruffus_checksums_level - - for variables in options.variables_to_set: - variable, value = variables.split("=") - PARAMS[variable.strip()] = IOTools.str2val(value.strip()) - - if args: - options.pipeline_action = args[0] - if len(args) > 1: - options.pipeline_targets.extend(args[1:]) - - # see inputValidation function in Parameters.py - if options.input_validation: - inputValidation(PARAMS, sys.argv[0]) - - if options.pipeline_action == "check": - counter, requirements = Requirements.checkRequirementsFromAllModules() - for requirement in requirements: - E.info("\t".join(map(str, requirement))) - E.info("version check summary: %s" % str(counter)) - E.Stop() - return - - elif 
-    elif options.pipeline_action == "debug":
-        # create the session proxy
-        startSession()
-
-        method_name = options.pipeline_targets[0]
-        caller = getCaller()
-        method = getattr(caller, method_name)
-        method(*options.pipeline_targets[1:])
-
-    elif options.pipeline_action in ("make", "show", "svg", "plot",
-                                     "touch", "regenerate"):
-
-        # set up extra file logger
-        handler = logging.FileHandler(filename=options.logfile,
-                                      mode="a")
-        handler.setFormatter(
-            MultiLineFormatter(
-                '%(asctime)s %(levelname)s %(module)s.%(funcName)s.%(lineno)d %(message)s'))
-        logger = logging.getLogger()
-        logger.addHandler(handler)
-        messenger = None
-
-        try:
-            if options.pipeline_action == "make":
-
-                # get tasks to be done. This essentially replicates
-                # the state information within ruffus.
-                stream = io.StringIO()
-                pipeline_printout(
-                    stream,
-                    options.pipeline_targets,
-                    verbose=5,
-                    checksum_level=options.ruffus_checksums_level)
-
-                messenger = LoggingFilterRabbitMQ(
-                    stream.getvalue(),
-                    project_name=getProjectName(),
-                    pipeline_name=getPipelineName(),
-                    host=options.rabbitmq_host,
-                    exchange=options.rabbitmq_exchange)
-
-                logger.addFilter(messenger)
-
-                if not options.without_cluster and HAS_DRMAA:
-                    global task
-                    # use threading instead of multiprocessing in order to
-                    # limit the number of concurrent jobs by using the
-                    # GIL
-                    #
-                    # Note that threading might cause problems with rpy.
-                    task.Pool = ThreadPool
-
-                    # create the session proxy
-                    startSession()
-
-                #
-                # make sure we are not logging at the same time in
-                # different processes
-                #
-                # session_mutex = manager.Lock()
-                E.info(E.GetHeader())
-                E.info("code location: %s" % PARAMS["pipeline_scriptsdir"])
-                E.info("Working directory is: %s" % PARAMS["workingdir"])
-
-                pipeline_run(
-                    options.pipeline_targets,
-                    multiprocess=options.multiprocess,
-                    logger=logger,
-                    verbose=options.loglevel,
-                    log_exceptions=options.log_exceptions,
-                    exceptions_terminate_immediately=options.exceptions_terminate_immediately,
-                    checksum_level=options.ruffus_checksums_level,
-                )
-
-                E.info(E.GetFooter())
-
-                closeSession()
-
-            elif options.pipeline_action == "show":
-                pipeline_printout(
-                    options.stdout,
-                    options.pipeline_targets,
-                    verbose=options.loglevel,
-                    checksum_level=options.ruffus_checksums_level)
-
-            elif options.pipeline_action == "touch":
-                pipeline_run(
-                    options.pipeline_targets,
-                    touch_files_only=True,
-                    verbose=options.loglevel,
-                    checksum_level=options.ruffus_checksums_level)
-
-            elif options.pipeline_action == "regenerate":
-                pipeline_run(
-                    options.pipeline_targets,
-                    touch_files_only=options.ruffus_checksums_level,
-                    verbose=options.loglevel)
-
-            elif options.pipeline_action == "svg":
-                pipeline_printout_graph(
-                    options.stdout.buffer,
-                    options.pipeline_format,
-                    options.pipeline_targets,
-                    checksum_level=options.ruffus_checksums_level)
-
-            elif options.pipeline_action == "plot":
-                outf, filename = tempfile.mkstemp()
-                pipeline_printout_graph(
-                    os.fdopen(outf, "wb"),
-                    options.pipeline_format,
-                    options.pipeline_targets,
-                    checksum_level=options.ruffus_checksums_level)
-                execute("inkscape %s" % filename)
-                os.unlink(filename)
-
-        except ruffus_exceptions.RethrownJobError as value:
-
-            if not options.debug:
-                E.error("%i tasks with errors, please see summary below:" %
-                        len(value.args))
-                for idx, e in enumerate(value.args):
-                    task, job, error, msg, traceback = e
-
-                    if task is None:
-                        # this seems to be errors originating within ruffus
-                        # such as a missing dependency
-                        # msg then contains a RethrownJobError
-                        msg = str(msg)
-                        pass
-                    else:
-                        task = re.sub("__main__.", "", task)
-                        job = re.sub(r"\s", "", job)
-
-                    if messenger:
-                        messenger.send_error(task, job, error, msg)
-
-                    # display only single line messages
-                    if len([x for x in msg.split("\n") if x != ""]) > 1:
-                        msg = ""
-
-                    E.error("%i: Task=%s Error=%s %s: %s" %
-                            (idx, task, error, job, msg))
-
-                E.error("full traceback is in %s" % options.logfile)
-
-                # write the full traceback to the log file only, by removing
-                # the stdout handler
-                lhStdout = logger.handlers[0]
-                logger.removeHandler(lhStdout)
-                logger.error("start of error messages")
-                logger.error(value)
-                logger.error("end of error messages")
-                logger.addHandler(lhStdout)
-
-                # raise error
-                raise ValueError(
-                    "pipeline failed with %i errors" % len(value.args))
-            else:
-                raise
-
-    elif options.pipeline_action == "dump":
-        print(json.dumps(PARAMS))
-
-    elif options.pipeline_action == "printconfig":
-        print("Printing out pipeline parameters: ")
-        for k in sorted(PARAMS):
-            print(k, "=", PARAMS[k])
-        printConfigFiles()
-
-    elif options.pipeline_action == "config":
-        # (Antonio) I've modified this section, see the explanation and
-        # changes in the writeConfigFiles function above.
-        config_paths = []
-        try:
-            f = sys._getframe(1)
-            caller = inspect.getargvalues(f).locals["__file__"]
-            # Make it easier to match the name of the command executed so
-            # that the config file can be searched for in case more than one
-            # ini file is found in writeConfigFiles():
-            # Making it global, check if there is a better way:
-            global caller_name
-            caller_name = os.path.basename(os.path.normpath(caller))
-        except KeyError:
-            # The following only works if something like this function is
-            # present in the my_pipeline.py script:
-            # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location
-            try:
-                f = sys._getframe(2)
-                caller = inspect.getargvalues(f).locals["__file__"]
-                cmd_caller = os.path.basename(os.path.normpath(caller))
-                # As above, save the command called in a separate variable:
-                caller_name = cmd_caller
-                cmd_caller = importlib.import_module(cmd_caller)
-                caller = cmd_caller.getDir()
-            except (KeyError, ImportError, AttributeError):
-                print('''Unable to find the path to the file being executed,
-                      probably because CGATPipelines and the pipeline being
-                      executed cannot figure out where each other lives.
-                      Raise an issue on GitHub if possible.
-                      Exiting.''')
-                sys.exit()
-
-        # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified
-        # version would only have pipe_XX/,
-        # so create an additional pipeline_path.
-        # TO DO: clean this up
-        pipeline_path = os.path.splitext(caller)[0]
-        pipeline_path_2 = os.path.dirname(pipeline_path)
-        # CGATPipelines have a "configuration" folder;
-        # adding a glob to have a bit more flexibility:
-        general_path = glob.glob(str(os.path.abspath(pipeline_path_2) +
-                                 '/**/configuration*'), recursive=True)
-
-        if not general_path:
-            general_path = [os.path.join(os.path.dirname(pipeline_path),
-                                         "configuration")]
-
-        config_paths.extend([pipeline_path, pipeline_path_2])
-        # Extend separately in case general_path returns more than one path:
-        config_paths.extend(general_path)
-        writeConfigFiles(config_paths)
-
-    elif options.pipeline_action == "clone":
-        clonePipeline(options.pipeline_targets[0])
-
-    else:
-        raise ValueError("unknown pipeline action %s" %
-                         options.pipeline_action)
-
-    E.Stop()
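
The recursive configuration lookup in the config section above relies only on glob. A compact sketch of the search order (the pipe_XX layout is the assumption stated in the comments, not a guarantee):

    import glob
    import os

    def find_configuration_dirs(pipeline_path):
        '''Return configuration* directories below the pipeline source tree,
        falling back to the conventional ./configuration sibling.'''
        base = os.path.abspath(os.path.dirname(pipeline_path))
        hits = glob.glob(os.path.join(base, '**', 'configuration*'),
                         recursive=True)
        return hits or [os.path.join(base, 'configuration')]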
if options["cluster_options"]: - if "mem" not in options["cluster_options"]: - spec.append("%(cluster_options)s") - elif "mem" in options["cluster_options"]: + conds = ('mem' in options["cluster_options"], + 'ncpus' in options["cluster_options"], + 'select' in options["cluster_options"] + ) + if any(conds): spec = ["-N %s" % job_name[0:15]] spec.append("%(cluster_options)s") - - # if process has multiple threads, use a parallel environment: - # TO DO: error in fastqc build_report, var referenced before assignment. - # For now adding to workaround: - if 'job_threads' in options: - job_threads = options["job_threads"] - else: - job_threads = 1 - - multithread = 'job_threads' in options and options['job_threads'] > 1 - if multithread: - # TO DO 'select=1' determines de number of nodes. Should go in a config file. - # mem is per node and maximum memory - # Site dependent but in general setting '#PBS -l select=NN:ncpus=NN:mem=NN{gb|mb}' - # is sufficient for parallel jobs (OpenMP, MPI). - # Also architecture dependent, jobs could be hanging if resource doesn't exist. - # TO DO: Kill if long waiting time? - spec = ["-N %s" % job_name[0:15], - "-l select=1:ncpus=%s:mem=%s" % (job_threads, job_memory)] - - if options["cluster_options"]: - if "mem" not in options["cluster_options"]: - spec.append("%(cluster_options)s") - - elif "mem" in options["cluster_options"]: - raise ValueError('''mem resource specified twice, check ~/.cgat config file, - ini files, command line options, etc. - ''') + else: + spec.append("%(cluster_options)s") if "cluster_pe_queue" in options and multithread: - spec.append( - "-q %(cluster_pe_queue)s") + spec.append("-q %(cluster_pe_queue)s") elif options['cluster_queue'] != "NONE": spec.append("-q %(cluster_queue)s") # TO DO: sort out in Parameters.py to allow none values for configparser: diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index 101d3d54..4be0abf5 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -147,27 +147,24 @@ def writeConfigFiles(config_paths, caller_name): # Look for ini file: f_count = 0 - INI_list = [] for path in config_paths: if os.path.exists(path) and os.path.isdir(path): for f in os.listdir(os.path.abspath(path)): if fnmatch.fnmatch(f, 'pipeline*ini'): f_count += 1 - INI_file = f - INI_list.append(INI_file) + config_files.append(f) if f_count == 1: - config_files = INI_list # This is for the pipeline only + pass elif f_count > 1: new_count = 0 # Prioritise the file that contains the command called if more than one # ini files are found: - for f in INI_list: + for f in config_files: if caller_name in f: new_count += 1 - INI_file = f - config_files.append(INI_file) + config_files.append(f) if new_count > 1: E.warn(''' Found several ini files but could not prioritise based on: @@ -182,6 +179,7 @@ def writeConfigFiles(config_paths, caller_name): elif new_count == 1: pass elif new_count == 0: + pass print(''' More than one ini file found but none matched {} From 0e760fef9b1933e6c36993d156d5f40fbb31dc46 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Mon, 29 Jan 2018 11:53:05 +0000 Subject: [PATCH 20/21] reverting Control.py --- CGATPipelines/Pipeline/Control.py | 274 +++--------------------------- 1 file changed, 25 insertions(+), 249 deletions(-) diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index 4be0abf5..fabb82a4 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -15,6 +15,7 @@ --------- """ + import 
From 0e760fef9b1933e6c36993d156d5f40fbb31dc46 Mon Sep 17 00:00:00 2001
From: AntonioJBT
Date: Mon, 29 Jan 2018 11:53:05 +0000
Subject: [PATCH 20/21] reverting Control.py

---
 CGATPipelines/Pipeline/Control.py | 274 +++-----------------------------
 1 file changed, 25 insertions(+), 249 deletions(-)

diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py
index 4be0abf5..fabb82a4 100644
--- a/CGATPipelines/Pipeline/Control.py
+++ b/CGATPipelines/Pipeline/Control.py
@@ -15,6 +15,7 @@
 ---------
 """
+
 import inspect
 import json
 import logging
@@ -26,9 +27,6 @@
 import tempfile
 import time
 import io
-import glob
-import fnmatch
-import importlib
 
 from multiprocessing.pool import ThreadPool
 
@@ -70,255 +68,29 @@
 GLOBAL_OPTIONS, GLOBAL_ARGS = None, None
 
 
-def getConfigPaths():
-    '''
-    Search the current and installation paths where the configuration files
-    live for the pipeline being called.
-    '''
-    # (Antonio) I've modified this section, see the explanation and changes
-    # in the writeConfigFiles function above.
-    config_paths = []
-    # Get the name of the pipeline being called.
-    # This could be:
-    # cgatflow readqc config
-    # pipeline_QTL config
-    # python /YYYY//XXXX/pipeline_XXXX.py config
-    try:
-        f = sys._getframe(1)
-        # caller = f.f_globals["__file__"]  # cgatflow config
-        # globals will get Control.py
-        caller = f.f_locals["__file__"]  # TO DO: cgatflow
-        # Make it easier to match the name of the command executed so that
-        # the config file can be searched for in case more than one
-        # ini file is found in writeConfig():
-        caller_name = os.path.basename(os.path.normpath(caller))
-    except KeyError:
-        f = sys._getframe(2)  # if e.g. the call is direct, as for pipeline_QTL config
-        caller = f.f_globals["__file__"]
-        caller_name = os.path.basename(os.path.normpath(caller))
-    if caller_name.endswith('.py'):
-        caller_name = caller_name.replace('.py', '')
-    # else:
-    #     print('''Unable to find path to file being executed. Probably because
-    #           CGATPipelines and the pipeline that is being executed
-    #           cannot figure out where each other lives. Raise an issue in
-    #           GitHub if possible. Exiting.''')
-    #     sys.exit()
-    # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified
-    # version would only have pipe_XX/
-    pipeline_path = os.path.splitext(caller)[0]
-    pipeline_path_2 = os.path.dirname(pipeline_path)
-    # CGATPipelines have a "configuration" folder;
-    # adding a glob to have a bit more flexibility.
-    # TO DO: add max depth to glob recursion:
-    general_path = glob.glob(str(os.path.abspath(pipeline_path_2) +
-                             '/**/configuration*'), recursive=True)
-
-    if not general_path:
-        general_path = [os.path.join(os.path.dirname(pipeline_path),
-                                     "configuration")]
-
-    # Add paths to the search list:
-    config_paths.extend([pipeline_path, pipeline_path_2])
-    # Extend separately in case general_path returns more than one path:
-    config_paths.extend(general_path)
-    return(config_paths, caller_name)
-
-def writeConfigFiles(config_paths, caller_name):
+def writeConfigFiles(pipeline_path, general_path):
     '''create default configuration files in `path`.
     '''
-    # TO DO: I've modified this function with workarounds to make it more
-    # flexible in order to find an ini file, find a configuration dir and
-    # copy pre-run sphinx-quickstart files if they exist.
-    # Other than creating a 'report' dir, it should not change the way it is
-    # run from CGATPipelines.
-    # See also getConfigPaths() above; these run when calling the 'config'
-    # option.
-    # Antonio
-    report_dir = 'pipeline_report'
-    config_files = []
-
-    try:
-        os.mkdir(report_dir)  # Sphinx config files will be copied here;
-                              # CGATReport only needs its conf.py to
-                              # generate the rest though
-    except FileExistsError:
-        E.warn("directory `%s` already exists" % report_dir)
-        raise
-
-    # Look for ini file:
-    f_count = 0
-    for path in config_paths:
-        if os.path.exists(path) and os.path.isdir(path):
-            for f in os.listdir(os.path.abspath(path)):
-                if fnmatch.fnmatch(f, 'pipeline*ini'):
-                    f_count += 1
-                    config_files.append(f)
-
-    if f_count == 1:
-        pass
-
-    elif f_count > 1:
-        new_count = 0
-        # Prioritise the file that matches the command called if more than
-        # one ini file is found:
-        for f in config_files:
-            if caller_name in f:
-                new_count += 1
-                config_files.append(f)
-        if new_count > 1:
-            E.warn('''
-                   Found several ini files but could not prioritise based on:
-                   {}
-                   as more than one matched.
-                   Using the first one found:
-                   {}
-                   from
-                   {}
-                   '''.format(caller_name, config_files[0], config_files)
-                   )
-        elif new_count == 1:
-            pass
-        elif new_count == 0:
-            pass
-            print('''
-                  More than one ini file found but none matched
-                  {}
-                  Using the first one found:
-                  {}
-                  from
-                  {}
-                  '''.format(caller_name, config_files[0], config_files)
-                  )
-
-    elif f_count == 0:
-        E.warn('''
-               No configuration (ini) files found in:
-               {}
-               '''.format(config_paths)
-               )
-
-    # Copy pipeline ini file:
-    if not config_files:
-        E.warn('No configuration files found.')
-    else:
-        for dest in config_files:
-            dest = str(dest)
-            if os.path.exists(dest):
-                E.warn("file `%s` already exists - skipped" % dest)
-                continue
-            for path in config_paths:
-                src = os.path.join(path, dest)
-                if os.path.exists(src):
-                    shutil.copyfile(src, dest)
-                    E.info("created new configuration file `%s` " % dest)
-                    break
-            else:
-                raise ValueError('''default config file for `%s`
-                                 not found in
-                                 %s
-                                 A pipeline cannot be run without this.
-                                 ''' % (config_files, config_paths))
-
-    # Copy Sphinx configuration files, enforce copy of 'conf.py' in case
-    # CGATReport is used:
-    dest = 'conf.py'
-    if os.path.exists(dest):
-        E.warn("file `%s` already exists - skipped" % dest)
-
-    for path in config_paths:
-        src = os.path.join(path, dest)
-        if os.path.exists(src):
-            # Put sphinx files in separate dir:
-            shutil.copyfile(src, os.path.join(report_dir, dest))
-            # Create a softlink outside of report_dir dir for CGATReport:
-            os.symlink(os.path.join(report_dir, dest), str(dest))
-            E.info("created new configuration file `%s` " % dest)
-            break
+    paths = [pipeline_path, general_path]
 
-    else:
-        # Only warn as pipeline can be run without report:
-        E.warn('''default config file for `%s` not found in
-               %s
-               CGATReport nor Sphinx can be run without this''' % (dest,
-               config_paths))
-
-    # If other Sphinx config files are found, copy them if there is a skeleton
-    # pipeline report to use:
-    E.info('Looking for additional Sphinx configuration files.')
-    sphinx_config_files = ['Makefile',
-                           'make.bat',
-                           '*.rst',
-                           '*.bib',
-                           ]  # These are for a sphinx setup, not needed
-                              # with CGATReport
-                              # A 'report_pipeline_*.rst' template is
-                              # searched for below
-
-    # Look for a pipeline report file:
-    f_count = 0
-    for path in config_paths:
-        if os.path.exists(path):
-            for f in os.listdir(os.path.abspath(path)):
-                # TO DO:
-                # This pattern matching is particular to
-                # https://github.com/AntonioJBT/project_quickstart
-                # Needs to be made more generic
-                if fnmatch.fnmatch(f, 'report_pipeline_*.rst'):
-                    f_count += 1
-                    pipeline_report_file = f
-
-    if f_count == 1:
-        sphinx_config_files.append(pipeline_report_file)
+    config_files = ['pipeline.ini', 'conf.py']
 
-    else:
-        # Only warn as pipeline can be run without report:
-        E.warn('''There is no pipeline report file matching
-               report_pipeline_*.rst
-               in the directories:
-               {}
-               Ignore this if you are using CGATReport.
-               '''.format(config_paths)
-               )
-
-    # Copy the files across if they are found:
-    f_count = 0
-    # Check all the paths and their files given above when searching for config files:
-    for path in config_paths:
-        if os.path.exists(path):
-            for f in os.listdir(path):
-                # For each file or search term given, match to an existing file:
-                for dest in sphinx_config_files:
-                    if fnmatch.fnmatch(f, dest):
-                        f_to_copy = f
-                        # If a match is found, walk the cwd to check it's not
-                        # already present:
-                        for root, dirs, files in os.walk('.'):
-                            if f_to_copy in files:
-                                E.warn("file `%s` already exists - skipped" % f_to_copy)
-                                continue
-
-                            # If not present, copy the file:
-                            else:
-                                f_count += 1
-                                src = os.path.join(path, f_to_copy)
-                                if os.path.exists(src):
-                                    # Put sphinx files in separate dir:
-                                    shutil.copyfile(src, os.path.join(report_dir,
-                                                                      f_to_copy)
-                                                    )
-                                    E.info("created new configuration file `%s` "
-                                           % f_to_copy)
-                                break
-    if f_count > 0:
-        pass
-    else:
-        E.warn('''No sphinx-quickstart skeleton files such as:
-               {}
-               were found
-               in
-               {}
-               Continuing without.'''.format(dest, config_paths))
+    for dest in config_files:
+        if os.path.exists(dest):
+            E.warn("file `%s` already exists - skipped" % dest)
+            continue
+
+        for path in paths:
+            src = os.path.join(path, dest)
+            if os.path.exists(src):
+                shutil.copyfile(src, dest)
+                E.info("created new configuration file `%s` " % dest)
+                break
+        else:
+            raise ValueError(
+                "default config file for `%s` not found in %s" %
+                (config_files, paths))
 
 
 def printConfigFiles():
     '''
@@ -1244,8 +1016,12 @@
         printConfigFiles()
 
     elif options.pipeline_action == "config":
-        config_paths, caller_name = getConfigPaths()
-        writeConfigFiles(config_paths, caller_name)
+        f = sys._getframe(1)
+        caller = f.f_globals["__file__"]
+        pipeline_path = os.path.splitext(caller)[0]
+        general_path = os.path.join(os.path.dirname(pipeline_path),
+                                    "configuration")
+        writeConfigFiles(pipeline_path, general_path)
 
     elif options.pipeline_action == "clone":
         clonePipeline(options.pipeline_targets[0])
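
Both reverts remove the frame introspection used to discover the calling pipeline. For reference, the trick in isolation (a sketch; how many frames to climb depends on how the pipeline is invoked):

    import inspect
    import os

    def caller_file(depth=1):
        '''Return the __file__ of the caller `depth` frames up, if present.'''
        frame = inspect.stack()[depth].frame
        return os.path.abspath(frame.f_globals.get("__file__", "<unknown>"))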
From 36581996e3624af9a88eb6a5293730b1576f16a8 Mon Sep 17 00:00:00 2001
From: Sebastian Luna Valero
Date: Mon, 29 Jan 2018 15:26:19 +0000
Subject: [PATCH 21/21] Revert Control.py to master version

---
 CGATPipelines/Pipeline/Control.py        | 274 +++----------------------
 CGATPipelines/configuration/pipeline.ini |   7 +-
 2 files changed, 29 insertions(+), 252 deletions(-)

diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py
index 4be0abf5..fabb82a4 100644
--- a/CGATPipelines/Pipeline/Control.py
+++ b/CGATPipelines/Pipeline/Control.py
@@ -15,6 +15,7 @@
 ---------
 """
+
 import inspect
 import json
 import logging
@@ -26,9 +27,6 @@
 import tempfile
 import time
 import io
-import glob
-import fnmatch
-import importlib
 
 from multiprocessing.pool import ThreadPool
 
@@ -70,255 +68,29 @@
 GLOBAL_OPTIONS, GLOBAL_ARGS = None, None
 
 
-def getConfigPaths():
-    '''
-    Search the current and installation paths where the configuration files
-    live for the pipeline being called.
-    '''
-    # (Antonio) I've modified this section, see the explanation and changes
-    # in the writeConfigFiles function above.
-    config_paths = []
-    # Get the name of the pipeline being called.
-    # This could be:
-    # cgatflow readqc config
-    # pipeline_QTL config
-    # python /YYYY//XXXX/pipeline_XXXX.py config
-    try:
-        f = sys._getframe(1)
-        # caller = f.f_globals["__file__"]  # cgatflow config
-        # globals will get Control.py
-        caller = f.f_locals["__file__"]  # TO DO: cgatflow
-        # Make it easier to match the name of the command executed so that
-        # the config file can be searched for in case more than one
-        # ini file is found in writeConfig():
-        caller_name = os.path.basename(os.path.normpath(caller))
-    except KeyError:
-        f = sys._getframe(2)  # if e.g. the call is direct, as for pipeline_QTL config
-        caller = f.f_globals["__file__"]
-        caller_name = os.path.basename(os.path.normpath(caller))
-    if caller_name.endswith('.py'):
-        caller_name = caller_name.replace('.py', '')
-    # else:
-    #     print('''Unable to find path to file being executed. Probably because
-    #           CGATPipelines and the pipeline that is being executed
-    #           cannot figure out where each other lives. Raise an issue in
-    #           GitHub if possible. Exiting.''')
-    #     sys.exit()
-    # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified
-    # version would only have pipe_XX/
-    pipeline_path = os.path.splitext(caller)[0]
-    pipeline_path_2 = os.path.dirname(pipeline_path)
-    # CGATPipelines have a "configuration" folder;
-    # adding a glob to have a bit more flexibility.
-    # TO DO: add max depth to glob recursion:
-    general_path = glob.glob(str(os.path.abspath(pipeline_path_2) +
-                             '/**/configuration*'), recursive=True)
-
-    if not general_path:
-        general_path = [os.path.join(os.path.dirname(pipeline_path),
-                                     "configuration")]
-
-    # Add paths to the search list:
-    config_paths.extend([pipeline_path, pipeline_path_2])
-    # Extend separately in case general_path returns more than one path:
-    config_paths.extend(general_path)
-    return(config_paths, caller_name)
-
-def writeConfigFiles(config_paths, caller_name):
+def writeConfigFiles(pipeline_path, general_path):
     '''create default configuration files in `path`.
     '''
-    # TO DO: I've modified this function with workarounds to make it more
-    # flexible in order to find an ini file, find a configuration dir and
-    # copy pre-run sphinx-quickstart files if they exist.
-    # Other than creating a 'report' dir, it should not change the way it is
-    # run from CGATPipelines.
-    # See also getConfigPaths() above; these run when calling the 'config'
-    # option.
-    # Antonio
-    report_dir = 'pipeline_report'
-    config_files = []
-
-    try:
-        os.mkdir(report_dir)  # Sphinx config files will be copied here;
-                              # CGATReport only needs its conf.py to
-                              # generate the rest though
-    except FileExistsError:
-        E.warn("directory `%s` already exists" % report_dir)
-        raise
-
-    # Look for ini file:
-    f_count = 0
-    for path in config_paths:
-        if os.path.exists(path) and os.path.isdir(path):
-            for f in os.listdir(os.path.abspath(path)):
-                if fnmatch.fnmatch(f, 'pipeline*ini'):
-                    f_count += 1
-                    config_files.append(f)
-
-    if f_count == 1:
-        pass
-
-    elif f_count > 1:
-        new_count = 0
-        # Prioritise the file that matches the command called if more than
-        # one ini file is found:
-        for f in config_files:
-            if caller_name in f:
-                new_count += 1
-                config_files.append(f)
-        if new_count > 1:
-            E.warn('''
-                   Found several ini files but could not prioritise based on:
-                   {}
-                   as more than one matched.
-                   Using the first one found:
-                   {}
-                   from
-                   {}
-                   '''.format(caller_name, config_files[0], config_files)
-                   )
-        elif new_count == 1:
-            pass
-        elif new_count == 0:
-            pass
-            print('''
-                  More than one ini file found but none matched
-                  {}
-                  Using the first one found:
-                  {}
-                  from
-                  {}
-                  '''.format(caller_name, config_files[0], config_files)
-                  )
-
-    elif f_count == 0:
-        E.warn('''
-               No configuration (ini) files found in:
-               {}
-               '''.format(config_paths)
-               )
-
-    # Copy pipeline ini file:
-    if not config_files:
-        E.warn('No configuration files found.')
-    else:
-        for dest in config_files:
-            dest = str(dest)
-            if os.path.exists(dest):
-                E.warn("file `%s` already exists - skipped" % dest)
-                continue
-            for path in config_paths:
-                src = os.path.join(path, dest)
-                if os.path.exists(src):
-                    shutil.copyfile(src, dest)
-                    E.info("created new configuration file `%s` " % dest)
-                    break
-            else:
-                raise ValueError('''default config file for `%s`
-                                 not found in
-                                 %s
-                                 A pipeline cannot be run without this.
-                                 ''' % (config_files, config_paths))
-
-    # Copy Sphinx configuration files, enforce copy of 'conf.py' in case
-    # CGATReport is used:
-    dest = 'conf.py'
-    if os.path.exists(dest):
-        E.warn("file `%s` already exists - skipped" % dest)
-
-    for path in config_paths:
-        src = os.path.join(path, dest)
-        if os.path.exists(src):
-            # Put sphinx files in separate dir:
-            shutil.copyfile(src, os.path.join(report_dir, dest))
-            # Create a softlink outside of report_dir dir for CGATReport:
-            os.symlink(os.path.join(report_dir, dest), str(dest))
-            E.info("created new configuration file `%s` " % dest)
-            break
+    paths = [pipeline_path, general_path]
 
-    else:
-        # Only warn as pipeline can be run without report:
-        E.warn('''default config file for `%s` not found in
-               %s
-               CGATReport nor Sphinx can be run without this''' % (dest,
-               config_paths))
-
-    # If other Sphinx config files are found, copy them if there is a skeleton
-    # pipeline report to use:
-    E.info('Looking for additional Sphinx configuration files.')
-    sphinx_config_files = ['Makefile',
-                           'make.bat',
-                           '*.rst',
-                           '*.bib',
-                           ]  # These are for a sphinx setup, not needed
-                              # with CGATReport
-                              # A 'report_pipeline_*.rst' template is
-                              # searched for below
-
-    # Look for a pipeline report file:
-    f_count = 0
-    for path in config_paths:
-        if os.path.exists(path):
-            for f in os.listdir(os.path.abspath(path)):
-                # TO DO:
-                # This pattern matching is particular to
-                # https://github.com/AntonioJBT/project_quickstart
-                # Needs to be made more generic
-                if fnmatch.fnmatch(f, 'report_pipeline_*.rst'):
-                    f_count += 1
-                    pipeline_report_file = f
-
-    if f_count == 1:
-        sphinx_config_files.append(pipeline_report_file)
+    config_files = ['pipeline.ini', 'conf.py']
 
-    else:
-        # Only warn as pipeline can be run without report:
-        E.warn('''There is no pipeline report file matching
-               report_pipeline_*.rst
-               in the directories:
-               {}
-               Ignore this if you are using CGATReport.
-               '''.format(config_paths)
-               )
-
-    # Copy the files across if they are found:
-    f_count = 0
-    # Check all the paths and their files given above when searching for config files:
-    for path in config_paths:
-        if os.path.exists(path):
-            for f in os.listdir(path):
-                # For each file or search term given, match to an existing file:
-                for dest in sphinx_config_files:
-                    if fnmatch.fnmatch(f, dest):
-                        f_to_copy = f
-                        # If a match is found, walk the cwd to check it's not
-                        # already present:
-                        for root, dirs, files in os.walk('.'):
-                            if f_to_copy in files:
-                                E.warn("file `%s` already exists - skipped" % f_to_copy)
-                                continue
-
-                            # If not present, copy the file:
-                            else:
-                                f_count += 1
-                                src = os.path.join(path, f_to_copy)
-                                if os.path.exists(src):
-                                    # Put sphinx files in separate dir:
-                                    shutil.copyfile(src, os.path.join(report_dir,
-                                                                      f_to_copy)
-                                                    )
-                                    E.info("created new configuration file `%s` "
-                                           % f_to_copy)
-                                break
-    if f_count > 0:
-        pass
-    else:
-        E.warn('''No sphinx-quickstart skeleton files such as:
-               {}
-               were found
-               in
-               {}
-               Continuing without.'''.format(dest, config_paths))
+    for dest in config_files:
+        if os.path.exists(dest):
+            E.warn("file `%s` already exists - skipped" % dest)
+            continue
+
+        for path in paths:
+            src = os.path.join(path, dest)
+            if os.path.exists(src):
+                shutil.copyfile(src, dest)
+                E.info("created new configuration file `%s` " % dest)
+                break
+        else:
+            raise ValueError(
+                "default config file for `%s` not found in %s" %
+                (config_files, paths))
 
 
 def printConfigFiles():
     '''
@@ -1244,8 +1016,12 @@
         printConfigFiles()
 
     elif options.pipeline_action == "config":
-        config_paths, caller_name = getConfigPaths()
-        writeConfigFiles(config_paths, caller_name)
+        f = sys._getframe(1)
+        caller = f.f_globals["__file__"]
+        pipeline_path = os.path.splitext(caller)[0]
+        general_path = os.path.join(os.path.dirname(pipeline_path),
+                                    "configuration")
+        writeConfigFiles(pipeline_path, general_path)
 
     elif options.pipeline_action == "clone":
         clonePipeline(options.pipeline_targets[0])
diff --git a/CGATPipelines/configuration/pipeline.ini b/CGATPipelines/configuration/pipeline.ini
index 07af17bf..798ebf13 100644
--- a/CGATPipelines/configuration/pipeline.ini
+++ b/CGATPipelines/configuration/pipeline.ini
@@ -6,7 +6,7 @@
 ########################################################
 ########################################################
 # The project name to appear in the report
-projectname=
+projectname=to-set
 
 # The copyright statement to appear in the report
 copyright=
@@ -37,7 +37,8 @@
 scratchdir=/tmp
 web_dir=../web
 
 # location of indexed genome
-genome_dir=/full/path/here
+#genome_dir=/ifs/mirror/genomes/plain
+genome_dir=to-set
 
 # The genome to use (UCSC convention)
 genome=hg19
@@ -75,8 +76,8 @@
 port=3306
 
 [cluster]
 # queue to use
+#queue=all.q
 queue=
-#all.q
 
 # priority of jobs on cluster
 priority=-10
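
The `to-set` placeholders introduced in pipeline.ini above are meant to be edited per site before a run. A small guard that fails fast when they were forgotten (a sketch; section and key names depend on the actual ini layout):

    import configparser

    config = configparser.ConfigParser()
    config.read('pipeline.ini')
    unset = [(section, key)
             for section in config.sections()
             for key, value in config.items(section) if value == 'to-set']
    if unset:
        raise ValueError("please edit pipeline.ini; unset keys: %s" % unset)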