From 062f8e5462568dc71c749dee5c1a9a6ad92d4f87 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Thu, 4 Jan 2018 18:49:38 +0000 Subject: [PATCH 01/21] made Control.py more flexible, moved report memory to ini file --- CGATPipelines/Pipeline/Control.py | 225 +++++++++++++++++++++-- CGATPipelines/Pipeline/__init__.py | 4 +- CGATPipelines/configuration/pipeline.ini | 2 + 3 files changed, 217 insertions(+), 14 deletions(-) diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index fabb82a4..e785de16 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -27,6 +27,9 @@ import tempfile import time import io +import glob +import fnmatch +import importlib from multiprocessing.pool import ThreadPool @@ -68,13 +71,64 @@ GLOBAL_OPTIONS, GLOBAL_ARGS = None, None -def writeConfigFiles(pipeline_path, general_path): +def writeConfigFiles(paths): + #pipeline_path, pipeline_path_2, general_path): '''create default configuration files in `path`. ''' - - paths = [pipeline_path, general_path] - config_files = ['pipeline.ini', 'conf.py'] - + # TO DO: I've modified this function with workarounds to make it more + # flexible in order to find an ini file, find a configuration dir and + # copy pre-run sphinx-quickstart files if they exist. + # Other than creating a 'report' dir, it should not change the way it is + # run from CGATPipelines. + # See also bottom of script for changes when calling the 'config' option + # Antonio + #paths = [pipeline_path, pipeline_path_2, general_path] + report_dir = 'pipeline_report' + try: + os.mkdir(report_dir) # Sphinx config files will be copied here + # CGATReport only needs its conf.py to generate the rest + # though + except FileExistsError: + E.warn("directory `%s` already exists" % report_dir) + raise + + # Look for ini file: + f_count = 0 + INI_list = [] + for path in paths: + if os.path.exists(path) and os.path.isdir(path): + for f in os.listdir(os.path.abspath(path)): + if fnmatch.fnmatch(f, 'pipeline*ini'): + f_count += 1 + INI_file = f + INI_list.extend([INI_file]) + + if f_count == 1: + config_files = [INI_file] # This is for the pipeline only + + elif f_count > 1: + # Prioritise the file that contains the command called if more than one + # ini file are found: + for f in INI_list: + if caller_name in f: + INI_file = f + config_files = [INI_file] + else: + if f_count == 0: + print(''' + No configuration (ini) files found in: + {} + '''.format(paths) + ) + else: + print(''' + Found several ini files but could not prioritise based on: + {} + Exiting. + '''.format(caller_name)) + sys.exit() + + # Copy pipeline ini file: for dest in config_files: if os.path.exists(dest): E.warn("file `%s` already exists - skipped" % dest) @@ -87,10 +141,113 @@ def writeConfigFiles(pipeline_path, general_path): E.info("created new configuration file `%s` " % dest) break else: - raise ValueError( - "default config file for `%s` not found in %s" % - (config_files, paths)) + raise ValueError('''default config file for `%s` + not found in + %s + A pipeline cannot be run without this. 
+ ''' % (config_files, paths)) + + # Copy Sphinx configuration files, enforce copy of 'conf.py' in case + # CGATReport is used: + dest = 'conf.py' + if os.path.exists(dest): + E.warn("file `%s` already exists - skipped" % dest) + + for path in paths: + src = os.path.join(path, dest) + if os.path.exists(src): + # Put sphinx files in separate dir: + shutil.copyfile(src, os.path.join(report_dir, dest)) + # Create a softlink outside of report_dir dir for CGATReport: + os.symlink(os.path.join(report_dir, dest), str(dest)) + E.info("created new configuration file `%s` " % dest) + break + else: + # Only warn as pipeline can be run without report: + E.warn('''default config file for `%s` not found in + %s + CGATReport nor Sphinx can be run without this''' % (dest, paths)) + + # If other Sphinx config files are found, copy them if there is a skeleton + # pipeline report to use: + E.info('Looking for additional Sphinx configuration files.') + sphinx_config_files = ['Makefile', + 'make.bat', + '*.rst', + '*.bib', + ] # These are for a sphinx setup, not needed + # with CGATReport + # A 'report_pipeline_*.rst' template is + # searched for below + + # Look for a pipeline report file: + f_count = 0 + for path in paths: + if os.path.exists(path): + for f in os.listdir(os.path.abspath(path)): + # TO DO: + # This pattern matching is particular to + # https://github.com/AntonioJBT/project_quickstart + # Needs to be made more generic + if fnmatch.fnmatch(f, 'report_pipeline_*.rst'): + f_count += 1 + pipeline_report_file = f + + if f_count == 1: + sphinx_config_files.append(pipeline_report_file) + + else: + # Only warn as pipeline can be run without report: + E.warn('''There is no pipeline report file matching + report_pipeline_*.rst + in the directories: + {} + {} + or + {} + Ignore this if you are using CGATReport. + '''.format(pipeline_path, pipeline_path_2, general_path) + ) + + # Copy the files across if they are found: + f_count = 0 + # Check all the paths and their files given above when searching for config files: + for path in paths: + if os.path.exists(path): + for f in os.listdir(path): + # For each file or search term given, match to an existing file: + for dest in sphinx_config_files: + if fnmatch.fnmatch(f, dest): + f_to_copy = f + # If a match is found, walk the cwd to check it's not + # already present: + for root, dirs, files in os.walk('.'): + if f_to_copy in files: + E.warn("file `%s` already exists - skipped" % f_to_copy) + continue + + # If not present, copy the file: + else: + f_count += 1 + src = os.path.join(path, f_to_copy) + if os.path.exists(src): + # Put sphinx files in separate dir: + shutil.copyfile(src, os.path.join(report_dir, + f_to_copy) + ) + E.info("created new configuration file `%s` " + % f_to_copy) + break + if f_count > 0: + pass + else: + E.warn('''No sphinx-quickstart skeleton files such as: + {} + were found + in + {} + Continuing without.'''.format(dest, paths)) def printConfigFiles(): ''' @@ -1016,12 +1173,54 @@ def main(args=sys.argv): printConfigFiles() elif options.pipeline_action == "config": - f = sys._getframe(1) - caller = f.f_globals["__file__"] + # (Antonio) I've modified this section, see explanation and changes in the + # writeConfigFiles function above. 
+ config_paths = [] + try: + f = sys._getframe(1) + caller = inspect.getargvalues(f).locals["__file__"] + # Make it easier to match the name of the command executed so that + # the config file can be searched in case there are more than one + # ini files found in writeConfig(): + # Making it global, check if there's better way: + global caller_name + caller_name = os.path.basename(os.path.normpath(caller)) + except KeyError as e: + # The following code only works if something like this function is + # present in my_pipeline.py script: + # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location + f = sys._getframe(2) + caller = inspect.getargvalues(f).locals["__file__"] + cmd_caller = os.path.basename(os.path.normpath(caller)) + # As above, save the command called in a separate variable: + global caller_name + caller_name = cmd_caller + cmd_caller = importlib.import_module(cmd_caller) + caller = cmd_caller.getDir() + else: + print('''Unable to find path to file being executed. Probably because + CGATPipelines and the pipeline that is being executed + cannot figure out where each other lives. Raise an issue in + GitHub if possible. Exiting.''') + + # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified + # version would only have pipe_XX/ + # so creating an additional pipeline_path + # TO DO: clean this up pipeline_path = os.path.splitext(caller)[0] - general_path = os.path.join(os.path.dirname(pipeline_path), - "configuration") - writeConfigFiles(pipeline_path, general_path) + pipeline_path_2 = os.path.dirname(pipeline_path) + # CGATPipelines have a "configuration" folder + # adding a glob to have a bit more flexibility + general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + + '/**/configuration*'), recursive = True) + + if not general_path: + general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") + + config_paths.extend([pipeline_path, pipeline_path_2]) + # Extend separately in case general_path returns more than one file: + config_paths.extend(general_path) + writeConfigFiles(config_paths) elif options.pipeline_action == "clone": clonePipeline(options.pipeline_targets[0]) diff --git a/CGATPipelines/Pipeline/__init__.py b/CGATPipelines/Pipeline/__init__.py index b396bf43..2c39d7b1 100644 --- a/CGATPipelines/Pipeline/__init__.py +++ b/CGATPipelines/Pipeline/__init__.py @@ -273,7 +273,9 @@ def run_report(clean=True, # warning: memory gets multiplied by threads, so set it not too # high - job_memory = "1G" + job_memory = PARAMS["report_memory"] + #"1G" # This causes problems in outside HPCs + job_threads = PARAMS["report_threads"] # use a fake X display in order to avoid windows popping up diff --git a/CGATPipelines/configuration/pipeline.ini b/CGATPipelines/configuration/pipeline.ini index 6f5f6f61..ec33ed06 100644 --- a/CGATPipelines/configuration/pipeline.ini +++ b/CGATPipelines/configuration/pipeline.ini @@ -89,6 +89,8 @@ priority=-10 # number of threads to use to build the documentation threads=10 +memory=1G + # directory for html documentation html=report/html From 8ae879731bc17340a478c51a4946b37d907310ae Mon Sep 17 00:00:00 2001 From: Antonio Date: Thu, 4 Jan 2018 19:19:14 +0000 Subject: [PATCH 02/21] Update Control.py --- CGATPipelines/Pipeline/Control.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index e785de16..05c9df3b 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -1193,7 
+1193,7 @@ def main(args=sys.argv): caller = inspect.getargvalues(f).locals["__file__"] cmd_caller = os.path.basename(os.path.normpath(caller)) # As above, save the command called in a separate variable: - global caller_name + #global caller_name caller_name = cmd_caller cmd_caller = importlib.import_module(cmd_caller) caller = cmd_caller.getDir() From 267ed99f30d0134f458db88cd9971235f62bdec9 Mon Sep 17 00:00:00 2001 From: Antonio Date: Fri, 5 Jan 2018 12:53:05 +0000 Subject: [PATCH 03/21] Update Control.py --- CGATPipelines/Pipeline/Control.py | 108 ++++++++++++++++-------------- 1 file changed, 56 insertions(+), 52 deletions(-) diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index 05c9df3b..bbaa18ee 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -71,8 +71,60 @@ GLOBAL_OPTIONS, GLOBAL_ARGS = None, None -def writeConfigFiles(paths): - #pipeline_path, pipeline_path_2, general_path): +def getConfigPaths(): + ''' + Search the current and installation paths where the configuration files live. + ''' + # (Antonio) I've modified this section, see explanation and changes in the + # writeConfigFiles function above. + config_paths = [] + try: + f = sys._getframe(1) + caller = inspect.getargvalues(f).locals["__file__"] + # Make it easier to match the name of the command executed so that + # the config file can be searched in case there are more than one + # ini files found in writeConfig(): + caller_name = os.path.basename(os.path.normpath(caller)) + # I think caller_name as separate var is needed for searching as string, can't remember now... + except KeyError as e: + # The following code only works if something like this function is + # present in my_pipeline.py script: + # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location + f = sys._getframe(2) + caller = inspect.getargvalues(f).locals["__file__"] + cmd_caller = os.path.basename(os.path.normpath(caller)) + # As above, save the command called in a separate variable: + caller_name = cmd_caller + cmd_caller = importlib.import_module(cmd_caller) + caller = cmd_caller.getDir() + else: + print('''Unable to find path to file being executed. Probably because + CGATPipelines and the pipeline that is being executed + cannot figure out where each other lives. Raise an issue in + GitHub if possible. Exiting.''') + + # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified + # version would only have pipe_XX/ + # so creating an additional pipeline_path + # TO DO: clean this up + pipeline_path = os.path.splitext(caller)[0] + pipeline_path_2 = os.path.dirname(pipeline_path) + # CGATPipelines have a "configuration" folder + # adding a glob to have a bit more flexibility + general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + + '/**/configuration*'), recursive = True) + if not general_path: + general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") + + # Add paths to search list: + config_paths.extend([pipeline_path, pipeline_path_2]) + # Extend separately in case general_path returns more than one file: + config_paths.extend(general_path) + + return(config_paths, caller_name) + + +def writeConfigFiles(paths, caller_name): '''create default configuration files in `path`. ''' # TO DO: I've modified this function with workarounds to make it more @@ -80,9 +132,8 @@ def writeConfigFiles(paths): # copy pre-run sphinx-quickstart files if they exist. 
# Other than creating a 'report' dir, it should not change the way it is # run from CGATPipelines. - # See also bottom of script for changes when calling the 'config' option + # See also getConfigPaths() above, these run when calling the 'config' option # Antonio - #paths = [pipeline_path, pipeline_path_2, general_path] report_dir = 'pipeline_report' try: os.mkdir(report_dir) # Sphinx config files will be copied here @@ -1173,54 +1224,7 @@ def main(args=sys.argv): printConfigFiles() elif options.pipeline_action == "config": - # (Antonio) I've modified this section, see explanation and changes in the - # writeConfigFiles function above. - config_paths = [] - try: - f = sys._getframe(1) - caller = inspect.getargvalues(f).locals["__file__"] - # Make it easier to match the name of the command executed so that - # the config file can be searched in case there are more than one - # ini files found in writeConfig(): - # Making it global, check if there's better way: - global caller_name - caller_name = os.path.basename(os.path.normpath(caller)) - except KeyError as e: - # The following code only works if something like this function is - # present in my_pipeline.py script: - # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location - f = sys._getframe(2) - caller = inspect.getargvalues(f).locals["__file__"] - cmd_caller = os.path.basename(os.path.normpath(caller)) - # As above, save the command called in a separate variable: - #global caller_name - caller_name = cmd_caller - cmd_caller = importlib.import_module(cmd_caller) - caller = cmd_caller.getDir() - else: - print('''Unable to find path to file being executed. Probably because - CGATPipelines and the pipeline that is being executed - cannot figure out where each other lives. Raise an issue in - GitHub if possible. Exiting.''') - - # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified - # version would only have pipe_XX/ - # so creating an additional pipeline_path - # TO DO: clean this up - pipeline_path = os.path.splitext(caller)[0] - pipeline_path_2 = os.path.dirname(pipeline_path) - # CGATPipelines have a "configuration" folder - # adding a glob to have a bit more flexibility - general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + - '/**/configuration*'), recursive = True) - - if not general_path: - general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") - - config_paths.extend([pipeline_path, pipeline_path_2]) - # Extend separately in case general_path returns more than one file: - config_paths.extend(general_path) - writeConfigFiles(config_paths) + writeConfigFiles(getConfigPaths()) elif options.pipeline_action == "clone": clonePipeline(options.pipeline_targets[0]) From 0f0c9e265dea44e7085a2cb21db59087ee675fcc Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Tue, 9 Jan 2018 13:06:25 +0000 Subject: [PATCH 04/21] tests for control.py --- CGATPipelines/Pipeline/Control.py | 108 +- CGATPipelines/Pipeline/Control.py.core_newest | 1235 +++++++++++++++++ 2 files changed, 1287 insertions(+), 56 deletions(-) create mode 100644 CGATPipelines/Pipeline/Control.py.core_newest diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index bbaa18ee..e785de16 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -71,60 +71,8 @@ GLOBAL_OPTIONS, GLOBAL_ARGS = None, None -def getConfigPaths(): - ''' - Search the current and installation paths where the configuration files live. 
- ''' - # (Antonio) I've modified this section, see explanation and changes in the - # writeConfigFiles function above. - config_paths = [] - try: - f = sys._getframe(1) - caller = inspect.getargvalues(f).locals["__file__"] - # Make it easier to match the name of the command executed so that - # the config file can be searched in case there are more than one - # ini files found in writeConfig(): - caller_name = os.path.basename(os.path.normpath(caller)) - # I think caller_name as separate var is needed for searching as string, can't remember now... - except KeyError as e: - # The following code only works if something like this function is - # present in my_pipeline.py script: - # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location - f = sys._getframe(2) - caller = inspect.getargvalues(f).locals["__file__"] - cmd_caller = os.path.basename(os.path.normpath(caller)) - # As above, save the command called in a separate variable: - caller_name = cmd_caller - cmd_caller = importlib.import_module(cmd_caller) - caller = cmd_caller.getDir() - else: - print('''Unable to find path to file being executed. Probably because - CGATPipelines and the pipeline that is being executed - cannot figure out where each other lives. Raise an issue in - GitHub if possible. Exiting.''') - - # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified - # version would only have pipe_XX/ - # so creating an additional pipeline_path - # TO DO: clean this up - pipeline_path = os.path.splitext(caller)[0] - pipeline_path_2 = os.path.dirname(pipeline_path) - # CGATPipelines have a "configuration" folder - # adding a glob to have a bit more flexibility - general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + - '/**/configuration*'), recursive = True) - if not general_path: - general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") - - # Add paths to search list: - config_paths.extend([pipeline_path, pipeline_path_2]) - # Extend separately in case general_path returns more than one file: - config_paths.extend(general_path) - - return(config_paths, caller_name) - - -def writeConfigFiles(paths, caller_name): +def writeConfigFiles(paths): + #pipeline_path, pipeline_path_2, general_path): '''create default configuration files in `path`. ''' # TO DO: I've modified this function with workarounds to make it more @@ -132,8 +80,9 @@ def writeConfigFiles(paths, caller_name): # copy pre-run sphinx-quickstart files if they exist. # Other than creating a 'report' dir, it should not change the way it is # run from CGATPipelines. - # See also getConfigPaths() above, these run when calling the 'config' option + # See also bottom of script for changes when calling the 'config' option # Antonio + #paths = [pipeline_path, pipeline_path_2, general_path] report_dir = 'pipeline_report' try: os.mkdir(report_dir) # Sphinx config files will be copied here @@ -1224,7 +1173,54 @@ def main(args=sys.argv): printConfigFiles() elif options.pipeline_action == "config": - writeConfigFiles(getConfigPaths()) + # (Antonio) I've modified this section, see explanation and changes in the + # writeConfigFiles function above. 
+ config_paths = [] + try: + f = sys._getframe(1) + caller = inspect.getargvalues(f).locals["__file__"] + # Make it easier to match the name of the command executed so that + # the config file can be searched in case there are more than one + # ini files found in writeConfig(): + # Making it global, check if there's better way: + global caller_name + caller_name = os.path.basename(os.path.normpath(caller)) + except KeyError as e: + # The following code only works if something like this function is + # present in my_pipeline.py script: + # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location + f = sys._getframe(2) + caller = inspect.getargvalues(f).locals["__file__"] + cmd_caller = os.path.basename(os.path.normpath(caller)) + # As above, save the command called in a separate variable: + global caller_name + caller_name = cmd_caller + cmd_caller = importlib.import_module(cmd_caller) + caller = cmd_caller.getDir() + else: + print('''Unable to find path to file being executed. Probably because + CGATPipelines and the pipeline that is being executed + cannot figure out where each other lives. Raise an issue in + GitHub if possible. Exiting.''') + + # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified + # version would only have pipe_XX/ + # so creating an additional pipeline_path + # TO DO: clean this up + pipeline_path = os.path.splitext(caller)[0] + pipeline_path_2 = os.path.dirname(pipeline_path) + # CGATPipelines have a "configuration" folder + # adding a glob to have a bit more flexibility + general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + + '/**/configuration*'), recursive = True) + + if not general_path: + general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") + + config_paths.extend([pipeline_path, pipeline_path_2]) + # Extend separately in case general_path returns more than one file: + config_paths.extend(general_path) + writeConfigFiles(config_paths) elif options.pipeline_action == "clone": clonePipeline(options.pipeline_targets[0]) diff --git a/CGATPipelines/Pipeline/Control.py.core_newest b/CGATPipelines/Pipeline/Control.py.core_newest new file mode 100644 index 00000000..898ceb7c --- /dev/null +++ b/CGATPipelines/Pipeline/Control.py.core_newest @@ -0,0 +1,1235 @@ +"""Control.py - Command line control for ruffus pipelines +========================================================= + +The functions :func:`writeConfigFiles`, :func:`clean`, +:func:`clonePipeline` and :func:`peekParameters` provide the +functionality for particular pipeline commands. + +:class:`MultiLineFormatter` improves the formatting +of long log messages, while +:class:`LoggingFilterRabbitMQ` intercepts ruffus log +messages and sends event information to a rabbitMQ message exchange +for task process monitoring. 
+ +Reference +--------- + +""" + +import inspect +import json +import logging +import os +import re +import shutil +import subprocess +import sys +import tempfile +import time +import io +import glob +import fnmatch +import importlib + +from multiprocessing.pool import ThreadPool + +# talking to RabbitMQ +try: + import pika + HAS_PIKA = True +except ImportError: + HAS_PIKA = False + +# talking to a cluster +try: + import drmaa + HAS_DRMAA = True +except: +# the following does not work on Travis +#except ImportError or RuntimeError: + HAS_DRMAA = False + +from ruffus import pipeline_printout_graph, pipeline_printout, \ + pipeline_run, ruffus_exceptions, task + + +import CGAT.Experiment as E +import CGAT.IOTools as IOTools +from CGAT import Requirements as Requirements + +from CGATPipelines.Pipeline.Utils import isTest, getCaller, getCallerLocals +from CGATPipelines.Pipeline.Execution import execute, startSession,\ + closeSession +from CGATPipelines.Pipeline.Local import getProjectName, getPipelineName +from CGATPipelines.Pipeline.Parameters import inputValidation +# Set from Pipeline.py +PARAMS = {} + +# global options and arguments - set but currently not +# used as relevant sections are entered into the PARAMS +# dictionary. Could be deprecated and removed. +GLOBAL_OPTIONS, GLOBAL_ARGS = None, None + + +def getConfigPaths(): + ''' + Search the current and installation paths where the configuration files live. + ''' + # (Antonio) I've modified this section, see explanation and changes in the + # writeConfigFiles function above. + config_paths = [] + try: + f = sys._getframe(1) + caller = inspect.getargvalues(f).locals["__file__"] + # Make it easier to match the name of the command executed so that + # the config file can be searched in case there are more than one + # ini files found in writeConfig(): + caller_name = os.path.basename(os.path.normpath(caller)) + except KeyError as e: + # The following code only works if something like this function is + # present in my_pipeline.py script: + # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location + f = sys._getframe(2) + caller = inspect.getargvalues(f).locals["__file__"] + cmd_caller = os.path.basename(os.path.normpath(caller)) + # As above, save the command called in a separate variable: + caller_name = cmd_caller + cmd_caller = importlib.import_module(cmd_caller) + caller = cmd_caller.getDir() + else: + print('''Unable to find path to file being executed. Probably because + CGATPipelines and the pipeline that is being executed + cannot figure out where each other lives. Raise an issue in + GitHub if possible. 
Exiting.''') + + # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified + # version would only have pipe_XX/ + # so creating an additional pipeline_path + # TO DO: clean this up + pipeline_path = os.path.splitext(caller)[0] + pipeline_path_2 = os.path.dirname(pipeline_path) + # CGATPipelines have a "configuration" folder + # adding a glob to have a bit more flexibility + general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + + '/**/configuration*'), recursive = True) + if not general_path: + general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") + + # Add paths to search list: + config_paths.extend([pipeline_path, pipeline_path_2]) + # Extend separately in case general_path returns more than one file: + config_paths.extend(general_path) + + return(config_paths, caller_name) + + +def writeConfigFiles(paths, caller_name): + '''create default configuration files in `path`. + ''' + # TO DO: I've modified this function with workarounds to make it more + # flexible in order to find an ini file, find a configuration dir and + # copy pre-run sphinx-quickstart files if they exist. + # Other than creating a 'report' dir, it should not change the way it is + # run from CGATPipelines. + # See also getConfigPaths() above, these run when calling the 'config' option + # Antonio + report_dir = 'pipeline_report' + try: + os.mkdir(report_dir) # Sphinx config files will be copied here + # CGATReport only needs its conf.py to generate the rest + # though + except FileExistsError: + E.warn("directory `%s` already exists" % report_dir) + raise + + # Look for ini file: + f_count = 0 + INI_list = [] + for path in paths: + if os.path.exists(path) and os.path.isdir(path): + for f in os.listdir(os.path.abspath(path)): + if fnmatch.fnmatch(f, 'pipeline*ini'): + f_count += 1 + INI_file = f + INI_list.extend([INI_file]) + + if f_count == 1: + config_files = [INI_file] # This is for the pipeline only + + elif f_count > 1: + # Prioritise the file that contains the command called if more than one + # ini file are found: + for f in INI_list: + if caller_name in f: + INI_file = f + config_files = [INI_file] + else: + if f_count == 0: + print(''' + No configuration (ini) files found in: + {} + '''.format(paths) + ) + else: + print(''' + Found several ini files but could not prioritise based on: + {} + Exiting. + '''.format(caller_name)) + sys.exit() + + # Copy pipeline ini file: + for dest in config_files: + if os.path.exists(dest): + E.warn("file `%s` already exists - skipped" % dest) + continue + + for path in paths: + src = os.path.join(path, dest) + if os.path.exists(src): + shutil.copyfile(src, dest) + E.info("created new configuration file `%s` " % dest) + break + else: + raise ValueError('''default config file for `%s` + not found in + %s + A pipeline cannot be run without this. 
+ ''' % (config_files, paths)) + + # Copy Sphinx configuration files, enforce copy of 'conf.py' in case + # CGATReport is used: + dest = 'conf.py' + if os.path.exists(dest): + E.warn("file `%s` already exists - skipped" % dest) + + for path in paths: + src = os.path.join(path, dest) + if os.path.exists(src): + # Put sphinx files in separate dir: + shutil.copyfile(src, os.path.join(report_dir, dest)) + # Create a softlink outside of report_dir dir for CGATReport: + os.symlink(os.path.join(report_dir, dest), str(dest)) + E.info("created new configuration file `%s` " % dest) + break + + else: + # Only warn as pipeline can be run without report: + E.warn('''default config file for `%s` not found in + %s + CGATReport nor Sphinx can be run without this''' % (dest, paths)) + + # If other Sphinx config files are found, copy them if there is a skeleton + # pipeline report to use: + E.info('Looking for additional Sphinx configuration files.') + sphinx_config_files = ['Makefile', + 'make.bat', + '*.rst', + '*.bib', + ] # These are for a sphinx setup, not needed + # with CGATReport + # A 'report_pipeline_*.rst' template is + # searched for below + + # Look for a pipeline report file: + f_count = 0 + for path in paths: + if os.path.exists(path): + for f in os.listdir(os.path.abspath(path)): + # TO DO: + # This pattern matching is particular to + # https://github.com/AntonioJBT/project_quickstart + # Needs to be made more generic + if fnmatch.fnmatch(f, 'report_pipeline_*.rst'): + f_count += 1 + pipeline_report_file = f + + if f_count == 1: + sphinx_config_files.append(pipeline_report_file) + + else: + # Only warn as pipeline can be run without report: + E.warn('''There is no pipeline report file matching + report_pipeline_*.rst + in the directories: + {} + {} + or + {} + Ignore this if you are using CGATReport. + '''.format(pipeline_path, pipeline_path_2, general_path) + ) + + # Copy the files across if they are found: + f_count = 0 + # Check all the paths and their files given above when searching for config files: + for path in paths: + if os.path.exists(path): + for f in os.listdir(path): + # For each file or search term given, match to an existing file: + for dest in sphinx_config_files: + if fnmatch.fnmatch(f, dest): + f_to_copy = f + # If a match is found, walk the cwd to check it's not + # already present: + for root, dirs, files in os.walk('.'): + if f_to_copy in files: + E.warn("file `%s` already exists - skipped" % f_to_copy) + continue + + # If not present, copy the file: + else: + f_count += 1 + src = os.path.join(path, f_to_copy) + if os.path.exists(src): + # Put sphinx files in separate dir: + shutil.copyfile(src, os.path.join(report_dir, + f_to_copy) + ) + E.info("created new configuration file `%s` " + % f_to_copy) + break + if f_count > 0: + pass + else: + E.warn('''No sphinx-quickstart skeleton files such as: + {} + were found + in + {} + Continuing without.'''.format(dest, paths)) + +def printConfigFiles(): + ''' + Print the list of .ini files used to configure the pipeline + along with their associated priorities. + Priority 1 is the highest. + ''' + + filenames = PARAMS['pipeline_ini'] + print("\n List of .ini files used to configure the pipeline") + s = len(filenames) + if s == 0: + print(" No ini files passed!") + elif s >= 1: + print(" %-11s: %s " % ("Priority", "File")) + for f in filenames: + if s == 1: + print(" (highest) %s: %s\n" % (s, f)) + else: + print(" %-11s: %s " % (s, f)) + s -= 1 + + +def clonePipeline(srcdir, destdir=None): + '''clone a pipeline. 
+ + Cloning entails creating a mirror of the source pipeline. + Generally, data files are mirrored by linking. Configuration + files and the pipeline database will be copied. + + Without modification of any files, building the cloned pipeline in + `destdir` should not re-run any commands. However, on deleting + selected files, the pipeline should run from the appropriate + point. Newly created files will not affect the original pipeline. + + Cloning pipelines permits sharing partial results between + pipelines, for example for parameter optimization. + + Arguments + --------- + scrdir : string + Source directory + destdir : string + Destination directory. If None, use the current directory. + + ''' + + if destdir is None: + destdir = os.path.curdir + + E.info("cloning pipeline from %s to %s" % (srcdir, destdir)) + + copy_files = ("conf.py", "pipeline.ini", "csvdb") + ignore_prefix = ( + "report", "_cache", "export", "tmp", "ctmp", + "_static", "_templates") + + def _ignore(p): + for x in ignore_prefix: + if p.startswith(x): + return True + return False + + for root, dirs, files in os.walk(srcdir): + + relpath = os.path.relpath(root, srcdir) + if _ignore(relpath): + continue + + for d in dirs: + if _ignore(d): + continue + dest = os.path.join(os.path.join(destdir, relpath, d)) + os.mkdir(dest) + # touch + s = os.stat(os.path.join(root, d)) + os.utime(dest, (s.st_atime, s.st_mtime)) + + for f in files: + if _ignore(f): + continue + + fn = os.path.join(root, f) + dest_fn = os.path.join(destdir, relpath, f) + if f in copy_files: + shutil.copyfile(fn, dest_fn) + else: + # realpath resolves links - thus links will be linked to + # the original target + os.symlink(os.path.realpath(fn), + dest_fn) + + +def clean(files, logfile): + '''clean up files given by glob expressions. + + Files are cleaned up by zapping, i.e. the files are set to size + 0. Links to files are replaced with place-holders. + + Information about the original file is written to `logfile`. + + Arguments + --------- + files : list + List of glob expressions of files to clean up. + logfile : string + Filename of logfile. + + ''' + fields = ('st_atime', 'st_blksize', 'st_blocks', + 'st_ctime', 'st_dev', 'st_gid', 'st_ino', + 'st_mode', 'st_mtime', 'st_nlink', + 'st_rdev', 'st_size', 'st_uid') + + dry_run = PARAMS.get("dryrun", False) + + if not dry_run: + if not os.path.exists(logfile): + outfile = IOTools.openFile(logfile, "w") + outfile.write("filename\tzapped\tlinkdest\t%s\n" % + "\t".join(fields)) + else: + outfile = IOTools.openFile(logfile, "a") + + c = E.Counter() + for fn in files: + c.files += 1 + if not dry_run: + stat, linkdest = IOTools.zapFile(fn) + if stat is not None: + c.zapped += 1 + if linkdest is not None: + c.links += 1 + outfile.write("%s\t%s\t%s\t%s\n" % ( + fn, + time.asctime(time.localtime(time.time())), + linkdest, + "\t".join([str(getattr(stat, x)) for x in fields]))) + + E.info("zapped: %s" % (c)) + outfile.close() + + return c + + +def peekParameters(workingdir, + pipeline, + on_error_raise=None, + prefix=None, + update_interface=False, + restrict_interface=False): + '''peek configuration parameters from external pipeline. + + As the paramater dictionary is built at runtime, this method + executes the pipeline in workingdir, dumping its configuration + values and reading them into a dictionary. + + If either `pipeline` or `workingdir` are not found, an error is + raised. This behaviour can be changed by setting `on_error_raise` + to False. In that case, an empty dictionary is returned. 
+ + Arguments + --------- + workingdir : string + Working directory. This is the directory that the pipeline + was executed in. + pipeline : string + Name of the pipeline script. The pipeline is assumed to live + in the same directory as the current pipeline. + on_error_raise : Bool + If set to a boolean, an error will be raised (or not) if there + is an error during parameter peeking, for example if + `workingdir` can not be found. If `on_error_raise` is None, it + will be set to the default, which is to raise an exception + unless the calling script is imported or the option + ``--is-test`` has been passed at the command line. + prefix : string + Add a prefix to all parameters. This is useful if the paramaters + are added to the configuration dictionary of the calling pipeline. + update_interface : bool + If True, this method will prefix any options in the + ``[interface]`` section with `workingdir`. This allows + transparent access to files in the external pipeline. + restrict_interface : bool + If True, only interface parameters will be imported. + + Returns + ------- + config : dict + Dictionary of configuration values. + + ''' + caller_locals = getCallerLocals() + + # check if we should raise errors + if on_error_raise is None: + on_error_raise = not isTest() and \ + "__name__" in caller_locals and \ + caller_locals["__name__"] == "__main__" + + # patch - if --help or -h in command line arguments, + # do not peek as there might be no config file. + if "--help" in sys.argv or "-h" in sys.argv: + return {} + + # Attempt to locate directory with pipeline source code. This is a + # patch as pipelines might be called within the repository + # directory or from an installed location + dirname = PARAMS["pipelinedir"] + + # called without a directory, use current directory + if dirname == "": + dirname = os.path.abspath(".") + else: + # if not exists, assume we want version located + # in directory of calling script. + if not os.path.exists(dirname): + # directory is path of calling script + dirname = os.path.dirname(caller_locals['__file__']) + + pipeline = os.path.join(dirname, pipeline) + if not os.path.exists(pipeline): + if on_error_raise: + raise ValueError( + "can't find pipeline at %s" % (pipeline)) + else: + return {} + + if workingdir == "": + workingdir = os.path.abspath(".") + + # patch for the "config" target - use default + # pipeline directory if directory is not specified + # working dir is set to "?!" + if "config" in sys.argv or "check" in sys.argv or "clone" in sys.argv and workingdir == "?!": + workingdir = os.path.join(PARAMS.get("pipelinedir"), + IOTools.snip(pipeline, ".py")) + + if not os.path.exists(workingdir): + if on_error_raise: + raise ValueError( + "can't find working dir %s" % workingdir) + else: + return {} + + statement = "python %s -f -v 0 dump" % pipeline + process = subprocess.Popen(statement, + cwd=workingdir, + shell=True, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + # process.stdin.close() + stdout, stderr = process.communicate() + if process.returncode != 0: + raise OSError( + ("Child was terminated by signal %i: \n" + "Statement: %s\n" + "The stderr was: \n%s\n" + "Stdout: %s") % + (-process.returncode, statement, stderr, stdout)) + + # subprocess only accepts encoding argument in py >= 3.6 so + # decode here. 
+ stdout = stdout.decode("utf-8").splitlines() + # remove any log messages + stdout = [x for x in stdout if x.startswith("{")] + if len(stdout) > 1: + raise ValueError("received multiple configurations") + dump = json.loads(stdout[0]) + + # update interface + if update_interface: + for key, value in list(dump.items()): + if key.startswith("interface"): + dump[key] = os.path.join(workingdir, value) + + # keep only interface if so required + if restrict_interface: + dump = dict([(k, v) for k, v in dump.items() + if k.startswith("interface")]) + + # prefix all parameters + if prefix is not None: + dump = dict([("%s%s" % (prefix, x), y) for x, y in list(dump.items())]) + + return dump + + +class MultiLineFormatter(logging.Formatter): + """add identation for multi-line entries. + """ + + def format(self, record): + s = logging.Formatter.format(self, record) + if record.message: + header, footer = s.split(record.message) + s = s.replace('\n', '\n' + ' ' * len(header)) + return s + + +class LoggingFilterRabbitMQ(logging.Filter): + """pass event information to a rabbitMQ message queue. + + This is a log filter which detects messages from ruffus_ and sends + them to a rabbitMQ message queue. + + A :term:`task` is a ruffus_ decorated function, which will execute + one or more :term:`jobs`. + + Valid task/job status: + + update + task/job needs updating + completed + task/job completed successfully + failed + task/job failed + running + task/job is running + ignore + ignore task/job (is up-to-date) + + Arguments + --------- + ruffus_text : string + Log messages from ruffus.pipeline_printout. These are used + to collect all tasks that will be executed during pipeline + executation. + project_name : string + Name of the project + pipeline_name : string + Name of the pipeline + host : string + RabbitMQ host name + exchange : string + RabbitMQ exchange name + + """ + + def __init__(self, ruffus_text, + project_name, + pipeline_name, + host="localhost", + exchange="ruffus_pipelines"): + + self.project_name = project_name + self.pipeline_name = pipeline_name + self.exchange = exchange + + # dictionary of jobs to run + self.jobs = {} + self.tasks = {} + + if not HAS_PIKA: + self.connected = False + return + + def split_by_job(text): + text = "".join(text) + job_message = "" + # ignore first entry which is the docstring + for line in text.split(" Job = ")[1:]: + try: + # long file names cause additional wrapping and + # additional white-space characters + job_name = re.search( + "\[.*-> ([^\]]+)\]", line).groups() + except AttributeError: + raise AttributeError("could not parse '%s'" % line) + job_status = "ignore" + if "Job needs update" in line: + job_status = "update" + + yield job_name, job_status, job_message + + def split_by_task(text): + block, task_name = [], None + task_status = None + for line in text.split("\n"): + line = line.strip() + + if line.startswith("Tasks which will be run"): + task_status = "update" + elif line.startswith("Tasks which are up-to-date"): + task_status = "ignore" + + if line.startswith("Task = "): + if task_name: + yield task_name, task_status, list(split_by_job(block)) + block = [] + task_name = re.match("Task = (.*)", line).groups()[0] + continue + if line: + block.append(line) + if task_name: + yield task_name, task_status, list(split_by_job(block)) + + # create connection + try: + connection = pika.BlockingConnection(pika.ConnectionParameters( + host=host)) + self.connected = True + except pika.exceptions.AMQPConnectionError: + self.connected = False + return + + 
self.channel = connection.channel() + self.channel.exchange_declare( + exchange=self.exchange, + type='topic') + + # populate with initial messages + for task_name, task_status, jobs in split_by_task(ruffus_text): + if task_name.startswith("(mkdir"): + continue + + to_run = 0 + for job_name, job_status, job_message in jobs: + self.jobs[job_name] = (task_name, job_name) + if job_status == "update": + to_run += 1 + + self.tasks[task_name] = [task_status, len(jobs), + len(jobs) - to_run] + self.send_task(task_name) + + def send_task(self, task_name): + '''send task status.''' + + if not self.connected: + return + + task_status, task_total, task_completed = self.tasks[task_name] + + data = {} + data['created_at'] = time.time() + data['pipeline'] = self.pipeline_name + data['task_name'] = task_name + data['task_status'] = task_status + data['task_total'] = task_total + data['task_completed'] = task_completed + + key = "%s.%s.%s" % (self.project_name, self.pipeline_name, task_name) + try: + self.channel.basic_publish(exchange=self.exchange, + routing_key=key, + body=json.dumps(data)) + except pika.exceptions.ConnectionClosed: + E.warn("could not send message - connection closed") + except Exception as e: + E.warn("could not send message: %s" % str(e)) + + def send_error(self, task_name, job, error=None, msg=None): + + if not self.connected: + return + + try: + task_status, task_total, task_completed = self.tasks[task_name] + except KeyError: + E.warn("could not get task information for %s, no message sent" % + task_name) + return + + data = {} + data['created_at'] = time.time() + data['pipeline'] = self.pipeline_name + data['task_name'] = task_name + data['task_status'] = 'failed' + data['task_total'] = task_total + data['task_completed'] = task_completed + + key = "%s.%s.%s" % (self.project_name, self.pipeline_name, task_name) + + try: + self.channel.basic_publish(exchange=self.exchange, + routing_key=key, + body=json.dumps(data)) + except pika.exceptions.ConnectionClosed: + E.warn("could not send message - connection closed") + except Exception as e: + E.warn("could not send message: %s" % str(e)) + + def filter(self, record): + + if not self.connected: + return True + + # filter ruffus logging messages + if record.filename.endswith("task.py"): + try: + before, task_name = record.msg.strip().split(" = ") + except ValueError: + return True + + # ignore the mkdir, etc tasks + if task_name not in self.tasks: + return True + + if before == "Task enters queue": + self.tasks[task_name][0] = "running" + elif before == "Completed Task": + self.tasks[task_name][0] = "completed" + elif before == "Uptodate Task": + self.tasks[task_name][0] = "uptodate" + else: + return True + + # send new task status out + self.send_task(task_name) + + return True + + +USAGE = ''' +usage: %prog [OPTIONS] [CMD] [target] + +Execute pipeline %prog. + +Commands can be any of the following + +make + run all tasks required to build *target* + +show + show tasks required to build *target* without executing them + +plot + plot image (using inkscape) of pipeline state for *target* + +debug [args] + debug a method using the supplied arguments. The method + in the pipeline is run without checking any dependencies. 
+ +config + write new configuration files pipeline.ini, sphinxreport.ini and conf.py + with default values + +dump + write pipeline configuration to stdout + +printconfig + write pipeline configuration to stdout in a user-friendly way so + it is easier to debug pipeline parameters + +touch + touch files only, do not run + +regenerate + regenerate the ruffus checkpoint file + +check + check if requirements (external tool dependencies) are satisfied. + +clone + create a clone of a pipeline in in the current + directory. The cloning process aims to use soft linking to files + (not directories) as much as possible. Time stamps are + preserved. Cloning is useful if a pipeline needs to be re-run from + a certain point but the original pipeline should be preserved. + +''' + + +def main(args=sys.argv): + """command line control function for a pipeline. + + This method defines command line options for the pipeline and + updates the global configuration dictionary correspondingly. + + It then provides a command parser to execute particular tasks + using the ruffus pipeline control functions. See the generated + command line help for usage. + + To use it, add:: + + import CGAT.Pipeline as P + + if __name__ == "__main__": + sys.exit(P.main(sys.argv)) + + to your pipeline script. + + Arguments + --------- + args : list + List of command line arguments. + + """ + + global GLOBAL_OPTIONS + global GLOBAL_ARGS + + parser = E.OptionParser(version="%prog version: $Id$", + usage=USAGE) + + parser.add_option("--pipeline-action", dest="pipeline_action", + type="choice", + choices=( + "make", "show", "plot", "dump", "config", "clone", + "check", "regenerate", "printconfig"), + help="action to take [default=%default].") + + parser.add_option("--pipeline-format", dest="pipeline_format", + type="choice", + choices=("dot", "jpg", "svg", "ps", "png"), + help="pipeline format [default=%default].") + + parser.add_option("-n", "--dry-run", dest="dry_run", + action="store_true", + help="perform a dry run (do not execute any shell " + "commands) [default=%default].") + + parser.add_option("-f", "--force-output", dest="force", + action="store_true", + help="force running the pipeline even if there " + "are uncommited changes " + "in the repository [default=%default].") + + parser.add_option("-p", "--multiprocess", dest="multiprocess", type="int", + help="number of parallel processes to use on " + "submit host " + "(different from number of jobs to use for " + "cluster jobs) " + "[default=%default].") + + parser.add_option("-e", "--exceptions", dest="log_exceptions", + action="store_true", + help="echo exceptions immediately as they occur " + "[default=%default].") + + parser.add_option("-i", "--terminate", dest="terminate", + action="store_true", + help="terminate immediately at the first exception " + "[default=%default].") + + parser.add_option("-d", "--debug", dest="debug", + action="store_true", + help="output debugging information on console, " + "and not the logfile " + "[default=%default].") + + parser.add_option("-s", "--set", dest="variables_to_set", + type="string", action="append", + help="explicitly set paramater values " + "[default=%default].") + + parser.add_option("-c", "--checksums", dest="ruffus_checksums_level", + type="int", + help="set the level of ruffus checksums" + "[default=%default].") + + parser.add_option("-t", "--is-test", dest="is_test", + action="store_true", + help="this is a test run" + "[default=%default].") + + parser.add_option("--rabbitmq-exchange", dest="rabbitmq_exchange", + 
type="string", + help="RabbitMQ exchange to send log messages to " + "[default=%default].") + + parser.add_option("--rabbitmq-host", dest="rabbitmq_host", + type="string", + help="RabbitMQ host to send log messages to " + "[default=%default].") + + parser.add_option("--input-validation", dest="input_validation", + action="store_true", + help="perform input validation before starting " + "[default=%default].") + + parser.set_defaults( + pipeline_action=None, + pipeline_format="svg", + pipeline_targets=[], + multiprocess=40, + logfile="pipeline.log", + dry_run=False, + force=False, + log_exceptions=False, + exceptions_terminate_immediately=False, + debug=False, + variables_to_set=[], + is_test=False, + ruffus_checksums_level=0, + rabbitmq_host="saruman", + rabbitmq_exchange="ruffus_pipelines", + input_validation=False) + + (options, args) = E.Start(parser, + add_cluster_options=True) + + GLOBAL_OPTIONS, GLOBAL_ARGS = options, args + E.info("Started in: %s" % PARAMS.get("workingdir")) + # At this point, the PARAMS dictionary has already been + # built. It now needs to be updated with selected command + # line options as these should always take precedence over + # configuration files. + + PARAMS["dryrun"] = options.dry_run + PARAMS["input_validation"] = options.input_validation + + # use cli_cluster_* keys in PARAMS to ensure highest priority + # of cluster_* options passed with the command-line + if options.cluster_memory_default is not None: + PARAMS["cli_cluster_memory_default"] = options.cluster_memory_default + PARAMS["cluster_memory_default"] = options.cluster_memory_default + if options.cluster_memory_resource is not None: + PARAMS["cli_cluster_memory_resource"] = options.cluster_memory_resource + PARAMS["cluster_memory_resource"] = options.cluster_memory_resource + if options.cluster_num_jobs is not None: + PARAMS["cli_cluster_num_jobs"] = options.cluster_num_jobs + PARAMS["cluster_num_jobs"] = options.cluster_num_jobs + if options.cluster_options is not None: + PARAMS["cli_cluster_options"] = options.cluster_options + PARAMS["cluster_options"] = options.cluster_options + if options.cluster_parallel_environment is not None: + PARAMS["cli_cluster_parallel_environment"] = options.cluster_parallel_environment + PARAMS["cluster_parallel_environment"] = options.cluster_parallel_environment + if options.cluster_priority is not None: + PARAMS["cli_cluster_priority"] = options.cluster_priority + PARAMS["cluster_priority"] = options.cluster_priority + if options.cluster_queue is not None: + PARAMS["cli_cluster_queue"] = options.cluster_queue + PARAMS["cluster_queue"] = options.cluster_queue + if options.cluster_queue_manager is not None: + PARAMS["cli_cluster_queue_manager"] = options.cluster_queue_manager + PARAMS["cluster_queue_manager"] = options.cluster_queue_manager + + PARAMS["ruffus_checksums_level"] = options.ruffus_checksums_level + + for variables in options.variables_to_set: + variable, value = variables.split("=") + PARAMS[variable.strip()] = IOTools.str2val(value.strip()) + + if args: + options.pipeline_action = args[0] + if len(args) > 1: + options.pipeline_targets.extend(args[1:]) + + # see inputValidation function in Parameters.py + if options.input_validation: + inputValidation(PARAMS, sys.argv[0]) + + if options.pipeline_action == "check": + counter, requirements = Requirements.checkRequirementsFromAllModules() + for requirement in requirements: + E.info("\t".join(map(str, requirement))) + E.info("version check summary: %s" % str(counter)) + E.Stop() + return + + elif 
options.pipeline_action == "debug": + # create the session proxy + startSession() + + method_name = options.pipeline_targets[0] + caller = getCaller() + method = getattr(caller, method_name) + method(*options.pipeline_targets[1:]) + + elif options.pipeline_action in ("make", "show", "svg", "plot", + "touch", "regenerate"): + + # set up extra file logger + handler = logging.FileHandler(filename=options.logfile, + mode="a") + handler.setFormatter( + MultiLineFormatter( + '%(asctime)s %(levelname)s %(module)s.%(funcName)s.%(lineno)d %(message)s')) + logger = logging.getLogger() + logger.addHandler(handler) + messenger = None + + try: + if options.pipeline_action == "make": + + # get tasks to be done. This essentially replicates + # the state information within ruffus. + stream = io.StringIO() + pipeline_printout( + stream, + options.pipeline_targets, + verbose=5, + checksum_level=options.ruffus_checksums_level) + + messenger = LoggingFilterRabbitMQ( + stream.getvalue(), + project_name=getProjectName(), + pipeline_name=getPipelineName(), + host=options.rabbitmq_host, + exchange=options.rabbitmq_exchange) + + logger.addFilter(messenger) + + if not options.without_cluster and HAS_DRMAA: + global task + # use threading instead of multiprocessing in order to + # limit the number of concurrent jobs by using the + # GIL + # + # Note that threading might cause problems with rpy. + task.Pool = ThreadPool + + # create the session proxy + startSession() + + # + # make sure we are not logging at the same time in + # different processes + # + # session_mutex = manager.Lock() + E.info(E.GetHeader()) + E.info("code location: %s" % PARAMS["pipeline_scriptsdir"]) + E.info("Working directory is: %s" % PARAMS["workingdir"]) + + pipeline_run( + options.pipeline_targets, + multiprocess=options.multiprocess, + logger=logger, + verbose=options.loglevel, + log_exceptions=options.log_exceptions, + exceptions_terminate_immediately=options.exceptions_terminate_immediately, + checksum_level=options.ruffus_checksums_level, + ) + + E.info(E.GetFooter()) + + closeSession() + + elif options.pipeline_action == "show": + pipeline_printout( + options.stdout, + options.pipeline_targets, + verbose=options.loglevel, + checksum_level=options.ruffus_checksums_level) + + elif options.pipeline_action == "touch": + pipeline_run( + options.pipeline_targets, + touch_files_only=True, + verbose=options.loglevel, + checksum_level=options.ruffus_checksums_level) + + elif options.pipeline_action == "regenerate": + pipeline_run( + options.pipeline_targets, + touch_files_only=options.ruffus_checksums_level, + verbose=options.loglevel) + + elif options.pipeline_action == "svg": + pipeline_printout_graph( + options.stdout.buffer, + options.pipeline_format, + options.pipeline_targets, + checksum_level=options.ruffus_checksums_level) + + elif options.pipeline_action == "plot": + outf, filename = tempfile.mkstemp() + pipeline_printout_graph( + os.fdopen(outf, "wb"), + options.pipeline_format, + options.pipeline_targets, + checksum_level=options.ruffus_checksums_level) + execute("inkscape %s" % filename) + os.unlink(filename) + + except ruffus_exceptions.RethrownJobError as value: + + if not options.debug: + E.error("%i tasks with errors, please see summary below:" % + len(value.args)) + for idx, e in enumerate(value.args): + task, job, error, msg, traceback = e + + if task is None: + # this seems to be errors originating within ruffus + # such as a missing dependency + # msg then contains a RethrownJobJerror + msg = str(msg) + pass + else: + task = 
re.sub("__main__.", "", task) + job = re.sub("\s", "", job) + + if messenger: + messenger.send_error(task, job, error, msg) + + # display only single line messages + if len([x for x in msg.split("\n") if x != ""]) > 1: + msg = "" + + E.error("%i: Task=%s Error=%s %s: %s" % + (idx, task, error, job, msg)) + + E.error("full traceback is in %s" % options.logfile) + + # write full traceback to log file only by removing the stdout + # handler + lhStdout = logger.handlers[0] + logger.removeHandler(lhStdout) + logger.error("start of error messages") + logger.error(value) + logger.error("end of error messages") + logger.addHandler(lhStdout) + + # raise error + raise ValueError( + "pipeline failed with %i errors" % len(value.args)) + else: + raise + + elif options.pipeline_action == "dump": + print(json.dumps(PARAMS)) + + elif options.pipeline_action == "printconfig": + print("Printing out pipeline parameters: ") + for k in sorted(PARAMS): + print(k, "=", PARAMS[k]) + printConfigFiles() + + elif options.pipeline_action == "config": + writeConfigFiles(getConfigPaths()) + + elif options.pipeline_action == "clone": + clonePipeline(options.pipeline_targets[0]) + + else: + raise ValueError("unknown pipeline action %s" % + options.pipeline_action) + + E.Stop() From 986fee17d1e7cead3177de5995d9a3aa97f97f23 Mon Sep 17 00:00:00 2001 From: Antonio Date: Tue, 9 Jan 2018 14:28:08 +0000 Subject: [PATCH 05/21] Update pipeline.ini --- CGATPipelines/configuration/pipeline.ini | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CGATPipelines/configuration/pipeline.ini b/CGATPipelines/configuration/pipeline.ini index ec33ed06..07af17bf 100644 --- a/CGATPipelines/configuration/pipeline.ini +++ b/CGATPipelines/configuration/pipeline.ini @@ -6,10 +6,10 @@ ######################################################## ######################################################## # The project name to appear in the report -projectname=CGATProject +projectname= # The copyright statement to appear in the report -copyright=CGAT (2010-2014) +copyright= # The short X.Y version to appear in the report version=0.1 @@ -37,7 +37,7 @@ scratchdir=/tmp web_dir=../web # location of indexed genome -genome_dir=/ifs/mirror/genomes/plain +genome_dir=/full/path/here # The genome to use (UCSC convention) genome=hg19 @@ -75,7 +75,8 @@ port=3306 [cluster] # queue to use -queue=all.q +queue= +#all.q # priority of jobs on cluster priority=-10 From 96b77566f391c6d026cbb26e800a121ee9281bf3 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Tue, 9 Jan 2018 14:35:34 +0000 Subject: [PATCH 06/21] updated cluster.py for pbspro, already in other branch though --- CGATPipelines/Pipeline/Cluster.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/CGATPipelines/Pipeline/Cluster.py b/CGATPipelines/Pipeline/Cluster.py index fc3d0d02..42bc53d4 100644 --- a/CGATPipelines/Pipeline/Cluster.py +++ b/CGATPipelines/Pipeline/Cluster.py @@ -199,13 +199,14 @@ def setupDrmaaJobTemplate(drmaa_session, options, job_name, job_memory): spec = ["-N %s" % job_name[0:15], "-l select=1:ncpus=%s:mem=%s" % (job_threads, job_memory)] - if options["cluster_options"]: - if "mem" not in options["cluster_options"]: - spec.append("%(cluster_options)s") - - elif "mem" in options["cluster_options"]: - raise ValueError('''mem resource specified twice, check ~/.cgat config file, - ini files, command line options, etc.''') + if options["cluster_options"]: + if "mem" not in options["cluster_options"]: + spec.append("%(cluster_options)s") + + elif 
"mem" in options["cluster_options"]: + raise ValueError('''mem resource specified twice, check ~/.cgat config file, + ini files, command line options, etc. + ''') if "cluster_pe_queue" in options and multithread: spec.append( From 1ef89cad5ab4c0a24727fdeb7171efe18a7dea8d Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Tue, 9 Jan 2018 14:38:21 +0000 Subject: [PATCH 07/21] updates/testing --- CGATPipelines/Pipeline/Control.py | 107 +++++++++--------- ...ol.py.core_newest => Control.py.works_mac} | 107 +++++++++--------- 2 files changed, 107 insertions(+), 107 deletions(-) rename CGATPipelines/Pipeline/{Control.py.core_newest => Control.py.works_mac} (94%) diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index e785de16..898ceb7c 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -71,8 +71,59 @@ GLOBAL_OPTIONS, GLOBAL_ARGS = None, None -def writeConfigFiles(paths): - #pipeline_path, pipeline_path_2, general_path): +def getConfigPaths(): + ''' + Search the current and installation paths where the configuration files live. + ''' + # (Antonio) I've modified this section, see explanation and changes in the + # writeConfigFiles function above. + config_paths = [] + try: + f = sys._getframe(1) + caller = inspect.getargvalues(f).locals["__file__"] + # Make it easier to match the name of the command executed so that + # the config file can be searched in case there are more than one + # ini files found in writeConfig(): + caller_name = os.path.basename(os.path.normpath(caller)) + except KeyError as e: + # The following code only works if something like this function is + # present in my_pipeline.py script: + # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location + f = sys._getframe(2) + caller = inspect.getargvalues(f).locals["__file__"] + cmd_caller = os.path.basename(os.path.normpath(caller)) + # As above, save the command called in a separate variable: + caller_name = cmd_caller + cmd_caller = importlib.import_module(cmd_caller) + caller = cmd_caller.getDir() + else: + print('''Unable to find path to file being executed. Probably because + CGATPipelines and the pipeline that is being executed + cannot figure out where each other lives. Raise an issue in + GitHub if possible. Exiting.''') + + # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified + # version would only have pipe_XX/ + # so creating an additional pipeline_path + # TO DO: clean this up + pipeline_path = os.path.splitext(caller)[0] + pipeline_path_2 = os.path.dirname(pipeline_path) + # CGATPipelines have a "configuration" folder + # adding a glob to have a bit more flexibility + general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + + '/**/configuration*'), recursive = True) + if not general_path: + general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") + + # Add paths to search list: + config_paths.extend([pipeline_path, pipeline_path_2]) + # Extend separately in case general_path returns more than one file: + config_paths.extend(general_path) + + return(config_paths, caller_name) + + +def writeConfigFiles(paths, caller_name): '''create default configuration files in `path`. ''' # TO DO: I've modified this function with workarounds to make it more @@ -80,9 +131,8 @@ def writeConfigFiles(paths): # copy pre-run sphinx-quickstart files if they exist. # Other than creating a 'report' dir, it should not change the way it is # run from CGATPipelines. 
- # See also bottom of script for changes when calling the 'config' option + # See also getConfigPaths() above, these run when calling the 'config' option # Antonio - #paths = [pipeline_path, pipeline_path_2, general_path] report_dir = 'pipeline_report' try: os.mkdir(report_dir) # Sphinx config files will be copied here @@ -1173,54 +1223,7 @@ def main(args=sys.argv): printConfigFiles() elif options.pipeline_action == "config": - # (Antonio) I've modified this section, see explanation and changes in the - # writeConfigFiles function above. - config_paths = [] - try: - f = sys._getframe(1) - caller = inspect.getargvalues(f).locals["__file__"] - # Make it easier to match the name of the command executed so that - # the config file can be searched in case there are more than one - # ini files found in writeConfig(): - # Making it global, check if there's better way: - global caller_name - caller_name = os.path.basename(os.path.normpath(caller)) - except KeyError as e: - # The following code only works if something like this function is - # present in my_pipeline.py script: - # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location - f = sys._getframe(2) - caller = inspect.getargvalues(f).locals["__file__"] - cmd_caller = os.path.basename(os.path.normpath(caller)) - # As above, save the command called in a separate variable: - global caller_name - caller_name = cmd_caller - cmd_caller = importlib.import_module(cmd_caller) - caller = cmd_caller.getDir() - else: - print('''Unable to find path to file being executed. Probably because - CGATPipelines and the pipeline that is being executed - cannot figure out where each other lives. Raise an issue in - GitHub if possible. Exiting.''') - - # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified - # version would only have pipe_XX/ - # so creating an additional pipeline_path - # TO DO: clean this up - pipeline_path = os.path.splitext(caller)[0] - pipeline_path_2 = os.path.dirname(pipeline_path) - # CGATPipelines have a "configuration" folder - # adding a glob to have a bit more flexibility - general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + - '/**/configuration*'), recursive = True) - - if not general_path: - general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") - - config_paths.extend([pipeline_path, pipeline_path_2]) - # Extend separately in case general_path returns more than one file: - config_paths.extend(general_path) - writeConfigFiles(config_paths) + writeConfigFiles(getConfigPaths()) elif options.pipeline_action == "clone": clonePipeline(options.pipeline_targets[0]) diff --git a/CGATPipelines/Pipeline/Control.py.core_newest b/CGATPipelines/Pipeline/Control.py.works_mac similarity index 94% rename from CGATPipelines/Pipeline/Control.py.core_newest rename to CGATPipelines/Pipeline/Control.py.works_mac index 898ceb7c..e785de16 100644 --- a/CGATPipelines/Pipeline/Control.py.core_newest +++ b/CGATPipelines/Pipeline/Control.py.works_mac @@ -71,59 +71,8 @@ PARAMS = {} GLOBAL_OPTIONS, GLOBAL_ARGS = None, None -def getConfigPaths(): - ''' - Search the current and installation paths where the configuration files live. - ''' - # (Antonio) I've modified this section, see explanation and changes in the - # writeConfigFiles function above. 
- config_paths = [] - try: - f = sys._getframe(1) - caller = inspect.getargvalues(f).locals["__file__"] - # Make it easier to match the name of the command executed so that - # the config file can be searched in case there are more than one - # ini files found in writeConfig(): - caller_name = os.path.basename(os.path.normpath(caller)) - except KeyError as e: - # The following code only works if something like this function is - # present in my_pipeline.py script: - # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location - f = sys._getframe(2) - caller = inspect.getargvalues(f).locals["__file__"] - cmd_caller = os.path.basename(os.path.normpath(caller)) - # As above, save the command called in a separate variable: - caller_name = cmd_caller - cmd_caller = importlib.import_module(cmd_caller) - caller = cmd_caller.getDir() - else: - print('''Unable to find path to file being executed. Probably because - CGATPipelines and the pipeline that is being executed - cannot figure out where each other lives. Raise an issue in - GitHub if possible. Exiting.''') - - # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified - # version would only have pipe_XX/ - # so creating an additional pipeline_path - # TO DO: clean this up - pipeline_path = os.path.splitext(caller)[0] - pipeline_path_2 = os.path.dirname(pipeline_path) - # CGATPipelines have a "configuration" folder - # adding a glob to have a bit more flexibility - general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + - '/**/configuration*'), recursive = True) - if not general_path: - general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") - - # Add paths to search list: - config_paths.extend([pipeline_path, pipeline_path_2]) - # Extend separately in case general_path returns more than one file: - config_paths.extend(general_path) - - return(config_paths, caller_name) - - -def writeConfigFiles(paths, caller_name): +def writeConfigFiles(paths): + #pipeline_path, pipeline_path_2, general_path): '''create default configuration files in `path`. ''' # TO DO: I've modified this function with workarounds to make it more @@ -131,8 +80,9 @@ def writeConfigFiles(paths, caller_name): # copy pre-run sphinx-quickstart files if they exist. # Other than creating a 'report' dir, it should not change the way it is # run from CGATPipelines. - # See also getConfigPaths() above, these run when calling the 'config' option + # See also bottom of script for changes when calling the 'config' option # Antonio + #paths = [pipeline_path, pipeline_path_2, general_path] report_dir = 'pipeline_report' try: os.mkdir(report_dir) # Sphinx config files will be copied here @@ -1223,7 +1173,54 @@ def main(args=sys.argv): printConfigFiles() elif options.pipeline_action == "config": - writeConfigFiles(getConfigPaths()) + # (Antonio) I've modified this section, see explanation and changes in the + # writeConfigFiles function above. 
+ config_paths = [] + try: + f = sys._getframe(1) + caller = inspect.getargvalues(f).locals["__file__"] + # Make it easier to match the name of the command executed so that + # the config file can be searched in case there are more than one + # ini files found in writeConfig(): + # Making it global, check if there's better way: + global caller_name + caller_name = os.path.basename(os.path.normpath(caller)) + except KeyError as e: + # The following code only works if something like this function is + # present in my_pipeline.py script: + # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location + f = sys._getframe(2) + caller = inspect.getargvalues(f).locals["__file__"] + cmd_caller = os.path.basename(os.path.normpath(caller)) + # As above, save the command called in a separate variable: + global caller_name + caller_name = cmd_caller + cmd_caller = importlib.import_module(cmd_caller) + caller = cmd_caller.getDir() + else: + print('''Unable to find path to file being executed. Probably because + CGATPipelines and the pipeline that is being executed + cannot figure out where each other lives. Raise an issue in + GitHub if possible. Exiting.''') + + # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified + # version would only have pipe_XX/ + # so creating an additional pipeline_path + # TO DO: clean this up + pipeline_path = os.path.splitext(caller)[0] + pipeline_path_2 = os.path.dirname(pipeline_path) + # CGATPipelines have a "configuration" folder + # adding a glob to have a bit more flexibility + general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + + '/**/configuration*'), recursive = True) + + if not general_path: + general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") + + config_paths.extend([pipeline_path, pipeline_path_2]) + # Extend separately in case general_path returns more than one file: + config_paths.extend(general_path) + writeConfigFiles(config_paths) elif options.pipeline_action == "clone": clonePipeline(options.pipeline_targets[0]) From ef798d1ab7c75b14604773cf145d32d22c1554e3 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Tue, 9 Jan 2018 17:55:04 +0000 Subject: [PATCH 08/21] control.py changes --- CGATPipelines/Pipeline/Control.py | 127 ++++++++++-------- ...{Control.py.works_mac => Control.py.works} | 1 - 2 files changed, 73 insertions(+), 55 deletions(-) rename CGATPipelines/Pipeline/{Control.py.works_mac => Control.py.works} (99%) diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index 898ceb7c..884c6217 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -73,35 +73,46 @@ def getConfigPaths(): ''' - Search the current and installation paths where the configuration files live. + Search the current and installation paths where the configuration files + live for the pipeline being called. ''' # (Antonio) I've modified this section, see explanation and changes in the # writeConfigFiles function above. 
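     # Background note: sys._getframe(1) below inspects the caller's stack
     # frame, so "__file__" in that frame points at the pipeline script that
     # invoked this code (e.g. a hypothetical pipeline_QTL.py) rather than
     # at Control.py itself.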
config_paths = [] + # Get the name of the pipeline being called + # This could be: + # cgatflow readqc config + # pipeline_QTL config + # python /YYYY//XXXX/pipeline_XXXX.py config try: f = sys._getframe(1) - caller = inspect.getargvalues(f).locals["__file__"] + caller = f.f_globals["__file__"] # cgatflow config + #caller = inspect.getargvalues(f).locals["__file__"] # Make it easier to match the name of the command executed so that # the config file can be searched in case there are more than one # ini files found in writeConfig(): caller_name = os.path.basename(os.path.normpath(caller)) + print('try 1', f, caller, caller_name) except KeyError as e: # The following code only works if something like this function is # present in my_pipeline.py script: # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location f = sys._getframe(2) - caller = inspect.getargvalues(f).locals["__file__"] + caller = f.f_globals["__file__"] # cgatflow config + #caller = inspect.getargvalues(f).locals["__file__"] cmd_caller = os.path.basename(os.path.normpath(caller)) + print('first defs try 2', f, caller, cmd_caller) # As above, save the command called in a separate variable: caller_name = cmd_caller cmd_caller = importlib.import_module(cmd_caller) caller = cmd_caller.getDir() - else: - print('''Unable to find path to file being executed. Probably because - CGATPipelines and the pipeline that is being executed - cannot figure out where each other lives. Raise an issue in - GitHub if possible. Exiting.''') - + print('2nd defs try 2', caller_name, cmd_caller, caller) + #else: + # print('''Unable to find path to file being executed. Probably because + # CGATPipelines and the pipeline that is being executed + # cannot figure out where each other lives. Raise an issue in + # GitHub if possible. Exiting.''') + # sys.exit() # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified # version would only have pipe_XX/ # so creating an additional pipeline_path @@ -111,7 +122,7 @@ def getConfigPaths(): # CGATPipelines have a "configuration" folder # adding a glob to have a bit more flexibility general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + - '/**/configuration*'), recursive = True) + '/*/configuration*'), recursive = True) if not general_path: general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") @@ -120,10 +131,10 @@ def getConfigPaths(): # Extend separately in case general_path returns more than one file: config_paths.extend(general_path) + print(config_paths, caller_name) return(config_paths, caller_name) - -def writeConfigFiles(paths, caller_name): +def writeConfigFiles(config_paths, caller_name): '''create default configuration files in `path`. 
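     In outline: the supplied paths are searched for a pipeline*ini file,
     which is copied to the working directory; conf.py and any
     sphinx-quickstart skeleton files are then copied into a
     'pipeline_report' directory, with a conf.py symlink left in the
     working directory for CGATReport.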
''' # TO DO: I've modified this function with workarounds to make it more @@ -134,6 +145,10 @@ def writeConfigFiles(paths, caller_name): # See also getConfigPaths() above, these run when calling the 'config' option # Antonio report_dir = 'pipeline_report' + config_files = [] + print(config_paths) + print(caller_name) + try: os.mkdir(report_dir) # Sphinx config files will be copied here # CGATReport only needs its conf.py to generate the rest @@ -145,7 +160,7 @@ def writeConfigFiles(paths, caller_name): # Look for ini file: f_count = 0 INI_list = [] - for path in paths: + for path in config_paths: if os.path.exists(path) and os.path.isdir(path): for f in os.listdir(os.path.abspath(path)): if fnmatch.fnmatch(f, 'pipeline*ini'): @@ -158,44 +173,48 @@ def writeConfigFiles(paths, caller_name): elif f_count > 1: # Prioritise the file that contains the command called if more than one - # ini file are found: + # ini files are found: for f in INI_list: if caller_name in f: + count += 1 INI_file = f config_files = [INI_file] - else: - if f_count == 0: - print(''' - No configuration (ini) files found in: - {} - '''.format(paths) - ) - else: - print(''' - Found several ini files but could not prioritise based on: - {} - Exiting. - '''.format(caller_name)) - sys.exit() + if count == 0: + E.warn(''' + Found several ini files but could not prioritise based on: + {}. + Some pipelines do not require an ini file though, try + without. + '''.format(caller_name)) + + if f_count == 0: + E.warn(''' + No configuration (ini) files found in: + {} + '''.format(config_paths) + ) # Copy pipeline ini file: - for dest in config_files: - if os.path.exists(dest): - E.warn("file `%s` already exists - skipped" % dest) - continue + if not config_files: + E.warn('No configuration files found.') + else: + for dest in config_files: + if os.path.exists(dest): + E.warn("file `%s` already exists - skipped" % dest) + continue - for path in paths: - src = os.path.join(path, dest) - if os.path.exists(src): - shutil.copyfile(src, dest) - E.info("created new configuration file `%s` " % dest) - break - else: - raise ValueError('''default config file for `%s` - not found in - %s - A pipeline cannot be run without this. - ''' % (config_files, paths)) + for path in config_paths: + src = os.path.join(path, dest) + if os.path.exists(src): + shutil.copyfile(src, dest) + E.info("created new configuration file `%s` " % dest) + break + else: + raise ValueError('''default config file for `%s` + not found in + %s + A pipeline cannot be run without this. 
+ ''' % (config_files, config_paths)) # Copy Sphinx configuration files, enforce copy of 'conf.py' in case # CGATReport is used: @@ -203,7 +222,7 @@ def writeConfigFiles(paths, caller_name): if os.path.exists(dest): E.warn("file `%s` already exists - skipped" % dest) - for path in paths: + for path in config_paths: src = os.path.join(path, dest) if os.path.exists(src): # Put sphinx files in separate dir: @@ -217,7 +236,8 @@ def writeConfigFiles(paths, caller_name): # Only warn as pipeline can be run without report: E.warn('''default config file for `%s` not found in %s - CGATReport nor Sphinx can be run without this''' % (dest, paths)) + CGATReport nor Sphinx can be run without this''' % (dest, + config_paths)) # If other Sphinx config files are found, copy them if there is a skeleton # pipeline report to use: @@ -233,7 +253,7 @@ def writeConfigFiles(paths, caller_name): # Look for a pipeline report file: f_count = 0 - for path in paths: + for path in config_paths: if os.path.exists(path): for f in os.listdir(os.path.abspath(path)): # TO DO: @@ -253,17 +273,14 @@ def writeConfigFiles(paths, caller_name): report_pipeline_*.rst in the directories: {} - {} - or - {} Ignore this if you are using CGATReport. - '''.format(pipeline_path, pipeline_path_2, general_path) + '''.format(config_paths) ) # Copy the files across if they are found: f_count = 0 # Check all the paths and their files given above when searching for config files: - for path in paths: + for path in config_paths: if os.path.exists(path): for f in os.listdir(path): # For each file or search term given, match to an existing file: @@ -297,7 +314,7 @@ def writeConfigFiles(paths, caller_name): were found in {} - Continuing without.'''.format(dest, paths)) + Continuing without.'''.format(dest, config_paths)) def printConfigFiles(): ''' @@ -1223,7 +1240,9 @@ def main(args=sys.argv): printConfigFiles() elif options.pipeline_action == "config": - writeConfigFiles(getConfigPaths()) + config_paths = getConfigPaths()[0] + caller_name = getConfigPaths()[1] + writeConfigFiles(config_paths, caller_name) elif options.pipeline_action == "clone": clonePipeline(options.pipeline_targets[0]) diff --git a/CGATPipelines/Pipeline/Control.py.works_mac b/CGATPipelines/Pipeline/Control.py.works similarity index 99% rename from CGATPipelines/Pipeline/Control.py.works_mac rename to CGATPipelines/Pipeline/Control.py.works index e785de16..87dde9b3 100644 --- a/CGATPipelines/Pipeline/Control.py.works_mac +++ b/CGATPipelines/Pipeline/Control.py.works @@ -1193,7 +1193,6 @@ def main(args=sys.argv): caller = inspect.getargvalues(f).locals["__file__"] cmd_caller = os.path.basename(os.path.normpath(caller)) # As above, save the command called in a separate variable: - global caller_name caller_name = cmd_caller cmd_caller = importlib.import_module(cmd_caller) caller = cmd_caller.getDir() From 50173b2c486784919b920e4d66d635f3a5ab4be3 Mon Sep 17 00:00:00 2001 From: Antonio Date: Wed, 17 Jan 2018 15:29:06 +0000 Subject: [PATCH 09/21] added function to search and import external pipeline, untested --- CGATPipelines/cgatflow.py | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/CGATPipelines/cgatflow.py b/CGATPipelines/cgatflow.py index 8d219115..c194eb53 100644 --- a/CGATPipelines/cgatflow.py +++ b/CGATPipelines/cgatflow.py @@ -55,6 +55,28 @@ def printListInColumns(l, ncolumns): # put it all together return '\n'.join([pattern % row for row in rows]) +def getExternalPipeline(): + ''' + Import external pipeline built 
using CGAT approach and installed as python module. + Assumes you have called using cgatflow, e.g. cgatflow externalPipeline CMD + ''' + # Get name of pipeline from command line arguments: + argv = sys.argv + command = argv[1] + command = re.sub("-", "_", command) + pipeline = "pipeline_{}".format(command) + + # Get path to where the pipeline is installed: + path = os.path.join(os.path.abspath(os.path.dirname(command.__file__))) + + # Import it: + (file, pathname, description) = imp.find_module(pipeline, path) + module = imp.load_module(pipeline, file, pathname, description) + # remove 'cgatflow' from sys.argv + del sys.argv[0] + module.main(sys.argv) + + return def main(argv=None): @@ -82,11 +104,15 @@ def main(argv=None): command = re.sub("-", "_", command) pipeline = "pipeline_{}".format(command) - (file, pathname, description) = imp.find_module(pipeline, paths) - module = imp.load_module(pipeline, file, pathname, description) - # remove 'cgatflow' from sys.argv - del sys.argv[0] - module.main(sys.argv) + try: + (file, pathname, description) = imp.find_module(pipeline, paths) + module = imp.load_module(pipeline, file, pathname, description) + # remove 'cgatflow' from sys.argv + del sys.argv[0] + module.main(sys.argv) + + except ImportError: + getExternalPipeline() if __name__ == "__main__": sys.exit(main()) From bf20e7501984486441b59e8ad14fc01e99a0d420 Mon Sep 17 00:00:00 2001 From: Antonio Date: Wed, 17 Jan 2018 15:33:46 +0000 Subject: [PATCH 10/21] docstrings for cgatflow external pipeline import function --- CGATPipelines/cgatflow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CGATPipelines/cgatflow.py b/CGATPipelines/cgatflow.py index c194eb53..0615b88c 100644 --- a/CGATPipelines/cgatflow.py +++ b/CGATPipelines/cgatflow.py @@ -59,6 +59,8 @@ def getExternalPipeline(): ''' Import external pipeline built using CGAT approach and installed as python module. Assumes you have called using cgatflow, e.g. 
cgatflow externalPipeline CMD
+    "externalPipeline" should be the load_entry_point command in setup.py
+    and the main pipeline script needs to be "pipeline_externalPipeline.py"
     '''
     # Get name of pipeline from command line arguments:
     argv = sys.argv

From 74895c8e2e772106e0901fae6c6f18beebd4a059 Mon Sep 17 00:00:00 2001
From: AntonioJBT
Date: Wed, 17 Jan 2018 17:00:53 +0000
Subject: [PATCH 11/21] updates/testing

---
 CGATPipelines/Pipeline/Control.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py
index 884c6217..e98410ea 100644
--- a/CGATPipelines/Pipeline/Control.py
+++ b/CGATPipelines/Pipeline/Control.py
@@ -15,7 +15,6 @@
 ---------
 """
-
 import inspect
 import json
 import logging
@@ -87,26 +86,24 @@ def getConfigPaths():
     try:
         f = sys._getframe(1)
         caller = f.f_globals["__file__"] # cgatflow config
-        #caller = inspect.getargvalues(f).locals["__file__"]
         # Make it easier to match the name of the command executed so that
         # the config file can be searched in case there are more than one
         # ini files found in writeConfig():
         caller_name = os.path.basename(os.path.normpath(caller))
         print('try 1', f, caller, caller_name)
-    except KeyError as e:
+    except KeyError:
         # The following code only works if something like this function is
         # present in my_pipeline.py script:
         # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location
-        f = sys._getframe(2)
-        caller = f.f_globals["__file__"] # cgatflow config
-        #caller = inspect.getargvalues(f).locals["__file__"]
+        f = sys._getframe(0) # if e.g. call is direct for pipeline_QTL config
+        caller = inspect.getargvalues(f).locals["__file__"]
         cmd_caller = os.path.basename(os.path.normpath(caller))
-        print('first defs try 2', f, caller, cmd_caller)
+        print('1st', caller_name, cmd_caller, caller)
         # As above, save the command called in a separate variable:
         caller_name = cmd_caller
         cmd_caller = importlib.import_module(cmd_caller)
         caller = cmd_caller.getDir()
-        print('2nd defs try 2', caller_name, cmd_caller, caller)
+        print('2nd', caller_name, cmd_caller, caller)
     #else:
    #    print('''Unable to find path to file being executed.
Probably because # CGATPipelines and the pipeline that is being executed @@ -129,7 +126,7 @@ def getConfigPaths(): # Add paths to search list: config_paths.extend([pipeline_path, pipeline_path_2]) # Extend separately in case general_path returns more than one file: - config_paths.extend(general_path) + config_paths.append(general_path) print(config_paths, caller_name) return(config_paths, caller_name) @@ -146,8 +143,8 @@ def writeConfigFiles(config_paths, caller_name): # Antonio report_dir = 'pipeline_report' config_files = [] - print(config_paths) - print(caller_name) + #print(config_paths) + #print(caller_name) try: os.mkdir(report_dir) # Sphinx config files will be copied here From d71189592dba8b791bb2d3a84a4d1d3bf24dc9ec Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Wed, 17 Jan 2018 20:57:43 +0000 Subject: [PATCH 12/21] added options for calling external pipelines to cgatflow --- CGATPipelines/cgatflow.py | 108 ++++++++++++++++++++++++++++++++------ 1 file changed, 93 insertions(+), 15 deletions(-) diff --git a/CGATPipelines/cgatflow.py b/CGATPipelines/cgatflow.py index 0615b88c..205b1924 100644 --- a/CGATPipelines/cgatflow.py +++ b/CGATPipelines/cgatflow.py @@ -22,7 +22,9 @@ import re import glob import imp +import importlib # imp is being deprecated in favour of importlib import collections +import shutil import CGATPipelines @@ -58,22 +60,94 @@ def printListInColumns(l, ncolumns): def getExternalPipeline(): ''' Import external pipeline built using CGAT approach and installed as python module. - Assumes you have called using cgatflow, e.g. cgatflow externalPipeline CMD - "externalPipeline" should be the load_entry_point coomand in setup.py - and the main pipeline script needs to be "pipeline_externalPipeline.py" + Assumes you have called using cgatflow, e.g. cgatflow pipeline_external CMD + The main pipeline script needs to be "pipeline_external.py" + with the directory structure as in CGATPipelines + Alternatively, "pipeline_external" could be the load_entry_point command in setup.py + without the need for cgatflow ''' # Get name of pipeline from command line arguments: argv = sys.argv - command = argv[1] - command = re.sub("-", "_", command) - pipeline = "pipeline_{}".format(command) - - # Get path to where the pipeline is installed: - path = os.path.join(os.path.abspath(os.path.dirname(command.__file__))) - - # Import it: - (file, pathname, description) = imp.find_module(pipeline, path) - module = imp.load_module(pipeline, file, pathname, description) + command = argv[1] #e.g. cgatflow pipeline_external CMD + + try: + # System path to the command: + sys_cmd_path = shutil.which(str(command)) + # Get the base command if cgatflow was given a full path to the command: + # e.g. 
cgatflow /path_to/CGATPipelines/CGATPipelines/pipeline_readqc.py -h + # Unnecessary though as can just be called as + # python /path_to/CGATPipelines/CGATPipelines/pipeline_readqc.py -h + command_sanitised = os.path.basename(sys_cmd_path) + if command_sanitised.endswith('.py'): + command_sanitised = command_sanitised.replace('.py', '') + else: + pass + # Set the paths to search for the module: + command_path = os.path.abspath(os.path.dirname(sys_cmd_path)) + command_path_up = os.path.abspath(os.path.join(command_path, '..')) + relpath = os.path.abspath("../src") + relpath_2 = os.path.abspath("../code") + # If dependencies are missing (ImportErrors in main() below, they'll be + # sent here and picked as AttributeError from shutil.which(), coded just above): + except AttributeError: + print(''' + The pipeline provided to cgatflow could not be found. + Are you trying to run an external pipeline? + Is the command line correctly specified? + Are all dependencies installed? + Try cgatflow --help for more info or raise an issue in GitHub + with the following trace please: + ''') + raise + + paths = [command_path, command_path_up, relpath, relpath_2] + print(paths) + + # Import module as given: + try: + module = importlib.import_module(command) + #module_spec = importlib.util.find_spec(str(command)) + #(file, pathname, description) = imp.find_module(str(command), paths) + except (ImportError, ValueError, TypeError): + pass + # Import it as sanitised: + try: + module = importlib.import_module(command_sanitised) + #module_spec = importlib.util.find_spec(str(command_sanitised)) + #(file, pathname, description) = imp.find_module(str(command_sanitised), paths) + command = command_sanitised + except: + print(''' + Error. Tried importing the pipeline provided as + {} + and + {} + but it did not work. + See trace below and also try + cgatflow --help + for more information. + '''.format(command, command_sanitised) + ) + raise + + # If module was found: + #if module_spec: + #print('Found {}, loading ...'.format(command)) + #module = importlib.import_module(command) + #module = importlib.util.module_from_spec(module_spec) + #module_spec.loader.exec_module(module) + #else: + # print(''' + # Error. Module + # {} + # could not be loaded. + # Is your pipeline an importable module? + # All dependencies installed? + # See trace below, try cgatflow --help or raise an issue in GitHub. + # '''.format(command)) + + print(dir(module)) + #module = imp.load_module(str(command), file, pathname, description) # remove 'cgatflow' from sys.argv del sys.argv[0] module.main(sys.argv) @@ -89,7 +163,7 @@ def main(argv=None): relpath = os.path.abspath("../src") paths = [path, relpath] - + if len(argv) == 1 or argv[1] == "--help" or argv[1] == "-h": pipelines = [] for path in paths: @@ -105,7 +179,7 @@ def main(argv=None): command = argv[1] command = re.sub("-", "_", command) pipeline = "pipeline_{}".format(command) - + try: (file, pathname, description) = imp.find_module(pipeline, paths) module = imp.load_module(pipeline, file, pathname, description) @@ -113,6 +187,10 @@ def main(argv=None): del sys.argv[0] module.main(sys.argv) + # If the command is an external pipeline this allows to search for it with + # getExternalPipeline() above. If dependencies are not installed they'll be + # picked up here as well though but should error inside + # getExternalPipeline(). 
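+    # Illustrative flow (pipeline name hypothetical): 'cgatflow my_tool config'
+    # first tries to import pipeline_my_tool from the CGATPipelines paths;
+    # on ImportError, getExternalPipeline() above resolves 'my_tool' on the
+    # system PATH via shutil.which() and imports that module instead.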
except ImportError: getExternalPipeline() From 1aee135a0b4c693a83d35619075c8f2810f66dc7 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Wed, 17 Jan 2018 21:12:16 +0000 Subject: [PATCH 13/21] updates/testing --- CGATPipelines/cgatflow.py | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/CGATPipelines/cgatflow.py b/CGATPipelines/cgatflow.py index 205b1924..9be9f3e4 100644 --- a/CGATPipelines/cgatflow.py +++ b/CGATPipelines/cgatflow.py @@ -101,20 +101,16 @@ def getExternalPipeline(): raise paths = [command_path, command_path_up, relpath, relpath_2] - print(paths) # Import module as given: + # With importlib paths are not needed, leaving for now though. try: module = importlib.import_module(command) - #module_spec = importlib.util.find_spec(str(command)) - #(file, pathname, description) = imp.find_module(str(command), paths) except (ImportError, ValueError, TypeError): pass # Import it as sanitised: try: module = importlib.import_module(command_sanitised) - #module_spec = importlib.util.find_spec(str(command_sanitised)) - #(file, pathname, description) = imp.find_module(str(command_sanitised), paths) command = command_sanitised except: print(''' @@ -130,27 +126,12 @@ def getExternalPipeline(): ) raise - # If module was found: - #if module_spec: - #print('Found {}, loading ...'.format(command)) - #module = importlib.import_module(command) - #module = importlib.util.module_from_spec(module_spec) - #module_spec.loader.exec_module(module) - #else: - # print(''' - # Error. Module - # {} - # could not be loaded. - # Is your pipeline an importable module? - # All dependencies installed? - # See trace below, try cgatflow --help or raise an issue in GitHub. - # '''.format(command)) - - print(dir(module)) - #module = imp.load_module(str(command), file, pathname, description) + #print(dir(module)) # remove 'cgatflow' from sys.argv del sys.argv[0] module.main(sys.argv) + except AttributeError: + return From db7f8a5e5f85f9b9425e9f6e42261b5d9eddeaa1 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Wed, 17 Jan 2018 21:12:56 +0000 Subject: [PATCH 14/21] updates/testing --- CGATPipelines/cgatflow.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/CGATPipelines/cgatflow.py b/CGATPipelines/cgatflow.py index 9be9f3e4..caa30be1 100644 --- a/CGATPipelines/cgatflow.py +++ b/CGATPipelines/cgatflow.py @@ -130,9 +130,6 @@ def getExternalPipeline(): # remove 'cgatflow' from sys.argv del sys.argv[0] module.main(sys.argv) - except AttributeError: - - return def main(argv=None): From a3f1497f194d9665c75d7bbd6166eb6dfa5e1252 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Mon, 22 Jan 2018 13:59:13 +0000 Subject: [PATCH 15/21] returned cgatflow to original code, no changes, easier to call external pipelines through entry_point --- CGATPipelines/cgatflow.py | 98 +++------------------------------------ 1 file changed, 7 insertions(+), 91 deletions(-) diff --git a/CGATPipelines/cgatflow.py b/CGATPipelines/cgatflow.py index caa30be1..8d219115 100644 --- a/CGATPipelines/cgatflow.py +++ b/CGATPipelines/cgatflow.py @@ -22,9 +22,7 @@ import re import glob import imp -import importlib # imp is being deprecated in favour of importlib import collections -import shutil import CGATPipelines @@ -57,80 +55,6 @@ def printListInColumns(l, ncolumns): # put it all together return '\n'.join([pattern % row for row in rows]) -def getExternalPipeline(): - ''' - Import external pipeline built using CGAT approach and installed as python module. - Assumes you have called using cgatflow, e.g. 
cgatflow pipeline_external CMD - The main pipeline script needs to be "pipeline_external.py" - with the directory structure as in CGATPipelines - Alternatively, "pipeline_external" could be the load_entry_point command in setup.py - without the need for cgatflow - ''' - # Get name of pipeline from command line arguments: - argv = sys.argv - command = argv[1] #e.g. cgatflow pipeline_external CMD - - try: - # System path to the command: - sys_cmd_path = shutil.which(str(command)) - # Get the base command if cgatflow was given a full path to the command: - # e.g. cgatflow /path_to/CGATPipelines/CGATPipelines/pipeline_readqc.py -h - # Unnecessary though as can just be called as - # python /path_to/CGATPipelines/CGATPipelines/pipeline_readqc.py -h - command_sanitised = os.path.basename(sys_cmd_path) - if command_sanitised.endswith('.py'): - command_sanitised = command_sanitised.replace('.py', '') - else: - pass - # Set the paths to search for the module: - command_path = os.path.abspath(os.path.dirname(sys_cmd_path)) - command_path_up = os.path.abspath(os.path.join(command_path, '..')) - relpath = os.path.abspath("../src") - relpath_2 = os.path.abspath("../code") - # If dependencies are missing (ImportErrors in main() below, they'll be - # sent here and picked as AttributeError from shutil.which(), coded just above): - except AttributeError: - print(''' - The pipeline provided to cgatflow could not be found. - Are you trying to run an external pipeline? - Is the command line correctly specified? - Are all dependencies installed? - Try cgatflow --help for more info or raise an issue in GitHub - with the following trace please: - ''') - raise - - paths = [command_path, command_path_up, relpath, relpath_2] - - # Import module as given: - # With importlib paths are not needed, leaving for now though. - try: - module = importlib.import_module(command) - except (ImportError, ValueError, TypeError): - pass - # Import it as sanitised: - try: - module = importlib.import_module(command_sanitised) - command = command_sanitised - except: - print(''' - Error. Tried importing the pipeline provided as - {} - and - {} - but it did not work. - See trace below and also try - cgatflow --help - for more information. - '''.format(command, command_sanitised) - ) - raise - - #print(dir(module)) - # remove 'cgatflow' from sys.argv - del sys.argv[0] - module.main(sys.argv) - return def main(argv=None): @@ -141,7 +65,7 @@ def main(argv=None): relpath = os.path.abspath("../src") paths = [path, relpath] - + if len(argv) == 1 or argv[1] == "--help" or argv[1] == "-h": pipelines = [] for path in paths: @@ -157,20 +81,12 @@ def main(argv=None): command = argv[1] command = re.sub("-", "_", command) pipeline = "pipeline_{}".format(command) - - try: - (file, pathname, description) = imp.find_module(pipeline, paths) - module = imp.load_module(pipeline, file, pathname, description) - # remove 'cgatflow' from sys.argv - del sys.argv[0] - module.main(sys.argv) - - # If the command is an external pipeline this allows to search for it with - # getExternalPipeline() above. If dependencies are not installed they'll be - # picked up here as well though but should error inside - # getExternalPipeline(). 
- except ImportError: - getExternalPipeline() + + (file, pathname, description) = imp.find_module(pipeline, paths) + module = imp.load_module(pipeline, file, pathname, description) + # remove 'cgatflow' from sys.argv + del sys.argv[0] + module.main(sys.argv) if __name__ == "__main__": sys.exit(main()) From 21a41724e60587a18737c4581f6b1618c2d51c2f Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Mon, 22 Jan 2018 17:01:24 +0000 Subject: [PATCH 16/21] updates/testing --- CGATPipelines/Pipeline/Control.py | 74 ++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 26 deletions(-) diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index e98410ea..2f3b012d 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -85,25 +85,26 @@ def getConfigPaths(): # python /YYYY//XXXX/pipeline_XXXX.py config try: f = sys._getframe(1) - caller = f.f_globals["__file__"] # cgatflow config + #caller = f.f_globals["__file__"] # cgatflow config + caller = f.f_locals["__file__"] # Make it easier to match the name of the command executed so that # the config file can be searched in case there are more than one # ini files found in writeConfig(): caller_name = os.path.basename(os.path.normpath(caller)) - print('try 1', f, caller, caller_name) except KeyError: # The following code only works if something like this function is # present in my_pipeline.py script: # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location - f = sys._getframe(0) # if e.g. call is direct for pipeline_QTL config - caller = inspect.getargvalues(f).locals["__file__"] - cmd_caller = os.path.basename(os.path.normpath(caller)) - print('1st', caller_name, cmd_caller, caller) + f = sys._getframe(2) # if e.g. call is direct for pipeline_QTL config + caller = f.f_globals["__file__"] + #caller = inspect.getargvalues(f).locals["__file__"] + cmd_caller = str(os.path.basename(os.path.normpath(caller))) # As above, save the command called in a separate variable: + if cmd_caller.endswith('.py'): + cmd_caller = cmd_caller.replace('.py', '') caller_name = cmd_caller cmd_caller = importlib.import_module(cmd_caller) caller = cmd_caller.getDir() - print('2nd', caller_name, cmd_caller, caller) #else: # print('''Unable to find path to file being executed. 
Probably because # CGATPipelines and the pipeline that is being executed @@ -118,17 +119,17 @@ def getConfigPaths(): pipeline_path_2 = os.path.dirname(pipeline_path) # CGATPipelines have a "configuration" folder # adding a glob to have a bit more flexibility - general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + - '/*/configuration*'), recursive = True) + general_path = glob.glob(str(os.path.abspath(pipeline_path) + + '/configuratio*'), recursive = True) if not general_path: - general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") + general_path = [os.path.join(os.path.dirname(pipeline_path), + "configuration")] # Add paths to search list: config_paths.extend([pipeline_path, pipeline_path_2]) # Extend separately in case general_path returns more than one file: - config_paths.append(general_path) + config_paths.extend(general_path) - print(config_paths, caller_name) return(config_paths, caller_name) def writeConfigFiles(config_paths, caller_name): @@ -143,8 +144,6 @@ def writeConfigFiles(config_paths, caller_name): # Antonio report_dir = 'pipeline_report' config_files = [] - #print(config_paths) - #print(caller_name) try: os.mkdir(report_dir) # Sphinx config files will be copied here @@ -158,33 +157,51 @@ def writeConfigFiles(config_paths, caller_name): f_count = 0 INI_list = [] for path in config_paths: + path = str(path) if os.path.exists(path) and os.path.isdir(path): for f in os.listdir(os.path.abspath(path)): if fnmatch.fnmatch(f, 'pipeline*ini'): f_count += 1 INI_file = f - INI_list.extend([INI_file]) + INI_list.append(INI_file) if f_count == 1: - config_files = [INI_file] # This is for the pipeline only + config_files = INI_list # This is for the pipeline only elif f_count > 1: + new_count = 0 # Prioritise the file that contains the command called if more than one # ini files are found: for f in INI_list: if caller_name in f: - count += 1 + new_count += 1 INI_file = f - config_files = [INI_file] - if count == 0: + config_files.append(INI_file) + if new_count > 1: E.warn(''' Found several ini files but could not prioritise based on: - {}. - Some pipelines do not require an ini file though, try - without. - '''.format(caller_name)) + {} + as more than one matched. 
+ Using the first one found: + {} + from + {} + '''.format(caller_name, config_files[0], config_files) + ) + elif new_count == 1: + pass + elif new_count == 0: + print(''' + More than one ini file found but none matched + {} + Using the first one found: + {} + from + {} + '''.format(caller_name, config_files[0], config_files) + ) - if f_count == 0: + elif f_count == 0: E.warn(''' No configuration (ini) files found in: {} @@ -196,11 +213,13 @@ def writeConfigFiles(config_paths, caller_name): E.warn('No configuration files found.') else: for dest in config_files: + dest = str(dest) if os.path.exists(dest): E.warn("file `%s` already exists - skipped" % dest) continue for path in config_paths: + path = str(path) src = os.path.join(path, dest) if os.path.exists(src): shutil.copyfile(src, dest) @@ -220,6 +239,7 @@ def writeConfigFiles(config_paths, caller_name): E.warn("file `%s` already exists - skipped" % dest) for path in config_paths: + path = str(path) src = os.path.join(path, dest) if os.path.exists(src): # Put sphinx files in separate dir: @@ -251,6 +271,7 @@ def writeConfigFiles(config_paths, caller_name): # Look for a pipeline report file: f_count = 0 for path in config_paths: + path = str(path) if os.path.exists(path): for f in os.listdir(os.path.abspath(path)): # TO DO: @@ -278,6 +299,7 @@ def writeConfigFiles(config_paths, caller_name): f_count = 0 # Check all the paths and their files given above when searching for config files: for path in config_paths: + path = str(path) if os.path.exists(path): for f in os.listdir(path): # For each file or search term given, match to an existing file: @@ -1237,8 +1259,8 @@ def main(args=sys.argv): printConfigFiles() elif options.pipeline_action == "config": - config_paths = getConfigPaths()[0] - caller_name = getConfigPaths()[1] + config_paths, caller_name = getConfigPaths() + print(config_paths, caller_name) writeConfigFiles(config_paths, caller_name) elif options.pipeline_action == "clone": From cc7535781d0113c6c6579940b9bed7b669573ec4 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Tue, 23 Jan 2018 12:46:16 +0000 Subject: [PATCH 17/21] control.py --- CGATPipelines/Pipeline/Control.py | 32 ++++++++++++------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index 2f3b012d..706853b0 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -86,7 +86,8 @@ def getConfigPaths(): try: f = sys._getframe(1) #caller = f.f_globals["__file__"] # cgatflow config - caller = f.f_locals["__file__"] + # globals will get Control.py + caller = f.f_locals["__file__"] # TO DO: cgatflow # Make it easier to match the name of the command executed so that # the config file can be searched in case there are more than one # ini files found in writeConfig(): @@ -97,14 +98,9 @@ def getConfigPaths(): # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location f = sys._getframe(2) # if e.g. 
call is direct for pipeline_QTL config caller = f.f_globals["__file__"] - #caller = inspect.getargvalues(f).locals["__file__"] - cmd_caller = str(os.path.basename(os.path.normpath(caller))) - # As above, save the command called in a separate variable: - if cmd_caller.endswith('.py'): - cmd_caller = cmd_caller.replace('.py', '') - caller_name = cmd_caller - cmd_caller = importlib.import_module(cmd_caller) - caller = cmd_caller.getDir() + caller_name = os.path.basename(os.path.normpath(caller)) + if caller_name.endswith('.py'): + caller_name = caller_name.replace('.py', '') #else: # print('''Unable to find path to file being executed. Probably because # CGATPipelines and the pipeline that is being executed @@ -117,10 +113,12 @@ def getConfigPaths(): # TO DO: clean this up pipeline_path = os.path.splitext(caller)[0] pipeline_path_2 = os.path.dirname(pipeline_path) - # CGATPipelines have a "configuration" folder - # adding a glob to have a bit more flexibility - general_path = glob.glob(str(os.path.abspath(pipeline_path) + - '/configuratio*'), recursive = True) + # CGATPipelines have a "configuration" folder + # adding a glob to have a bit more flexibility + # TO DO: add max depth to glob recursion: + general_path = glob.glob(str(os.path.abspath(pipeline_path_2) + + '/**/configuration*'), recursive = True) + if not general_path: general_path = [os.path.join(os.path.dirname(pipeline_path), "configuration")] @@ -129,7 +127,7 @@ def getConfigPaths(): config_paths.extend([pipeline_path, pipeline_path_2]) # Extend separately in case general_path returns more than one file: config_paths.extend(general_path) - + print(config_paths, caller_name) return(config_paths, caller_name) def writeConfigFiles(config_paths, caller_name): @@ -157,7 +155,6 @@ def writeConfigFiles(config_paths, caller_name): f_count = 0 INI_list = [] for path in config_paths: - path = str(path) if os.path.exists(path) and os.path.isdir(path): for f in os.listdir(os.path.abspath(path)): if fnmatch.fnmatch(f, 'pipeline*ini'): @@ -219,7 +216,6 @@ def writeConfigFiles(config_paths, caller_name): continue for path in config_paths: - path = str(path) src = os.path.join(path, dest) if os.path.exists(src): shutil.copyfile(src, dest) @@ -239,7 +235,6 @@ def writeConfigFiles(config_paths, caller_name): E.warn("file `%s` already exists - skipped" % dest) for path in config_paths: - path = str(path) src = os.path.join(path, dest) if os.path.exists(src): # Put sphinx files in separate dir: @@ -271,7 +266,6 @@ def writeConfigFiles(config_paths, caller_name): # Look for a pipeline report file: f_count = 0 for path in config_paths: - path = str(path) if os.path.exists(path): for f in os.listdir(os.path.abspath(path)): # TO DO: @@ -299,7 +293,6 @@ def writeConfigFiles(config_paths, caller_name): f_count = 0 # Check all the paths and their files given above when searching for config files: for path in config_paths: - path = str(path) if os.path.exists(path): for f in os.listdir(path): # For each file or search term given, match to an existing file: @@ -1260,7 +1253,6 @@ def main(args=sys.argv): elif options.pipeline_action == "config": config_paths, caller_name = getConfigPaths() - print(config_paths, caller_name) writeConfigFiles(config_paths, caller_name) elif options.pipeline_action == "clone": From a41c6fcf69b0384eea126e14c8822a90f331eb72 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Tue, 23 Jan 2018 13:17:12 +0000 Subject: [PATCH 18/21] control.py for external pipelines --- CGATPipelines/Pipeline/Control.py | 6 - 
CGATPipelines/Pipeline/Control.py.works | 1231 ----------------------- 2 files changed, 1237 deletions(-) delete mode 100644 CGATPipelines/Pipeline/Control.py.works diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index 706853b0..101d3d54 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -93,9 +93,6 @@ def getConfigPaths(): # ini files found in writeConfig(): caller_name = os.path.basename(os.path.normpath(caller)) except KeyError: - # The following code only works if something like this function is - # present in my_pipeline.py script: - # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location f = sys._getframe(2) # if e.g. call is direct for pipeline_QTL config caller = f.f_globals["__file__"] caller_name = os.path.basename(os.path.normpath(caller)) @@ -109,8 +106,6 @@ def getConfigPaths(): # sys.exit() # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified # version would only have pipe_XX/ - # so creating an additional pipeline_path - # TO DO: clean this up pipeline_path = os.path.splitext(caller)[0] pipeline_path_2 = os.path.dirname(pipeline_path) # CGATPipelines have a "configuration" folder @@ -127,7 +122,6 @@ def getConfigPaths(): config_paths.extend([pipeline_path, pipeline_path_2]) # Extend separately in case general_path returns more than one file: config_paths.extend(general_path) - print(config_paths, caller_name) return(config_paths, caller_name) def writeConfigFiles(config_paths, caller_name): diff --git a/CGATPipelines/Pipeline/Control.py.works b/CGATPipelines/Pipeline/Control.py.works deleted file mode 100644 index 87dde9b3..00000000 --- a/CGATPipelines/Pipeline/Control.py.works +++ /dev/null @@ -1,1231 +0,0 @@ -"""Control.py - Command line control for ruffus pipelines -========================================================= - -The functions :func:`writeConfigFiles`, :func:`clean`, -:func:`clonePipeline` and :func:`peekParameters` provide the -functionality for particular pipeline commands. - -:class:`MultiLineFormatter` improves the formatting -of long log messages, while -:class:`LoggingFilterRabbitMQ` intercepts ruffus log -messages and sends event information to a rabbitMQ message exchange -for task process monitoring. - -Reference ---------- - -""" - -import inspect -import json -import logging -import os -import re -import shutil -import subprocess -import sys -import tempfile -import time -import io -import glob -import fnmatch -import importlib - -from multiprocessing.pool import ThreadPool - -# talking to RabbitMQ -try: - import pika - HAS_PIKA = True -except ImportError: - HAS_PIKA = False - -# talking to a cluster -try: - import drmaa - HAS_DRMAA = True -except: -# the following does not work on Travis -#except ImportError or RuntimeError: - HAS_DRMAA = False - -from ruffus import pipeline_printout_graph, pipeline_printout, \ - pipeline_run, ruffus_exceptions, task - - -import CGAT.Experiment as E -import CGAT.IOTools as IOTools -from CGAT import Requirements as Requirements - -from CGATPipelines.Pipeline.Utils import isTest, getCaller, getCallerLocals -from CGATPipelines.Pipeline.Execution import execute, startSession,\ - closeSession -from CGATPipelines.Pipeline.Local import getProjectName, getPipelineName -from CGATPipelines.Pipeline.Parameters import inputValidation -# Set from Pipeline.py -PARAMS = {} - -# global options and arguments - set but currently not -# used as relevant sections are entered into the PARAMS -# dictionary. 
Could be deprecated and removed. -GLOBAL_OPTIONS, GLOBAL_ARGS = None, None - - -def writeConfigFiles(paths): - #pipeline_path, pipeline_path_2, general_path): - '''create default configuration files in `path`. - ''' - # TO DO: I've modified this function with workarounds to make it more - # flexible in order to find an ini file, find a configuration dir and - # copy pre-run sphinx-quickstart files if they exist. - # Other than creating a 'report' dir, it should not change the way it is - # run from CGATPipelines. - # See also bottom of script for changes when calling the 'config' option - # Antonio - #paths = [pipeline_path, pipeline_path_2, general_path] - report_dir = 'pipeline_report' - try: - os.mkdir(report_dir) # Sphinx config files will be copied here - # CGATReport only needs its conf.py to generate the rest - # though - except FileExistsError: - E.warn("directory `%s` already exists" % report_dir) - raise - - # Look for ini file: - f_count = 0 - INI_list = [] - for path in paths: - if os.path.exists(path) and os.path.isdir(path): - for f in os.listdir(os.path.abspath(path)): - if fnmatch.fnmatch(f, 'pipeline*ini'): - f_count += 1 - INI_file = f - INI_list.extend([INI_file]) - - if f_count == 1: - config_files = [INI_file] # This is for the pipeline only - - elif f_count > 1: - # Prioritise the file that contains the command called if more than one - # ini file are found: - for f in INI_list: - if caller_name in f: - INI_file = f - config_files = [INI_file] - else: - if f_count == 0: - print(''' - No configuration (ini) files found in: - {} - '''.format(paths) - ) - else: - print(''' - Found several ini files but could not prioritise based on: - {} - Exiting. - '''.format(caller_name)) - sys.exit() - - # Copy pipeline ini file: - for dest in config_files: - if os.path.exists(dest): - E.warn("file `%s` already exists - skipped" % dest) - continue - - for path in paths: - src = os.path.join(path, dest) - if os.path.exists(src): - shutil.copyfile(src, dest) - E.info("created new configuration file `%s` " % dest) - break - else: - raise ValueError('''default config file for `%s` - not found in - %s - A pipeline cannot be run without this. 
- ''' % (config_files, paths)) - - # Copy Sphinx configuration files, enforce copy of 'conf.py' in case - # CGATReport is used: - dest = 'conf.py' - if os.path.exists(dest): - E.warn("file `%s` already exists - skipped" % dest) - - for path in paths: - src = os.path.join(path, dest) - if os.path.exists(src): - # Put sphinx files in separate dir: - shutil.copyfile(src, os.path.join(report_dir, dest)) - # Create a softlink outside of report_dir dir for CGATReport: - os.symlink(os.path.join(report_dir, dest), str(dest)) - E.info("created new configuration file `%s` " % dest) - break - - else: - # Only warn as pipeline can be run without report: - E.warn('''default config file for `%s` not found in - %s - CGATReport nor Sphinx can be run without this''' % (dest, paths)) - - # If other Sphinx config files are found, copy them if there is a skeleton - # pipeline report to use: - E.info('Looking for additional Sphinx configuration files.') - sphinx_config_files = ['Makefile', - 'make.bat', - '*.rst', - '*.bib', - ] # These are for a sphinx setup, not needed - # with CGATReport - # A 'report_pipeline_*.rst' template is - # searched for below - - # Look for a pipeline report file: - f_count = 0 - for path in paths: - if os.path.exists(path): - for f in os.listdir(os.path.abspath(path)): - # TO DO: - # This pattern matching is particular to - # https://github.com/AntonioJBT/project_quickstart - # Needs to be made more generic - if fnmatch.fnmatch(f, 'report_pipeline_*.rst'): - f_count += 1 - pipeline_report_file = f - - if f_count == 1: - sphinx_config_files.append(pipeline_report_file) - - else: - # Only warn as pipeline can be run without report: - E.warn('''There is no pipeline report file matching - report_pipeline_*.rst - in the directories: - {} - {} - or - {} - Ignore this if you are using CGATReport. - '''.format(pipeline_path, pipeline_path_2, general_path) - ) - - # Copy the files across if they are found: - f_count = 0 - # Check all the paths and their files given above when searching for config files: - for path in paths: - if os.path.exists(path): - for f in os.listdir(path): - # For each file or search term given, match to an existing file: - for dest in sphinx_config_files: - if fnmatch.fnmatch(f, dest): - f_to_copy = f - # If a match is found, walk the cwd to check it's not - # already present: - for root, dirs, files in os.walk('.'): - if f_to_copy in files: - E.warn("file `%s` already exists - skipped" % f_to_copy) - continue - - # If not present, copy the file: - else: - f_count += 1 - src = os.path.join(path, f_to_copy) - if os.path.exists(src): - # Put sphinx files in separate dir: - shutil.copyfile(src, os.path.join(report_dir, - f_to_copy) - ) - E.info("created new configuration file `%s` " - % f_to_copy) - break - if f_count > 0: - pass - else: - E.warn('''No sphinx-quickstart skeleton files such as: - {} - were found - in - {} - Continuing without.'''.format(dest, paths)) - -def printConfigFiles(): - ''' - Print the list of .ini files used to configure the pipeline - along with their associated priorities. - Priority 1 is the highest. - ''' - - filenames = PARAMS['pipeline_ini'] - print("\n List of .ini files used to configure the pipeline") - s = len(filenames) - if s == 0: - print(" No ini files passed!") - elif s >= 1: - print(" %-11s: %s " % ("Priority", "File")) - for f in filenames: - if s == 1: - print(" (highest) %s: %s\n" % (s, f)) - else: - print(" %-11s: %s " % (s, f)) - s -= 1 - - -def clonePipeline(srcdir, destdir=None): - '''clone a pipeline. 
-
-
-def clonePipeline(srcdir, destdir=None):
-    '''clone a pipeline.
-
-    Cloning entails creating a mirror of the source pipeline.
-    Generally, data files are mirrored by linking. Configuration
-    files and the pipeline database will be copied.
-
-    Without modification of any files, building the cloned pipeline in
-    `destdir` should not re-run any commands. However, on deleting
-    selected files, the pipeline should run from the appropriate
-    point. Newly created files will not affect the original pipeline.
-
-    Cloning pipelines permits sharing partial results between
-    pipelines, for example for parameter optimization.
-
-    Arguments
-    ---------
-    srcdir : string
-        Source directory
-    destdir : string
-        Destination directory. If None, use the current directory.
-
-    '''
-
-    if destdir is None:
-        destdir = os.path.curdir
-
-    E.info("cloning pipeline from %s to %s" % (srcdir, destdir))
-
-    copy_files = ("conf.py", "pipeline.ini", "csvdb")
-    ignore_prefix = (
-        "report", "_cache", "export", "tmp", "ctmp",
-        "_static", "_templates")
-
-    def _ignore(p):
-        for x in ignore_prefix:
-            if p.startswith(x):
-                return True
-        return False
-
-    for root, dirs, files in os.walk(srcdir):
-
-        relpath = os.path.relpath(root, srcdir)
-        if _ignore(relpath):
-            continue
-
-        for d in dirs:
-            if _ignore(d):
-                continue
-            dest = os.path.join(os.path.join(destdir, relpath, d))
-            os.mkdir(dest)
-            # touch
-            s = os.stat(os.path.join(root, d))
-            os.utime(dest, (s.st_atime, s.st_mtime))
-
-        for f in files:
-            if _ignore(f):
-                continue
-
-            fn = os.path.join(root, f)
-            dest_fn = os.path.join(destdir, relpath, f)
-            if f in copy_files:
-                shutil.copyfile(fn, dest_fn)
-            else:
-                # realpath resolves links - thus links will be linked to
-                # the original target
-                os.symlink(os.path.realpath(fn),
-                           dest_fn)
-
-
-def clean(files, logfile):
-    '''clean up files given by glob expressions.
-
-    Files are cleaned up by zapping, i.e. the files are set to size
-    0. Links to files are replaced with place-holders.
-
-    Information about the original file is written to `logfile`.
-
-    Arguments
-    ---------
-    files : list
-        List of glob expressions of files to clean up.
-    logfile : string
-        Filename of logfile.
-
-    '''
-    fields = ('st_atime', 'st_blksize', 'st_blocks',
-              'st_ctime', 'st_dev', 'st_gid', 'st_ino',
-              'st_mode', 'st_mtime', 'st_nlink',
-              'st_rdev', 'st_size', 'st_uid')
-
-    dry_run = PARAMS.get("dryrun", False)
-
-    if not dry_run:
-        if not os.path.exists(logfile):
-            outfile = IOTools.openFile(logfile, "w")
-            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
-                          "\t".join(fields))
-        else:
-            outfile = IOTools.openFile(logfile, "a")
-
-    c = E.Counter()
-    for fn in files:
-        c.files += 1
-        if not dry_run:
-            stat, linkdest = IOTools.zapFile(fn)
-            if stat is not None:
-                c.zapped += 1
-                if linkdest is not None:
-                    c.links += 1
-                outfile.write("%s\t%s\t%s\t%s\n" % (
-                    fn,
-                    time.asctime(time.localtime(time.time())),
-                    linkdest,
-                    "\t".join([str(getattr(stat, x)) for x in fields])))
-
-    E.info("zapped: %s" % (c))
-    outfile.close()
-
-    return c
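
clean() delegates the zapping itself to IOTools.zapFile. Assuming zapFile behaves roughly as the docstring describes (truncate in place, report the old stat), a simplified sketch of the operation:

    import os

    def zap_file(path):
        '''Truncate `path` to zero bytes and return its previous os.stat result.
        Sketch only; the real IOTools.zapFile also replaces symlinks with
        place-holders.'''
        info = os.stat(path)
        with open(path, 'w'):
            pass  # opening with mode 'w' truncates the file to size 0
        return info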
-
-
-def peekParameters(workingdir,
-                   pipeline,
-                   on_error_raise=None,
-                   prefix=None,
-                   update_interface=False,
-                   restrict_interface=False):
-    '''peek configuration parameters from an external pipeline.
-
-    As the parameter dictionary is built at runtime, this method
-    executes the pipeline in `workingdir`, dumping its configuration
-    values and reading them into a dictionary.
-
-    If either `pipeline` or `workingdir` are not found, an error is
-    raised. This behaviour can be changed by setting `on_error_raise`
-    to False. In that case, an empty dictionary is returned.
-
-    Arguments
-    ---------
-    workingdir : string
-        Working directory. This is the directory that the pipeline
-        was executed in.
-    pipeline : string
-        Name of the pipeline script. The pipeline is assumed to live
-        in the same directory as the current pipeline.
-    on_error_raise : Bool
-        If set to a boolean, an error will be raised (or not) if there
-        is an error during parameter peeking, for example if
-        `workingdir` cannot be found. If `on_error_raise` is None, it
-        will be set to the default, which is to raise an exception
-        unless the calling script is imported or the option
-        ``--is-test`` has been passed at the command line.
-    prefix : string
-        Add a prefix to all parameters. This is useful if the parameters
-        are added to the configuration dictionary of the calling pipeline.
-    update_interface : bool
-        If True, this method will prefix any options in the
-        ``[interface]`` section with `workingdir`. This allows
-        transparent access to files in the external pipeline.
-    restrict_interface : bool
-        If True, only interface parameters will be imported.
-
-    Returns
-    -------
-    config : dict
-        Dictionary of configuration values.
-
-    '''
-    caller_locals = getCallerLocals()
-
-    # check if we should raise errors
-    if on_error_raise is None:
-        on_error_raise = not isTest() and \
-            "__name__" in caller_locals and \
-            caller_locals["__name__"] == "__main__"
-
-    # patch - if --help or -h in command line arguments,
-    # do not peek as there might be no config file.
-    if "--help" in sys.argv or "-h" in sys.argv:
-        return {}
-
-    # Attempt to locate directory with pipeline source code. This is a
-    # patch as pipelines might be called within the repository
-    # directory or from an installed location
-    dirname = PARAMS["pipelinedir"]
-
-    # called without a directory, use current directory
-    if dirname == "":
-        dirname = os.path.abspath(".")
-    else:
-        # if not exists, assume we want version located
-        # in directory of calling script.
-        if not os.path.exists(dirname):
-            # directory is path of calling script
-            dirname = os.path.dirname(caller_locals['__file__'])
-
-    pipeline = os.path.join(dirname, pipeline)
-    if not os.path.exists(pipeline):
-        if on_error_raise:
-            raise ValueError(
-                "can't find pipeline at %s" % (pipeline))
-        else:
-            return {}
-
-    if workingdir == "":
-        workingdir = os.path.abspath(".")
-
-    # patch for the "config" target - use the default pipeline directory
-    # if a directory is not specified; the working dir is set to "?!"
-    if ("config" in sys.argv or "check" in sys.argv or "clone" in sys.argv) \
-       and workingdir == "?!":
-        workingdir = os.path.join(PARAMS.get("pipelinedir"),
-                                  IOTools.snip(pipeline, ".py"))
-
-    if not os.path.exists(workingdir):
-        if on_error_raise:
-            raise ValueError(
-                "can't find working dir %s" % workingdir)
-        else:
-            return {}
-
-    statement = "python %s -f -v 0 dump" % pipeline
-    process = subprocess.Popen(statement,
-                               cwd=workingdir,
-                               shell=True,
-                               stdin=subprocess.PIPE,
-                               stdout=subprocess.PIPE,
-                               stderr=subprocess.PIPE)
-
-    # process.stdin.close()
-    stdout, stderr = process.communicate()
-    if process.returncode != 0:
-        raise OSError(
-            ("Child was terminated by signal %i: \n"
-             "Statement: %s\n"
-             "The stderr was: \n%s\n"
-             "Stdout: %s") %
-            (-process.returncode, statement, stderr, stdout))
-
-    # subprocess only accepts an encoding argument in py >= 3.6, so
-    # decode here.
-    stdout = stdout.decode("utf-8").splitlines()
-    # remove any log messages
-    stdout = [x for x in stdout if x.startswith("{")]
-    if len(stdout) > 1:
-        raise ValueError("received multiple configurations")
-    dump = json.loads(stdout[0])
-
-    # update interface
-    if update_interface:
-        for key, value in list(dump.items()):
-            if key.startswith("interface"):
-                dump[key] = os.path.join(workingdir, value)
-
-    # keep only interface if so required
-    if restrict_interface:
-        dump = dict([(k, v) for k, v in dump.items()
-                     if k.startswith("interface")])
-
-    # prefix all parameters
-    if prefix is not None:
-        dump = dict([("%s%s" % (prefix, x), y) for x, y in list(dump.items())])
-
-    return dump
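
The dump handshake used by peekParameters() reduces to: run the external pipeline with the `dump` action and keep the single JSON line it prints. A condensed sketch (Python 3.7+ for subprocess.run with capture_output; the pipeline name is hypothetical):

    import json
    import subprocess

    def peek(pipeline, workingdir="."):
        '''Run `python <pipeline> dump` and return its configuration dict.'''
        out = subprocess.run(
            ["python", pipeline, "-f", "-v", "0", "dump"],
            cwd=workingdir, capture_output=True, text=True,
            check=True).stdout
        # the dump action prints one JSON object; skip stray log lines
        lines = [x for x in out.splitlines() if x.startswith("{")]
        return json.loads(lines[0])

    # e.g. peek("pipeline_example.py", "/path/to/run")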
-
-
-class MultiLineFormatter(logging.Formatter):
-    """add indentation for multi-line entries.
-    """
-
-    def format(self, record):
-        s = logging.Formatter.format(self, record)
-        if record.message:
-            header, footer = s.split(record.message)
-            s = s.replace('\n', '\n' + ' ' * len(header))
-        return s
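
MultiLineFormatter pads continuation lines so that multi-line records stay aligned under their header. A minimal usage sketch of the formatter above (the logger name is arbitrary):

    import logging

    handler = logging.StreamHandler()
    handler.setFormatter(MultiLineFormatter(
        '%(asctime)s %(levelname)s %(message)s'))
    log = logging.getLogger("pipeline_example")
    log.addHandler(handler)
    log.warning("first line\nsecond line lines up under the first")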
-
-
-class LoggingFilterRabbitMQ(logging.Filter):
-    """pass event information to a rabbitMQ message queue.
-
-    This is a log filter which detects messages from ruffus_ and sends
-    them to a rabbitMQ message queue.
-
-    A :term:`task` is a ruffus_ decorated function, which will execute
-    one or more :term:`jobs`.
-
-    Valid task/job status:
-
-    update
-       task/job needs updating
-    completed
-       task/job completed successfully
-    failed
-       task/job failed
-    running
-       task/job is running
-    ignore
-       ignore task/job (is up-to-date)
-
-    Arguments
-    ---------
-    ruffus_text : string
-        Log messages from ruffus.pipeline_printout. These are used
-        to collect all tasks that will be executed during pipeline
-        execution.
-    project_name : string
-        Name of the project
-    pipeline_name : string
-        Name of the pipeline
-    host : string
-        RabbitMQ host name
-    exchange : string
-        RabbitMQ exchange name
-
-    """
-
-    def __init__(self, ruffus_text,
-                 project_name,
-                 pipeline_name,
-                 host="localhost",
-                 exchange="ruffus_pipelines"):
-
-        self.project_name = project_name
-        self.pipeline_name = pipeline_name
-        self.exchange = exchange
-
-        # dictionary of jobs to run
-        self.jobs = {}
-        self.tasks = {}
-
-        if not HAS_PIKA:
-            self.connected = False
-            return
-
-        def split_by_job(text):
-            text = "".join(text)
-            job_message = ""
-            # ignore the first entry, which is the docstring
-            for line in text.split(" Job = ")[1:]:
-                try:
-                    # long file names cause additional wrapping and
-                    # additional white-space characters
-                    job_name = re.search(
-                        r"\[.*-> ([^\]]+)\]", line).groups()
-                except AttributeError:
-                    raise AttributeError("could not parse '%s'" % line)
-                job_status = "ignore"
-                if "Job needs update" in line:
-                    job_status = "update"
-
-                yield job_name, job_status, job_message
-
-        def split_by_task(text):
-            block, task_name = [], None
-            task_status = None
-            for line in text.split("\n"):
-                line = line.strip()
-
-                if line.startswith("Tasks which will be run"):
-                    task_status = "update"
-                elif line.startswith("Tasks which are up-to-date"):
-                    task_status = "ignore"
-
-                if line.startswith("Task = "):
-                    if task_name:
-                        yield task_name, task_status, list(split_by_job(block))
-                    block = []
-                    task_name = re.match("Task = (.*)", line).groups()[0]
-                    continue
-                if line:
-                    block.append(line)
-            if task_name:
-                yield task_name, task_status, list(split_by_job(block))
-
-        # create connection
-        try:
-            connection = pika.BlockingConnection(pika.ConnectionParameters(
-                host=host))
-            self.connected = True
-        except pika.exceptions.AMQPConnectionError:
-            self.connected = False
-            return
-
-        self.channel = connection.channel()
-        self.channel.exchange_declare(
-            exchange=self.exchange,
-            type='topic')
-
-        # populate with initial messages
-        for task_name, task_status, jobs in split_by_task(ruffus_text):
-            if task_name.startswith("(mkdir"):
-                continue
-
-            to_run = 0
-            for job_name, job_status, job_message in jobs:
-                self.jobs[job_name] = (task_name, job_name)
-                if job_status == "update":
-                    to_run += 1
-
-            self.tasks[task_name] = [task_status, len(jobs),
-                                     len(jobs) - to_run]
-            self.send_task(task_name)
-
-    def send_task(self, task_name):
-        '''send task status.'''
-
-        if not self.connected:
-            return
-
-        task_status, task_total, task_completed = self.tasks[task_name]
-
-        data = {}
-        data['created_at'] = time.time()
-        data['pipeline'] = self.pipeline_name
-        data['task_name'] = task_name
-        data['task_status'] = task_status
-        data['task_total'] = task_total
-        data['task_completed'] = task_completed
-
-        key = "%s.%s.%s" % (self.project_name, self.pipeline_name, task_name)
-        try:
-            self.channel.basic_publish(exchange=self.exchange,
-                                       routing_key=key,
-                                       body=json.dumps(data))
-        except pika.exceptions.ConnectionClosed:
-            E.warn("could not send message - connection closed")
-        except Exception as e:
-            E.warn("could not send message: %s" % str(e))
-
-    def send_error(self, task_name, job, error=None, msg=None):
-
-        if not self.connected:
-            return
-
-        try:
-            task_status, task_total, task_completed = self.tasks[task_name]
-        except KeyError:
-            E.warn("could not get task information for %s, no message sent" %
-                   task_name)
-            return
-
-        data = {}
-        data['created_at'] = time.time()
-        data['pipeline'] = self.pipeline_name
-        data['task_name'] = task_name
-        data['task_status'] = 'failed'
-        data['task_total'] = task_total
-        data['task_completed'] = task_completed
-
-        key = "%s.%s.%s" % (self.project_name, self.pipeline_name, task_name)
-
-        try:
-            self.channel.basic_publish(exchange=self.exchange,
-                                       routing_key=key,
-                                       body=json.dumps(data))
-        except pika.exceptions.ConnectionClosed:
-            E.warn("could not send message - connection closed")
-        except Exception as e:
-            E.warn("could not send message: %s" % str(e))
-
-    def filter(self, record):
-
-        if not self.connected:
-            return True
-
-        # filter ruffus logging messages
-        if record.filename.endswith("task.py"):
-            try:
-                before, task_name = record.msg.strip().split(" = ")
-            except ValueError:
-                return True
-
-            # ignore the mkdir, etc. tasks
-            if task_name not in self.tasks:
-                return True
-
-            if before == "Task enters queue":
-                self.tasks[task_name][0] = "running"
-            elif before == "Completed Task":
-                self.tasks[task_name][0] = "completed"
-            elif before == "Uptodate Task":
-                self.tasks[task_name][0] = "uptodate"
-            else:
-                return True
-
-            # send the new task status out
-            self.send_task(task_name)
-
-        return True
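
The publish path used by send_task() above is plain pika. A stripped-down sketch, assuming the older pika API implied by this code (where exchange_declare still takes `type`; it was renamed `exchange_type` in pika 1.0):

    import json
    import time
    import pika  # third-party; only used when HAS_PIKA is True

    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host="localhost"))
    channel = connection.channel()
    channel.exchange_declare(exchange="ruffus_pipelines", type="topic")
    channel.basic_publish(
        exchange="ruffus_pipelines",
        routing_key="project.pipeline_example.full",  # hypothetical names
        body=json.dumps({"created_at": time.time(),
                         "task_name": "full",
                         "task_status": "running"}))
    connection.close()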
-
-
-USAGE = '''
-usage: %prog [OPTIONS] [CMD] [target]
-
-Execute pipeline %prog.
-
-Commands can be any of the following
-
-make
-   run all tasks required to build *target*
-
-show
-   show tasks required to build *target* without executing them
-
-plot
-   plot image (using inkscape) of pipeline state for *target*
-
-debug [args]
-   debug a method using the supplied arguments. The method
-   in the pipeline is run without checking any dependencies.
-
-config
-   write new configuration files pipeline.ini, sphinxreport.ini and conf.py
-   with default values
-
-dump
-   write pipeline configuration to stdout
-
-printconfig
-   write pipeline configuration to stdout in a user-friendly way so
-   it is easier to debug pipeline parameters
-
-touch
-   touch files only, do not run
-
-regenerate
-   regenerate the ruffus checkpoint file
-
-check
-   check if requirements (external tool dependencies) are satisfied.
-
-clone
-   create a clone of a pipeline in the current
-   directory. The cloning process aims to use soft linking to files
-   (not directories) as much as possible. Time stamps are
-   preserved. Cloning is useful if a pipeline needs to be re-run from
-   a certain point but the original pipeline should be preserved.
-
-'''
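
For orientation, a typical session against a hypothetical pipeline_example.py would be:

    python pipeline_example.py config        # write default pipeline.ini / conf.py
    python pipeline_example.py show full     # preview tasks for target 'full'
    python pipeline_example.py make full     # execute them
    python pipeline_example.py printconfig   # inspect the resolved parameters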
type="string", - help="RabbitMQ exchange to send log messages to " - "[default=%default].") - - parser.add_option("--rabbitmq-host", dest="rabbitmq_host", - type="string", - help="RabbitMQ host to send log messages to " - "[default=%default].") - - parser.add_option("--input-validation", dest="input_validation", - action="store_true", - help="perform input validation before starting " - "[default=%default].") - - parser.set_defaults( - pipeline_action=None, - pipeline_format="svg", - pipeline_targets=[], - multiprocess=40, - logfile="pipeline.log", - dry_run=False, - force=False, - log_exceptions=False, - exceptions_terminate_immediately=False, - debug=False, - variables_to_set=[], - is_test=False, - ruffus_checksums_level=0, - rabbitmq_host="saruman", - rabbitmq_exchange="ruffus_pipelines", - input_validation=False) - - (options, args) = E.Start(parser, - add_cluster_options=True) - - GLOBAL_OPTIONS, GLOBAL_ARGS = options, args - E.info("Started in: %s" % PARAMS.get("workingdir")) - # At this point, the PARAMS dictionary has already been - # built. It now needs to be updated with selected command - # line options as these should always take precedence over - # configuration files. - - PARAMS["dryrun"] = options.dry_run - PARAMS["input_validation"] = options.input_validation - - # use cli_cluster_* keys in PARAMS to ensure highest priority - # of cluster_* options passed with the command-line - if options.cluster_memory_default is not None: - PARAMS["cli_cluster_memory_default"] = options.cluster_memory_default - PARAMS["cluster_memory_default"] = options.cluster_memory_default - if options.cluster_memory_resource is not None: - PARAMS["cli_cluster_memory_resource"] = options.cluster_memory_resource - PARAMS["cluster_memory_resource"] = options.cluster_memory_resource - if options.cluster_num_jobs is not None: - PARAMS["cli_cluster_num_jobs"] = options.cluster_num_jobs - PARAMS["cluster_num_jobs"] = options.cluster_num_jobs - if options.cluster_options is not None: - PARAMS["cli_cluster_options"] = options.cluster_options - PARAMS["cluster_options"] = options.cluster_options - if options.cluster_parallel_environment is not None: - PARAMS["cli_cluster_parallel_environment"] = options.cluster_parallel_environment - PARAMS["cluster_parallel_environment"] = options.cluster_parallel_environment - if options.cluster_priority is not None: - PARAMS["cli_cluster_priority"] = options.cluster_priority - PARAMS["cluster_priority"] = options.cluster_priority - if options.cluster_queue is not None: - PARAMS["cli_cluster_queue"] = options.cluster_queue - PARAMS["cluster_queue"] = options.cluster_queue - if options.cluster_queue_manager is not None: - PARAMS["cli_cluster_queue_manager"] = options.cluster_queue_manager - PARAMS["cluster_queue_manager"] = options.cluster_queue_manager - - PARAMS["ruffus_checksums_level"] = options.ruffus_checksums_level - - for variables in options.variables_to_set: - variable, value = variables.split("=") - PARAMS[variable.strip()] = IOTools.str2val(value.strip()) - - if args: - options.pipeline_action = args[0] - if len(args) > 1: - options.pipeline_targets.extend(args[1:]) - - # see inputValidation function in Parameters.py - if options.input_validation: - inputValidation(PARAMS, sys.argv[0]) - - if options.pipeline_action == "check": - counter, requirements = Requirements.checkRequirementsFromAllModules() - for requirement in requirements: - E.info("\t".join(map(str, requirement))) - E.info("version check summary: %s" % str(counter)) - E.Stop() - return - - elif 
-    elif options.pipeline_action == "debug":
-        # create the session proxy
-        startSession()
-
-        method_name = options.pipeline_targets[0]
-        caller = getCaller()
-        method = getattr(caller, method_name)
-        method(*options.pipeline_targets[1:])
-
-    elif options.pipeline_action in ("make", "show", "svg", "plot",
-                                     "touch", "regenerate"):
-
-        # set up extra file logger
-        handler = logging.FileHandler(filename=options.logfile,
-                                      mode="a")
-        handler.setFormatter(
-            MultiLineFormatter(
-                '%(asctime)s %(levelname)s %(module)s.%(funcName)s.%(lineno)d %(message)s'))
-        logger = logging.getLogger()
-        logger.addHandler(handler)
-        messenger = None
-
-        try:
-            if options.pipeline_action == "make":
-
-                # get tasks to be done. This essentially replicates
-                # the state information within ruffus.
-                stream = io.StringIO()
-                pipeline_printout(
-                    stream,
-                    options.pipeline_targets,
-                    verbose=5,
-                    checksum_level=options.ruffus_checksums_level)
-
-                messenger = LoggingFilterRabbitMQ(
-                    stream.getvalue(),
-                    project_name=getProjectName(),
-                    pipeline_name=getPipelineName(),
-                    host=options.rabbitmq_host,
-                    exchange=options.rabbitmq_exchange)
-
-                logger.addFilter(messenger)
-
-                if not options.without_cluster and HAS_DRMAA:
-                    global task
-                    # use threading instead of multiprocessing in order to
-                    # limit the number of concurrent jobs by using the
-                    # GIL
-                    #
-                    # Note that threading might cause problems with rpy.
-                    task.Pool = ThreadPool
-
-                    # create the session proxy
-                    startSession()
-
-                #
-                # make sure we are not logging at the same time in
-                # different processes
-                #
-                # session_mutex = manager.Lock()
-                E.info(E.GetHeader())
-                E.info("code location: %s" % PARAMS["pipeline_scriptsdir"])
-                E.info("Working directory is: %s" % PARAMS["workingdir"])
-
-                pipeline_run(
-                    options.pipeline_targets,
-                    multiprocess=options.multiprocess,
-                    logger=logger,
-                    verbose=options.loglevel,
-                    log_exceptions=options.log_exceptions,
-                    exceptions_terminate_immediately=options.exceptions_terminate_immediately,
-                    checksum_level=options.ruffus_checksums_level,
-                )
-
-                E.info(E.GetFooter())
-
-                closeSession()
-
-            elif options.pipeline_action == "show":
-                pipeline_printout(
-                    options.stdout,
-                    options.pipeline_targets,
-                    verbose=options.loglevel,
-                    checksum_level=options.ruffus_checksums_level)
-
-            elif options.pipeline_action == "touch":
-                pipeline_run(
-                    options.pipeline_targets,
-                    touch_files_only=True,
-                    verbose=options.loglevel,
-                    checksum_level=options.ruffus_checksums_level)
-
-            elif options.pipeline_action == "regenerate":
-                pipeline_run(
-                    options.pipeline_targets,
-                    touch_files_only=options.ruffus_checksums_level,
-                    verbose=options.loglevel)
-
-            elif options.pipeline_action == "svg":
-                pipeline_printout_graph(
-                    options.stdout.buffer,
-                    options.pipeline_format,
-                    options.pipeline_targets,
-                    checksum_level=options.ruffus_checksums_level)
-
-            elif options.pipeline_action == "plot":
-                outf, filename = tempfile.mkstemp()
-                pipeline_printout_graph(
-                    os.fdopen(outf, "wb"),
-                    options.pipeline_format,
-                    options.pipeline_targets,
-                    checksum_level=options.ruffus_checksums_level)
-                execute("inkscape %s" % filename)
-                os.unlink(filename)
-
-        except ruffus_exceptions.RethrownJobError as value:
-
-            if not options.debug:
-                E.error("%i tasks with errors, please see summary below:" %
-                        len(value.args))
-                for idx, e in enumerate(value.args):
-                    task, job, error, msg, traceback = e
-
-                    if task is None:
-                        # this seems to be errors originating within ruffus
-                        # such as a missing dependency
-                        # msg then contains a RethrownJobError
-                        msg = str(msg)
-                        pass
-                    else:
-                        task = re.sub("__main__.", "", task)
-                        job = re.sub(r"\s", "", job)
-
-                    if messenger:
-                        messenger.send_error(task, job, error, msg)
-
-                    # display only single line messages
-                    if len([x for x in msg.split("\n") if x != ""]) > 1:
-                        msg = ""
-
-                    E.error("%i: Task=%s Error=%s %s: %s" %
-                            (idx, task, error, job, msg))
-
-                E.error("full traceback is in %s" % options.logfile)
-
-                # write the full traceback to the log file only, by removing
-                # the stdout handler
-                lhStdout = logger.handlers[0]
-                logger.removeHandler(lhStdout)
-                logger.error("start of error messages")
-                logger.error(value)
-                logger.error("end of error messages")
-                logger.addHandler(lhStdout)
-
-                # raise error
-                raise ValueError(
-                    "pipeline failed with %i errors" % len(value.args))
-            else:
-                raise
-
-    elif options.pipeline_action == "dump":
-        print(json.dumps(PARAMS))
-
-    elif options.pipeline_action == "printconfig":
-        print("Printing out pipeline parameters: ")
-        for k in sorted(PARAMS):
-            print(k, "=", PARAMS[k])
-        printConfigFiles()
-
-    elif options.pipeline_action == "config":
-        # (Antonio) I've modified this section, see the explanation and
-        # changes in the writeConfigFiles function above.
-        config_paths = []
-        try:
-            f = sys._getframe(1)
-            caller = inspect.getargvalues(f).locals["__file__"]
-            # Make it easier to match the name of the command executed so
-            # that the config file can be searched for in case more than one
-            # ini file is found in writeConfigFiles():
-            # Making it global, check if there is a better way:
-            global caller_name
-            caller_name = os.path.basename(os.path.normpath(caller))
-        except KeyError:
-            # The following only works if something like this function is
-            # present in the my_pipeline.py script:
-            # http://stackoverflow.com/questions/4519127/setuptools-package-data-folder-location
-            try:
-                f = sys._getframe(2)
-                caller = inspect.getargvalues(f).locals["__file__"]
-                cmd_caller = os.path.basename(os.path.normpath(caller))
-                # As above, save the command called in a separate variable:
-                caller_name = cmd_caller
-                cmd_caller = importlib.import_module(cmd_caller)
-                caller = cmd_caller.getDir()
-            except (KeyError, ImportError, AttributeError):
-                print('''Unable to find the path to the file being executed,
-                      probably because CGATPipelines and the pipeline being
-                      executed cannot figure out where each other lives.
-                      Raise an issue on GitHub if possible.
-                      Exiting.''')
-                sys.exit()
-
-        # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified
-        # version would only have pipe_XX/,
-        # so create an additional pipeline_path.
-        # TO DO: clean this up
-        pipeline_path = os.path.splitext(caller)[0]
-        pipeline_path_2 = os.path.dirname(pipeline_path)
-        # CGATPipelines have a "configuration" folder;
-        # adding a glob to have a bit more flexibility:
-        general_path = glob.glob(str(os.path.abspath(pipeline_path_2) +
-                                 '/**/configuration*'), recursive=True)
-
-        if not general_path:
-            general_path = [os.path.join(os.path.dirname(pipeline_path),
-                                         "configuration")]
-
-        config_paths.extend([pipeline_path, pipeline_path_2])
-        # Extend separately in case general_path returns more than one path:
-        config_paths.extend(general_path)
-        writeConfigFiles(config_paths)
-
-    elif options.pipeline_action == "clone":
-        clonePipeline(options.pipeline_targets[0])
-
-    else:
-        raise ValueError("unknown pipeline action %s" %
-                         options.pipeline_action)
-
-    E.Stop()
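
The recursive configuration lookup in the config section above relies only on glob. A compact sketch of the search order (the pipe_XX layout is the assumption stated in the comments, not a guarantee):

    import glob
    import os

    def find_configuration_dirs(pipeline_path):
        '''Return configuration* directories below the pipeline source tree,
        falling back to the conventional ./configuration sibling.'''
        base = os.path.abspath(os.path.dirname(pipeline_path))
        hits = glob.glob(os.path.join(base, '**', 'configuration*'),
                         recursive=True)
        return hits or [os.path.join(base, 'configuration')]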
if options["cluster_options"]: - if "mem" not in options["cluster_options"]: - spec.append("%(cluster_options)s") - elif "mem" in options["cluster_options"]: + conds = ('mem' in options["cluster_options"], + 'ncpus' in options["cluster_options"], + 'select' in options["cluster_options"] + ) + if any(conds): spec = ["-N %s" % job_name[0:15]] spec.append("%(cluster_options)s") - - # if process has multiple threads, use a parallel environment: - # TO DO: error in fastqc build_report, var referenced before assignment. - # For now adding to workaround: - if 'job_threads' in options: - job_threads = options["job_threads"] - else: - job_threads = 1 - - multithread = 'job_threads' in options and options['job_threads'] > 1 - if multithread: - # TO DO 'select=1' determines de number of nodes. Should go in a config file. - # mem is per node and maximum memory - # Site dependent but in general setting '#PBS -l select=NN:ncpus=NN:mem=NN{gb|mb}' - # is sufficient for parallel jobs (OpenMP, MPI). - # Also architecture dependent, jobs could be hanging if resource doesn't exist. - # TO DO: Kill if long waiting time? - spec = ["-N %s" % job_name[0:15], - "-l select=1:ncpus=%s:mem=%s" % (job_threads, job_memory)] - - if options["cluster_options"]: - if "mem" not in options["cluster_options"]: - spec.append("%(cluster_options)s") - - elif "mem" in options["cluster_options"]: - raise ValueError('''mem resource specified twice, check ~/.cgat config file, - ini files, command line options, etc. - ''') + else: + spec.append("%(cluster_options)s") if "cluster_pe_queue" in options and multithread: - spec.append( - "-q %(cluster_pe_queue)s") + spec.append("-q %(cluster_pe_queue)s") elif options['cluster_queue'] != "NONE": spec.append("-q %(cluster_queue)s") # TO DO: sort out in Parameters.py to allow none values for configparser: diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index 101d3d54..4be0abf5 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -147,27 +147,24 @@ def writeConfigFiles(config_paths, caller_name): # Look for ini file: f_count = 0 - INI_list = [] for path in config_paths: if os.path.exists(path) and os.path.isdir(path): for f in os.listdir(os.path.abspath(path)): if fnmatch.fnmatch(f, 'pipeline*ini'): f_count += 1 - INI_file = f - INI_list.append(INI_file) + config_files.append(f) if f_count == 1: - config_files = INI_list # This is for the pipeline only + pass elif f_count > 1: new_count = 0 # Prioritise the file that contains the command called if more than one # ini files are found: - for f in INI_list: + for f in config_files: if caller_name in f: new_count += 1 - INI_file = f - config_files.append(INI_file) + config_files.append(f) if new_count > 1: E.warn(''' Found several ini files but could not prioritise based on: @@ -182,6 +179,7 @@ def writeConfigFiles(config_paths, caller_name): elif new_count == 1: pass elif new_count == 0: + pass print(''' More than one ini file found but none matched {} From 0e760fef9b1933e6c36993d156d5f40fbb31dc46 Mon Sep 17 00:00:00 2001 From: AntonioJBT Date: Mon, 29 Jan 2018 11:53:05 +0000 Subject: [PATCH 20/21] reverting Control.py --- CGATPipelines/Pipeline/Control.py | 274 +++--------------------------- 1 file changed, 25 insertions(+), 249 deletions(-) diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py index 4be0abf5..fabb82a4 100644 --- a/CGATPipelines/Pipeline/Control.py +++ b/CGATPipelines/Pipeline/Control.py @@ -15,6 +15,7 @@ --------- """ + import 
From 0e760fef9b1933e6c36993d156d5f40fbb31dc46 Mon Sep 17 00:00:00 2001
From: AntonioJBT
Date: Mon, 29 Jan 2018 11:53:05 +0000
Subject: [PATCH 20/21] reverting Control.py

---
 CGATPipelines/Pipeline/Control.py | 274 +++-----------------------------
 1 file changed, 25 insertions(+), 249 deletions(-)

diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py
index 4be0abf5..fabb82a4 100644
--- a/CGATPipelines/Pipeline/Control.py
+++ b/CGATPipelines/Pipeline/Control.py
@@ -15,6 +15,7 @@
 ---------
 """
+
 import inspect
 import json
 import logging
@@ -26,9 +27,6 @@
 import tempfile
 import time
 import io
-import glob
-import fnmatch
-import importlib
 
 from multiprocessing.pool import ThreadPool
 
@@ -70,255 +68,29 @@
 GLOBAL_OPTIONS, GLOBAL_ARGS = None, None
 
 
-def getConfigPaths():
-    '''
-    Search the current and installation paths where the configuration files
-    live for the pipeline being called.
-    '''
-    # (Antonio) I've modified this section, see the explanation and changes
-    # in the writeConfigFiles function above.
-    config_paths = []
-    # Get the name of the pipeline being called.
-    # This could be:
-    # cgatflow readqc config
-    # pipeline_QTL config
-    # python /YYYY//XXXX/pipeline_XXXX.py config
-    try:
-        f = sys._getframe(1)
-        # caller = f.f_globals["__file__"]  # cgatflow config
-        # globals will get Control.py
-        caller = f.f_locals["__file__"]  # TO DO: cgatflow
-        # Make it easier to match the name of the command executed so that
-        # the config file can be searched for in case more than one
-        # ini file is found in writeConfig():
-        caller_name = os.path.basename(os.path.normpath(caller))
-    except KeyError:
-        f = sys._getframe(2)  # if e.g. the call is direct, as for pipeline_QTL config
-        caller = f.f_globals["__file__"]
-        caller_name = os.path.basename(os.path.normpath(caller))
-    if caller_name.endswith('.py'):
-        caller_name = caller_name.replace('.py', '')
-    # else:
-    #     print('''Unable to find path to file being executed. Probably because
-    #           CGATPipelines and the pipeline that is being executed
-    #           cannot figure out where each other lives. Raise an issue in
-    #           GitHub if possible. Exiting.''')
-    #     sys.exit()
-    # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified
-    # version would only have pipe_XX/
-    pipeline_path = os.path.splitext(caller)[0]
-    pipeline_path_2 = os.path.dirname(pipeline_path)
-    # CGATPipelines have a "configuration" folder;
-    # adding a glob to have a bit more flexibility.
-    # TO DO: add max depth to glob recursion:
-    general_path = glob.glob(str(os.path.abspath(pipeline_path_2) +
-                             '/**/configuration*'), recursive=True)
-
-    if not general_path:
-        general_path = [os.path.join(os.path.dirname(pipeline_path),
-                                     "configuration")]
-
-    # Add paths to the search list:
-    config_paths.extend([pipeline_path, pipeline_path_2])
-    # Extend separately in case general_path returns more than one path:
-    config_paths.extend(general_path)
-    return(config_paths, caller_name)
-
-def writeConfigFiles(config_paths, caller_name):
+def writeConfigFiles(pipeline_path, general_path):
     '''create default configuration files in `path`.
     '''
-    # TO DO: I've modified this function with workarounds to make it more
-    # flexible in order to find an ini file, find a configuration dir and
-    # copy pre-run sphinx-quickstart files if they exist.
-    # Other than creating a 'report' dir, it should not change the way it is
-    # run from CGATPipelines.
-    # See also getConfigPaths() above; these run when calling the 'config'
-    # option.
-    # Antonio
-    report_dir = 'pipeline_report'
-    config_files = []
-
-    try:
-        os.mkdir(report_dir)  # Sphinx config files will be copied here;
-                              # CGATReport only needs its conf.py to
-                              # generate the rest though
-    except FileExistsError:
-        E.warn("directory `%s` already exists" % report_dir)
-        raise
-
-    # Look for ini file:
-    f_count = 0
-    for path in config_paths:
-        if os.path.exists(path) and os.path.isdir(path):
-            for f in os.listdir(os.path.abspath(path)):
-                if fnmatch.fnmatch(f, 'pipeline*ini'):
-                    f_count += 1
-                    config_files.append(f)
-
-    if f_count == 1:
-        pass
-
-    elif f_count > 1:
-        new_count = 0
-        # Prioritise the file that matches the command called if more than
-        # one ini file is found:
-        for f in config_files:
-            if caller_name in f:
-                new_count += 1
-                config_files.append(f)
-        if new_count > 1:
-            E.warn('''
-                   Found several ini files but could not prioritise based on:
-                   {}
-                   as more than one matched.
-                   Using the first one found:
-                   {}
-                   from
-                   {}
-                   '''.format(caller_name, config_files[0], config_files)
-                   )
-        elif new_count == 1:
-            pass
-        elif new_count == 0:
-            pass
-            print('''
-                  More than one ini file found but none matched
-                  {}
-                  Using the first one found:
-                  {}
-                  from
-                  {}
-                  '''.format(caller_name, config_files[0], config_files)
-                  )
-
-    elif f_count == 0:
-        E.warn('''
-               No configuration (ini) files found in:
-               {}
-               '''.format(config_paths)
-               )
-
-    # Copy pipeline ini file:
-    if not config_files:
-        E.warn('No configuration files found.')
-    else:
-        for dest in config_files:
-            dest = str(dest)
-            if os.path.exists(dest):
-                E.warn("file `%s` already exists - skipped" % dest)
-                continue
-            for path in config_paths:
-                src = os.path.join(path, dest)
-                if os.path.exists(src):
-                    shutil.copyfile(src, dest)
-                    E.info("created new configuration file `%s` " % dest)
-                    break
-            else:
-                raise ValueError('''default config file for `%s`
-                                 not found in
-                                 %s
-                                 A pipeline cannot be run without this.
-                                 ''' % (config_files, config_paths))
-
-    # Copy Sphinx configuration files, enforce copy of 'conf.py' in case
-    # CGATReport is used:
-    dest = 'conf.py'
-    if os.path.exists(dest):
-        E.warn("file `%s` already exists - skipped" % dest)
-
-    for path in config_paths:
-        src = os.path.join(path, dest)
-        if os.path.exists(src):
-            # Put sphinx files in separate dir:
-            shutil.copyfile(src, os.path.join(report_dir, dest))
-            # Create a softlink outside of report_dir dir for CGATReport:
-            os.symlink(os.path.join(report_dir, dest), str(dest))
-            E.info("created new configuration file `%s` " % dest)
-            break
+    paths = [pipeline_path, general_path]
 
-    else:
-        # Only warn as pipeline can be run without report:
-        E.warn('''default config file for `%s` not found in
-               %s
-               CGATReport nor Sphinx can be run without this''' % (dest,
-               config_paths))
-
-    # If other Sphinx config files are found, copy them if there is a skeleton
-    # pipeline report to use:
-    E.info('Looking for additional Sphinx configuration files.')
-    sphinx_config_files = ['Makefile',
-                           'make.bat',
-                           '*.rst',
-                           '*.bib',
-                           ]  # These are for a sphinx setup, not needed
-                              # with CGATReport
-                              # A 'report_pipeline_*.rst' template is
-                              # searched for below
-
-    # Look for a pipeline report file:
-    f_count = 0
-    for path in config_paths:
-        if os.path.exists(path):
-            for f in os.listdir(os.path.abspath(path)):
-                # TO DO:
-                # This pattern matching is particular to
-                # https://github.com/AntonioJBT/project_quickstart
-                # Needs to be made more generic
-                if fnmatch.fnmatch(f, 'report_pipeline_*.rst'):
-                    f_count += 1
-                    pipeline_report_file = f
-
-    if f_count == 1:
-        sphinx_config_files.append(pipeline_report_file)
+    config_files = ['pipeline.ini', 'conf.py']
 
-    else:
-        # Only warn as pipeline can be run without report:
-        E.warn('''There is no pipeline report file matching
-               report_pipeline_*.rst
-               in the directories:
-               {}
-               Ignore this if you are using CGATReport.
-               '''.format(config_paths)
-               )
-
-    # Copy the files across if they are found:
-    f_count = 0
-    # Check all the paths and their files given above when searching for config files:
-    for path in config_paths:
-        if os.path.exists(path):
-            for f in os.listdir(path):
-                # For each file or search term given, match to an existing file:
-                for dest in sphinx_config_files:
-                    if fnmatch.fnmatch(f, dest):
-                        f_to_copy = f
-                        # If a match is found, walk the cwd to check it's not
-                        # already present:
-                        for root, dirs, files in os.walk('.'):
-                            if f_to_copy in files:
-                                E.warn("file `%s` already exists - skipped" % f_to_copy)
-                                continue
-
-                            # If not present, copy the file:
-                            else:
-                                f_count += 1
-                                src = os.path.join(path, f_to_copy)
-                                if os.path.exists(src):
-                                    # Put sphinx files in separate dir:
-                                    shutil.copyfile(src, os.path.join(report_dir,
-                                                                      f_to_copy)
-                                                    )
-                                    E.info("created new configuration file `%s` "
-                                           % f_to_copy)
-                                break
-    if f_count > 0:
-        pass
-    else:
-        E.warn('''No sphinx-quickstart skeleton files such as:
-               {}
-               were found
-               in
-               {}
-               Continuing without.'''.format(dest, config_paths))
+    for dest in config_files:
+        if os.path.exists(dest):
+            E.warn("file `%s` already exists - skipped" % dest)
+            continue
+
+        for path in paths:
+            src = os.path.join(path, dest)
+            if os.path.exists(src):
+                shutil.copyfile(src, dest)
+                E.info("created new configuration file `%s` " % dest)
+                break
+        else:
+            raise ValueError(
+                "default config file for `%s` not found in %s" %
+                (config_files, paths))
 
 
 def printConfigFiles():
     '''
@@ -1244,8 +1016,12 @@
         printConfigFiles()
 
     elif options.pipeline_action == "config":
-        config_paths, caller_name = getConfigPaths()
-        writeConfigFiles(config_paths, caller_name)
+        f = sys._getframe(1)
+        caller = f.f_globals["__file__"]
+        pipeline_path = os.path.splitext(caller)[0]
+        general_path = os.path.join(os.path.dirname(pipeline_path),
+                                    "configuration")
+        writeConfigFiles(pipeline_path, general_path)
 
     elif options.pipeline_action == "clone":
         clonePipeline(options.pipeline_targets[0])
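
Both reverts remove the frame introspection used to discover the calling pipeline. For reference, the trick in isolation (a sketch; how many frames to climb depends on how the pipeline is invoked):

    import inspect
    import os

    def caller_file(depth=1):
        '''Return the __file__ of the caller `depth` frames up, if present.'''
        frame = inspect.stack()[depth].frame
        return os.path.abspath(frame.f_globals.get("__file__", "<unknown>"))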
From 36581996e3624af9a88eb6a5293730b1576f16a8 Mon Sep 17 00:00:00 2001
From: Sebastian Luna Valero
Date: Mon, 29 Jan 2018 15:26:19 +0000
Subject: [PATCH 21/21] Revert Control.py to master version

---
 CGATPipelines/Pipeline/Control.py        | 274 +++----------------------
 CGATPipelines/configuration/pipeline.ini |   7 +-
 2 files changed, 29 insertions(+), 252 deletions(-)

diff --git a/CGATPipelines/Pipeline/Control.py b/CGATPipelines/Pipeline/Control.py
index 4be0abf5..fabb82a4 100644
--- a/CGATPipelines/Pipeline/Control.py
+++ b/CGATPipelines/Pipeline/Control.py
@@ -15,6 +15,7 @@
 ---------
 """
+
 import inspect
 import json
 import logging
@@ -26,9 +27,6 @@
 import tempfile
 import time
 import io
-import glob
-import fnmatch
-import importlib
 
 from multiprocessing.pool import ThreadPool
 
@@ -70,255 +68,29 @@
 GLOBAL_OPTIONS, GLOBAL_ARGS = None, None
 
 
-def getConfigPaths():
-    '''
-    Search the current and installation paths where the configuration files
-    live for the pipeline being called.
-    '''
-    # (Antonio) I've modified this section, see the explanation and changes
-    # in the writeConfigFiles function above.
-    config_paths = []
-    # Get the name of the pipeline being called.
-    # This could be:
-    # cgatflow readqc config
-    # pipeline_QTL config
-    # python /YYYY//XXXX/pipeline_XXXX.py config
-    try:
-        f = sys._getframe(1)
-        # caller = f.f_globals["__file__"]  # cgatflow config
-        # globals will get Control.py
-        caller = f.f_locals["__file__"]  # TO DO: cgatflow
-        # Make it easier to match the name of the command executed so that
-        # the config file can be searched for in case more than one
-        # ini file is found in writeConfig():
-        caller_name = os.path.basename(os.path.normpath(caller))
-    except KeyError:
-        f = sys._getframe(2)  # if e.g. the call is direct, as for pipeline_QTL config
-        caller = f.f_globals["__file__"]
-        caller_name = os.path.basename(os.path.normpath(caller))
-    if caller_name.endswith('.py'):
-        caller_name = caller_name.replace('.py', '')
-    # else:
-    #     print('''Unable to find path to file being executed. Probably because
-    #           CGATPipelines and the pipeline that is being executed
-    #           cannot figure out where each other lives. Raise an issue in
-    #           GitHub if possible. Exiting.''')
-    #     sys.exit()
-    # CGATPipelines have a pipe_XX/pipe_XX hierarchy, but a simplified
-    # version would only have pipe_XX/
-    pipeline_path = os.path.splitext(caller)[0]
-    pipeline_path_2 = os.path.dirname(pipeline_path)
-    # CGATPipelines have a "configuration" folder;
-    # adding a glob to have a bit more flexibility.
-    # TO DO: add max depth to glob recursion:
-    general_path = glob.glob(str(os.path.abspath(pipeline_path_2) +
-                             '/**/configuration*'), recursive=True)
-
-    if not general_path:
-        general_path = [os.path.join(os.path.dirname(pipeline_path),
-                                     "configuration")]
-
-    # Add paths to the search list:
-    config_paths.extend([pipeline_path, pipeline_path_2])
-    # Extend separately in case general_path returns more than one path:
-    config_paths.extend(general_path)
-    return(config_paths, caller_name)
-
-def writeConfigFiles(config_paths, caller_name):
+def writeConfigFiles(pipeline_path, general_path):
     '''create default configuration files in `path`.
     '''
-    # TO DO: I've modified this function with workarounds to make it more
-    # flexible in order to find an ini file, find a configuration dir and
-    # copy pre-run sphinx-quickstart files if they exist.
-    # Other than creating a 'report' dir, it should not change the way it is
-    # run from CGATPipelines.
-    # See also getConfigPaths() above; these run when calling the 'config'
-    # option.
-    # Antonio
-    report_dir = 'pipeline_report'
-    config_files = []
-
-    try:
-        os.mkdir(report_dir)  # Sphinx config files will be copied here;
-                              # CGATReport only needs its conf.py to
-                              # generate the rest though
-    except FileExistsError:
-        E.warn("directory `%s` already exists" % report_dir)
-        raise
-
-    # Look for ini file:
-    f_count = 0
-    for path in config_paths:
-        if os.path.exists(path) and os.path.isdir(path):
-            for f in os.listdir(os.path.abspath(path)):
-                if fnmatch.fnmatch(f, 'pipeline*ini'):
-                    f_count += 1
-                    config_files.append(f)
-
-    if f_count == 1:
-        pass
-
-    elif f_count > 1:
-        new_count = 0
-        # Prioritise the file that matches the command called if more than
-        # one ini file is found:
-        for f in config_files:
-            if caller_name in f:
-                new_count += 1
-                config_files.append(f)
-        if new_count > 1:
-            E.warn('''
-                   Found several ini files but could not prioritise based on:
-                   {}
-                   as more than one matched.
-                   Using the first one found:
-                   {}
-                   from
-                   {}
-                   '''.format(caller_name, config_files[0], config_files)
-                   )
-        elif new_count == 1:
-            pass
-        elif new_count == 0:
-            pass
-            print('''
-                  More than one ini file found but none matched
-                  {}
-                  Using the first one found:
-                  {}
-                  from
-                  {}
-                  '''.format(caller_name, config_files[0], config_files)
-                  )
-
-    elif f_count == 0:
-        E.warn('''
-               No configuration (ini) files found in:
-               {}
-               '''.format(config_paths)
-               )
-
-    # Copy pipeline ini file:
-    if not config_files:
-        E.warn('No configuration files found.')
-    else:
-        for dest in config_files:
-            dest = str(dest)
-            if os.path.exists(dest):
-                E.warn("file `%s` already exists - skipped" % dest)
-                continue
-            for path in config_paths:
-                src = os.path.join(path, dest)
-                if os.path.exists(src):
-                    shutil.copyfile(src, dest)
-                    E.info("created new configuration file `%s` " % dest)
-                    break
-            else:
-                raise ValueError('''default config file for `%s`
-                                 not found in
-                                 %s
-                                 A pipeline cannot be run without this.
-                                 ''' % (config_files, config_paths))
-
-    # Copy Sphinx configuration files, enforce copy of 'conf.py' in case
-    # CGATReport is used:
-    dest = 'conf.py'
-    if os.path.exists(dest):
-        E.warn("file `%s` already exists - skipped" % dest)
-
-    for path in config_paths:
-        src = os.path.join(path, dest)
-        if os.path.exists(src):
-            # Put sphinx files in separate dir:
-            shutil.copyfile(src, os.path.join(report_dir, dest))
-            # Create a softlink outside of report_dir dir for CGATReport:
-            os.symlink(os.path.join(report_dir, dest), str(dest))
-            E.info("created new configuration file `%s` " % dest)
-            break
+    paths = [pipeline_path, general_path]
 
-    else:
-        # Only warn as pipeline can be run without report:
-        E.warn('''default config file for `%s` not found in
-               %s
-               CGATReport nor Sphinx can be run without this''' % (dest,
-               config_paths))
-
-    # If other Sphinx config files are found, copy them if there is a skeleton
-    # pipeline report to use:
-    E.info('Looking for additional Sphinx configuration files.')
-    sphinx_config_files = ['Makefile',
-                           'make.bat',
-                           '*.rst',
-                           '*.bib',
-                           ]  # These are for a sphinx setup, not needed
-                              # with CGATReport
-                              # A 'report_pipeline_*.rst' template is
-                              # searched for below
-
-    # Look for a pipeline report file:
-    f_count = 0
-    for path in config_paths:
-        if os.path.exists(path):
-            for f in os.listdir(os.path.abspath(path)):
-                # TO DO:
-                # This pattern matching is particular to
-                # https://github.com/AntonioJBT/project_quickstart
-                # Needs to be made more generic
-                if fnmatch.fnmatch(f, 'report_pipeline_*.rst'):
-                    f_count += 1
-                    pipeline_report_file = f
-
-    if f_count == 1:
-        sphinx_config_files.append(pipeline_report_file)
+    config_files = ['pipeline.ini', 'conf.py']
 
-    else:
-        # Only warn as pipeline can be run without report:
-        E.warn('''There is no pipeline report file matching
-               report_pipeline_*.rst
-               in the directories:
-               {}
-               Ignore this if you are using CGATReport.
-               '''.format(config_paths)
-               )
-
-    # Copy the files across if they are found:
-    f_count = 0
-    # Check all the paths and their files given above when searching for config files:
-    for path in config_paths:
-        if os.path.exists(path):
-            for f in os.listdir(path):
-                # For each file or search term given, match to an existing file:
-                for dest in sphinx_config_files:
-                    if fnmatch.fnmatch(f, dest):
-                        f_to_copy = f
-                        # If a match is found, walk the cwd to check it's not
-                        # already present:
-                        for root, dirs, files in os.walk('.'):
-                            if f_to_copy in files:
-                                E.warn("file `%s` already exists - skipped" % f_to_copy)
-                                continue
-
-                            # If not present, copy the file:
-                            else:
-                                f_count += 1
-                                src = os.path.join(path, f_to_copy)
-                                if os.path.exists(src):
-                                    # Put sphinx files in separate dir:
-                                    shutil.copyfile(src, os.path.join(report_dir,
-                                                                      f_to_copy)
-                                                    )
-                                    E.info("created new configuration file `%s` "
-                                           % f_to_copy)
-                                break
-    if f_count > 0:
-        pass
-    else:
-        E.warn('''No sphinx-quickstart skeleton files such as:
-               {}
-               were found
-               in
-               {}
-               Continuing without.'''.format(dest, config_paths))
+    for dest in config_files:
+        if os.path.exists(dest):
+            E.warn("file `%s` already exists - skipped" % dest)
+            continue
+
+        for path in paths:
+            src = os.path.join(path, dest)
+            if os.path.exists(src):
+                shutil.copyfile(src, dest)
+                E.info("created new configuration file `%s` " % dest)
+                break
+        else:
+            raise ValueError(
+                "default config file for `%s` not found in %s" %
+                (config_files, paths))
 
 
 def printConfigFiles():
     '''
@@ -1244,8 +1016,12 @@
         printConfigFiles()
 
     elif options.pipeline_action == "config":
-        config_paths, caller_name = getConfigPaths()
-        writeConfigFiles(config_paths, caller_name)
+        f = sys._getframe(1)
+        caller = f.f_globals["__file__"]
+        pipeline_path = os.path.splitext(caller)[0]
+        general_path = os.path.join(os.path.dirname(pipeline_path),
+                                    "configuration")
+        writeConfigFiles(pipeline_path, general_path)
 
     elif options.pipeline_action == "clone":
         clonePipeline(options.pipeline_targets[0])
diff --git a/CGATPipelines/configuration/pipeline.ini b/CGATPipelines/configuration/pipeline.ini
index 07af17bf..798ebf13 100644
--- a/CGATPipelines/configuration/pipeline.ini
+++ b/CGATPipelines/configuration/pipeline.ini
@@ -6,7 +6,7 @@
 ########################################################
 ########################################################
 # The project name to appear in the report
-projectname=
+projectname=to-set
 
 # The copyright statement to appear in the report
 copyright=
@@ -37,7 +37,8 @@
 scratchdir=/tmp
 web_dir=../web
 
 # location of indexed genome
-genome_dir=/full/path/here
+#genome_dir=/ifs/mirror/genomes/plain
+genome_dir=to-set
 
 # The genome to use (UCSC convention)
 genome=hg19
@@ -75,8 +76,8 @@
 port=3306
 
 [cluster]
 # queue to use
+#queue=all.q
 queue=
-#all.q
 
 # priority of jobs on cluster
 priority=-10
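
The `to-set` placeholders introduced in pipeline.ini above are meant to be edited per site before a run. A small guard that fails fast when they were forgotten (a sketch; section and key names depend on the actual ini layout):

    import configparser

    config = configparser.ConfigParser()
    config.read('pipeline.ini')
    unset = [(section, key)
             for section in config.sections()
             for key, value in config.items(section) if value == 'to-set']
    if unset:
        raise ValueError("please edit pipeline.ini; unset keys: %s" % unset)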