diff --git a/.github/workflows/pr_rev_cli_tests.yml b/.github/workflows/pr_rev_cli_tests.yml
new file mode 100644
index 00000000..f3615af2
--- /dev/null
+++ b/.github/workflows/pr_rev_cli_tests.yml
@@ -0,0 +1,40 @@
+name: reV CLI Tests
+
+on: pull_request
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: checkout gaps
+        uses: actions/checkout@v2
+        with:
+          ref: ${{ github.event.pull_request.head.ref }}
+          fetch-depth: 1
+          path: gaps
+      - name: checkout reV
+        uses: actions/checkout@v2
+        with:
+          repository: nrel/reV
+          fetch-depth: 1
+          path: reV
+      - name: Set up Python
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          auto-update-conda: true
+          python-version: 3.9
+      - name: Install reV
+        working-directory: ./reV
+        shell: bash -l {0}
+        run: |
+          pip install -e .
+      - name: Install gaps
+        working-directory: ./gaps
+        shell: bash -l {0}
+        run: |
+          pip install -e .[dev]
+      - name: Run reV CLI tests
+        working-directory: ./reV
+        shell: bash -l {0}
+        run: |
+          pytest -k cli -v --disable-warnings
diff --git a/README.rst b/README.rst
index e407e470..9b9af39e 100644
--- a/README.rst
+++ b/README.rst
@@ -24,21 +24,37 @@ Welcome to Geospatial Analysis Pipelines (GAPs)!
 .. inclusion-intro
 
 Geospatial Analysis Pipelines (GAPs) is a framework designed
-to assist users in scaling their geospatial models to a
-High-Performance Computing (HPC) environment. In particular,
-GAPs automatically distributes the execution of a
-single-location model (such as the `System Advisor Model `_)
-over a large geospatial extent (e.g. CONUS) across many parallel
-HPC nodes. Born from the open-source `reV `_ model, GAPs is a
+to assist researchers and software developers in adding execution
+tools to their geospatial Python models. Born from the
+open-source `reV `_ model, GAPs is a
 robust and easy-to-use engine that provides a rich set of features
-such as configuration file generation, job status monitoring,
-CLI Documentation, and more.
-
+such as command-line interface (CLI) generation and documentation,
+basic High-Performance Computing (HPC) scaling capabilities,
+configuration file generation, job status monitoring, and more.
+
+
+Who should use GAPs
+===================
+GAPs is intended to be used by researchers and/or software developers
+who have implemented a working Python model but have not yet added any
+external model execution tools. With minimal effort, developers can
+use GAPs to add a variety of utilities for end-users, including a complete
+set of CLI commands and documentation pulled from the model run function
+docstrings. In addition, GAPs provides basic HPC execution capabilities,
+geared particularly towards embarrassingly parallel geospatial models
+(e.g. single-location models such as the `System Advisor Model `_).
+GAPs can automatically distribute the execution of such models over a large
+geospatial extent (e.g. CONUS) across many parallel HPC nodes.
+
+GAPs is **NOT** a workflow management system (WMS), and therefore does not
+provide any of the in-depth tools/capabilities expected from a proper WMS.
+However, GAPs-supported models can sometimes be included as part of a
+workflow in WMS tools like `Torc `_.
 
 To get started, take a look at the
 `documentation `_
 (examples coming soon!)
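For a flavor of what this enables, here is a minimal sketch of hooking a
model up to GAPs (the model function ``run_model`` and the program name
``my-model`` are hypothetical placeholders; ``CLICommandFromFunction`` and
``make_cli`` are the ``gaps.cli`` entry points touched elsewhere in this
diff)::

    from gaps.cli import CLICommandFromFunction, make_cli

    def run_model(project_points, out_fpath, max_workers=1):
        """Run the model for a set of locations.

        Parameters
        ----------
        project_points : str
            Path to a file describing the locations to model.
        max_workers : int, optional
            Maximum number of parallel workers run on each node.
            By default, ``1``.
        """
        # ``out_fpath`` is supplied by GAPs; it includes the node tag
        # but no file-ending, so add one here
        out_fpath = f"{out_fpath}.h5"
        ...  # model execution goes here
        # return the output path so "PIPELINE" collection can find it
        return out_fpath

    commands = [
        CLICommandFromFunction(
            run_model, name="run", split_keys=["project_points"]
        )
    ]
    cli = make_cli(commands, info={"name": "my-model", "version": "0.1.0"})

    if __name__ == "__main__":
        cli(obj={})

Documentation for the generated ``run``, ``pipeline``, ``batch``, ``status``,
and ``template-configs`` commands is pulled from the docstring above, and
execution-control inputs such as ``max_workers`` are automatically routed to
the ``execution_control`` block of the user's config file.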
-Installing gaps
+Installing GAPs
 ===============
 
 NOTE: The installation instructions below assume that you have python installed
diff --git a/gaps/__init__.py b/gaps/__init__.py
index a078140d..2d66ae1f 100644
--- a/gaps/__init__.py
+++ b/gaps/__init__.py
@@ -15,3 +15,4 @@
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
 logger.setLevel("DEBUG")
+logger.propagate = False
diff --git a/gaps/batch.py b/gaps/batch.py
index a41a2c9c..7b9f67e7 100644
--- a/gaps/batch.py
+++ b/gaps/batch.py
@@ -17,12 +17,7 @@
 from rex.utilities import parse_year
 
-from gaps.config import (
-    load_config,
-    init_logging_from_config,
-    ConfigType,
-    resolve_all_paths,
-)
+from gaps.config import load_config, ConfigType, resolve_all_paths
 import gaps.cli.pipeline
 from gaps.pipeline import Pipeline
 from gaps.exceptions import gapsValueError, gapsConfigError
@@ -56,8 +51,6 @@ def __init__(self, config):
         self._job_tags = None
         self._base_dir, config = _load_batch_config(config)
         self._pipeline_fp = Path(config["pipeline_config"])
-
-        init_logging_from_config(config)
         self._sets = _parse_config(config)
         logger.info("Batch job initialized with %d sub jobs.", len(self._sets))
@@ -97,14 +90,12 @@ def _make_job_dirs(self):
         # walk through current directory getting everything to copy
         for source_dir, _, filenames in os.walk(self._base_dir):
-
             # don't make additional copies of job sub directories.
             if any(job_tag in source_dir for job_tag in self._sets):
                 continue
 
             # For each dir level, iterate through the batch arg combos
             for tag, (arg_comb, mod_files, __) in self._sets.items():
-
                 # Add the job tag to the directory path.
                 # This will copy config subdirs into the job subdirs
                 source_dir = Path(source_dir)
diff --git a/gaps/cli/batch.py b/gaps/cli/batch.py
index 3853f840..beb2d7ff 100644
--- a/gaps/cli/batch.py
+++ b/gaps/cli/batch.py
@@ -5,12 +5,15 @@
 import click
 
 import gaps.batch
+from gaps.config import init_logging_from_config_file
 from gaps.cli.command import _WrappedCommand
 from gaps.cli.documentation import _batch_command_help
 
 
 def _batch(config_file, dry, cancel, delete, monitor_background):
     """Execute an analysis pipeline over a parametric set of inputs"""
+    init_logging_from_config_file(config_file, background=monitor_background)
+
     if cancel:
         gaps.batch.BatchJob(config_file).cancel()
     elif delete:
@@ -64,7 +67,11 @@ def batch_command():
         context_settings=None,
         callback=_batch,
         params=params,
-        help="Execute an analysis pipeline over a parametric set of inputs",
+        help=(
+            "Execute an analysis pipeline over a parametric set of inputs.\n\n"
+            "The general structure for calling this CLI command is given "
+            "below (add ``--help`` to print help info to the terminal)."
+        ),
         epilog=None,
         short_help=None,
         options_metavar="[OPTIONS]",
diff --git a/gaps/cli/cli.py b/gaps/cli/cli.py
index b9508137..fb30fff7 100644
--- a/gaps/cli/cli.py
+++ b/gaps/cli/cli.py
@@ -16,6 +16,7 @@
     CLICommandFromFunction,
     _WrappedCommand,
 )
+from gaps.cli.documentation import _main_command_help
 from gaps.cli.preprocessing import preprocess_collect_config
 from gaps.cli.status import status_command
 
@@ -56,9 +57,8 @@ def convert_to_commands(self):
             command = as_click_command(command_config)
             self.commands.append(command)
             Pipeline.COMMANDS[command_config.name] = command
-            self.template_configs[command_config.name] = (
-                command_config.documentation.template_config
-            )
+            template_config = command_config.documentation.template_config
+            self.template_configs[command_config.name] = template_config
         return self
 
     def add_pipeline_command(self):
@@ -168,6 +168,8 @@ def make_cli(commands, info=None):
 
             name : str
                 Name of program to display at the top of the CLI help.
+                This input is optional, but specifying it yields richer
+                documentation for the main CLI command.
             version : str
                 Include program version with a ``--version`` CLI option.
 
@@ -181,6 +183,7 @@
     """
     info = info or {}
     command_generator = _CLICommandGenerator(commands)
+    commands = command_generator.generate()
 
     options = [
         click.Option(
@@ -189,12 +192,17 @@
             help="Flag to turn on debug logging. Default is not verbose.",
         ),
     ]
-    prog_name = [info["name"]] if "name" in info else []
+
+    prog_name = info.get("name")
+    if prog_name:
+        main_help = _main_command_help(
+            prog_name, command_generator.command_configs
+        )
+    else:
+        main_help = "Command Line Interface"
+
     main = click.Group(
-        help=" ".join(prog_name + ["Command Line Interface"]),
-        params=options,
-        callback=_main_cb,
-        commands=command_generator.generate(),
+        help=main_help, params=options, callback=_main_cb, commands=commands
     )
     version = info.get("version")
     if version is not None:
@@ -208,3 +216,4 @@ def _main_cb(ctx, verbose):
     """Set the obj and verbose settings of the commands."""
     ctx.ensure_object(dict)
     ctx.obj["VERBOSE"] = verbose
+    ctx.max_content_width = 92
diff --git a/gaps/cli/command.py b/gaps/cli/command.py
index bfde8db6..b9e56c29 100644
--- a/gaps/cli/command.py
+++ b/gaps/cli/command.py
@@ -9,20 +9,10 @@
 
 import click
 
+from gaps.cli.config import GAPS_SUPPLIED_ARGS
 from gaps.cli.documentation import CommandDocumentation
 from gaps.cli.preprocessing import split_project_points_into_ranges
-
-
-GAPS_SUPPLIED_ARGS = {
-    "tag",
-    "command_name",
-    "config_file",
-    "project_dir",
-    "job_name",
-    "out_dir",
-    "out_fpath",
-    "config",
-}
+from gaps.utilities import _is_sphinx_build
 
 
 class AbstractBaseCLICommandConfiguration(ABC):
@@ -112,6 +102,13 @@ def __init__(
             configuration files, so it should be thoroughly documented
             (using a `NumPy Style Python Docstring
             `_).
+            In particular, the "Extended Summary" and the "Parameters"
+            sections will be pulled from the docstring.
+
+            .. WARNING:: The "Extended Summary" section may not show up
+                         properly if the short "Summary" section is
+                         missing.
+
             This function must return the path to the output file it
             generates (or a list of paths if multiple output files are
             generated). If no output files are generated, the function
@@ -154,22 +151,28 @@ def __init__(
                     convenience purposes. This argument combines
                     the ``out_dir`` with ``job_name`` to yield a
                     unique output filepath for a given node.
Note that the
-                    output filename will contain the tag if it is
-                    "requested" from a node function, but *WILL NOT*
-                    contain a tag if requested from a config
-                    pre-processing function, since the number of split
-                    nodes have not been determined at that point.
-                    Also note that this string *WILL NOT* contain a
-                    file-ending, so that will have to be added by the
-                    node function.
+                    output filename will contain the tag. Also note that
+                    this string *WILL NOT* contain a file-ending, so
+                    that will have to be added by the node function.
 
             If your function is capable of multiprocessing, you should
             also include ``max_workers`` in the function signature.
             ``gaps`` will pass an integer equal to the number of
             processes the user wants to run on a single node for this
-            value. Note that the ``config`` parameter is not allowed as
+            value.
+
+            .. WARNING:: The keywords ``{"max_workers",
+                         "sites_per_worker", "memory_utilization_limit",
+                         "timeout", "pool_size"}`` are assumed to
+                         describe execution control. If you request any
+                         of these as function arguments, users of your
+                         CLI will specify them in the
+                         `execution_control` block of the input config
+                         file.
+
+            Note that the ``config`` parameter is not allowed as
             a function signature item. Please request all the required
-            keys/inputs directly. This function can also request
+            keys/inputs directly instead. This function can also request
             "private" arguments by including a leading underscore in the
             argument name. These arguments are NOT exposed to users in
             the documentation or template configuration files. Instead,
@@ -202,7 +205,13 @@ def __init__(
             function *must* return the path (as a string) to the output
             file it generates in order for users to be able to use
             ``"PIPELINE"`` as the input to the ``collect_pattern`` key
-            in the collection config. By default, ``False``.
+            in the collection config. The path returned by this function
+            must also include the ``tag`` in the output file name in
+            order for collection to function properly (an easy way to do
+            this is to request ``tag`` in the function signature and
+            name the output file generated by the function using the
+            ``f"{out_file_name}{tag}.{extension}"`` format).
+            By default, ``False``.
         split_keys : set | container, optional
             A set of strings identifying the names of the config keys
             that ``gaps`` should split the function execution on. To
@@ -228,9 +237,14 @@ def __init__(
             "paired" keys match. To allow non-iterable user input for
             split keys, use the ``config_preprocessor`` argument to
             specify a preprocessing function that converts the user
-            input into a list of the expected inputs. If ``None``,
-            execution is not split across nodes, and a single node is
-            always used for the function call. By default, ``None``.
+            input into a list of the expected inputs. If users specify
+            an empty list or ``None`` for a key in ``split_keys``, then
+            GAPs will pass ``None`` as the value for that key (i.e. if
+            ``split_keys=["a"]`` and users specify ``"a": []`` in their
+            config, then the ``function`` will be called with
+            ``a=None``). If ``None``, execution is not split across
+            nodes, and a single node is always used for the function
+            call. By default, ``None``.
         config_preprocessor : callable, optional
             Optional function for configuration pre-processing.
The preprocessing step occurs before jobs are split across HPC
@@ -243,43 +257,40 @@ def __init__(
             can also "request" the following arguments by including
             them in the function signature:
 
-            tag : str
-                Short string unique to this job run that can be used
-                to generate unique output filenames, thereby
-                avoiding clashing output files with jobs on other
-                nodes. This string contains a leading underscore,
-                so the file name can easily be generated:
-                ``f"{out_file_name}{tag}.{extension}"``.
             command_name : str
                 Name of the command being run. This is equivalent
                 to the ``name`` input above.
-            config_file : str
+            config_file : Path
                 Path to the configuration file specified by the user.
-            project_dir : str
+            project_dir : Path
                 Path to the project directory (parent directory of
                 the configuration file).
             job_name : str
                 Name of the job being run. This is typically
                 a combination of the project directory and the
                 command name.
-            out_dir : str
+            out_dir : Path
                 Path to output directory - typically equivalent to
                 the project directory.
-            out_fpath : str
+            out_fpath : Path
                 Suggested path to output file. You are not required
                 to use this argument - it is provided purely for
                 convenience purposes. This argument combines the
                 ``out_dir`` with ``job_name`` to yield a unique
                 output filepath for a given node. Note that the
-                output filename will contain the tag if it is
-                "requested" from a node function, but *WILL NOT*
-                contain a tag if requested from a config
-                pre-processing function, since the number of split
-                nodes have not been determined at that point.
+                output filename *WILL NOT* contain the tag, since
+                the number of split nodes has not been determined
+                when the config pre-processing function is called.
                 Also note that this string *WILL NOT* contain a
                 file-ending, so that will have to be added by the
                 node function.
+            log_directory : Path
+                Path to log output directory (defaults to
+                project_dir / "logs").
+            verbose : bool
+                Flag indicating whether the user has selected a DEBUG
+                verbosity level for logs.
 
             These inputs will be provided by GAPs and *will not* be
             displayed to users in the template configuration files or
@@ -385,7 +396,7 @@ def __init__(
         Parameters
         ----------
-        init : class initializer
+        init : class
            The class to be initialized and used to run on
            individual nodes. The class must implement ``method``.
            The initializer, along with the corresponding method, will
           be used to
@@ -393,6 +404,15 @@
            files, so it should be thoroughly documented (using a
            `NumPy Style Python Docstring
            `_).
+            In particular, the "Extended Summary" (from the ``__init__``
+            method *only*) and the "Parameters" section (from *both* the
+            ``__init__`` method and the run `method` given below) will
+            be pulled from the docstring.
+
+            .. WARNING:: The "Extended Summary" section may not display
+                         properly if the short "Summary" section in the
+                         ``__init__`` method is missing.
+
         method : str
            The name of a method of the ``init`` class to act as the
            model function to run across multiple nodes on the HPC. This
@@ -416,10 +436,10 @@
            command_name : str
                Name of the command being run. This is equivalent
                to the ``name`` input argument.
-        config_file : Path
+        config_file : str
                Path to the configuration file specified by the user.
-        project_dir : Path
+        project_dir : str
                Path to the project directory (parent directory of
                the configuration file).
job_name : str
                Name of the job being run. This is typically
                a combination of the project directory and the
                command name.
                will not be included if you request this argument
                in a config preprocessing function, as the execution
                has not been split into multiple jobs by that point.
-        out_dir : Path
+        out_dir : str
                Path to output directory - typically equivalent to
                the project directory.
+        out_fpath : str
+            Suggested path to output file. You are not required
+            to use this argument - it is provided purely for
+            convenience purposes. This argument combines the
+            ``out_dir`` with ``job_name`` to yield a unique
+            output filepath for a given node. Note that the
+            output filename will contain the tag. Also note that
+            this string *WILL NOT* contain a file-ending, so
+            that will have to be added by the node function.
 
            If your function is capable of multiprocessing, you should
            also include ``max_workers`` in the function signature.
            ``gaps`` will pass an integer equal to the number of
            processes the user wants to run on a single node for this
-            value. Note that the ``config`` parameter is not allowed as
+            value.
+
+            .. WARNING:: The keywords ``{"max_workers",
+                         "sites_per_worker", "memory_utilization_limit",
+                         "timeout", "pool_size"}`` are assumed to
+                         describe execution control. If you request any
+                         of these as function arguments, users of your
+                         CLI will specify them in the
+                         `execution_control` block of the input config
+                         file.
+
+            Note that the ``config`` parameter is not allowed as
            a function signature item. Please request all the required
-            keys/inputs directly. This function can also request
+            keys/inputs directly instead. This function can also request
            "private" arguments by including a leading underscore in the
            argument name. These arguments are NOT exposed to users in
            the documentation or template configuration files. Instead,
@@ -471,7 +511,13 @@
            function *must* return the path (as a string) to the output
            file it generates in order for users to be able to use
            ``"PIPELINE"`` as the input to the ``collect_pattern`` key
-            in the collection config. By default, ``False``.
+            in the collection config. The path returned by this function
+            must also include the ``tag`` in the output file name in
+            order for collection to function properly (an easy way to do
+            this is to request ``tag`` in the function signature and
+            name the output file generated by the function using the
+            ``f"{out_file_name}{tag}.{extension}"`` format).
+            By default, ``False``.
        split_keys : set | container, optional
            A set of strings identifying the names of the config keys
            that ``gaps`` should split the function execution on. To
@@ -497,9 +543,14 @@
            "paired" keys match. To allow non-iterable user input for
            split keys, use the ``config_preprocessor`` argument to
            specify a preprocessing function that converts the user
-            input into a list of the expected inputs. If ``None``,
-            execution is not split across nodes, and a single node is
-            always used for the function call. By default, ``None``.
+            input into a list of the expected inputs. If users specify
+            an empty list or ``None`` for a key in ``split_keys``, then
+            GAPs will pass ``None`` as the value for that key (i.e. if
+            ``split_keys=["a"]`` and users specify ``"a": []`` in their
+            config, then the ``function`` will be called with
+            ``a=None``). If ``None``, execution is not split across
+            nodes, and a single node is always used for the function
+            call. By default, ``None``.
        config_preprocessor : callable, optional
            Optional function for configuration pre-processing.
The preprocessing step occurs before jobs are split across HPC
@@ -528,6 +579,24 @@ def __init__(
            out_dir : Path
                Path to output directory - typically equivalent to
                the project directory.
+            out_fpath : Path
+                Suggested path to output file. You are not required
+                to use this argument - it is provided purely for
+                convenience purposes. This argument combines the
+                ``out_dir`` with ``job_name`` to yield a unique
+                output filepath for a given node. Note that the
+                output filename *WILL NOT* contain the tag, since
+                the number of split nodes has not been determined
+                when the config pre-processing function is called.
+                Also note that this string *WILL NOT* contain a
+                file-ending, so that will have to be added by the
+                node function.
+            log_directory : Path
+                Path to log output directory (defaults to
+                project_dir / "logs").
+            verbose : bool
+                Flag indicating whether the user has selected a DEBUG
+                verbosity level for logs.
 
            These inputs will be provided by GAPs and *will not* be
            displayed to users in the template configuration files or
@@ -643,6 +712,12 @@ def wrap_text(
    )
    if "Parameters\n----------" not in wrapped_text.replace(" ", ""):
        wrapped_text = wrapped_text.replace(".\n", ".\n\n")
+    elif not _is_sphinx_build():  # pragma: no cover
+        wrapped_text = wrapped_text.replace(
+            "Parameters\n----------",
+            "\nConfig Parameters\n-----------------",
+        )
+
    return wrapped_text
 
 
 click.formatting.wrap_text = wrap_text
diff --git a/gaps/cli/config.py b/gaps/cli/config.py
index 8fbd79b1..85b35c2f 100644
--- a/gaps/cli/config.py
+++ b/gaps/cli/config.py
@@ -19,6 +19,7 @@
 from gaps.config import load_config
 from gaps.log import init_logger
 from gaps.cli.execution import kickoff_job
+from gaps.cli.documentation import EXTRA_EXEC_PARAMS
 from gaps.exceptions import gapsKeyError
 from gaps.warnings import gapsWarning
 
@@ -35,6 +36,18 @@
 ]
 TAG = "_j"
 MAX_AU_BEFORE_WARNING = 10_000
+GAPS_SUPPLIED_ARGS = {
+    "tag",
+    "command_name",
+    "config_file",
+    "project_dir",
+    "job_name",
+    "out_dir",
+    "out_fpath",
+    "config",
+    "log_directory",
+    "verbose",
+}
 
 
 class _FromConfig:
@@ -87,11 +100,11 @@ def enable_logging(self):
         self.log_directory = self.config.pop(
             "log_directory", (self.project_dir / "logs").as_posix()
         )
-        self.verbose = (
-            self.config.pop("log_level", "INFO") == "DEBUG"
-            or self.ctx.obj.get("VERBOSE", False)
-        )
-        pipe_log_file = Path(self.log_directory) / f"{self.job_name}.log"
+        self.log_directory = Path(self.log_directory)
+        self.verbose = self.config.pop(
+            "log_level", "INFO"
+        ) == "DEBUG" or self.ctx.obj.get("VERBOSE", False)
+        pipe_log_file = self.log_directory / f"{self.job_name}.log"
         init_logger(
             stream=self.ctx.obj.get("LOG_STREAM", True),
             level="DEBUG" if self.verbose else "INFO",
@@ -101,7 +114,7 @@
 
     def validate_config(self):
         """Validate the user input config file."""
-        logger.debug("Validating %r", self.config_file)
+        logger.debug("Validating %r", str(self.config_file))
         _validate_config(self.config, self.command_config.documentation)
         return self
 
@@ -115,6 +128,9 @@ def preprocess_config(self):
             "project_dir": self.project_dir,
             "job_name": self.job_name,
             "out_dir": self.project_dir,
+            "out_fpath": self.project_dir / self.job_name,
+            "log_directory": self.log_directory,
+            "verbose": self.verbose,
         }
         extra_preprocessor_kwargs = {
             k: self.config[k]
@@ -143,21 +159,25 @@ def set_exec_kwargs(self):
         self.exec_kwargs = {
             "option": "local",
             "sh_script": "",
-            "stdout_path": (Path(self.log_directory) / "stdout").as_posix(),
+            "stdout_path": (self.log_directory /
"stdout").as_posix(), } self.exec_kwargs.update(self.config.get("execution_control", {})) - if "max_workers" in self.config: - max_workers = self.config.pop("max_workers") + extra_params = set() + for extra_exec_param in EXTRA_EXEC_PARAMS: + if extra_exec_param in self.config: + extra_params.add(extra_exec_param) + param = self.config.pop(extra_exec_param) + self.exec_kwargs[extra_exec_param] = param + + if extra_params: msg = ( - "Found key 'max_workers' outside of 'execution_control'. " - f"Moving 'max_workers' value ({max_workers}) into " - "'execution_control' block. To silence this warning, " - "please specify 'max_workers' inside of the " - "'execution_control' block." + f"Found key(s) {extra_params} outside of 'execution_control'. " + "Moving these keys into 'execution_control' block. To " + "silence this warning, please specify these keys inside of " + "the 'execution_control' block." ) warn(msg, gapsWarning) - self.exec_kwargs["max_workers"] = max_workers return self @@ -165,7 +185,7 @@ def set_logging_options(self): """Assemble the logging options dictionary.""" self.logging_options = { "name": self.job_name, - "log_directory": self.log_directory, + "log_directory": self.log_directory.as_posix(), "verbose": self.verbose, "node": self.exec_kwargs.get("option", "local") != "local", } @@ -190,6 +210,17 @@ def prepare_context(self): self.ctx.obj["OUT_DIR"] = self.project_dir return self + def log_job_info(self): + """Log information about job submission""" + logger.info( + "Running %s from config file: %r", + self.command_name, + str(self.config_file), + ) + logger.info("Target output directory: %r", str(self.project_dir)) + logger.info("Target logging directory: %r", str(self.log_directory)) + return self + def kickoff_jobs(self): """Kickoff jobs across nodes based on config and run function.""" keys_to_run, lists_to_run = self._keys_and_lists_to_run() @@ -197,8 +228,11 @@ def kickoff_jobs(self): jobs = sorted(product(*lists_to_run)) num_jobs_submit = len(jobs) self._warn_about_excessive_au_usage(num_jobs_submit) - n_zfill = len(str(num_jobs_submit)) - max_workers_per_node = self.exec_kwargs.pop("max_workers", None) + n_zfill = len(str(max(0, num_jobs_submit - 1))) + extra_exec_args = {} + for param in EXTRA_EXEC_PARAMS: + if param in self.exec_kwargs: + extra_exec_args[param] = self.exec_kwargs.pop(param) for node_index, values in enumerate(jobs): if num_jobs_submit > 1: tag = f"{TAG}{str(node_index).zfill(n_zfill)}" @@ -216,12 +250,12 @@ def kickoff_jobs(self): "job_name": job_name, "out_dir": self.project_dir.as_posix(), "out_fpath": (self.project_dir / job_name).as_posix(), - "max_workers": max_workers_per_node, "run_method": getattr( self.command_config, "run_method", None ), } ) + node_specific_config.update(extra_exec_args) for key, val in zip(keys_to_run, values): if isinstance(key, str): @@ -295,6 +329,7 @@ def run(self): return ( self.enable_logging() .validate_config() + .log_job_info() .preprocess_config() .set_exec_kwargs() .set_logging_options() @@ -322,8 +357,9 @@ def _ensure_required_args_exist(config, documentation): name for name in documentation.required_args if name not in config } - if documentation.max_workers_required: - missing = _mark_max_workers_missing_if_needed(config, missing) + missing = _mark_extra_exec_params_missing_if_needed( + documentation, config, missing + ) if any(missing): msg = ( @@ -333,12 +369,13 @@ def _ensure_required_args_exist(config, documentation): raise gapsKeyError(msg) -def _mark_max_workers_missing_if_needed(config, missing): - 
"""Add max_workers as missing key if it is not in `execution_control.""" - +def _mark_extra_exec_params_missing_if_needed(documentation, config, missing): + """Add extra exec params as missing if not found in `execution_control.""" exec_control = config.get("execution_control", {}) - if "max_workers" not in config and "max_workers" not in exec_control: - missing |= {"max_workers"} + for param in EXTRA_EXEC_PARAMS: + if documentation.param_required(param): + if param not in config and param not in exec_control: + missing |= {param} return missing diff --git a/gaps/cli/documentation.py b/gaps/cli/documentation.py index 0c4c333a..df1a6a52 100644 --- a/gaps/cli/documentation.py +++ b/gaps/cli/documentation.py @@ -3,11 +3,14 @@ CLI documentation utilities. """ from copy import deepcopy +from itertools import chain +from functools import lru_cache from inspect import signature, isclass from numpydoc.docscrape import NumpyDocString from gaps.config import config_as_str_for_docstring, ConfigType +from gaps.utilities import _is_sphinx_build DEFAULT_EXEC_VALUES = { @@ -24,106 +27,164 @@ "sh_script": None, } +EXTRA_EXEC_PARAMS = { + "max_workers": """Maximum number of parallel workers run on each node.""", + "sites_per_worker": """Number of sites to run in series on a worker.""", + "memory_utilization_limit": """Memory utilization limit (fractional). + Must be a value between 0 and 100. This input sets + how much data will be stored in-memory at any + given time before flushing to disk. + """, + "timeout": """Number of seconds to wait for parallel run iteration + to complete before early termination. + """, + "pool_size": """Number of futures to submit to a single process pool + for parallel futures. + """, +} + CONFIG_TYPES = [ ConfigType.JSON, ConfigType.YAML, ConfigType.TOML, ] -PIPELINE_CONFIG_DOC = """ -Path to the "pipeline" configuration file. This argument can be left out, -but *one and only one file* with the name "pipeline" should exist in the -directory and contain the config information. Below is a sample template config +MAIN_DOC = """{name} Command Line Interface. -.. tabs:: +Typically, a good place to start is to set up a {name} job with a pipeline +config that points to several {name} modules that you want to run in serial. - .. tab:: JSON - :: +To begin, you can generate some template configuration files using:: - {template_json_config} + $ {name} template-configs - .. tab:: YAML - :: +By default, this generates template JSON configuration files, though you +can request JSON5, YAML, or TOML configuration files instead. You can run +``$ {name} template-configs --help`` on the command line to see all available +options for the ``template-configs`` command. Once the template configuration +files have been generated, you can fill them out by referring to the +module CLI documentation (if available) or the help pages of the module CLIs +for more details on the config options for each CLI command:: - {template_yaml_config} + {cli_help_str} - .. tab:: TOML - :: +After appropriately filling our the configuration files for each module you +want to run, you can call the {name} pipeline CLI using:: - {template_toml_config} + $ {name} pipeline -c config_pipeline.json +This command will run each pipeline step in sequence. -Parameters ----------- -pipeline : list of dicts - A list of dictionaries, where each dictionary represents one step - in the pipeline. 
Each dictionary should have one key-value pair,
-    where the key is the name of the CLI command to run, and the value
-    is the path to a config file containing the configuration for that
-    command.
-logging : dict, optional
-    Dictionary containing keyword-argument pairs to pass to
-    :func:`gaps.log.init_logger`.
+.. Note:: You will need to re-submit the ``pipeline`` command above after
+          each completed pipeline step.
 
-"""
+To check the status of the pipeline, you can run::
 
-BATCH_CONFIG_DOC = """
-Path to the "batch" configuration file. Below is a sample template config
+    $ {name} status
 
-.. tabs::
+This will print a report to the command line detailing the progress of the
+current pipeline. See ``$ {name} status --help`` for all status command
+options.
 
-    .. tab:: JSON
-        ::
+If you need to parameterize the pipeline execution, you can use the ``batch``
+command. For details on setting up a batch config file, see the documentation
+or run::
 
-            {template_json_config}
+    $ {name} batch --help
 
-    .. tab:: YAML
-        ::
+on the command line. Once you set up a batch config file, you can execute
+it using::
 
-            {template_yaml_config}
+    $ {name} batch -c config_batch.json
 
-    .. tab:: TOML
-        ::
+The general structure of the {name} CLI is given below.
+"""
 
-            {template_toml_config}
+PIPELINE_CONFIG_DOC = """
+Path to the ``pipeline`` configuration file. This argument can be
+left out, but *one and only one file* with "pipeline" in the
+name should exist in the directory and contain the config
+information. {sample_config}
+
+Parameters
+----------
+pipeline : list of dicts
+    A list of dictionaries, where each dictionary represents one
+    step in the pipeline. Each dictionary should have one
+    key-value pair, where the key is the name of the CLI
+    command to run, and the value is the path to a config
+    file containing the configuration for that command.
+logging : dict, optional
+    Dictionary containing keyword-argument pairs to pass to
+    `init_logger `_. This
+    initializes logging for the submission portion of the
+    pipeline. Note, however, that each step (command) will
+    **also** record the submission step log output to a
+    common "project" log file, so it's only ever necessary
+    to use this input if you want a different (lower) level
+    of verbosity than the `log_level` specified in the
+    config for the step of the pipeline being executed.
+
+"""
 
 BATCH_CONFIG_DOC = """
+Path to the ``batch`` configuration file. {sample_config}
 
 Parameters
 ----------
 pipeline_config : str
-    Path to the pipeline configuration defining the commands to run for
-    every parametric set.
+    Path to the pipeline configuration defining the commands to
+    run for every parametric set.
 sets : list of dicts
-    A list of dictionaries, where each dictionary defines a "set" of
-    parametric runs. Each dictionary should have the following keys:
+    A list of dictionaries, where each dictionary defines a
+    "set" of parametric runs. Each dictionary should have
+    the following keys:
 
         args : dict
             A dictionary defining the arguments across all input
-            configuration files to parameterize. Each argument to be
-            parametrized should be a key in this dictionary, and the
-            value should be a list of the parameter values to run for
-            this argument.
+            configuration files to parameterize. Each argument
+            to be parametrized should be a key in this
+            dictionary, and the value should be a list of the
+            parameter values to run for this argument.
 
-            ..
tabs:: + {batch_args_dict} + + Remember that the keys in the ``args`` dictionary + should be part of (at least) one of your other + configuration files. + files : list + A list of paths to the configuration files that + contain the arguments to be updated for every + parametric run. Arguments can be spread out over + multiple files. {batch_files} + set_tag : str, optional + Optional string defining a set tag that will prefix + each job tag for this set. This tag does not need to + include an underscore, as that is provided during + concatenation. + + +""" +_BATCH_ARGS_DICT = """.. tabs:: - .. tab:: JSON + .. group-tab:: JSON/JSON5 :: {sample_json_args_dict} - .. tab:: YAML + .. group-tab:: YAML :: {sample_yaml_args_dict} - .. tab:: TOML + .. group-tab:: TOML :: {sample_toml_args_dict} - This example would run a total of six pipelines, one with - each of the following arg combinations: + This example would run a total of six pipelines, one + with each of the following arg combinations: :: input_constant_1=18.20, path_to_a_file="/first/path.h5" @@ -134,69 +195,64 @@ input_constant_1=19.04, path_to_a_file="/third/path.h5" - Remember that the keys in the ``args`` dictionary should be - part of (at least) one of your other configuration files. - files : list - A list of paths to the configuration files that contain the - arguments to be updated for every parametric run. Arguments - can be spread out over multiple files. For example: +""" +_BATCH_FILES = """For example: .. tabs:: - .. tab:: JSON + .. group-tab:: JSON/JSON5 :: {sample_json_files} - .. tab:: YAML + .. group-tab:: YAML :: {sample_yaml_files} - .. tab:: TOML + .. group-tab:: TOML :: {sample_toml_files} - set_tag : str, optional - Optional string defining a set tag that will prefix each - job tag for this set. This tag does not need to include an - underscore, as that is provided during concatenation. - - """ CONFIG_DOC = """ -Path to the ``{name}`` configuration file. Below is a sample template config +Path to the ``{name}`` configuration file. {sample_config} + +{docstring} + +Note that you may remove any keys with a ``null`` value if you do not intend to update them yourself. +""" +SAMPLE_CONFIG_DOC = """Below is a sample template config .. tabs:: - .. tab:: JSON + .. group-tab:: JSON/JSON5 :: {template_json_config} - .. tab:: YAML + .. group-tab:: YAML :: {template_yaml_config} - .. tab:: TOML + .. group-tab:: TOML :: {template_toml_config} - -{docstring} - -Note that you may remove any keys with a ``null`` value if you do not intend to update them yourself. """ COMMAND_DOC = """ Execute the ``{name}`` step from a config file. {desc} +The general structure for calling this CLI command is given below (add +``--help`` to print help info to the terminal). + """ EXEC_CONTROL_DOC = """ Parameters @@ -206,27 +262,41 @@ arguments are: :option: ({{'local', 'kestrel', 'eagle', 'peregrine'}}) - Hardware run option. Determines the type of job scheduler - tp use as well as the base AU cost. - :allocation: (str) HPC project (allocation) handle. - :walltime: (int) Node walltime request in hours. - :qos: (str) Quality-of-service specifier. By default, ``"normal"``. - :memory: (int, optional) Node memory request in GB. Default is not to - specify.{n}{mw} - :queue: (str, optional; PBS ONLY) HPC queue to submit job to. - Examples include: 'debug', 'short', 'batch', 'batch-h', - 'long', etc. By default, `None`, which uses `test_queue`. - :feature: (str, optional) Additional flags for SLURM job - (e.g. "-p debug"). Default is not to specify. 
- :conda_env: (str, optional) Name of conda environment to activate. - Default is not to load any environments. - :module: (str, optional) Module to load. Default is not to load any - modules. - :sh_script: (str, optional) Extra shell script to run before - command call. Default is not to run any scripts. - - Only the "option" input is required for local execution. For - execution on the HPC, the allocation and walltime are also + Hardware run option. Determines the type of job + scheduler to use as well as the base AU cost. + :allocation: (str) + HPC project (allocation) handle. + :walltime: (int) + Node walltime request in hours. + :qos: (str, optional) + Quality-of-service specifier. On Eagle or + Kestrel, this should be one of {{'standby', 'normal', + 'high'}}. Note that 'high' priority doubles the AU + cost. By default, ``"normal"``. + :memory: (int, optional) + Node memory request in GB. By default, ``None``, which + does not specify a memory limit.{n}{eep} + :queue: (str, optional; PBS ONLY) + HPC queue to submit job to. Examples include: 'debug', + 'short', 'batch', 'batch-h', 'long', etc. + By default, ``None``, which uses "test_queue". + :feature: (str, optional) + Additional flags for SLURM job (e.g. "-p debug"). + By default, ``None``, which does not specify any + additional flags. + :conda_env: (str, optional) + Name of conda environment to activate. By default, + ``None``, which does not load any environments. + :module: (str, optional) + Module to load. By default, ``None``, which does not + load any modules. + :sh_script: (str, optional) + Extra shell script to run before command call. + By default, ``None``, which does not run any + scripts. + + Only the `option` key is required for local execution. For + execution on the HPC, the `allocation` and `walltime` keys are also required. All other options are populated with default values, as seen above. log_directory : str @@ -241,13 +311,13 @@ """ NODES_DOC = ( - "\n :nodes: (int, optional) Number of nodes to split the project " - "\n points across. Note that the total number of requested " - "\n nodes for a job may be larger than this value if the " - "\n command splits across other inputs (e.g. analysis " - "\n years) Default is 1." + "\n :nodes: (int, optional)" + "\n Number of nodes to split the project points across. " + "\n Note that the total number of requested nodes for " + "\n a job may be larger than this value if the command" + "\n splits across other inputs. Default is ``1``." 
) -MW_DOC = "\n :max_workers: ({type}) {desc}" +EXTRA_EXEC_PARAM_DOC = "\n :{name}: ({type})\n {desc}" class CommandDocumentation: @@ -290,7 +360,7 @@ def __init__(self, *functions, skip_params=None, is_split_spatially=False): p.name: p for doc in self.docs for p in doc["Parameters"] } self.skip_params = set() if skip_params is None else set(skip_params) - self.skip_params |= {"cls", "self", "max_workers"} + self.skip_params |= {"cls", "self"} | set(EXTRA_EXEC_PARAMS) self.is_split_spatially = is_split_spatially @property @@ -299,64 +369,61 @@ def default_exec_values(self): exec_vals = deepcopy(DEFAULT_EXEC_VALUES) if not self.is_split_spatially: exec_vals.pop("nodes", None) - if self.max_workers_in_func_signature: - exec_vals["max_workers"] = ( - self.REQUIRED_TAG - if self.max_workers_required - else self.max_workers_param.default - ) + for param in EXTRA_EXEC_PARAMS: + if self._param_in_func_signature(param): + exec_vals[param] = ( + self.REQUIRED_TAG + if self.param_required(param) + else self._param_value(param).default + ) return exec_vals @property def exec_control_doc(self): """str: Execution_control documentation.""" nodes_doc = NODES_DOC if self.is_split_spatially else "" - return EXEC_CONTROL_DOC.format(n=nodes_doc, mw=self._max_workers_doc) + return EXEC_CONTROL_DOC.format( + n=nodes_doc, eep=self._extra_exec_param_doc + ) @property - def _max_workers_doc(self): - """str: `MW_DOC` formatted with the info from the input func.""" - param = self.param_docs.get("max_workers") + def _extra_exec_param_doc(self): + """str: Docstring formatted with the info from the input func.""" + return "".join( + [ + self._format_extra_exec_param_doc(param_name) + for param_name in EXTRA_EXEC_PARAMS + ] + ) + + def _format_extra_exec_param_doc(self, param_name): + """Format extra exec control parameters""" + param = self.param_docs.get(param_name) try: - return MW_DOC.format(type=param.type, desc=" ".join(param.desc)) + return EXTRA_EXEC_PARAM_DOC.format( + name=param_name, type=param.type, desc=" ".join(param.desc) + ) except AttributeError: pass - if self.max_workers_in_func_signature: - return MW_DOC.format( - type=( - "(int)" if self.max_workers_required else "(int, optional)" - ), - desc=( - "Maximum number of parallel workers run on each node." - "Default is `None`, which uses all available cores." - ), + if self._param_in_func_signature(param_name): + if self.param_required(param_name): + param_type = "int" + default_text = "" + else: + default_val = self._param_value(param_name).default + if default_val is None: + param_type = "int, optional" + else: + param_type = f"{type(default_val).__name__}, optional" + default_text = f"By default, ``{default_val}``." 
+            return EXTRA_EXEC_PARAM_DOC.format(
+                name=param_name,
+                type=param_type,
+                desc=" ".join([EXTRA_EXEC_PARAMS[param_name], default_text]),
             )
-        return ""
 
-    @property
-    def max_workers_in_func_signature(self):
-        """bool: `True` if "max_workers" is a param of the input function."""
-        return self.max_workers_param is not None
-
-    @property
-    def max_workers_param(self):
-        """bool: `True` if "max_workers" is a required parameter of `func`."""
-        for sig in self.signatures:
-            for name in sig.parameters:
-                if name == "max_workers":
-                    return sig.parameters["max_workers"]
-        return None
-
-    @property
-    def max_workers_required(self):
-        """bool: `True` if "max_workers" is a required parameter of `func`."""
-        param = self.max_workers_param
-        if param is None:
-            return False
-        return param.default is param.empty
-
     @property
     def required_args(self):
         """set: Required parameters of the input function."""
@@ -403,7 +470,7 @@ def parameter_help(self):
             for p in doc["Parameters"]
             if p.name in self.template_config
         ]
-        return str(param_only_doc)
+        return "\n".join(_format_lines(str(param_only_doc).split("\n")))
 
     @property
     def extended_summary(self):
@@ -426,19 +493,27 @@ def config_help(self, command_name):
         str
             Help string for the config file.
         """
-        return CONFIG_DOC.format(
+        if _is_sphinx_build():
+            sample_config = SAMPLE_CONFIG_DOC.format(
+                template_json_config=config_as_str_for_docstring(
+                    self.template_config, config_type=ConfigType.JSON
+                ),
+                template_yaml_config=config_as_str_for_docstring(
+                    self.template_config, config_type=ConfigType.YAML
+                ),
+                template_toml_config=config_as_str_for_docstring(
+                    self.template_config, config_type=ConfigType.TOML
+                ),
+            )
+        else:
+            sample_config = ""
+
+        doc = CONFIG_DOC.format(
             name=command_name,
-            template_json_config=config_as_str_for_docstring(
-                self.template_config, config_type=ConfigType.JSON
-            ),
-            template_yaml_config=config_as_str_for_docstring(
-                self.template_config, config_type=ConfigType.YAML
-            ),
-            template_toml_config=config_as_str_for_docstring(
-                self.template_config, config_type=ConfigType.TOML
-            ),
+            sample_config=sample_config,
             docstring=self.parameter_help,
         )
+        return _cli_formatted(doc)
 
     def command_help(self, command_name):
         """Generate a help string for a command.
@@ -454,28 +529,78 @@
         str
             Help string for the command.
         """
-        return COMMAND_DOC.format(
-            name=command_name, desc=self.extended_summary
-        )
+        doc = COMMAND_DOC.format(name=command_name, desc=self.extended_summary)
+        return _cli_formatted(doc)
+
+    @lru_cache(maxsize=16)
+    def _param_value(self, param):
+        """Extract parameter if it exists in signature"""
+        for sig in self.signatures:
+            for name in sig.parameters:
+                if name == param:
+                    return sig.parameters[param]
+        return None
+
+    @lru_cache(maxsize=16)
+    def _param_in_func_signature(self, param):
+        """`True` if `param` is a param of the input function."""
+        return self._param_value(param) is not None
+
+    @lru_cache(maxsize=16)
+    def param_required(self, param):
+        """Check whether a parameter is a required input for the run function.
+
+        Parameters
+        ----------
+        param : str
+            Name of parameter to check.
+
+        Returns
+        -------
+        bool
+            ``True`` if `param` is a required parameter of `func`.
+ """ + param = self._param_value(param) + if param is None: + return False + return param.default is param.empty -def _pipeline_command_help(pipeline_config): +def _main_command_help(prog_name, commands): + """Generate main command help from commands input.""" + cli_help_str = "\n\n ".join( + [f"$ {prog_name} --help"] + + [f"$ {prog_name} {command.name} --help" for command in commands] + ) + return MAIN_DOC.format(name=prog_name, cli_help_str=cli_help_str) + + +def _pipeline_command_help(pipeline_config): # pragma: no cover """Generate pipeline command help from a sample config.""" - format_inputs = {} + if not _is_sphinx_build(): + return _cli_formatted(PIPELINE_CONFIG_DOC.format(sample_config="")) + template_names = [ "template_json_config", "template_yaml_config", "template_toml_config", ] - for name, c_type in zip(template_names, CONFIG_TYPES): - format_inputs[name] = config_as_str_for_docstring( - pipeline_config, config_type=c_type - ) - return PIPELINE_CONFIG_DOC.format(**format_inputs) + sample_config = SAMPLE_CONFIG_DOC.format( + **_format_dict(pipeline_config, template_names) + ) + doc = PIPELINE_CONFIG_DOC.format(sample_config=sample_config) + return _cli_formatted(doc) -def _batch_command_help(): +def _batch_command_help(): # pragma: no cover """Generate batch command help from a sample config.""" + if not _is_sphinx_build(): + doc = BATCH_CONFIG_DOC.format( + sample_config="", batch_args_dict="", batch_files="" + ) + return _cli_formatted(doc) + + format_inputs = {} template_config = { "pipeline_config": CommandDocumentation.REQUIRED_TAG, "sets": [ @@ -491,6 +616,16 @@ def _batch_command_help(): }, ], } + template_names = [ + "template_json_config", + "template_yaml_config", + "template_toml_config", + ] + + format_inputs["sample_config"] = SAMPLE_CONFIG_DOC.format( + **_format_dict(template_config, template_names) + ) + sample_args_dict = { "args": { "input_constant_1": [18.02, 19.04], @@ -501,56 +636,88 @@ def _batch_command_help(): ], } } - sample_files = {"files": ["./config_run.yaml", "./config_analyze.json"]} - format_inputs = {} - template_names = [ - "template_json_config", - "template_yaml_config", - "template_toml_config", - ] - for name, c_type in zip(template_names, CONFIG_TYPES): - format_inputs[name] = config_as_str_for_docstring( - template_config, config_type=c_type - ) - sample_args_names = [ "sample_json_args_dict", "sample_yaml_args_dict", "sample_toml_args_dict", ] - for name, c_type in zip(sample_args_names, CONFIG_TYPES): - format_inputs[name] = config_as_str_for_docstring( - sample_args_dict, - config_type=c_type, - num_spaces=20 if "json" in name else 24, - ) - if "json" in name: - format_inputs[name] = "\n".join( - format_inputs[name].split("\n")[1:-1] - ).lstrip() + format_inputs["batch_args_dict"] = _BATCH_ARGS_DICT.format( + **_format_dict(sample_args_dict, sample_args_names, batch_docs=True) + ) + sample_files = {"files": ["./config_run.yaml", "./config_analyze.json"]} sample_files_names = [ "sample_json_files", "sample_yaml_files", "sample_toml_files", ] - for name, c_type in zip(sample_files_names, CONFIG_TYPES): - format_inputs[name] = config_as_str_for_docstring( - sample_files, + format_inputs["batch_files"] = _BATCH_FILES.format( + **_format_dict(sample_files, sample_files_names, batch_docs=True) + ) + + doc = BATCH_CONFIG_DOC.format(**format_inputs) + return _cli_formatted(doc) + + +def _format_dict(sample, names, batch_docs=False): # pragma: no cover + """Format a sample into a documentation config""" + configs = {} + for name, c_type in 
zip(names, CONFIG_TYPES):
+        configs[name] = config_as_str_for_docstring(
+            sample,
             config_type=c_type,
-            num_spaces=20 if "json" in name else 24,
+            num_spaces=(20 if "json" in name else 24) if batch_docs else 12,
         )
-        if "json" in name:
-            format_inputs[name] = "\n".join(
-                format_inputs[name].split("\n")[1:-1]
-            ).lstrip()
-
-    return BATCH_CONFIG_DOC.format(**format_inputs)
+        if batch_docs and "json" in name:
+            configs[name] = "\n".join(configs[name].split("\n")[1:-1]).lstrip()
+    return configs
 
 
 def _as_functions(functions):
-    """Yield items from input, converting all classes to their init funcs"""
+    """Yield items from input, converting all classes to their init methods"""
     for func in functions:
         if isclass(func):
             func = func.__init__
         yield func
+
+
+def _line_needs_newline(line):
+    """Determine whether a newline should be added to the current line"""
+    if any(
+        f":{key}:" in line
+        for key in chain(DEFAULT_EXEC_VALUES, EXTRA_EXEC_PARAMS)
+    ):
+        return True
+    if not line.startswith("    "):
+        return True
+    if len(line) <= 4:
+        return True
+    if line[4] == " ":
+        return True
+    return False
+
+
+def _format_lines(lines):
+    """Format docs into longer lines of text for easier wrapping in CLI"""
+    new_lines = [lines[0]]
+    current_line = []
+    for line in lines[1:]:
+        if _line_needs_newline(line):
+            current_line = " ".join(current_line)
+            if current_line:
+                line = "\n".join([current_line, line])
+            new_lines.append(line)
+            current_line = []
+        else:
+            if current_line:
+                line = line.lstrip()
+            current_line.append(line)
+
+    return new_lines
+
+
+def _cli_formatted(doc):
+    """Apply minor formatting changes for strings when displaying to CLI"""
+    if not _is_sphinx_build():
+        doc = doc.replace("``", "`").replace("{{", "{").replace("}}", "}")
+    return doc
diff --git a/gaps/cli/execution.py b/gaps/cli/execution.py
index 537ddee0..eb0ba910 100644
--- a/gaps/cli/execution.py
+++ b/gaps/cli/execution.py
@@ -61,7 +61,7 @@ def kickoff_job(ctx, cmd, exec_kwargs):
     exec_kwargs = _filter_exec_kwargs(
         exec_kwargs, hardware_option.manager.make_script_str, hardware_option
     )
-    _kickoff_hpc_job(ctx, cmd, **exec_kwargs)
+    _kickoff_hpc_job(ctx, cmd, hardware_option, **exec_kwargs)
 
 
 def _filter_exec_kwargs(kwargs, func, hardware_option):
@@ -74,7 +74,8 @@
     if extra_keys:
         msg = (
             f"Found extra keys in 'execution_control'! The following "
-            f"inputs will be ignored: {extra_keys}"
+            f"inputs will be ignored: {extra_keys}. To silence this warning, "
+            "please remove the extra keys from the 'execution_control' block."
) warn(msg, gapsWarning) @@ -114,7 +115,8 @@ def _kickoff_local_job(ctx, cmd): return name = ctx.obj["NAME"] - logger.info("Running locally with job name %r.", name) + command = ctx.obj["COMMAND_NAME"] + logger.info("Running %r locally with job name %r.", command, name) logger.debug("Submitting the following command:\n%s", cmd) Status.mark_job_as_submitted( ctx.obj["OUT_DIR"], @@ -136,14 +138,15 @@ def _kickoff_local_job(ctx, cmd): logger.info(msg) -def _kickoff_hpc_job(ctx, cmd, **kwargs): +def _kickoff_hpc_job(ctx, cmd, hardware_option, **kwargs): """Run a job (command) on the HPC.""" if not _should_run(ctx): return name = ctx.obj["NAME"] - logger.info("Running on HPC with job name %r.", name) + command = ctx.obj["COMMAND_NAME"] + logger.info("Running %r on HPC with job name %r.", command, name) logger.debug("Submitting the following command:\n%s", cmd) out = ctx.obj["MANAGER"].submit(name, cmd=cmd, **kwargs)[0] id_msg = f" (Job ID #{out})" if out else "" @@ -156,7 +159,7 @@ def _kickoff_hpc_job(ctx, cmd, **kwargs): replace=True, job_attrs={ StatusField.JOB_ID: out, - StatusField.HARDWARE: HardwareOption.EAGLE, + StatusField.HARDWARE: hardware_option, StatusField.QOS: kwargs.get("qos") or QOSOption.UNSPECIFIED, StatusField.JOB_STATUS: StatusOption.SUBMITTED, StatusField.TIME_SUBMITTED: dt.datetime.now().strftime(DT_FMT), @@ -175,7 +178,10 @@ def _should_run(ctx): job_name=name, subprocess_manager=ctx.obj.get("MANAGER"), ) - if status == StatusOption.SUCCESSFUL: + if status == StatusOption.NOT_SUBMITTED: + return True + + if status in {StatusOption.SUCCESSFUL, StatusOption.COMPLETE}: msg = ( f"Job {name!r} is successful in status json found in {out_dir!r}, " f"not re-running." diff --git a/gaps/cli/pipeline.py b/gaps/cli/pipeline.py index 5288d8d1..a1c16e2f 100644 --- a/gaps/cli/pipeline.py +++ b/gaps/cli/pipeline.py @@ -6,15 +6,18 @@ import sys import logging from pathlib import Path +from warnings import warn import click +import psutil from gaps import Pipeline -from gaps.config import ConfigType +from gaps.config import ConfigType, init_logging_from_config_file from gaps.cli.documentation import _pipeline_command_help from gaps.cli.command import _WrappedCommand -from gaps.status import Status +from gaps.status import Status, StatusField from gaps.exceptions import gapsExecutionError +from gaps.warnings import gapsWarning logger = logging.getLogger(__name__) @@ -59,9 +62,12 @@ def pipeline(ctx, config_file, cancel, monitor, background=False): config_file = config_file[0] + init_logging_from_config_file(config_file, background=background) + if cancel: Pipeline.cancel_all(config_file) return + if background: if not _can_run_background(): msg = ( @@ -72,6 +78,20 @@ def pipeline(ctx, config_file, cancel, monitor, background=False): ctx.obj["LOG_STREAM"] = False _kickoff_background(config_file) + project_dir = str(Path(config_file).parent.expanduser().resolve()) + status = Status(project_dir).update_from_all_job_files(purge=False) + monitor_pid = status.get(StatusField.MONITOR_PID) + if monitor_pid is not None and psutil.pid_exists(monitor_pid): + msg = ( + f"Another pipeline in {project_dir!r} is running on monitor PID: " + f"{monitor_pid}. Not starting a new pipeline execution." 
+        )
+        warn(msg, gapsWarning)
+        return
+
+    if monitor:
+        Status.record_monitor_pid(Path(config_file).parent, os.getpid())
+
     Pipeline.run(config_file, monitor=monitor)
 
@@ -127,7 +147,11 @@ def pipeline_command(template_config):
         context_settings=None,
         callback=pipeline,
         params=params,
-        help="""Execute multiple steps in an analysis pipeline""",
+        help=(
+            "Execute multiple steps in an analysis pipeline.\n\n"
+            "The general structure for calling this CLI command is given "
+            "below (add ``--help`` to print help info to the terminal)."
+        ),
         epilog=None,
         short_help=None,
         options_metavar="[OPTIONS]",
diff --git a/gaps/cli/preprocessing.py b/gaps/cli/preprocessing.py
index 1ba2cd94..dc5c4901 100644
--- a/gaps/cli/preprocessing.py
+++ b/gaps/cli/preprocessing.py
@@ -71,13 +71,13 @@
         out filepath input from the user.
     command_name : str
         Name of the command being run. This is used to parse the
-        pipeline status for output files if "collect_pattern"="PIPELINE"
-        in the input `config`.
+        pipeline status for output files if
+        ``collect_pattern="PIPELINE"`` in the input `config`.
     collect_pattern : str | list | dict, optional
-        Unix-style /filepath/pattern*.h5 representing the files to be
-        collected into a single output HDF5 file. If no output file path
-        is specified (i.e. this input is a single pattern or a list of
-        patterns), the output file path will be inferred from the
+        Unix-style ``/filepath/pattern*.h5`` representing the files to
+        be collected into a single output HDF5 file. If no output file
+        path is specified (i.e. this input is a single pattern or a list
+        of patterns), the output file path will be inferred from the
         pattern itself (specifically, the wildcard will be removed and
         the result will be the output file path). If a list of patterns
         is provided, each pattern will be collected into a
@@ -86,10 +86,10 @@
         output file (including the filename itself; relative paths are
         allowed) and the values are patterns representing the files that
         should be collected into the output file. If running a collect
-        job as part of a pipeline, this input can be set to "PIPELINE",
-        which will parse the output of the previous step and generate
-        the input file pattern and output file name automatically.
-        By default, ``"PIPELINE"``.
+        job as part of a pipeline, this input can be set to
+        ``"PIPELINE"``, which will parse the output of the previous step
+        and generate the input file pattern and output file name
+        automatically. By default, ``"PIPELINE"``.
 
     Returns
     -------
diff --git a/gaps/cli/status.py b/gaps/cli/status.py
index 9a4e42c7..e7e05834 100644
--- a/gaps/cli/status.py
+++ b/gaps/cli/status.py
@@ -9,7 +9,7 @@
 import psutil
 import pandas as pd
 from colorama import init, Fore, Style
-from tabulate import tabulate
+from tabulate import tabulate, SEPARATING_LINE
 
 from gaps.status import (
     DT_FMT,
@@ -24,6 +24,7 @@
 from gaps.cli.command import _WrappedCommand
 
 
+JOB_STATUS_COL = StatusField.JOB_STATUS.value
 FAILURE_STRINGS = ["failure", "fail", "failed", "f"]
 RUNNING_STRINGS = ["running", "run", "r"]
 SUBMITTED_STRINGS = ["submitted", "submit", "sb", "pending", "pend", "p"]
@@ -48,6 +49,9 @@
 Display the status of a project FOLDER. By default, the status of the
 current working directory is displayed.
+
+The general structure for calling this CLI command is given below
+(add ``--help`` to print help info to the terminal).
""" @@ -86,20 +90,79 @@ def _filter_df_for_status(df, status_request): return df.copy() -def _color_print(df, print_folder, commands, status): - """Color the status portion of the print out.""" +def _calculate_runtime_stats(df): + """Calculate df runtime statistics""" - def color_string(string): - if string == StatusOption.FAILED: - string = f"{Fore.RED}{string}{Style.RESET_ALL}" - elif string == StatusOption.SUCCESSFUL: - string = f"{Fore.GREEN}{string}{Style.RESET_ALL}" - elif string == StatusOption.RUNNING: - string = f"{Fore.BLUE}{string}{Style.RESET_ALL}" - else: - string = f"{Fore.YELLOW}{string}{Style.RESET_ALL}" - return string + run_times_seconds = df[StatusField.RUNTIME_SECONDS] + if run_times_seconds.isna().any(): + return pd.DataFrame() + runtime_stats = pd.DataFrame(run_times_seconds.astype(float).describe()) + runtime_stats = runtime_stats.T[["min", "mean", "50%", "max"]] + runtime_stats = runtime_stats.rename(columns={"50%": "median"}) + # runtime_stats = runtime_stats.rename( + # columns={ + # col: f"{Fore.MAGENTA}{col}{Style.RESET_ALL}" + # for col in runtime_stats.columns + # } + # ) + runtime_stats.index = ["Node runtime"] + runtime_stats = runtime_stats.T + runtime_stats["Node runtime"] = runtime_stats["Node runtime"].apply( + _elapsed_time_as_str + ) + return pd.DataFrame(runtime_stats).reset_index() + + +def _calculate_walltime(df): + """Calculate total project walltime""" + all_jobs_failed = (df[StatusField.JOB_STATUS] == StatusOption.FAILED).all() + all_end_times_missing = df[StatusField.TIME_END].isna().all() + if all_jobs_failed and all_end_times_missing: + return 0 + + start_time = df[StatusField.TIME_SUBMITTED].fillna( + dt.datetime.now().strftime(DT_FMT) + ) + start_time = pd.to_datetime(start_time, format=DT_FMT).min() + + end_time = df[StatusField.TIME_END].fillna( + dt.datetime.now().strftime(DT_FMT) + ) + end_time = pd.to_datetime(end_time, format=DT_FMT).max() + return (end_time - start_time).total_seconds() + + +def _calculate_aus(df): + """Calculate the number of AU's spent by jobs""" + run_times_seconds = df[StatusField.RUNTIME_SECONDS] + + hardware_charge_factors = df[StatusField.HARDWARE].apply( + _extract_qos_charge_factor, enum_class=HardwareOption + ) + qos_charge_factors = df[StatusField.QOS].apply( + _extract_qos_charge_factor, enum_class=QOSOption + ) + aus_used = ( + run_times_seconds / 3600 * hardware_charge_factors * qos_charge_factors + ) + return int(aus_used.sum()) + +def _color_string(string): + """Color string value based on status option""" + if string == StatusOption.FAILED: + string = f"{Fore.RED}{string}{Style.RESET_ALL}" + elif string == StatusOption.SUCCESSFUL: + string = f"{Fore.GREEN}{string}{Style.RESET_ALL}" + elif string == StatusOption.RUNNING: + string = f"{Fore.BLUE}{string}{Style.RESET_ALL}" + else: + string = f"{Fore.YELLOW}{string}{Style.RESET_ALL}" + return string + + +def _print_intro(print_folder, commands, status, monitor_pid): + """Print intro including project folder and command/status filters.""" extras = [] commands = ", ".join(commands or []) if commands: @@ -113,52 +176,127 @@ def color_string(string): else: extras = "" - pid = df.monitor_pid - total_runtime_seconds = df.total_runtime_seconds - total_aus_used = int(df.total_aus_used) - walltime = df.walltime - name = f"\n{Fore.CYAN}{print_folder}{extras}{Style.RESET_ALL}:" - job_status = StatusField.JOB_STATUS - df[job_status.value] = df[job_status].apply(color_string) + print(f"\n{Fore.CYAN}{print_folder}{extras}{Style.RESET_ALL}:") + if monitor_pid is not 
None and psutil.pid_exists(monitor_pid): + print(f"{Fore.MAGENTA}MONITOR PID: {monitor_pid}{Style.RESET_ALL}") + + +def _print_df(df): + """Print main status body (table of job statuses)""" + df[JOB_STATUS_COL] = df[JOB_STATUS_COL].apply(_color_string) df = df.fillna("--") + + pdf = pd.concat([df, pd.DataFrame({JOB_STATUS_COL: [SEPARATING_LINE]})]) pdf = tabulate( - df, + pdf, showindex=True, # cspell:disable-line headers=df.columns, tablefmt="simple", # cspell:disable-line disable_numparse=[3], # cspell:disable-line ) - print(name) - if pid is not None and psutil.pid_exists(pid): - print(f"{Fore.MAGENTA}MONITOR PID: {pid}{Style.RESET_ALL}") + pdf = pdf.split("\n") + pdf[-1] = pdf[-1].replace(" ", "-") + pdf = "\n".join(pdf) print(pdf) - if total_runtime_seconds is not None: - runtime_str = ( - f"Total Runtime: {_elapsed_time_as_str(total_runtime_seconds)}" - ) - # TODO: make prettier - # divider = "".join(["-"] * len(runtime_str)) - # divider = "".join(["~"] * 3) - divider = "" - print(divider) - print(f"{Style.BRIGHT}{runtime_str}{Style.RESET_ALL}") - if walltime > 2: - walltime_str = ( - f"Total project time (including queue): " - f"{_elapsed_time_as_str(walltime)}" - ) - print(f"{Style.BRIGHT}{walltime_str}{Style.RESET_ALL}") - if total_aus_used > 0: - au_str = f"Total AUs spent: {total_aus_used:,}" - print(f"{Style.BRIGHT}{au_str}{Style.RESET_ALL}") - print( - f"{Style.BRIGHT}**Statistics only include shown jobs (excluding " - f"any previous runs)**{Style.RESET_ALL}" - ) - print() -def main_monitor(folder, commands, status, include): +def _print_job_status_statistics(df): + """Print job status statistics.""" + statistics_str = f"Total number of jobs: {df.shape[0]}" + counts = pd.DataFrame(df[JOB_STATUS_COL].value_counts()).reset_index() + counts = tabulate( + # pylint: disable=unsubscriptable-object + counts[["count", JOB_STATUS_COL]].values, + showindex=False, # cspell:disable-line + headers=counts.columns, + tablefmt="simple", # cspell:disable-line + disable_numparse=[1], # cspell:disable-line + ) + counts = "\n".join( + [f" {substr.lstrip()}" for substr in counts.split("\n")[2:]] + ) + print(f"{Style.BRIGHT}{statistics_str}{Style.RESET_ALL}") + print(counts) + + +def _print_runtime_stats(runtime_stats, total_runtime_seconds): + """Print node runtime statistics""" + + runtime_str = ( + f"Total node runtime: " + f"{_elapsed_time_as_str(total_runtime_seconds)}" + ) + print(f"{Style.BRIGHT}{runtime_str}{Style.RESET_ALL}") + if runtime_stats.empty: + return + + runtime_stats = tabulate( + runtime_stats, + showindex=False, # cspell:disable-line + headers=runtime_stats.columns, + tablefmt="simple", # cspell:disable-line + ) + runtime_stats = "\n".join( + [f" {substr}" for substr in runtime_stats.split("\n")[2:]] + ) + print(runtime_stats) + + +def _print_au_usage(total_aus_used): + """Print the job AU usage.""" + if total_aus_used <= 0: + return + + au_str = f"Total AUs spent: {total_aus_used:,}" + print(f"{Style.BRIGHT}{au_str}{Style.RESET_ALL}") + + +def _print_total_walltime(walltime): + """Print the total project walltime.""" + if walltime <= 2: + return + walltime_str = ( + f"Total project wall time (including queue and downtime " + f"between steps): {_elapsed_time_as_str(walltime)}" + ) + print(f"{Style.BRIGHT}{walltime_str}{Style.RESET_ALL}") + + +def _print_disclaimer(): + """Print disclaimer about statistics.""" + print( + f"{Style.BRIGHT}**Statistics only include shown jobs (excluding " + f"any previous runs or other steps)**{Style.RESET_ALL}" + ) + + +def _color_print( + df, + 
print_folder, + commands, + status, + walltime, + runtime_stats, + total_aus_used, + monitor_pid=None, + total_runtime_seconds=None, +): + """Color the status portion of the print out.""" + + _print_intro(print_folder, commands, status, monitor_pid) + _print_df(df) + _print_job_status_statistics(df) + + if total_runtime_seconds is None: + return + + _print_runtime_stats(runtime_stats, total_runtime_seconds) + _print_au_usage(total_aus_used) + _print_total_walltime(walltime) + _print_disclaimer() + + +def main_monitor(folder, commands, status, include, recursive): """Run the appropriate monitor functions for a folder. Parameters @@ -171,16 +309,22 @@ def main_monitor(folder, commands, status, include): Container with the statuses to display. include : container of str Container of extra keys to pull from the status files. + recursive : bool + Option to perform recursive search of directories. """ init() - folder = Path(folder).expanduser().resolve() - for directory in chain([folder], folder.rglob("*")): + folders = [Path(folder).expanduser().resolve()] + if recursive: + folders = chain(folders, folders[0].rglob("*")) + + for directory in folders: if not directory.is_dir(): continue pipe_status = Status(directory) if not pipe_status: + print(f"No non-empty status file found in {str(directory)!r}. ") continue include_with_runtime = list(include) + [StatusField.RUNTIME_SECONDS] @@ -190,35 +334,28 @@ def main_monitor(folder, commands, status, include): if status: df = _filter_df_for_status(df, status) - run_times_seconds = df[StatusField.RUNTIME_SECONDS] - hardware_charge_factors = df[StatusField.HARDWARE].apply( - _extract_qos_charge_factor, enum_class=HardwareOption - ) - qos_charge_factors = df[StatusField.QOS].apply( - _extract_qos_charge_factor, enum_class=QOSOption - ) - aus_used = ( - run_times_seconds - / 3600 - * hardware_charge_factors - * qos_charge_factors - ) - df = df[list(df.columns)[:-1]] - df.monitor_pid = pipe_status.get(StatusField.MONITOR_PID) - df.total_runtime_seconds = run_times_seconds.sum() - df.total_aus_used = aus_used.sum() + if df.empty: + print( + f"No status data found to display for {str(directory)!r}. " + "Please check your filters and try again." + ) + continue - start_time = df[StatusField.TIME_SUBMITTED].fillna( - dt.datetime.now().strftime(DT_FMT) - ) - start_time = pd.to_datetime(start_time, format=DT_FMT).min() + runtime_stats = _calculate_runtime_stats(df) + aus_used = _calculate_aus(df) + walltime = _calculate_walltime(df) - end_time = df[StatusField.TIME_END].fillna( - dt.datetime.now().strftime(DT_FMT) + _color_print( + df[list(df.columns)[:-1]].copy(), + directory.name, + commands, + status, + walltime, + runtime_stats, + total_aus_used=aus_used, + monitor_pid=pipe_status.get(StatusField.MONITOR_PID), + total_runtime_seconds=df[StatusField.RUNTIME_SECONDS].sum(), ) - end_time = pd.to_datetime(end_time, format=DT_FMT).max() - df.walltime = (end_time - start_time).total_seconds() - _color_print(df, directory.name, commands, status) def status_command(): @@ -261,6 +398,13 @@ def status_command(): "this option (e.g. :code:`-i key1 -i key2 ...`) By default, no " "extra keys are displayed.", ), + click.Option( + param_decls=["--recursive", "-r"], + is_flag=True, + help="Option to perform a recursive search of directories " + "(starting with the input directory). 
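
For reference, the AU accounting in ``_calculate_aus`` above reduces to
node-hours times two charge factors; a worked sketch with made-up factor
values (real factors come from ``HardwareOption`` and ``QOSOption``)::

    runtime_seconds = 9_000       # 2.5 node-hours of runtime
    hardware_charge_factor = 3.0  # illustrative value
    qos_charge_factor = 2.0       # illustrative value
    aus = runtime_seconds / 3600 * hardware_charge_factor * qos_charge_factor
    assert int(aus) == 15
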
The status of every nested "
+            "directory is reported.",
+        ),
     ]
 
     return _WrappedCommand(
diff --git a/gaps/cli/templates.py b/gaps/cli/templates.py
index a1893c46..52c241a5 100644
--- a/gaps/cli/templates.py
+++ b/gaps/cli/templates.py
@@ -16,6 +16,7 @@
 logger = logging.getLogger(__name__)
 
 
+# pylint: disable=redefined-builtin
 @click.pass_context
 def _make_template_config(ctx, commands, type, configs):
@@ -97,9 +98,13 @@ def template_command(template_configs):
             configs=template_configs,
         ),
         params=params,
-        help="Generate template config files for requested COMMANDS. If no "
-        "COMMANDS are given, config files for the entire pipeline are "
-        "generated.",
+        help=(
+            "Generate template config files for requested COMMANDS. If no "
+            "COMMANDS are given, config files for the entire pipeline are "
+            "generated.\n\nThe general structure for calling this CLI "
+            "command is given below (add ``--help`` to print help info to "
+            "the terminal)."
+        ),
         epilog=None,
         short_help=None,
         options_metavar="",
diff --git a/gaps/collection.py b/gaps/collection.py
index 8f5e925f..086a651f 100644
--- a/gaps/collection.py
+++ b/gaps/collection.py
@@ -14,7 +14,7 @@
 import pandas as pd
 from rex import Resource, Outputs
-from rex.utilities import log_versions
+from gaps.log import log_versions
 
 from gaps.warnings import gapsCollectionWarning
 from gaps.exceptions import gapsRuntimeError
 from gaps.utilities import project_points_from_container_or_slice
@@ -56,7 +56,7 @@ def __init__(
         gids,
         dataset_in,
         dataset_out=None,
-        mem_util_lim=0.7,
+        memory_utilization_limit=0.7,
         pass_through=False,
     ):
         """
@@ -74,7 +74,7 @@ def __init__(
             Name of dataset into which collected data is to be written.
             If `None` the name of the output dataset is assumed to match
             the dataset input name. By default, `None`.
-        mem_util_lim : float, optional
+        memory_utilization_limit : float, optional
             Memory utilization limit (fractional). This sets how many
             sites will be collected at a time. By default, `0.7`.
         pass_through : bool, optional
@@ -96,7 +96,7 @@ def __init__(
         self._dataset_out = dataset_out
 
         tot_mem = psutil.virtual_memory().total
-        self._mem_avail = mem_util_lim * tot_mem
+        self._mem_avail = memory_utilization_limit * tot_mem
 
         self._axis, self._site_mem_req = self._pre_collect()
 
         logger.debug(
@@ -362,7 +362,7 @@ def collect_dataset(
         gids,
         dataset_in,
         dataset_out=None,
-        mem_util_lim=0.7,
+        memory_utilization_limit=0.7,
         pass_through=False,
     ):
         """Collect a dataset from multiple source files into a single file.
@@ -381,7 +381,7 @@ def collect_dataset(
             Name of dataset into which collected data is to be written.
             If `None` the name of the output dataset is assumed to match
             the dataset input name. By default, `None`.
-        mem_util_lim : float, optional
+        memory_utilization_limit : float, optional
             Memory utilization limit (fractional). This sets how many
             sites will be collected at a time. By default, `0.7`.
         pass_through : bool, optional
@@ -395,7 +395,7 @@ def collect_dataset(
             gids,
             dataset_in,
             dataset_out=dataset_out,
-            mem_util_lim=mem_util_lim,
+            memory_utilization_limit=memory_utilization_limit,
             pass_through=pass_through,
         )
         collector._collect()
@@ -425,7 +425,7 @@ def __init__(
             Flag to purge output HDF5 file if it already exists.
             By default, `False`.
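
The ``memory_utilization_limit`` keyword introduced above bounds how many
sites are collected at a time; a rough sketch of the sizing arithmetic
(the per-site memory requirement below is an assumed figure)::

    import psutil

    memory_utilization_limit = 0.7  # the keyword's default value
    mem_avail = memory_utilization_limit * psutil.virtual_memory().total
    site_mem_req = 200 * 1024**2  # assumed bytes required per site
    sites_per_chunk = max(1, int(mem_avail // site_mem_req))
    print(f"Collecting {sites_per_chunk} sites at a time")
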
""" - log_versions(logger) + log_versions() self.h5_out = Path(h5_file) self.collect_pattern = collect_pattern if clobber and self.h5_out.exists(): @@ -623,7 +623,7 @@ def collect( self, dataset_in, dataset_out=None, - mem_util_lim=0.7, + memory_utilization_limit=0.7, pass_through=False, ): """Collect a dataset from h5_dir to h5_file @@ -637,7 +637,7 @@ def collect( Name of dataset into which collected data is to be written. If `None` the name of the output dataset is assumed to match the dataset input name. By default, `None`. - mem_util_lim : float + memory_utilization_limit : float Memory utilization limit (fractional). This sets how many sites will be collected at a time. By default, `0.7`. pass_through : bool @@ -670,7 +670,7 @@ def collect( self.gids, dataset_in, dataset_out=dataset_out, - mem_util_lim=mem_util_lim, + memory_utilization_limit=memory_utilization_limit, pass_through=pass_through, ) @@ -687,7 +687,7 @@ def add_dataset( collect_pattern, dataset_in, dataset_out=None, - mem_util_lim=0.7, + memory_utilization_limit=0.7, pass_through=False, ): """Collect and add a dataset to a single HDF5 file. @@ -708,7 +708,7 @@ def add_dataset( Name of dataset into which collected data is to be written. If `None` the name of the output dataset is assumed to match the dataset input name. By default, `None`. - mem_util_lim : float + memory_utilization_limit : float Memory utilization limit (fractional). This sets how many sites will be collected at a time. By default, `0.7`. pass_through : bool @@ -735,7 +735,7 @@ def add_dataset( collector.collect( dataset_in, dataset_out=dataset_out, - mem_util_lim=mem_util_lim, + memory_utilization_limit=memory_utilization_limit, pass_through=pass_through, ) diff --git a/gaps/config.py b/gaps/config.py index f0a5b06a..a5a6d397 100644 --- a/gaps/config.py +++ b/gaps/config.py @@ -9,6 +9,7 @@ import json import yaml import toml +import pyjson5 from gaps.log import init_logger from gaps.utilities import CaseInsensitiveEnum, resolve_path @@ -17,6 +18,18 @@ _CONFIG_HANDLER_REGISTRY = {} +# pylint: disable=too-few-public-methods +class _JSON5Formatter: + """Format input JSON5 data with indentation.""" + + def __init__(self, data): + self.data = data + + def _format_as_json(self): + """Format the data input with as string with indentation.""" + return json.dumps(self.data, indent=4) + + class Handler(ABC): """ABC for configuration file handler.""" @@ -84,6 +97,36 @@ def loads(cls, config_str): return json.loads(config_str) +# pylint: disable=no-member +class JSON5Handler(Handler): + """JSON5 config file handler.""" + + FILE_EXTENSION = "json5" + + @classmethod + def dump(cls, config, stream): + """Write the config to a stream (JSON5 file).""" + return pyjson5.encode_io( + _JSON5Formatter(config), + stream, + supply_bytes=False, + tojson="_format_as_json", + ) + + @classmethod + def dumps(cls, config): + """Convert the config to a JSON5 string.""" + return pyjson5.encode( + _JSON5Formatter(config), + tojson="_format_as_json", + ) + + @classmethod + def loads(cls, config_str): + """Parse the JSON5 string into a config dictionary.""" + return pyjson5.decode(config_str, maxdepth=-1) + + class YAMLHandler(Handler): """YAML config file handler.""" @@ -255,26 +298,34 @@ def resolve_all_paths(container, base_dir): return container -def init_logging_from_config(config): +def init_logging_from_config_file(config_file, background=False): """Init logging, taking care of legacy rex-style kwargs. 
Parameters ---------- - config : dict - Dictionary which may or may not contain a "logging" key. If key - not found, no further action is taken. If key is found, the - value is expected to be a dictionary of keyword-argument pairs + config_file : path-like + Path to a config file parsable as a dictionary which may or may + not contain a "logging" key. If key not found, no further action + is taken. If key is found, the value is expected to be a + dictionary of keyword-argument pairs to :func:`gaps.log.init_logger`. rex-style keys ("log_level", "log_file", "log_format") are allowed. The values of these inputs are used to initialize a gaps logger. + background : bool, optional + Optional flag to specify background job initialization. If + ``True``, then stream output is disabled. By default, ``False``. """ - + config = load_config(config_file) if "logging" not in config: return + kwargs = config["logging"] kwarg_map = {"log_level": "level", "log_file": "file", "log_format": "fmt"} for legacy_kwarg, new_kwarg in kwarg_map.items(): if legacy_kwarg in kwargs: kwargs[new_kwarg] = kwargs.pop(legacy_kwarg) + if background: + kwargs["stream"] = False + init_logger(**kwargs) diff --git a/gaps/hpc.py b/gaps/hpc.py index 7649585d..63d636ea 100644 --- a/gaps/hpc.py +++ b/gaps/hpc.py @@ -102,6 +102,10 @@ def queue(self): return self._queue + def reset_query_cache(self): + """Reset the query dict cache so that hardware is queried again.""" + self._queue = None + def check_status_using_job_id(self, job_id): """Check the status of a job using the HPC queue and job ID. @@ -154,7 +158,7 @@ def cancel(self, arg): self.cancel(job_id) elif str(arg).lower() == "all": - self._queue = None + self.reset_query_cache() for job_id in self.queue.keys(): self.cancel(job_id) @@ -256,7 +260,7 @@ def _teardown_submission(self, name, out, err, keep_sh=False): def _mark_job_as_submitted(self, name, out): """Mark job as submitted in the queue.""" - job_id = int(out.split(" ")[-1]) + job_id = int(_job_id_or_out(out).split(" ")[-1]) out = str(job_id) logger.debug("Job %r with id #%s submitted successfully", name, job_id) self._queue[job_id] = { @@ -655,7 +659,7 @@ def submit(cmd, background=False, background_stdout=False): return "", "" stdout, stderr = _subprocess_popen(cmd) - return _job_id_or_out(stdout), stderr + return stdout, stderr def format_walltime(hours=None): diff --git a/gaps/legacy.py b/gaps/legacy.py index c4be6613..bbeaa731 100644 --- a/gaps/legacy.py +++ b/gaps/legacy.py @@ -152,6 +152,34 @@ def make_job_file(status_dir, module, job_name, attrs): attrs=attrs, ) + @staticmethod + def _update_job_status_from_hardware(job_data, hardware_status_retriever): + """Update job status from HPC hardware if needed.""" + + # init defaults in case job/command not in status file yet + job_status = job_data.get(gaps.status.StatusField.JOB_STATUS, None) + job_id = job_data.get(gaps.status.StatusField.JOB_ID, None) + job_hardware = job_data.get(gaps.status.StatusField.HARDWARE, None) + + # get job status from hardware + current = hardware_status_retriever[job_id, job_hardware] + + # No current status and job was not successful: failed! + if ( + current is None + and job_status != gaps.status.StatusOption.SUCCESSFUL + ): + job_data[ + gaps.status.StatusField.JOB_STATUS + ] = gaps.status.StatusOption.FAILED + + # do not overwrite a successful or failed job status. 
+ elif ( + current != job_status + and job_status not in gaps.status.StatusOption.members_as_str() + ): + job_data[gaps.status.StatusField.JOB_STATUS] = current + # pylint: disable=no-member,invalid-name,super-init-not-called class Pipeline(gaps.pipeline.Pipeline): diff --git a/gaps/log.py b/gaps/log.py index 1d483d96..7c1b92bf 100644 --- a/gaps/log.py +++ b/gaps/log.py @@ -20,8 +20,8 @@ def log_versions(): """Log package versions.""" - gaps.logger.debug("Running with gaps version %s", gaps.__version__) - gaps.logger.debug(" - rex version %s", rex.__version__) + gaps.logger.info("Running with gaps version %s", gaps.__version__) + rex.utilities.log_versions(gaps.logger) def log_mem(log_level="DEBUG"): @@ -190,13 +190,16 @@ def print_logging_info(): logger_names = [__name__.split(".", maxsplit=1)[0]] for name in logger_names: print(f"LOGGER: {name!r}") - log_to_debug = logging.getLogger(name) - while log_to_debug is not None: + logger_to_debug = logging.getLogger(name) + while logger_to_debug is not None: print( - f"level: {log_to_debug.level}, name: {log_to_debug.name}," - f"handlers: {log_to_debug.handlers}" + f"level: {logger_to_debug.level}, " + f"name: {logger_to_debug.name}, " + f"handlers: " ) - log_to_debug = log_to_debug.parent + for handler in logger_to_debug.handlers: + print(f"\t +++ {handler.name} - {handler}") + logger_to_debug = logger_to_debug.parent def print_logging_info_all_libraries(): diff --git a/gaps/pipeline.py b/gaps/pipeline.py index 26e900b8..7d3bae95 100644 --- a/gaps/pipeline.py +++ b/gaps/pipeline.py @@ -7,10 +7,9 @@ from pathlib import Path from warnings import warn -from gaps.hpc import SLURM -from gaps.status import Status, StatusOption, StatusField +from gaps.status import Status, StatusOption, StatusField, HardwareOption from gaps.utilities import recursively_update_dict -from gaps.config import load_config, init_logging_from_config +from gaps.config import load_config from gaps.exceptions import ( gapsConfigError, gapsExecutionError, @@ -49,17 +48,9 @@ def __init__(self, pipeline, monitor=True): self._out_dir = self._out_dir.as_posix() config = load_config(pipeline) - - if "pipeline" not in config: - raise gapsConfigError( - 'Could not find required key "pipeline" in the ' - "pipeline config." 
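
The ``_check_pipeline`` helper introduced below validates this input; a
minimal config that passes the new checks might look like the following
(step names and file paths are hypothetical)::

    pipeline_config = {
        "pipeline": [
            {"run": "./config_run.json"},  # one {command: f_config} pair per step
            {"collect-run": "./config_collect_run.json"},
        ]
    }
    # _check_pipeline also verifies that each referenced config file exists
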
- ) - self._run_list = config["pipeline"] + self._run_list = _check_pipeline(config) self._init_status() - init_logging_from_config(config) - def _init_status(self): """Initialize the status json in the output directory.""" status = self.status @@ -86,9 +77,11 @@ def name(self): def _cancel_all_jobs(self): """Cancel all jobs in this pipeline.""" - slurm_manager = SLURM() - for job_id in self.status.job_ids: - slurm_manager.cancel(job_id) + status = self.status + for job_id, hardware in zip(status.job_ids, status.job_hardware): + manager = HardwareOption(hardware).manager + if manager is not None: + manager.cancel(job_id) logger.info("Pipeline job %r cancelled.", self.name) def _main(self): @@ -109,7 +102,11 @@ def _main(self): break time.sleep(1) - self._monitor(step) + try: + self._monitor(step) + except Exception as error: + self._cancel_all_jobs() + raise error else: logger.info("Pipeline job %r is complete.", self.name) @@ -120,6 +117,7 @@ def _monitor(self, step, seconds=5, step_status=StatusOption.RUNNING): while step_status.is_processing: time.sleep(seconds) + HardwareOption.reset_all_cached_queries() step_status = self._status(step) if step_status == StatusOption.FAILED: @@ -148,7 +146,7 @@ def _status(self, step): status = self.status submitted = _check_jobs_submitted(status, command) if not submitted: - return StatusOption.RUNNING + return StatusOption.NOT_SUBMITTED return self._get_command_return_code(status, command) @@ -162,7 +160,7 @@ def _get_command_return_code(self, status, command): # initialize return code array arr = [] check_failed = False - status.update_from_all_job_files() + status.update_from_all_job_files(check_hardware=True) if command not in status.data: # assume running @@ -186,7 +184,7 @@ def _get_command_return_code(self, status, command): elif job_status is None: arr.append(StatusOption.COMPLETE) else: - msg = "Job status code {job_status!r} not understood!" + msg = f"Job status code {job_status!r} not understood!" raise gapsValueError(msg) _dump_sorted(status) @@ -197,11 +195,12 @@ def _get_command_return_code(self, status, command): if return_code != StatusOption.FAILED and check_failed: fail_str = ", but some jobs have failed" logger.info( - "CLI command %r for job %r %s%s.", + "CLI command %r for job %r %s%s. (%s)", command, self.name, return_code.with_verb, # pylint: disable=no-member fail_str, + time.ctime(), ) return return_code @@ -235,6 +234,35 @@ def run(cls, pipeline, monitor=True): cls(pipeline, monitor=monitor)._main() +def _check_pipeline(config): + """Check pipeline steps input.""" + + if "pipeline" not in config: + raise gapsConfigError( + "Could not find required key " + '"pipeline" ' + "in the pipeline config." + ) + + pipeline = config["pipeline"] + + if not isinstance(pipeline, list): + raise gapsConfigError( + 'Config arg "pipeline" must be a list of ' + f"{{command: f_config}} pairs, but received {type(pipeline)}." + ) + + for step_dict in pipeline: + for f_config in step_dict.values(): + if not Path(f_config).expanduser().resolve().exists(): + raise gapsConfigError( + "Pipeline step depends on non-existent " + f"file: {f_config}" + ) + + return pipeline + + def _parse_code_array(arr): """Parse array of return codes to get single return code for command.""" @@ -288,6 +316,12 @@ def _sort_key(status_entry): def parse_previous_status(status_dir, command, key=StatusField.OUT_FILE): """Parse key from job status(es) from the previous pipeline step. 
+ This command DOES NOT check the HPC queue for jobs and therefore + DOES NOT update the status of previously running jobs that have + errored out of the HPC queue. For best results, ensure that all + previous steps of a pipeline have finished processing before calling + this function. + Parameters ---------- status_dir : path-like diff --git a/gaps/status.py b/gaps/status.py index 36d9aa17..e28c6866 100644 --- a/gaps/status.py +++ b/gaps/status.py @@ -1,9 +1,10 @@ # -*- coding: utf-8 -*- """gaps Job status manager. """ import json -import logging import time import shutil +import pprint +import logging import datetime as dt from pathlib import Path from copy import deepcopy @@ -79,6 +80,14 @@ def _new_post_hook(cls, obj, value): obj.charge_factor = 0 return obj + # pylint: disable=no-member + @classmethod + def reset_all_cached_queries(cls): + """Reset all cached hardware queries.""" + cls.EAGLE.manager.reset_query_cache() + cls.KESTREL.manager.reset_query_cache() + cls.PEREGRINE.manager.reset_query_cache() + class StatusOption(CaseInsensitiveEnum): """A collection of job status options.""" @@ -192,6 +201,11 @@ def job_ids(self): """list: Flat list of job ids.""" return _get_attr_flat_list(self.data, key=StatusField.JOB_ID) + @property + def job_hardware(self): + """list: Flat list of job hardware options.""" + return _get_attr_flat_list(self.data, key=StatusField.HARDWARE) + def as_df(self, commands=None, index_name="job_name", include_cols=None): """Format status as pandas DataFrame. @@ -213,7 +227,7 @@ def as_df(self, commands=None, index_name="job_name", include_cols=None): include_cols = include_cols or [] output_cols = self._DF_COLUMNS + list(include_cols) - self.update_from_all_job_files(purge=False) + self.update_from_all_job_files(purge=False, check_hardware=True) if not self.data: return pd.DataFrame(columns=output_cols) @@ -234,6 +248,7 @@ def as_df(self, commands=None, index_name="job_name", include_cols=None): continue command_df[f"{StatusField.PIPELINE_INDEX}"] = command_index commands.append(command_df) + try: command_df = pd.concat(commands, sort=False) except ValueError: @@ -273,7 +288,7 @@ def dump(self): backup.unlink(missing_ok=True) - def update_from_all_job_files(self, purge=True): + def update_from_all_job_files(self, check_hardware=False, purge=True): """Update status from all single-job job status files. This method loads all single-job status files in the target @@ -282,9 +297,14 @@ def update_from_all_job_files(self, purge=True): Parameters ---------- + check_hardware : bool, optional + Option to check hardware status for job failures for jobs + with a "running" status. This is useful because the + "running" status may not be correctly updated if the job + terminates abnormally on the HPC. By default, `False`. purge : bool, optional Option to purge the individual status files. - By default,`True`. + By default, `True`. Returns ------- @@ -301,11 +321,52 @@ def update_from_all_job_files(self, purge=True): monitor_pid_info = _safe_load(monitor_pid_file, purge=purge) self.data.update(monitor_pid_info) + if check_hardware: + self._update_from_hardware() + if purge: self.dump() return self + def _update_from_hardware(self): + """Check all job status against hardware status.""" + hardware_status_retriever = HardwareStatusRetriever() + for job_data in self._job_statuses(): + self._update_job_status_from_hardware( + job_data, hardware_status_retriever + ) + + def _job_statuses(self): + """Iterate over job status dicts. 
Ignore other info in self.data""" + for status in self.values(): + try: + yield from _iter_job_status(status) + except AttributeError: + continue + + @staticmethod + def _update_job_status_from_hardware(job_data, hardware_status_retriever): + """Update job status to failed if it is processing but DNE on HPC.""" + + status = job_data.get( + StatusField.JOB_STATUS, StatusOption.NOT_SUBMITTED + ) + try: + if not StatusOption(status).is_processing: + return + except ValueError: + pass + + job_id = job_data.get(StatusField.JOB_ID, None) + job_hardware = job_data.get(StatusField.HARDWARE, None) + + # get job status from hardware + current = hardware_status_retriever[job_id, job_hardware] + # No current status and job was not successful: failed! + if current is None: + job_data[StatusField.JOB_STATUS] = StatusOption.FAILED + def update_job_status( self, command, job_name, hardware_status_retriever=None ): @@ -333,37 +394,20 @@ def update_job_status( # Update status data dict and file if job file was found if current is not None: self.data = recursively_update_dict(self.data, current) - self.dump() # check job status via hardware if job file not found. elif command in self.data: # job exists if job_name in self.data[command]: - job_data = self.data[command][job_name] - - # init defaults in case job/command not in status file yet - previous = job_data.get(StatusField.JOB_STATUS, None) - job_id = job_data.get(StatusField.JOB_ID, None) - job_hardware = job_data.get(StatusField.HARDWARE, None) - - # get job status from hardware - current = hardware_status_retriever[job_id, job_hardware] - - # No current status and job was not successful: failed! - if current is None and previous != StatusOption.SUCCESSFUL: - job_data[StatusField.JOB_STATUS] = StatusOption.FAILED - - # do not overwrite a successful or failed job status. - elif ( - current != previous - and previous not in StatusOption.members_as_str() - ): - job_data[StatusField.JOB_STATUS] = current - + self._update_job_status_from_hardware( + self.data[command][job_name], hardware_status_retriever + ) # job does not yet exist else: self.data[command][job_name] = {} + self.dump() + def _retrieve_job_status( self, command, job_name, hardware_status_retriever ): @@ -586,6 +630,10 @@ def parse_command_status( ): """Parse key from job status(es) from the given command. + This command DOES NOT check the HPC queue for jobs and therefore + DOES NOT update the status of previously running jobs that have + errored out of the HPC queue. + Parameters ---------- status_dir : path-like @@ -647,6 +695,11 @@ def __init__(self, directory, command, job_name, job_attrs): self.out_file = None def __enter__(self): + logger.debug( + "Received job attributes: %s", + pprint.pformat(self.job_attrs, indent=4), + ) + self.start_time = dt.datetime.now() self.job_attrs.update( { @@ -680,10 +733,19 @@ def __exit__(self, exc_type, exc, traceback): StatusField.JOB_STATUS: StatusOption.SUCCESSFUL, } ) + logger.info( + "Command %r complete. Time elapsed: %s. 
" + "Target output file: %r", + self.command, + time_elapsed, + self.out_file, + ) else: self.job_attrs.update( {StatusField.JOB_STATUS: StatusOption.FAILED} ) + logger.info("Command %r failed in %s", self.command, time_elapsed) + Status.make_single_job_file( self.directory, command=self.command, @@ -699,16 +761,17 @@ def _elapsed_time_as_str(seconds_elapsed): hours, minutes = divmod(minutes, 60) time_str = f"{hours:d}:{minutes:02d}:{seconds:02d}" if days: - time_str = f"{days:d} day{'s' if abs(days) != 1 else ''}, {time_str}" + time_str = f"{days:,d} day{'s' if abs(days) != 1 else ''}, {time_str}" return time_str def _add_elapsed_time(status_df): """Add elapsed time to status DataFrame""" - mask = ( - ~status_df[StatusField.TIME_START].isna() - & status_df[StatusField.TIME_END].isna() - ) + has_start_time = ~status_df[StatusField.TIME_START].isna() + has_no_end_time = status_df[StatusField.TIME_END].isna() + has_not_failed = status_df[StatusField.JOB_STATUS] != StatusOption.FAILED + mask = has_start_time & (has_no_end_time & has_not_failed) + start_times = status_df.loc[mask, StatusField.TIME_START] start_times = pd.to_datetime(start_times, format=DT_FMT) elapsed_times = dt.datetime.now() - start_times @@ -783,3 +846,11 @@ def _validate_hardware(hardware): f"Available options are: {HardwareOption.members_as_str()}." ) raise gapsKeyError(msg) from err + + +def _iter_job_status(status): + """Iterate over job status dictionary.""" + for job_status in status.values(): + if not isinstance(job_status, dict): + continue + yield job_status diff --git a/gaps/utilities.py b/gaps/utilities.py index b401596b..9e83f5ae 100644 --- a/gaps/utilities.py +++ b/gaps/utilities.py @@ -2,6 +2,7 @@ """ GAPs utilities. """ +import sys import copy import logging import collections @@ -165,3 +166,8 @@ def resolve_path(path, base_dir): pass return path + + +def _is_sphinx_build(): + """``True`` if sphinx is in system modules, else ``False``""" + return "sphinx" in sys.modules diff --git a/gaps/version.py b/gaps/version.py index 5ffd5b6e..c6b69185 100644 --- a/gaps/version.py +++ b/gaps/version.py @@ -1,3 +1,3 @@ """GAPs Version Number. 
""" -__version__ = "0.3.4" +__version__ = "0.4.0" diff --git a/requirements.txt b/requirements.txt index 4aa083b2..325f0e5f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,9 +5,12 @@ numpy>=1.24.2 numpydoc>=1.5.0 pandas>=2.0.0 psutil>=5.9.2 +pyjson5>=1.6.4 +pyyaml>=6.0 setuptools>=65.4.0 sphinx>=5.3.0 sphinx-copybutton>=0.5.1 sphinx-click>=4.4.0 sphinx-tabs>=3.4.1 tabulate>=0.9.0 +toml>=0.10.2 diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py index c5fbc053..7f2fabb9 100644 --- a/tests/cli/test_cli.py +++ b/tests/cli/test_cli.py @@ -40,6 +40,23 @@ def runnable_script(): _CMD_LIST.pop(0) +def _copy_single_file( + project_points, source_dir, dest_dir, file_pattern, max_workers +): + """Test function that copies over data files.""" + time.sleep(3) + assert project_points.gids == PROJECT_POINTS + assert max_workers == MAX_WORKERS + out_files = [] + for in_file in sorted(Path(source_dir).glob(file_pattern)): + out_file_name = file_pattern.replace("*", "") + out_file = str(Path(dest_dir) / out_file_name) + shutil.copy(in_file, out_file) + out_files.append(out_file) + break + return out_files + + def _copy_files( project_points, source_dir, dest_dir, file_pattern, max_workers ): @@ -91,19 +108,29 @@ def test_make_cli(): ]: assert expected_command in main.commands + assert "test" in main.help + assert "$ test --help" in main.help + assert "$ test run --help" in main.help + assert "$ test analyze --help" in main.help + assert "$ test collect-analyze --help" in main.help + assert "$ test collect-run --help" not in main.help + @pytest.mark.integration +@pytest.mark.parametrize("test_single_file", [True, False]) def test_cli( tmp_cwd, cli_runner, collect_pattern, manual_collect, runnable_script, + test_single_file, assert_message_was_logged, ): """Integration test of `make_cli`""" data_dir, file_pattern = collect_pattern + run_func = _copy_single_file if test_single_file else _copy_files def preprocess_run_config(config, project_dir, out_dir): assert project_dir == out_dir @@ -114,7 +141,7 @@ def preprocess_run_config(config, project_dir, out_dir): commands = [ CLICommandFromFunction( - _copy_files, + run_func, name="run", add_collect=True, split_keys=["project_points"], @@ -124,6 +151,8 @@ def preprocess_run_config(config, project_dir, out_dir): main = make_cli(commands) + assert main.help == "Command Line Interface" + assert not set(tmp_cwd.glob("*")) cli_runner.invoke(main, ["template-configs"]) files = set(tmp_cwd.glob("*")) @@ -154,7 +183,7 @@ def preprocess_run_config(config, project_dir, out_dir): main, ["pipeline", "-c", pipe_config_fp.as_posix()] ) assert len(set((tmp_cwd / "logs").glob("*run*"))) == 1 - assert len(set(tmp_cwd.glob(file_pattern))) == 4 + assert len(set(tmp_cwd.glob(file_pattern))) == 1 if test_single_file else 4 assert tmp_cwd / "logs" in set(tmp_cwd.glob("*")) result = cli_runner.invoke(main, ["status"]) @@ -168,12 +197,18 @@ def preprocess_run_config(config, project_dir, out_dir): "collect-run not submitted", ] - for ind, partial in enumerate(expected_partial_lines[::-1], start=7): + for ind, partial in enumerate(expected_partial_lines[::-1], start=9): err_msg = f"{partial!r} not in {lines[-ind]!r}. 
All lines: {lines}" assert all( string in lines[-ind] for string in partial.split() ), err_msg + with open(collect_config_fp, "r") as config_file: + config = json.load(config_file) + config["collect_pattern"] = {"out.h5": "./*.h5"} + with open(collect_config_fp, "w") as config_file: + json.dump(config, config_file) + assert tmp_cwd / "chunk_files" not in set(tmp_cwd.glob("*")) result = cli_runner.invoke( main, @@ -190,14 +225,15 @@ def preprocess_run_config(config, project_dir, out_dir): h5_files = set(tmp_cwd.glob("*.h5")) assert len(h5_files) == 1 - with h5py.File(h5_files.pop(), "r") as collected_outputs: - assert len(collected_outputs.keys()) == 5 - assert "cf_mean" in collected_outputs - assert "lcoe_fcr" in collected_outputs - cf_profiles = collected_outputs["cf_profile"][...] + if not test_single_file: + with h5py.File(h5_files.pop(), "r") as collected_outputs: + assert len(collected_outputs.keys()) == 5 + assert "cf_mean" in collected_outputs + assert "lcoe_fcr" in collected_outputs + cf_profiles = collected_outputs["cf_profile"][...] - profiles = manual_collect(data_dir / file_pattern, "cf_profile") - assert np.allclose(profiles, cf_profiles) + profiles = manual_collect(data_dir / file_pattern, "cf_profile") + assert np.allclose(profiles, cf_profiles) result = cli_runner.invoke( main, ["pipeline", "-c", pipe_config_fp.as_posix()] @@ -224,7 +260,7 @@ def preprocess_run_config(config, project_dir, out_dir): assert project_dir == out_dir config["dest_dir"] = str(project_dir) config["source_dir"] = str(data_dir) - config["file_pattern"] = file_pattern + config["file_pattern"] = f"./{file_pattern}" return config commands = [ @@ -370,7 +406,7 @@ def preprocess_run_config(config, project_dir, out_dir): if os.getpid() == status["monitor_pid"]: # Wait to die for __ in range(10): - time.sleep(10) + time.sleep(60) pytest.exit(0) assert ( @@ -380,8 +416,8 @@ def preprocess_run_config(config, project_dir, out_dir): != StatusOption.SUCCESSFUL ) - for __ in range(6): - time.sleep(10) + for __ in range(10): + time.sleep(60) collect_status = Status.retrieve_job_status( tmp_cwd, "collect-run", f"{tmp_cwd.name}_collect_run" ) diff --git a/tests/cli/test_cli_command.py b/tests/cli/test_cli_command.py index 7b412ce2..3b0522e7 100644 --- a/tests/cli/test_cli_command.py +++ b/tests/cli/test_cli_command.py @@ -7,11 +7,8 @@ import click import pytest -from gaps.cli.command import ( - CLICommandFromFunction, - GAPS_SUPPLIED_ARGS, - _WrappedCommand, -) +from gaps.cli.command import CLICommandFromFunction, _WrappedCommand +from gaps.cli.config import GAPS_SUPPLIED_ARGS def test_cli_command_configuration(): @@ -29,8 +26,7 @@ def _test_func(): assert len(ccc.documentation.signatures) == 2 assert not ccc.is_split_spatially assert all( - param in ccc.documentation.skip_params - for param in GAPS_SUPPLIED_ARGS + param in ccc.documentation.skip_params for param in GAPS_SUPPLIED_ARGS ) def _test_preprocessor(config, name, _a_default=3): diff --git a/tests/cli/test_cli_config.py b/tests/cli/test_cli_config.py index fbb8b20f..82d29072 100644 --- a/tests/cli/test_cli_config.py +++ b/tests/cli/test_cli_config.py @@ -42,6 +42,7 @@ def _testing_function( out_dir, out_fpath, max_workers, + pool_size=16, _input2=None, _z_0=None, ): @@ -71,6 +72,8 @@ def _testing_function( Internal GAPs out filepath. max_workers : int Max workers. + pool_size : int, optional + Pool size. By default, ``16``. _input2 : float, optional Secret input 2. By default, ``None``. 
_z_0 : str, optional @@ -85,6 +88,7 @@ def _testing_function( "_input2": _input2, "input3": input3, "max_workers": max_workers, + "pool_size": pool_size, "_z_0": _z_0, "out_fpath": out_fpath, "out_dir": out_dir, @@ -130,6 +134,7 @@ def run( out_dir, out_fpath, max_workers, + pool_size=16, _z_0=None, ): """Test run function for CLI around. @@ -154,6 +159,8 @@ def run( Internal GAPs out filepath. max_workers : int Max workers. + pool_size : int, optional + Pool size. By default, ``16``. _z_0 : str, optional Secret str. By default, ``None``. """ @@ -166,6 +173,7 @@ def run( "_input2": self._input2, "input3": self._input3, "max_workers": max_workers, + "pool_size": pool_size, "_z_0": _z_0, "out_fpath": out_fpath, "out_dir": out_dir, @@ -193,6 +201,20 @@ def runnable_script(): _CMD_LIST.pop(0) +@pytest.fixture +def job_names_cache(monkeypatch): + """Monkeypatch `_kickoff_hpc_job` and cache call""" + cache = {} + + def _test_kickoff(ctx, cmd, __, **kwargs): + cache[ctx.obj["NAME"]] = cmd + + monkeypatch.setattr( + gaps.cli.execution, "_kickoff_hpc_job", _test_kickoff, raising=True + ) + return cache + + @pytest.mark.parametrize( ("extra_input", "none_list"), [ @@ -202,7 +224,7 @@ def runnable_script(): ) @pytest.mark.parametrize("test_class", [False, True]) def test_run_local( - test_ctx, extra_input, none_list, runnable_script, test_class + test_ctx, extra_input, none_list, runnable_script, test_class, caplog ): """Test the `run` function locally.""" @@ -248,10 +270,19 @@ def pre_processing(config, a_value, a_multiplier): assert "input2" in warning_info[0].message.args[0] assert "_input2" in warning_info[0].message.args[0] + expected_log_starts = [ + "Running run from config file: '", + "Target output directory: '", + "Target logging directory: '", + ] + for expected in expected_log_starts: + assert any(expected in record.message for record in caplog.records) + assert not any("Path(" in record.message for record in caplog.records) + if "max_workers" in extra_input: expected_message = ( - "Found key 'max_workers' outside of 'execution_control'. " - "Moving 'max_workers' value (100) into 'execution_control' block." + "Found key(s) {'max_workers'} outside of 'execution_control'. " + "Moving these keys into 'execution_control' block." 
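
The warning exercised here steers users toward the canonical config
layout; a config fragment that avoids it (values are illustrative)::

    config = {
        "execution_control": {
            "option": "local",
            "max_workers": 100,  # belongs here, not at the top level
        },
        "input1": 1,
        "project_points": [0, 1, 2],
    }
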
) assert expected_message in warning_info[1].message.args[0] @@ -269,6 +300,7 @@ def pre_processing(config, a_value, a_multiplier): assert outputs["_input2"] == 10 assert outputs["input3"] is None assert outputs["max_workers"] == 100 + assert outputs["pool_size"] == 16 status = Status(tmp_path).update_from_all_job_files() assert len(status["run"]) == 1 @@ -292,7 +324,7 @@ def pre_processing(config, a_value, a_multiplier): @pytest.mark.parametrize("test_class", [False, True]) def test_run_multiple_nodes( - test_ctx, runnable_script, monkeypatch, test_class + test_ctx, runnable_script, monkeypatch, test_class, job_names_cache ): """Test the `run` function calls `_kickoff_hpc_job` for multiple nodes.""" @@ -331,15 +363,6 @@ def test_run_multiple_nodes( with open(config_fp, "w") as config_file: json.dump(config, config_file) - job_names_cache = {} - - def _test_kickoff(ctx, cmd, **kwargs): - job_names_cache[ctx.obj["NAME"]] = cmd - - monkeypatch.setattr( - gaps.cli.execution, "_kickoff_hpc_job", _test_kickoff, raising=True - ) - assert len(job_names_cache) == 0 from_config(config_fp, command_config) assert len(job_names_cache) == 4 @@ -356,20 +379,173 @@ def _test_kickoff(ctx, cmd, **kwargs): ) +@pytest.mark.parametrize("test_class", [False, True]) +def test_run_multiple_nodes_correct_zfill( + test_ctx, runnable_script, monkeypatch, test_class, job_names_cache +): + """Test the `run` function calls `_kickoff_hpc_job` for multiple nodes.""" + + tmp_path = test_ctx.obj["TMP_PATH"] + + if test_class: + command_config = CLICommandFromClass( + TestCommand, + "run", + name="run", + split_keys={"project_points", "_z_0"}, + ) + else: + command_config = CLICommandFromFunction( + _testing_function, + name="run", + split_keys={"project_points", "_z_0"}, + ) + + config = { + "execution_control": { + "option": "eagle", + "allocation": "test", + "walltime": 1, + "nodes": 5, + "max_workers": 1, + }, + "input1": 1, + "input2": 7, + "input3": 8, + "_z_0": ["unsorted", "strings"], + "project_points": [0, 1, 2, 4, 5, 6, 7, 8, 9], + } + + config_fp = tmp_path / "config.json" + with open(config_fp, "w") as config_file: + json.dump(config, config_file) + + assert len(job_names_cache) == 0 + from_config(config_fp, command_config) + assert len(job_names_cache) == 10 + assert len(set(job_names_cache)) == 10 + + assert not any("j00" in job_name for job_name in job_names_cache) + assert any("j0" in job_name for job_name in job_names_cache) + + +@pytest.mark.parametrize("test_class", [False, True]) +def test_run_no_split_keys( + test_ctx, runnable_script, monkeypatch, test_class, job_names_cache +): + """Test the `run` function with no split keys specified.""" + + tmp_path = test_ctx.obj["TMP_PATH"] + + if test_class: + command_config = CLICommandFromClass( + TestCommand, + "run", + name="run", + split_keys=None, + ) + else: + command_config = CLICommandFromFunction( + _testing_function, + name="run", + split_keys=None, + ) + + config = { + "execution_control": { + "option": "eagle", + "allocation": "test", + "walltime": 1, + "max_workers": 1, + }, + "input1": 1, + "input2": 7, + "input3": 8, + "_z_0": ["unsorted", "strings"], + "project_points": [0, 1, 2, 4], + } + + config_fp = tmp_path / "config.json" + with open(config_fp, "w") as config_file: + json.dump(config, config_file) + + assert len(job_names_cache) == 0 + from_config(config_fp, command_config) + assert len(job_names_cache) == 1 + assert len(set(job_names_cache)) == 1 + + for job_name, script in job_names_cache.items(): + assert "_j0" not in job_name + assert 
"[0, 1, 2, 4]" in script + assert '["unsorted", "strings"]' in script + + +@pytest.mark.parametrize("test_class", [False, True]) +def test_run_empty_split_keys( + test_ctx, runnable_script, monkeypatch, test_class, job_names_cache +): + """Test the `run` function with empty split keys input.""" + + tmp_path = test_ctx.obj["TMP_PATH"] + + if test_class: + command_config = CLICommandFromClass( + TestCommand, + "run", + name="run", + split_keys=["_z_0"], + ) + else: + command_config = CLICommandFromFunction( + _testing_function, + name="run", + split_keys=["_z_0"], + ) + + config = { + "execution_control": { + "option": "eagle", + "allocation": "test", + "walltime": 1, + "max_workers": 1, + }, + "input1": 1, + "input2": 7, + "input3": 8, + "_z_0": [], + "project_points": [0, 1, 2, 4], + } + + config_fp = tmp_path / "config.json" + with open(config_fp, "w") as config_file: + json.dump(config, config_file) + + assert len(job_names_cache) == 0 + from_config(config_fp, command_config) + assert len(job_names_cache) == 1 + assert len(set(job_names_cache)) == 1 + + for job_name, script in job_names_cache.items(): + assert "_j0" not in job_name + assert "[0, 1, 2, 4]" in script + assert '"_z_0": None' in script + + @pytest.mark.parametrize("test_class", [False, True]) @pytest.mark.parametrize("num_nodes", [30, 100]) +@pytest.mark.parametrize("option", ["eagle", "dne"]) def test_warning_about_au_usage( - test_ctx, runnable_script, monkeypatch, test_class, caplog, num_nodes + test_ctx, + runnable_script, + monkeypatch, + test_class, + caplog, + num_nodes, + option, + job_names_cache, ): """Test the `run` function calls `_kickoff_hpc_job` for multiple nodes.""" - # def assert_message_was_logged(caplog): - # """Assert that a particular (partial) message was logged.""" - # caplog.clear() - - # def assert_message(msg, log_level=None, clear_records=False): - # """Assert that a message was logged.""" - # assert caplog.records tmp_path = test_ctx.obj["TMP_PATH"] if test_class: @@ -388,7 +564,7 @@ def test_warning_about_au_usage( config = { "execution_control": { - "option": "eagle", + "option": option, "allocation": "test", "qos": "high", "walltime": 50, @@ -403,25 +579,21 @@ def test_warning_about_au_usage( with open(config_fp, "w") as config_file: json.dump(config, config_file) - job_names_cache = {} - - def _test_kickoff(ctx, cmd, **kwargs): - job_names_cache[ctx.obj["NAME"]] = cmd - - monkeypatch.setattr( - gaps.cli.execution, "_kickoff_hpc_job", _test_kickoff, raising=True - ) - assert not caplog.records assert len(job_names_cache) == 0 - from_config(config_fp, command_config) - assert len(job_names_cache) == num_nodes - assert len(set(job_names_cache)) == num_nodes + try: + from_config(config_fp, command_config) + except ValueError: + pass + + if option == "eagle": + assert len(job_names_cache) == num_nodes + assert len(set(job_names_cache)) == num_nodes warnings = [ record for record in caplog.records if record.levelname == "WARNING" ] - if num_nodes < 33: + if num_nodes < 33 or option != "eagle": assert not warnings else: assert warnings @@ -433,7 +605,7 @@ def _test_kickoff(ctx, cmd, **kwargs): @pytest.mark.parametrize("test_class", [False, True]) def test_run_parallel_split_keys( - test_ctx, runnable_script, monkeypatch, test_class + test_ctx, runnable_script, monkeypatch, test_class, job_names_cache ): """Test the `run` function calls `_kickoff_hpc_job` for multiple nodes.""" @@ -470,15 +642,6 @@ def test_run_parallel_split_keys( with open(config_fp, "w") as config_file: json.dump(config, 
config_file) - job_names_cache = {} - - def _test_kickoff(ctx, cmd, **kwargs): - job_names_cache[ctx.obj["NAME"]] = cmd - - monkeypatch.setattr( - gaps.cli.execution, "_kickoff_hpc_job", _test_kickoff, raising=True - ) - assert len(job_names_cache) == 0 from_config(config_fp, command_config) assert len(job_names_cache) == 6 @@ -759,5 +922,83 @@ def test_func(project_points, input1, input2, tag): assert all(key not in status["run"]["test"] for key in exclude) +@pytest.mark.parametrize("test_extras", [False, True]) +@pytest.mark.parametrize("test_class", [False, True]) +def test_args_passed_to_pre_processor( + tmp_path, test_ctx, test_extras, test_class, runnable_script +): + """Test that correct arguments are passed to the pre-processor.""" + + input_config = { + "execution_control": {"max_workers": 100}, + "input1": 1, + "a_value": 5, + "a_multiplier": 2, + "input2": 7, + "_input2": 8, + "input3": None, + "project_points": [0, 1, 2], + } + config_fp = tmp_path / "config.json" + + if test_extras: + input_config["log_directory"] = str(tmp_path / "other_logs") + input_config["log_level"] = "DEBUG" + + with open(config_fp, "w") as config_file: + json.dump(input_config, config_file) + + # pylint: disable=too-many-arguments + def pre_processing( + config, + a_value, + a_multiplier, + command_name, + config_file, + project_dir, + job_name, + out_dir, + out_fpath, + log_directory, + verbose, + ): + assert a_value == 5 + assert a_multiplier == 2 + assert command_name == "run" + assert config_file == config_fp + assert project_dir == tmp_path + assert tmp_path.name in job_name + assert "run" in job_name + assert out_dir == tmp_path + assert out_fpath == out_dir / job_name + if test_extras: + assert log_directory == tmp_path / "other_logs" + assert verbose + else: + assert config == input_config + assert log_directory == tmp_path / "logs" + assert not verbose + + return config + + if test_class: + command_config = CLICommandFromClass( + TestCommand, + "run", + split_keys={"project_points", "input3"}, + config_preprocessor=pre_processing, + ) + else: + command_config = CLICommandFromFunction( + _testing_function, + name="run", + split_keys={"project_points", "input3"}, + config_preprocessor=pre_processing, + ) + + with pytest.warns(gapsWarning) as warning_info: + from_config(config_fp, command_config) + + if __name__ == "__main__": pytest.main(["-q", "--show-capture=all", Path(__file__), "-rapP"]) diff --git a/tests/cli/test_cli_documentation.py b/tests/cli/test_cli_documentation.py index 09588114..668a5b66 100644 --- a/tests/cli/test_cli_documentation.py +++ b/tests/cli/test_cli_documentation.py @@ -9,7 +9,12 @@ import pytest -from gaps.cli.documentation import DEFAULT_EXEC_VALUES, CommandDocumentation +import gaps.cli.documentation +from gaps.cli.documentation import ( + DEFAULT_EXEC_VALUES, + EXTRA_EXEC_PARAMS, + CommandDocumentation, +) def func_no_args(): @@ -26,87 +31,219 @@ def test_command_documentation_copies_skip_params(): skip_params_set = ["a"] doc = CommandDocumentation(func_no_args, skip_params=skip_params_set) assert skip_params_set == ["a"] - assert doc.skip_params == {"a", "cls", "self", "max_workers"} + assert doc.skip_params == {"a", "cls", "self"} | set(EXTRA_EXEC_PARAMS) -def test_command_documentation_max_workers(): - """Test the `CommandDocumentation` with and without `max_workers`.""" +def test_command_documentation_extra_exec_params(): + """Test the `CommandDocumentation` with extra exec params.""" - def func(max_workers): + def func( + max_workers, + sites_per_worker, + 
memory_utilization_limit, + timeout, + pool_size, + ): """A short description. Parameters ---------- max_workers : int Number of workers to run. + sites_per_worker : float + Number of sites to run. + memory_utilization_limit : str + A test documentation. + timeout : dict + A timeout value. + pool_size : list + A worker pool size. """ - doc = CommandDocumentation(func) - assert doc.max_workers_in_func_signature - assert doc.max_workers_required - assert "Number of workers to run." in doc._max_workers_doc - assert "(int)" in doc._max_workers_doc - assert "max_workers" in doc.exec_control_doc - - execution_control = doc.template_config["execution_control"] - assert execution_control["max_workers"] == doc.REQUIRED_TAG - assert doc.default_exec_values["max_workers"] == doc.REQUIRED_TAG + expected_parameters = [ + "max_workers", + "sites_per_worker", + "memory_utilization_limit", + "timeout", + "pool_size", + ] + expected_types = ["(int)", "(float)", "(str)", "(dict)", "(list)"] + expected_decs = [ + "Number of workers to run.", + "Number of sites to run.", + "A test documentation.", + "A timeout value.", + "A worker pool size.", + ] + expected_iter = zip(expected_parameters, expected_types, expected_decs) - def func(max_workers): + doc = CommandDocumentation(func) + for param, p_type, p_doc in expected_iter: + assert doc._param_in_func_signature(param) + assert doc.param_required(param) + assert p_doc in doc._format_extra_exec_param_doc(param) + assert p_type in doc._format_extra_exec_param_doc(param) + assert param in doc.exec_control_doc + + execution_control = doc.template_config["execution_control"] + assert execution_control[param] == doc.REQUIRED_TAG + assert doc.default_exec_values[param] == doc.REQUIRED_TAG + + +def test_command_documentation_extra_exec_params_no_user_doc(): + """Test the `CommandDocumentation` with extra exec params no user doc.""" + + def func( + max_workers, + sites_per_worker, + memory_utilization_limit, + timeout, + pool_size, + ): """A short description.""" + expected_parameters = [ + "max_workers", + "sites_per_worker", + "memory_utilization_limit", + "timeout", + "pool_size", + ] doc = CommandDocumentation(func) - assert doc.max_workers_in_func_signature - assert doc.max_workers_required - assert doc._max_workers_doc - assert "(int)" in doc._max_workers_doc - assert "max_workers" in doc.exec_control_doc - - execution_control = doc.template_config["execution_control"] - assert execution_control["max_workers"] == doc.REQUIRED_TAG - assert doc.default_exec_values["max_workers"] == doc.REQUIRED_TAG - - def func(max_workers=2): + for param in expected_parameters: + assert doc._param_in_func_signature(param) + assert doc.param_required(param) + assert doc._format_extra_exec_param_doc(param) + assert "(int)" in doc._format_extra_exec_param_doc(param) + assert param in doc.exec_control_doc + + execution_control = doc.template_config["execution_control"] + assert execution_control["max_workers"] == doc.REQUIRED_TAG + assert doc.default_exec_values["max_workers"] == doc.REQUIRED_TAG + + +def test_command_documentation_extra_exec_params_user_defaults(): + """Test the `CommandDocumentation` with extra exec params and defaults.""" + + def func( + max_workers=2, + sites_per_worker=0.4, + memory_utilization_limit="test", + timeout=None, + pool_size=None, + ): """A short description. Parameters ---------- max_workers : int, optional - Number of workers to run. By default, `2`. + Number of workers to run. By default, ``2``. 
+ sites_per_worker : float, optional + Number of sites to run. By default, ``0.4``. + memory_utilization_limit : str, optional + A test documentation. By default, ``"test"``. + timeout : dict, optional + A timeout value. By default, ``None``. + pool_size : list, optional + A worker pool size. By default, ``None``. """ + expected_parameters = [ + "max_workers", + "sites_per_worker", + "memory_utilization_limit", + "timeout", + "pool_size", + ] + expected_types = [ + "(int, optional)", + "(float, optional)", + "(str, optional)", + "(dict, optional)", + "(list, optional)", + ] + expected_decs = [ + "Number of workers to run.", + "Number of sites to run.", + "A test documentation.", + "A timeout value.", + "A worker pool size.", + ] + expected_value = [2, 0.4, "test", None, None] + expected_iter = zip( + expected_parameters, expected_types, expected_decs, expected_value + ) + doc = CommandDocumentation(func) - assert doc.max_workers_in_func_signature - assert not doc.max_workers_required - assert "Number of workers to run." in doc._max_workers_doc - assert "(int, optional)" in doc._max_workers_doc - assert "max_workers" in doc.exec_control_doc + for param, p_type, p_doc, p_val in expected_iter: + assert doc._param_in_func_signature(param) + assert not doc.param_required(param) + assert p_doc in doc._format_extra_exec_param_doc(param) + assert p_type in doc._format_extra_exec_param_doc(param) + assert param in doc.exec_control_doc + + execution_control = doc.template_config["execution_control"] + assert execution_control[param] == p_val + assert doc.default_exec_values[param] == p_val + + +def test_command_documentation_extra_exec_params_defaults_no_docs(): + """Test documentation with extra exec params, defaults, no doc.""" + + def func( + max_workers=2, + sites_per_worker=0.4, + memory_utilization_limit="test", + timeout=None, + pool_size=None, + ): + """A short description.""" - execution_control = doc.template_config["execution_control"] - assert execution_control["max_workers"] == 2 - assert doc.default_exec_values["max_workers"] == 2 + expected_parameters = [ + "max_workers", + "sites_per_worker", + "memory_utilization_limit", + "timeout", + "pool_size", + ] + expected_types = [ + "(int, optional)", + "(float, optional)", + "(str, optional)", + "(int, optional)", + "(int, optional)", + ] - def func(max_workers=None): - """A short description.""" + expected_value = [2, 0.4, "test", None, None] + expected_iter = zip(expected_parameters, expected_types, expected_value) doc = CommandDocumentation(func) - assert doc.max_workers_in_func_signature - assert not doc.max_workers_required - assert doc._max_workers_doc - assert "(int, optional)" in doc._max_workers_doc - assert "max_workers" in doc.exec_control_doc + for param, p_type, p_val in expected_iter: + assert doc._param_in_func_signature(param) + assert not doc.param_required(param) + assert doc._extra_exec_param_doc + assert p_type in doc._format_extra_exec_param_doc(param) + assert f"``{p_val}``" in doc._format_extra_exec_param_doc(param) + assert param in doc.exec_control_doc + + execution_control = doc.template_config["execution_control"] + assert execution_control[param] == p_val + assert doc.default_exec_values[param] == p_val + - execution_control = doc.template_config["execution_control"] - assert execution_control["max_workers"] is None - assert doc.default_exec_values["max_workers"] is None +def test_command_documentation_no_extra_exec_params(): + """Test documentation with no extra exec params""" doc = 
CommandDocumentation(func_no_args) - assert not doc.max_workers_in_func_signature - assert not doc.max_workers_required - assert not doc._max_workers_doc - assert "max_workers" not in doc.template_config["execution_control"] - assert "max_workers" not in doc.default_exec_values - assert "max_workers" not in doc.exec_control_doc + for param in EXTRA_EXEC_PARAMS: + assert not doc._param_in_func_signature(param) + assert not doc.param_required(param) + assert not doc._format_extra_exec_param_doc(param) + assert param not in doc.template_config["execution_control"] + assert param not in doc.default_exec_values + assert param not in doc.exec_control_doc + + assert not doc._extra_exec_param_doc def test_command_documentation_default_exec_values_and_doc(): @@ -246,16 +383,23 @@ def func(): assert doc.extended_summary == expected_str -def test_command_documentation_config_help(): +def test_command_documentation_config_help(monkeypatch): """Test `CommandDocumentation.config_help`.""" + monkeypatch.setattr( + gaps.cli.documentation, "_is_sphinx_build", lambda: True, raising=True + ) + doc = CommandDocumentation(func_no_args, is_split_spatially=True) config_help = doc.config_help(command_name="my_command_name") assert "my_command_name" in config_help - assert doc.parameter_help in config_help + assert ( + gaps.cli.documentation._cli_formatted(doc.parameter_help) + in config_help + ) assert ".. tabs::" in config_help - assert ".. tab::" in config_help + assert ".. group-tab::" in config_help def test_command_documentation_command_help(): diff --git a/tests/cli/test_cli_execution.py b/tests/cli/test_cli_execution.py index 2cdaf104..aa95d396 100644 --- a/tests/cli/test_cli_execution.py +++ b/tests/cli/test_cli_execution.py @@ -9,7 +9,7 @@ import pytest import gaps.hpc -from gaps.status import Status, StatusField, StatusOption +from gaps.status import Status, StatusField, StatusOption, HardwareOption from gaps.cli.execution import kickoff_job, _should_run from gaps.exceptions import gapsConfigError @@ -31,40 +31,45 @@ def test_should_run(test_ctx, caplog, assert_message_was_logged): assert _should_run(test_ctx) assert not caplog.records - Status.make_single_job_file( - test_ctx.obj["OUT_DIR"], - test_ctx.obj["COMMAND_NAME"], - test_ctx.obj["NAME"], - {StatusField.JOB_STATUS: StatusOption.FAILED}, - ) - - assert _should_run(test_ctx) - assert not caplog.records - - Status.make_single_job_file( - test_ctx.obj["OUT_DIR"], - test_ctx.obj["COMMAND_NAME"], - test_ctx.obj["NAME"], - {StatusField.JOB_STATUS: StatusOption.RUNNING}, - ) - - assert not _should_run(test_ctx) - assert_message_was_logged(test_ctx.obj["NAME"], "INFO") - assert_message_was_logged("was found with status", "INFO") - assert_message_was_logged("running", "INFO", clear_records=True) - assert not caplog.records - - Status.make_single_job_file( - test_ctx.obj["OUT_DIR"], - test_ctx.obj["COMMAND_NAME"], - test_ctx.obj["NAME"], - {StatusField.JOB_STATUS: StatusOption.SUCCESSFUL}, - ) - - assert not _should_run(test_ctx) - assert_message_was_logged(test_ctx.obj["NAME"], "INFO") - assert_message_was_logged("is successful", "INFO") - assert_message_was_logged("not re-running", "INFO") + for option in [StatusOption.NOT_SUBMITTED, StatusOption.FAILED]: + Status.make_single_job_file( + test_ctx.obj["OUT_DIR"], + test_ctx.obj["COMMAND_NAME"], + test_ctx.obj["NAME"], + {StatusField.JOB_STATUS: option}, + ) + + assert _should_run(test_ctx) + assert not caplog.records + + for option in [StatusOption.SUBMITTED, StatusOption.RUNNING]: + 
Status.make_single_job_file( + test_ctx.obj["OUT_DIR"], + test_ctx.obj["COMMAND_NAME"], + test_ctx.obj["NAME"], + {StatusField.JOB_STATUS: option}, + ) + + assert not _should_run(test_ctx) + assert_message_was_logged(test_ctx.obj["NAME"], "INFO") + assert_message_was_logged( + "was found with status", "INFO", clear_records=True + ) + assert not caplog.records + + for option in [StatusOption.SUCCESSFUL, StatusOption.COMPLETE]: + Status.make_single_job_file( + test_ctx.obj["OUT_DIR"], + test_ctx.obj["COMMAND_NAME"], + test_ctx.obj["NAME"], + {StatusField.JOB_STATUS: option}, + ) + + assert not _should_run(test_ctx) + assert_message_was_logged(test_ctx.obj["NAME"], "INFO") + assert_message_was_logged("is successful", "INFO") + assert_message_was_logged("not re-running", "INFO", clear_records=True) + assert not caplog.records def test_kickoff_job_local_basic(test_ctx, assert_message_was_logged): @@ -96,7 +101,7 @@ def test_kickoff_job_local(test_ctx, assert_message_was_logged): exec_kwargs = {"option": "local"} cmd = "python -c \"import warnings; warnings.warn('a test warning')\"" kickoff_job(test_ctx, cmd, exec_kwargs) - assert_message_was_logged("Running locally with job name", "INFO") + assert_message_was_logged("Running 'run' locally with job name", "INFO") assert_message_was_logged(test_ctx.obj["NAME"], "INFO") assert_message_was_logged(cmd, "DEBUG") assert_message_was_logged("Subprocess received stderr") @@ -119,6 +124,7 @@ def test_kickoff_job_hpc( ): """Test kickoff command for HPC job.""" + test_ctx.obj.pop("MANAGER", None) run_dir = test_ctx.obj["TMP_PATH"] assert not list(run_dir.glob("*")) job_name = "_".join([test_ctx.obj["NAME"], str(high_qos)]) @@ -149,13 +155,14 @@ def _test_submit(cmd): assert not list(test_ctx.obj["TMP_PATH"].glob("*")) kickoff_job(test_ctx, cmd, exec_kwargs) + test_ctx.obj.pop("MANAGER", None) - assert cmd_cache + assert len(cmd_cache) == 2 assert_message_was_logged( "Found extra keys in 'execution_control'! 
", "WARNING" ) assert_message_was_logged("dne_arg", "WARNING") - assert_message_was_logged("Running on HPC with job name", "INFO") + assert_message_was_logged("Running 'run' on HPC with job name", "INFO") assert_message_was_logged(test_ctx.obj["NAME"], "INFO") assert_message_was_logged("Kicked off job") assert_message_was_logged("(Job ID #9999)", clear_records=True) @@ -178,11 +185,84 @@ def _test_submit(cmd): exec_kwargs = {"option": "eagle", "allocation": "test", "walltime": 0.43} kickoff_job(test_ctx, cmd, exec_kwargs) assert_message_was_logged("not resubmitting", "INFO") + assert len(cmd_cache) == 2 + + Status.make_single_job_file( + run_dir, + "run", + job_name, + {StatusField.JOB_STATUS: StatusOption.RUNNING}, + ) + kickoff_job(test_ctx, cmd, exec_kwargs) + assert len(cmd_cache) == 2 + + # check repeated call does not requeue HPC + kickoff_job(test_ctx, cmd, exec_kwargs) + assert len(cmd_cache) == 2 exec_kwargs = {"option": "eagle", "walltime": 0.43} with pytest.raises(gapsConfigError): kickoff_job(test_ctx, cmd, exec_kwargs) + HardwareOption.EAGLE.manager = gaps.hpc.SLURM() + test_ctx.obj.pop("MANAGER", None) + + +def test_qos_values(test_ctx, monkeypatch): + """Test kickoff command for HPC job.""" + + test_ctx.obj.pop("MANAGER", None) + run_dir = test_ctx.obj["TMP_PATH"] + assert not list(run_dir.glob("*")) + + exec_kwargs = { + "option": "eagle", + "dne_arg": 0, + "allocation": "test", + "walltime": 0.43, + "stdout_path": (test_ctx.obj["TMP_PATH"] / "stdout").as_posix(), + "qos": "dne", + } + + cmd = ( + "python -c \"import warnings; print('hello world'); " + "warnings.warn('a test warning')\"" + ) + cmd_cache = [] + + def _test_submit(cmd): + cmd_cache.append(cmd) + return "9999", None + + monkeypatch.setattr(gaps.hpc, "submit", _test_submit, raising=True) + monkeypatch.setattr( + gaps.hpc.PBS, + "_job_is_running", + lambda *__, **___: True, + raising=True, + ) + assert not cmd_cache + assert not list(test_ctx.obj["TMP_PATH"].glob("*")) + + with pytest.raises(gapsConfigError): + kickoff_job(test_ctx, cmd, exec_kwargs) + + exec_kwargs["option"] = "peregrine" + kickoff_job(test_ctx, cmd, exec_kwargs) + + status_file = list(run_dir.glob("*.json")) + assert len(status_file) == 1 + status_file = status_file[0] + assert status_file.name.endswith(".json") + + with open(status_file, "r") as status_fh: + status = json.load(status_fh) + + assert status["run"][test_ctx.obj["NAME"]][StatusField.QOS] == "dne" + + HardwareOption.EAGLE.manager = gaps.hpc.SLURM() + test_ctx.obj.pop("MANAGER", None) + if __name__ == "__main__": pytest.main(["-q", "--show-capture=all", Path(__file__), "-rapP"]) diff --git a/tests/cli/test_cli_pipeline.py b/tests/cli/test_cli_pipeline.py index a74df1bd..b6a2d71b 100644 --- a/tests/cli/test_cli_pipeline.py +++ b/tests/cli/test_cli_pipeline.py @@ -21,6 +21,7 @@ _can_run_background, ) from gaps.exceptions import gapsExecutionError +from gaps.warnings import gapsWarning TEST_FILE_DIR = Path(__file__).parent.as_posix() @@ -30,30 +31,46 @@ {"run": "./config_run.json"}, ], } +SUCCESS_CONFIG = {"test": "success"} @pytest.fixture -def runnable_pipeline(): +def runnable_pipeline(tmp_path): """Add run to pipeline commands for test only.""" try: Pipeline.COMMANDS["run"] = run + pipe_config_fp = tmp_path / "config_pipe.json" + with open(pipe_config_fp, "w") as config_file: + json.dump(SAMPLE_CONFIG, config_file) yield finally: Pipeline.COMMANDS.pop("run") +@pytest.fixture +def pipe_config_fp(tmp_path): + """Add a sample pipeline config to a temp directory.""" + pipe_config_fp = 
tmp_path / "config_pipe.json" + with open(pipe_config_fp, "w") as config_file: + json.dump(SAMPLE_CONFIG, config_file) + + yield pipe_config_fp + + @click.command() @click.option("--config", "-c", default=".", help="Path to config file") def run(config): """Test command.""" config_fp = Path(config) - config_fp.touch() + with open(config_fp, "w") as config_file: + json.dump(SUCCESS_CONFIG, config_file) + attrs = {StatusField.JOB_STATUS: StatusOption.SUCCESSFUL} Status.make_single_job_file(config_fp.parent, "run", "test", attrs) # pylint: disable=no-value-for-parameter -def test_can_run_background(monkeypatch, test_ctx, tmp_path): +def test_can_run_background(monkeypatch, test_ctx, pipe_config_fp): """Test the `_can_run_background` method""" monkeypatch.setattr(os, "setsid", lambda: None, raising=False) @@ -66,10 +83,8 @@ def test_can_run_background(monkeypatch, test_ctx, tmp_path): assert not _can_run_background() - pipe_config_fp = tmp_path / "config_pipe.json" - pipe_config_fp.touch() with pytest.raises(gapsExecutionError) as exc_info: - pipeline(tmp_path, cancel=False, monitor=False, background=True) + pipeline(pipe_config_fp, cancel=False, monitor=False, background=True) assert "Cannot run pipeline in background on system" in str(exc_info) @@ -80,16 +95,14 @@ def test_pipeline_command( tmp_path, cli_runner, runnable_pipeline, + pipe_config_fp, assert_message_was_logged, ): """Test the pipeline_command creation.""" target_config_fp = tmp_path / "config_run.json" - pipe_config_fp = tmp_path / "config_pipe.json" - with open(pipe_config_fp, "w") as config_file: - json.dump(SAMPLE_CONFIG, config_file) + target_config_fp.touch() - assert not target_config_fp.exists() pipe = pipeline_command({}) if _can_run_background(): assert "background" in [opt.name for opt in pipe.params] @@ -99,8 +112,12 @@ def test_pipeline_command( if not extra_args: cli_runner.invoke(pipe, ["-c", pipe_config_fp.as_posix()]) + else: + assert Status(tmp_path).get(StatusField.MONITOR_PID) == os.getpid() + + with open(target_config_fp, "r") as config: + assert json.load(config) == SUCCESS_CONFIG - assert target_config_fp.exists() assert_message_was_logged("Pipeline job", "INFO") assert_message_was_logged("is complete.", "INFO") @@ -109,7 +126,7 @@ def test_pipeline_command( "extra_args", [["--background"], ["--monitor", "--background"]] ) def test_pipeline_command_with_background( - extra_args, tmp_path, cli_runner, monkeypatch + extra_args, pipe_config_fp, cli_runner, monkeypatch ): """Test the pipeline_command creation with background.""" @@ -124,9 +141,6 @@ def _new_run(config_file): monkeypatch.setattr(os, "setsid", lambda: None, raising=False) monkeypatch.setattr(os, "fork", lambda: None, raising=False) - pipe_config_fp = tmp_path / "config_pipe.json" - pipe_config_fp.touch() - pipe = pipeline_command({}) assert "background" in [opt.name for opt in pipe.params] assert not kickoff_background_cache @@ -138,12 +152,9 @@ def _new_run(config_file): assert pipe_config_fp.as_posix() in kickoff_background_cache[0] -def test_pipeline_command_cancel(tmp_path, cli_runner, monkeypatch): +def test_pipeline_command_cancel(pipe_config_fp, cli_runner, monkeypatch): """Test the pipeline_command with --cancel.""" - pipe_config_fp = tmp_path / "config_pipe.json" - pipe_config_fp.touch() - def _new_cancel(config): assert config == pipe_config_fp.as_posix() @@ -164,7 +175,7 @@ def test_ppl_command_no_config_arg( target_config_fp = tmp_cwd / "config_run.json" pipe_config_fp = tmp_cwd / "config_pipeline.json" - assert not 
target_config_fp.exists()
+    target_config_fp.touch()
     assert not pipe_config_fp.exists()
     pipe = pipeline_command({})
     result = cli_runner.invoke(pipe)
@@ -176,7 +187,8 @@
         json.dump(SAMPLE_CONFIG, config_file)

     cli_runner.invoke(pipe)
-    assert target_config_fp.exists()
+    with open(target_config_fp, "r") as config:
+        assert json.load(config) == SUCCESS_CONFIG

     cli_runner.invoke(pipe)
     assert_message_was_logged("Pipeline job", "INFO")
@@ -213,5 +225,29 @@ def _test_func():

     assert config == expected_config


+def test_pipeline_command_with_running_pid(
+    pipe_config_fp, cli_runner, monkeypatch
+):
+    """Assert pipeline_command does not start if a monitor PID exists."""
+
+    monkeypatch.setattr(
+        gaps.cli.pipeline.Status,
+        "get",
+        lambda *__, **___: os.getpid(),
+        raising=True,
+    )
+
+    pipe = pipeline_command({})
+    with pytest.warns(gapsWarning) as warn_info:
+        cli_runner.invoke(pipe, ["-c", pipe_config_fp.as_posix()], obj={})
+
+    assert "Another pipeline" in warn_info[0].message.args[0]
+    assert "is running on monitor PID:" in warn_info[0].message.args[0]
+    assert f"{os.getpid()}" in warn_info[0].message.args[0]
+    assert (
+        "Not starting a new pipeline execution" in warn_info[0].message.args[0]
+    )
+
+
 if __name__ == "__main__":
     pytest.main(["-q", "--show-capture=all", Path(__file__), "-rapP"])
diff --git a/tests/cli/test_cli_preprocesing.py b/tests/cli/test_cli_preprocesing.py
index dfb374aa..02e2c470 100644
--- a/tests/cli/test_cli_preprocesing.py
+++ b/tests/cli/test_cli_preprocesing.py
@@ -84,6 +84,9 @@ def test_preprocess_collect_config_pipeline_input(tmp_path):
     with open(config_fp, "w") as file_:
         json.dump(SAMPLE_CONFIG, file_)

+    (tmp_path / "config.json").touch()
+    (tmp_path / "collect_config.json").touch()
+
     Pipeline(config_fp)

     job_files = [
diff --git a/tests/cli/test_cli_status.py b/tests/cli/test_cli_status.py
index f40ef643..e2365813 100644
--- a/tests/cli/test_cli_status.py
+++ b/tests/cli/test_cli_status.py
@@ -2,11 +2,14 @@
 """
 GAPs status command tests.
""" +import json +import shutil from pathlib import Path import psutil import pytest +from gaps.status import HardwareStatusRetriever, StatusOption from gaps.cli.status import status_command from gaps.warnings import gapsWarning @@ -37,6 +40,12 @@ def test_status(test_data_dir, cli_runner, extra_args, monkeypatch): """Test the status command.""" monkeypatch.setattr(psutil, "pid_exists", lambda *__: True, raising=True) + monkeypatch.setattr( + HardwareStatusRetriever, + "__getitem__", + lambda *__, **___: StatusOption.SUBMITTED, + raising=True, + ) status = status_command() if "dne" in extra_args: @@ -75,16 +84,219 @@ def test_status(test_data_dir, cli_runner, extra_args, monkeypatch): "gaps_test_run_j3 submitted local", "collect-run not submitted", ] + start_ind = -21 + for ind, partial in enumerate(expected_partial_lines): + assert all( + string in lines[start_ind + ind] for string in partial.split() + ), f"{partial}, {lines[start_ind + ind:]}" + + assert "Total node runtime" in lines[-5] + assert "Total project wall time" in lines[-3] + assert lines[-4] == "Total AUs spent: 6" + +@pytest.mark.parametrize( + "extra_args", + [ + [], + "-i out_dir".split(), + "-c run".split() + "-c collect-run".split(), + "-s successful".split() + + "-s fail".split() + + "-s r".split() + + "-s pending".split() + + "-s u".split() + + "-s dne".split(), + "-c run".split() + + "-c collect-run".split() + + "-s successful".split() + + "-s fail".split() + + "-s r".split() + + "-s pending".split() + + "-s u".split() + + "-s dne".split(), + ], +) +def test_status_with_hardware_check( + test_data_dir, cli_runner, extra_args, monkeypatch +): + """Test the status command.""" + + monkeypatch.setattr(psutil, "pid_exists", lambda *__: True, raising=True) + + status = status_command() + if "dne" in extra_args: + with pytest.warns(gapsWarning): + result = cli_runner.invoke( + status, + [(test_data_dir / "test_run").as_posix()] + extra_args, + ) + else: + result = cli_runner.invoke( + status, + [(test_data_dir / "test_run").as_posix()] + extra_args, + ) + lines = result.stdout.split("\n") + cols = [ + "job_status", + "time_submitted", + "time_start", + "time_end", + "total_runtime", + "hardware", + "qos", + ] + if "out_dir" in extra_args: + cols += ["out_dir"] + cols = " ".join(cols) + + expected_partial_lines = [ + "test_run", + "MONITOR PID: 1234", + cols, + "--", + "gaps_test_run_j0 successful 0:03:38 local", + "gaps_test_run_j1 failed 0:01:05 eagle high", + "gaps_test_run_j2 failed local unspecified", + "gaps_test_run_j3 failed local", + "collect-run not submitted", + ] + + start_ind = -19 for ind, partial in enumerate(expected_partial_lines): assert all( - string in lines[-16 + ind] for string in partial.split() - ), partial + string in lines[start_ind + ind] for string in partial.split() + ), f"{partial}, {lines[start_ind + ind:]}" + if "failed" in lines[start_ind + ind]: + assert not "(r)" in lines[start_ind + ind] - assert "Total Runtime" in lines[-6] - assert "Total project time" in lines[-5] + assert "Total node runtime" in lines[-5] + assert "Total project wall time" in lines[-3] assert lines[-4] == "Total AUs spent: 6" +@pytest.mark.parametrize( + "extra_args", + [ + [], + "-i out_dir".split(), + "-c run".split() + "-c collect-run".split(), + "-s successful".split() + + "-s fail".split() + + "-s r".split() + + "-s pending".split() + + "-s u".split() + + "-s dne".split(), + "-c run".split() + + "-c collect-run".split() + + "-s successful".split() + + "-s fail".split() + + "-s r".split() + + "-s pending".split() + 
+ "-s u".split() + + "-s dne".split(), + ], +) +@pytest.mark.parametrize("single_command", [True, False]) +def test_failed_run( + tmp_path, + test_data_dir, + cli_runner, + extra_args, + monkeypatch, + single_command, +): + """Test the status command.""" + + monkeypatch.setattr(psutil, "pid_exists", lambda *__: True, raising=True) + + status = status_command() + if single_command: + shutil.copytree( + test_data_dir / "test_failed_run", tmp_path / "test_failed_run" + ) + run_dir = (tmp_path / "test_failed_run").as_posix() + pipe_json = Path(run_dir) / "test_failed_run_status.json" + with open(pipe_json, "r") as config_file: + config = json.load(config_file) + config.pop("collect-run") + with open(pipe_json, "w") as config_file: + json.dump(config, config_file) + else: + run_dir = (test_data_dir / "test_failed_run").as_posix() + + if "dne" in extra_args: + with pytest.warns(gapsWarning): + result = cli_runner.invoke(status, [run_dir] + extra_args) + else: + result = cli_runner.invoke(status, [run_dir] + extra_args) + + lines = result.stdout.split("\n") + cols = [ + "job_status", + "time_submitted", + "time_start", + "time_end", + "total_runtime", + "hardware", + "qos", + ] + if "out_dir" in extra_args: + cols += ["out_dir"] + cols = " ".join(cols) + + expected_partial_lines = [ + "test_failed_run", + "MONITOR PID: 1234", + cols, + "--", + "gaps_test_failed_run_j0 failed local", + "gaps_test_failed_run_j1 failed local high", + "gaps_test_failed_run_j2 failed local unspecified", + ] + + if single_command: + start_ind = -13 + else: + start_ind = -16 + expected_partial_lines += ["collect-run not submitted"] + + for ind, partial in enumerate(expected_partial_lines): + assert all( + string in lines[start_ind + ind] for string in partial.split() + ), f"{partial}, {lines[start_ind + ind]:}" + assert not "(r)" in lines[start_ind + ind] + assert "Total AUs spent" not in lines[start_ind + ind] + if single_command: + assert "Total project wall time" not in lines[start_ind + 12] + + if single_command: + assert "Total node runtime: 0:00:00" in lines[-3] + else: + assert "Total node runtime: 0:00:00" in lines[-4] + assert "Total project wall time" in lines[-3] + + +def test_recursive_status(tmp_path, test_data_dir, cli_runner, monkeypatch): + """Test the status command for recursive directories.""" + + monkeypatch.setattr(psutil, "pid_exists", lambda *__: True, raising=True) + + status = status_command() + shutil.copytree(test_data_dir / "test_run", tmp_path / "test_run") + shutil.copytree( + test_data_dir / "test_failed_run", + tmp_path / "test_run" / "test_failed_run", + ) + run_dir = (tmp_path / "test_run").as_posix() + + result = cli_runner.invoke(status, [run_dir, "-r"]) + + lines = result.stdout.split("\n") + assert any(line == "test_run:" for line in lines) + assert any(line == "test_failed_run:" for line in lines) + assert len(lines) > 20 + + if __name__ == "__main__": pytest.main(["-q", "--show-capture=all", Path(__file__), "-rapP"]) diff --git a/tests/conftest.py b/tests/conftest.py index 821f5687..bba0b120 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,13 +10,19 @@ import pytest from click.testing import CliRunner -from gaps import TEST_DATA_DIR +from gaps import TEST_DATA_DIR, logger from gaps.collection import find_h5_files LOGGING_META_FILES = {"log.py", "exceptions.py", "warnings.py"} +@pytest.fixture(autouse=True) +def include_logging(): + """Make the gaps logger propagate calls to the root.""" + logger.propagate = True + + @pytest.fixture def 
assert_message_was_logged(caplog): """Assert that a particular (partial) message was logged.""" diff --git a/tests/data/test_failed_run/jobstatus_gaps_test_failed_run_node0.json b/tests/data/test_failed_run/jobstatus_gaps_test_failed_run_node0.json new file mode 100644 index 00000000..b0e9ff67 --- /dev/null +++ b/tests/data/test_failed_run/jobstatus_gaps_test_failed_run_node0.json @@ -0,0 +1,11 @@ +{ + "run": { + "gaps_test_failed_run_j0": { + "job_status": "submitted", + "out_dir": "/a/test/path", + "out_file": null, + "time_start": "07-Dec-2022 14:21:38", + "hardware": "local" + } + } +} \ No newline at end of file diff --git a/tests/data/test_failed_run/jobstatus_gaps_test_failed_run_node1.json b/tests/data/test_failed_run/jobstatus_gaps_test_failed_run_node1.json new file mode 100644 index 00000000..a3f98b33 --- /dev/null +++ b/tests/data/test_failed_run/jobstatus_gaps_test_failed_run_node1.json @@ -0,0 +1,10 @@ +{ + "run": { + "gaps_test_failed_run_j1": { + "job_status": "failed", + "out_dir": "/a/test/path", + "time_start": "07-Dec-2022 14:25:21", + "qos": "high" + } + } +} \ No newline at end of file diff --git a/tests/data/test_failed_run/jobstatus_gaps_test_failed_run_node2.json b/tests/data/test_failed_run/jobstatus_gaps_test_failed_run_node2.json new file mode 100644 index 00000000..049a03d5 --- /dev/null +++ b/tests/data/test_failed_run/jobstatus_gaps_test_failed_run_node2.json @@ -0,0 +1,10 @@ +{ + "run": { + "gaps_test_failed_run_j2": { + "job_status": "running", + "out_dir": "/a/test/path", + "time_start": "07-Dec-2022 14:26:17", + "qos": "unspecified" + } + } +} \ No newline at end of file diff --git a/tests/data/test_failed_run/test_failed_run_status.json b/tests/data/test_failed_run/test_failed_run_status.json new file mode 100644 index 00000000..5af608d0 --- /dev/null +++ b/tests/data/test_failed_run/test_failed_run_status.json @@ -0,0 +1,26 @@ +{ + "another_input": ["hello world"], + "a_dict_input": {"a_value": 42}, + "monitor_pid": 1234, + "run": { + "pipeline_index": 0, + "gaps_test_failed_run_j0": { + "job_status": "submitted", + "hardware": "local", + "time_submitted": "07-Dec-2022 14:21:33" + }, + "gaps_test_failed_run_j1": { + "job_status": "submitted", + "hardware": "local", + "time_submitted": "07-Dec-2022 14:25:17" + }, + "gaps_test_failed_run_j2": { + "job_status": "submitted", + "hardware": "local", + "time_submitted": "07-Dec-2022 14:25:17" + } + }, + "collect-run": { + "pipeline_index": 1 + } +} \ No newline at end of file diff --git a/tests/test_collection.py b/tests/test_collection.py index d3918513..a7864515 100644 --- a/tests/test_collection.py +++ b/tests/test_collection.py @@ -372,7 +372,9 @@ def test_low_mem_collect(tmp_path, collect_pattern, points_path): collect_dir, pattern = collect_pattern ctc = Collector(h5_file, collect_dir / pattern, points_path) - ctc.collect("cf_profile", dataset_out=None, mem_util_lim=0.00002) + ctc.collect( + "cf_profile", dataset_out=None, memory_utilization_limit=0.00002 + ) with h5py.File(h5_file, "r") as collected_outputs: assert "cf_profile" in collected_outputs diff --git a/tests/test_hpc.py b/tests/test_hpc.py index 4f31dab7..d7dd7774 100644 --- a/tests/test_hpc.py +++ b/tests/test_hpc.py @@ -167,8 +167,28 @@ def test_check_job(manager, q_str): @pytest.mark.parametrize( - ("manager", "q_str", "kwargs", "expectation"), + ("manager", "q_str", "kwargs", "expectation", "add_qos"), [ + ( + PBS, + Q_STAT, + {"queue": "batch-h", "sh_script": "echo Hello!"}, + { + "#PBS -N submit_test": 1, + "#PBS -A rev": 2, + "#PBS -q 
batch-h": 3, + # cspell:disable-next-line + "#PBS -o ./stdout/submit_test_$PBS_JOBID.o": 4, + # cspell:disable-next-line + "#PBS -e ./stdout/submit_test_$PBS_JOBID.e": 5, + "#PBS -l walltime=00:26:00,qos=high": 6, + # cspell:disable-next-line + "echo Running on: $HOSTNAME, Machine Type: $MACHTYPE": 7, + "echo Running python in directory `which python`": 8, + "echo Hello!": 9, + }, + True, + ), ( PBS, Q_STAT, @@ -187,6 +207,7 @@ def test_check_job(manager, q_str): "echo Running python in directory `which python`": 8, "echo Hello!": 9, }, + False, ), ( SLURM, @@ -199,16 +220,17 @@ def test_check_job(manager, q_str): "#SBATCH --nodes=1": 4, "#SBATCH --output=./stdout/submit_test_%j.o": 5, "#SBATCH --error=./stdout/submit_test_%j.e": 6, - "#SBATCH --qos=normal": 7, + "#SBATCH --qos=high": 7, # cspell:disable-next-line "echo Running on: $HOSTNAME, Machine Type: $MACHTYPE": 8, "echo Running python in directory `which python`": 9, "echo Hello!": 10, }, + True, ), ], ) -def test_hpc_submit(manager, q_str, kwargs, expectation, monkeypatch): +def test_hpc_submit(manager, q_str, kwargs, expectation, add_qos, monkeypatch): """Test the HPC job submission utility""" queue_dict = manager.parse_queue_str(q_str) @@ -216,6 +238,8 @@ def test_hpc_submit(manager, q_str, kwargs, expectation, monkeypatch): kwargs["cmd"] = "python -c \"print('hello world')\"" kwargs["allocation"] = "rev" kwargs["walltime"] = 0.43 + if add_qos: + kwargs["qos"] = "high" out, err = hpc_manager.submit("job1", **kwargs) assert out is None assert err == "already_running" @@ -286,6 +310,7 @@ def test_submit(monkeypatch): cmd = "python -c \"print('hello world')\"" cmd_id = "python -c \"print('Job ID is 12342')\"" cmd_err = "python -c \"raise ValueError('An error occurred')\"" + cmd_squeue = f'python -c "print({SQUEUE_RAW!r})"' out, err = submit(cmd, background=False, background_stdout=False) assert out == "hello world" @@ -296,7 +321,13 @@ def test_submit(monkeypatch): assert not err out, err = submit(cmd_id, background=False, background_stdout=False) - assert out == "12342" + assert out == "Job ID is 12342" + assert not err + + out, err = submit(cmd_squeue, background=False, background_stdout=False) + out = out.replace("\n", "").replace("\r", "") + expected = SQUEUE_RAW.replace("\n", "").replace("\r", "") + assert out == expected assert not err with pytest.raises(OSError): diff --git a/tests/test_legacy.py b/tests/test_legacy.py index fc2eb51a..1e1a0be4 100644 --- a/tests/test_legacy.py +++ b/tests/test_legacy.py @@ -17,6 +17,7 @@ StatusField, StatusOption, ) +import gaps.status from gaps.legacy import ( HardwareStatusRetriever, Status, @@ -143,7 +144,7 @@ def test_make_job_file_and_retrieve_job_status(temp_job_dir, job_name): assert expected_fn not in [f.name for f in tmp_path.glob("*")] assert status_fp in tmp_path.glob("*") - assert status == "R", "Failed, status is {status!r}" + assert status == "R", f"Failed, status is {status!r}" status = Status.retrieve_job_status( status_dir=tmp_path, module="generation", job_name="test1.h5" @@ -175,6 +176,37 @@ def test_make_job_file_and_retrieve_job_status(temp_job_dir, job_name): ) +def test_update_job_status(tmp_path, monkeypatch): + """Test updating job status""" + status = Status(tmp_path) + status.data = { + "run": { + StatusField.PIPELINE_INDEX: 0, + "test0": {StatusField.JOB_STATUS: "R"}, + "test1": {StatusField.JOB_STATUS: StatusOption.SUCCESSFUL}, + } + } + + monkeypatch.setattr( + gaps.status.HardwareStatusRetriever, + "__getitem__", + lambda *__, **___: "successful", + 
raising=True, + ) + + status.update_job_status("run", "test0") + assert ( + status.data["run"]["test0"][StatusField.JOB_STATUS] + == StatusOption.SUCCESSFUL + ) + + status.update_job_status("run", "test1") + assert ( + status.data["run"]["test1"][StatusField.JOB_STATUS] + == StatusOption.SUCCESSFUL + ) + + def test_add_job(temp_job_dir): """Test job addition and exist check""" tmp_path, status_fp = temp_job_dir diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index de7578a7..ba7c94e6 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -18,6 +18,7 @@ Status, StatusField, StatusOption, + HardwareOption, HardwareStatusRetriever, ) from gaps.pipeline import ( @@ -38,8 +39,8 @@ SAMPLE_CONFIG = { "logging": {"log_level": "INFO"}, "pipeline": [ - {"run": "./config.json"}, - {"collect-run": "./collect_config.json"}, + {"run": "./config.{config_type}"}, + {"collect-run": "./collect_config.{config_type}"}, ], } @@ -53,17 +54,31 @@ def submit_call_cache(): @pytest.fixture def sample_config_fp_type(tmp_path): """Generate a sample config type and corresponding fp.""" - config_type = random.choice(list(ConfigType)) + types = list(ConfigType) + config_type = random.choice(types) config_fp = tmp_path / f"pipe_config.{config_type}" - return config_type, config_fp + + sample_config = { + "logging": {"log_level": "INFO"}, + "pipeline": [ + {"run": (tmp_path / "config.json").as_posix()}, + {"collect-run": (tmp_path / "collect_config.json").as_posix()}, + ], + } + return config_type, config_fp, sample_config @pytest.fixture def sample_pipeline_config(sample_config_fp_type): """Write a sample pipeline config for use in tests.""" - config_type, config_fp = sample_config_fp_type + config_type, config_fp, sample_config = sample_config_fp_type with open(config_fp, "w") as file_: - config_type.dump(SAMPLE_CONFIG, file_) + config_type.dump(sample_config, file_) + + for step_dict in sample_config["pipeline"]: + for step_config_fp in step_dict.values(): + Path(step_config_fp).touch() + return config_fp @@ -99,9 +114,28 @@ def test_pipeline_init_bad_config(tmp_path): config_fp = tmp_path / "pipe_config.json" with open(config_fp, "w") as file_: json.dump({}, file_) - with pytest.raises(gapsConfigError): + + with pytest.raises(gapsConfigError) as exc_info: + Pipeline(config_fp) + + assert "Could not find required key" in str(exc_info) + + with open(config_fp, "w") as file_: + json.dump({"pipeline": "./run_config.json"}, file_) + + with pytest.raises(gapsConfigError) as exc_info: + Pipeline(config_fp) + + assert "must be a list" in str(exc_info) + + with open(config_fp, "w") as file_: + json.dump({"pipeline": [{"run": "./dne_config.json"}]}, file_) + + with pytest.raises(gapsConfigError) as exc_info: Pipeline(config_fp) + assert "depends on non-existent file" in str(exc_info) + def test_pipeline_init(sample_pipeline_config, assert_message_was_logged): """Test initializing Pipeline.""" @@ -156,6 +190,9 @@ def test_pipeline_submit(tmp_path, runnable_pipeline): with open(config_fp, "w") as file_: config_type.dump(sample_config, file_) + for fn in ["config.json", "collect_config.json", "dne_config.json"]: + (tmp_path / fn).touch() + pipeline = Pipeline(config_fp) pipeline._submit(0) assert runnable_pipeline[-1] == (tmp_path / "config.json").as_posix() @@ -300,7 +337,7 @@ def test_pipeline_get_command_return_code( def test_pipeline_status(sample_pipeline_config, monkeypatch): """Test the `_status` function.""" pipeline = Pipeline(sample_pipeline_config) - assert pipeline._status(0) == 
StatusOption.RUNNING + assert pipeline._status(0) == StatusOption.NOT_SUBMITTED monkeypatch.setattr( HardwareStatusRetriever, "__getitem__", @@ -485,22 +522,31 @@ def cache_cancel_calls(__, job_id): sample_pipeline_config.parent, "run", "test1", - job_attrs={StatusField.JOB_ID: 0}, + job_attrs={ + StatusField.JOB_ID: 0, + StatusField.HARDWARE: HardwareOption.LOCAL, + }, ) Status.mark_job_as_submitted( sample_pipeline_config.parent, "run", "test2", - job_attrs={StatusField.JOB_ID: 1}, + job_attrs={ + StatusField.JOB_ID: 1, + StatusField.HARDWARE: HardwareOption.EAGLE, + }, ) Status.mark_job_as_submitted( sample_pipeline_config.parent, "run", "test3", - job_attrs={StatusField.JOB_ID: 12}, + job_attrs={ + StatusField.JOB_ID: 12, + StatusField.HARDWARE: HardwareOption.EAGLE, + }, ) Pipeline.cancel_all(sample_pipeline_config) - assert set(cancelled_jobs) == {0, 1, 12} + assert set(cancelled_jobs) == {1, 12} assert_message_was_logged("Pipeline job", "INFO") assert_message_was_logged("cancelled", "INFO") @@ -617,7 +663,7 @@ def test_parse_previous_status(sample_pipeline_config): with pytest.warns(gapsWarning): assert not parse_previous_status(sample_pipeline_config.parent, "run") - assert len(list(sample_pipeline_config.parent.glob("*"))) == 2 + assert len(list(sample_pipeline_config.parent.glob("*"))) == 4 Status.make_single_job_file( sample_pipeline_config.parent, command="run", @@ -638,13 +684,13 @@ def test_parse_previous_status(sample_pipeline_config): StatusField.OUT_FILE: ["another_test.h5", "a_third.h5"], }, ) - assert len(list(sample_pipeline_config.parent.glob("*"))) == 4 + assert len(list(sample_pipeline_config.parent.glob("*"))) == 6 out_files = parse_previous_status( sample_pipeline_config.parent, "collect-run" ) assert set(out_files) == {"test.h5", "another_test.h5", "a_third.h5"} - assert len(list(sample_pipeline_config.parent.glob("*"))) == 4 + assert len(list(sample_pipeline_config.parent.glob("*"))) == 6 ids = parse_previous_status( sample_pipeline_config.parent, "collect-run", key=StatusField.JOB_ID ) diff --git a/tests/test_status.py b/tests/test_status.py index 847cac3d..06e89029 100644 --- a/tests/test_status.py +++ b/tests/test_status.py @@ -33,7 +33,7 @@ TEST_1_ATTRS_1 = { "job_name": "test1", - StatusField.JOB_STATUS: "R", + StatusField.JOB_STATUS: StatusOption.RUNNING, "run_id": 1234, } TEST_1_ATTRS_2 = { @@ -42,11 +42,11 @@ } TEST_2_ATTRS_1 = { "job_name": "test2", - StatusField.JOB_STATUS: "R", + StatusField.JOB_STATUS: StatusOption.RUNNING, } TEST_2_ATTRS_2 = { "job_name": "test2", - StatusField.JOB_STATUS: "R", + StatusField.JOB_STATUS: StatusOption.RUNNING, StatusField.JOB_ID: 123, } @@ -257,7 +257,7 @@ def test_make_file(temp_job_dir, job_name): assert expected_fn not in [f.name for f in tmp_path.glob("*")] assert status_fp in tmp_path.glob("*") - assert status == "R", "Failed, status is {status!r}" + assert status == StatusOption.RUNNING, f"Failed, status is {status!r}" status = Status.retrieve_job_status(tmp_path, "generation", "test1.h5") assert expected_fn not in [f.name for f in tmp_path.glob("*")] @@ -299,6 +299,9 @@ def test_update_job_status(tmp_path, monkeypatch): status.update_job_status("run", job_name) assert status.data["run"].get(job_name) == {} + tmp_path = tmp_path / "test" + tmp_path.mkdir() + Status.make_single_job_file( tmp_path, "generation", "test1", TEST_1_ATTRS_1 ) @@ -331,33 +334,66 @@ def test_update_job_status(tmp_path, monkeypatch): ) new_attrs = { "job_name": "test1", - StatusField.JOB_STATUS: "some status", + StatusField.JOB_STATUS: 
StatusOption.SUBMITTED, StatusField.HARDWARE: "local", } Status.make_single_job_file(tmp_path, "generation", "test1", new_attrs) status.update_job_status("generation", "test1") assert ( status.data["generation"]["test1"][StatusField.JOB_STATUS] - == "some status" + == StatusOption.SUBMITTED ) monkeypatch.setattr( HardwareStatusRetriever, "__getitem__", - lambda *__, **___: StatusOption.RUNNING, + lambda *__, **___: StatusOption.SUBMITTED, raising=True, ) status.update_job_status("generation", "test1") assert ( status.data["generation"]["test1"][StatusField.JOB_STATUS] - == StatusOption.RUNNING + == StatusOption.SUBMITTED ) # test a repeated call to hardware status.update_job_status("generation", "test1") assert ( status.data["generation"]["test1"][StatusField.JOB_STATUS] - == StatusOption.RUNNING + == StatusOption.SUBMITTED + ) + + +def test_update_job_status_with_hardware(tmp_path, monkeypatch): + """Test status update with call to hardware""" + monkeypatch.setattr( + HardwareStatusRetriever, + "__getitem__", + lambda *__, **___: None, + raising=True, + ) + + Status.make_single_job_file( + tmp_path, + "generation", + "test1", + {StatusField.JOB_STATUS: StatusOption.SUBMITTED}, + ) + Status.make_single_job_file( + tmp_path, + "generation", + "test2", + {StatusField.JOB_STATUS: StatusOption.COMPLETE}, + ) + + status = Status(tmp_path).update_from_all_job_files(check_hardware=True) + assert ( + status["generation"]["test1"][StatusField.JOB_STATUS] + == StatusOption.FAILED + ) + assert ( + status["generation"]["test2"][StatusField.JOB_STATUS] + == StatusOption.COMPLETE ) @@ -567,7 +603,7 @@ def test_parse_command_status(tmp_path): assert not Status.parse_command_status(tmp_path, "generation", key="dne") -def test_status_updates(tmp_path): +def test_status_updates(tmp_path, assert_message_was_logged): """Test `StatusUpdates` context manager""" assert not list(tmp_path.glob("*")) @@ -591,6 +627,9 @@ def test_status_updates(tmp_path): stu.out_file = "my_test_file.h5" + assert_message_was_logged("Command 'generation' complete.", "INFO") + assert_message_was_logged("Target output file: 'my_test_file.h5'", "INFO") + job_files = list(tmp_path.glob("*")) assert len(job_files) == 1 with open(job_files[0]) as job_status: @@ -605,7 +644,7 @@ def test_status_updates(tmp_path): assert test_attrs == TEST_1_ATTRS_1 -def test_status_for_failed_job(tmp_path): +def test_status_for_failed_job(tmp_path, assert_message_was_logged): """Test `StatusUpdates` context manager for a failed job""" class _TestError(ValueError): @@ -629,6 +668,8 @@ class _TestError(ValueError): except _TestError: pass + assert_message_was_logged("Command 'generation' failed in", "INFO") + status = Status(tmp_path).update_from_all_job_files() status = status["generation"]["test0"]