diff --git a/.github/workflows/push-pr_workflow.yml b/.github/workflows/push-pr_workflow.yml index e2d9e164e..bef6f8608 100644 --- a/.github/workflows/push-pr_workflow.yml +++ b/.github/workflows/push-pr_workflow.yml @@ -129,7 +129,7 @@ jobs: - name: Run pytest over unit test suite run: | - python3 -m pytest tests/unit/ + python3 -m pytest -v --order-scope=module tests/unit/ - name: Run integration test suite for local tests run: | diff --git a/CHANGELOG.md b/CHANGELOG.md index e6c59b44b..0126abcb4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,17 +6,31 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Pytest fixtures in the `conftest.py` file of the integration test suite + - NOTE: an export command `export LC_ALL='C'` had to be added to fix a bug in the WEAVE CI. This can be removed when we resolve this issue for the `merlin server` command +- Tests for the `celeryadapter.py` module +- New CeleryTestWorkersManager context to help with starting/stopping workers for tests +- A new command `merlin detailed-status` that displays task-by-task status information about your study + - This has options to filter by return code, task queues, task statuses, and workers + - You can set a limit on the number of tasks to display + - There are 3 options to modify the output display - New file `merlin/study/status.py` dedicated to work relating to the status command - Contains the Status and DetailedStatus classes +- New file `merlin/study/status_renderers.py` dedicated to formatting the output for the detailed-status command - New file `merlin/common/dumper.py` containing a Dumper object to help dump output to outfiles - Study name and parameter info now stored in the DAG and MerlinStep objects - Added functions to `merlin/display.py` that help display status information: + - `display_task_by_task_status` handles the display for the `merlin detailed-status` command - `display_status_summary` handles the display 
for the `merlin status` command - `display_progress_bar` generates and displays a progress bar - Added new methods to the MerlinSpec class: + - get_worker_step_map() + - get_queue_step_relationship() - get_tasks_per_step() + - get_step_param_map() - Added methods to the MerlinStepRecord class to mark status changes for tasks as they run (follows Maestro's StepRecord format mostly) - Added methods to the Step class: + - establish_params() - name_no_params() - Added a property paramater_labels to the MerlinStudy class - Added two new utility functions: @@ -24,16 +38,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ws_time_to_dt() that converts a workspace timestring (YYYYMMDD-HHMMSS) to a datetime object - A new celery task `condense_status_files` to be called when sets of samples finish - Added a celery config setting `worker_cancel_long_running_tasks_on_connection_loss` since this functionality is about to change in the next version of celery -- Tests for the Status class +- Tests for the Status and DetailedStatus classes - this required adding a decent amount of test files to help with the tests; these can be found under the tests/unit/study/status_test_files directory ### Changed - Reformatted the entire `merlin status` command - Now accepts both spec files and workspace directories as arguments - - e.g. 
"merlin status hello.yaml" and "merlin status hello_20230228-111920/" both work - Removed the --steps flag - Replaced the --csv flag with the --dump flag - - This will make it easier in the future to adopt more file types to dump to - New functionality: - Shows step_by_step progress bar for tasks - Displays a summary of task statuses below the progress bar @@ -45,8 +57,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Moved `verify_filepath` and `verify_dirpath` from `merlin/main.py` to `merlin/utils.py` ### Fixed +- The `merlin status` command so that it's consistent in its output whether using redis or rabbitmq as the broker +- The `merlin monitor` command will now keep an allocation up if the queues are empty and workers are still processing tasks +- Add the restart keyword to the specification docs - Cyclical imports and config imports that could easily cause ci issues +## [1.11.1] +### Fixed +- Typo in `batch.py` that caused lsf launches to fail (`ALL_SGPUS` changed to `ALL_GPUS`) + ## [1.11.0] ### Added - New reserved variable: diff --git a/Makefile b/Makefile index 4a857a217..b669d51b1 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # @@ -87,7 +87,7 @@ install-dev: virtualenv install-merlin install-workflow-deps # tests require a valid dev install of merlin unit-tests: . $(VENV)/bin/activate; \ - $(PYTHON) -m pytest $(UNIT); \ + $(PYTHON) -m pytest -v --order-scope=module $(UNIT); \ # run CLI tests - these require an active install of merlin in a venv @@ -135,9 +135,9 @@ check-flake8: check-black: . 
$(VENV)/bin/activate; \ - $(PYTHON) -m black --check --line-length $(MAX_LINE_LENGTH) --target-version py36 $(MRLN); \ - $(PYTHON) -m black --check --line-length $(MAX_LINE_LENGTH) --target-version py36 $(TEST); \ - $(PYTHON) -m black --check --line-length $(MAX_LINE_LENGTH) --target-version py36 *.py; \ + $(PYTHON) -m black --check --line-length $(MAX_LINE_LENGTH) --target-version py38 $(MRLN); \ + $(PYTHON) -m black --check --line-length $(MAX_LINE_LENGTH) --target-version py38 $(TEST); \ + $(PYTHON) -m black --check --line-length $(MAX_LINE_LENGTH) --target-version py38 *.py; \ check-isort: @@ -179,9 +179,9 @@ fix-style: $(PYTHON) -m isort -w $(MAX_LINE_LENGTH) $(MRLN); \ $(PYTHON) -m isort -w $(MAX_LINE_LENGTH) $(TEST); \ $(PYTHON) -m isort -w $(MAX_LINE_LENGTH) *.py; \ - $(PYTHON) -m black --target-version py36 -l $(MAX_LINE_LENGTH) $(MRLN); \ - $(PYTHON) -m black --target-version py36 -l $(MAX_LINE_LENGTH) $(TEST); \ - $(PYTHON) -m black --target-version py36 -l $(MAX_LINE_LENGTH) *.py; \ + $(PYTHON) -m black --target-version py38 -l $(MAX_LINE_LENGTH) $(MRLN); \ + $(PYTHON) -m black --target-version py38 -l $(MAX_LINE_LENGTH) $(TEST); \ + $(PYTHON) -m black --target-version py38 -l $(MAX_LINE_LENGTH) *.py; \ # Increment the Merlin version. USE ONLY ON DEVELOP BEFORE MERGING TO MASTER. diff --git a/docs/source/merlin_commands.rst b/docs/source/merlin_commands.rst index cb9b8eefb..1baa0e7a5 100644 --- a/docs/source/merlin_commands.rst +++ b/docs/source/merlin_commands.rst @@ -110,8 +110,15 @@ Monitor (``merlin monitor``) Batch submission scripts may not keep the batch allocation alive if there is not a blocking process in the submission script. The ``merlin monitor`` command addresses this by providing a blocking process that -checks for tasks in the queues every (sleep) seconds. When the queues are empty, the -blocking process will exit and allow the allocation to end. +checks for tasks in the queues every (sleep) seconds. 
When the queues are empty, the +monitor will query celery to see if any workers are still processing tasks from the +queues. If no workers are processing any tasks from the queues and the queues are empty, +the blocking process will exit and allow the allocation to end. + +The ``monitor`` function will check for celery workers for up to +10*(sleep) seconds before monitoring begins. The loop happens when the +queue(s) in the spec contain tasks, but no running workers are detected. +This is to protect against a failed worker launch. .. code:: bash @@ -129,11 +136,6 @@ for workers. The default is 60 seconds. The only currently available option for ``--task_server`` is celery, which is the default when this flag is excluded. -The ``monitor`` function will check for celery workers for up to -10*(sleep) seconds before monitoring begins. The loop happens when the -queue(s) in the spec contain tasks, but no running workers are detected. -This is to protect against a failed worker launch. - Purging Tasks (``merlin purge``) -------------------------------- diff --git a/docs/source/merlin_specification.rst b/docs/source/merlin_specification.rst index f857fabf3..71e041b33 100644 --- a/docs/source/merlin_specification.rst +++ b/docs/source/merlin_specification.rst @@ -120,6 +120,9 @@ see :doc:`./merlin_variables`. # The $(LAUNCHER) macro can be used to substitute a parallel launcher # based on the batch:type:. # It will use the nodes and procs values for the task. + # restart: The (optional) restart command to run when $(MERLIN_RESTART) + # is the exit code. The command in cmd will be run if the exit code + # is $(MERLIN_RETRY). # task_queue: the queue to assign the step to (optional. default: merlin) # shell: the shell to use for the command (eg /bin/bash /usr/bin/env python) # (optional. default: /bin/bash) @@ -156,6 +159,8 @@ see :doc:`./merlin_variables`. 
cmd: | cd $(runs1.workspace)/$(MERLIN_SAMPLE_PATH) + # exit $(MERLIN_RESTART) # syntax to send a restart error code + # This will rerun the cmd command. Users can also use $(MERLIN_RETRY). nodes: 1 procs: 1 depends: [runs1] @@ -167,7 +172,14 @@ see :doc:`./merlin_variables`. cmd: | touch learnrun.out $(LAUNCHER) echo "$(VAR1) $(VAR2)" >> learnrun.out - exit $(MERLIN_RETRY) # some syntax to send a retry error code + exit $(MERLIN_RESTART) # syntax to send a restart error code + # exit $(MERLIN_RETRY) # syntax to send a retry error code to + # run the cmd command again instead of the restart command. + restart: | + # Command to run if the $(MERLIN_RESTART) exit code is used + touch learnrunrs.out + $(LAUNCHER) echo "$(VAR1) $(VAR2)" >> learnrunrs.out + exit $(MERLIN_SUCCESS) # syntax to send a success code nodes: 1 procs: 1 task_queue: lqueue diff --git a/merlin/__init__.py b/merlin/__init__.py index 20a0e8b3e..c1ad21b22 100644 --- a/merlin/__init__.py +++ b/merlin/__init__.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # @@ -38,7 +38,7 @@ import sys -__version__ = "1.11.0" +__version__ = "1.11.1" VERSION = __version__ PATH_TO_PROJ = os.path.join(os.path.dirname(__file__), "") diff --git a/merlin/ascii_art.py b/merlin/ascii_art.py index f823937a6..3cca2c710 100644 --- a/merlin/ascii_art.py +++ b/merlin/ascii_art.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/celery.py b/merlin/celery.py index 95f26530e..55d616658 100644 --- a/merlin/celery.py +++ b/merlin/celery.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. 
+# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/common/__init__.py b/merlin/common/__init__.py index d6f53d03d..e6dccdf56 100644 --- a/merlin/common/__init__.py +++ b/merlin/common/__init__.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/common/abstracts/__init__.py b/merlin/common/abstracts/__init__.py index d6f53d03d..e6dccdf56 100644 --- a/merlin/common/abstracts/__init__.py +++ b/merlin/common/abstracts/__init__.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/common/abstracts/enums/__init__.py b/merlin/common/abstracts/enums/__init__.py index 7b8ab80f5..383e7dccd 100644 --- a/merlin/common/abstracts/enums/__init__.py +++ b/merlin/common/abstracts/enums/__init__.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/common/openfilelist.py b/merlin/common/openfilelist.py index 124c7851d..d79e4e4f3 100644 --- a/merlin/common/openfilelist.py +++ b/merlin/common/openfilelist.py @@ -8,7 +8,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/common/opennpylib.py b/merlin/common/opennpylib.py index a8f8dffb2..da366b452 100644 --- a/merlin/common/opennpylib.py +++ b/merlin/common/opennpylib.py @@ -8,7 +8,7 @@ # # LLNL-CODE-797170 # All rights reserved. 
-# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/common/sample_index.py b/merlin/common/sample_index.py index 149d52e13..4e3ac3a52 100644 --- a/merlin/common/sample_index.py +++ b/merlin/common/sample_index.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/common/sample_index_factory.py b/merlin/common/sample_index_factory.py index dc13d41d1..4303c3a6e 100644 --- a/merlin/common/sample_index_factory.py +++ b/merlin/common/sample_index_factory.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/common/security/__init__.py b/merlin/common/security/__init__.py index d6f53d03d..e6dccdf56 100644 --- a/merlin/common/security/__init__.py +++ b/merlin/common/security/__init__.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/common/security/encrypt.py b/merlin/common/security/encrypt.py index 125ec5bed..806d42e0c 100644 --- a/merlin/common/security/encrypt.py +++ b/merlin/common/security/encrypt.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. 
# diff --git a/merlin/common/security/encrypt_backend_traffic.py b/merlin/common/security/encrypt_backend_traffic.py index 68e178b77..e0957ebb8 100644 --- a/merlin/common/security/encrypt_backend_traffic.py +++ b/merlin/common/security/encrypt_backend_traffic.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/common/tasks.py b/merlin/common/tasks.py index 76b8ad3b5..893aa10e0 100644 --- a/merlin/common/tasks.py +++ b/merlin/common/tasks.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # @@ -645,7 +645,7 @@ def expand_tasks_with_samples( # pylint: disable=R0913,R0914 if not found_tasks: for next_index_path, next_index in sample_index.traverse(conditional=condition): LOG.info( - f"generating next step for range {next_index.min}:{next_index.max} {next_index.max-next_index.min}" + f"generating next step for range {next_index.min}:{next_index.max} {next_index.max - next_index.min}" ) next_index.name = next_index_path diff --git a/merlin/common/util_sampling.py b/merlin/common/util_sampling.py index c29763485..134d0b66c 100644 --- a/merlin/common/util_sampling.py +++ b/merlin/common/util_sampling.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/config/__init__.py b/merlin/config/__init__.py index 0594ffe45..b58e3b2a9 100644 --- a/merlin/config/__init__.py +++ b/merlin/config/__init__.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. 
# # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/config/broker.py b/merlin/config/broker.py index fe49ff162..385b8c1df 100644 --- a/merlin/config/broker.py +++ b/merlin/config/broker.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/config/celeryconfig.py b/merlin/config/celeryconfig.py index 67d106809..335c82ed0 100644 --- a/merlin/config/celeryconfig.py +++ b/merlin/config/celeryconfig.py @@ -10,7 +10,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/config/configfile.py b/merlin/config/configfile.py index 1f3418377..2ca6c5d04 100644 --- a/merlin/config/configfile.py +++ b/merlin/config/configfile.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/config/results_backend.py b/merlin/config/results_backend.py index d3e7002e7..b88655399 100644 --- a/merlin/config/results_backend.py +++ b/merlin/config/results_backend.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/config/utils.py b/merlin/config/utils.py index 65fc6f85c..8f0c6b029 100644 --- a/merlin/config/utils.py +++ b/merlin/config/utils.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. 
# diff --git a/merlin/data/celery/__init__.py b/merlin/data/celery/__init__.py index d6f53d03d..e6dccdf56 100644 --- a/merlin/data/celery/__init__.py +++ b/merlin/data/celery/__init__.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/display.py b/merlin/display.py index b34be8b6b..bf9e30fdd 100644 --- a/merlin/display.py +++ b/merlin/display.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # @@ -45,6 +45,7 @@ from tabulate import tabulate from merlin.ascii_art import banner_small +from merlin.study.status_renderers import status_renderer_factory LOG = logging.getLogger("merlin") @@ -234,6 +235,70 @@ def print_info(args): # pylint: disable=W0613 print("") +def display_status_task_by_task(status_obj: "DetailedStatus", test_mode: bool = False): # noqa: F821 + """ + Displays a low level overview of the status of a study. This is a task-by-task + status display where each task will show: + step name, worker name, task queue, cmd & restart parameters, + step workspace, step status, return code, elapsed time, run time, and num restarts. + If too many tasks are found and the pager is disabled, prompts will appear for the user to decide + what to do that way we don't overload the terminal (unless the no-prompts flag is provided). + + :param `status_obj`: A DetailedStatus object + :param `test_mode`: If true, run this in testing mode and don't print any output. This will also + decrease the limit on the number of tasks allowed before a prompt is displayed. 
+ """ + args = status_obj.args + try: + status_renderer = status_renderer_factory.get_renderer(args.layout, args.disable_theme, args.disable_pager) + except ValueError: + LOG.error(f"Layout '{args.layout}' not implemented.") + raise + + cancel_display = False + + # If the pager is disabled then we need to be careful not to overload the terminal with a bazillion tasks + if args.disable_pager and not args.no_prompts: + # Setting the limit by default to be 250 tasks before asking for additional filters + no_prompt_limit = 250 if not test_mode else 15 + while status_obj.num_requested_statuses > no_prompt_limit: + # See if the user wants to apply additional filters + apply_additional_filters = input( + f"About to display {status_obj.num_requested_statuses} tasks without a pager. " + "Would you like to apply additional filters? (y/n/c) " + ).lower() + while apply_additional_filters not in ("y", "n", "c"): + apply_additional_filters = input( + "Invalid input. You must enter either 'y' for yes, 'n' for no, or 'c' for cancel: " + ).lower() + + # Apply filters if necessary or break the loop + if apply_additional_filters == "y": + status_obj.filter_via_prompts() + elif apply_additional_filters == "n": + print(f"Not filtering further. 
Displaying {status_obj.num_requested_statuses} tasks...") + break + else: + print("Cancelling status display.") + cancel_display = True + break + + # Display the statuses + if not cancel_display and not test_mode: + if status_obj.num_requested_statuses > 0: + # Table layout requires csv format (since it uses Maestro's renderer) + if args.layout == "table": + status_data = status_obj.format_status_for_csv() + else: + status_data = status_obj.requested_statuses + status_renderer.layout(status_data=status_data, study_title=status_obj.workspace) + status_renderer.render() + + for ustep in status_obj.step_tracker["unstarted_steps"]: + print(f"\n{ustep} has not started yet.") + print() + + def _display_summary(state_info: Dict[str, str], cb_help: bool): """ Given a dict of state info for a step, print a summary of the task states. @@ -306,19 +371,16 @@ def display_status_summary( # pylint: disable=R0912 "RUNNING": {"count": 0, "color": ANSI_COLORS["BLUE"]}, "DRY RUN": {"count": 0, "color": ANSI_COLORS["ORANGE"], "fill": "\\"}, "TOTAL TASKS": {"total": status_obj.tasks_per_step[sstep]}, - "AVG RUN TIME": status_obj.requested_statuses[sstep]["avg_run_time"], - "RUN TIME STD DEV": status_obj.requested_statuses[sstep]["run_time_std_dev"], + "AVG RUN TIME": status_obj.run_time_info[sstep]["avg_run_time"], + "RUN TIME STD DEV": status_obj.run_time_info[sstep]["run_time_std_dev"], } # Initialize a var to track # of completed tasks and grab the statuses for this step num_completed_tasks = 0 - step_statuses = status_obj.requested_statuses[sstep] # Loop through each entry for the step (if there's no parameters there will just be one entry) - for real_step_name, overall_step_info in step_statuses.items(): - # Non-dict entries are just for run time info at the moment - if not isinstance(overall_step_info, dict): - continue + for full_step_name in status_obj.full_step_name_map[sstep]: + overall_step_info = status_obj.requested_statuses[full_step_name] # If this was a non-local run we 
should have a task queue and worker name to add to state_info if "task_queue" in overall_step_info: diff --git a/merlin/examples/__init__.py b/merlin/examples/__init__.py index d6f53d03d..e6dccdf56 100644 --- a/merlin/examples/__init__.py +++ b/merlin/examples/__init__.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/examples/examples.py b/merlin/examples/examples.py index 1d756f00e..9b65f31ae 100644 --- a/merlin/examples/examples.py +++ b/merlin/examples/examples.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/examples/generator.py b/merlin/examples/generator.py index 294787857..2fa5e61ce 100644 --- a/merlin/examples/generator.py +++ b/merlin/examples/generator.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/exceptions/__init__.py b/merlin/exceptions/__init__.py index 506c3caca..5d92d6242 100644 --- a/merlin/exceptions/__init__.py +++ b/merlin/exceptions/__init__.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. 
# @@ -42,7 +42,8 @@ "HardFailException", "InvalidChainException", "RestartException", - "DeepMergeException,", + "DeepMergeException", + "NoWorkersException", ) @@ -103,3 +104,13 @@ class DeepMergeException(Exception): def __init__(self, message): super().__init__(message) + + +class NoWorkersException(Exception): + """ + Exception to signal that no workers were started + to process a non-empty queue(s). + """ + + def __init__(self, message): + super().__init__(message) diff --git a/merlin/log_formatter.py b/merlin/log_formatter.py index 3fba8cfc8..b8858f721 100644 --- a/merlin/log_formatter.py +++ b/merlin/log_formatter.py @@ -8,7 +8,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/main.py b/merlin/main.py index 54566818f..ab6da588c 100644 --- a/merlin/main.py +++ b/merlin/main.py @@ -8,7 +8,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. 
# @@ -55,7 +55,9 @@ from merlin.server.server_commands import config_server, init_server, restart_server, start_server, status_server, stop_server from merlin.spec.expansion import RESERVED, get_spec_with_expansion from merlin.spec.specification import MerlinSpec -from merlin.study.status import Status +from merlin.study.status import DetailedStatus, Status +from merlin.study.status_constants import VALID_RETURN_CODES, VALID_STATUS_FILTERS +from merlin.study.status_renderers import status_renderer_factory from merlin.study.study import MerlinStudy from merlin.utils import ARRAY_FILE_FORMATS, verify_dirpath, verify_filepath @@ -191,11 +193,13 @@ def launch_workers(args): spec, filepath = get_merlin_spec_with_override(args) if not args.worker_echo_only: LOG.info(f"Launching workers from '{filepath}'") - status = router.launch_workers(spec, args.worker_steps, args.worker_args, args.disable_logs, args.worker_echo_only) + launch_worker_status = router.launch_workers( + spec, args.worker_steps, args.worker_args, args.disable_logs, args.worker_echo_only + ) if args.worker_echo_only: - print(status) + print(launch_worker_status) else: - LOG.debug(f"celery command: {status}") + LOG.debug(f"celery command: {launch_worker_status}") def purge_tasks(args): @@ -251,15 +255,13 @@ def query_status(args): # If we're loading status based on a spec, load in the spec provided if spec_display: args.specification = file_or_ws - spec_provided, _ = get_merlin_spec_with_override(args) - args.spec_provided = spec_provided + args.spec_provided = get_spec_with_expansion(args.specification) # Get either a Status object or DetailedStatus object - # if args.detailed: - # status_obj = DetailedStatus(args, spec_display, file_or_ws) - # else: - # status_obj = Status(args, spec_display, file_or_ws) - status_obj = Status(args, spec_display, file_or_ws) # The above can be uncommented when we add DetailedStatus + if args.detailed: + status_obj = DetailedStatus(args, spec_display, file_or_ws) + else: + 
status_obj = Status(args, spec_display, file_or_ws) # Handle output appropriately if args.dump: @@ -361,8 +363,13 @@ def process_monitor(args): """ LOG.info("Monitor: checking queues ...") spec, _ = get_merlin_spec_with_override(args) + + # Give the user time to queue up jobs in case they haven't already + time.sleep(args.sleep) + + # Check if we still need our allocation while router.check_merlin_status(args, spec): - LOG.info("Monitor: found tasks in queues") + LOG.info("Monitor: found tasks in queues and/or tasks being processed") time.sleep(args.sleep) LOG.info("Monitor: ... stop condition met") @@ -919,14 +926,105 @@ def generate_diagnostic_parsers(subparsers: ArgumentParser) -> None: Default: %(default)s", ) status_cmd.add_argument( - "--vars", + "-o", + "--output-path", action="store", - dest="variables", type=str, - nargs="+", default=None, - help="Specify desired Merlin variable values to override those found in the specification. Space-delimited. " - "Example: '--vars LEARN=path/to/new_learn.py EPOCHS=3'", + help="Specify a location to look for output workspaces. Only used when a spec file is passed as the argument " + "to 'status'; this will NOT be used if an output workspace is passed as the argument.", + ) + + # merlin detailed-status + detailed_status: ArgumentParser = subparsers.add_parser( + "detailed-status", + help="Display a task-by-task status of a study.", + ) + detailed_status.set_defaults(func=query_status, detailed=True) + detailed_status.add_argument( + "spec_or_workspace", type=str, help="Path to a Merlin YAML spec file or a launched Merlin study" + ) + detailed_status.add_argument( + "--dump", type=str, help="Dump the status to a file. 
Provide the filename (must be .csv or .json).", default=None + ) + detailed_status.add_argument( + "--task_server", + type=str, + default="celery", + help="Task server type.\ + Default: %(default)s", + ) + detailed_status.add_argument( + "-o", + "--output-path", + action="store", + type=str, + default=None, + help="Specify a location to look for output workspaces. Only used when a spec file is passed as the argument " + "to 'status'; this will NOT be used if an output workspace is passed as the argument.", + ) + status_filter_group = detailed_status.add_argument_group("filter options") + status_filter_group.add_argument( + "--max-tasks", action="store", type=int, help="Sets a limit on how many tasks can be displayed" + ) + status_filter_group.add_argument( + "--return-code", + action="store", + nargs="+", + type=str, + choices=VALID_RETURN_CODES, + help="Filter which tasks to display based on their return code", + ) + status_filter_group.add_argument( + "--steps", + nargs="+", + type=str, + dest="steps", + default=["all"], + help="Filter which tasks to display based on the steps they're associated with", + ) + status_filter_group.add_argument( + "--task-queues", + nargs="+", + type=str, + help="Filter which tasks to display based on the task queue they're in", + ) + status_filter_group.add_argument( + "--task-status", + action="store", + nargs="+", + type=str, + choices=VALID_STATUS_FILTERS, + help="Filter which tasks to display based on their status", + ) + status_filter_group.add_argument( + "--workers", + nargs="+", + type=str, + help="Filter which tasks to display based on which workers are processing them", + ) + status_display_group = detailed_status.add_argument_group("display options") + status_display_group.add_argument( + "--disable-pager", action="store_true", help="Turn off the pager functionality when viewing the status" + ) + status_display_group.add_argument( + "--disable-theme", + action="store_true", + help="Turn off styling for the status layout 
(If you want styling but it's not working, try modifying " + "the MANPAGER or PAGER environment variables to be 'less -r'; i.e. export MANPAGER='less -r')", + ) + status_display_group.add_argument( + "--layout", + type=str, + choices=status_renderer_factory.get_layouts(), + default="default", + help="Alternate status layouts [Default: %(default)s]", + ) + status_display_group.add_argument( + "--no-prompts", + action="store_true", + help="Ignore any prompts provided. This will default to the latest study \ + if you provide a spec file rather than a study workspace.", ) # merlin info diff --git a/merlin/merlin_templates.py b/merlin/merlin_templates.py index 7936db03b..a355c4f2f 100644 --- a/merlin/merlin_templates.py +++ b/merlin/merlin_templates.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/router.py b/merlin/router.py index 476ab1c0f..6c90c1d80 100644 --- a/merlin/router.py +++ b/merlin/router.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. 
# @@ -39,9 +39,13 @@ import os import time from datetime import datetime +from typing import Dict, List +from merlin.exceptions import NoWorkersException from merlin.study.celeryadapter import ( + check_celery_workers_processing, create_celery_config, + get_active_celery_queues, get_workers_from_app, purge_celery_tasks, query_celery_queues, @@ -151,12 +155,12 @@ def dump_status(query_return, csv_file): with open(csv_file, mode=fmode) as f: # pylint: disable=W1514,C0103 if f.mode == "w": # add the header f.write("# time") - for name, job, consumer in query_return: + for name in query_return: f.write(f",{name}:tasks,{name}:consumers") f.write("\n") f.write(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") - for name, job, consumer in query_return: - f.write(f",{job},{consumer}") + for queue_info in query_return.values(): + f.write(f",{queue_info['jobs']},{queue_info['consumers']}") f.write("\n") @@ -236,43 +240,130 @@ def create_config(task_server: str, config_dir: str, broker: str, test: str) -> LOG.error("Only celery can be configured currently.") -def check_merlin_status(args, spec): +def get_active_queues(task_server: str) -> Dict[str, List[str]]: """ - Function to check merlin workers and queues to keep - the allocation alive + Get a dictionary of active queues and the workers attached to these queues. + + :param `task_server`: The task server to query for active queues + :returns: A dict where keys are queue names and values are a list of workers watching them + """ + active_queues = {} + + if task_server == "celery": + from merlin.celery import app # pylint: disable=C0415 + + active_queues, _ = get_active_celery_queues(app) + else: + LOG.error("Only celery can be configured currently.") + + return active_queues + + +def wait_for_workers(sleep: int, task_server: str, spec: "MerlinSpec"): # noqa + """ + Wait on workers to start up. Check on worker start 10 times with `sleep` seconds between + each check. 
If no workers are started in time, raise an error to kill the monitor (there + was likely an issue with the task server that caused worker launch to fail). + + :param `sleep`: An integer representing the amount of seconds to sleep between each check + :param `task_server`: The task server from which to look for workers + :param `spec`: A MerlinSpec object representing the spec we're monitoring + """ + # Get the names of the workers that we're looking for + worker_names = spec.get_worker_names() + LOG.info(f"Checking for the following workers: {worker_names}") + + # Loop until workers are detected + count = 0 + max_count = 10 + while count < max_count: + # This list will include strings comprised of the worker name with the hostname e.g. worker_name@host. + worker_status = get_workers(task_server) + LOG.info(f"Monitor: checking for workers, running workers = {worker_status} ...") + + # Check to see if any of the workers we're looking for in 'worker_names' have started + check = any(any(iwn in iws for iws in worker_status) for iwn in worker_names) + if check: + break + + # Increment count and sleep until the next check + count += 1 + time.sleep(sleep) + + # If no workers were started in time, raise an exception to stop the monitor + if count == max_count: + raise NoWorkersException("Monitor: no workers available to process the non-empty queue") + + +def check_workers_processing(queues_in_spec: List[str], task_server: str) -> bool: + """ + Check if any workers are still processing tasks by querying the task server. 
+ + :param `queues_in_spec`: A list of queues to check if tasks are still active in + :param `task_server`: The task server from which to query + :returns: True if workers are still processing tasks, False otherwise + """ + result = False + + if task_server == "celery": + from merlin.celery import app + + result = check_celery_workers_processing(queues_in_spec, app) + else: + LOG.error("Celery is not specified as the task server!") + + return result + + +def check_merlin_status(args: "Namespace", spec: "MerlinSpec") -> bool: # noqa + """ + Function to check merlin workers and queues to keep the allocation alive :param `args`: parsed CLI arguments - :param `spec`: the parsed spec.yaml + :param `spec`: the parsed spec.yaml as a MerlinSpec object + :returns: True if there are still tasks being processed, False otherwise """ + # Initialize the variable to track if there are still active tasks + active_tasks = False + + # Get info about jobs and workers in our spec from celery queue_status = query_status(args.task_server, spec, args.steps, verbose=False) + LOG.debug(f"Monitor: queue_status: {queue_status}") + # Count the number of jobs that are active + # (Adding up the number of consumers in the same way is inaccurate so we won't do that) total_jobs = 0 - total_consumers = 0 - for _, jobs, consumers in queue_status: - total_jobs += jobs - total_consumers += consumers - - if total_jobs > 0 and total_consumers == 0: - # Determine if any of the workers are on this allocation - worker_names = spec.get_worker_names() - - # Loop until workers are detected. - count = 0 - max_count = 10 - while count < max_count: - # This list will include strings comprised of the worker name with the hostname e.g. worker_name@host. 
- worker_status = get_workers(args.task_server) - LOG.info(f"Monitor: checking for workers, running workers = {worker_status} ...") - - check = any(any(iwn in iws for iws in worker_status) for iwn in worker_names) - if check: - break - - count += 1 - time.sleep(args.sleep) - - if count == max_count: - LOG.error("Monitor: no workers available to process the non-empty queue") - total_jobs = 0 - - return total_jobs + for queue_info in queue_status.values(): + total_jobs += queue_info["jobs"] + + # Get the queues defined in the spec + queues_in_spec = spec.get_queue_list(["all"]) + LOG.debug(f"Monitor: queues_in_spec: {queues_in_spec}") + + # Get the active queues and the workers that are watching them + active_queues = get_active_queues(args.task_server) + LOG.debug(f"Monitor: active_queues: {active_queues}") + + # Count the number of workers that are active + consumers = set() + for active_queue, workers_on_queue in active_queues.items(): + if active_queue in queues_in_spec: + consumers |= set(workers_on_queue) + LOG.debug(f"Monitor: consumers found: {consumers}") + total_consumers = len(consumers) + + LOG.info(f"Monitor: found {total_jobs} jobs in queues and {total_consumers} workers alive") + + # If there are no workers, wait for the workers to start + if total_consumers == 0: + wait_for_workers(args.sleep, args.task_server, spec) + + # If we're here, workers have started and jobs should be queued + if total_jobs > 0: + active_tasks = True + # If there are no jobs left, see if any workers are still processing them + elif total_jobs == 0: + active_tasks = check_workers_processing(queues_in_spec, args.task_server) + + LOG.debug(f"Monitor: active_tasks: {active_tasks}") + return active_tasks diff --git a/merlin/server/__init__.py b/merlin/server/__init__.py index d04c75d72..7e2b6f1c1 100644 --- a/merlin/server/__init__.py +++ b/merlin/server/__init__.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. 
+# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. diff --git a/merlin/server/server_commands.py b/merlin/server/server_commands.py index 45411131b..c244c9eca 100644 --- a/merlin/server/server_commands.py +++ b/merlin/server/server_commands.py @@ -8,7 +8,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # @@ -92,6 +92,9 @@ def config_server(args: Namespace) -> None: # pylint: disable=R0912 redis_users = RedisUsers(server_config.container.get_user_file_path()) redis_users.set_password("default", args.password) redis_users.write() + pass_file = server_config.container.get_pass_file_path() + with open(pass_file, "w") as pfile: + pfile.write(args.password) redis_config.set_directory(args.directory) diff --git a/merlin/server/server_config.py b/merlin/server/server_config.py index 414f7a407..e4ec646fc 100644 --- a/merlin/server/server_config.py +++ b/merlin/server/server_config.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/server/server_util.py b/merlin/server/server_util.py index 2b8f1216d..bab641702 100644 --- a/merlin/server/server_util.py +++ b/merlin/server/server_util.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/spec/__init__.py b/merlin/spec/__init__.py index d6f53d03d..e6dccdf56 100644 --- a/merlin/spec/__init__.py +++ b/merlin/spec/__init__.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. 
+# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/spec/all_keys.py b/merlin/spec/all_keys.py index 556f5924e..dcc02b063 100644 --- a/merlin/spec/all_keys.py +++ b/merlin/spec/all_keys.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/spec/defaults.py b/merlin/spec/defaults.py index 8972d5cfe..32fd05aa5 100644 --- a/merlin/spec/defaults.py +++ b/merlin/spec/defaults.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/spec/expansion.py b/merlin/spec/expansion.py index 381bc72f4..a8a9d13e2 100644 --- a/merlin/spec/expansion.py +++ b/merlin/spec/expansion.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # @@ -37,7 +37,7 @@ from merlin.common.abstracts.enums import ReturnCode from merlin.spec.override import error_override_vars, replace_override_vars from merlin.spec.specification import MerlinSpec -from merlin.utils import contains_shell_ref, contains_token +from merlin.utils import contains_shell_ref, contains_token, verify_filepath MAESTRO_RESERVED = {"SPECROOT", "WORKSPACE", "LAUNCHER"} @@ -251,5 +251,6 @@ def get_spec_with_expansion(filepath, override_vars=None): Return a MerlinSpec with overrides and expansion, without creating a MerlinStudy. 
""" + filepath = verify_filepath(filepath) expanded_spec_text = expand_spec_no_study(filepath, override_vars) return MerlinSpec.load_spec_from_string(expanded_spec_text) diff --git a/merlin/spec/override.py b/merlin/spec/override.py index f3192a38e..abb0f13c9 100644 --- a/merlin/spec/override.py +++ b/merlin/spec/override.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/spec/specification.py b/merlin/spec/specification.py index 980edaf24..3e7e89464 100644 --- a/merlin/spec/specification.py +++ b/merlin/spec/specification.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # @@ -40,7 +40,7 @@ from copy import deepcopy from datetime import timedelta from io import StringIO -from typing import Dict +from typing import Dict, List import yaml from maestrowf.specification import YAMLSpecification @@ -568,10 +568,12 @@ def _process_dict(self, obj, string, key_stack, lvl, tab): # pylint: disable=R0 i += 1 return string - def get_step_worker_map(self): + def get_step_worker_map(self) -> Dict[str, List[str]]: """ Creates a dictionary with step names as keys and a list of workers associated with each step as values. The inverse of get_worker_step_map(). 
+ + :returns: A dict mapping step names to workers """ steps = self.get_study_step_names() step_worker_map = {step_name: [] for step_name in steps} @@ -586,26 +588,81 @@ def get_step_worker_map(self): step_worker_map[step].append(worker_name) return step_worker_map - def get_task_queues(self): - """Returns a dictionary of steps and their corresponding task queues.""" + def get_worker_step_map(self) -> Dict[str, List[str]]: + """ + Creates a dictionary with worker names as keys and a list of steps + associated with each worker as values. The inverse of get_step_worker_map(). + + :returns: A dict mapping workers to the steps they watch + """ + worker_step_map = {} + steps = self.get_study_step_names() + for worker_name, worker_val in self.merlin["resources"]["workers"].items(): + # Case 1: worker doesn't have specific steps + if "all" in worker_val["steps"]: + worker_step_map[worker_name] = steps + # Case 2: worker has specific steps + else: + worker_step_map[worker_name] = [] + for step in worker_val["steps"]: + worker_step_map[worker_name].append(step) + return worker_step_map + + def get_task_queues(self, omit_tag=False): + """ + Creates a dictionary of steps and their corresponding task queues. + This is the inverse of get_queue_step_relationship() + + :param `omit_tag`: If True, omit the celery queue tag. + :returns: A dict of steps and their corresponding task queues + """ from merlin.config.configfile import CONFIG # pylint: disable=C0415 steps = self.get_study_steps() queues = {} for step in steps: - if "task_queue" in step.run and CONFIG.celery.omit_queue_tag: + if "task_queue" in step.run and (omit_tag or CONFIG.celery.omit_queue_tag): queues[step.name] = step.run["task_queue"] elif "task_queue" in step.run: queues[step.name] = CONFIG.celery.queue_tag + step.run["task_queue"] return queues - def get_queue_list(self, steps): + def get_queue_step_relationship(self) -> Dict[str, List[str]]: + """ + Builds a dictionary of task queues and their associated steps. 
+ This returns the inverse of get_task_queues(). + + :returns: A dict of task queues and their associated steps + """ + from merlin.config.configfile import CONFIG # pylint: disable=C0415 + + steps = self.get_study_steps() + relationship_tracker = {} + + for step in steps: + if "task_queue" in step.run: + queue_name = ( + step.run["task_queue"] + if CONFIG.celery.omit_queue_tag + else f"{CONFIG.celery.queue_tag}{step.run['task_queue']}" + ) + + if queue_name in relationship_tracker: + relationship_tracker[queue_name].append(step.name) + else: + relationship_tracker[queue_name] = [step.name] + + return relationship_tracker + + def get_queue_list(self, steps, omit_tag=False) -> set: """ - Return a sorted list of queues corresponding to spec steps + Return a sorted set of queues corresponding to spec steps - param steps: a list of step names or 'all' + :param `steps`: a list of step names or ['all'] + :param `omit_tag`: If True, omit the celery queue tag. + :returns: A sorted set of queues corresponding to spec steps """ - queues = self.get_task_queues() + queues = self.get_task_queues(omit_tag=omit_tag) if steps[0] == "all": task_queues = queues.values() else: @@ -671,3 +728,82 @@ def get_tasks_per_step(self) -> Dict[str, int]: tasks_per_step[step.name] *= num_samples return tasks_per_step + + def _create_param_maps(self, param_gen: "ParameterGenerator", expanded_labels: Dict, label_param_map: Dict): # noqa: F821 + """ + Given a parameters block like so: + global.parameters: + TOKEN: + values: [param_val_1, param_val_2] + label: label.%% + Expanded labels will map tokens to their expanded labels (e.g. {'TOKEN': ['label.param_val_1', 'label.param_val_2']}) + Label param map will map labels to parameter values + (e.g. 
{'label.param_val_1': {'TOKEN': 'param_val_1'}, 'label.param_val_2': {'TOKEN': 'param_val_2'}}) + + :param `param_gen`: A ParameterGenerator object from Maestro + :param `expanded_labels`: A dict to store the map from tokens to expanded labels + :param `label_param_map`: A dict to store the map from labels to parameter values + """ + for token, orig_label in param_gen.labels.items(): + for param in param_gen.parameters[token]: + expanded_label = orig_label.replace(param_gen.label_token, str(param)) + if token in expanded_labels: + expanded_labels[token].append(expanded_label) + else: + expanded_labels[token] = [expanded_label] + label_param_map[expanded_label] = {token: param} + + def get_step_param_map(self) -> Dict: # pylint: disable=R0914 + """ + Create a mapping of parameters used for each step. Each step will have a cmd + to search for parameters in and could also have a restart cmd to check, too. + This creates a mapping of the form: + step_name_with_parameters: { + "cmd": { + TOKEN_1: param_1_value_1, + TOKEN_2: param_2_value_1, + }, + "restart_cmd": { + TOKEN_1: param_1_value_1, + TOKEN_3: param_3_value_1, + } + } + + :returns: A dict mapping between steps and params of the form shown above + """ + # Get the steps and the parameters in the study + study_steps = self.get_study_steps() + param_gen = self.get_parameters() + + # Create maps between tokens and expanded labels, and between labels and parameter values + expanded_labels = {} + label_param_map = {} + self._create_param_maps(param_gen, expanded_labels, label_param_map) + + step_param_map = {} + for step in study_steps: + # Get the cmd and restart cmd for the step + cmd = step.__dict__["run"]["cmd"] + restart_cmd = step.__dict__["run"]["restart"] + + # Get the parameters used in this step and the labels used with those parameters + all_params_in_step = param_gen.get_used_parameters(step) + labels_used = [expanded_labels[param] for param in sorted(all_params_in_step)] + + # Zip all labels used for the 
step together (since this is how steps are named in Maestro) + for labels in zip(*labels_used): + # Initialize the entry in the step param map + param_str = ".".join(labels) + step_name_with_params = f"{step.name}_{param_str}" + step_param_map[step_name_with_params] = {"cmd": {}, "restart_cmd": {}} + + # Populate the entry in the step param map based on which token is found in which command (cmd or restart) + for label in labels: + for token, param_value in label_param_map[label].items(): + full_token = f"{param_gen.token}({token})" + if full_token in cmd: + step_param_map[step_name_with_params]["cmd"][token] = param_value + if full_token in restart_cmd: + step_param_map[step_name_with_params]["restart_cmd"][token] = param_value + + return step_param_map diff --git a/merlin/study/__init__.py b/merlin/study/__init__.py index d6f53d03d..e6dccdf56 100644 --- a/merlin/study/__init__.py +++ b/merlin/study/__init__.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/study/batch.py b/merlin/study/batch.py index e02a65a32..01a2945e3 100644 --- a/merlin/study/batch.py +++ b/merlin/study/batch.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. 
# @@ -299,7 +299,7 @@ def construct_scheduler_legend(parsed_batch: Dict, nodes: int) -> Dict: "lsf": { "check cmd": ["jsrun", "--help"], "expected check output": b"jsrun", - "launch": f"jsrun -a 1 -c ALL_CPUS -g ALL_SGPUS --bind=none -n {nodes}", + "launch": f"jsrun -a 1 -c ALL_CPUS -g ALL_GPUS --bind=none -n {nodes}", }, # pbs is mainly a placeholder in case a user wants to try it (we don't have it at the lab so it's mostly untested) "pbs": { @@ -335,12 +335,16 @@ def construct_worker_launch_command(parsed_batch: Dict, nodes: int) -> str: scheduler_legend: Dict = construct_scheduler_legend(parsed_batch, nodes) workload_manager: str = get_batch_type(scheduler_legend) + LOG.debug(f"parsed_batch: {parsed_batch}") + if parsed_batch["btype"] == "pbs" and workload_manager == parsed_batch["btype"]: raise TypeError("The PBS scheduler is only enabled for 'batch: flux' type") if parsed_batch["btype"] == "slurm" and workload_manager not in ("lsf", "flux", "pbs"): workload_manager = "slurm" + LOG.debug(f"workload_manager: {workload_manager}") + try: launch_command = scheduler_legend[workload_manager]["launch"] except KeyError as e: # pylint: disable=C0103 diff --git a/merlin/study/celeryadapter.py b/merlin/study/celeryadapter.py index 440bdc366..4bf0ad710 100644 --- a/merlin/study/celeryadapter.py +++ b/merlin/study/celeryadapter.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. 
# @@ -37,15 +37,21 @@ import subprocess import time from contextlib import suppress +from typing import Dict, List, Optional +from amqp.exceptions import ChannelError +from celery import Celery from tabulate import tabulate +from merlin.config import Config from merlin.study.batch import batch_check_parallel, batch_worker_launch from merlin.utils import apply_list_of_regex, check_machines, get_procs, get_yaml_var, is_running LOG = logging.getLogger(__name__) +# TODO figure out a better way to handle the import of celery app and CONFIG + def run_celery(study, run_mode=None): """ @@ -71,23 +77,31 @@ def run_celery(study, run_mode=None): queue_merlin_study(study, adapter_config) -def get_running_queues(): +def get_running_queues(celery_app_name: str, test_mode: bool = False) -> List[str]: """ - Check for running celery workers with -Q queues - and return a unique list of the queues + Check for running celery workers by looking at the currently running processes. + If there are running celery workers, we'll pull the queues from the -Q tag in the + process command. The list returned here will contain only unique celery queue names. + This must be run on the allocation where the workers are running. 
- Must be run on the allocation where the workers are running + :param `celery_app_name`: The name of the celery app (typically merlin here unless testing) + :param `test_mode`: If True, run this function in test mode + :returns: A unique list of celery queues with workers attached to them """ running_queues = [] - if not is_running("celery worker"): + if not is_running(f"{celery_app_name} worker"): return running_queues - procs = get_procs("celery") + proc_name = "celery" if not test_mode else "sh" + procs = get_procs(proc_name) for _, lcmd in procs: lcmd = list(filter(None, lcmd)) cmdline = " ".join(lcmd) if "-Q" in cmdline: + if test_mode: + echo_cmd = lcmd.pop(2) + lcmd.extend(echo_cmd.split()) running_queues.extend(lcmd[lcmd.index("-Q") + 1].split(",")) running_queues = list(set(running_queues)) @@ -95,7 +109,7 @@ def get_running_queues(): return running_queues -def get_queues(app): +def get_active_celery_queues(app): """Get all active queues and workers for a celery application. Unlike get_running_queues, this goes through the application's server. @@ -110,7 +124,7 @@ def get_queues(app): :example: >>> from merlin.celery import app - >>> queues, workers = get_queues(app) + >>> queues, workers = get_active_celery_queues(app) >>> queue_names = [*queues] >>> workers_on_q0 = queues[queue_names[0]] >>> workers_not_on_q0 = [worker for worker in workers @@ -132,7 +146,7 @@ def get_queues(app): def get_active_workers(app): """ - This is the inverse of get_queues() defined above. This function + This is the inverse of get_active_celery_queues() defined above. This function builds a dict where the keys are worker names and the values are lists of queues attached to the worker. 
@@ -157,19 +171,20 @@ def get_active_workers(app): return worker_queue_map -def celerize_queues(queues): +def celerize_queues(queues: List[str], config: Optional[Dict] = None): """ Celery requires a queue tag to be prepended to their queues so this function will 'celerize' every queue in a list you provide it by prepending the queue tag. - :param `queues`: A list of queues that need the queue - tag prepended. + :param `queues`: A list of queues that need the queue tag prepended. + :param `config`: A dict of configuration settings """ - from merlin.config.configfile import CONFIG # pylint: disable=C0415 + if config is None: + from merlin.config.configfile import CONFIG as config # pylint: disable=C0415 for i, queue in enumerate(queues): - queues[i] = f"{CONFIG.celery.queue_tag}{queue}" + queues[i] = f"{config.celery.queue_tag}{queue}" def _build_output_table(worker_list, output_table): @@ -219,7 +234,7 @@ def query_celery_workers(spec_worker_names, queues, workers_regex): # --queues flag if queues: # Get a mapping between queues and the workers watching them - queue_worker_map, _ = get_queues(app) + queue_worker_map, _ = get_active_celery_queues(app) # Remove duplicates and prepend the celery queue tag to all queues queues = list(set(queues)) celerize_queues(queues) @@ -261,26 +276,54 @@ def query_celery_workers(spec_worker_names, queues, workers_regex): print() -def query_celery_queues(queues): - """Return stats for queues specified. - - Send results to the log. 
- """ - from merlin.celery import app # pylint: disable=C0415 - - connection = app.connection() - found_queues = [] - try: - channel = connection.channel() - for queue in queues: - try: - name, jobs, consumers = channel.queue_declare(queue=queue, passive=True) - found_queues.append((name, jobs, consumers)) - except Exception as e: # pylint: disable=C0103,W0718 - LOG.warning(f"Cannot find queue {queue} on server.{e}") - finally: - connection.close() - return found_queues +def query_celery_queues(queues: List[str], app: Celery = None, config: Config = None) -> Dict[str, List[str]]: + """ + Build a dict of information about the number of jobs and consumers attached + to specific queues that we want information on. + + :param queues: A list of the queues we want to know about + :param app: The celery application (this will be none unless testing) + :param config: The configuration object that has the broker name (this will be none unless testing) + :returns: A dict of info on the number of jobs and consumers for each queue in `queues` + """ + if app is None: + from merlin.celery import app # pylint: disable=C0415 + if config is None: + from merlin.config.configfile import CONFIG as config # pylint: disable=C0415 + + # Initialize the dictionary with the info we want about our queues + queue_info = {queue: {"consumers": 0, "jobs": 0} for queue in queues} + + # Open a connection via our Celery app + with app.connection() as conn: + # Open a channel inside our connection + with conn.channel() as channel: + # Loop through all the queues we're searching for + for queue in queues: + try: + # Count the number of jobs and consumers for each queue + _, queue_info[queue]["jobs"], queue_info[queue]["consumers"] = channel.queue_declare( + queue=queue, passive=True + ) + # Redis likes to throw this error when a queue we're looking for has no jobs + except ChannelError: + pass + + # Redis doesn't keep track of consumers attached to queues like rabbit does + # so we have to count this 
ourselves here + if config.broker.name in ("rediss", "redis"): + # Get a dict of active queues by querying the celery app + active_queues = app.control.inspect().active_queues() + if active_queues is not None: + # Loop through each active queue that was found + for active_queue_list in active_queues.values(): + # Loop through each queue that each worker is watching + for active_queue in active_queue_list: + # If this is a queue we're looking for, increment the consumer count + if active_queue["name"] in queues: + queue_info[active_queue["name"]]["consumers"] += 1 + + return queue_info def get_workers_from_app(): @@ -299,6 +342,27 @@ def get_workers_from_app(): return [*workers] +def check_celery_workers_processing(queues_in_spec: List[str], app: Celery) -> bool: + """ + Query celery to see if any workers are still processing tasks. + + :param queues_in_spec: A list of queues to check if tasks are still active in + :param app: The celery app that we're querying + :returns: True if workers are still processing tasks, False otherwise + """ + # Query celery for active tasks + active_tasks = app.control.inspect().active() + + # Search for the queues we provided if necessary + if active_tasks is not None: + for tasks in active_tasks.values(): + for task in tasks: + if task["delivery_info"]["routing_key"] in queues_in_spec: + return True + + return False + + def _get_workers_to_start(spec, steps): """ Helper function to return a set of workers to start based on @@ -463,7 +527,7 @@ def start_celery_workers(spec, steps, celery_args, disable_logs, just_return_com running_queues.extend(local_queues) queues = queues.split(",") if not overlap: - running_queues.extend(get_running_queues()) + running_queues.extend(get_running_queues("merlin")) # Cache the queues from this worker to use to test # for existing queues in any subsequent workers. # If overlap is True, then do not check the local queues. 
@@ -611,7 +675,7 @@ def stop_celery_workers(queues=None, spec_worker_names=None, worker_regex=None): from merlin.celery import app # pylint: disable=C0415 LOG.debug(f"Sending stop to queues: {queues}, worker_regex: {worker_regex}, spec_worker_names: {spec_worker_names}") - active_queues, _ = get_queues(app) + active_queues, _ = get_active_celery_queues(app) # If not specified, get all the queues if queues is None: diff --git a/merlin/study/dag.py b/merlin/study/dag.py index 6f487f4b4..8c758b33a 100644 --- a/merlin/study/dag.py +++ b/merlin/study/dag.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/merlin/study/script_adapter.py b/merlin/study/script_adapter.py index 6ecc79c5f..45d211742 100644 --- a/merlin/study/script_adapter.py +++ b/merlin/study/script_adapter.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. 
# diff --git a/merlin/study/status.py b/merlin/study/status.py index 7ad15a0ce..d2b07dcc1 100644 --- a/merlin/study/status.py +++ b/merlin/study/status.py @@ -33,35 +33,38 @@ import os import re from argparse import Namespace +from copy import deepcopy from datetime import datetime from glob import glob -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union import numpy as np from filelock import FileLock, Timeout from tabulate import tabulate from merlin.common.dumper import dump_handler -from merlin.display import ANSI_COLORS, display_status_summary +from merlin.display import ANSI_COLORS, display_status_summary, display_status_task_by_task from merlin.spec.expansion import get_spec_with_expansion +from merlin.study.status_constants import ( + ALL_VALID_FILTERS, + CELERY_KEYS, + NON_WORKSPACE_KEYS, + VALID_EXIT_FILTERS, + VALID_RETURN_CODES, + VALID_STATUS_FILTERS, +) +from merlin.study.status_renderers import status_renderer_factory from merlin.utils import ( convert_timestring, convert_to_timedelta, dict_deep_merge, - pretty_format_HMS, + pretty_format_hms, verify_dirpath, ws_time_to_dt, ) LOG = logging.getLogger(__name__) -VALID_STATUS_FILTERS = ("INITIALIZED", "RUNNING", "FINISHED", "FAILED", "CANCELLED", "DRY_RUN", "UNKNOWN") -VALID_RETURN_CODES = ("SUCCESS", "SOFT_FAIL", "HARD_FAIL", "STOP_WORKERS", "RESTART", "RETRY", "DRY_SUCCESS", "UNRECOGNIZED") -VALID_EXIT_FILTERS = ("E", "EXIT") -ALL_VALID_FILTERS = VALID_STATUS_FILTERS + VALID_RETURN_CODES + VALID_EXIT_FILTERS + ("MAX_TASKS",) -CELERY_KEYS = set(["task_queue", "worker_name"]) -RUN_TIME_STAT_KEYS = set(["avg_run_time", "run_time_std_dev"]) -NON_WORKSPACE_KEYS = CELERY_KEYS.union(RUN_TIME_STAT_KEYS) class Status: @@ -90,6 +93,14 @@ def __init__(self, args: Namespace, spec_display: bool, file_or_ws: str): # Create a tasks per step mapping in order to give accurate totals for each step self.tasks_per_step = self.spec.get_tasks_per_step() + # This attribute 
will store a map between the overall step name and the full step names + # that are created with parameters (e.g. step name is hello and uses a "GREET: hello" parameter + # so the real step name is hello_GREET.hello) + self.full_step_name_map = {} + + # Variable to store run time information for each step + self.run_time_info = {} + # Variable to store the statuses that the user wants self.requested_statuses = {} self.load_requested_statuses() @@ -177,16 +188,20 @@ def _load_from_spec(self, filepath: str) -> Tuple[str, "MerlinSpec"]: # noqa: F :returns: The workspace of the study we'll check the status for and a MerlinSpec object loaded in from the workspace's merlin_info subdirectory. """ - # Get the output path of the study that was given to us - # Case where the output path is left out of the spec file - if self.args.spec_provided.output_path == "": - output_path = os.path.dirname(filepath) - # Case where output path is absolute - elif self.args.spec_provided.output_path.startswith("/"): - output_path = self.args.spec_provided.output_path - # Case where output path is relative to the specroot + # If the user provided a new output path to look in, use that + if self.args.output_path is not None: + output_path = self.args.output_path + # Otherwise, use the output path of the study that was given to us else: - output_path = f"{os.path.dirname(filepath)}/{self.args.spec_provided.output_path}" + # Case where the output path is left out of the spec file + if self.args.spec_provided.output_path == "": + output_path = os.path.dirname(filepath) + # Case where output path is absolute + elif self.args.spec_provided.output_path.startswith("/"): + output_path = self.args.spec_provided.output_path + # Case where output path is relative to the specroot + else: + output_path = f"{os.path.dirname(filepath)}/{self.args.spec_provided.output_path}" LOG.debug(f"Verifying output path: {output_path}...") study_output_dir = verify_dirpath(output_path) @@ -301,11 +316,9 @@ def 
num_requested_statuses(self): We need to ignore non workspace keys when we count. """ num_statuses = 0 - for step_name in self.step_tracker["started_steps"]: - for status_key, status_info in self.requested_statuses[step_name].items(): - if status_key in RUN_TIME_STAT_KEYS: - continue - num_statuses += len(status_info.keys() - NON_WORKSPACE_KEYS) + for overall_step_info in self.requested_statuses.values(): + num_statuses += len(overall_step_info.keys() - NON_WORKSPACE_KEYS) + return num_statuses def get_step_statuses(self, step_workspace: str, started_step_name: str) -> Dict[str, List[str]]: @@ -314,37 +327,38 @@ def get_step_statuses(self, step_workspace: str, started_step_name: str) -> Dict for the step and return them in a dict. :param `step_workspace`: The path to the step we're going to read statuses from - :param `started_step_name`: The name of the started step that we're getting statuses from :returns: A dict of statuses for the given step """ - step_statuses = {started_step_name: {}} + step_statuses = {} num_statuses_read = 0 + self.full_step_name_map[started_step_name] = set() + # Traverse the step workspace and look for MERLIN_STATUS files - LOG.info(f"Traversing '{step_workspace}' to find MERLIN_STATUS.json files...") + LOG.debug(f"Traversing '{step_workspace}' to find MERLIN_STATUS.json files...") for root, _, _ in os.walk(step_workspace): # Search for a status file status_filepath = os.path.join(root, "MERLIN_STATUS.json") matching_files = glob(status_filepath) if matching_files: LOG.debug(f"Found status file at '{status_filepath}'") - # Read in the statuses and count how many statuses we read + # Read in the statuses lock = FileLock(f"{root}/status.lock") # pylint: disable=E0110 statuses_read = read_status(status_filepath, lock) - for status_info in statuses_read.values(): + + # Add full step name to the tracker and count number of statuses we just read in + for full_step_name, status_info in statuses_read.items(): + 
self.full_step_name_map[started_step_name].add(full_step_name) num_statuses_read += len(status_info.keys() - NON_WORKSPACE_KEYS) # Merge the statuses we read with the dict tracking all statuses for this step - dict_deep_merge(step_statuses[started_step_name], statuses_read) + dict_deep_merge(step_statuses, statuses_read) - LOG.info( + LOG.debug( f"Done traversing '{step_workspace}'. Read in {num_statuses_read} " f"{'statuses' if num_statuses_read != 1 else 'status'}." ) - # Calculate run time average and standard deviation for this step - step_statuses = self.get_runtime_avg_std_dev(step_statuses, started_step_name) - return step_statuses def load_requested_statuses(self): @@ -359,6 +373,9 @@ def load_requested_statuses(self): step_statuses = self.get_step_statuses(step_workspace, sstep) dict_deep_merge(self.requested_statuses, step_statuses) + # Calculate run time average and standard deviation for this step + self.get_runtime_avg_std_dev(step_statuses, sstep) + # Count how many statuses in total that we just read in LOG.info(f"Read in {self.num_requested_statuses} statuses total.") @@ -375,12 +392,16 @@ def get_runtime_avg_std_dev(self, step_statuses: Dict, step_name: str) -> Dict: run_times_in_seconds = [] # This outer loop will only loop once - LOG.info(f"Calculating run time avg and std dev for {step_name}...") - for _, overall_step_info in step_statuses[step_name].items(): + LOG.debug(f"Calculating run time avg and std dev for step '{step_name}'...") + for overall_step_info in step_statuses.values(): for step_info_key, step_status_info in overall_step_info.items(): - # Ignore non-workspace keys and any run times that have been yet to be calculated - if step_info_key in NON_WORKSPACE_KEYS or step_status_info["run_time"] == "--:--:--": - LOG.debug(f"Skipping {step_info_key}.") + # Ignore non-workspace keys + if step_info_key in NON_WORKSPACE_KEYS: + continue + + # Ignore any run times that have yet to be calculated + if step_status_info["run_time"] == 
"--:--:--": + LOG.debug(f"Skipping {step_info_key} since the run time is empty.") continue # Parse the runtime value, convert it to seconds, and add it to the lsit of existing run times @@ -390,20 +411,23 @@ def get_runtime_avg_std_dev(self, step_statuses: Dict, step_name: str) -> Dict: # Using the list of existing run times, calculate avg and std dev LOG.debug(f"Using the following run times for our calculations: {run_times_in_seconds}") - np_run_times_in_seconds = np.array(run_times_in_seconds) - run_time_mean = round(np.mean(np_run_times_in_seconds)) - run_time_std_dev = round(np.std(np_run_times_in_seconds)) - LOG.debug(f"Run time avg in seconds: {run_time_mean}") - LOG.debug(f"Run time std dev in seconds: {run_time_std_dev}") - - # Pretty format the avg and std dev and store them as new entries to the status information for the step - step_statuses[step_name]["avg_run_time"] = pretty_format_HMS(convert_timestring(run_time_mean)) - step_statuses[step_name]["run_time_std_dev"] = f"±{pretty_format_HMS(convert_timestring(run_time_std_dev))}" - LOG.info(f"Run time avg and std dev for {step_name} calculated.") - - return step_statuses - - def display(self, test_mode=False) -> Dict: + self.run_time_info[step_name] = {} + if len(run_times_in_seconds) == 0: + self.run_time_info[step_name]["avg_run_time"] = "--" + self.run_time_info[step_name]["run_time_std_dev"] = "±--" + else: + np_run_times_in_seconds = np.array(run_times_in_seconds) + run_time_mean = round(np.mean(np_run_times_in_seconds)) + run_time_std_dev = round(np.std(np_run_times_in_seconds)) + LOG.debug(f"Run time avg in seconds: {run_time_mean}") + LOG.debug(f"Run time std dev in seconds: {run_time_std_dev}") + + # Pretty format the avg and std dev and store them as new entries in the run time info + self.run_time_info[step_name]["avg_run_time"] = pretty_format_hms(convert_timestring(run_time_mean)) + self.run_time_info[step_name]["run_time_std_dev"] = 
f"±{pretty_format_hms(convert_timestring(run_time_std_dev))}" + LOG.debug(f"Run time avg and std dev for step '{step_name}' calculated.") + + def display(self, test_mode: Optional[bool] = False) -> Dict: """ Displays the high level summary of the status. @@ -432,7 +456,7 @@ def format_csv_dump(self, date: datetime) -> Dict: """ # Reformat the statuses to a new dict where the keys are the column labels and rows are the values LOG.debug("Formatting statuses for csv dump...") - statuses_to_write = self.format_status_for_display() + statuses_to_write = self.format_status_for_csv() LOG.debug("Statuses formatted.") # Add date entries as the first column then update this dict with the statuses we just reformatted @@ -459,9 +483,9 @@ def dump(self): # Dump the information dump_handler(self.args.dump, dump_info) - def format_status_for_display(self) -> Dict: + def format_status_for_csv(self) -> Dict: """ - Reformat our statuses to display so they can use Maestro's status renderer layouts. + Reformat our statuses to csv format so they can use Maestro's status renderer layouts. :returns: A formatted dictionary where each key is a column and the values are the rows of information to display for that column. 
@@ -474,43 +498,53 @@ def format_status_for_display(self) -> Dict: "elapsed_time": [], "run_time": [], "restarts": [], + "cmd_parameters": [], + "restart_parameters": [], "task_queue": [], "worker_name": [], } # We only care about started steps since unstarted steps won't have any status to report - for step_name in self.step_tracker["started_steps"]: - # Obtain and loop through all statuses - step_statuses = self.requested_statuses[step_name] - for full_step_name, overall_step_info in step_statuses.items(): - if full_step_name in RUN_TIME_STAT_KEYS: - continue - - # Get the number of statuses for this step so we know how many entries there should be - num_statuses = len(overall_step_info.keys() - NON_WORKSPACE_KEYS) - - # Loop through information for each step - for step_info_key, step_info_value in overall_step_info.items(): - # Format celery specific keys - if step_info_key in CELERY_KEYS: + for step_name, overall_step_info in self.requested_statuses.items(): + # Get the number of statuses for this step so we know how many entries there should be + num_statuses = len(overall_step_info.keys() - NON_WORKSPACE_KEYS) + + # Loop through information for each step + for step_info_key, step_info_value in overall_step_info.items(): + # Format celery specific keys + if step_info_key in CELERY_KEYS: + # Set the val_to_add value based on if a value exists for the key + val_to_add = step_info_value if step_info_value else "-------" + # Add the val_to_add entry for each row + key_entries = [val_to_add] * num_statuses + reformatted_statuses[step_info_key].extend(key_entries) + + # Format parameters + elif step_info_key == "parameters": + for cmd_type in ("cmd", "restart"): + reformatted_statuses_key = f"{cmd_type}_parameters" # Set the val_to_add value based on if a value exists for the key - val_to_add = step_info_value if step_info_value else "-------" - # Add the val_to_add entry for each row - key_entries = [val_to_add] * num_statuses - 
reformatted_statuses[step_info_key].extend(key_entries) - - # Format workspace keys - else: - # Put the step name and workspace in each entry - reformatted_statuses["step_name"].append(step_name) - reformatted_statuses["step_workspace"].append(step_info_key) - - # Add the rest of the information for each task (status, return code, elapsed & run time, num restarts) - for key, val in step_info_value.items(): - reformatted_statuses[key].append(val) + if step_info_value[cmd_type] is not None: + param_str = ";".join( + [f"{token}:{param_val}" for token, param_val in step_info_value[cmd_type].items()] + ) + else: + param_str = "-------" + # Add the parameter string for each row in this step + reformatted_statuses[reformatted_statuses_key].extend([param_str] * num_statuses) + + # Format workspace keys + else: + # Put the step name and workspace in each entry + reformatted_statuses["step_name"].append(step_name) + reformatted_statuses["step_workspace"].append(step_info_key) + + # Add the rest of the information for each task (status, return code, elapsed & run time, num restarts) + for key, val in step_info_value.items(): + reformatted_statuses[key].append(val) # For local runs, there will be no task queue or worker name so delete these entries - for celery_specific_key in ("task_queue", "worker_name"): + for celery_specific_key in CELERY_KEYS: if not reformatted_statuses[celery_specific_key]: del reformatted_statuses[celery_specific_key] @@ -518,7 +552,481 @@ def format_status_for_display(self) -> Dict: class DetailedStatus(Status): - pass + """ + This class handles obtaining and filtering requested statuses from the user. + This class shares similar methodology to the Status class it inherits from. 
+ """ + + def __init__(self, args: Namespace, spec_display: bool, file_or_ws: str): + args_copy = Namespace(**vars(args)) + super().__init__(args, spec_display, file_or_ws) + + # Check if the steps filter was given + self.steps_filter_provided = "all" not in args_copy.steps + + def _verify_filters( + self, + filters_to_check: List[str], + valid_options: Union[List, Tuple], + suppress_warnings: bool, + warning_msg: Optional[str] = "", + ): + """ + Check each filter in a list of filters provided by the user against a list of valid options. + If the filter is invalid, remove it from the list of filters. + + :param `filters_to_check`: A list of filters provided by the user + :param `valid_options`: A list of valid options for this particular filter + :param `suppress_warnings`: If True, don't log warnings. Otherwise, log them + :param `warning_msg`: An optional warning message to attach to output + """ + for filter_arg in filters_to_check[:]: + if filter_arg not in valid_options: + if not suppress_warnings: + LOG.warning(f"The filter '{filter_arg}' is invalid. {warning_msg}") + filters_to_check.remove(filter_arg) + + def _verify_filter_args(self, suppress_warnings: Optional[bool] = False): + """ + Verify that our filters are all valid and able to be used. + + :param `suppress_warnings`: If True, don't log warnings. Otherwise, log them. 
+ """ + # Ensure the steps are valid + if "all" not in self.args.steps: + LOG.debug(f"args.steps before verification: {self.args.steps}") + existing_steps = self.spec.get_study_step_names() + self._verify_filters( + self.args.steps, + existing_steps, + suppress_warnings, + warning_msg="Removing this step from the list of steps to filter by...", + ) + LOG.debug(f"args.steps after verification: {self.args.steps}") + + # Make sure max_tasks is a positive int + if self.args.max_tasks is not None: + LOG.debug(f"args.max_tasks before verification: {self.args.max_tasks}") + if self.args.max_tasks < 1 or not isinstance(self.args.max_tasks, int): + if not suppress_warnings: + LOG.warning("The value of --max-tasks must be an integer greater than 0. Ignoring --max-tasks...") + self.args.max_tasks = None + LOG.debug(f"args.max_tasks after verification: {self.args.max_tasks}") + + # Make sure task_status is valid + if self.args.task_status: + LOG.debug(f"args.task_status before verificaiton: {self.args.task_status}") + self.args.task_status = [x.upper() for x in self.args.task_status] + self._verify_filters( + self.args.task_status, + VALID_STATUS_FILTERS, + suppress_warnings, + warning_msg="Removing this status from the list of statuses to filter by...", + ) + LOG.debug(f"args.task_status after verification: {self.args.task_status}") + + # Ensure return_code is valid + if self.args.return_code: + LOG.debug(f"args.return_code before verification: {self.args.return_code}") + # TODO remove this code block and uncomment the line below once you've + # implemented entries for restarts/retries + idx = 0 + for ret_code_provided in self.args.return_code[:]: + ret_code_provided = ret_code_provided.upper() + if ret_code_provided in ("RETRY", "RESTART"): + if not suppress_warnings: + LOG.warning(f"The {ret_code_provided} filter is coming soon. 
Ignoring this filter for now...") + self.args.return_code.remove(ret_code_provided) + else: + self.args.return_code[idx] = ret_code_provided + idx += 1 + + # self.args.return_code = [ret_code.upper() for ret_code in self.args.return_code] + self._verify_filters( + self.args.return_code, + VALID_RETURN_CODES, + suppress_warnings, + warning_msg="Removing this code from the list of return codes to filter by...", + ) + LOG.debug(f"args.return_code after verification: {self.args.return_code}") + + # Ensure every task queue provided exists + if self.args.task_queues: + LOG.debug(f"args.task_queues before verification: {self.args.task_queues}") + existing_queues = self.spec.get_queue_list(["all"], omit_tag=True) + self._verify_filters( + self.args.task_queues, + existing_queues, + suppress_warnings, + warning_msg="Removing this queue from the list of queues to filter by...", + ) + LOG.debug(f"args.task_queues after verification: {self.args.task_queues}") + + # Ensure every worker provided exists + if self.args.workers: + LOG.debug(f"args.workers before verification: {self.args.workers}") + worker_names = self.spec.get_worker_names() + self._verify_filters( + self.args.workers, + worker_names, + suppress_warnings, + warning_msg="Removing this worker from the list of workers to filter by...", + ) + LOG.debug(f"args.workers after verification: {self.args.workers}") + + def _process_workers(self): + """ + Modifies the list of steps to display status for based on + the list of workers provided by the user. 
+ """ + LOG.debug("Processing workers filter...") + # Remove duplicates + workers_provided = list(set(self.args.workers)) + + # Get a map between workers and steps + worker_step_map = self.spec.get_worker_step_map() + + # Append steps associated with each worker provided + for worker_provided in workers_provided: + # Check for invalid workers + if worker_provided not in worker_step_map: + LOG.warning(f"Worker with name {worker_provided} does not exist for this study.") + else: + for step in worker_step_map[worker_provided]: + if step not in self.args.steps: + self.args.steps.append(step) + + LOG.debug(f"Steps after workers filter: {self.args.steps}") + + def _process_task_queue(self): + """ + Modifies the list of steps to display status for based on + the list of task queues provided by the user. + """ + from merlin.config.configfile import CONFIG # pylint: disable=C0415 + + LOG.debug("Processing task_queues filter...") + # Remove duplicate queues + queues_provided = list(set(self.args.task_queues)) + + # Get a map between queues and steps + queue_step_relationship = self.spec.get_queue_step_relationship() + + # Append steps associated with each task queue provided + for queue_provided in queues_provided: + # Check for invalid task queues + queue_with_celery_tag = f"{CONFIG.celery.queue_tag}{queue_provided}" + if queue_with_celery_tag not in queue_step_relationship: + LOG.warning(f"Task queue with name {queue_provided} does not exist for this study.") + else: + for step in queue_step_relationship[queue_with_celery_tag]: + if step not in self.args.steps: + self.args.steps.append(step) + + LOG.debug(f"Steps after task_queues filter: {self.args.steps}") + + def get_steps_to_display(self) -> Dict[str, List[str]]: + """ + Generates a list of steps to display the status for based on information + provided to the merlin detailed-status command by the user. This function + will handle the --steps, --task-queues, and --workers filter options. 
+ + :returns: A dictionary of started and unstarted steps for us to display the status of + """ + existing_steps = self.spec.get_study_step_names() + + LOG.debug(f"existing steps: {existing_steps}") + + if ("all" in self.args.steps) and (not self.args.task_queues) and (not self.args.workers): + LOG.debug("The steps, task_queues, and workers filters weren't provided. Setting steps to be all existing steps.") + self.args.steps = existing_steps + else: + # This won't matter anymore since task_queues or workers is not None here + if "all" in self.args.steps: + self.args.steps = [] + + # Add steps to start based on task queues and/or workers provided + if self.args.task_queues: + self._process_task_queue() + if self.args.workers: + self._process_workers() + + # Sort the steps to start by the order they show up in the study + for i, estep in enumerate(existing_steps): + if estep in self.args.steps: + self.args.steps.remove(estep) + self.args.steps.insert(i, estep) + + LOG.debug(f"Building detailed step tracker based on these steps: {self.args.steps}") + + # Filter the steps to display status for by started/unstarted + step_tracker = self._create_step_tracker(self.args.steps.copy()) + + return step_tracker + + def _remove_steps_without_statuses(self): + """ + After applying filters, there's a chance that certain steps will still exist + in self.requested_statuses but won't have any tasks to view the status of so + we'll remove those here. 
+ """ + result = deepcopy(self.requested_statuses) + + for step_name, overall_step_info in self.requested_statuses.items(): + sub_step_workspaces = sorted(list(overall_step_info.keys() - NON_WORKSPACE_KEYS)) + if len(sub_step_workspaces) == 0: + LOG.debug(f"Removing step '{step_name}' from the requested_statuses dict since it didn't match our filters.") + del result[step_name] + + self.requested_statuses = result + + def apply_filters(self, filter_types: List[str], filters: List[str]): + """ + Given a list of filters, filter the dict of requested statuses by them. + + :param `filter_types`: A list of str denoting the types of filters we're applying + :param `filters`: A list of filters to apply to the dict of statuses we read in + """ + LOG.info(f"Filtering tasks using these filters: {filters}") + + # Create a deep copy of the dict so we can make changes to it while we iterate + result = deepcopy(self.requested_statuses) + + for step_name, overall_step_info in self.requested_statuses.items(): + for sub_step_workspace, task_status_info in overall_step_info.items(): + # Ignore non workspace keys + if sub_step_workspace in NON_WORKSPACE_KEYS: + continue + + # Search for our filters + found_a_match = False + for filter_type in filter_types: + if task_status_info[filter_type] in filters: + found_a_match = True + break + + # If our filters aren't a match for this task then delete it + if not found_a_match: + LOG.debug(f"No matching filter for '{sub_step_workspace}'; removing it from requested_statuses.") + del result[step_name][sub_step_workspace] + + # Get the number of tasks found with our filters + self.requested_statuses = result + self._remove_steps_without_statuses() + LOG.info(f"Found {self.num_requested_statuses} tasks matching your filters.") + + # If no tasks were found set the status dict to empty + if self.num_requested_statuses == 0: + self.requested_statuses = {} + + def apply_max_tasks_limit(self): + """ + Given a number representing the maximum amount of 
tasks to display, filter the dict of statuses + so that there are at most a max_tasks amount of tasks. + """ + # Make sure the max_tasks variable is set to a reasonable number and store that value + if self.args.max_tasks > self.num_requested_statuses: + LOG.debug( + f"'max_tasks' was set to {self.args.max_tasks} but only {self.num_requested_statuses} statuses exist. " + f"Setting 'max_tasks' to {self.num_requested_statuses}." + ) + self.args.max_tasks = self.num_requested_statuses + max_tasks = self.args.max_tasks + + new_status_dict = {} + for step_name, overall_step_info in self.requested_statuses.items(): + new_status_dict[step_name] = {} + sub_step_workspaces = sorted(list(overall_step_info.keys() - NON_WORKSPACE_KEYS)) + + # If there are more status entries than max_tasks will allow then we need to remove some + if len(sub_step_workspaces) > self.args.max_tasks: + workspaces_to_delete = set(sub_step_workspaces) - set(sub_step_workspaces[: self.args.max_tasks]) + for ws_to_delete in workspaces_to_delete: + del overall_step_info[ws_to_delete] + self.args.max_tasks = 0 + # Otherwise, subtract how many tasks there are in this step from max_tasks + else: + self.args.max_tasks -= len(sub_step_workspaces) + + # Merge in the task statuses that we're allowing + dict_deep_merge(new_status_dict[step_name], overall_step_info) + + LOG.info(f"Limited the number of tasks to display to {max_tasks} tasks.") + + # Set the new requested statuses with the max_tasks limit and remove steps without statuses + self.requested_statuses = new_status_dict + self._remove_steps_without_statuses() + + # Reset max_tasks + self.args.max_tasks = max_tasks + + def load_requested_statuses(self): + """ + Populate the requested_statuses dict with statuses that the user is looking to find. + Filters for steps, task queues, workers will have already been applied + when creating the step_tracker attribute. Remaining filters will be applied here. 
+ """ + # Grab all the statuses based on our step tracker + super().load_requested_statuses() + + # Apply filters to the statuses + filter_types = set() + filters = [] + if self.args.task_status: + filter_types.add("status") + filters += self.args.task_status + if self.args.return_code: + filter_types.add("return_code") + filters += [f"MERLIN_{return_code}" for return_code in self.args.return_code] + + # Apply the filters if necessary + if filters: + self.apply_filters(list(filter_types), filters) + + # Limit the number of tasks to display if necessary + if self.args.max_tasks is not None and self.args.max_tasks > 0: + self.apply_max_tasks_limit() + + def get_user_filters(self) -> List[str]: + """ + Get a filter on the statuses to display from the user. Possible options + for filtering: + - A str MAX_TASKS -> will ask the user for another input that's equivalent to the --max-tasks flag + - A list of statuses -> equivalent to the --task-status flag + - A list of return codes -> equivalent to the --return-code flag + - An exit keyword to leave the filter prompt without filtering + + :returns: A list of strings to filter by + """ + # Build the filter options + filter_info = { + "Filter Type": [ + "Put a limit on the number of tasks to display", + "Filter by status", + "Filter by return code", + "Exit without filtering", + ], + "Description": [ + "Enter 'MAX_TASKS'", + f"Enter a comma separated list of the following statuses you'd like to see: {VALID_STATUS_FILTERS}", + f"Enter a comma separated list of the following return codes you'd like to see: {VALID_RETURN_CODES}", + f"Enter one of the following: {VALID_EXIT_FILTERS}", + ], + "Example": ["MAX_TASKS", "FAILED, CANCELLED", "SOFT_FAIL, RETRY", "EXIT"], + } + + # Display the filter options + filter_option_renderer = status_renderer_factory.get_renderer("table", disable_theme=True, disable_pager=True) + filter_option_renderer.layout(status_data=filter_info) + filter_option_renderer.render() + + # Obtain and validate 
the filter provided by the user + invalid_filter = True + while invalid_filter: + user_filters = input("How would you like to filter the tasks? ") + # Remove spaces and split user filters by commas + user_filters = user_filters.replace(" ", "") + user_filters = user_filters.split(",") + + # Ensure every filter is valid + for i, entry in enumerate(user_filters): + entry = entry.upper() + if entry not in ALL_VALID_FILTERS: + invalid_filter = True + print(f"Invalid input: {entry}. Input must be one of the following {ALL_VALID_FILTERS}") + break + invalid_filter = False + user_filters[i] = entry + + return user_filters + + def get_user_max_tasks(self) -> int: + """ + Get a limit for the amount of tasks to display from the user. + + :returns: An int representing the max amount of tasks to display + """ + invalid_input = True + + while invalid_input: + try: + user_max_tasks = int(input("What limit would you like to set? (must be an integer greater than 0) ")) + if user_max_tasks > 0: + invalid_input = False + else: + raise ValueError + except ValueError: + print("Invalid input. The limit must be an integer greater than 0.") + continue + + return user_max_tasks + + def filter_via_prompts(self): + """ + Interact with the user to manage how many/which tasks are displayed. This helps to + prevent us from overloading the terminal by displaying a bazillion tasks at once. + """ + # Get the filters from the user + user_filters = self.get_user_filters() + + # TODO remove this once restart/retry functionality is implemented + if "RESTART" in user_filters: + LOG.warning("The RESTART filter is coming soon. Ignoring this filter for now...") + user_filters.remove("RESTART") + if "RETRY" in user_filters: + LOG.warning("The RETRY filter is coming soon. 
Ignoring this filter for now...") + user_filters.remove("RETRY") + + # Variable to track whether the user wants to stop filtering + exit_without_filtering = False + + # Process the filters + max_tasks_found = False + filter_types = [] + for i, user_filter in enumerate(user_filters): + # Case 1: Exit command found, stop filtering + if user_filter in ("E", "EXIT"): + exit_without_filtering = True + break + # Case 2: MAX_TASKS command found, get the limit from the user + if user_filter == "MAX_TASKS": + max_tasks_found = True + # Case 3: Status filter provided, add it to the list of filter types + elif user_filter in VALID_STATUS_FILTERS and "status" not in filter_types: + filter_types.append("status") + # Case 4: Return Code filter provided, add it to the list of filter types and add the MERLIN prefix + elif user_filter in VALID_RETURN_CODES: + user_filters[i] = f"MERLIN_{user_filter}" + if "return_code" not in filter_types: + filter_types.append("return_code") + + # Remove the MAX_TASKS entry so we don't try to filter using it + try: + user_filters.remove("MAX_TASKS") + except ValueError: + pass + + # Apply the filters and tell the user how many tasks match the filters (if necessary) + if not exit_without_filtering and user_filters: + self.apply_filters(filter_types, user_filters) + + # Apply max tasks limit (if necessary) + if max_tasks_found: + user_max_tasks = self.get_user_max_tasks() + self.args.max_tasks = user_max_tasks + self.apply_max_tasks_limit() + + def display(self, test_mode: Optional[bool] = False): + """ + Displays a task-by-task view of the status based on user filter(s). 
+ + :param `test_mode`: If true, run this in testing mode and don't print any output + """ + # Check that there's statuses found and display them + if self.requested_statuses: + display_status_task_by_task(self, test_mode=test_mode) + else: + LOG.warning("No statuses to display.") def read_status(status_filepath: str, lock: FileLock, display_fnf_message: Optional[bool] = True) -> Dict: diff --git a/merlin/study/status_constants.py b/merlin/study/status_constants.py new file mode 100644 index 000000000..31aeebc22 --- /dev/null +++ b/merlin/study/status_constants.py @@ -0,0 +1,44 @@ +############################################################################### +# Copyright (c) 2023, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory +# Written by the Merlin dev team, listed in the CONTRIBUTORS file. +# +# +# LLNL-CODE-797170 +# All rights reserved. +# This file is part of Merlin, Version: 1.11.0 +# +# For details, see https://github.com/LLNL/merlin. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +############################################################################### +""" +This file contains all of the constants used for the status command. +Separating this from status.py and status_renderers.py helps with circular +import issues. +""" + +VALID_STATUS_FILTERS = ("INITIALIZED", "RUNNING", "FINISHED", "FAILED", "CANCELLED", "DRY_RUN", "UNKNOWN") +VALID_RETURN_CODES = ("SUCCESS", "SOFT_FAIL", "HARD_FAIL", "STOP_WORKERS", "RESTART", "RETRY", "DRY_SUCCESS", "UNRECOGNIZED") +VALID_EXIT_FILTERS = ("E", "EXIT") +ALL_VALID_FILTERS = VALID_STATUS_FILTERS + VALID_RETURN_CODES + VALID_EXIT_FILTERS + ("MAX_TASKS",) + +CELERY_KEYS = set(["task_queue", "worker_name"]) +RUN_TIME_STAT_KEYS = set(["avg_run_time", "run_time_std_dev"]) +NON_WORKSPACE_KEYS = CELERY_KEYS.union(RUN_TIME_STAT_KEYS) +NON_WORKSPACE_KEYS.add("parameters") diff --git a/merlin/study/status_renderers.py b/merlin/study/status_renderers.py new file mode 100644 index 000000000..52ecf2957 --- /dev/null +++ b/merlin/study/status_renderers.py @@ -0,0 +1,412 @@ +############################################################################### +# Copyright (c) 2023, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory +# Written by the Merlin dev team, listed in the CONTRIBUTORS file. +# +# +# LLNL-CODE-797170 +# All rights reserved. +# This file is part of Merlin, Version: 1.10.0 +# +# For details, see https://github.com/LLNL/merlin. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +############################################################################### +"""This module handles creating a formatted task-by-task status display""" +import logging +from typing import Dict, List, Optional, Union + +from maestrowf import BaseStatusRenderer, FlatStatusRenderer, StatusRendererFactory +from rich import box +from rich.columns import Columns +from rich.console import Console +from rich.table import Table +from rich.text import Text +from rich.theme import Theme + +from merlin.study.status_constants import NON_WORKSPACE_KEYS + + +LOG = logging.getLogger(__name__) + + +def format_label(label_to_format: str, delimiter: Optional[str] = "_") -> str: + """ + Take a string of the format 'word1_word2_...' and format it so it's prettier. + This would turn the string above to 'Word1 Word2 ...'. 
+ + :param `label_to_format`: The string we want to format + :param `delimiter`: The character separating words in `label_to_format` + :returns: A formatted string based on `label_to_format` + """ + return label_to_format.replace(delimiter, " ").title() + + +class MerlinDefaultRenderer(BaseStatusRenderer): + """ + This class handles the default status formatting for task-by-task display. + It will separate the display on a step-by-step basis. + + Similar to Maestro's 'narrow' status display. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.disable_theme = kwargs.pop("disable_theme", False) + self.disable_pager = kwargs.pop("disable_pager", False) + + # Setup default theme + # TODO modify this theme to add more colors + self._theme_dict = { + "INITIALIZED": "blue", + "RUNNING": "blue", + "DRY_RUN": "green", + "FINISHED": "green", + "CANCELLED": "yellow", + "FAILED": "bold red", + "UNKNOWN": "bold red", + "Step Name": "bold", + "Workspace": "blue", + "row_style": "", + "row_style_dim": "dim", + "row_style_failed": "bold red", + "col_style_1": "", + "col_style_2": "blue", + "background": "grey7", + } + + # Setup the status table that will contain our formatted status + self._status_table = Table.grid(padding=0) + + def create_param_table(self, parameters: Dict[str, Dict[str, str]]) -> Columns: + """ + Create the parameter section of the display + + :param `parameters`: A dict of the form {"cmd": {"TOKEN1": "value1"}, "restart": {"TOKEN2": "value1"}} + :returns: A rich Columns object with the parameter info formatted appropriately + """ + param_table = [] + # Loop through cmd and restart entries + for param_type, param_set in parameters.items(): + # If there are no parameters, don't create a table + if param_set is None: + continue + + # Set up the table for this parameter type + param_subtable = Table( + title=format_label(f"{param_type} Parameters"), show_header=False, show_lines=True, box=box.HORIZONTALS + ) + + # Col names 
don't actually matter, we're just creating the style here + style = "blue" if not self.disable_theme else "" + param_subtable.add_column("token", style="") # This col will have all the token values + param_subtable.add_column("val", style=style, justify="right") # This col will have all the parameter values + param_subtable.add_column("padding1", style="") # This col is just for padding in the display + param_subtable.add_column("padding2", style=style, justify="right") # This col is just for padding in the display + + # Loop through each parameter token/val for this param type and create a row entry for each token/val + for token, param_val in param_set.items(): + param_subtable.add_row(token, param_val, style="row_style") + + # Add the sub table for this parameter type to the list that will store both sub tables + param_table.append(param_subtable) + + # Put the tables side-by-side in columns and return it + return Columns(param_table) + + def create_step_table( + self, + step_name: str, + parameters: Dict[str, Dict[str, str]], + task_queue: Optional[str] = None, + worker_name: Optional[str] = None, + ) -> Table: + """ + Create each step entry in the display + + :param `step_name`: The name of the step that we're setting the layout for + :param `parameters`: The parameters dict for this step + :param `task_queue`: The name of the task queue associated with this step if one was provided + :param `worker_name`: The name of the worker that ran this step if one was provided + :returns: A rich Table object with info for one sub step (here a 'sub step' is referencing a step + with multiple parameters; each parameter set will have it's own entry in the output) + """ + # Initialize the table that will have our step entry information + step_table = Table(box=box.SIMPLE_HEAVY, show_header=False) + + # Dummy columns used just for aligning our content properly + step_table.add_column("key") + step_table.add_column("val", overflow="fold") + + # Top level contains step name 
and may contain task queue and worker name + step_table.add_row("STEP:", step_name, style="Step Name") + if worker_name is not None: + step_table.add_row("WORKER NAME:", worker_name, style="Workspace") + if task_queue is not None: + step_table.add_row("TASK QUEUE:", task_queue, style="Workspace") + + step_table.add_row("", "") # just a little whitespace + + # Add optional parameter tables, if step has parameters + param_table = self.create_param_table(parameters) + step_table.add_row("", param_table) + + return step_table + + def create_task_details_table(self, task_statuses: Dict) -> Table: + """ + Create the task details section of the display + + :param `task_statuses`: A dict of task statuses to format into our layout + :returns: A rich Table with the formatted task info for a sub step + """ + # Initialize the task details table + task_details = Table(title="Task Details") + + # Setup the columns + cols = ["Step Workspace", "Status", "Return Code", "Elapsed Time", "Run Time", "Restarts"] + for nominal_col_num, col in enumerate(cols): + if col in list(self._theme_dict): + col_style = col + else: + if nominal_col_num % 2 == 0: + col_style = "col_style_1" + else: + col_style = "col_style_2" + + task_details.add_column(format_label(col), style=col_style, overflow="fold") + + # Set up the rows + row_style = "row_style" + for step_workspace, status_info in task_statuses.items(): + # Ignore the non-workspace keys + if step_workspace in NON_WORKSPACE_KEYS: + continue + + # Create each row entry + status_entry = [step_workspace] + for status_info_key, status_info_val in status_info.items(): + # For status entries we'll color the column differently + if status_info_key == "status": + status_entry.append(Text(status_info_val, style=self._theme_dict[status_info_val])) + # If we have a failed task then let's make that stand out by bolding and styling the whole row red + if status_info_val in ("FAILED", "UNKNOWN"): + row_style = "row_style_failed" + else: + 
status_entry.append(str(status_info_val)) + + # Add the row entry to the task details table + task_details.add_row(*status_entry, style=row_style) + + # Change styling for each row so statuses stand out more + row_style = "row_style" if row_style == "row_style_dim" else "row_style_dim" + + return task_details + + def layout( + self, status_data, study_title: Optional[str] = None, status_time: Optional[str] = None + ): # pylint: disable=W0237 + """ + Setup the overall layout of the display + + :param `status_data`: A dict of status data to display + :param `study_title`: A title for the study to display at the top of the output + :param `status_time`: A timestamp to add to the title + """ + if isinstance(status_data, dict) and status_data: + self._status_data = status_data + else: + raise ValueError("Status data must be a dict") + + # Create the table title + table_title = "" + if status_time: + table_title += f"Status as of {status_time}" + if study_title: + if status_time: + table_title += "\n" + table_title += f"Study: {study_title}" + if table_title: + LOG.debug(f"Table title: {table_title}") + self._status_table.title = table_title + + # Create settings for the entire display + self._status_table.box = box.HEAVY + self._status_table.show_lines = True + self._status_table.show_edge = False + self._status_table.show_footer = True + self._status_table.collapse_padding = True + + # Uses folding overflow for very long step/workspace names + self._status_table.add_column("Step", overflow="fold") + + # Build out the status table by sectioning it off at each step + for step_name, overall_step_info in self._status_data.items(): + task_queue = overall_step_info["task_queue"] if "task_queue" in overall_step_info else None + worker_name = overall_step_info["worker_name"] if "worker_name" in overall_step_info else None + + # Set up the top section of each step entry + # (this section will have step name, task queue, worker name, and parameters) + step_table = 
self.create_step_table( + step_name, overall_step_info["parameters"], task_queue=task_queue, worker_name=worker_name + ) + + # Set up the bottom section of each step entry + # (this section will have task-by-task info; status, return code, run time, etc.) + sample_details_table = self.create_task_details_table(overall_step_info) + + # Add the bottom section to the top section + step_table.add_row("", sample_details_table) + + # Add this step to the full status table + self._status_table.add_row(step_table, end_section=True) + + def render(self, theme: Optional[Dict[str, str]] = None): + """ + Do the actual printing + + :param `theme`: A dict of theme settings (see self._theme_dict for the appropriate layout) + """ + # Apply any theme customization + if theme: + LOG.debug(f"Applying theme: {theme}") + for key, value in theme.items(): + self._theme_dict[key] = value + + # If we're disabling the theme, we need to set all themes in the theme dict to none + if self.disable_theme: + LOG.debug("Disabling theme.") + for key in self._theme_dict: + self._theme_dict[key] = "none" + + # Get the rich Console + status_theme = Theme(self._theme_dict) + _printer = Console(theme=status_theme) + + # Display the status table + if self.disable_pager: + _printer.print(self._status_table) + else: + with _printer.pager(styles=(not self.disable_theme)): + _printer.print(self._status_table) + + +class MerlinFlatRenderer(FlatStatusRenderer): + """ + This class handles the flat status formatting for task-by-task display. + It will not separate the display on a step-by-step basis and instead group + all statuses together in a single table. + + Similar to Maestro's 'flat' status display. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(args, kwargs) + self.disable_theme = kwargs.pop("disable_theme", False) + self.disable_pager = kwargs.pop("disable_pager", False) + + def layout( + self, status_data: Dict[str, List[Union[str, int]]], study_title: Optional[str] = None + ): # pylint: disable=W0221 + """ + Setup the layout of the display + + :param `status_data`: A dict of status information that we'll display + :param `study_title`: The title of the study to display at the top of the output + """ + if "cmd_parameters" in status_data: + del status_data["cmd_parameters"] + if "restart_parameters" in status_data: + del status_data["restart_parameters"] + + # Capitalize column labels + capitalized_keys = [format_label(key) for key in status_data] + status_data = dict(zip(capitalized_keys, list(status_data.values()))) + + super().layout(status_data, study_title=study_title) + + def render(self, theme: Optional[Dict[str, str]] = None): + """ + Do the actual printing + + :param `theme`: A dict of theme settings (see self._theme_dict for the appropriate layout) + """ + # Apply any theme customization + if theme: + LOG.debug(f"Applying theme: {theme}") + for key, value in theme.items(): + self._theme_dict[key] = value + + # If we're disabling the theme, we need to set all themes in the theme dict to none + if self.disable_theme: + LOG.debug("Disabling theme.") + for key in self._theme_dict: + self._theme_dict[key] = "none" + + # Get the rich Console + status_theme = Theme(self._theme_dict) + _printer = Console(theme=status_theme) + + # Display the status table + if self.disable_pager: + _printer.print(self._status_table) + else: + with _printer.pager(styles=(not self.disable_theme)): + _printer.print(self._status_table) + + +class MerlinStatusRendererFactory(StatusRendererFactory): + """ + This class keeps track of all available status layouts for Merlin. 
+ """ + + # TODO: when maestro releases the pager changes: + # - remove init and render in MerlinFlatRenderer + # - remove the get_renderer method below + # - remove self.disable_theme and self.disable_pager from MerlinFlatRenderer and MerlinDefaultRenderer + # - these variables will be in BaseStatusRenderer in Maestro + # - remove render method in MerlinDefaultRenderer + # - this will also be in BaseStatusRenderer in Maestro + def __init__(self): # pylint: disable=W0231 + self._layouts = { + "table": MerlinFlatRenderer, + "default": MerlinDefaultRenderer, + } + + def get_renderer(self, layout: str, disable_theme: bool, disable_pager: bool): # pylint: disable=W0221 + """Get handle for specific layout renderer to instantiate + + :param `layout`: A string denoting the name of the layout renderer to use + :param `disable_theme`: True if the user wants to disable themes when displaying status. + False otherwise. + :param `disable_pager`: True if the user wants to disable the pager when displaying status. + False otherwise. + + :returns: The status renderer class to use for displaying the output + """ + renderer = self._layouts.get(layout) + + # Note, need to wrap renderer in try/catch too, or return default val? + if not renderer: + raise ValueError(layout) + + return renderer(disable_theme=disable_theme, disable_pager=disable_pager) + + +status_renderer_factory = MerlinStatusRendererFactory() diff --git a/merlin/study/step.py b/merlin/study/step.py index efa062ab9..e6366fbf2 100644 --- a/merlin/study/step.py +++ b/merlin/study/step.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. 
# @@ -241,8 +241,23 @@ def _update_status_file( lock = FileLock(f"{self.workspace.value}/status.lock") # pylint: disable=E0110 status_info = read_status(status_filepath, lock) else: + # Create the parameter entries + cmd_params = restart_params = None + if self.merlin_step.params["cmd"]: + cmd_params = dict(self.merlin_step.params["cmd"].items()) + if self.merlin_step.params["restart_cmd"]: + restart_params = dict(self.merlin_step.params["restart_cmd"].items()) + LOG.debug(f"Cmd parameters for {self.name}: {cmd_params}; Restart params: {restart_params}") + # Inititalize the status_info dict we'll be dumping to the status file - status_info = {self.name: {}} + status_info = { + self.name: { + "parameters": { + "cmd": cmd_params, + "restart": restart_params, + } + } + } # Add celery specific info if task_server == "celery": @@ -284,6 +299,8 @@ def __init__(self, maestro_step_record, study_name, parameter_info): self.study_name = study_name self.parameter_info = parameter_info self.__restart = False + self.params = {"cmd": {}, "restart_cmd": {}} + self.establish_params() def get_cmd(self): """ @@ -383,6 +400,15 @@ def restart(self, val): """ self.__restart = val + def establish_params(self): + """If this step uses parameters, pull them from the step param map.""" + try: + step_params = self.parameter_info["step_param_map"][self.name()] + for cmd_type in step_params: + self.params[cmd_type].update(step_params[cmd_type]) + except KeyError: + pass + def check_if_expansion_needed(self, labels): """ :return : True if the cmd has any of the default keywords or spec diff --git a/merlin/study/study.py b/merlin/study/study.py index 453c26af2..8502ebebb 100644 --- a/merlin/study/study.py +++ b/merlin/study/study.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. 
# @@ -550,6 +550,7 @@ def load_dag(self): column_labels = self.expanded_spec.merlin["samples"]["column_labels"] parameter_info = { "labels": self.parameter_labels, + "step_param_map": self.expanded_spec.get_step_param_map(), } # To avoid pickling issues with _pass_detect_cycle from maestro, we unpack the dag here self.dag = DAG(maestro_dag.adjacency_table, maestro_dag.values, column_labels, study.name, parameter_info) diff --git a/merlin/utils.py b/merlin/utils.py index 838c6ea89..78ddd6eae 100644 --- a/merlin/utils.py +++ b/merlin/utils.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # @@ -669,7 +669,7 @@ def convert_timestring(timestring: Union[str, int], format_method: str = "HMS") return repr_timedelta(tdelta, method=format_method) -def pretty_format_HMS(timestring: str) -> str: +def pretty_format_hms(timestring: str) -> str: """ Given an HMS timestring, format it so it removes blank entries and adds labels. diff --git a/requirements/dev.txt b/requirements/dev.txt index 895a89249..6e8722b4b 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -11,3 +11,4 @@ sphinx>=2.0.0 alabaster johnnydep deepdiff +pytest-order diff --git a/setup.py b/setup.py index 7c91d26c7..7303a1ddf 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 000000000..a6bf7005a --- /dev/null +++ b/tests/README.md @@ -0,0 +1,152 @@ +# Tests + +This directory utilizes pytest to create and run our test suite. +Here we use pytest fixtures to create a local redis server and a celery app for testing. 
+ +This directory is organized like so: +- `conftest.py` - The script containing all fixtures for our tests +- `unit/` - The directory containing unit tests + - `test_*.py` - The actual test scripts to run +- `integration/` - The directory containing integration tests + + - `definitions.py` - The test definitions + - `run_tests.py` - The script to run the tests defined in `definitions.py` + - `conditions.py` - The conditions to test against + +## How to Run + +Before running any tests: + +1. Activate your virtual environment with Merlin's dev requirements installed +2. Navigate to the tests folder where this README is located + +To run the entire test suite: + +``` +python -m pytest +``` + +To run a specific test file: + +``` +python -m pytest /path/to/test_specific_file.py +``` + +To run a certain test class within a specific test file: + +``` +python -m pytest /path/to/test_specific_file.py::TestCertainClass +``` + +To run one unique test: + +``` +python -m pytest /path/to/test_specific_file.py::TestCertainClass::test_unique_test +``` + +## Killing the Test Server + +In case of an issue with the test suite, or if you stop the tests with `ctrl+C`, you may need to stop +the server manually. This can be done with: + +``` +redis-cli +127.0.0.1:6379> AUTH merlin-test-server +127.0.0.1:6379> shutdown +not connected> quit +``` + +## The Fixture Process Explained + +Pytest fixtures play a fundamental role in establishing a consistent foundation for test execution, +thus ensuring reliable and predictable test outcomes. This section will delve into essential aspects +of these fixtures, including how to integrate fixtures into tests, the utilization of fixtures within other fixtures, +their scope, and the yielding of fixture results. + +### How to Integrate Fixtures Into Tests + +Probably the most important part of fixtures is understanding how to use them. Luckily, this process is very +simple and can be dumbed down to 2 steps: + +1. 
Create a fixture in the `conftest.py` file by using the `@pytest.fixture` decorator. For example: + +``` +@pytest.fixture +def dummy_fixture(): + return "hello world" +``` + +2. Use it as an argument in a test function (you don't even need to import it!): + +``` +def test_dummy(dummy_fixture): + assert dummy_fixture == "hello world" +``` + +For more information, see [Pytest's documentation](https://docs.pytest.org/en/7.1.x/how-to/fixtures.html#how-to-use-fixtures). + +### Fixtureception + +One of the coolest and most useful aspects of fixtures that we utilize in this test suite is the ability for +fixtures to be used within other fixtures. For more info on this from pytest, see +[here](https://docs.pytest.org/en/7.1.x/how-to/fixtures.html#fixtures-can-request-other-fixtures). + +Pytest will handle fixtures within fixtures in a stack-based way. Let's look at how creating the `redis_pass` +fixture from our `conftest.py` file works in order to illustrate the process. +1. First, we start by telling pytest that we want to use the `redis_pass` fixture by providing it as an argument +to a test/fixture: + +``` +def test_example(redis_pass): + ... +``` + +2. Now pytest will find the `redis_pass` fixture and put it at the top of the stack to be created. However, +it'll see that this fixture requires another fixture `merlin_server_dir` as an argument: + +``` +@pytest.fixture(scope="session") +def redis_pass(merlin_server_dir): + ... +``` + +3. Pytest then puts the `merlin_server_dir` fixture at the top of the stack, but similarly it sees that this fixture +requires yet another fixture `temp_output_dir`: + +``` +@pytest.fixture(scope="session") +def merlin_server_dir(temp_output_dir: str) -> str: + ... +``` + +4. This process continues until it reaches a fixture that doesn't require any more fixtures. At this point the base +fixture is created and pytest will start working its way back up the stack to the first fixture it looked at (in this +case `redis_pass`). + +5. 
Once all required fixtures are created, execution will be returned to the test which can now access the fixture +that was requested (`redis_pass`). + +As you can see, if we have to re-do this process for every test it could get pretty time intensive. This is where fixture +scopes come to save the day. + +### Fixture Scopes + +There are several different scopes that you can set for fixtures. The majority of our fixtures use a `session` +scope so that we only have to create the fixtures one time (as some of them can take a few seconds to set up). +The goal is to create fixtures with the most general use-case in mind so that we can re-use them for larger +scopes, which helps with efficiency. + +For more info on scopes, see +[Pytest's Fixture Scope documentation](https://docs.pytest.org/en/6.2.x/fixture.html#scope-sharing-fixtures-across-classes-modules-packages-or-session). + +### Yielding Fixtures + +In several fixtures throughout our test suite, we need to run some sort of teardown for the fixture. For example, +once we no longer need the `redis_server` fixture, we need to shut the server down so it stops using resources. +This is where yielding fixtures becomes extremely useful. + +Using the `yield` keyword allows execution to be returned to a test that needs the fixture once the feature has +been set up. After all tests using the fixture have been ran, execution will return to the fixture for us to run +our teardown code. + +For more information on yielding fixtures, see [Pytest's documentation](https://docs.pytest.org/en/7.1.x/how-to/fixtures.html#teardown-cleanup-aka-fixture-finalization). \ No newline at end of file diff --git a/tests/celery_test_workers.py b/tests/celery_test_workers.py new file mode 100644 index 000000000..39eb2a39b --- /dev/null +++ b/tests/celery_test_workers.py @@ -0,0 +1,231 @@ +############################################################################### +# Copyright (c) 2023, Lawrence Livermore National Security, LLC. 
+# Produced at the Lawrence Livermore National Laboratory +# Written by the Merlin dev team, listed in the CONTRIBUTORS file. +# +# +# LLNL-CODE-797170 +# All rights reserved. +# This file is part of Merlin, Version: 1.11.1. +# +# For details, see https://github.com/LLNL/merlin. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +############################################################################### +""" +Module to define functionality for test workers and how to start/stop +them in their own processes. +""" +import multiprocessing +import os +import signal +import subprocess +from time import sleep +from types import TracebackType +from typing import Dict, List, Type + +from celery import Celery + + +class CeleryTestWorkersManager: + """ + A class to handle the setup and teardown of celery workers. + This should be treated as a context and used with python's + built-in 'with' statement. 
If you use it without this statement, + beware that the processes spun up here may never be stopped. + """ + + def __init__(self, app: Celery): + self.app = app + self.running_workers = [] + self.worker_processes = {} + self.echo_processes = {} + + def __enter__(self): + """This magic method is necessary for allowing this class to be used as a context manager.""" + return self + + def __exit__(self, exc_type: Type[Exception], exc_value: Exception, traceback: TracebackType): + """ + This will always run at the end of a context with statement, even if an error is raised. + It's a safe way to ensure all of our subprocesses are stopped no matter what. + """ + + # Try to stop everything gracefully first + self.stop_all_workers() + + # Check that all the worker processes were stopped, otherwise forcefully terminate them + for worker_process in self.worker_processes.values(): + if worker_process.is_alive(): + worker_process.kill() + + # Check that all the echo processes were stopped, otherwise forcefully terminate them + ps_proc = subprocess.run("ps ux", shell=True, capture_output=True, text=True) + for pid in self.echo_processes.values(): + if str(pid) in ps_proc.stdout: + os.kill(pid, signal.SIGKILL) + + def _is_worker_ready(self, worker_name: str, verbose: bool = False) -> bool: + """ + Check to see if the worker is up and running yet. + + :param worker_name: The name of the worker we're checking on + :param verbose: If true, enable print statements to show where we're at in execution + :returns: True if the worker is running. False otherwise. + """ + ping = self.app.control.inspect().ping(destination=[f"celery@{worker_name}"]) + if verbose: + print(f"ping: {ping}") + return ping is not None and f"celery@{worker_name}" in ping + + def _wait_for_worker_launch(self, worker_name: str, verbose: bool = False): + """ + Poll the worker over a fixed interval of time. If the worker doesn't show up + within the time limit then we'll raise a timeout error. 
Otherwise, the worker + is up and running and we can continue with our tests. + + :param worker_name: The name of the worker we're checking on + :param verbose: If true, enable print statements to show where we're at in execution + """ + max_wait_time = 2 # Maximum wait time in seconds + wait_interval = 0.5 # Interval between checks in seconds + waited_time = 0 + worker_ready = False + + if verbose: + print(f"waiting for {worker_name} to launch...") + + # Wait until the worker is ready + while waited_time < max_wait_time: + if self._is_worker_ready(worker_name, verbose=verbose): + worker_ready = True + break + + sleep(wait_interval) + waited_time += wait_interval + + if not worker_ready: + raise TimeoutError("Celery workers did not start within the expected time.") + + if verbose: + print(f"{worker_name} launched") + + def start_worker(self, worker_launch_cmd: List[str]): + """ + This is where a worker is actually started. Each worker maintains control of a process until + we tell it to stop, that's why we have to use the multiprocessing library for this. We have to use + app.worker_main instead of the normal "celery -A worker" command to launch the workers + since our celery app is created in a pytest fixture and is unrecognizable by the celery command. + For each worker, the output of it's logs are sent to + /tmp/`whoami`/pytest-of-`whoami`/pytest-current/integration_outfiles_current/ under a file with a name + similar to: test_worker_*.log. + NOTE: pytest-current/ will have the results of the most recent test run. If you want to see a previous run + check under pytest-/. HOWEVER, only the 3 most recent test runs will be saved. + + :param worker_launch_cmd: The command to launch a worker + """ + self.app.worker_main(worker_launch_cmd) + + def launch_worker(self, worker_name: str, queues: List[str], concurrency: int = 1): + """ + Launch a single worker. We'll add the process that the worker is running in to the list of worker processes. 
+ We'll also create an echo process to simulate a celery worker command that will show up with 'ps ux'. + + :param worker_name: The name to give to the worker + :param queues: A list of queues that the worker will be watching + :param concurrency: The concurrency value of the worker (how many child processes to have the worker spin up) + """ + # Check to make sure we have a unique worker name so we can track all processes + if worker_name in self.worker_processes: + self.stop_all_workers() + raise ValueError(f"The worker {worker_name} is already running. Choose a different name.") + + # Create the launch command for this worker + worker_launch_cmd = [ + "worker", + "-n", + worker_name, + "-Q", + ",".join(queues), + "--concurrency", + str(concurrency), + f"--logfile={worker_name}.log", + "--loglevel=DEBUG", + ] + + # Create an echo command to simulate a running celery worker since our celery worker will be spun up in + # a different process and we won't be able to see it with 'ps ux' like we normally would + echo_process = subprocess.Popen( # pylint: disable=consider-using-with + f"echo 'celery merlin_test_app {' '.join(worker_launch_cmd)}'; sleep inf", + shell=True, + preexec_fn=os.setpgrp, # Make this the parent of the group so we can kill the 'sleep inf' that's spun up + ) + self.echo_processes[worker_name] = echo_process.pid + + # Start the worker in a separate process since it'll take control of the entire process until we kill it + worker_process = multiprocessing.Process(target=self.start_worker, args=(worker_launch_cmd,)) + worker_process.start() + self.worker_processes[worker_name] = worker_process + self.running_workers.append(worker_name) + + # Wait for the worker to launch properly + try: + self._wait_for_worker_launch(worker_name, verbose=False) + except TimeoutError as exc: + self.stop_all_workers() + raise exc + + def launch_workers(self, worker_info: Dict[str, Dict]): + """ + Launch multiple workers. 
This will call `launch_worker` to launch each worker + individually. + + :param worker_info: A dict of worker info with the form + {"worker_name": {"concurrency": , "queues": }} + """ + for worker_name, worker_settings in worker_info.items(): + self.launch_worker(worker_name, worker_settings["queues"], worker_settings["concurrency"]) + + def stop_worker(self, worker_name: str): + """ + Stop a single running worker and its associated processes. + + :param worker_name: The name of the worker to shutdown + """ + # Send a shutdown signal to the worker + self.app.control.broadcast("shutdown", destination=[f"celery@{worker_name}"]) + + # Try to terminate the process gracefully + if self.worker_processes[worker_name] is not None: + self.worker_processes[worker_name].terminate() + process_exit_code = self.worker_processes[worker_name].join(timeout=3) + + # If it won't terminate then force kill it + if process_exit_code is None: + self.worker_processes[worker_name].kill() + + # Terminate the echo process and its sleep inf subprocess + os.killpg(os.getpgid(self.echo_processes[worker_name]), signal.SIGTERM) + sleep(2) + + def stop_all_workers(self): + """ + Stop all of the running workers and the processes associated with them. + """ + for worker_name in self.running_workers: + self.stop_worker(worker_name) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..38c6b0334 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,209 @@ +############################################################################### +# Copyright (c) 2023, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory +# Written by the Merlin dev team, listed in the CONTRIBUTORS file. +# +# +# LLNL-CODE-797170 +# All rights reserved. +# This file is part of Merlin, Version: 1.11.1. +# +# For details, see https://github.com/LLNL/merlin. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +############################################################################### +""" +This module contains pytest fixtures to be used throughout the entire +integration test suite. +""" +import os +import subprocess +from time import sleep +from typing import Dict + +import pytest +import redis +from _pytest.tmpdir import TempPathFactory +from celery import Celery +from celery.canvas import Signature + +from tests.celery_test_workers import CeleryTestWorkersManager + + +class RedisServerError(Exception): + """ + Exception to signal that the server wasn't pinged properly. + """ + + +class ServerInitError(Exception): + """ + Exception to signal that there was an error initializing the server. + """ + + +@pytest.fixture(scope="session") +def temp_output_dir(tmp_path_factory: TempPathFactory) -> str: + """ + This fixture will create a temporary directory to store output files of integration tests. 
+ The temporary directory will be stored at /tmp/`whoami`/pytest-of-`whoami`/. There can be at most + 3 temp directories in this location so upon the 4th test run, the 1st temp directory will be removed. + + :param tmp_path_factory: A built in factory with pytest to help create temp paths for testing + :yields: The path to the temp output directory we'll use for this test run + """ + # Log the cwd, then create and move into the temporary one + cwd = os.getcwd() + temp_integration_outfile_dir = tmp_path_factory.mktemp("integration_outfiles_") + os.chdir(temp_integration_outfile_dir) + + yield temp_integration_outfile_dir + + # Move back to the directory we started at + os.chdir(cwd) + + +@pytest.fixture(scope="session") +def redis_pass() -> str: + """ + This fixture represents the password to the merlin test server. + + :returns: The redis password for our test server + """ + return "merlin-test-server" + + +@pytest.fixture(scope="session") +def merlin_server_dir(temp_output_dir: str, redis_pass: str) -> str: # pylint: disable=redefined-outer-name + """ + This fixture will initialize the merlin server (i.e. create all the files we'll + need to start up a local redis server). It will return the path to the directory + containing the files needed for the server to start up. 
+ + :param temp_output_dir: The path to the temporary output directory we'll be using for this test run + :param redis_pass: The password to the test redis server that we'll create here + :returns: The path to the merlin_server directory with the server configurations + """ + # Initialize the setup for the local redis server + # We'll also set the password to 'merlin-test-server' so it'll be easy to shutdown if there's an issue + subprocess.run(f"merlin server init; merlin server config -pwd {redis_pass}", shell=True, capture_output=True, text=True) + + # Check that the merlin server was initialized properly + server_dir = f"{temp_output_dir}/merlin_server" + if not os.path.exists(server_dir): + raise ServerInitError("The merlin server was not initialized properly.") + + return server_dir + + +@pytest.fixture(scope="session") +def redis_server(merlin_server_dir: str, redis_pass: str) -> str: # pylint: disable=redefined-outer-name,unused-argument + """ + Start a redis server instance that runs on localhost:6379. This will yield the + redis server uri that can be used to create a connection with celery. + + :param merlin_server_dir: The directory to the merlin test server configuration. + This will not be used here but we need the server configurations before we can + start the server. + :param redis_pass: The raw redis password stored in the redis.pass file + :yields: The local redis server uri + """ + # Start the local redis server + try: + # Need to set LC_ALL='C' before starting the server or else redis causes a failure + subprocess.run("export LC_ALL='C'; merlin server start", shell=True, timeout=5) + except subprocess.TimeoutExpired: + pass + + # Ensure the server started properly + host = "localhost" + port = 6379 + database = 0 + username = "default" + redis_client = redis.Redis(host=host, port=port, db=database, password=redis_pass, username=username) + if not redis_client.ping(): + raise RedisServerError("The redis server could not be pinged. 
Check that the server is running with 'ps ux'.") + + # Hand over the redis server url to any other fixtures/tests that need it + redis_server_uri = f"redis://{username}:{redis_pass}@{host}:{port}/{database}" + yield redis_server_uri + + # Kill the server; don't run this until all tests are done (accomplished with 'yield' above) + kill_process = subprocess.run("merlin server stop", shell=True, capture_output=True, text=True) + assert "Merlin server terminated." in kill_process.stderr + + +@pytest.fixture(scope="session") +def celery_app(redis_server: str) -> Celery: # pylint: disable=redefined-outer-name + """ + Create the celery app to be used throughout our integration tests. + + :param redis_server: The redis server uri we'll use to connect to redis + :returns: The celery app object we'll use for testing + """ + return Celery("merlin_test_app", broker=redis_server, backend=redis_server) + + +@pytest.fixture(scope="session") +def sleep_sig(celery_app: Celery) -> Signature: # pylint: disable=redefined-outer-name + """ + Create a task registered to our celery app and return a signature for it. + Once requested by a test, you can set the queue you'd like to send this to + with `sleep_sig.set(queue=)`. Here, will likely be + one of the queues defined in the `worker_queue_map` fixture. 
+ + :param celery_app: The celery app object we'll use for testing + :returns: A celery signature for a task that will sleep for 3 seconds + """ + + # Create a celery task that sleeps for 3 sec + @celery_app.task + def sleep_task(): + print("running sleep task") + sleep(3) + + # Create a signature for this task + return sleep_task.s() + + +@pytest.fixture(scope="session") +def worker_queue_map() -> Dict[str, str]: + """ + Worker and queue names to be used throughout tests + + :returns: A dict of dummy worker/queue associations + """ + return {f"test_worker_{i}": f"test_queue_{i}" for i in range(3)} + + +@pytest.fixture(scope="class") +def launch_workers(celery_app: Celery, worker_queue_map: Dict[str, str]): # pylint: disable=redefined-outer-name + """ + Launch the workers on the celery app fixture using the worker and queue names + defined in the worker_queue_map fixture. + + :param celery_app: The celery app fixture that's connected to our redis server + :param worker_queue_map: A dict where the keys are worker names and the values are queue names + """ + # Format worker info in a format the our workers manager will be able to read + # (basically just add in concurrency value to worker_queue_map) + worker_info = {worker_name: {"concurrency": 1, "queues": [queue]} for worker_name, queue in worker_queue_map.items()} + + with CeleryTestWorkersManager(celery_app) as workers_manager: + workers_manager.launch_workers(worker_info) + yield diff --git a/tests/integration/conditions.py b/tests/integration/conditions.py index b25010ca2..80e3e5855 100644 --- a/tests/integration/conditions.py +++ b/tests/integration/conditions.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. 
# diff --git a/tests/integration/test_definitions.py b/tests/integration/definitions.py similarity index 99% rename from tests/integration/test_definitions.py rename to tests/integration/definitions.py index f59acf237..273fa7c56 100644 --- a/tests/integration/test_definitions.py +++ b/tests/integration/definitions.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # diff --git a/tests/integration/run_tests.py b/tests/integration/run_tests.py index 81c88d110..ace508b19 100644 --- a/tests/integration/run_tests.py +++ b/tests/integration/run_tests.py @@ -6,7 +6,7 @@ # # LLNL-CODE-797170 # All rights reserved. -# This file is part of Merlin, Version: 1.11.0. +# This file is part of Merlin, Version: 1.11.1. # # For details, see https://github.com/LLNL/merlin. # @@ -41,7 +41,7 @@ from tabulate import tabulate -from tests.integration.test_definitions import OUTPUT_DIR, define_tests +from tests.integration.definitions import OUTPUT_DIR, define_tests # pylint: disable=E0401 def get_definition_issues(test): @@ -236,7 +236,7 @@ def run_tests(args, tests): # pylint: disable=R0914 total += 1 continue dot_length = 50 - len(test_name) - len(str(test_label)) - print(f"TEST {test_label}: {test_name}{'.'*dot_length}", end="") + print(f"TEST {test_label}: {test_name}{'.' 
* dot_length}", end="") # Check the format of the test definition definition_issues = get_definition_issues(test) if definition_issues: diff --git a/tests/unit/study/status_test_files/shared_tests.py b/tests/unit/study/status_test_files/shared_tests.py index ecf553b56..27fb31521 100644 --- a/tests/unit/study/status_test_files/shared_tests.py +++ b/tests/unit/study/status_test_files/shared_tests.py @@ -69,6 +69,10 @@ def assert_correct_attribute_creation(status_obj: Union[Status, DetailedStatus]) ) assert requested_statuses_diff == {} + # Ensuring run time info was calculated correctly + run_time_info_diff = DeepDiff(status_obj.run_time_info, status_test_variables.RUN_TIME_INFO, ignore_order=True) + assert run_time_info_diff == {} + # Ensuring num_requested_statuses is getting the correct amount of statuses assert status_obj.num_requested_statuses == status_test_variables.NUM_ALL_REQUESTED_STATUSES @@ -239,7 +243,7 @@ def run_csv_dump_test(status_obj: Union[Status, DetailedStatus], expected_output """ Test the csv dump functionality. This tests both the write and append dump functionalities. The file needs to exist already for an append so it's - better to keep these tests together. This covers the format_status_for_display + better to keep these tests together. This covers the format_status_for_csv and dump methods. 
:param `status_obj`: A Status or DetailedStatus object that we're testing the dump functionality for diff --git a/tests/unit/study/status_test_files/status_test_study_20230717-162921/cancel_step/MERLIN_STATUS.json b/tests/unit/study/status_test_files/status_test_study_20230717-162921/cancel_step/MERLIN_STATUS.json index 8bed4fb33..8b5ddd35d 100644 --- a/tests/unit/study/status_test_files/status_test_study_20230717-162921/cancel_step/MERLIN_STATUS.json +++ b/tests/unit/study/status_test_files/status_test_study_20230717-162921/cancel_step/MERLIN_STATUS.json @@ -1,5 +1,9 @@ { "cancel_step": { + "parameters": { + "cmd": null, + "restart": null + }, "task_queue": "cancel_queue", "worker_name": "other_worker", "cancel_step": { diff --git a/tests/unit/study/status_test_files/status_test_study_20230717-162921/cancel_step/status.lock b/tests/unit/study/status_test_files/status_test_study_20230717-162921/cancel_step/status.lock old mode 100755 new mode 100644 diff --git a/tests/unit/study/status_test_files/status_test_study_20230717-162921/fail_step/MERLIN_STATUS.json b/tests/unit/study/status_test_files/status_test_study_20230717-162921/fail_step/MERLIN_STATUS.json index f1adbf765..6e076a26e 100644 --- a/tests/unit/study/status_test_files/status_test_study_20230717-162921/fail_step/MERLIN_STATUS.json +++ b/tests/unit/study/status_test_files/status_test_study_20230717-162921/fail_step/MERLIN_STATUS.json @@ -1,5 +1,9 @@ { "fail_step": { + "parameters": { + "cmd": null, + "restart": null + }, "task_queue": "fail_queue", "worker_name": "other_worker", "fail_step": { diff --git a/tests/unit/study/status_test_files/status_test_study_20230717-162921/fail_step/status.lock b/tests/unit/study/status_test_files/status_test_study_20230717-162921/fail_step/status.lock old mode 100755 new mode 100644 diff --git a/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_parameters/GREET.hello.LEAVE.goodbye/MERLIN_STATUS.json 
b/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_parameters/GREET.hello.LEAVE.goodbye/MERLIN_STATUS.json index b70bfd369..406c090b7 100644 --- a/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_parameters/GREET.hello.LEAVE.goodbye/MERLIN_STATUS.json +++ b/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_parameters/GREET.hello.LEAVE.goodbye/MERLIN_STATUS.json @@ -1,5 +1,13 @@ { "just_parameters_GREET.hello.LEAVE.goodbye": { + "parameters": { + "cmd": { + "GREET": "hello" + }, + "restart": { + "LEAVE": "goodbye" + } + }, "task_queue": "just_parameters_queue", "worker_name": "other_worker", "just_parameters/GREET.hello.LEAVE.goodbye": { diff --git a/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_parameters/GREET.hello.LEAVE.goodbye/status.lock b/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_parameters/GREET.hello.LEAVE.goodbye/status.lock old mode 100755 new mode 100644 diff --git a/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_parameters/GREET.hola.LEAVE.adios/MERLIN_STATUS.json b/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_parameters/GREET.hola.LEAVE.adios/MERLIN_STATUS.json index 2640668f7..bf783d98f 100644 --- a/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_parameters/GREET.hola.LEAVE.adios/MERLIN_STATUS.json +++ b/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_parameters/GREET.hola.LEAVE.adios/MERLIN_STATUS.json @@ -1,5 +1,13 @@ { "just_parameters_GREET.hola.LEAVE.adios": { + "parameters": { + "cmd": { + "GREET": "hola" + }, + "restart": { + "LEAVE": "adios" + } + }, "task_queue": "just_parameters_queue", "worker_name": "other_worker", "just_parameters/GREET.hola.LEAVE.adios": { diff --git a/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_parameters/GREET.hola.LEAVE.adios/status.lock 
b/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_parameters/GREET.hola.LEAVE.adios/status.lock old mode 100755 new mode 100644 diff --git a/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_samples/MERLIN_STATUS.json b/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_samples/MERLIN_STATUS.json index 3f49c3df2..d7df3d153 100644 --- a/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_samples/MERLIN_STATUS.json +++ b/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_samples/MERLIN_STATUS.json @@ -1,5 +1,9 @@ { "just_samples": { + "parameters": { + "cmd": null, + "restart": null + }, "task_queue": "just_samples_queue", "worker_name": "sample_worker", "just_samples/00": { diff --git a/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_samples/status.lock b/tests/unit/study/status_test_files/status_test_study_20230717-162921/just_samples/status.lock old mode 100755 new mode 100644 diff --git a/tests/unit/study/status_test_files/status_test_study_20230717-162921/params_and_samples/GREET.hello/MERLIN_STATUS.json b/tests/unit/study/status_test_files/status_test_study_20230717-162921/params_and_samples/GREET.hello/MERLIN_STATUS.json index 42d38c58d..364248c43 100644 --- a/tests/unit/study/status_test_files/status_test_study_20230717-162921/params_and_samples/GREET.hello/MERLIN_STATUS.json +++ b/tests/unit/study/status_test_files/status_test_study_20230717-162921/params_and_samples/GREET.hello/MERLIN_STATUS.json @@ -1,5 +1,11 @@ { "params_and_samples_GREET.hello": { + "parameters": { + "cmd": { + "GREET": "hello" + }, + "restart": null + }, "task_queue": "both_queue", "worker_name": "sample_worker", "params_and_samples/GREET.hello/00": { diff --git a/tests/unit/study/status_test_files/status_test_study_20230717-162921/params_and_samples/GREET.hello/status.lock 
b/tests/unit/study/status_test_files/status_test_study_20230717-162921/params_and_samples/GREET.hello/status.lock old mode 100755 new mode 100644 diff --git a/tests/unit/study/status_test_files/status_test_study_20230717-162921/params_and_samples/GREET.hola/MERLIN_STATUS.json b/tests/unit/study/status_test_files/status_test_study_20230717-162921/params_and_samples/GREET.hola/MERLIN_STATUS.json index 6d3596b5b..b256e65f5 100644 --- a/tests/unit/study/status_test_files/status_test_study_20230717-162921/params_and_samples/GREET.hola/MERLIN_STATUS.json +++ b/tests/unit/study/status_test_files/status_test_study_20230717-162921/params_and_samples/GREET.hola/MERLIN_STATUS.json @@ -1,5 +1,11 @@ { "params_and_samples_GREET.hola": { + "parameters": { + "cmd": { + "GREET": "hola" + }, + "restart": null + }, "task_queue": "both_queue", "worker_name": "sample_worker", "params_and_samples/GREET.hola/00": { diff --git a/tests/unit/study/status_test_files/status_test_study_20230717-162921/params_and_samples/GREET.hola/status.lock b/tests/unit/study/status_test_files/status_test_study_20230717-162921/params_and_samples/GREET.hola/status.lock old mode 100755 new mode 100644 diff --git a/tests/unit/study/status_test_files/status_test_variables.py b/tests/unit/study/status_test_files/status_test_variables.py index 0832204e5..45621a3b5 100644 --- a/tests/unit/study/status_test_files/status_test_variables.py +++ b/tests/unit/study/status_test_files/status_test_variables.py @@ -57,6 +57,80 @@ } NUM_ALL_REQUESTED_STATUSES = sum(TASKS_PER_STEP.values()) - TASKS_PER_STEP["unstarted_step"] +# This is the requested statuses with just the failed step +REQUESTED_STATUSES_JUST_FAILED_STEP = { + "fail_step": { + "parameters": {"cmd": None, "restart": None}, + "task_queue": "fail_queue", + "worker_name": "other_worker", + "fail_step": { + "status": "FAILED", + "return_code": "MERLIN_SOFT_FAIL", + "elapsed_time": "0d:00h:00m:00s", + "run_time": "0d:00h:00m:00s", + "restarts": 0, + }, + } +} + +# 
This is the requested statuses with just the cancelled step +REQUESTED_STATUSES_JUST_CANCELLED_STEP = { + "cancel_step": { + "parameters": {"cmd": None, "restart": None}, + "task_queue": "cancel_queue", + "worker_name": "other_worker", + "cancel_step": { + "status": "CANCELLED", + "return_code": "MERLIN_STOP_WORKERS", + "elapsed_time": "0d:00h:00m:00s", + "run_time": "0d:00h:00m:00s", + "restarts": 0, + }, + } +} + +# This is the requested statuses with both the failed step and the cancelled step +REQUESTED_STATUSES_FAIL_AND_CANCEL = { + "fail_step": { + "parameters": {"cmd": None, "restart": None}, + "task_queue": "fail_queue", + "worker_name": "other_worker", + "fail_step": { + "status": "FAILED", + "return_code": "MERLIN_SOFT_FAIL", + "elapsed_time": "0d:00h:00m:00s", + "run_time": "0d:00h:00m:00s", + "restarts": 0, + }, + }, + "cancel_step": { + "parameters": {"cmd": None, "restart": None}, + "task_queue": "cancel_queue", + "worker_name": "other_worker", + "cancel_step": { + "status": "CANCELLED", + "return_code": "MERLIN_STOP_WORKERS", + "elapsed_time": "0d:00h:00m:00s", + "run_time": "0d:00h:00m:00s", + "restarts": 0, + }, + }, +} + +FORMATTED_STATUSES_FAIL_AND_CANCEL = { + "step_name": ["fail_step", "cancel_step"], + "step_workspace": ["fail_step", "cancel_step"], + "status": ["FAILED", "CANCELLED"], + "return_code": ["MERLIN_SOFT_FAIL", "MERLIN_STOP_WORKERS"], + "elapsed_time": ["0d:00h:00m:00s", "0d:00h:00m:00s"], + "run_time": ["0d:00h:00m:00s", "0d:00h:00m:00s"], + "restarts": [0, 0], + "cmd_parameters": ["-------", "-------"], + "restart_parameters": ["-------", "-------"], + "task_queue": ["fail_queue", "cancel_queue"], + "worker_name": ["other_worker", "other_worker"], +} + # This variable holds the state_info dict of every step from VALID_WORKSPACE # i.e. 
the format returned by the display() method when run in test_mode DISPLAY_INFO = { @@ -133,213 +207,223 @@ "unstarted_step": "UNSTARTED", } -# This variable holds every status from the VALID_WORKSPACE in the format used when we first load them in -# i.e. the format loaded in by load_requested_statuses() -ALL_REQUESTED_STATUSES = { +RUN_TIME_INFO = { "just_parameters": { "avg_run_time": "01m:15s", "run_time_std_dev": "±15s", - "just_parameters_GREET.hello.LEAVE.goodbye": { - "task_queue": "just_parameters_queue", - "worker_name": "other_worker", - "just_parameters/GREET.hello.LEAVE.goodbye": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:00h:02m:00s", - "run_time": "0d:00h:01m:30s", - "restarts": 0, - }, - }, - "just_parameters_GREET.hola.LEAVE.adios": { - "task_queue": "just_parameters_queue", - "worker_name": "other_worker", - "just_parameters/GREET.hola.LEAVE.adios": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:00h:01m:00s", - "run_time": "0d:00h:01m:00s", - "restarts": 0, - }, - }, }, "just_samples": { "avg_run_time": "01m:30s", "run_time_std_dev": "±21s", - "just_samples": { - "task_queue": "just_samples_queue", - "worker_name": "sample_worker", - "just_samples/00": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:00h:02m:00s", - "run_time": "0d:00h:01m:00s", - "restarts": 0, - }, - "just_samples/01": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:00h:02m:00s", - "run_time": "0d:00h:01m:15s", - "restarts": 0, - }, - "just_samples/02": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:00h:02m:00s", - "run_time": "0d:00h:01m:30s", - "restarts": 0, - }, - "just_samples/03": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:00h:02m:00s", - "run_time": "0d:00h:01m:45s", - "restarts": 0, - }, - "just_samples/04": { - "status": "FINISHED", - "return_code": 
"MERLIN_SUCCESS", - "elapsed_time": "0d:00h:02m:00s", - "run_time": "0d:00h:02m:00s", - "restarts": 0, - }, - }, }, "params_and_samples": { "avg_run_time": "16s", "run_time_std_dev": "±06s", - "params_and_samples_GREET.hello": { - "task_queue": "both_queue", - "worker_name": "sample_worker", - "params_and_samples/GREET.hello/00": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:00h:00m:15s", - "run_time": "0d:00h:00m:10s", - "restarts": 0, - }, - "params_and_samples/GREET.hello/01": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:00h:00m:15s", - "run_time": "0d:00h:00m:11s", - "restarts": 0, - }, - "params_and_samples/GREET.hello/02": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:00h:00m:15s", - "run_time": "0d:00h:00m:12s", - "restarts": 0, - }, - "params_and_samples/GREET.hello/03": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:00h:00m:15s", - "run_time": "0d:00h:00m:13s", - "restarts": 0, - }, - "params_and_samples/GREET.hello/04": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:00h:00m:15s", - "run_time": "0d:00h:00m:14s", - "restarts": 0, - }, - }, - "params_and_samples_GREET.hola": { - "task_queue": "both_queue", - "worker_name": "sample_worker", - "params_and_samples/GREET.hola/00": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:00h:00m:30s", - "run_time": "0d:00h:00m:10s", - "restarts": 0, - }, - "params_and_samples/GREET.hola/01": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:00h:00m:30s", - "run_time": "0d:00h:00m:18s", - "restarts": 0, - }, - "params_and_samples/GREET.hola/02": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:00h:00m:30s", - "run_time": "0d:00h:00m:23s", - "restarts": 0, - }, - "params_and_samples/GREET.hola/03": { - "status": "FINISHED", - "return_code": 
"MERLIN_SUCCESS", - "elapsed_time": "0d:00h:00m:30s", - "run_time": "0d:00h:00m:29s", - "restarts": 0, - }, - "params_and_samples/GREET.hola/04": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:00h:00m:30s", - "run_time": "0d:00h:00m:16s", - "restarts": 0, - }, - }, }, "fail_step": { "avg_run_time": "00s", "run_time_std_dev": "±00s", - "fail_step": { - "task_queue": "fail_queue", - "worker_name": "other_worker", - "fail_step": { - "status": "FAILED", - "return_code": "MERLIN_SOFT_FAIL", - "elapsed_time": "0d:00h:00m:00s", - "run_time": "0d:00h:00m:00s", - "restarts": 0, - }, - }, }, "cancel_step": { "avg_run_time": "00s", "run_time_std_dev": "±00s", + }, +} + +# This variable holds every status from the VALID_WORKSPACE in the format used when we first load them in +# i.e. the format loaded in by load_requested_statuses() +ALL_REQUESTED_STATUSES = { + "just_parameters_GREET.hello.LEAVE.goodbye": { + "parameters": {"cmd": {"GREET": "hello"}, "restart": {"LEAVE": "goodbye"}}, + "task_queue": "just_parameters_queue", + "worker_name": "other_worker", + "just_parameters/GREET.hello.LEAVE.goodbye": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:02m:00s", + "run_time": "0d:00h:01m:30s", + "restarts": 0, + }, + }, + "just_parameters_GREET.hola.LEAVE.adios": { + "parameters": {"cmd": {"GREET": "hola"}, "restart": {"LEAVE": "adios"}}, + "task_queue": "just_parameters_queue", + "worker_name": "other_worker", + "just_parameters/GREET.hola.LEAVE.adios": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:01m:00s", + "run_time": "0d:00h:01m:00s", + "restarts": 0, + }, + }, + "just_samples": { + "parameters": {"cmd": None, "restart": None}, + "task_queue": "just_samples_queue", + "worker_name": "sample_worker", + "just_samples/00": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:02m:00s", + "run_time": "0d:00h:01m:00s", + "restarts": 
0, + }, + "just_samples/01": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:02m:00s", + "run_time": "0d:00h:01m:15s", + "restarts": 0, + }, + "just_samples/02": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:02m:00s", + "run_time": "0d:00h:01m:30s", + "restarts": 0, + }, + "just_samples/03": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:02m:00s", + "run_time": "0d:00h:01m:45s", + "restarts": 0, + }, + "just_samples/04": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:02m:00s", + "run_time": "0d:00h:02m:00s", + "restarts": 0, + }, + }, + "params_and_samples_GREET.hello": { + "parameters": {"cmd": {"GREET": "hello"}, "restart": None}, + "task_queue": "both_queue", + "worker_name": "sample_worker", + "params_and_samples/GREET.hello/00": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:00m:15s", + "run_time": "0d:00h:00m:10s", + "restarts": 0, + }, + "params_and_samples/GREET.hello/01": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:00m:15s", + "run_time": "0d:00h:00m:11s", + "restarts": 0, + }, + "params_and_samples/GREET.hello/02": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:00m:15s", + "run_time": "0d:00h:00m:12s", + "restarts": 0, + }, + "params_and_samples/GREET.hello/03": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:00m:15s", + "run_time": "0d:00h:00m:13s", + "restarts": 0, + }, + "params_and_samples/GREET.hello/04": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:00m:15s", + "run_time": "0d:00h:00m:14s", + "restarts": 0, + }, + }, + "params_and_samples_GREET.hola": { + "parameters": {"cmd": {"GREET": "hola"}, "restart": None}, + "task_queue": "both_queue", + "worker_name": "sample_worker", + 
"params_and_samples/GREET.hola/00": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:00m:30s", + "run_time": "0d:00h:00m:10s", + "restarts": 0, + }, + "params_and_samples/GREET.hola/01": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:00m:30s", + "run_time": "0d:00h:00m:18s", + "restarts": 0, + }, + "params_and_samples/GREET.hola/02": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:00m:30s", + "run_time": "0d:00h:00m:23s", + "restarts": 0, + }, + "params_and_samples/GREET.hola/03": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:00m:30s", + "run_time": "0d:00h:00m:29s", + "restarts": 0, + }, + "params_and_samples/GREET.hola/04": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:00m:30s", + "run_time": "0d:00h:00m:16s", + "restarts": 0, + }, + }, + "fail_step": { + "parameters": {"cmd": None, "restart": None}, + "task_queue": "fail_queue", + "worker_name": "other_worker", + "fail_step": { + "status": "FAILED", + "return_code": "MERLIN_SOFT_FAIL", + "elapsed_time": "0d:00h:00m:00s", + "run_time": "0d:00h:00m:00s", + "restarts": 0, + }, + }, + "cancel_step": { + "parameters": {"cmd": None, "restart": None}, + "task_queue": "cancel_queue", + "worker_name": "other_worker", "cancel_step": { - "task_queue": "cancel_queue", - "worker_name": "other_worker", - "cancel_step": { - "status": "CANCELLED", - "return_code": "MERLIN_STOP_WORKERS", - "elapsed_time": "0d:00h:00m:00s", - "run_time": "0d:00h:00m:00s", - "restarts": 0, - }, + "status": "CANCELLED", + "return_code": "MERLIN_STOP_WORKERS", + "elapsed_time": "0d:00h:00m:00s", + "run_time": "0d:00h:00m:00s", + "restarts": 0, }, }, } # This variable holds every status from the VALID_WORKSPACE in the format used for displaying/dumping statuses -# i.e. the format returned by format_status_for_display() +# i.e. 
the format returned by format_status_for_csv() ALL_FORMATTED_STATUSES = { "step_name": [ - "just_parameters", - "just_parameters", + "just_parameters_GREET.hello.LEAVE.goodbye", + "just_parameters_GREET.hola.LEAVE.adios", "just_samples", "just_samples", "just_samples", "just_samples", "just_samples", - "params_and_samples", - "params_and_samples", - "params_and_samples", - "params_and_samples", - "params_and_samples", - "params_and_samples", - "params_and_samples", - "params_and_samples", - "params_and_samples", - "params_and_samples", + "params_and_samples_GREET.hello", + "params_and_samples_GREET.hello", + "params_and_samples_GREET.hello", + "params_and_samples_GREET.hello", + "params_and_samples_GREET.hello", + "params_and_samples_GREET.hola", + "params_and_samples_GREET.hola", + "params_and_samples_GREET.hola", + "params_and_samples_GREET.hola", + "params_and_samples_GREET.hola", "fail_step", "cancel_step", ], @@ -449,6 +533,48 @@ "0d:00h:00m:00s", ], "restarts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "cmd_parameters": [ + "GREET:hello", + "GREET:hola", + "-------", + "-------", + "-------", + "-------", + "-------", + "GREET:hello", + "GREET:hello", + "GREET:hello", + "GREET:hello", + "GREET:hello", + "GREET:hola", + "GREET:hola", + "GREET:hola", + "GREET:hola", + "GREET:hola", + "-------", + "-------", + ], + "restart_parameters": [ + "LEAVE:goodbye", + "LEAVE:adios", + "-------", + "-------", + "-------", + "-------", + "-------", + "-------", + "-------", + "-------", + "-------", + "-------", + "-------", + "-------", + "-------", + "-------", + "-------", + "-------", + "-------", + ], "task_queue": [ "just_parameters_queue", "just_parameters_queue", diff --git a/tests/unit/study/test_celeryadapter.py b/tests/unit/study/test_celeryadapter.py new file mode 100644 index 000000000..67728881e --- /dev/null +++ b/tests/unit/study/test_celeryadapter.py @@ -0,0 +1,256 @@ 
+############################################################################### +# Copyright (c) 2023, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory +# Written by the Merlin dev team, listed in the CONTRIBUTORS file. +# +# +# LLNL-CODE-797170 +# All rights reserved. +# This file is part of Merlin, Version: 1.11.1. +# +# For details, see https://github.com/LLNL/merlin. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +############################################################################### +""" +Tests for the celeryadapter module. +""" +from time import sleep +from typing import Dict + +import pytest +from celery import Celery +from celery.canvas import Signature + +from merlin.config import Config +from merlin.study import celeryadapter + + +@pytest.mark.order(before="TestInactive") +class TestActive: + """ + This class will test functions in the celeryadapter.py module. 
+ It will run tests where we need active queues/workers to interact with. + + NOTE: The tests in this class must be ran before the TestInactive class or else the + Celery workers needed for this class don't start + + TODO: fix the bug noted above and then check if we still need pytest-order + """ + + def test_query_celery_queues( + self, celery_app: Celery, launch_workers: "Fixture", worker_queue_map: Dict[str, str] # noqa: F821 + ): + """ + Test the query_celery_queues function by providing it with a list of active queues. + This should return a dict where keys are queue names and values are more dicts containing + the number of jobs and consumers in that queue. + + :param `celery_app`: A pytest fixture for the test Celery app + :param launch_workers: A pytest fixture that launches celery workers for us to interact with + :param worker_queue_map: A pytest fixture that returns a dict of workers and queues + """ + # Set up a dummy configuration to use in the test + dummy_config = Config({"broker": {"name": "redis"}}) + + # Get the actual output + queues_to_query = list(worker_queue_map.values()) + actual_queue_info = celeryadapter.query_celery_queues(queues_to_query, app=celery_app, config=dummy_config) + + # Ensure all 3 queues in worker_queue_map were queried before looping + assert len(actual_queue_info) == 3 + + # Ensure each queue has a worker attached + for queue_name, queue_info in actual_queue_info.items(): + assert queue_name in worker_queue_map.values() + assert queue_info == {"consumers": 1, "jobs": 0} + + def test_get_running_queues(self, launch_workers: "Fixture", worker_queue_map: Dict[str, str]): # noqa: F821 + """ + Test the get_running_queues function with queues active. + This should return a list of active queues. 
+ + :param `launch_workers`: A pytest fixture that launches celery workers for us to interact with + :param `worker_queue_map`: A pytest fixture that returns a dict of workers and queues + """ + result = celeryadapter.get_running_queues("merlin_test_app", test_mode=True) + assert sorted(result) == sorted(list(worker_queue_map.values())) + + def test_get_active_celery_queues( + self, celery_app: Celery, launch_workers: "Fixture", worker_queue_map: Dict[str, str] # noqa: F821 + ): + """ + Test the get_active_celery_queues function with queues active. + This should return a tuple where the first entry is a dict of queue info + and the second entry is a list of worker names. + + :param `celery_app`: A pytest fixture for the test Celery app + :param `launch_workers`: A pytest fixture that launches celery workers for us to interact with + :param `worker_queue_map`: A pytest fixture that returns a dict of workers and queues + """ + # Start the queues and run the test + queue_result, worker_result = celeryadapter.get_active_celery_queues(celery_app) + + # Ensure we got output before looping + assert len(queue_result) == len(worker_result) == 3 + + for worker, queue in worker_queue_map.items(): + # Check that the entry in the queue_result dict for this queue is correct + assert queue in queue_result + assert len(queue_result[queue]) == 1 + assert worker in queue_result[queue][0] + + # Remove this entry from the queue_result dict + del queue_result[queue] + + # Check that this worker was added to the worker_result list + worker_found = False + for worker_name in worker_result[:]: + if worker in worker_name: + worker_found = True + worker_result.remove(worker_name) + break + assert worker_found + + # Ensure there was no extra output that we weren't expecting + assert queue_result == {} + assert worker_result == [] + + @pytest.mark.order(index=1) + def test_check_celery_workers_processing_tasks( + self, + celery_app: Celery, + sleep_sig: Signature, + launch_workers: "Fixture", 
# noqa: F821 + ): + """ + Test the check_celery_workers_processing function with workers active and a task in a queue. + This function will query workers for any tasks they're still processing. We'll send a + a task that sleeps for 3 seconds to our workers before we run this test so that there should be + a task for this function to find. + + NOTE: the celery app fixture shows strange behavior when using app.control.inspect() calls (which + check_celery_workers_processing uses) so we have to run this test first in this class in order to + have it run properly. + + :param celery_app: A pytest fixture for the test Celery app + :param sleep_sig: A pytest fixture for a celery signature of a task that sleeps for 3 sec + :param launch_workers: A pytest fixture that launches celery workers for us to interact with + """ + # Our active workers/queues are test_worker_[0-2]/test_queue_[0-2] so we're + # sending this to test_queue_0 for test_worker_0 to process + queue_for_signature = "test_queue_0" + sleep_sig.set(queue=queue_for_signature) + result = sleep_sig.delay() + + # We need to give the task we just sent to the server a second to get picked up by the worker + sleep(1) + + # Run the test now that the task should be getting processed + active_queue_test = celeryadapter.check_celery_workers_processing([queue_for_signature], celery_app) + assert active_queue_test is True + + # Now test that a queue without any tasks returns false + # We sent the signature to task_queue_0 so task_queue_1 shouldn't have any tasks to find + non_active_queue_test = celeryadapter.check_celery_workers_processing(["test_queue_1"], celery_app) + assert non_active_queue_test is False + + # Wait for the worker to finish running the task + result.get() + + +class TestInactive: + """ + This class will test functions in the celeryadapter.py module. + It will run tests where we don't need any active queues/workers to interact with. 
+ """ + + def test_query_celery_queues(self, celery_app: Celery, worker_queue_map: Dict[str, str]): # noqa: F821 + """ + Test the query_celery_queues function by providing it with a list of inactive queues. + This should return a dict where keys are queue names and values are more dicts containing + the number of jobs and consumers in that queue (which should be 0 for both here). + + :param `celery_app`: A pytest fixture for the test Celery app + :param worker_queue_map: A pytest fixture that returns a dict of workers and queues + """ + # Set up a dummy configuration to use in the test + dummy_config = Config({"broker": {"name": "redis"}}) + + # Get the actual output + queues_to_query = list(worker_queue_map.values()) + actual_queue_info = celeryadapter.query_celery_queues(queues_to_query, app=celery_app, config=dummy_config) + + # Ensure all 3 queues in worker_queue_map were queried before looping + assert len(actual_queue_info) == 3 + + # Ensure each queue has no worker attached (since the queues should be inactive here) + for queue_name, queue_info in actual_queue_info.items(): + assert queue_name in worker_queue_map.values() + assert queue_info == {"consumers": 0, "jobs": 0} + + def test_celerize_queues(self, worker_queue_map: Dict[str, str]): + """ + Test the celerize_queues function. This should add the celery queue_tag + to the front of the queues we provide it. + + :param `worker_queue_map`: A pytest fixture that returns a dict of workers and queues + """ + # Create variables to be used in the test + queue_tag = "[merlin]_" + queues_to_check = list(worker_queue_map.values()) + dummy_config = Config({"celery": {"queue_tag": queue_tag}}) + + # Run the test + celeryadapter.celerize_queues(queues_to_check, dummy_config) + + # Ensure the queue tag was added to every queue + for queue in queues_to_check: + assert queue_tag in queue + + def test_get_running_queues(self): + """ + Test the get_running_queues function with no queues active. 
+ This should return an empty list. + """ + result = celeryadapter.get_running_queues("merlin_test_app", test_mode=True) + assert result == [] + + def test_get_active_celery_queues(self, celery_app: Celery): + """ + Test the get_active_celery_queues function with no queues active. + This should return a tuple where the first entry is an empty dict + and the second entry is an empty list. + + :param `celery_app`: A pytest fixture for the test Celery app + """ + queue_result, worker_result = celeryadapter.get_active_celery_queues(celery_app) + assert queue_result == {} + assert worker_result == [] + + def test_check_celery_workers_processing_tasks(self, celery_app: Celery, worker_queue_map: Dict[str, str]): + """ + Test the check_celery_workers_processing function with no workers active. + This function will query workers for any tasks they're still processing. Since no workers are active + this should return False. + + :param celery_app: A pytest fixture for the test Celery app + """ + # Run the test now that the task should be getting processed + result = celeryadapter.check_celery_workers_processing(list(worker_queue_map.values()), celery_app) + assert result is False diff --git a/tests/unit/study/test_detailed_status.py b/tests/unit/study/test_detailed_status.py new file mode 100644 index 000000000..5d0e65a01 --- /dev/null +++ b/tests/unit/study/test_detailed_status.py @@ -0,0 +1,1288 @@ +############################################################################### +# Copyright (c) 2023, Lawrence Livermore National Security, LLC. +# Produced at the Lawrence Livermore National Laboratory +# Written by the Merlin dev team, listed in the CONTRIBUTORS file. +# +# +# LLNL-CODE-797170 +# All rights reserved. +# This file is part of Merlin, Version: 1.11.0. +# +# For details, see https://github.com/LLNL/merlin. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +############################################################################### +""" +Tests for the DetailedStatus class in the status.py module +""" +import re +import unittest +from argparse import Namespace +from copy import deepcopy +from io import StringIO +from typing import Dict, List +from unittest.mock import MagicMock, call, patch + +import yaml +from deepdiff import DeepDiff + +from merlin.spec.expansion import get_spec_with_expansion +from merlin.study.status import DetailedStatus +from tests.unit.study.status_test_files import shared_tests, status_test_variables + + +class TestBaseDetailedStatus(unittest.TestCase): + """ + Base class for all detailed status tests to provide the setup configuration. + """ + + @classmethod + def setUpClass(cls): + """ + We need to modify the path to the samples file in the expanded spec for these tests. + This will only happen once when these tests are initialized. 
+ """ + # Read in the contents of the expanded spec + with open(status_test_variables.EXPANDED_SPEC_PATH, "r") as expanded_file: + cls.initial_expanded_contents = yaml.load(expanded_file, yaml.Loader) + + # Create a copy of the contents so we can reset the file when these tests are done + modified_contents = deepcopy(cls.initial_expanded_contents) + + # Modify the samples file path + modified_contents["merlin"]["samples"]["file"] = status_test_variables.SAMPLES_PATH + + # Write the new contents to the expanded spec + with open(status_test_variables.EXPANDED_SPEC_PATH, "w") as expanded_file: + yaml.dump(modified_contents, expanded_file) + + @classmethod + def tearDownClass(cls): + """ + When these tests are done we'll reset the contents of the expanded spec path + to their initial states. + """ + with open(status_test_variables.EXPANDED_SPEC_PATH, "w") as expanded_file: + yaml.dump(cls.initial_expanded_contents, expanded_file) + + def setUp(self): + """ + We'll create an argparse namespace here that can be modified on a + test-by-test basis. 
+ """ + # We'll set all of the args needed to create the DetailedStatus object here and then + # just modify them on a test-by-test basis + self.args = Namespace( + subparsers="detailed-status", + level="INFO", + detailed=True, + output_path=None, + task_server="celery", + dump=None, + no_prompts=True, # We'll set this to True here since it's easier to test this way (in most cases) + max_tasks=None, + return_code=None, + steps=["all"], + task_queues=None, + task_status=None, + workers=None, + disable_pager=True, # We'll set this to True here since it's easier to test this way + disable_theme=False, + layout="default", + ) + + # Create the DetailedStatus object without adding any arguments + # We'll modify the arguments on a test-by-test basis + self.detailed_status_obj = DetailedStatus( + args=self.args, spec_display=False, file_or_ws=status_test_variables.VALID_WORKSPACE_PATH + ) + + +class TestSetup(TestBaseDetailedStatus): + """ + This tests the setup of the DetailedStatus class. + """ + + def test_workspace_setup(self): + """ + Test the setup of the DetailedStatus class using a workspace as input. This should have the same + behavior as setting up the Status class but will hold additional args. Here the DetailedStatus + instance is created in setUp but since it doesn't use any filters, we can just make sure all + of the attributes were initiated correctly. 
+ """ + # Ensure the attributes shared with the Status class that are created upon initialization are correct + shared_tests.assert_correct_attribute_creation(self.detailed_status_obj) + + # The steps arg is expanded from ["all"] to a list of every step name upon class creation + self.assertEqual( + self.detailed_status_obj.args.steps, + ["just_samples", "just_parameters", "params_and_samples", "fail_step", "cancel_step", "unstarted_step"], + ) + + # We didn't give the steps filter arg so this should be False + self.assertEqual(self.detailed_status_obj.steps_filter_provided, False) + + def test_spec_setup(self): + """ + Test the setup of the DetailedStatus class using a spec file as input. This should have the same + behavior as setting up the Status class but will hold additional args. + """ + # We have to reset this to be all since it will have already been expanded due to the setUp method from + # the base class + self.args.steps = ["all"] + + # We need to load in the MerlinSpec object and save it to the args we'll give to DetailedStatus + self.args.specification = status_test_variables.SPEC_PATH + self.args.spec_provided = get_spec_with_expansion(self.args.specification) + + # Create the new object using a specification rather than a workspace + detailed_status_obj = DetailedStatus(args=self.args, spec_display=True, file_or_ws=status_test_variables.SPEC_PATH) + + # Ensure the attributes shared with the Status class that are created upon initialization are correct + shared_tests.assert_correct_attribute_creation(detailed_status_obj) + + # The steps arg is expanded from ["all"] to a list of every step name upon class creation + self.assertEqual( + detailed_status_obj.args.steps, + ["just_samples", "just_parameters", "params_and_samples", "fail_step", "cancel_step", "unstarted_step"], + ) + + # We didn't give the steps filter arg so this should be False + self.assertEqual(detailed_status_obj.steps_filter_provided, False) + + +class 
TestDumpFunctionality(TestBaseDetailedStatus): + """ + This class will test the dump functionality for dumping detailed-status + to csv and json files. It will run the same test as we run for the Status + command and it will also run dump tests with some filters applied. + """ + + def test_json_dump(self): + """ + Test the json dump functionality. This tests both the write and append + dump functionalities. The file needs to exist already for an append so it's + better to keep these tests together. + + This will test a json dump using the detailed-status command without applying + any filters. + """ + # Set the dump file + json_dump_file = f"{status_test_variables.PATH_TO_TEST_FILES}/detailed_dump_test.json" + self.detailed_status_obj.args.dump = json_dump_file + + # Run the json dump test + shared_tests.run_json_dump_test(self.detailed_status_obj, status_test_variables.ALL_REQUESTED_STATUSES) + + def test_csv_dump(self): + """ + Test the csv dump functionality. This tests both the write and append + dump functionalities. The file needs to exist already for an append so it's + better to keep these tests together. + + This will test a csv dump using the detailed-status command without applying + any filters. + """ + # Set the dump file + csv_dump_file = f"{status_test_variables.PATH_TO_TEST_FILES}/detailed_dump_test.csv" + self.detailed_status_obj.args.dump = csv_dump_file + + # Run the csv dump test + expected_output = shared_tests.build_row_list(status_test_variables.ALL_FORMATTED_STATUSES) + shared_tests.run_csv_dump_test(self.detailed_status_obj, expected_output) + + def test_json_dump_with_filters(self): + """ + Test the json dump functionality while using filters. This tests both the write and append + dump functionalities. The file needs to exist already for an append so it's + better to keep these tests together. 
+ """ + # Set filters for failed and cancelled tasks, and then reload the requested_statuses + self.detailed_status_obj.args.task_status = ["FAILED", "CANCELLED"] + self.detailed_status_obj.load_requested_statuses() + + # Set the dump file + json_dump_file = f"{status_test_variables.PATH_TO_TEST_FILES}/detailed_dump_test.json" + self.detailed_status_obj.args.dump = json_dump_file + + # Run the json dump test (we should only get failed and cancelled statuses) + shared_tests.run_json_dump_test(self.detailed_status_obj, status_test_variables.REQUESTED_STATUSES_FAIL_AND_CANCEL) + + def test_csv_dump_with_filters(self): + """ + Test the csv dump functionality while using filters. This tests both the write and append + dump functionalities. The file needs to exist already for an append so it's + better to keep these tests together. + """ + # Set filters for failed and cancelled tasks, and then reload the requested_statuses + self.detailed_status_obj.args.task_status = ["FAILED", "CANCELLED"] + self.detailed_status_obj.load_requested_statuses() + + # Set the dump file + csv_dump_file = f"{status_test_variables.PATH_TO_TEST_FILES}/detailed_dump_test.csv" + self.detailed_status_obj.args.dump = csv_dump_file + + # Run the csv dump test (we should only get failed and cancelled statuses) + expected_output = shared_tests.build_row_list(status_test_variables.FORMATTED_STATUSES_FAIL_AND_CANCEL) + shared_tests.run_csv_dump_test(self.detailed_status_obj, expected_output) + + +class TestPromptFunctionality(TestBaseDetailedStatus): + """ + This class is strictly for testing that all prompt functionality that's + possible through the DetailedStatus class is running correctly. 
+ + The types of prompts are: + - prompts for selecting a study to view the status of (similar to Status class) + - prompts for filtering statuses further when using the disable-pager option + + This class will test 5 methods: + - _obtain_study (similar to Status class) + - display and, by association, filter_via_prompts + - get_user_filters + - get_user_max_tasks + """ + + ############################################### + # Testing _obtain_study() + ############################################### + + def test_prompt_for_study_with_valid_input(self): + """ + This is testing the prompt that's displayed when multiple study output + directories are found. This tests the _obtain_study method with valid input. + """ + # We need to load in the MerlinSpec object and save it to the args we'll give to DetailedStatus + self.args.specification = status_test_variables.SPEC_PATH + self.args.spec_provided = get_spec_with_expansion(self.args.specification) + + # We're going to load in a status object without prompts first and then use that to call the method + # that prompts the user for input + status_obj = DetailedStatus(args=self.args, spec_display=True, file_or_ws=status_test_variables.SPEC_PATH) + shared_tests.run_study_selector_prompt_valid_input(status_obj) + + def test_prompt_for_study_with_invalid_input(self): + """ + This is testing the prompt that's displayed when multiple study output + directories are found. This tests the _obtain_study method with invalid inputs. 
+ """ + # We need to load in the MerlinSpec object and save it to the args we'll give to DetailedStatus + self.args.specification = status_test_variables.SPEC_PATH + self.args.spec_provided = get_spec_with_expansion(self.args.specification) + + # We're going to load in a status object without prompts first and then use that to call the method + # that prompts the user for input + status_obj = DetailedStatus(args=self.args, spec_display=True, file_or_ws=status_test_variables.SPEC_PATH) + shared_tests.run_study_selector_prompt_invalid_input(status_obj) + + ############################################### + # Testing get_user_filters() + ############################################### + + def run_get_user_filters_test(self, inputs_to_test: List[str], expected_outputs: List[List[str]]): + """ + This will pass every input in `inputs_to_test` to the get_user_filters + method. All inputs in `inputs_to_test` should be valid inputs to the + prompt displayed in the get_user_filters method. After passing inputs in, + we will capture the result of running that method and compare it against + the expected outputs from `expected_outputs`. + + :param `inputs_to_test`: A list of valid inputs to give to the prompt displayed in get_user_filters + :param `expected_outputs`: A list of expected outputs corresponding to the inputs provided in + `inputs_to_test`. 
Each expected output should be a list + """ + # Ensure the number of inputs matches the number of outputs + if len(inputs_to_test) != len(expected_outputs): + raise ValueError("The run_get_user_filters_test method requires that both arguments are the same length.") + + # Redirect the input prompt to be stored in mock_input and not displayed in stdout + with patch("builtins.input", side_effect=inputs_to_test) as mock_input: + for expected_output in expected_outputs: + # We use patch here to keep stdout from get_user_filters from being displayed + with patch("sys.stdout"): + # Run the method we're testing and capture the result + result = self.detailed_status_obj.get_user_filters() + + # Make sure the prompt is called with the initial prompt message + mock_input.assert_called_with("How would you like to filter the tasks? ") + + # Ensure the result matches the expected output + self.assertEqual(result, expected_output) + + def run_invalid_get_user_filters_test(self, inputs_to_test: List[str]): + """ + This will pass every input in `inputs_to_test` to the get_user_filters + method. All of the inputs in `inputs_to_test` should be invalid except + for the final one. We'll capture the output from stdout and look to make + sure the correct number of "invalid input" messages were displayed. 
+ + :param `inputs_to_test`: A list of invalid inputs (except for the last input) + to give to the prompt displayed in get_user_filters + """ + # Create a variable to store the captured stdout + captured_output = StringIO() + + # Redirect the input prompt to be stored in mock_input and not displayed in stdout + with patch("builtins.input", side_effect=inputs_to_test) as mock_input: + # We use patch here to keep stdout from get_user_filters from being displayed + with patch("sys.stdout", new=captured_output): + # Run the method we're testing (it won't return anything until we hit the valid + # exit filter so we don't save the result) + _ = self.detailed_status_obj.get_user_filters() + + # Make sure the prompt is called with the initial prompt message + mock_input.assert_called_with("How would you like to filter the tasks? ") + + # Find all occurrences of the invalid messages + all_invalid_msgs = re.findall(r"Invalid input: .*\. Input must be one of the following", captured_output.getvalue()) + + # The last input to test will be valid (so this test can exit properly) so we have + # to account for that when we check how many invalid msgs we got in our output + self.assertEqual(len(all_invalid_msgs), len(inputs_to_test) - 1) + + def test_get_user_filters_exit(self): + """ + This will test the exit input to the get_user_filters method. + """ + inputs_to_test = ["E", "EXIT", "E, EXIT"] + expected_outputs = [["E"], ["EXIT"], ["E", "EXIT"]] + self.run_get_user_filters_test(inputs_to_test, expected_outputs) + + def test_get_user_filters_task_status(self): + """ + This will test the task status input to the get_user_filters method. + """ + inputs_to_test = ["FAILED", "CANCELLED", "FAILED, CANCELLED"] + expected_outputs = [["FAILED"], ["CANCELLED"], ["FAILED", "CANCELLED"]] + self.run_get_user_filters_test(inputs_to_test, expected_outputs) + + def test_get_user_filters_return_codes(self): + """ + This will test the return codes input to the get_user_filters method. 
+ """ + inputs_to_test = ["SOFT_FAIL", "STOP_WORKERS", "SOFT_FAIL, STOP_WORKERS"] + expected_outputs = [["SOFT_FAIL"], ["STOP_WORKERS"], ["SOFT_FAIL", "STOP_WORKERS"]] + self.run_get_user_filters_test(inputs_to_test, expected_outputs) + + def test_get_user_filters_max_tasks(self): + """ + This will test the max tasks input to the get_user_filters method. + """ + inputs_to_test = ["MAX_TASKS"] + expected_outputs = [["MAX_TASKS"]] + self.run_get_user_filters_test(inputs_to_test, expected_outputs) + + def test_get_user_filters_combination(self): + """ + This will test a combination of filters as inputs to the get_user_filters method. + """ + inputs_to_test = [ + "CANCELLED, SOFT_FAIL", # testing return code and task status being used together + "STOP_WORKERS, MAX_TASKS", # testing return code and max tasks being used together + "STOP_WORKERS, EXIT", # testing return code and exit being used together + "FAILED, MAX_TASKS", # testing task status and max tasks being used together + "CANCELLED, EXIT", # testing task status and exit being used together + "MAX_TASKS, EXIT", # testing max tasks and exit being used together + ] + expected_outputs = [ + ["CANCELLED", "SOFT_FAIL"], + ["STOP_WORKERS", "MAX_TASKS"], + ["STOP_WORKERS", "EXIT"], + ["FAILED", "MAX_TASKS"], + ["CANCELLED", "EXIT"], + ["MAX_TASKS", "EXIT"], + ] + self.run_get_user_filters_test(inputs_to_test, expected_outputs) + + def test_get_user_filters_only_invalid_inputs(self): + """ + This will test sending invalid inputs to the prompt that the get_user_filters + method displays. The last input we send will be a valid exit input in order + to get the test to exit in a clean manner. 
+ """ + inputs_to_test = [ + "MAX_TASK", # test single invalid input + "fail, cancel", # test two invalid inputs together (should only raise one invalid input message) + "", # test empty input + "FAILED CANCELLED", # test two valid inputs not separated by comma + "E", # this one is valid and we'll use it to exit + ] + self.run_invalid_get_user_filters_test(inputs_to_test) + + def test_get_user_filters_invalid_with_valid_inputs(self): + """ + This will test sending invalid inputs to the prompt that the get_user_filters + method displays alongside valid inputs. The last input we send will be a valid + exit input in order to get the test to exit in a clean manner. + """ + inputs_to_test = [ + "MAX_TASKS, INVALID", # test invalid input with max tasks + "failed, invalid", # test invalid input with task status + "stop_workers, invalid", # test invalid input with return code + "SUCCESS, FAILED, INVALID, MAX_TASKS", # test a combination of all filters with an invalid one + "E", # this one is valid and we'll use it to exit + ] + self.run_invalid_get_user_filters_test(inputs_to_test) + + ############################################### + # Testing get_user_max_tasks() + ############################################### + + # There are 19 tasks in total for the status tests. Here, 1 is an edge + # case. Any positive number is valid (even a number greater than 19) + @patch("builtins.input", side_effect=["1", "10", "20"]) + def test_get_user_max_tasks_valid_inputs(self, mock_input: MagicMock): + """ + This will test sending valid inputs to the get_user_max_tasks method. 
+ + :param `mock_input`: A MagicMock object to send inputs to the prompt + """ + expected_outputs = [1, 10, 20] + for expected_output in expected_outputs: + # We use patch here to keep stdout from get_user_tasks from being displayed + with patch("sys.stdout"): + # Run the method we're testing and save the result + result = self.detailed_status_obj.get_user_max_tasks() + + # Make sure the prompt is called with the correct prompt message + mock_input.assert_called_with("What limit would you like to set? (must be an integer greater than 0) ") + # Ensure we get correct output + self.assertEqual(result, expected_output) + + # '1' is a valid input and we'll use that to exit safely from this test + @patch("builtins.input", side_effect=["0", "-1", "1.5", "a", "1"]) + def test_get_user_max_tasks_invalid_inputs(self, mock_input: MagicMock): + """ + This will test sending valid inputs to the get_user_max_tasks method. + + :param `mock_input`: A MagicMock object to send inputs to the prompt + """ + captured_output = StringIO() + # We use patch here to capture the stdout from get_user_max_tasks + with patch("sys.stdout", new=captured_output): + # Run the method we're testing (it won't return anything until we hit the valid + # filter so we don't save the result) + self.detailed_status_obj.get_user_max_tasks() + + # Make sure the prompt is called with the correct prompt message + mock_input.assert_called_with("What limit would you like to set? (must be an integer greater than 0) ") + + # There should be 4 "invalid input" messages so make sure there are + all_invalid_msgs = re.findall( + r"Invalid input. 
The limit must be an integer greater than 0.", captured_output.getvalue() + ) + self.assertEqual(len(all_invalid_msgs), 4) + + ############################################### + # Testing display() + ############################################### + + @patch("builtins.input", side_effect=["c"]) + def test_display_ync_prompt_c(self, mock_input: MagicMock): + """ + Test the first prompt that's given when you ask for detailed + status with the disable pager and there's a bunch of tasks. In + this test we're just cancelling the display (i.e. inputting 'c'). + + :param `mock_input`: A MagicMock object to send inputs to the prompt + """ + # We have to set the no_prompts argument to False or else this won't work + self.detailed_status_obj.args.no_prompts = False + + captured_output = StringIO() + with patch("sys.stdout", new=captured_output): + # Setting display to test mode will change the limit before a + # prompt is shown from 250 to 15 + self.detailed_status_obj.display(test_mode=True) + + # Ensure the display y/n/c prompt was given + mock_input.assert_called_once_with( + "About to display 19 tasks without a pager. Would you like to apply additional filters? (y/n/c) " + ) + + # Ensure the display was cancelled + assert "Cancelling status display." in captured_output.getvalue() + + @patch("builtins.input", side_effect=["n"]) + def test_display_ync_prompt_n(self, mock_input: MagicMock): + """ + Test the first prompt that's given when you ask for detailed + status with the disable pager and there's a bunch of tasks. In + this test we're telling the prompt that we don't want to apply + additional filters (i.e. inputting 'n'). 
+ + :param `mock_input`: A MagicMock object to send inputs to the prompt + """ + # We have to set the no_prompts argument to False or else this won't work + self.detailed_status_obj.args.no_prompts = False + + captured_output = StringIO() + with patch("sys.stdout", new=captured_output): + # Setting display to test mode will change the limit before a + # prompt is shown from 250 to 15 + self.detailed_status_obj.display(test_mode=True) + + # Ensure the display y/n/c prompt was given + mock_input.assert_called_once_with( + "About to display 19 tasks without a pager. Would you like to apply additional filters? (y/n/c) " + ) + + # Ensure the display was told not to apply anymore filters + assert "Not filtering further. Displaying 19 tasks..." in captured_output.getvalue() + + # Ensure the requested_statuses dict holds all statuses still + self.assertEqual(self.detailed_status_obj.requested_statuses, status_test_variables.ALL_REQUESTED_STATUSES) + + @patch("builtins.input", side_effect=["y", "e", "c"]) + def test_display_ync_prompt_y(self, mock_input: MagicMock): + """ + Test the first prompt that's given when you ask for detailed + status with the disable pager and there's a bunch of tasks. In + this test we're telling the prompt that we do want to apply + additional filters (i.e. inputting 'y'). 
+ + The input chain is as follows: + The prompt will first ask if we want to filter further and we'll input + 'y' -> this takes us to the second input asking how we want to filter + and we'll input 'e' to exit -> this will take us back to the first prompt + and we'll enter 'c' to cancel the display operation + + :param `mock_input`: A MagicMock object to send inputs to the prompt + """ + # We have to set the no_prompts argument to False or else this won't work + self.detailed_status_obj.args.no_prompts = False + + with patch("sys.stdout"): + # Setting display to test mode will change the limit before a + # prompt is shown from 250 to 15 + self.detailed_status_obj.display(test_mode=True) + + # There should be 3 input calls: the initial prompt, the next prompt after entering + # 'y', and then going back to the initial prompt after entering 'e' to exit + self.assertEqual(len(mock_input.mock_calls), 3) + + # Create the list of calls that should be made (this is in sequential order; the order DOES matter here) + initial_prompt = "About to display 19 tasks without a pager. Would you like to apply additional filters? (y/n/c) " + secondary_prompt = "How would you like to filter the tasks? " + calls = [call(initial_prompt), call(secondary_prompt), call(initial_prompt)] + + # Ensure the correct calls have been made + mock_input.has_calls(calls) + + @patch("builtins.input", side_effect=["a", "0", "", "c"]) + def test_display_ync_prompt_invalid_inputs(self, mock_input: MagicMock): + """ + Test the first prompt that's given when you ask for detailed + status with the disable pager and there's a bunch of tasks. In + this test we're testing against invalid inputs and finishing the + test off by inputting 'c' to cancel the display. 
+ + :param `mock_input`: A MagicMock object to send inputs to the prompt + """ + # We have to set the no_prompts argument to False or else this won't work + self.detailed_status_obj.args.no_prompts = False + + with patch("sys.stdout"): + # Setting display to test mode will change the limit before a + # prompt is shown from 250 to 15 + self.detailed_status_obj.display(test_mode=True) + + # The call order should have the initial prompt followed by an invalid prompt for each + # of our 3 invalid inputs ('a', '0', and '') + initial_prompt = "About to display 19 tasks without a pager. Would you like to apply additional filters? (y/n/c) " + invalid_prompt = "Invalid input. You must enter either 'y' for yes, 'n' for no, or 'c' for cancel: " + calls = [call(initial_prompt)] + [call(invalid_prompt)] * 3 + + # Ensure the mock_input has the correct calls in the correct order + mock_input.assert_has_calls(calls) + + ############################################### + # Testing display(), filter_via_prompts(), + # get_user_filters(), and get_user_max_tasks() + # + # Sort of an integration test but all of these + # methods revolve around display + ############################################### + + @patch("builtins.input", side_effect=["y", "FAILED, STOP_WORKERS"]) + def test_display_full_filter_process(self, mock_input: MagicMock): + """ + This test will run through the prompts given to users when they disable + the pager and there are a bunch of tasks to display. This will test a + run with no invalid inputs (each method has been individually tested above + for invalid inputs). + + This should pull up two prompts: the first asking if we want to apply + additional filters which we'll input 'y' to -> the second asking us + how we'd like to filter, which we'll input 'FAILED, STOP_WORKERS' to. + This uses both the task_status and return_code filters to ask for + any failed or cancelled tasks we have. 
+ + :param `mock_input`: A MagicMock object to send inputs to the prompt + """ + # We have to set the no_prompts argument to False or else this won't work + self.detailed_status_obj.args.no_prompts = False + + with patch("sys.stdout"): + # Setting display to test mode will change the limit before a + # prompt is shown from 250 to 15 + self.detailed_status_obj.display(test_mode=True) + + # The call order should have the initial prompt followed by a prompt asking how + # we want to filter our tasks (this is in a specific order) + initial_prompt = "About to display 19 tasks without a pager. Would you like to apply additional filters? (y/n/c) " + secondary_prompt = "How would you like to filter the tasks? " + calls = [call(initial_prompt), call(secondary_prompt)] + + # Ensure the mock_input has the correct calls in the correct order + mock_input.assert_has_calls(calls) + + # Ensure the requested_statuses dict holds all failed and cancelled tasks + self.assertEqual(self.detailed_status_obj.requested_statuses, status_test_variables.REQUESTED_STATUSES_FAIL_AND_CANCEL) + + @patch("builtins.input", side_effect=["y", "SUCCESS, MAX_TASKS", "3"]) + def test_display_full_filter_process_max_tasks(self, mock_input: MagicMock): + """ + This test will run through the prompts given to users when they disable + the pager and there are a bunch of tasks to display. This will test a + run with no invalid inputs (each method has been individually tested above + for invalid inputs). 
+ + This should pull up three prompts: the first asking if we want to apply + additional filters which we'll input 'y' to -> the second asking us + how we'd like to filter, which we'll input 'SUCCESS, MAX_TASKS' to -> + the third and final asking us what limit we'd like to set for the max_tasks + value + + :param `mock_input`: A MagicMock object to send inputs to the prompt + """ + # We have to set the no_prompts argument to False or else this won't work + self.detailed_status_obj.args.no_prompts = False + + with patch("sys.stdout"): + # Setting display to test mode will change the limit before a + # prompt is shown from 250 to 15 + self.detailed_status_obj.display(test_mode=True) + + # The call order should have the initial prompt followed by a prompt asking how + # we want to filter our tasks followed by a prompt asking us what limit we'd + # like to set (this is in a specific order) + initial_prompt = "About to display 19 tasks without a pager. Would you like to apply additional filters? (y/n/c) " + secondary_prompt = "How would you like to filter the tasks? " + tertiary_prompt = "What limit would you like to set? (must be an integer greater than 0) " + calls = [call(initial_prompt), call(secondary_prompt), call(tertiary_prompt)] + + # Ensure the mock_input has the correct calls in the correct order + mock_input.assert_has_calls(calls) + + # Ensure the requested_statuses dict holds only 3 successful tasks + self.assertEqual(self.detailed_status_obj.num_requested_statuses, 3) + + +class TestFilterApplication(TestBaseDetailedStatus): + """ + This class is strictly for testing that filters are applied correctly. + + The types of filters are: + steps, max_tasks, return_code, task_status, task_queues, and workers. + + By the time filters are applied in the execution process, the filters + will have already been verified so we don't need to check against invalid + inputs (that's what the TestFilterVerification class is for). 
+ + This class will test 3 methods: get_steps_to_display (this applies the + steps, task_queues, and workers filters), apply_filters (this applies the + return_code and task_status filters), and apply_max_tasks_limit (this + applies the max_tasks filter). + """ + + def test_apply_default_steps(self): + """ + This will test the default application of the steps filter. When the + detailed_status_obj variable is created in setUp, the default value + for steps will already be being used, and the get_steps_to_display method + will be called upon initialization. Therefore, we can just ensure it was + processed correctly without needing to directly call it. + """ + # The steps arg is expanded from ["all"] to a list of every step name upon class creation + self.assertEqual( + self.detailed_status_obj.args.steps, + ["just_samples", "just_parameters", "params_and_samples", "fail_step", "cancel_step", "unstarted_step"], + ) + + # The step_tracker dict should have every step here + step_tracker_diff = DeepDiff( + status_test_variables.FULL_STEP_TRACKER, self.detailed_status_obj.step_tracker, ignore_order=True + ) + self.assertEqual(step_tracker_diff, {}) + + def run_get_steps_to_display_test(self, expected_step_tracker: Dict): + """ + A helper method to combine similar code for the get_steps_to_display tests. + This is where the get_steps_to_display method is actually called and tested against. + """ + # Call get_steps_to_display to get the step_tracker object and make sure it matches the expected output + step_tracker_diff = DeepDiff(expected_step_tracker, self.detailed_status_obj.get_steps_to_display(), ignore_order=True) + self.assertEqual(step_tracker_diff, {}) + + def test_apply_single_step(self): + """ + This tests the application of the steps filter with only one step. 
+ """ + # Modify the steps argument and create the expected output + self.detailed_status_obj.args.steps = ["just_parameters"] + expected_step_tracker = {"started_steps": ["just_parameters"], "unstarted_steps": []} + + # Run the test + self.run_get_steps_to_display_test(expected_step_tracker) + + def test_apply_multiple_steps(self): + """ + This tests the application of the steps filter with multiple steps. + """ + # Modify the steps argument and create the expected output + self.detailed_status_obj.args.steps = ["just_parameters", "just_samples", "fail_step"] + expected_step_tracker = {"started_steps": ["just_parameters", "just_samples", "fail_step"], "unstarted_steps": []} + + # Run the test + self.run_get_steps_to_display_test(expected_step_tracker) + + def test_apply_single_task_queue(self): + """ + This tests the application of the task_queues filter with only one task queue. + """ + # Modify the task_queues argument and create the expected output + self.detailed_status_obj.args.task_queues = ["just_parameters_queue"] + expected_step_tracker = {"started_steps": ["just_parameters"], "unstarted_steps": []} + + # We need to reset steps to "all" otherwise this test won't work + self.detailed_status_obj.args.steps = ["all"] + + # Run the test + self.run_get_steps_to_display_test(expected_step_tracker) + + def test_apply_multiple_task_queues(self): + """ + This tests the application of the task_queues filter with multiple task queues. 
+ """ + # Modify the task_queues argument and create the expected output + self.detailed_status_obj.args.task_queues = ["just_parameters_queue", "just_samples_queue", "fail_queue"] + expected_step_tracker = {"started_steps": ["just_parameters", "just_samples", "fail_step"], "unstarted_steps": []} + + # We need to reset steps to "all" otherwise this test won't work + self.detailed_status_obj.args.steps = ["all"] + + # Run the test + self.run_get_steps_to_display_test(expected_step_tracker) + + def test_apply_single_worker(self): + """ + This tests the application of the workers filter with only one worker. + """ + # Modify the workers argument and create the expected output + self.detailed_status_obj.args.workers = ["sample_worker"] + expected_step_tracker = {"started_steps": ["just_samples", "params_and_samples"], "unstarted_steps": []} + + # We need to reset steps to "all" otherwise this test won't work + self.detailed_status_obj.args.steps = ["all"] + + # Run the test + self.run_get_steps_to_display_test(expected_step_tracker) + + def test_apply_multiple_workers(self): + """ + This tests the application of the workers filter with multiple worker. + """ + # Modify the workers argument and create the expected output + self.detailed_status_obj.args.workers = ["sample_worker", "other_worker"] + + # We need to reset steps to "all" otherwise this test won't work + self.detailed_status_obj.args.steps = ["all"] + + # Run the test + self.run_get_steps_to_display_test(status_test_variables.FULL_STEP_TRACKER) + + def test_apply_max_tasks(self): + """ + The max_tasks filter has no default to test against as the default value is None + and will not trigger the apply_max_task_limit method. We'll test the application of this + method here by modifying max tasks and calling it. This method will modify the + requested_statuses dict so we'll check against that. 
+ """ + # Set the max_tasks limit and apply it + self.detailed_status_obj.args.max_tasks = 3 + self.detailed_status_obj.apply_max_tasks_limit() + + # Ensure the max_tasks limit was applied to the requested_statuses + self.assertEqual(self.detailed_status_obj.num_requested_statuses, 3) + + def run_apply_filters_test(self, expected_requested_statuses: Dict): + """ + A helper method to combine similar code for the apply_filters tests. + The apply_filters method is tested here as a side effect of calling + load_requested_statuses. + """ + # Apply any filter given before this method was called + self.detailed_status_obj.load_requested_statuses() + + # Ensure the requested statuses are as expected + requested_statuses_diff = DeepDiff( + expected_requested_statuses, self.detailed_status_obj.requested_statuses, ignore_order=True + ) + self.assertEqual(requested_statuses_diff, {}) + + def test_apply_single_return_code(self): + """ + This tests the application of the return_code filter with only one return codes. + """ + # Set the return code filter and run the test + self.detailed_status_obj.args.return_code = ["SOFT_FAIL"] + self.run_apply_filters_test(status_test_variables.REQUESTED_STATUSES_JUST_FAILED_STEP) + + def test_apply_multiple_return_codes(self): + """ + This tests the application of the return_code filter with multiple return codes. + """ + # Set the return code filter and run the test + self.detailed_status_obj.args.return_code = ["SOFT_FAIL", "STOP_WORKERS"] + self.run_apply_filters_test(status_test_variables.REQUESTED_STATUSES_FAIL_AND_CANCEL) + + def test_apply_single_task_status(self): + """ + This tests the application of the task_status filter with only one task status. 
# Set the task status filter and run the test
+ """ + # Test single valid step + valid_step = ["just_samples"] + self.detailed_status_obj.args.steps = valid_step + # Calling verify_filter_args should not change anything here + self.detailed_status_obj._verify_filter_args() + self.assertEqual(self.detailed_status_obj.args.steps, valid_step) + + # Test multiple valid steps + valid_steps = ["just_samples", "just_parameters"] + self.detailed_status_obj.args.steps = valid_steps + # Calling verify_filter_args should not change anything here + self.detailed_status_obj._verify_filter_args() + self.assertEqual(self.detailed_status_obj.args.steps, valid_steps) + + def test_verify_filter_args_invalid_steps(self): + """ + Test the verification process of the steps filter using invalid steps. + This covers part of the _verify_filter_args method and one use case of the + _verify_filters method that is called by _verify_filter_args. + """ + # Testing "invalid_step" as only step + self.detailed_status_obj.args.steps = ["invalid_step"] + # Calling verify_filter_args should remove "invalid_step" + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.steps, []) + + # Testing "invalid_step" as first step + self.detailed_status_obj.args.steps = ["invalid_step", "just_samples"] + # Calling verify_filter_args should allow "just_samples" to stay but not "invalid_step" + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.steps, ["just_samples"]) + + # Testing "invalid_step" as last step + self.detailed_status_obj.args.steps = ["just_samples", "invalid_step"] + # Calling verify_filter_args should allow "just_samples" to stay but not "invalid_step" + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.steps, ["just_samples"]) + + # Testing "invalid_step" as middle step + self.detailed_status_obj.args.steps = ["just_samples", "invalid_step", 
"just_parameters"] + # Calling verify_filter_args should allow "just_samples" and "just_parameters" to stay but not "invalid_step" + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.steps, ["just_samples", "just_parameters"]) + + # Testing multiple invalid steps + self.detailed_status_obj.args.steps = ["just_samples", "invalid_step_1", "just_parameters", "invalid_step_2"] + # Calling verify_filter_args should allow only "just_samples" and "just_parameters" to stay + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.steps, ["just_samples", "just_parameters"]) + + def test_verify_filter_args_no_steps(self): + """ + Test the verification process of the steps filter using no steps. I don't think this + is even possible to get passed to the DetailedStatus object but we'll test it just in + case. This covers part of the _verify_filter_args method and one use case of the + _verify_filters method that is called by _verify_filter_args. + """ + # Modify the steps filter so we can re-run the verify_filters_args with this filter + self.detailed_status_obj.args.steps = [] + + # Calling verify_filter_args should just keep the empty list + self.detailed_status_obj._verify_filter_args() + self.assertEqual(self.detailed_status_obj.args.steps, []) + + def test_verify_filter_args_valid_max_tasks(self): + """ + Test the verification process of the max_tasks filter using a valid max_tasks value. + This covers part of the _verify_filter_args method and one use case of the + _verify_filters method that is called by _verify_filter_args. 
+ """ + # Test valid max tasks + valid_max_tasks = 12 + self.detailed_status_obj.args.max_tasks = valid_max_tasks + # Calling verify_filter_args should not change anything here + self.detailed_status_obj._verify_filter_args() + self.assertEqual(self.detailed_status_obj.args.max_tasks, valid_max_tasks) + + def test_verify_filter_args_invalid_max_tasks(self): + """ + Test the verification process of the max_tasks filter using invalid max_tasks + values. We don't need to test for too high of a value since the code will + automatically reset the value to however large requested_statuses is. + This covers part of the _verify_filter_args method and one use case of the + _verify_filters method that is called by _verify_filter_args. + """ + # Testing negative max_tasks value + self.detailed_status_obj.args.max_tasks = -1 + # Calling verify_filter_args should reset max_tasks to None + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.max_tasks, None) + + # Testing max_tasks value of zero (edge case) + self.detailed_status_obj.args.max_tasks = 0 + # Calling verify_filter_args should reset max_tasks to None + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.max_tasks, None) + + # Testing max_tasks value that's not an integer + self.detailed_status_obj.args.max_tasks = 1.5 + # Calling verify_filter_args should reset max_tasks to None + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.max_tasks, None) + + def test_verify_filter_args_valid_task_status(self): + """ + Test the verification process of the task_status filter using valid task_status values. + This covers part of the _verify_filter_args method and one use case of the + _verify_filters method that is called by _verify_filter_args. 
+ """ + # Test single valid task status + valid_task_status = ["FINISHED"] + self.detailed_status_obj.args.task_status = valid_task_status + # Calling verify_filter_args should not change anything here + self.detailed_status_obj._verify_filter_args() + self.assertEqual(self.detailed_status_obj.args.task_status, valid_task_status) + + # Test multiple valid task statuses + valid_task_statuses = ["FINISHED", "FAILED", "CANCELLED"] + self.detailed_status_obj.args.task_status = valid_task_statuses + # Calling verify_filter_args should not change anything here + self.detailed_status_obj._verify_filter_args() + self.assertEqual(self.detailed_status_obj.args.task_status, valid_task_statuses) + + def test_verify_filter_args_invalid_task_status(self): + """ + Test the verification process of the task_status filter using invalid task_status values. + This covers part of the _verify_filter_args method and one use case of the + _verify_filters method that is called by _verify_filter_args. + """ + # Testing a single invalid filter + self.detailed_status_obj.args.task_status = ["INVALID"] + # Calling verify_filter_args should remove the invalid filter + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.task_status, []) + + # Testing invalid filter as first filter + self.detailed_status_obj.args.task_status = ["INVALID", "DRY_RUN"] + # Calling verify_filter_args should only allow "DRY_RUN" to remain + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.task_status, ["DRY_RUN"]) + + # Testing invalid filter as last filter + self.detailed_status_obj.args.task_status = ["UNKNOWN", "INVALID"] + # Calling verify_filter_args should only allow "UNKNOWN" to remain + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.task_status, ["UNKNOWN"]) + + # Testing invalid filter as middle filter + 
self.detailed_status_obj.args.task_status = ["INITIALIZED", "INVALID", "RUNNING"] + # Calling verify_filter_args should only allow "INITIALIZED" and "RUNNING" to remain + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.task_status, ["INITIALIZED", "RUNNING"]) + + # Testing multiple invalid filters + self.detailed_status_obj.args.task_status = ["INVALID_1", "CANCELLED", "INVALID_2"] + # Calling verify_filter_args should only allow "CANCELLED" to remain + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.task_status, ["CANCELLED"]) + + def test_verify_filter_args_no_task_status(self): + """ + Test the verification process of the task_status filter using no task_status. I don't think + this is even possible to get passed to the DetailedStatus object but we'll test it just in + case. This covers part of the _verify_filter_args method and one use case of the + _verify_filters method that is called by _verify_filter_args. + """ + # Testing empty task status filter + self.detailed_status_obj.args.task_status = [] + + # Calling verify_filter_args should just keep the empty list + self.detailed_status_obj._verify_filter_args() + self.assertEqual(self.detailed_status_obj.args.task_status, []) + + def test_verify_filter_args_valid_return_code(self): + """ + Test the verification process of the task_status filter using valid task_status values. + This covers part of the _verify_filter_args method and one use case of the + _verify_filters method that is called by _verify_filter_args. 
+ """ + # Test single valid task status + valid_return_code = ["SUCCESS"] + self.detailed_status_obj.args.return_code = valid_return_code + # Calling verify_filter_args should not change anything here + self.detailed_status_obj._verify_filter_args() + self.assertEqual(self.detailed_status_obj.args.return_code, valid_return_code) + + # Test multiple valid task statuses + valid_return_codes = ["SOFT_FAIL", "DRY_SUCCESS", "SUCCESS"] + self.detailed_status_obj.args.return_code = valid_return_codes + # Calling verify_filter_args should not change anything here + self.detailed_status_obj._verify_filter_args() + self.assertEqual(self.detailed_status_obj.args.return_code, valid_return_codes) + + def test_verify_filter_args_invalid_return_code(self): + """ + Test the verification process of the return_code filter using invalid return_code values. + This covers part of the _verify_filter_args method and one use case of the + _verify_filters method that is called by _verify_filter_args. + """ + # Testing a single invalid filter + self.detailed_status_obj.args.return_code = ["INVALID"] + # Calling verify_filter_args should remove the invalid filter + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.return_code, []) + + # Testing invalid filter as first filter + self.detailed_status_obj.args.return_code = ["INVALID", "SOFT_FAIL"] + # Calling verify_filter_args should only allow "SOFT_FAIL" to remain + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.return_code, ["SOFT_FAIL"]) + + # Testing invalid filter as last filter + self.detailed_status_obj.args.return_code = ["HARD_FAIL", "INVALID"] + # Calling verify_filter_args should only allow "HARD_FAIL" to remain + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.return_code, ["HARD_FAIL"]) + + # Testing invalid filter as middle 
filter + self.detailed_status_obj.args.return_code = ["STOP_WORKERS", "INVALID", "UNRECOGNIZED"] + # Calling verify_filter_args should only allow "STOP_WORKERS" and "UNRECOGNIZED" to remain + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.return_code, ["STOP_WORKERS", "UNRECOGNIZED"]) + + # Testing multiple invalid filters + self.detailed_status_obj.args.return_code = ["INVALID_1", "SUCCESS", "INVALID_2"] + # Calling verify_filter_args should only allow "SUCCESS" to remain + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.return_code, ["SUCCESS"]) + + def test_verify_filter_args_no_return_code(self): + """ + Test the verification process of the return_code filter using no return_code. I don't think + this is even possible to get passed to the DetailedStatus object but we'll test it just in + case. This covers part of the _verify_filter_args method and one use case of the + _verify_filters method that is called by _verify_filter_args. + """ + # Testing empty return code filter + self.detailed_status_obj.args.return_code = [] + + # Calling verify_filter_args should just keep the empty list + self.detailed_status_obj._verify_filter_args() + self.assertEqual(self.detailed_status_obj.args.return_code, []) + + def test_verify_filter_args_valid_task_queue(self): + """ + Test the verification process of the task_queues filter using valid task_queues values. + This covers part of the _verify_filter_args method and one use case of the + _verify_filters method that is called by _verify_filter_args. 
+ """ + # Test single valid task status + valid_task_queue = ["just_samples_queue"] + self.detailed_status_obj.args.task_queues = valid_task_queue + # Calling verify_filter_args should not change anything here + self.detailed_status_obj._verify_filter_args() + self.assertEqual(self.detailed_status_obj.args.task_queues, valid_task_queue) + + # Test multiple valid task statuses + valid_task_queues = ["just_samples_queue", "just_parameters_queue", "both_queue"] + self.detailed_status_obj.args.task_queues = valid_task_queues + # Calling verify_filter_args should not change anything here + self.detailed_status_obj._verify_filter_args() + self.assertEqual(self.detailed_status_obj.args.task_queues, valid_task_queues) + + def test_verify_filter_args_invalid_task_queue(self): + """ + Test the verification process of the task_queues filter using invalid task_queues values. + This covers part of the _verify_filter_args method and one use case of the + _verify_filters method that is called by _verify_filter_args. 
+ """ + # Testing a single invalid filter + self.detailed_status_obj.args.task_queues = ["invalid_queue"] + # Calling verify_filter_args should remove the invalid filter + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.task_queues, []) + + # Testing invalid filter as first filter + self.detailed_status_obj.args.task_queues = ["invalid_queue", "unstarted_queue"] + # Calling verify_filter_args should only allow "unstarted_queue" to remain + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.task_queues, ["unstarted_queue"]) + + # Testing invalid filter as last filter + self.detailed_status_obj.args.task_queues = ["fail_queue", "invalid_queue"] + # Calling verify_filter_args should only allow "fail_queue" to remain + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.task_queues, ["fail_queue"]) + + # Testing invalid filter as middle filter + self.detailed_status_obj.args.task_queues = ["cancel_queue", "invalid_queue", "both_queue"] + # Calling verify_filter_args should only allow "cancel_queue" and "both_queue" to remain + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.task_queues, ["cancel_queue", "both_queue"]) + + # Testing multiple invalid filters + self.detailed_status_obj.args.task_queues = ["invalid_queue_1", "just_samples_queue", "invalid_queue_2"] + # Calling verify_filter_args should only allow "just_samples_queue" to remain + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.task_queues, ["just_samples_queue"]) + + def test_verify_filter_args_no_task_queue(self): + """ + Test the verification process of the task_queues filter using no task_queues. 
I don't think + this is even possible to get passed to the DetailedStatus object but we'll test it just in + case. This covers part of the _verify_filter_args method and one use case of the + _verify_filters method that is called by _verify_filter_args. + """ + # Testing empty task queues filter + self.detailed_status_obj.args.task_queues = [] + + # Calling verify_filter_args should just keep the empty list + self.detailed_status_obj._verify_filter_args() + self.assertEqual(self.detailed_status_obj.args.task_queues, []) + + def test_verify_filter_args_valid_workers(self): + """ + Test the verification process of the workers filter using valid workers values. + This covers part of the _verify_filter_args method and one use case of the + _verify_filters method that is called by _verify_filter_args. + """ + # Test single valid worker + valid_worker = ["sample_worker"] + self.detailed_status_obj.args.workers = valid_worker + # Calling verify_filter_args should not change anything here + self.detailed_status_obj._verify_filter_args() + self.assertEqual(self.detailed_status_obj.args.workers, valid_worker) + + # Test multiple valid workers + valid_workers = ["sample_worker", "other_worker"] + self.detailed_status_obj.args.workers = valid_workers + # Calling verify_filter_args should not change anything here + self.detailed_status_obj._verify_filter_args() + self.assertEqual(self.detailed_status_obj.args.workers, valid_workers) + + def test_verify_filter_args_invalid_workers(self): + """ + Test the verification process of the workers filter using invalid workers values. + This covers part of the _verify_filter_args method and one use case of the + _verify_filters method that is called by _verify_filter_args. 
+ """ + # Testing a single invalid filter + self.detailed_status_obj.args.workers = ["invalid_worker"] + # Calling verify_filter_args should remove the invalid filter + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.workers, []) + + # Testing invalid filter as first filter + self.detailed_status_obj.args.workers = ["invalid_worker", "sample_worker"] + # Calling verify_filter_args should only allow "sample_worker" to remain + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.workers, ["sample_worker"]) + + # Testing invalid filter as last filter + self.detailed_status_obj.args.workers = ["sample_worker", "invalid_worker"] + # Calling verify_filter_args should only allow "sample_worker" to remain + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.workers, ["sample_worker"]) + + # Testing invalid filter as middle filter + self.detailed_status_obj.args.workers = ["sample_worker", "invalid_worker", "other_worker"] + # Calling verify_filter_args should only allow "sample_worker" and "other_worker" to remain + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.workers, ["sample_worker", "other_worker"]) + + # Testing multiple invalid filters + self.detailed_status_obj.args.workers = ["invalid_worker_1", "sample_worker", "invalid_worker_2"] + # Calling verify_filter_args should only allow "sample_worker" to remain + self.detailed_status_obj._verify_filter_args(suppress_warnings=True) + self.assertEqual(self.detailed_status_obj.args.workers, ["sample_worker"]) + + def test_verify_filter_args_no_workers(self): + """ + Test the verification process of the workers filter using no workers. I don't think + this is even possible to get passed to the DetailedStatus object but we'll test it just in + case. 
This covers part of the _verify_filter_args method and one use case of the + _verify_filters method that is called by _verify_filter_args. + """ + # Testing empty workers filter + self.detailed_status_obj.args.workers = [] + + # Calling verify_filter_args should just keep the empty list + self.detailed_status_obj._verify_filter_args() + self.assertEqual(self.detailed_status_obj.args.workers, []) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/study/test_status.py b/tests/unit/study/test_status.py index b97f7eeb2..ae96c8111 100644 --- a/tests/unit/study/test_status.py +++ b/tests/unit/study/test_status.py @@ -38,7 +38,7 @@ import yaml from deepdiff import DeepDiff -from merlin.main import get_merlin_spec_with_override +from merlin.spec.expansion import get_spec_with_expansion from merlin.study.status import Status from tests.unit.study.status_test_files import shared_tests, status_test_variables @@ -86,7 +86,7 @@ def setUp(self): subparsers="status", level="INFO", detailed=False, - variables=None, + output_path=None, task_server="celery", cb_help=False, dump=None, @@ -101,7 +101,7 @@ def test_spec_setup_nonexistent_file(self): with self.assertRaises(ValueError): invalid_spec_path = f"{status_test_variables.PATH_TO_TEST_FILES}/nonexistent.yaml" self.args.specification = invalid_spec_path - self.args.spec_provided, _ = get_merlin_spec_with_override(self.args) + self.args.spec_provided = get_spec_with_expansion(self.args.specification) _ = Status(args=self.args, spec_display=True, file_or_ws=invalid_spec_path) def test_spec_setup_no_prompts(self): @@ -113,7 +113,7 @@ def test_spec_setup_no_prompts(self): as well as any methods covered in assert_correct_attribute_creation """ self.args.specification = status_test_variables.SPEC_PATH - self.args.spec_provided, _ = get_merlin_spec_with_override(self.args) + self.args.spec_provided = get_spec_with_expansion(self.args.specification) status_obj = Status(args=self.args, spec_display=True, 
file_or_ws=status_test_variables.SPEC_PATH) assert isinstance(status_obj, Status) @@ -126,7 +126,7 @@ def test_prompt_for_study_with_valid_input(self): """ # We need to load in the MerlinSpec object and save it to the args we'll give to Status self.args.specification = status_test_variables.SPEC_PATH - self.args.spec_provided, _ = get_merlin_spec_with_override(self.args) + self.args.spec_provided = get_spec_with_expansion(self.args.specification) # We're going to load in a status object without prompts first and then use that to call the method # that prompts the user for input @@ -140,7 +140,7 @@ def test_prompt_for_study_with_invalid_input(self): """ # We need to load in the MerlinSpec object and save it to the args we'll give to Status self.args.specification = status_test_variables.SPEC_PATH - self.args.spec_provided, _ = get_merlin_spec_with_override(self.args) + self.args.spec_provided = get_spec_with_expansion(self.args.specification) # We're going to load in a status object without prompts first and then use that to call the method # that prompts the user for input @@ -230,7 +230,7 @@ def test_csv_dump(self): """ Test the csv dump functionality. This tests both the write and append dump functionalities. The file needs to exist already for an append so it's - better to keep these tests together. This covers the format_status_for_display + better to keep these tests together. This covers the format_status_for_csv and dump methods. """ # Create the status object that we'll run tests on @@ -275,44 +275,42 @@ def test_get_runtime_avg_std_dev(self): deviation for each step. This test covers the get_runtime_avg_std_dev method. 
""" dummy_step_status = { - "dummy_step": { - "dummy_step_PARAM.1": { - "task_queue": "dummy_queue", - "worker_name": "dummy_worker", - "dummy_step/PARAM.1/00": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:02h:00m:00s", - "run_time": "0d:01h:38m:27s", # 3600 + 2280 + 27 = 5907 seconds - "restarts": 0, - }, - "dummy_step/PARAM.1/01": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:02h:00m:00s", - "run_time": "0d:01h:45m:08s", # 3600 + 2700 + 8 = 6308 seconds - "restarts": 0, - }, + "dummy_step_PARAM.1": { + "task_queue": "dummy_queue", + "worker_name": "dummy_worker", + "dummy_step/PARAM.1/00": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:02h:00m:00s", + "run_time": "0d:01h:38m:27s", # 3600 + 2280 + 27 = 5907 seconds + "restarts": 0, }, - "dummy_step_PARAM.2": { - "task_queue": "dummy_queue", - "worker_name": "dummy_worker", - "dummy_step/PARAM.2/00": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:02h:00m:00s", - "run_time": "0d:01h:52m:33s", # 3600 + 3120 + 33 = 6753 seconds - "restarts": 0, - }, - "dummy_step/PARAM.2/01": { - "status": "FINISHED", - "return_code": "MERLIN_SUCCESS", - "elapsed_time": "0d:02h:00m:00s", - "run_time": "0d:01h:08m:40s", # 3600 + 480 + 40 = 4120 seconds - "restarts": 0, - }, + "dummy_step/PARAM.1/01": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:02h:00m:00s", + "run_time": "0d:01h:45m:08s", # 3600 + 2700 + 8 = 6308 seconds + "restarts": 0, }, - } + }, + "dummy_step_PARAM.2": { + "task_queue": "dummy_queue", + "worker_name": "dummy_worker", + "dummy_step/PARAM.2/00": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:02h:00m:00s", + "run_time": "0d:01h:52m:33s", # 3600 + 3120 + 33 = 6753 seconds + "restarts": 0, + }, + "dummy_step/PARAM.2/01": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": 
"0d:02h:00m:00s", + "run_time": "0d:01h:08m:40s", # 3600 + 480 + 40 = 4120 seconds + "restarts": 0, + }, + }, } status_obj = Status(args=self.args, spec_display=False, file_or_ws=status_test_variables.VALID_WORKSPACE_PATH) @@ -323,8 +321,8 @@ def test_get_runtime_avg_std_dev(self): expected_std_dev = "±16m:40s" # Std dev is 1000 seconds = 16m:40s # Make sure the values were calculated as expected - self.assertEqual(dummy_step_status["dummy_step"]["avg_run_time"], expected_avg) - self.assertEqual(dummy_step_status["dummy_step"]["run_time_std_dev"], expected_std_dev) + self.assertEqual(status_obj.run_time_info["dummy_step"]["avg_run_time"], expected_avg) + self.assertEqual(status_obj.run_time_info["dummy_step"]["run_time_std_dev"], expected_std_dev) if __name__ == "__main__":