diff --git a/CHANGELOG.md b/CHANGELOG.md index e2d604f78..0600422b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - This has options to filter by return code, task queues, task statuses, and workers - You can set a limit on the number of tasks to display - There are 3 options to modify the output display +- Docs for all of the monitoring commands - New file `merlin/study/status.py` dedicated to work relating to the status command - Contains the Status and DetailedStatus classes - New file `merlin/study/status_renderers.py` dedicated to formatting the output for the detailed-status command diff --git a/README.md b/README.md index 909ba869a..e0f1ca4ff 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ HPC batch systems, since it can scale to a very large number of jobs. The integrated system looks a little something like this: -a typical Merlin workflow +![A Typical Merlin Workflow](docs/assets/images/merlin_arch.png) In this example, here's how it all works: diff --git a/docs/assets/images/monitoring/monitor_for_allocation/monitor-flowchart.png b/docs/assets/images/monitoring/monitor_for_allocation/monitor-flowchart.png new file mode 100644 index 000000000..ab8d596ea Binary files /dev/null and b/docs/assets/images/monitoring/monitor_for_allocation/monitor-flowchart.png differ diff --git a/docs/assets/images/monitoring/monitor_for_allocation/status-step-2-incomplete.png b/docs/assets/images/monitoring/monitor_for_allocation/status-step-2-incomplete.png new file mode 100644 index 000000000..e9f2725f3 Binary files /dev/null and b/docs/assets/images/monitoring/monitor_for_allocation/status-step-2-incomplete.png differ diff --git a/docs/assets/images/monitoring/monitor_for_allocation/status-success.png b/docs/assets/images/monitoring/monitor_for_allocation/status-success.png new file mode 100644 index 000000000..7b48897cc Binary files /dev/null and 
b/docs/assets/images/monitoring/monitor_for_allocation/status-success.png differ diff --git a/docs/assets/images/monitoring/monitor_for_allocation/steps-demo.png b/docs/assets/images/monitoring/monitor_for_allocation/steps-demo.png new file mode 100644 index 000000000..b98b068d3 Binary files /dev/null and b/docs/assets/images/monitoring/monitor_for_allocation/steps-demo.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/active-queues.png b/docs/assets/images/monitoring/queues_and_workers/active-queues.png new file mode 100644 index 000000000..e0ea29ed8 Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/active-queues.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/connected-workers.png b/docs/assets/images/monitoring/queues_and_workers/connected-workers.png new file mode 100644 index 000000000..2f030aa71 Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/connected-workers.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/dump-csv.png b/docs/assets/images/monitoring/queues_and_workers/dump-csv.png new file mode 100644 index 000000000..88cb26173 Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/dump-csv.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/dump-json.png b/docs/assets/images/monitoring/queues_and_workers/dump-json.png new file mode 100644 index 000000000..e15729756 Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/dump-json.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/no-active-queues.png b/docs/assets/images/monitoring/queues_and_workers/no-active-queues.png new file mode 100644 index 000000000..07d1b9973 Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/no-active-queues.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/no-connected-workers.png 
b/docs/assets/images/monitoring/queues_and_workers/no-connected-workers.png new file mode 100644 index 000000000..1dec3631d Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/no-connected-workers.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/query-workers-spec-all-workers.png b/docs/assets/images/monitoring/queues_and_workers/query-workers-spec-all-workers.png new file mode 100644 index 000000000..8620ef77f Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/query-workers-spec-all-workers.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/query-workers-spec-option.png b/docs/assets/images/monitoring/queues_and_workers/query-workers-spec-option.png new file mode 100644 index 000000000..8a50700de Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/query-workers-spec-option.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/queue-info-no-vars.png b/docs/assets/images/monitoring/queues_and_workers/queue-info-no-vars.png new file mode 100644 index 000000000..a4edd0283 Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/queue-info-no-vars.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/queue-info-with-vars.png b/docs/assets/images/monitoring/queues_and_workers/queue-info-with-vars.png new file mode 100644 index 000000000..1f7f8e5b9 Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/queue-info-with-vars.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/queues-example-all-workers.png b/docs/assets/images/monitoring/queues_and_workers/queues-example-all-workers.png new file mode 100644 index 000000000..fdba79fcd Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/queues-example-all-workers.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/queues-example-filtered-workers.png 
b/docs/assets/images/monitoring/queues_and_workers/queues-example-filtered-workers.png new file mode 100644 index 000000000..0ca463a1d Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/queues-example-filtered-workers.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/specific-queues-active.png b/docs/assets/images/monitoring/queues_and_workers/specific-queues-active.png new file mode 100644 index 000000000..3ef2812d1 Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/specific-queues-active.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/specific-queues-inactive.png b/docs/assets/images/monitoring/queues_and_workers/specific-queues-inactive.png new file mode 100644 index 000000000..3d7852b9f Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/specific-queues-inactive.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/specification-option.png b/docs/assets/images/monitoring/queues_and_workers/specification-option.png new file mode 100644 index 000000000..438edb2b9 Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/specification-option.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/steps-option.png b/docs/assets/images/monitoring/queues_and_workers/steps-option.png new file mode 100644 index 000000000..885ad5a96 Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/steps-option.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/workers-option-with-regex.png b/docs/assets/images/monitoring/queues_and_workers/workers-option-with-regex.png new file mode 100644 index 000000000..ec9687110 Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/workers-option-with-regex.png differ diff --git a/docs/assets/images/monitoring/queues_and_workers/workers-option-with-worker-names.png 
b/docs/assets/images/monitoring/queues_and_workers/workers-option-with-worker-names.png new file mode 100644 index 000000000..147b29108 Binary files /dev/null and b/docs/assets/images/monitoring/queues_and_workers/workers-option-with-worker-names.png differ diff --git a/docs/assets/images/monitoring/status_cmds/ascii-error.png b/docs/assets/images/monitoring/status_cmds/ascii-error.png new file mode 100644 index 000000000..2991f55f5 Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/ascii-error.png differ diff --git a/docs/assets/images/monitoring/status_cmds/cb-help.png b/docs/assets/images/monitoring/status_cmds/cb-help.png new file mode 100644 index 000000000..3564f9acb Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/cb-help.png differ diff --git a/docs/assets/images/monitoring/status_cmds/disable-pager.png b/docs/assets/images/monitoring/status_cmds/disable-pager.png new file mode 100644 index 000000000..16ef80e6f Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/disable-pager.png differ diff --git a/docs/assets/images/monitoring/status_cmds/disable-theme.png b/docs/assets/images/monitoring/status_cmds/disable-theme.png new file mode 100644 index 000000000..8faf92bfb Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/disable-theme.png differ diff --git a/docs/assets/images/monitoring/status_cmds/dump-csv.png b/docs/assets/images/monitoring/status_cmds/dump-csv.png new file mode 100644 index 000000000..834144a0a Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/dump-csv.png differ diff --git a/docs/assets/images/monitoring/status_cmds/dump-json.png b/docs/assets/images/monitoring/status_cmds/dump-json.png new file mode 100644 index 000000000..d8e57366c Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/dump-json.png differ diff --git a/docs/assets/images/monitoring/status_cmds/filter-max-tasks.png 
b/docs/assets/images/monitoring/status_cmds/filter-max-tasks.png new file mode 100644 index 000000000..255882352 Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/filter-max-tasks.png differ diff --git a/docs/assets/images/monitoring/status_cmds/filter-prompt.png b/docs/assets/images/monitoring/status_cmds/filter-prompt.png new file mode 100644 index 000000000..efbeae2d6 Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/filter-prompt.png differ diff --git a/docs/assets/images/monitoring/status_cmds/filter-return-code.png b/docs/assets/images/monitoring/status_cmds/filter-return-code.png new file mode 100644 index 000000000..9a3d7ce4f Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/filter-return-code.png differ diff --git a/docs/assets/images/monitoring/status_cmds/filter-steps.png b/docs/assets/images/monitoring/status_cmds/filter-steps.png new file mode 100644 index 000000000..b2775b3c9 Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/filter-steps.png differ diff --git a/docs/assets/images/monitoring/status_cmds/filter-task-queues.png b/docs/assets/images/monitoring/status_cmds/filter-task-queues.png new file mode 100644 index 000000000..42afdd47a Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/filter-task-queues.png differ diff --git a/docs/assets/images/monitoring/status_cmds/filter-task-status.png b/docs/assets/images/monitoring/status_cmds/filter-task-status.png new file mode 100644 index 000000000..c1279e8a0 Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/filter-task-status.png differ diff --git a/docs/assets/images/monitoring/status_cmds/filter-workers.png b/docs/assets/images/monitoring/status_cmds/filter-workers.png new file mode 100644 index 000000000..be5519eab Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/filter-workers.png differ diff --git 
a/docs/assets/images/monitoring/status_cmds/inside-pager.png b/docs/assets/images/monitoring/status_cmds/inside-pager.png new file mode 100644 index 000000000..9a867419f Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/inside-pager.png differ diff --git a/docs/assets/images/monitoring/status_cmds/layout-table.png b/docs/assets/images/monitoring/status_cmds/layout-table.png new file mode 100644 index 000000000..d69d3575b Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/layout-table.png differ diff --git a/docs/assets/images/monitoring/status_cmds/max-tasks-prompt.png b/docs/assets/images/monitoring/status_cmds/max-tasks-prompt.png new file mode 100644 index 000000000..d8239e414 Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/max-tasks-prompt.png differ diff --git a/docs/assets/images/monitoring/status_cmds/multiple-studies.png b/docs/assets/images/monitoring/status_cmds/multiple-studies.png new file mode 100644 index 000000000..ae1f5aa14 Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/multiple-studies.png differ diff --git a/docs/assets/images/monitoring/status_cmds/outside-pager.png b/docs/assets/images/monitoring/status_cmds/outside-pager.png new file mode 100644 index 000000000..99d659645 Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/outside-pager.png differ diff --git a/docs/assets/images/monitoring/status_cmds/prompt.png b/docs/assets/images/monitoring/status_cmds/prompt.png new file mode 100644 index 000000000..f48dfa939 Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/prompt.png differ diff --git a/docs/assets/images/monitoring/status_cmds/status.png b/docs/assets/images/monitoring/status_cmds/status.png new file mode 100644 index 000000000..6f1a4212b Binary files /dev/null and b/docs/assets/images/monitoring/status_cmds/status.png differ diff --git a/docs/user_guide/command_line.md b/docs/user_guide/command_line.md index 
46bfdc7e4..30faee9ce 100644 --- a/docs/user_guide/command_line.md +++ b/docs/user_guide/command_line.md @@ -539,11 +539,124 @@ merlin stop-workers [OPTIONS] ## Monitoring Commands -The Merlin library comes equipped with commands to help monitor your workflow: +The Merlin library comes equipped with several commands to help monitor your workflow: +- *[detailed-status](#detailed-status-merlin-detailed-status)*: Display task-by-task status information for a study - *[monitor](#monitor-merlin-monitor)*: Keep your allocation alive while tasks are being processed - *[query-workers](#query-workers-merlin-query-workers)*: Communicate with Celery to view information on active workers -- *[status](#status-merlin-status)*: Communicate with Celery to view the status of queues in your workflow(s) +- *[queue-info](#queue-info-merlin-queue-info)*: Communicate with Celery to view the status of queues in your workflow(s) +- *[status](#status-merlin-status)*: Display a summary of the status of a study + +More information on all of these commands can be found below and in the [Monitoring documentation](./monitoring/index.md). + +### Detailed Status (`merlin detailed-status`) + +!!! warning + + For the pager opened by this command to work properly the `MANPAGER` or `PAGER` environment variable must be set to `less -r`. This can be set with: + + === "MANPAGER" + + ```bash + export MANPAGER="less -r" + ``` + + === "PAGER" + + ```bash + export PAGER="less -r" + ``` + +Display the task-by-task status of a workflow. + +This command will open a pager window with task statuses. Inside this pager window, you can search and scroll through task statuses for every step of your workflow. + +For more information, see the [Detailed Status documentation](./monitoring/status_cmds.md#the-detailed-status-command). 
+ +**Usage:** + +```bash +merlin detailed-status [OPTIONS] WORKSPACE_OR_SPECIFICATION +``` + +**Options:** + +| Name | Type | Description | Default | +| ------------ | ------- | ----------- | ------- | +| `-h`, `--help` | boolean | Show this help message and exit | `False` | +| `--dump` | filename | The name of a csv or json file to dump the status to | None | +| `--task_server` | string | Task server type. Currently only "celery" is implemented. | "celery" | +| `-o`, `--output-path` | dirname | Specify a location to look for output workspaces. Only used when a spec file is passed as the argument to `detailed-status`. | None | + +**Filter Options:** + +The `detailed-status` command comes equipped with several options to help filter the output of your status query. + +| Name | Type | Description | Default | +| ------------ | ------- | ----------- | ------- | +| `--max-tasks` | integer | Sets a limit on how many tasks can be displayed. | None | +| `--return-code` | List[string] | Filter which tasks to display based on their return code. Multiple return codes can be provided using a space-delimited list. Options: `SUCCESS`, `SOFT_FAIL`, `HARD_FAIL`, `STOP_WORKERS`, `RETRY`, `DRY_SUCCESS`, `UNRECOGNIZED`. | None | +| `--steps` | List[string] | Filter which tasks to display based on the steps that they're associated with. Multiple steps can be provided using a space-delimited list. | `['all']` | +| `--task-queues` | List[string] | Filter which tasks to display based on the task queues that they were/are in. Multiple task queues can be provided using a space-delimited list. | None | +| `--task-status` | List[string] | Filter which tasks to display based on their status. Multiple statuses can be provided using a space-delimited list. Options: `INITIALIZED`, `RUNNING`, `FINISHED`, `FAILED`, `CANCELLED`, `DRY_RUN`, `UNKNOWN`. | None | +| `--workers` | List[string] | Filter which tasks to display based on which workers are processing them. 
Multiple workers can be provided using a space-delimited list. | None | + +**Display Options:** + +There are multiple options to modify the way task statuses are displayed. + +| Name | Type | Description | Default | +| ------------ | ------- | ----------- | ------- | +| `--disable-pager` | boolean | Turn off the pager functionality when viewing the task-by-task status. **Caution:** This option is *not* recommended for large workflows as you could freeze your terminal with thousands of task statuses. | `False` | +| `--disable-theme` | boolean | Turn off styling for the status layout. | `False` | +| `--layout` | string | Alternate task-by-task status display layouts. Options: `table`, `default`. | `default` | +| `--no-prompts` | boolean | Ignore any prompts provided. This causes the `detailed-status` command to default to the latest study if you provide a spec file as input. | `False` | + +**Examples:** + +!!! example "Check the Detailed Status Using Workspace as Input" + + ```bash + merlin detailed-status study_name_20240129-123452/ + ``` + +!!! example "Check the Detailed Status Using a Specification as Input" + + This will look in the `OUTPUT_PATH` [Reserved Variable](./variables.md#reserved-variables) defined within the spec file to try to find existing workspace directories associated with this spec file. If more than one are found, a prompt will be displayed for you to select a workspace directory. + + ```bash + merlin detailed-status my_specification.yaml + ``` + +!!! example "Dump the Status Report to a JSON File" + + ```bash + merlin detailed-status study_name_20240129-123452/ --dump status_report.json + ``` + +!!! example "Only Display Failed Tasks" + + ```bash + merlin detailed-status study_name_20240129-123452/ --task-status FAILED + ``` + +!!! example "Display the First 8 Successful Tasks" + + ```bash + merlin detailed-status study_name_20240129-123452/ --return-code SUCCESS --max-tasks 8 + ``` + +!!! 
example "Disable the Theme" + + ```bash + merlin detailed-status study_name_20240129-123452/ --disable-theme + ``` + +!!! example "Use the Table Layout" + + ```bash + merlin detailed-status study_name_20240129-123452/ --layout table + ``` ### Monitor (`merlin monitor`) @@ -551,6 +664,8 @@ Batch submission scripts may not keep the batch allocation alive if there is not The `monitor` functionality will check for Celery workers for up to 10*(sleep) seconds before monitoring begins. The loop happens when the queue(s) in the spec contain tasks, but no running workers are detected. This is to protect against a failed worker launch. +For more information, see the [Monitoring Studies for Persistent Allocations documentation](./monitoring/monitor_for_allocation.md). + **Usage:** ```bash @@ -591,6 +706,8 @@ Check which workers are currently connected to the task server. This will broadcast a command to all connected workers and print the names of any that respond and the queues they're attached to. This is useful for interacting with workers, such as via `merlin stop-workers --workers`. +For more information, see the [Query Workers documentation](./monitoring/queues_and_workers.md#query-workers). + **Usage:** ```bash @@ -643,14 +760,22 @@ merlin query-workers [OPTIONS] merlin query-workers --workers ^step ``` -### Status (`merlin status`) +### Queue Info (`merlin queue-info`) + +!!! note + + Prior to Merlin v1.12.0 the `merlin status` command would produce the same output as `merlin queue-info --spec ` + +Check the status of queues to see if there are any tasks in them and/or any workers watching them. -Check the status of the queues in your spec file to see if there are any tasks in them and any active workers watching them. +If used without the `--spec` option, this will query any active queues. Active queues are queues that have a worker watching them. + +For more information, see the [Queue Information documentation](./monitoring/queues_and_workers.md#queue-information). 
**Usage:** ```bash -merlin status [OPTIONS] SPECIFICATION +merlin queue-info [OPTIONS] ``` **Options:** @@ -658,27 +783,125 @@ merlin status [OPTIONS] SPECIFICATION | Name | Type | Description | Default | | ------------ | ------- | ----------- | ------- | | `-h`, `--help` | boolean | Show this help message and exit | `False` | +| `--dump` | filename | The name of a csv or json file to dump the queue information to | None | +| `--specific-queues` | List[string] | A space-delimited list of queues to get information on | None | +| `--task_server` | string | Task server type. Currently only "celery" is implemented. | "celery" | + +**Specification Options:** + +These options all *must* be used with the `--spec` option if used. + +| Name | Type | Description | Default | +| ------------ | ------- | ----------- | ------- | +| `--spec` | filename | Query for the queues named in each step of the spec file given here | None | | `--steps` | List[string] | A space-delimited list of steps in the input spec that you want to query. Should be given after the input spec. | `['all']` | | `--vars` | List[string] | A space-delimited list of variables to override in the spec file. This list should be given after the spec file is provided. Ex: `--vars QUEUE_NAME=new_queue_name` | None | + +**Examples:** + +!!! example "Query All Active Queues" + + ```bash + merlin queue-info + ``` + +!!! example "Check the Status of Specific Queues" + + ```bash + merlin queue-info --specific-queues queue_1 queue_3 + ``` + +!!! example "Check the Status of Queues in a Spec File" + + **This is the same as running `merlin status ` prior to Merlin v1.12.0** + + ```bash + merlin queue-info --spec my_specification.yaml + ``` + +!!! example "Check the Status of Queues for Specific Steps" + + ```bash + merlin queue-info --spec my_specification.yaml --steps step_1 step_3 + ``` + +!!! 
example "Dump the Queue Information to a JSON File" + + ```bash + merlin queue-info --dump queue_report.json + ``` + +### Status (`merlin status`) + +!!! note + + To obtain the same functionality as the `merlin status` command prior to Merlin v1.12.0 use [`merlin queue-info`](#queue-info-merlin-queue-info) with the `--spec` option: + + ```bash + merlin queue-info --spec + ``` + +Display a high-level status summary of a workflow. + +This will display the progress of each step in your workflow using progress bars and brief summaries. In each summary you can find how many tasks there are in total for a step, how many tasks are in each state, the average run time and standard deviation of run times of the tasks in the step, the task queue, and the worker that is watching the step. + +For more information, see the [Status documentation](./monitoring/status_cmds.md#the-status-command). + +**Usage:** + +```bash +merlin status [OPTIONS] WORKSPACE_OR_SPECIFICATION +``` + +**Options:** + +| Name | Type | Description | Default | +| ------------ | ------- | ----------- | ------- | +| `-h`, `--help` | boolean | Show this help message and exit | `False` | +| `--cb-help` | boolean | Colorblind help option. This will utilize different symbols for each state of a task. | `False` | +| `--dump` | filename | The name of a csv or json file to dump the status to | None | +| `--no-prompts` | boolean | Ignore any prompts provided to the command line. This will default to the latest study if you provide a spec file rather than a study workspace. | `False` | | `--task_server` | string | Task server type. Currently only "celery" is implemented. | "celery" | -| `--csv` | filename | The name of a csv file to dump the queue status report to | None | +| `-o`, `--output-path` | dirname | Specify a location to look for output workspaces. Only used when a spec file is passed as the argument to `status`. | None | **Examples:** -!!! example "Basic Status Check" +!!! 
example "Check the Status Using Workspace as Input" + + ```bash + merlin status study_name_20240129-123452/ + ``` + +!!! example "Check the Status Using a Specification as Input" + + This will look in the `OUTPUT_PATH` [Reserved Variable](./variables.md#reserved-variables) defined within the spec file to try to find existing workspace directories associated with this spec file. If more than one are found, a prompt will be displayed for you to select a workspace directory. ```bash merlin status my_specification.yaml ``` -!!! example "Check the Status of Queues for Certain Steps" +!!! example "Check the Status Using a Specification as Input & Ignore Any Prompts" + + If multiple workspace directories associated with the spec file provided are found, the `--no-prompts` option will ignore the prompt and select the most recent study that was run based on the timestamps. + + ```bash + merlin status my_specification.yaml --no-prompts + ``` + +!!! example "Dump the Status Report to a CSV File" + + ```bash + merlin status study_name_20240129-123452/ --dump status_report.csv + ``` + +!!! example "Look For Workspaces at a Certain Location" ```bash - merlin status my_specification.yaml --steps step_1 step_3 + merlin status my_specification.yaml -o new_output_path/ ``` -!!! example "Dump the Status to a CSV File" +!!! example "Utilize the Colorblind Functionality" ```bash - merlin status my_specification.yaml --csv status_report.csv + merlin status study_name_20240129-123452/ --cb-help ``` diff --git a/docs/user_guide/monitoring/index.md b/docs/user_guide/monitoring/index.md new file mode 100644 index 000000000..4ba985fb6 --- /dev/null +++ b/docs/user_guide/monitoring/index.md @@ -0,0 +1,33 @@ +# Monitoring Studies + +This section of the documentation is dedicated to guiding you through the intricacies of monitoring studies with Merlin. 
From utilizing monitoring tools to interpreting their outputs, we'll explore how to leverage Merlin's monitoring features to enhance your study management experience. + +## Key Objectives + +1. **Real-Time Visibility** + + - Gain instant insights into the progress of your studies. + + - Monitor the status of individual tasks and their dependencies. + +2. **Issue Identification and Resolution** + + - Identify and address issues or bottlenecks in study execution promptly. + + - Utilize monitoring data for efficient troubleshooting. + +3. **Performance Optimization** + + - Analyze historical data to identify patterns and optimize study workflows. + + - Fine-tune parameters based on monitoring insights for enhanced efficiency. + +## What is in This Section? + +There are several commands used specifically for monitoring studies (see [Monitoring Commands](../command_line.md#monitoring-commands)). Throughout this section we'll discuss each and every one in further detail: + +- [The Status Commands](./status_cmds.md): As you may have guessed, this module will cover the two status commands that Merlin provides ([`merlin status`](../command_line.md#status-merlin-status) and [`merlin detailed-status`](../command_line.md#detailed-status-merlin-detailed-status)) + +- [Querying Queues and Workers](./queues_and_workers.md): This module will discuss how queues and workers can be queried with the [`merlin queue-info`](../command_line.md#queue-info-merlin-queue-info) and the [`merlin query-workers`](../command_line.md#query-workers-merlin-query-workers) commands. + +- [Monitoring Studies for Persistent Allocations](./monitor_for_allocation.md): Here we'll discuss how allocations can be kept alive using the [`merlin monitor`](../command_line.md#monitor-merlin-monitor) command. 
diff --git a/docs/user_guide/monitoring/monitor_for_allocation.md b/docs/user_guide/monitoring/monitor_for_allocation.md new file mode 100644 index 000000000..c2e4a17e5 --- /dev/null +++ b/docs/user_guide/monitoring/monitor_for_allocation.md @@ -0,0 +1,661 @@ +# Monitoring Studies for Persistent Allocations + +Merlin's producer-consumer model ensures the longevity of workers until the batch allocation is terminated or the workers are manually killed by the user. In this framework, workers remain active, processing tasks within a workflow until the allocation concludes or they're deliberately terminated. Consequently, sustaining the vitality of the allocation is crucial during the processing phase, as it guarantees the uninterrupted execution of tasks by the workers. + +To preserve an allocation throughout the lifecycle of a workflow, Merlin offers the [`merlin monitor`](../command_line.md#monitor-merlin-monitor) command. This command serves as a blocking process for a batch submission script, preserving the allocation's continuity while the workers process tasks. + +**Usage:** + +```bash +merlin monitor +``` + +## How Does the Monitor Work? + +The `merlin monitor` command takes a spec file as input, using it to identify the task queues and workers it needs to observe. This monitoring process involves two key actions: + +1. Verifying the presence of tasks in the designated queues. +2. Confirming the ongoing processing of tasks by the assigned workers when the queues are empty. + +The monitor comes with a [`--sleep` option](#sleep), which introduces a deliberate delay. Before the monitoring initiates, the monitor waits up to 10 times the specified sleep duration, providing users with a window to populate the task queues with the [`merlin run`](../command_line.md#run-merlin-run) command. Subsequently, it waits for the specified sleep duration between each check to determine if the queues have tasks (step 1 above). 
If no tasks are found, and no workers are processing tasks, the monitor concludes that the workflow has finished, allowing the allocation to end. This way, the monitor command acts as a blocking process, ensuring the continuous and effective management of tasks within the specified workflow. + +The resulting flowchart of this process can be seen below. + +
+ ![Monitor Flowchart](../../assets/images/monitoring/monitor_for_allocation/monitor-flowchart.png) +
Monitor Flowchart
+
+ +## Using the Monitor + +Adding the `merlin monitor` command to your workflow process is as simple as putting it at the end of your worker-startup script. The below templates showcase how this is done for [Slurm](../../faq.md#what-is-slurm) and [LSF](../../faq.md#what-is-lsf). + +=== "Slurm" + + The below batch script can be submitted with: + + ```bash + sbatch workers.sbatch + ``` + + ```bash title="workers.sbatch" + #!/bin/bash + #SBATCH -N 1 + #SBATCH --ntasks-per-node=36 + #SBATCH -J Merlin + #SBATCH -t 10:00 + #SBATCH -o merlin_workers_%j.out + + # Turn off core files to work around flux exec issue. + ulimit -c 0 + + YAML=default.yaml # (1) + + if [[ $# -gt 0 ]] + then + YAML=$1 + fi + + echo "Specification File: $YAML" + + VENV_PATH= + + # Activate the virtual environment + source ${VENV_PATH}/bin/activate + + # Show the workers command + merlin run-workers ${YAML} --echo + + # Start workers to run the tasks in the broker + merlin run-workers ${YAML} + + # Keep the allocation alive until all workers stop + merlin monitor ${YAML} + ``` + + 1. Modifying this value to be the path to your spec file would make it so you didn't have to pass the path in at the command line when submitting this script. In other words, you could submit this script with `sbatch workers.sbatch`. + +=== "LSF" + + The below batch script can be submitted with: + + ```bash + bsub workers.bsub + ``` + + ```bash title="workers.bsub" + #!/bin/bash + #BSUB -nnodes 1 + #BSUB -W 00:10 # hours:minutes + #BSUB -J Merlin + #BSUB -o merlin_workers_%J.out + #BSUB -e merlin_workers_%J.err + #BSUB -N + + # Turn off core files to work around flux exec issue. 
+ ulimit -c 0 + + YAML=default.yaml # (1) + + if [[ $# -gt 0 ]] + then + YAML=$1 + fi + + echo "Specification File: $YAML" + + VENV_PATH= + + # Activate the virtual environment + source ${VENV_PATH}/bin/activate + + # Show the workers command + merlin run-workers ${YAML} --echo + + # Start workers to run the tasks in the broker + merlin run-workers ${YAML} + + # Keep the allocation alive until all workers stop + merlin monitor ${YAML} + ``` + + 1. Modifying this value to be the path to your spec file would make it so you didn't have to pass the path in at the command line when submitting this script. In other words, you could submit this script with `bsub workers.bsub`. + +## Options For the Monitor + +There are three useful options that come with the `merlin monitor` command: + +- [`--sleep`](#sleep): The delay between checks on the task queues +- [`--steps`](#steps): Only monitor specific steps in your workflow +- [`--vars`](#vars): Modify environment variables in a spec from the command line + +### Sleep + +The `--sleep` option in the `monitor` command allows users to specify a custom delay duration between consecutive inspections of the task queues. The default value for this option is 60 seconds. + +As detailed in the ["How Does the Monitor Work?"](#how-does-the-monitor-work) section, the monitor periodically examines task queues to determine task presence. If the queues are currently occupied, the monitor will enter a sleep state for a designated duration before conducting the next inspection. Similarly, if the monitor discovers no tasks in the queues but identifies active workers processing tasks, it will initiate a sleep interval before re-evaluating both the queues and the workers. The `--sleep` option allows you to modify this sleep interval. + +The value that you provide for the `--sleep` option will be an integer representing the number of seconds to sleep before the next inspection of the task queues and workers is conducted. 
+ +**Usage:** + +```bash +merlin monitor <input.yaml> --sleep <duration in seconds> +``` + +??? example "Example of Using `--sleep` With Monitor" + + In the below spec file we have one step that will run for 90 seconds: + + ```yaml title="sleep_demo.yaml" + description: + name: sleep_demo + description: a very simple merlin workflow + + study: + - name: step_1 + description: sleep for 90 seconds + run: + cmd: sleep 90 + task_queue: step_1_queue + ``` + + For this example we'll use a Slurm worker-launch script to start the workers. Notice on the last line that we're setting the sleep duration to be 30 seconds instead of the default 60 seconds: + + ```bash title="workers.sbatch" hl_lines="32" + #!/bin/bash + #SBATCH -N 1 + #SBATCH --ntasks-per-node=36 + #SBATCH -J Merlin + #SBATCH -t 10:00 + #SBATCH -o merlin_workers_%j.out + + # Turn off core files to work aroung flux exec issue. + ulimit -c 0 + + YAML=sleep_demo.yaml + + if [[ $# -gt 0 ]] + then + YAML=$1 + fi + + echo "Specification File: $YAML" + + VENV_PATH= + + # Activate the virtual environment + source ${VENV_PATH}/bin/activate + + # Show the workers command + merlin run-workers ${YAML} --echo + + # Start workers to run the tasks in the broker + merlin run-workers ${YAML} + + # Keep the allocation alive until all workers stop + merlin monitor ${YAML} --sleep 30 + ``` + + Now let's run this study by submitting this worker launch script and sending the tasks to the broker with the [`merlin run`](../command_line.md#run-merlin-run) command: + + === "Submitting the Batch Script" + + ```bash + sbatch workers.sbatch + ``` + + === "Sending Tasks to Broker" + + ```bash + merlin run sleep_demo.yaml + ``` + + From the time stamps in our worker logs we can see that the custom 30 second sleep duration was applied: + + ```bash hl_lines="20-25 33-34" + [2024-02-05 09:13:52,891: INFO] Connected to amqps://rabbitmerlin:**@cz-gunny-rabbitmerlin.apps.czapps.llnl.gov:31118/host4gunny + [2024-02-05 09:13:52,911: INFO] mingle: searching for neighbors + [2024-02-05 
09:13:53,956: INFO] mingle: all alone + [2024-02-05 09:13:53,996: INFO] celery@default_worker.%quartz1552 ready. + [2024-02-05 09:13:54,028: INFO] Task merlin.common.tasks.expand_tasks_with_samples[78530a48-95f0-4b0e-90ca-7011e81a7808] received + [2024-02-05 09:13:54,211: INFO] Task merlin.common.tasks.merlin_step[117b28c9-eacd-4e77-9771-01b4ebc29e01] received + [2024-02-05 09:13:54,228: INFO] Executing step 'step_1' in '/usr/WS1/gunny/hello/sleep_demo_20240205-091232/step_1'... + [2024-02-05 09:13:54,228: INFO] Directory does not exist. Creating directories to /usr/WS1/gunny/hello/sleep_demo_20240205-091232/step_1 + [2024-02-05 09:13:54,233: INFO] Writing status for step_1 to '/usr/WS1/gunny/hello/sleep_demo_20240205-091232/step_1/MERLIN_STATUS.json... + [2024-02-05 09:13:54,235: INFO] Status for step_1 successfully written. + [2024-02-05 09:13:54,235: INFO] Generating script for step_1 into /usr/WS1/gunny/hello/sleep_demo_20240205-091232/step_1 + [2024-02-05 09:13:54,238: INFO] Script: /usr/WS1/gunny/hello/sleep_demo_20240205-091232/step_1/step_1.sh + Restart: None + Scheduled?: True + [2024-02-05 09:13:54,273: INFO] Writing status for step_1 to '/usr/WS1/gunny/hello/sleep_demo_20240205-091232/step_1/MERLIN_STATUS.json... + [2024-02-05 09:13:54,276: INFO] Status for step_1 successfully written. 
+ [2024-02-05 09:13:54,276: INFO] Submitting script for step_1 + [2024-02-05 09:13:54,548: INFO] Task merlin.common.tasks.expand_tasks_with_samples[78530a48-95f0-4b0e-90ca-7011e81a7808] succeeded in 0.40144235407933593s: None + [2024-02-05 09:14:16: INFO] Reading app config from file /g/g20/gunny/.merlin/app.yaml + [2024-02-05 09:14:17: INFO] Monitor: found 0 jobs in queues and 1 workers alive + [2024-02-05 09:14:18: INFO] Monitor: found tasks in queues and/or tasks being processed + [2024-02-05 09:14:50: INFO] Monitor: found 0 jobs in queues and 1 workers alive + [2024-02-05 09:14:51: INFO] Monitor: found tasks in queues and/or tasks being processed + [2024-02-05 09:15:22: INFO] Monitor: found 0 jobs in queues and 1 workers alive + [2024-02-05 09:15:23: INFO] Monitor: found tasks in queues and/or tasks being processed + [2024-02-05 09:15:24,298: INFO] Execution returned status OK. + [2024-02-05 09:15:24,304: INFO] Writing status for step_1 to '/usr/WS1/gunny/hello/sleep_demo_20240205-091232/step_1/MERLIN_STATUS.json... + [2024-02-05 09:15:24,307: INFO] Status for step_1 successfully written. + [2024-02-05 09:15:24,307: INFO] Step 'step_1' in '/usr/WS1/gunny/hello/sleep_demo_20240205-091232/step_1' finished successfully. + [2024-02-05 09:15:24,498: INFO] Task merlin:chordfinisher[f442f13e-0436-4162-86ab-eaa28943f526] received + [2024-02-05 09:15:24,501: INFO] Task merlin.common.tasks.merlin_step[117b28c9-eacd-4e77-9771-01b4ebc29e01] succeeded in 90.27513551106676s: 0 + [2024-02-05 09:15:24,507: INFO] Task merlin:chordfinisher[f442f13e-0436-4162-86ab-eaa28943f526] succeeded in 0.007889348082244396s: 'SYNC' + [2024-02-05 09:15:54: INFO] Monitor: found 0 jobs in queues and 1 workers alive + [2024-02-05 09:15:55: INFO] Monitor: ... stop condition met + ``` + +### Steps + +!!! warning + + It's essential to note that using this option might lead to the termination of the allocation while your study is still processing. 
This outcome occurs if any subsequent steps in your study were not included in the steps provided to the `--steps` option. + +The `--steps` option in the `monitor` command allows you to specify particular steps for monitoring instead of the entire study. By providing specific steps, the monitor identifies the associated task queues and exclusively monitors those queues, disregarding others in the study. + +**Usage:** + +```bash +merlin monitor <input.yaml> --steps <space-delimited list of step names> +``` + +??? example "Example of Using `--steps` Option" + + In the spec file below, we have two steps `step_1` and `step_2` that each have their own respective task queues `step_1_queue` and `step_2_queue`. + + ```yaml title="steps_demo.yaml" + description: + name: steps_demo + description: a very simple merlin workflow + + study: + - name: step_1 + description: say hello + run: + cmd: echo "hello!" + task_queue: step_1_queue + + - name: step_2 + description: sleep for 90 seconds + run: + cmd: sleep 90 + depends: [step_1] + task_queue: step_2_queue + ``` + + Let's say we just want to monitor `step_1` to make sure it finishes but we don't care if `step_2` finishes. We can convey this to our monitor by using the `--steps` option. This is shown in the worker-launch script below (for this example we'll use a Slurm batch script): + + ```bash hl_lines="32" + #!/bin/bash + #SBATCH -N 1 + #SBATCH --ntasks-per-node=36 + #SBATCH -J Merlin + #SBATCH -t 10:00 + #SBATCH -o merlin_workers_%j.out + + # Turn off core files to work aroung flux exec issue. 
+ ulimit -c 0 + + YAML=steps_demo.yaml + + if [[ $# -gt 0 ]] + then + YAML=$1 + fi + + echo "Specification File: $YAML" + + VENV_PATH= + + # Activate the virtual environment + source ${VENV_PATH}/bin/activate + + # Show the workers command + merlin run-workers ${YAML} --echo + + # Start workers to run the tasks in the broker + merlin run-workers ${YAML} + + # Keep the allocation alive until all workers stop + merlin monitor ${YAML} --steps step_1 + ``` + + After submitting this batch script and sending the task queues to the broker with: + + === "Submitting the Batch Script" + + ```bash + sbatch workers.sbatch + ``` + + === "Sending Tasks to Broker" + + ```bash + merlin run steps_demo.yaml + ``` + + ...we'll see from the status command and the logs that the allocation is terminated after `step_1` finishes but prior to `step_2` finishing. + + === "Status" + + Checking our Slurm queues with: + + ```bash + squeue -u + ``` + + We'll see our allocation is either in a cancelled state or just not there at all. This means our workers are no longer processing anything. + + Now if we check the status of our study, we'll see that `step_1` finished just fine but the status of `step_2` was never updated to a completed state since it never finished processing before the allocation was terminated: + +
+ ![Status Showing Only Step 1 Finishing](../../assets/images/monitoring/monitor_for_allocation/steps-demo.png) +
Status Showing Only Step 1 Finishing
+
+ + === "Worker Logs" + + Focusing on the highlighted lines below, we'll see that `step_1` logs that it starts executing and that it finishes executing. For `step_2` we see a similar log for when it starts but we never see a log for when it finishes. Instead, its execution is cut off by the monitor terminating the allocation since we told it to only monitor `step_1`. + + ```bash hl_lines="7 21 23 37-38" + [2024-02-05 11:53:18,915: INFO] Connected to amqps://rabbitmerlin:**@cz-gunny-rabbitmerlin.apps.czapps.llnl.gov:31118/host4gunny + [2024-02-05 11:53:18,943: INFO] mingle: searching for neighbors + [2024-02-05 11:53:19,986: INFO] mingle: all alone + [2024-02-05 11:53:20,017: INFO] celery@default_worker.%quartz205 ready. + [2024-02-05 11:53:20,051: INFO] Task merlin.common.tasks.expand_tasks_with_samples[36b7c213-735f-443c-a593-21dd6fbdce82] received + [2024-02-05 11:53:20,237: INFO] Task merlin.common.tasks.merlin_step[661180c7-9921-448d-ad4a-510163fe5dd3] received + [2024-02-05 11:53:20,266: INFO] Executing step 'step_1' in '/usr/WS1/gunny/hello/steps_demo_20240205-115213/step_1'... + [2024-02-05 11:53:20,267: INFO] Directory does not exist. Creating directories to /usr/WS1/gunny/hello/steps_demo_20240205-115213/step_1 + [2024-02-05 11:53:20,274: INFO] Writing status for step_1 to '/usr/WS1/gunny/hello/steps_demo_20240205-115213/step_1/MERLIN_STATUS.json... + [2024-02-05 11:53:20,276: INFO] Status for step_1 successfully written. + [2024-02-05 11:53:20,277: INFO] Generating script for step_1 into /usr/WS1/gunny/hello/steps_demo_20240205-115213/step_1 + [2024-02-05 11:53:20,281: INFO] Script: /usr/WS1/gunny/hello/steps_demo_20240205-115213/step_1/step_1.sh + Restart: None + Scheduled?: True + [2024-02-05 11:53:20,326: INFO] Writing status for step_1 to '/usr/WS1/gunny/hello/steps_demo_20240205-115213/step_1/MERLIN_STATUS.json... + [2024-02-05 11:53:20,331: INFO] Status for step_1 successfully written. 
+ [2024-02-05 11:53:20,331: INFO] Submitting script for step_1 + [2024-02-05 11:53:20,347: INFO] Execution returned status OK. + [2024-02-05 11:53:20,357: INFO] Writing status for step_1 to '/usr/WS1/gunny/hello/steps_demo_20240205-115213/step_1/MERLIN_STATUS.json... + [2024-02-05 11:53:20,362: INFO] Status for step_1 successfully written. + [2024-02-05 11:53:20,362: INFO] Step 'step_1' in '/usr/WS1/gunny/hello/steps_demo_20240205-115213/step_1' finished successfully. + [2024-02-05 11:53:20,427: INFO] Task merlin.common.tasks.merlin_step[29c33e4d-c2c4-4211-bd4d-ba0531f23581] received + [2024-02-05 11:53:20,448: INFO] Executing step 'step_2' in '/usr/WS1/gunny/hello/steps_demo_20240205-115213/step_2'... + [2024-02-05 11:53:20,448: INFO] Directory does not exist. Creating directories to /usr/WS1/gunny/hello/steps_demo_20240205-115213/step_2 + [2024-02-05 11:53:20,460: INFO] Writing status for step_2 to '/usr/WS1/gunny/hello/steps_demo_20240205-115213/step_2/MERLIN_STATUS.json... + [2024-02-05 11:53:20,467: INFO] Status for step_2 successfully written. + [2024-02-05 11:53:20,467: INFO] Generating script for step_2 into /usr/WS1/gunny/hello/steps_demo_20240205-115213/step_2 + [2024-02-05 11:53:20,483: INFO] Script: /usr/WS1/gunny/hello/steps_demo_20240205-115213/step_2/step_2.sh + Restart: None + Scheduled?: True + [2024-02-05 11:53:20,507: INFO] Task merlin.common.tasks.expand_tasks_with_samples[36b7c213-735f-443c-a593-21dd6fbdce82] succeeded in 0.3326189829967916s: None + [2024-02-05 11:53:20,511: INFO] Task merlin.common.tasks.merlin_step[661180c7-9921-448d-ad4a-510163fe5dd3] succeeded in 0.2548455507494509s: 0 + [2024-02-05 11:53:20,535: INFO] Writing status for step_2 to '/usr/WS1/gunny/hello/steps_demo_20240205-115213/step_2/MERLIN_STATUS.json... + [2024-02-05 11:53:20,544: INFO] Status for step_2 successfully written. 
+ [2024-02-05 11:53:20,544: INFO] Submitting script for step_2 + [2024-02-05 11:54:13: INFO] Reading app config from file /g/g20/gunny/.merlin/app.yaml + [2024-02-05 11:54:14: INFO] Monitor: found 0 jobs in queues and 1 workers alive + [2024-02-05 11:54:15: INFO] Monitor: ... stop condition met + ``` + +### Vars + +The `--vars` option can be used to modify any variables defined in your spec file from the command line interface. This option can take a space-delimited list of variables and their assignments, and should be given after the input yaml file. + +**Usage:** + +```bash +merlin monitor <input.yaml> --vars <VARIABLE_NAME>=<value> +``` + +??? example "Example of Using `--vars` With Monitor" + + Say we have the following spec file with a variable `QUEUE_NAME` that's referenced in `step_2`: + + ```yaml title="vars_demo.yaml" hl_lines="7 22" + description: + name: vars_demo + description: a very simple merlin workflow + + env: + variables: + QUEUE_NAME: step_2_queue + + study: + - name: step_1 + description: say hello + run: + cmd: echo "hello!" + task_queue: step_1_queue + + - name: step_2 + description: sleep for 90 seconds + run: + cmd: sleep 90 + depends: [step_1] + task_queue: $(QUEUE_NAME) + ``` + + If we decided we wanted to modify this queue at the command line, we could accomplish this with the `--vars` option of the [`merlin run-workers`](../command_line.md#run-workers-merlin-run-workers) and [`merlin run`](../command_line.md#run-merlin-run) commands: + + === "Run Workers" + + In the below worker-startup script, we're not using the `--vars` option with the `monitor` command. This is for demonstrative purposes and will be modified later in this example. + + ```bash title="workers.sbatch" hl_lines="25-32" + #!/bin/bash + #SBATCH -N 1 + #SBATCH --ntasks-per-node=36 + #SBATCH -J Merlin + #SBATCH -t 10:00 + #SBATCH -o merlin_workers_%j.out + + # Turn off core files to work aroung flux exec issue. 
+ ulimit -c 0 + + YAML=vars_demo.yaml + + if [[ $# -gt 0 ]] + then + YAML=$1 + fi + + echo "Specification File: $YAML" + + VENV_PATH= + + # Activate the virtual environment + source ${VENV_PATH}/bin/activate + + # Show the workers command + merlin run-workers ${YAML} --vars QUEUE_NAME=new_queue_name --echo + + # Start workers to run the tasks in the broker + merlin run-workers ${YAML} --vars QUEUE_NAME=new_queue_name + + # Keep the allocation alive until all workers stop + merlin monitor ${YAML} # ***Purposefully not using the --vars argument yet*** + ``` + + This can be submitted with: + + ```bash + sbatch workers.sbatch + ``` + + === "Run" + + ```bash + merlin run vars_demo.yaml --vars QUEUE_NAME=new_queue_name + ``` + + Using the above worker launch script that *does not* utilize the `--vars` option with the `monitor` command, our allocation would terminate after `step_1` completes. This is due to the fact that it will be watching the `step_1_queue` and `step_2_queue` task queues, but `step_2`'s tasks will be sent to the `new_queue_name` queue instead. We can see this behavior from the status of our study and the end of the worker logs. + + === "Status" + + Checking our Slurm queues with: + + ```bash + squeue -u + ``` + + We'll see our allocation is either in a cancelled state or just not there at all. This means our workers are no longer processing anything. + + Now if we check the status of our study, we'll see that the status of `step_2` was never updated to a completed state: + +
+ ![Status Displaying Step 2 Never Finishes](../../assets/images/monitoring/monitor_for_allocation/status-step-2-incomplete.png) +
Status Displaying Step 2 Never Finishes
+
+ + === "Worker Logs" + + Focusing on the highlighted lines below, we'll see that `step_1` logs that it starts executing and that it finishes executing. For `step_2` we see a similar log for when it starts but we never see a log for when it finishes. Instead, its execution is cut off by the monitor terminating the allocation. + + ```bash hl_lines="7 21 23 37-38" + [2024-02-02 16:16:51,152: INFO] Connected to amqps://rabbitmerlin:**@cz-gunny-rabbitmerlin.apps.czapps.llnl.gov:31118/host4gunny + [2024-02-02 16:16:51,171: INFO] mingle: searching for neighbors + [2024-02-02 16:16:52,211: INFO] mingle: all alone + [2024-02-02 16:16:52,248: INFO] celery@default_worker.%quartz3 ready. + [2024-02-02 16:16:52,275: INFO] Task merlin.common.tasks.expand_tasks_with_samples[6c4ea50a-9fe4-43ca-b2bc-47ef2a477a21] received + [2024-02-02 16:16:52,449: INFO] Task merlin.common.tasks.merlin_step[09c2c700-fe97-4f18-9847-69615f3e4dfc] received + [2024-02-02 16:16:52,467: INFO] Executing step 'step_1' in '/usr/WS1/gunny/hello/studies/vars_demo_20240202-161556/step_1'... + [2024-02-02 16:16:52,467: INFO] Directory does not exist. Creating directories to /usr/WS1/gunny/hello/studies/vars_demo_20240202-161556/step_1 + [2024-02-02 16:16:52,470: INFO] Writing status for step_1 to '/usr/WS1/gunny/hello/studies/vars_demo_20240202-161556/step_1/MERLIN_STATUS.json... + [2024-02-02 16:16:52,472: INFO] Status for step_1 successfully written. + [2024-02-02 16:16:52,472: INFO] Generating script for step_1 into /usr/WS1/gunny/hello/studies/vars_demo_20240202-161556/step_1 + [2024-02-02 16:16:52,476: INFO] Script: /usr/WS1/gunny/hello/studies/vars_demo_20240202-161556/step_1/step_1.sh + Restart: None + Scheduled?: True + [2024-02-02 16:16:52,502: INFO] Writing status for step_1 to '/usr/WS1/gunny/hello/studies/vars_demo_20240202-161556/step_1/MERLIN_STATUS.json... + [2024-02-02 16:16:52,505: INFO] Status for step_1 successfully written. 
+ [2024-02-02 16:16:52,505: INFO] Submitting script for step_1 + [2024-02-02 16:16:52,516: INFO] Execution returned status OK. + [2024-02-02 16:16:52,520: INFO] Writing status for step_1 to '/usr/WS1/gunny/hello/studies/vars_demo_20240202-161556/step_1/MERLIN_STATUS.json... + [2024-02-02 16:16:52,522: INFO] Status for step_1 successfully written. + [2024-02-02 16:16:52,522: INFO] Step 'step_1' in '/usr/WS1/gunny/hello/studies/vars_demo_20240202-161556/step_1' finished successfully. + [2024-02-02 16:16:52,580: INFO] Task merlin.common.tasks.merlin_step[bffc5866-7d85-4e2b-9a20-b3d195183581] received + [2024-02-02 16:16:52,594: INFO] Executing step 'step_2' in '/usr/WS1/gunny/hello/studies/vars_demo_20240202-161556/step_2'... + [2024-02-02 16:16:52,594: INFO] Directory does not exist. Creating directories to /usr/WS1/gunny/hello/studies/vars_demo_20240202-161556/step_2 + [2024-02-02 16:16:52,598: INFO] Writing status for step_2 to '/usr/WS1/gunny/hello/studies/vars_demo_20240202-161556/step_2/MERLIN_STATUS.json... + [2024-02-02 16:16:52,601: INFO] Status for step_2 successfully written. + [2024-02-02 16:16:52,602: INFO] Generating script for step_2 into /usr/WS1/gunny/hello/studies/vars_demo_20240202-161556/step_2 + [2024-02-02 16:16:52,605: INFO] Script: /usr/WS1/gunny/hello/studies/vars_demo_20240202-161556/step_2/step_2.sh + Restart: None + Scheduled?: True + [2024-02-02 16:16:52,613: INFO] Writing status for step_2 to '/usr/WS1/gunny/hello/studies/vars_demo_20240202-161556/step_2/MERLIN_STATUS.json... + [2024-02-02 16:16:52,616: INFO] Status for step_2 successfully written. 
+ [2024-02-02 16:16:52,616: INFO] Submitting script for step_2 + [2024-02-02 16:16:52,639: INFO] Task merlin.common.tasks.expand_tasks_with_samples[6c4ea50a-9fe4-43ca-b2bc-47ef2a477a21] succeeded in 0.24488114699488506s: None + [2024-02-02 16:16:52,652: INFO] Task merlin.common.tasks.merlin_step[09c2c700-fe97-4f18-9847-69615f3e4dfc] succeeded in 0.18764808497508056s: 0 + [2024-02-02 16:17:46: INFO] Reading app config from file /g/g20/gunny/.merlin/app.yaml + [2024-02-02 16:17:47: INFO] Monitor: found 0 jobs in queues and 1 workers alive + [2024-02-02 16:17:48: INFO] Monitor: ... stop condition met + ``` + + Now let's modify the worker-startup script to utilize the `--vars` option for the `monitor` so that it can watch the correct queues: + + ```bash title="workers.sbatch" hl_lines="32" + #!/bin/bash + #SBATCH -N 1 + #SBATCH --ntasks-per-node=36 + #SBATCH -J Merlin + #SBATCH -t 10:00 + #SBATCH -o merlin_workers_%j.out + + # Turn off core files to work aroung flux exec issue. + ulimit -c 0 + + YAML=vars_demo.yaml + + if [[ $# -gt 0 ]] + then + YAML=$1 + fi + + echo "Specification File: $YAML" + + VENV_PATH= + + # Activate the virtual environment + source ${VENV_PATH}/bin/activate + + # Show the workers command + merlin run-workers ${YAML} --vars QUEUE_NAME=new_queue_name --echo + + # Start workers to run the tasks in the broker + merlin run-workers ${YAML} --vars QUEUE_NAME=new_queue_name + + # Keep the allocation alive until all workers stop + merlin monitor ${YAML} --vars QUEUE_NAME=new_queue_name + ``` + + Re-submitting this workflow, we'll see that the study runs to completion before terminating. + + === "Status" + + Our status now shows that both steps finished successfully. + +
+ ![Status Displaying Both Steps Finish Successfully](../../assets/images/monitoring/monitor_for_allocation/status-success.png) +
Status Displaying Both Steps Finish Successfully
+
+ + === "Worker Logs" + + Focusing on the highlighted lines below, we'll see that `step_1` and `step_2` both log that execution has started and finished. We'll also notice that prior to `step_2` finishing, the `monitor` checks the task queues/workers and determines it should keep the allocation alive. Once `step_2` is complete, we see that the monitor then terminates the allocation. + + ```bash hl_lines="8 22 24 37-38 42 46-47" + [2024-02-02 16:46:21,634: INFO] Connected to amqps://rabbitmerlin:**@cz-gunny-rabbitmerlin.apps.czapps.llnl.gov:31118/host4gunny + [2024-02-02 16:46:21,660: INFO] mingle: searching for neighbors + [2024-02-02 16:46:22,704: INFO] mingle: all alone + [2024-02-02 16:46:22,737: INFO] celery@default_worker.%quartz8 ready. + [2024-02-02 16:46:22,766: INFO] Task merlin.common.tasks.expand_tasks_with_samples[96a9fe28-cbde-4232-8f42-653646fe032e] received + [2024-02-02 16:46:22,947: INFO] Task merlin.common.tasks.merlin_step[ff620554-6d4d-4981-8488-bec1e718bc68] received + [2024-02-02 16:46:23,156: INFO] Task merlin.common.tasks.expand_tasks_with_samples[96a9fe28-cbde-4232-8f42-653646fe032e] succeeded in 0.27051527600269765s: None + [2024-02-02 16:46:23,240: INFO] Executing step 'step_1' in '/usr/WS1/gunny/hello/studies/vars_demo_20240202-164558/step_1'... + [2024-02-02 16:46:23,241: INFO] Directory does not exist. Creating directories to /usr/WS1/gunny/hello/studies/vars_demo_20240202-164558/step_1 + [2024-02-02 16:46:23,265: INFO] Writing status for step_1 to '/usr/WS1/gunny/hello/studies/vars_demo_20240202-164558/step_1/MERLIN_STATUS.json... + [2024-02-02 16:46:23,271: INFO] Status for step_1 successfully written. 
+ [2024-02-02 16:46:23,271: INFO] Generating script for step_1 into /usr/WS1/gunny/hello/studies/vars_demo_20240202-164558/step_1 + [2024-02-02 16:46:23,278: INFO] Script: /usr/WS1/gunny/hello/studies/vars_demo_20240202-164558/step_1/step_1.sh + Restart: None + Scheduled?: True + [2024-02-02 16:46:23,329: INFO] Writing status for step_1 to '/usr/WS1/gunny/hello/studies/vars_demo_20240202-164558/step_1/MERLIN_STATUS.json... + [2024-02-02 16:46:23,334: INFO] Status for step_1 successfully written. + [2024-02-02 16:46:23,334: INFO] Submitting script for step_1 + [2024-02-02 16:46:23,350: INFO] Execution returned status OK. + [2024-02-02 16:46:23,371: INFO] Writing status for step_1 to '/usr/WS1/gunny/hello/studies/vars_demo_20240202-164558/step_1/MERLIN_STATUS.json... + [2024-02-02 16:46:23,379: INFO] Status for step_1 successfully written. + [2024-02-02 16:46:23,379: INFO] Step 'step_1' in '/usr/WS1/gunny/hello/studies/vars_demo_20240202-164558/step_1' finished successfully. + [2024-02-02 16:46:23,450: INFO] Task merlin.common.tasks.merlin_step[c1432f25-52c7-4967-9367-cff11b1b769c] received + [2024-02-02 16:46:23,453: INFO] Executing step 'step_2' in '/usr/WS1/gunny/hello/studies/vars_demo_20240202-164558/step_2'... + [2024-02-02 16:46:23,454: INFO] Directory does not exist. Creating directories to /usr/WS1/gunny/hello/studies/vars_demo_20240202-164558/step_2 + [2024-02-02 16:46:23,497: INFO] Writing status for step_2 to '/usr/WS1/gunny/hello/studies/vars_demo_20240202-164558/step_2/MERLIN_STATUS.json... + [2024-02-02 16:46:23,508: INFO] Status for step_2 successfully written. 
+ [2024-02-02 16:46:23,508: INFO] Generating script for step_2 into /usr/WS1/gunny/hello/studies/vars_demo_20240202-164558/step_2 + [2024-02-02 16:46:23,525: INFO] Task merlin.common.tasks.merlin_step[ff620554-6d4d-4981-8488-bec1e718bc68] succeeded in 0.5617350109387189s: 0 + [2024-02-02 16:46:23,545: INFO] Script: /usr/WS1/gunny/hello/studies/vars_demo_20240202-164558/step_2/step_2.sh + Restart: None + Scheduled?: True + [2024-02-02 16:46:23,615: INFO] Writing status for step_2 to '/usr/WS1/gunny/hello/studies/vars_demo_20240202-164558/step_2/MERLIN_STATUS.json... + [2024-02-02 16:46:23,647: INFO] Status for step_2 successfully written. + [2024-02-02 16:46:23,647: INFO] Submitting script for step_2 + [2024-02-02 16:47:16: INFO] Reading app config from file /g/g20/gunny/.merlin/app.yaml + [2024-02-02 16:47:18: INFO] Monitor: found 0 jobs in queues and 1 workers alive + [2024-02-02 16:47:19: INFO] Monitor: found tasks in queues and/or tasks being processed + [2024-02-02 16:47:56,398: INFO] Execution returned status OK. + [2024-02-02 16:47:56,509: INFO] Writing status for step_2 to '/usr/WS1/gunny/hello/studies/vars_demo_20240202-164558/step_2/MERLIN_STATUS.json... + [2024-02-02 16:47:56,547: INFO] Status for step_2 successfully written. + [2024-02-02 16:47:56,547: INFO] Step 'step_2' in '/usr/WS1/gunny/hello/studies/vars_demo_20240202-164558/step_2' finished successfully. + [2024-02-02 16:47:59,780: INFO] Task merlin:chordfinisher[a338a16a-18ea-46a1-8593-5f9822cef5da] received + [2024-02-02 16:47:59,783: INFO] Task merlin.common.tasks.merlin_step[c1432f25-52c7-4967-9367-cff11b1b769c] succeeded in 96.33197088190354s: 0 + [2024-02-02 16:47:59,787: INFO] Task merlin:chordfinisher[a338a16a-18ea-46a1-8593-5f9822cef5da] succeeded in 0.005227347952313721s: 'SYNC' + [2024-02-02 16:48:20: INFO] Monitor: found 0 jobs in queues and 1 workers alive + [2024-02-02 16:48:21: INFO] Monitor: ... 
stop condition met + ``` diff --git a/docs/user_guide/monitoring/queues_and_workers.md b/docs/user_guide/monitoring/queues_and_workers.md new file mode 100644 index 000000000..101c37ec5 --- /dev/null +++ b/docs/user_guide/monitoring/queues_and_workers.md @@ -0,0 +1,608 @@ +# Querying Queues and Workers + +Managing task queues and monitoring the associated workers is a common necessity in many applications. To facilitate these interactions, Merlin offers two essential commands – [Queue Information](#queue-information) and [Query Workers](#query-workers). + +This module will delve into the details of these commands, providing insights into how to effectively retrieve information about task queues and query workers. + +## Queue Information + +Merlin provides users with the [`merlin queue-info`](../command_line.md#queue-info-merlin-queue-info) command to help monitor celery queues. This command will list queue statistics in a table format where the columns are as follows: queue name, number of tasks in the queue, number of workers connected to the queue. + +The default functionality of this command is to display queue statistics for active queues. Active queues are any queues that have a worker watching them. + +**Usage:** + +```bash +merlin queue-info +``` + +??? example "Example Queue-Info Output With No Active Queues" + +
+ ![Output of Queue-Info When No Queues Are Active](../../assets/images/monitoring/queues_and_workers/no-active-queues.png) +
Output of Queue-Info When No Queues Are Active
+
+ +??? example "Example Queue-Info Output With Active Queues" + +
+ ![Output of Queue-Info When Queues Are Active](../../assets/images/monitoring/queues_and_workers/active-queues.png) +
Output of Queue-Info When There Are Queues Active
+
+ +### Basic Options + +The `queue-info` command comes equipped with some basic options: + +- [`--dump`](#dump): Dump the queue information to a `.csv` or `.json` file +- [`--specific-queues`](#specific-queues): Only obtain queue information for queues you list here +- [`--task-server`](#task-server): Modify the task server value + +#### Dump + +Much like [the two status commands](./status_cmds.md), the `queue-info` command provides a way to dump the queue statistics to an output file. + +=== "CSV Usage" + + ```bash + merlin queue-info --dump queue_report.csv + ``` + +=== "JSON Usage" + + ```bash + merlin queue-info --dump queue_report.json + ``` + +When dumping to a file that *does not* yet exist, Merlin will create that file for you and populate it with the queue statistics you requested. + +When dumping to a file that *does* exist, Merlin will append the requested queue statistics to that file. You can differentiate between separate dump calls by looking at the timestamps of the dumps. For CSV files this timestamp exists in the `Time` column (see [CSV Dump Format](#csv-dump-format) below) and for JSON files this timestamp will be the top-level key to the queue info entry (see [JSON Dump Format](#json-dump-format) below). + +Using any of the `--specific-queues`, `--spec`, or `--steps` options will modify the output that's written to the output file. + +##### CSV Dump Format + +The format of a CSV dump file for queue information is as follows: + +```bash +Time,[merlin]_<queue_name>:tasks,[merlin]_<queue_name>:consumers +``` + +The `<queue_name>:tasks` and `<queue_name>:consumers` columns will be created for each queue that's listed in the queue-info output at the time of your dump. + +The image below shows an example of dumping the queue statistics of active queues to a csv file, and then displaying that csv file using the [rich-cli library](https://github.com/Textualize/rich-cli): + +
+ ![An Example Showcasing How to do a CSV Dump of Active Queue Stats](../../assets/images/monitoring/queues_and_workers/dump-csv.png) +
An Example Showcasing How to do a CSV Dump of Active Queue Stats
+
+ +##### JSON Dump Format + +The format of a JSON dump file for queue information is as follows: + +```json +{ + "YYYY-MM-DD HH:MM:SS": { + "[merlin]_queue_name": { + "tasks": , + "consumers": + } + } +} +``` + +The image below shows an example of dumping the queue info to a json file, and then displaying that json file using the [rich-cli library](https://github.com/Textualize/rich-cli): + +
+ ![An Example Showcasing How to do a JSON Dump of Active Queue Stats](../../assets/images/monitoring/queues_and_workers/dump-json.png) +
An Example Showcasing How to do a JSON Dump of Active Queue Stats
+
+ +#### Specific Queues + +If you know exactly what queues you want to check on, you can use the `--specific-queues` option to list one or more queues to view. + +**Usage:** + +```bash +merlin queue-info --specific-queues +``` + +??? example "Example Queue-Info Output Using Specific-Queues With Active Queues" + + In the example below, we're querying the `train` and `predict` queues which both have a worker watching them currently (in other words, they are _**active**_). + +
+ ![the queue-info output using the specific-queues option with active queues](../../assets/images/monitoring/queues_and_workers/specific-queues-active.png) +
Output of Queue-Info Using the Specific-Queues Option With Active Queues
+
+ +If you ask for queue-info of inactive queues with the `--specific-queues` option, a table format will still be output for you. + +??? example "Example Queue-Info Output Using Specific-Queues With Inactive Queues" + + In the example below, we're querying the `train` and `predict` queues which both *don't* have a worker watching them currently (in other words, they are _**inactive**_). + +
+ ![the queue-info output using the specific-queues option with inactive queues](../../assets/images/monitoring/queues_and_workers/specific-queues-inactive.png) +
Output of Queue-Info Using the Specific-Queues Option With Inactive Queues
+
+ +#### Task Server + +To modify the task server from the command line you can use the `--task-server` option. However, the only currently available option for task server is celery so you most likely will not want to use this option. + +**Usage:** + +```bash +merlin queue-info --task-server +``` + +### Specification Options + +There are three options with the `queue-info` command that revolve around using a spec file to query queue information: + +- [`--spec`](#spec): Obtain queue information for all queues defined in a spec file +- [`--steps`](#steps): Obtain queue information for queues attached to certain steps in a spec file +- [`--vars`](#vars): Modify environment variables in a spec from the command line + +!!! note + + The `--steps` and `--vars` options *must* be used alongside the `--spec` option. They *cannot* be used by themselves. + +#### Spec + +Using the `--spec` option allows you to query queue statistics only for queues that exist in the spec file you provide. This is the same functionality as the `merlin status` command prior to the release of Merlin v1.12.0. + +**Usage:** + +```bash +merlin queue-info --spec +``` + +!!! example "Example Queue-Info Output Using the `--spec` Option" + + The below example will display queue information for all queues in the `hello.yaml` spec file. + +
+ ![Output of Queue-Info When Using the Specification Option](../../assets/images/monitoring/queues_and_workers/specification-option.png) +
Output of Queue-Info When Using the Specification Option
+
+ +#### Steps + +!!! warning + + This option *must* be used alongside the [`--spec`](#spec) option. + +If you'd like to see queue information for queues that are attached to specific steps in your workflow, use the `--steps` option. + +**Usage:** + +```bash +merlin queue-info --spec --steps +``` + +!!! example "Example Queue-Info Output Using the `--steps` Option" + + Say we have a spec file with steps named `step_1` through `step_4` and each step is attached to a different task queue `step_1_queue` through `step_4_queue` respectively. Using the `--steps` option for these two steps gives us: + +
+ ![Output of Queue-Info When Using the Steps Option](../../assets/images/monitoring/queues_and_workers/steps-option.png) +
Output of Queue-Info When Using the Steps Option
+
+ +#### Vars + +!!! warning + + This option *must* be used alongside the [`--spec`](#spec) option. + +The `--vars` option can be used to modify any variables defined in your spec file from the command line interface. This option can take a space-delimited list of variables and their assignments, and should be given after the input yaml file. + +**Usage:** + +```bash +merlin queue-info --spec --vars = +``` + +??? example "Example of Using `--vars` With Queue-Info" + + Say we have the following spec file with a variable `QUEUE_NAME` that's referenced in `step_2`: + + ```yaml title="vars_demo.yaml" hl_lines="7 22" + description: + name: vars_demo + description: a very simple merlin workflow + + env: + variables: + QUEUE_NAME: step_2_queue + + study: + - name: step_1 + description: say hello + run: + cmd: echo "hello!" + task_queue: step_1_queue + + - name: step_2 + description: print a success message + run: + cmd: print("Hurrah, we did it!") + depends: [step_1_*] + shell: /usr/bin/env python3 + task_queue: $(QUEUE_NAME) + ``` + + If we decided we wanted to modify this queue at the command line, we could accomplish this with the `--vars` option of the [`merlin run`](../command_line.md#run-merlin-run) and [`merlin run-workers`](../command_line.md#run-workers-merlin-run-workers) commands: + + === "Run" + + ```bash + merlin run vars_demo.yaml --vars QUEUE_NAME=new_queue_name + ``` + + === "Run Workers" + + ```bash + merlin run-workers vars_demo.yaml --vars QUEUE_NAME=new_queue_name + ``` + + Now if we were to try to query the queue information without using the same `--vars` argument: + + ```bash + merlin queue-info --spec vars_demo.yaml + ``` + + ...we see `step_1_queue` and `step_2_queue` but we wouldn't see `new_queue_name`: + +
+ ![Demo Output Showing The Missing 'new_queue_name' Queue](../../assets/images/monitoring/queues_and_workers/queue-info-no-vars.png) +
Demo Output Showing The Missing 'new_queue_name' Queue
+
+ + This is due to the fact that when we modify a variable from the command line, the original spec file is not changed. + + With that being said, let's now run this again but this time we'll use the `--vars` option: + + ```bash + merlin queue-info --spec vars_demo.yaml --vars QUEUE_NAME=new_queue_name + ``` + + ...which should show us a worker watching the `new_queue_name` queue: + +
+ ![Demo Output Showcasing 'new_queue_name' Appearing as Expected](../../assets/images/monitoring/queues_and_workers/queue-info-with-vars.png) +
Demo Output Showcasing 'new_queue_name' Appearing as Expected
+
+ +## Query Workers + +Merlin provides users with the [`merlin query-workers`](../command_line.md#query-workers-merlin-query-workers) command to help users see which workers are running and what task queues they're watching. + +This command will output content to a table format with two columns: workers and queues. The workers column will contain one connected worker per row. The queues column will contain a comma-delimited list of queues that the connected worker is watching. + +**Usage:** + +```bash +merlin query-workers +``` + +??? example "Example Query-Workers Output With No Connected Workers" + +
+ ![Output of Query-Workers When No Workers Are Connected](../../assets/images/monitoring/queues_and_workers/no-connected-workers.png) +
Output of Query-Workers When No Workers Are Connected
+
+ +??? example "Example Query-Workers Output With Connected Workers" + +
+ ![Output of Query-Workers When There Are Workers Connected](../../assets/images/monitoring/queues_and_workers/connected-workers.png) +
Output of Query-Workers When There Are Workers Connected
+
+ +### Query Workers by Spec File + +When utilizing the `--spec` option with the `query-workers` command, your query will be adjusted to exclusively search for the workers specified in the provided spec file. If none of the workers defined in the spec file have been started, a message indicating this will be displayed. + +**Usage:** + +```bash +merlin query-workers --spec +``` + +??? example "Example of Using the `--spec` Option With Query-Workers" + + For this example let's start with two spec files `demo_workflow.yaml` and `demo_workflow_2.yaml`. In `demo_workflow.yaml` we'll have two workers `trainer` and `predictor`, and in `demo_workflow_2.yaml` we'll have one worker `worker1`. + + === "demo_workflow.yaml" + + Here, the `trainer` worker is watching the `create_data` and `train` steps which both have task queues of the same name. Therefore, the `trainer` worker will be watching the `create_data` and `train` task queues. Similarly, the `predictor` worker will be watching the `predict` and `verify` task queues. + + ```yaml title="demo_workflow.yaml" hl_lines="10 16 22 28 33-38" + description: + name: demo_workflow + description: a very simple merlin workflow + + study: + - name: create_data + description: create data for our model to train with + run: + cmd: echo "creating data" + task_queue: create_data + + - name: train + description: train a model + run: + cmd: echo "training a model on the data" + task_queue: train + + - name: predict + description: predict on unseen data + run: + cmd: echo "predict on new, unseen data" + task_queue: predict + + - name: verify + description: verify the validity of our study + run: + cmd: echo "verify our study succeeded" + task_queue: verify + + merlin: + resources: + workers: + trainer: + args: -l INFO + steps: [create_data, train] + predictor: + args: -l INFO + steps: [predict, verify] + ``` + + === "demo_workflow_2.yaml" + + In this workflow, the `worker1` worker is assigned to both `step_1` and `step_2`. 
Therefore, this worker will be connected to both `step_1_queue` and `step_2_queue`. + + ```yaml title="demo_workflow_2.yaml" hl_lines="23 31 36-38" + description: + name: $(NAME) + description: a very simple merlin workflow + + env: + variables: + NAME: hello + OUTPUT_PATH: ./studies + + global.parameters: + GREET: + values : ["hello","hola"] + label : GREET.%% + WORLD: + values : ["world","mundo"] + label : WORLD.%% + + study: + - name: step_1 + description: say hello + run: + cmd: echo "$(GREET), $(WORLD)!" + task_queue: step_1_queue + + - name: step_2 + description: print a success message + run: + cmd: print("Hurrah, we did it!") + depends: [step_1_*] + shell: /usr/bin/env python3 + task_queue: step_2_queue + + merlin: + resources: + workers: + worker1: + args: -l INFO + steps: [all] + ``` + + Let's start these workers with: + + === "Start Workers for `demo_workflow.yaml`" + + ```bash + merlin run-workers demo_workflow.yaml + ``` + + === "Start Workers for `demo_workflow_2.yaml`" + + ```bash + merlin run-workers demo_workflow_2.yaml + ``` + + Now if we used the `query-workers` command without the `--spec` option, we'd see all three workers across both workflows: `trainer`, `predictor`, and `worker1`: + +
+ ![Workers Across Multiple Spec Files](../../assets/images/monitoring/queues_and_workers/query-workers-spec-all-workers.png) +
Example of Workers Queried Across Multiple Spec Files
+
+ + Great, but what if we wanted to see just the workers for `demo_workflow.yaml`? We can accomplish this by using the `--spec` option: + + ```bash + merlin query-workers --spec demo_workflow.yaml + ``` + + Now, we'll notice that the only workers being displayed are `trainer` and `predictor`: + +
+ ![Output of Query-Workers Using the Spec Option](../../assets/images/monitoring/queues_and_workers/query-workers-spec-option.png) +
Output of Query-Workers Using the Spec Option
+
+ +### Query Workers by Queues + +In Merlin, newly spawned workers are linked to task queues either as assigned by you or automatically designated if not specified. Utilizing the `--queues` option in the `query-workers` command enables you to query workers based on the queues to which they are connected. + +**Usage:** + +```bash +merlin query-workers --queues +``` + +??? example "Example of Using the `--queues` Option With Query-Workers" + + Say we have the below spec file with four workers `creator`, `trainer`, `predictor`, and `verifier` that are each attached to their respective steps/task queues. In other words, `creator` will be connected to the `create_data` task queue, `trainer` will be connected to the `train` task queue, etc.: + + ```yaml title="demo_workflow.yaml" hl_lines="33-44" + description: + name: demo_workflow + description: a very simple merlin workflow + + study: + - name: create_data + description: create data for our model to train with + run: + cmd: echo "creating data" + task_queue: create_data + + - name: train + description: train a model + run: + cmd: echo "training a model on the data" + task_queue: train + + - name: predict + description: predict on unseen data + run: + cmd: echo "predict on new, unseen data" + task_queue: predict + + - name: verify + description: verify the validity of our study + run: + cmd: echo "verify our study succeeded" + task_queue: verify + + merlin: + resources: + workers: + creator: + args: -l INFO + steps: [create_data] + trainer: + args: -l INFO + steps: [train] + predictor: + args: -l INFO + steps: [predict] + verifier: + args: -l INFO + steps: [verify] + ``` + + We can start these workers with: + + ```bash + merlin run-workers demo_workflow.yaml + ``` + + Now if we query the workers *without* the `--queues` option, we'll see all four workers alive and connected to their respective queues: + +
+ ![All Four Workers From 'demo_workflow.yaml' Being Queried](../../assets/images/monitoring/queues_and_workers/queues-example-all-workers.png) +
All Four Workers From 'demo_workflow.yaml' Being Queried
+
+ + Let's refine this query to just view the workers connected to the `train` and `predict` queues: + + ```bash + merlin query-workers --queues train predict + ``` + + As we can see in the output below, only the `trainer` and `predictor` workers are now displayed: + +
+ ![Output of Query-Workers Using the Queues Option](../../assets/images/monitoring/queues_and_workers/queues-example-filtered-workers.png) +
Output of Query-Workers Using the Queues Option
+
+ +### Query Workers by Worker Regex + +There will be instances when you know precisely which workers you want to query. In such cases, the `--workers` option in the `query-workers` command proves useful. This option facilitates querying workers using [regular expressions](https://docs.python.org/3/library/re.html). As full strings are accepted as regular expressions, you can also query workers by worker name. + +**Usage:** + +```bash +merlin query-workers --workers +``` + +??? example "Example of Using the `--workers` Option With Query-Workers" + + Say we have the following spec file with 3 workers `step_1_worker`, `step_2_worker`, and `other_worker`: + + ```yaml title="demo_workflow.yaml" hl_lines="27-35" + description: + name: demo_workflow + description: a very simple merlin workflow + + study: + - name: step_1 + description: A step following the `step_*` name pattern + run: + cmd: echo "step 1" + task_queue: step_1_queue + + - name: step_2 + description: A step following the `step_*` name pattern + run: + cmd: echo "step 2" + task_queue: step_2_queue + + - name: other_step + description: A step with a different name + run: + cmd: echo "other step" + task_queue: predict + + merlin: + resources: + workers: + step_1_worker: + args: -l INFO + steps: [step_1] + step_2_worker: + args: -l INFO + steps: [step_2] + other_worker: + args: -l INFO + steps: [other_step] + ``` + + Once the workers are started for this workflow, we can then query for our `step_1_worker` and `step_2_worker` with: + + ```bash + merlin query-workers --workers step_1_worker step_2_worker + ``` + + In our output we see that both workers that we asked for were queried but `other_worker` was ignored: + +
+ ![Output of Query-Workers Using the Workers Option With Worker Names](../../assets/images/monitoring/queues_and_workers/workers-option-with-worker-names.png) +
Output of Query-Workers Using the Workers Option With Worker Names
+
+ + Alternatively, we can do the exact same query using a regular expression: + + ```bash + merlin query-workers --workers ^step + ``` + + The `^` operator for regular expressions will match the beginning of a string. In this example when we say `^step` we're asking Merlin to match any worker starting with the word `step`, which in this case is `step_1_worker` and `step_2_worker`. We can see this in the output below: + +
+ ![Output of Query-Workers Using the Workers Option With RegEx](../../assets/images/monitoring/queues_and_workers/workers-option-with-regex.png) +
Output of Query-Workers Using the Workers Option With RegEx
+
diff --git a/docs/user_guide/monitoring/status_cmds.md b/docs/user_guide/monitoring/status_cmds.md new file mode 100644 index 000000000..cb0088467 --- /dev/null +++ b/docs/user_guide/monitoring/status_cmds.md @@ -0,0 +1,702 @@ +# The Status Commands + +Monitoring the status of your studies is made accessible with two commands in Merlin: [`merlin status`](../command_line.md#status-merlin-status) and [`merlin detailed-status`](../command_line.md#detailed-status-merlin-detailed-status). [The Status Command](#the-status-command) offers a summary of your entire study's status, while [The Detailed-Status Command](#the-detailed-status-command) provides task-specific information that can be further filtered as needed. + +## How They Work + +As your study is progressing, Merlin will create `MERLIN_STATUS.json` files for each step behind the scenes. When states are modified, so are the +`MERLIN_STATUS.json` files. The `merlin status` and `merlin detailed-status` commands will read from these files and format the output in an easy-to-analyze manner. + +For steps that contain samples, a `MERLIN_STATUS.json` file is created for each sample that's ran. As sets of samples finish running, Merlin will condense all of the `MERLIN_STATUS.json` files in that set of samples into one `MERLIN_STATUS.json` file. This helps to save space in the file system for all of the other important outputs that your studies provide. + +The format of a `MERLIN_STATUS.json` file is as follows: + +```json +{ + "step_name": { + "parameters": { + "cmd": { + "TOKEN1": "value1", + "TOKEN2": "value2", + . + . + . + }, + "restart": { + "TOKEN1": "value1", + "TOKEN2": "value2", + . + . + . + } + }, + "task_queue": "name_of_queue", // (1) + "worker_name": "name_of_worker", + "step_workspace": { + "status": "", // (2) + "return_code": "", // (3) + "elapsed_time": "xd:xxh:xxm:xxs", + "run_time": "xd:xxh:xxm:xxs", + "restarts": + } + } +} +``` + +1. 
If you [run your study locally](../running_studies.md#local-runs), there will not be any entries for `task_queue` and `worker_name`. +2. See [Possible Statuses](#possible-statuses) below for more information. +3. See [Possible Return Codes](#possible-return-codes) below for more information. + +In the `parameters` section here, the `cmd` parameters are parameters used in the `cmd` key of [the `run` property](../specification.md#the-run-property) in a step, and the `restart` parameters are parameters +used in the `restart` key of the `run` property in a step. Both of these values may be null if no parameters are used in either key. + +If your step uses samples, a "step_workspace" entry for each sample will be created. In other words, you will have multiple "step_workspace" entries of the form "step_workspace/00", "step_workspace/01", "step_workspace/02", etc. + +!!! example "`MERLIN_STATUS.json` Format With Samples" + + Say we have a workflow that generates 3 samples and a step named `just_samples` that utilizes them. This would result in a `MERLIN_STATUS.json` file that looks like so: + + ```json + { + "just_samples": { + "parameters": { + "cmd": null, + "restart": null + }, + "task_queue": "just_samples_queue", + "worker_name": "sample_worker", + "just_samples/00": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:00m:00s", + "run_time": "0d:00h:00m:00s", + "restarts": 0 + }, + "just_samples/01": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:00m:00s", + "run_time": "0d:00h:00m:00s", + "restarts": 0 + }, + "just_samples/02": { + "status": "FINISHED", + "return_code": "MERLIN_SUCCESS", + "elapsed_time": "0d:00h:00m:00s", + "run_time": "0d:00h:00m:00s", + "restarts": 0 + } + } + } + ``` + +## Possible Statuses + +!!! note + + The `INITIALIZED` and `RUNNING` states do not have a colorblind symbol since they do not appear in the progress bar, just in the summary section of a step. + +!!! 
note + + Colors here are chosen based on [Bang Wong's optimized color palette for color-blind individuals](https://www.nature.com/articles/nmeth.1618). + +During the execution process of your step, there are 7 possible statuses that a task may hold: + +| Status | Description | Color | Colorblind Symbol | +| ------ | ----------- | ----- | ----------------- | +| `INITIALIZED` | Tasks in the `INITIALIZED` state have been queued but have not begun processing yet |
light-blue
#56b3e9
| N/A | +| `RUNNING` | Tasks in the `RUNNING` state have begun executing but have not yet entered a completed state ("completed states" are any of the states listed below) |
blue
#0072b2
| N/A | +| `FINISHED` | Tasks in the `FINISHED` state have successfully run to completion |
green
#009e74
| █ | +| `CANCELLED` | Tasks in the `CANCELLED` state have exited with the `$(MERLIN_STOP_WORKERS)` exit code (see [Step Return Variables](../variables.md#step-return-variables)) |
yellow
#f0e442
| / | +| `FAILED` | Tasks in the `FAILED` state have exited with the `$(MERLIN_SOFT_FAIL)` or `$(MERLIN_HARD_FAIL)` exit codes (see [Step Return Variables](../variables.md#step-return-variables)) |
red
#d55e00
| ⣿ | +| `DRY_RUN` | Tasks in the `DRY_RUN` state have successfully run in the [Dry Run](../running_studies.md#dry-runs) mode |
orange
#e69f00
| \\ | +| `UNKNOWN` | Tasks in the `UNKNOWN` state have exited with an unknown return code |
grey
#666666
| ? | + +## Possible Return Codes + +!!! note + + The return codes here follow almost directly from the [Step Return Variables](../variables.md#step-return-variables). Only the `MERLIN_DRY_SUCCESS` and `MERLIN_UNRECOGNIZED` return codes are unique here. + +Once a task has completed running, there are 8 possible return codes that it could have: + +| Return Code | Description | +| ----------- | ----------- | +| `MERLIN_SUCCESS` | This task finished successfully | +| `MERLIN_SOFT_FAIL` | This task failed but not badly enough to stop the workflow entirely | +| `MERLIN_HARD_FAIL` | This task failed and we now need to stop the entire workflow | +| `MERLIN_RESTART` | This task needs to be restarted; Next run will run the "restart" command in the step unless it's undefined | +| `MERLIN_RETRY` | This task needs to be retried; Will automatically re-run the "cmd" for the step | +| `MERLIN_STOP_WORKERS` | This task exited with a code to stop the workers | +| `MERLIN_DRY_SUCCESS` | This task successfully completed in the [Dry Run](../running_studies.md#dry-runs) mode | +| `MERLIN_UNRECOGNIZED` | This task finished with an unrecognized return code | + +## Inputs + +Both the [`merlin status`](../command_line.md#status-merlin-status) and the [`merlin detailed-status`](../command_line.md#detailed-status-merlin-detailed-status) commands can take either a yaml spec file or an output workspace as input. + +=== "Status Usage" + + ```bash + merlin status + ``` + +=== "Detailed-Status Usage" + + ```bash + merlin detailed-status + ``` + +!!! example + + Say we have a spec file `hello_world.yaml` that we've run which created an output workspace `hello_world_20230503-105137/`. We can check the status of our study with either option.
+ + === "Using the Spec File" + + ```bash + merlin status hello_world.yaml + ``` + + === "Using the Output Workspace" + + ```bash + merlin status hello_world_20230503-105137/ + ``` + +If you choose to provide an output study directory as input, Merlin will pull information from the expanded spec file located in the +`merlin_info/` directory. + +!!! tip + + If you're not familiar with the `merlin_info/` directory, check out [The Basics of Interpreting Output](../interpreting_output.md#the-basics) to learn more. + +If you choose to provide a spec file as input, Merlin will search the `OUTPUT_PATH` (see [Reserved Variables](../variables.md#reserved-variables) for more info on this variable) for possible studies associated with this spec. If there are multiple output directories associated with your spec file, then you will be prompted to select which study you'd like to view the status of, as is shown in the figure below. + +
+ ![Prompt when multiple studies are found](../../assets/images/monitoring/status_cmds/multiple-studies.png) +
Prompt When Multiple Studies Are Found
+
+ +If you'd like to ignore this prompt, you can use the `--no-prompts` option. This will automatically select the most recent study for you. + +**Usage:** + +```bash +merlin status --no-prompts +``` + +## The Status Command + +The [`merlin status`](../command_line.md#status-merlin-status) command is designed to help you see the overall status of your entire study. It provides you a step-by-step view with progress bars and summary tables that will show you exactly how many tasks are in each state, how many tasks in total there are for a step, the average & standard deviation of the run times for tasks in a step, and which workers & task queues each step is associated with (if applicable). + +This command requires you to select a study to view the status of. For more information on inputs to this command see [Inputs](#inputs) above. + +**Usage:** + +```bash +merlin status +``` + +??? example "Example Status Output" + +
+ ![Output of Status Command](../../assets/images/monitoring/status_cmds/status.png) +
Output of Status Command
+
+ +To help assist with colorblindness, Merlin provides the `--cb-help` option for the status command. This option will add symbols to the progress bar for different task statuses. + +**Usage:** + +```bash +merlin status --cb-help +``` + +??? example "Example Colorblind-Assisted Status Output" + +
+ ![Output of Status Command With Colorblind Assistance Enabled](../../assets/images/monitoring/status_cmds/cb-help.png) +
Output of Status Command With Colorblind Assistance Enabled
+
+ +## The Detailed-Status Command + +The [`merlin detailed-status`](../command_line.md#detailed-status-merlin-detailed-status) command is designed to help you see an in-depth status breakdown of each step in your study. It provides a task-by-task view with each task's workspace, status, return code, elapsed time, run time, and number of restarts available for you to see. + +This command requires you to select a study to view the status of. For more information on inputs to this command see [Inputs](#inputs) above. + +!!! note "Manpager Note" + + If the output of the detailed-status command looks something like this: + +
+ ![ASCII Error With Pager Functionality](../../assets/images/monitoring/status_cmds/ascii-error.png) +
ASCII Error With Pager Functionality
+
+ + Then there are a couple things you can try to fix this problem: + + 1. Set the `MANPAGER` or `PAGER` environment variable to be "less -r" and run again + + ```bash + export MANPAGER="less -r" + ``` + + 2. If the error isn't fixed after 1. above, then: + + a. You can disable the theme with the `--disable-theme` option (see [Disable Theme](#disable-theme) below). + + b. If you'd rather not disable the theme, the error usually stems from using the pager functionality, so you can try disabling that with the `--disable-pager` option (see [Disable Pager](#disable-pager) below). **Caution:** you may end up outputting a lot of information to the shell all at once when using this option. + +By default, the `merlin detailed-status` command will pull up a pager window containing the status information that was requested. Merlin uses this pager functionality to ensure we don't overload the shell by displaying too many task statuses at one time. + +**Usage:** + +```bash +merlin detailed-status +``` + +??? example "Example Detailed-Status Output Inside The Pager" + +
+ ![The output of detailed-status inside the pager](../../assets/images/monitoring/status_cmds/inside-pager.png) +
Output of Detailed-Status Inside the Pager
+
+ +To see all of the options that can be used with the pager, press `h`. To exit the pager, press `q`. + +Once you close the pager, the statuses you requested will no longer be displayed and you'll be returned to your normal shell view. + +??? example "Example Detailed-Status Output Once Pager Is Closed" + +
+ ![The output of detailed-status outside the pager](../../assets/images/monitoring/status_cmds/outside-pager.png) +
Output of Detailed-Status When the Pager Is Closed
+
+ +### Display Options + +The `merlin detailed-status` command comes equipped with four options to help modify the display output: [`--disable-pager`](#disable-pager), [`--disable-theme`](#disable-theme), [`--layout`](#layout), and [`--no-prompts`](#no-prompts). These options can all be used together or by themselves. + +#### Disable Pager + +!!! warning + + The `--disable-pager` option could cause you to unintentionally output thousands of task statuses to stdout, which may overload the shell with output. Merlin tries to help prevent this with prompts for additional filters but you should still use caution. + +The `--disable-pager` option allows you to turn off the pager functionality that is on by default. This will redirect the detailed-status output to stdout rather than the pager. + +**Usage:** + +```bash +merlin detailed-status --disable-pager +``` + +??? example "Example Detailed-Status Output With Pager Disabled" + +
+ ![The output of detailed-status with the pager disabled](../../assets/images/monitoring/status_cmds/disable-pager.png) +
Output of Detailed-Status With Pager Disabled
+
+ +When using this option you may unintentionally output a lot of information to the shell which could cause problems. To help prevent issues from too much information being printed, Merlin will prompt you to further filter your output if there are more than 250 task statuses to display: + +
+ ![Prompt Displayed When Too Many Tasks Are Found With Disable-Pager](../../assets/images/monitoring/status_cmds/prompt.png) +
Prompt Displayed When Too Many Tasks Are Found With Disable-Pager
+
+ +If you'd like to cancel the filter and the display entirely, press `c`. If you don't wish to filter and just want to display your tasks, press `n` (again, _**be cautious**_ when using this option). If you decide that filtering your tasks would be a better option, press `y` and you'll see the following filter options: + +
+ ![The prompt asking for you to provide filters](../../assets/images/monitoring/status_cmds/filter-prompt.png) +
Prompt Asking For You To Provide Filters
+
+ +!!! note + + If you put `E` or `EXIT` anywhere in the prompt, no filters will be applied and you'll be returned to the original prompt. For example, entering `FAILED, E, CANCELLED` will return you to the original prompt without filtering anything. + + +Here, the filters are equivalent to certain [Filter Options](#filter-options): + +- Limiting the number of tasks to display = [`--max-tasks`](#max-tasks) +- Filtering by status = [`--task-status`](#task-status) +- Filtering by return code = [`--return-code`](#return-code) + +It's possible to combine different filter types here. For example, a valid filter could be `FAILED, STOP_WORKERS` which would show any tasks with a `FAILED` status *and* any tasks with a `STOP_WORKERS` return code. + +If you put `MAX_TASKS` anywhere in your filter, you'll receive another prompt asking you for an integer greater than 0 to set as the limit on the number of tasks to display: + +
+ ![The prompt asking you to provide a max-tasks limit](../../assets/images/monitoring/status_cmds/max-tasks-prompt.png) +
Prompt Asking For You To Provide A Max-Tasks Limit
+
+ +#### Disable Theme + +The `--disable-theme` option allows you to disable the color scheme used in the output of the detailed-status command. + +**Usage:** + +```bash +merlin detailed-status --disable-theme +``` + +!!! example "Example Detailed-Status Output With Theme Disabled" + +
+ ![The output of detailed-status with the theme disabled](../../assets/images/monitoring/status_cmds/disable-theme.png) +
Output of Detailed-Status With Theme Disabled
+
+ +#### Layout + +By default, the `merlin detailed-status` command displays tasks on a step-by-step basis. To change this and group all of the tasks together, you can use the `--layout table` option. + +**Usage:** + +```bash +merlin detailed-status --layout +``` + +!!! example "Example Detailed-Status Output With Table Layout" + +
+ ![The output of detailed-status with the table layout](../../assets/images/monitoring/status_cmds/layout-table.png) +
Output of Detailed-Status With Table Layout
+
+ +#### No Prompts + +!!! warning + + _**Be cautious**_ when using this option with the `--disable-pager` option. You may accidentally output thousands of task statuses to the shell. + +The `--no-prompts` option disables any prompts that might be displayed while using the `detailed-status` command. There are four possible ways to use this option, each with a slightly different result: + +- **Used with a workspace as input:** Nothing will happen here as there will be no prompts asking to select a study and no prompts asking you to filter tasks +- **Used with a spec as input:** Instead of prompting you to select a study, the most recent study will automatically be selected +- **Used with a workspace as input and the `--disable-pager` option enabled:** Any prompt that may have been displayed asking you to filter your output will be ignored +- **Used with a spec as input and the `--disable-pager` option enabled:** The most recent study will automatically be selected and any prompt that may have been displayed asking you to filter your output will be ignored + +**Usage:** + +```bash +merlin detailed-status --no-prompts +``` + +### Filter Options + +There are six filter options with the detailed-status command: [`--max-tasks`](#max-tasks), [`--return-code`](#return-code), [`--steps`](#steps), [`--task-queues`](#task-queues), [`--task-status`](#task-status), and [`--workers`](#workers). These filters can be used together or by themselves. + +!!! note + + In the images that are in the sections below, the `--disable-pager` option is used. This is simply to show all of the output of the detailed-status filters in one place. It is *not* required (nor is it recommended) when using the filter options. + + If you don't use the `--disable-pager` option but you get strange ASCII characters in the output of the pager, see the ["Manpager Note" above](#the-detailed-status-command) for instructions on how to fix that. 
+ +#### Max Tasks + +The `--max-tasks` filter allows you to limit how many tasks are displayed in the output. This filter takes in an integer as input which represents the maximum number of tasks you'd like to display. + +**Usage:** + +```bash +merlin detailed-status --max-tasks +``` + +!!! example "Example Detailed-Status Output With Max Tasks Filter" + + Here, we're setting the maximum number of tasks that can be displayed to 5: + +
+ ![The output of detailed-status with the max tasks filter](../../assets/images/monitoring/status_cmds/filter-max-tasks.png) +
Output of Detailed-Status With Max-Tasks Set to 5
+
+ +#### Return Code + +The `--return-code` filter allows you to search for tasks with a certain return code. This filter can take one or more [return codes](#possible-return-codes) as input. Valid inputs include: `SUCCESS`, `SOFT_FAIL`, `HARD_FAIL`, `STOP_WORKERS`, `RESTART`, `RETRY`, `DRY_SUCCESS`, and `UNRECOGNIZED`. + +**Usage:** + +```bash +merlin detailed-status --return-code +``` + +!!! example "Example Detailed-Status Output With Return Code Filter" + + Here, we're asking to see all tasks that completed with a `SOFT_FAIL` return code: + +
+ ![The output of detailed-status with the return code filter](../../assets/images/monitoring/status_cmds/filter-return-code.png) +
Output of Detailed-Status With Return-Code Set to 'SOFT_FAIL'
+
+ +#### Steps + +The `--steps` filter allows you to view status information about specific steps. This flag can take one or multiple steps as input. If a step provided cannot be found, that step will be removed from the filter. + +**Usage:** + +```bash +merlin detailed-status --steps +``` + +!!! example "Example Detailed-Status Output With Steps Filter" + + Here, we're asking to see all task statuses from the `just_samples` and `fail_step` steps: + +
+ ![The output of detailed-status with the steps filter](../../assets/images/monitoring/status_cmds/filter-steps.png) +
Output of Detailed-Status With The Steps Filter Set to 'just_samples' and 'fail_step'
+
+ +#### Task Queues + +The `--task-queues` filter allows you to view statuses of tasks in certain task queues. This filter can take one or more queues as input. If a queue provided cannot be found, that queue will be removed from the filter. + +**Usage:** + +```bash +merlin detailed-status --task-queues +``` + +!!! example "Example Detailed-Status Output With Task Queues Filter" + + Here, we're asking to see all task statuses of tasks in the `just_parameters_queue` and `cancel_queue` queues: + +
+ ![The output of detailed-status with the task-queues filter](../../assets/images/monitoring/status_cmds/filter-task-queues.png) +
Output of Detailed-Status With The Task-Queues Filter Set to 'just_parameters_queue' and 'cancel_queue'
+
+ +#### Task Status + +The `--task-status` filter allows you to search for tasks with a certain status. This filter can take one or more statuses as input. Valid inputs include: `INITIALIZED`, `RUNNING`, `FINISHED`, `FAILED`, `CANCELLED`, `DRY_RUN`, and `UNKNOWN`. + +**Usage:** + +```bash +merlin detailed-status --task-status +``` + +!!! example "Example Detailed-Status Output With Task Status Filter" + + Here, we're asking to see all task statuses that have a `FAILED` or `UNKNOWN` status: + +
+ ![The output of detailed-status with the task-status filter](../../assets/images/monitoring/status_cmds/filter-task-status.png) +
Output of Detailed-Status With The Task-Status Filter Set to 'FAILED' and 'UNKNOWN'
+
+ +#### Workers + +The `--workers` filter allows you to search for tasks that are being run or were run by certain Celery workers. This filter can take one or more worker names as input. If a worker provided cannot be found, that worker will be removed from the filter. + +**Usage:** + +```bash +merlin detailed-status --workers +``` + +!!! example "Example Detailed-Status Output With Workers Filter" + + Here, we're asking to see all task statuses for tasks run by the `sample_worker` worker: + +
+ ![The output of detailed-status with the workers filter](../../assets/images/monitoring/status_cmds/filter-workers.png) +
Output of Detailed-Status With The Workers Filter Set to 'sample_worker'
+
+ +## Dumping Status Info to Output Files + +Both status commands in Merlin allow you to dump to an output file. This output file must be either a `.csv` or a `.json` file. + +=== "JSON Dump" + + ```bash + merlin status --dump status_report.json + ``` + +=== "CSV Dump" + + ```bash + merlin status --dump status_report.csv + ``` + +When dumping to a file that *does not* yet exist, Merlin will create that file for you and populate it with the requested status info. + +When dumping to a file that *does* exist, Merlin will append the requested status information to that file. You can differentiate between separate dump calls by looking at the timestamps of the dumps. For CSV files this timestamp exists in the `time_of_status` column (see [Status CSV Dump Format](#csv-dump-format) below) and for JSON files this timestamp will be the top level key to the status entry (see [Status JSON Dump Format](#json-dump-format) below). + +If you use the `--dump` option with `merlin detailed-status` and *don't* provide any filters, this will provide the same output in the file you're dumping to as it would if you used `--dump` with `merlin status`. + +If you use the `--dump` option with `merlin detailed-status` and you *do* provide filters, only the statuses that match your filters will be written to the dump file. + +### CSV Dump Format + +The format of a CSV dump file for statuses is as follows: + +```bash +time_of_status,step_name,step_workspace,status,return_code,elapsed_time,run_time,restarts,cmd_parameters,restart_parameters,task_queue,worker_name +``` + +The image below shows an example of dumping the status info of tasks with `FAILED` task statuses to a CSV file, and then displaying that CSV file using the [rich-cli library](https://github.com/Textualize/rich-cli): + +
+ ![Example of dumping to a csv file and outputting its contents](../../assets/images/monitoring/status_cmds/dump-csv.png) +
An Example Showcasing How to do a Filtered CSV Dump and View its Contents
+
+ +### JSON Dump Format + +The format of a JSON dump file for statuses is almost exactly the same as the [format of the `MERLIN_STATUS.json` files](#how-they-work). The only difference is that each entry begins with a date: + +```json +{ + "YYYY-MM-DD HH:MM:SS": { + "step_name": { + "parameters": { + "cmd": { + "TOKEN1": "value1", + "TOKEN2": "value2", + "etc": "etc" + }, + "restart": { + "TOKEN1": "value1", + "TOKEN2": "value2", + "etc": "etc" + } + }, + "task_queue": "name_of_queue", + "worker_name": "name_of_worker", + "step_workspace": { + "status": "", + "return_code": "", + "elapsed_time": "xd:xxh:xxm:xxs", + "run_time": "xd:xxh:xxm:xxs", + "restarts": + } + } + } +} +``` + +The image below shows an example of dumping the status info of tasks with `FAILED` task statuses to a JSON file, and then displaying that JSON file using the [rich-cli library](https://github.com/Textualize/rich-cli): + +
+ ![Example of dumping to a json file and outputting its contents](../../assets/images/monitoring/status_cmds/dump-json.png) +
An Example Showcasing How to do a Filtered JSON Dump and View its Contents
+
+ +## Output Path and Task Server Options + +The `--output-path` (or `-o` for short) option allows users to specify a new output path to search for studies in. This option is useful *only* when a spec is provided as the input. If this option is used when an output workspace is passed as input then it will be ignored. + +**Usage:** + +```bash +merlin status --output-path +``` + +??? example "Example Usage of the `--output-path` Option" + + Say we have the following study with an `OUTPUT_PATH` variable defined to be the current working directory: + + ```yaml title="hello_samples.yaml" hl_lines="8" + description: + name: hello_samples + description: a very simple merlin workflow, with samples + + env: + variables: + N_SAMPLES: 3 + OUTPUT_PATH: . + + global.parameters: + GREET: + values : ["hello","hola"] + label : GREET.%% + + study: + - name: step_1 + description: say hello + run: + cmd: | + echo "$(GREET), $(NAME)!" + + - name: step_2 + description: print a success message + run: + cmd: print("Hurrah, we did it!") + depends: [step_1_*] + shell: /usr/bin/env python3 + + merlin: + resources: + workers: + demo_worker: + args: -l INFO --concurrency=1 + steps: [all] + samples: + generate: + cmd: python3 $(SPECROOT)/make_samples.py --filepath=$(MERLIN_INFO)/samples.csv --number=$(N_SAMPLES) + file: $(MERLIN_INFO)/samples.csv + column_labels: [NAME] + ``` + + Running this normally with: + + ```bash + merlin run hello_samples.yaml + ``` + + creates a `hello_samples_/` output workspace in the current working directory. However, if we ran this with: + + ```bash + merlin run hello_samples.yaml --vars OUTPUT_PATH=./studies + ``` + + This would create a `hello_samples_/` output workspace in a directory called `studies/` *without* modifying the original `hello_samples.yaml` file. + + In other words, we'd now have the following directory structure in our current working directory: + + ```bash + current_working_dir/ + ├── hello_samples.yaml + ├── hello_samples_/ + │ │ . + │ │ . 
+ │ └── . + └── studies/ + └── hello_samples_/ + │ . + │ . + └── . + ``` + + Now, let's say we check the status of a study using the `hello_samples.yaml` spec as input: + + ```bash + merlin status hello_samples.yaml + ``` + + Since our original spec file was never modified, the `OUTPUT_PATH` variable there will still point to the current working directory. Therefore, the above command will look in the current working directory for studies. The output workspace located in the `studies/` directory will not be seen. + + If we'd like to see the status of the output workspace in the `studies` directory, we can use the `--output-path` option: + + ```bash + merlin status hello_samples.yaml --output-path ./studies + ``` + + This will tell the status command to look in the `studies/` directory for potential output workspaces associated with the `hello_samples.yaml` script. + +Additionally, to modify the task server from the command line you can use the `--task-server` option. However, the only currently available +option for task server is celery so you most likely will not want to use this option. + +**Usage:** + +```bash +merlin status --task-server +``` diff --git a/merlin/main.py b/merlin/main.py index e093750d5..5ad7bae9f 100644 --- a/merlin/main.py +++ b/merlin/main.py @@ -291,7 +291,7 @@ def query_queues(args): # Ensure a supported file type is provided with the dump option if args.dump is not None: - if not args.dump.endswith(".json") or not args.dump.endswith(".csv"): + if not args.dump.endswith(".json") and not args.dump.endswith(".csv"): raise ValueError("Unsupported file type. 
Dump files must be either '.json' or '.csv'.") spec = None diff --git a/merlin/router.py b/merlin/router.py index b07b46428..3b322aafc 100644 --- a/merlin/router.py +++ b/merlin/router.py @@ -335,7 +335,7 @@ def check_merlin_status(args: "Namespace", spec: "MerlinSpec") -> bool: # noqa total_jobs += queue_info["jobs"] # Get the queues defined in the spec - queues_in_spec = spec.get_queue_list(["all"]) + queues_in_spec = spec.get_queue_list(["all"] if args.steps is None else args.steps) LOG.debug(f"Monitor: queues_in_spec: {queues_in_spec}") # Get the active queues and the workers that are watching them diff --git a/merlin/study/celeryadapter.py b/merlin/study/celeryadapter.py index e0059dde2..94abcba39 100644 --- a/merlin/study/celeryadapter.py +++ b/merlin/study/celeryadapter.py @@ -288,9 +288,9 @@ def build_csv_queue_info(query_return: List[Tuple[str, int, int]], date: str) -> """ # Build the list of labels if necessary csv_to_dump = {"time": [date]} - for name, jobs, consumers in query_return: - csv_to_dump[f"{name}:tasks"] = [str(jobs)] - csv_to_dump[f"{name}:consumers"] = [str(consumers)] + for queue_name, queue_stats in query_return.items(): + csv_to_dump[f"{queue_name}:tasks"] = [str(queue_stats["jobs"])] + csv_to_dump[f"{queue_name}:consumers"] = [str(queue_stats["consumers"])] return csv_to_dump @@ -307,8 +307,8 @@ def build_json_queue_info(query_return: List[Tuple[str, int, int]], date: str) - json_to_dump = {date: {}} # Add info for each queue (name) - for name, jobs, consumers in query_return: - json_to_dump[date][name] = {"tasks": jobs, "consumers": consumers} + for queue_name, queue_stats in query_return.items(): + json_to_dump[date][queue_name] = {"tasks": queue_stats["jobs"], "consumers": queue_stats["consumers"]} return json_to_dump diff --git a/mkdocs.yml b/mkdocs.yml index 5843f9d0a..dab8ab114 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -3,7 +3,7 @@ site_name: Merlin nav: - Merlin: "index.md" - Tutorial: - - Overview: "tutorial/index.md" + - 
Tutorial Overview: "tutorial/index.md" - 0. Prerequisites: "tutorial/0_prerequisites.md" - 1. Introduction: "tutorial/1_introduction.md" - 2. Installation: "tutorial/2_installation.md" @@ -13,22 +13,27 @@ nav: - 6. Contribute to Merlin: "tutorial/6_contribute.md" - 7. Port Your Own Application: "tutorial/7_port_application.md" - User Guide: - - Overview: "user_guide/index.md" + - User Guide Overview: "user_guide/index.md" - Installation: "user_guide/installation.md" - Configuration: - - Overview: "user_guide/configuration/index.md" + - Configuration Overview: "user_guide/configuration/index.md" - External Server: "user_guide/configuration/external_server.md" - Merlin Server: "user_guide/configuration/merlin_server.md" - Command Line Interface: "user_guide/command_line.md" - Specification: "user_guide/specification.md" - Variables: "user_guide/variables.md" - Running Studies: "user_guide/running_studies.md" - - Interpreting Output: "user_guide/interpreting_output.md" + - Interpreting Study Output: "user_guide/interpreting_output.md" + - Monitoring Studies: + - Monitoring Overview: "user_guide/monitoring/index.md" + - The Status Commands: "user_guide/monitoring/status_cmds.md" + - Querying Queues and Workers: "user_guide/monitoring/queues_and_workers.md" + - Monitoring Studies For Persistent Allocations: "user_guide/monitoring/monitor_for_allocation.md" - Celery: "user_guide/celery.md" - Docker: "user_guide/docker.md" - Contributing: "user_guide/contributing.md" - Examples: - - Overview: "examples/index.md" + - Examples Overview: "examples/index.md" - Hello World Examples: "examples/hello.md" - Feature Demo: "examples/feature_demo.md" - Iterative Demo: "examples/iterative.md" diff --git a/tests/unit/study/test_celeryadapter.py b/tests/unit/study/test_celeryadapter.py index 399544223..2cc16de6d 100644 --- a/tests/unit/study/test_celeryadapter.py +++ b/tests/unit/study/test_celeryadapter.py @@ -401,9 +401,9 @@ def test_build_csv_queue_info(self, worker_queue_map: 
Dict[str, str]): expected_output = {"time": [date]} # Build the fake query return and the expected output - query_return = [] + query_return = {} for queue in worker_queue_map.values(): - query_return.append((queue, 0, 1)) + query_return[queue] = {"consumers": 1, "jobs": 0} expected_output[f"{queue}:tasks"] = ["0"] expected_output[f"{queue}:consumers"] = ["1"] @@ -426,9 +426,9 @@ def test_build_json_queue_info(self, worker_queue_map: Dict[str, str]): expected_output = {date: {}} # Build the fake query return and the expected output - query_return = [] + query_return = {} for queue in worker_queue_map.values(): - query_return.append((queue, 0, 1)) + query_return[queue] = {"consumers": 1, "jobs": 0} expected_output[date][queue] = {"tasks": 0, "consumers": 1} # Run the test @@ -448,9 +448,9 @@ def test_dump_celery_queue_info_csv(self, worker_queue_map: Dict[str, str]): expected_output = {} # Build the fake query return - query_return = [] + query_return = {} for queue in worker_queue_map.values(): - query_return.append((queue, 0, 1)) + query_return[queue] = {"consumers": 1, "jobs": 0} expected_output[f"{queue}:tasks"] = ["0"] expected_output[f"{queue}:consumers"] = ["1"] @@ -496,9 +496,9 @@ def test_dump_celery_queue_info_json(self, worker_queue_map: Dict[str, str]): expected_output = {} # Build the fake query return - query_return = [] + query_return = {} for queue in worker_queue_map.values(): - query_return.append((queue, 0, 1)) + query_return[queue] = {"consumers": 1, "jobs": 0} expected_output[queue] = {"tasks": 0, "consumers": 1} # Run the test