diff --git a/plugins/hydra_submitit_launcher/example/config.yaml b/plugins/hydra_submitit_launcher/example/config.yaml index 2dbd4efac97..abb6cb85f91 100644 --- a/plugins/hydra_submitit_launcher/example/config.yaml +++ b/plugins/hydra_submitit_launcher/example/config.yaml @@ -1,4 +1,4 @@ defaults: - - hydra/launcher: submitit + - hydra/launcher: submitit_slurm task: 1 diff --git a/plugins/hydra_submitit_launcher/hydra_plugins/hydra_submitit_launcher/config.py b/plugins/hydra_submitit_launcher/hydra_plugins/hydra_submitit_launcher/config.py index 7493ebe1dcb..93f61684b83 100644 --- a/plugins/hydra_submitit_launcher/hydra_plugins/hydra_submitit_launcher/config.py +++ b/plugins/hydra_submitit_launcher/hydra_plugins/hydra_submitit_launcher/config.py @@ -1,71 +1,17 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved from dataclasses import dataclass -from enum import Enum from typing import Optional from hydra.core.config_store import ConfigStore from hydra.types import ObjectConf -class QueueType(Enum): - auto = "auto" - local = "local" - slurm = "slurm" - - @dataclass -class SlurmQueueConf: - # Params are used to configure sbatch, for more info check: - # https://github.com/facebookincubator/submitit/blob/master/submitit/slurm/slurm.py - - # maximum time for the job in minutes - time: int = 60 - # number of cpus to use for each task - cpus_per_task: int = 10 - # number of gpus to use on each node - gpus_per_node: int = 1 - # number of tasks to spawn on each node - ntasks_per_node: int = 1 - # number of nodes to use for the job - nodes: int = 1 - # memory to reserve for the job on each node, in GB - mem: str = "${hydra.launcher.mem_limit}GB" - # slurm partition to use on the cluster - partition: Optional[str] = None - # USR1 signal delay before timeout - signal_delay_s: int = 120 - # name of the job - job_name: str = "${hydra.job.name}" - # Maximum number of retries on job timeout. 
- # Change this only after you confirmed your code can handle re-submission - # by properly resuming from the latest stored checkpoint. - # check the following for more info on slurm_max_num_timeout - # https://github.com/facebookincubator/submitit/blob/master/docs/checkpointing.md - max_num_timeout: int = 0 - +class BaseParams: + """Configuration shared by all executors + """ -@dataclass -class LocalQueueConf: - # local executor mocks the behavior of slurm locally - - # maximum time for the job in minutes - timeout_min: int = 60 - # number of gpus to use on each node - gpus_per_node: int = 1 - # number of tasks to spawn on each node (only one node available in local executor) - tasks_per_node: int = 1 - - -@dataclass -class AutoQueueConf: - # auto executor automatically identifies and uses available cluster - # Currently this is only slurm, but local executor can be manually forced - # instead. - # Most parameters are shared between clusters, some can be cluster specific - - # cluster to use (currently either "slurm" or "local" are supported, - # None defaults to an available cluster) - cluster: str = "slurm" + submitit_folder: str = "${hydra.sweep.dir}/.submitit/%j" # maximum time for the job in minutes timeout_min: int = 60 @@ -82,47 +28,65 @@ class AutoQueueConf: # name of the job name: str = "${hydra.job.name}" - # following parameters are SLURM specific +@dataclass +class SlurmParams(BaseParams): + """Slurm configuration overrides and specific parameters + """ + + # Params are used to configure sbatch, for more info check: + # https://github.com/facebookincubator/submitit/blob/master/submitit/slurm/slurm.py + + # Following parameters are slurm specific + # More information: https://slurm.schedmd.com/sbatch.html + # + # slurm partition to use on the cluster + partition: Optional[str] = None + comment: Optional[str] = None + constraint: Optional[str] = None + exclude: Optional[str] = None + + # Following parameters are submitit specifics + # + # USR1 signal 
delay before timeout + signal_delay_s: int = 120 # Maximum number of retries on job timeout. # Change this only after you confirmed your code can handle re-submission # by properly resuming from the latest stored checkpoint. # check the following for more info on slurm_max_num_timeout # https://github.com/facebookincubator/submitit/blob/master/docs/checkpointing.md - slurm_max_num_timeout: int = 0 - # USR1 signal delay before timeout for the slurm queue - slurm_signal_delay_s: int = 30 - # slurm partition to use on the cluster - slurm_partition: Optional[str] = None + max_num_timeout: int = 0 @dataclass -class QueueParams: - slurm: SlurmQueueConf = SlurmQueueConf() - local: LocalQueueConf = LocalQueueConf() - auto: AutoQueueConf = AutoQueueConf() +class LocalParams(BaseParams): + pass @dataclass -class SubmititConf: - queue: QueueType = QueueType.slurm +class SlurmConf(ObjectConf): + cls: str = "hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmSubmititLauncher" + params: SlurmParams = SlurmParams() - folder: str = "${hydra.sweep.dir}/.${hydra.launcher.params.queue}" - queue_parameters: QueueParams = QueueParams() +@dataclass +class LocalConf(ObjectConf): + cls: str = "hydra_plugins.hydra_submitit_launcher.submitit_launcher.LocalSubmititLauncher" + params: LocalParams = LocalParams() -@dataclass -class SubmititLauncherConf(ObjectConf): - cls: str = "hydra_plugins.hydra_submitit_launcher.submitit_launcher.SubmititLauncher" - params: SubmititConf = SubmititConf() - # memory to reserve for the job on each node, in GB - mem_limit: int = 2 +# finally, register two different choices: +ConfigStore.instance().store( + group="hydra/launcher", + name="submitit_local", + node=LocalConf, + provider="submitit_launcher", +) ConfigStore.instance().store( group="hydra/launcher", - name="submitit", - node=SubmititLauncherConf, + name="submitit_slurm", + node=SlurmConf, provider="submitit_launcher", ) diff --git 
a/plugins/hydra_submitit_launcher/hydra_plugins/hydra_submitit_launcher/submitit_launcher.py b/plugins/hydra_submitit_launcher/hydra_plugins/hydra_submitit_launcher/submitit_launcher.py index a79e6444812..8fdb5d6ac33 100644 --- a/plugins/hydra_submitit_launcher/hydra_plugins/hydra_submitit_launcher/submitit_launcher.py +++ b/plugins/hydra_submitit_launcher/hydra_plugins/hydra_submitit_launcher/submitit_launcher.py @@ -1,8 +1,9 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import dataclasses import logging import os from pathlib import Path -from typing import Dict, List, Optional, Sequence +from typing import Any, Dict, List, Optional, Sequence from hydra import TaskFunction from hydra.core.config_loader import ConfigLoader @@ -19,7 +20,7 @@ from hydra.plugins.search_path_plugin import SearchPathPlugin from omegaconf import DictConfig, OmegaConf, open_dict -from hydra_plugins.hydra_submitit_launcher.config import QueueType +from .config import BaseParams, LocalParams, SlurmParams log = logging.getLogger(__name__) @@ -32,11 +33,15 @@ def manipulate_search_path(self, search_path: ConfigSearchPath) -> None: ) -class SubmititLauncher(Launcher): - def __init__(self, queue: str, folder: str, queue_parameters: DictConfig) -> None: - self.queue = queue - self.queue_parameters = queue_parameters - self.folder = folder +class BaseSubmititLauncher(Launcher): + + _EXECUTOR = "abstract" + + def __init__(self, **params: Any) -> None: + param_classes = {"local": LocalParams, "slurm": SlurmParams} + if self._EXECUTOR not in param_classes: + raise RuntimeError(f'Non-implemented "{self._EXECUTOR}" executor') + self.params = OmegaConf.structured(param_classes[self._EXECUTOR](**params)) self.config: Optional[DictConfig] = None self.config_loader: Optional[ConfigLoader] = None self.task_function: Optional[TaskFunction] = None @@ -66,12 +71,12 @@ def __call__( sweep_config = self.config_loader.load_sweep_config( self.config, sweep_overrides ) + # lazy import 
to ensure plugin discovery remains fast + import submitit + with open_dict(sweep_config.hydra.job) as job: # Populate new job variables - if "SLURM_JOB_ID" in os.environ: - job.id = os.environ["SLURM_JOB_ID"] - else: - job.id = job_id + job.id = submitit.JobEnvironment().job_id sweep_config.hydra.job.num = job_num return run_job( @@ -96,36 +101,33 @@ def launch( num_jobs = len(job_overrides) assert num_jobs > 0 + params = self.params - # make sure you don't change inplace - queue_parameters = self.queue_parameters.copy() - OmegaConf.set_struct(queue_parameters, True) - executors = { - QueueType.auto: submitit.AutoExecutor, - QueueType.slurm: submitit.SlurmExecutor, - QueueType.local: submitit.LocalExecutor, - } - init_parameters = {"cluster", "max_num_timeout", "slurm_max_num_timeout"} - executor = executors[self.queue]( - folder=self.folder, - **{ - x: y - for x, y in queue_parameters[self.queue.value].items() - if x in init_parameters - }, - ) - executor.update_parameters( + # build executor + init_params = {"folder": params.submitit_folder} + specific_init_keys = {"max_num_timeout"} + init_params.update( **{ - x: y - for x, y in queue_parameters[self.queue.value].items() - if x not in init_parameters + f"{self._EXECUTOR}_{x}": y + for x, y in params.items() + if x in specific_init_keys } ) + init_keys = specific_init_keys | {"submitit_folder"} + executor = submitit.AutoExecutor(cluster=self._EXECUTOR, **init_params) + + # specify resources/parameters + baseparams = set(dataclasses.asdict(BaseParams()).keys()) + params = { + x if x in baseparams else f"{self._EXECUTOR}_{x}": y + for x, y in params.items() + if x not in init_keys + } + executor.update_parameters(**params) log.info( - "Submitit '{}' sweep output dir : {}".format( - self.queue.value, self.config.hydra.sweep.dir - ) + f"Submitit '{self._EXECUTOR}' sweep output dir : " + f"{self.config.hydra.sweep.dir}" ) sweep_dir = Path(str(self.config.hydra.sweep.dir)) sweep_dir.mkdir(parents=True, exist_ok=True) 
@@ -151,3 +153,11 @@ def launch( jobs = executor.map_array(self, *zip(*params)) return [j.results()[0] for j in jobs] + + +class LocalSubmititLauncher(BaseSubmititLauncher): + _EXECUTOR = "local" + + +class SlurmSubmititLauncher(BaseSubmititLauncher): + _EXECUTOR = "slurm" diff --git a/plugins/hydra_submitit_launcher/tests/test_submitit_launcher.py b/plugins/hydra_submitit_launcher/tests/test_submitit_launcher.py index c640b660400..09a77ee4c48 100644 --- a/plugins/hydra_submitit_launcher/tests/test_submitit_launcher.py +++ b/plugins/hydra_submitit_launcher/tests/test_submitit_launcher.py @@ -1,4 +1,6 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from typing import Type + import pytest # type: ignore from hydra.core.plugins import Plugins from hydra.plugins.launcher import Launcher @@ -8,20 +10,23 @@ ) from hydra.test_utils.test_utils import chdir_plugin_root -from hydra_plugins.hydra_submitit_launcher.submitit_launcher import SubmititLauncher +from hydra_plugins.hydra_submitit_launcher import submitit_launcher chdir_plugin_root() -def test_discovery(): +@pytest.mark.parametrize( + "cls", + [submitit_launcher.LocalSubmititLauncher, submitit_launcher.SlurmSubmititLauncher], +) +def test_discovery(cls: Type[Launcher]) -> None: # Tests that this plugin can be discovered via the plugins subsystem when looking for Launchers - assert SubmititLauncher.__name__ in [ - x.__name__ for x in Plugins.instance().discover(Launcher) - ] + assert cls.__name__ in [x.__name__ for x in Plugins.instance().discover(Launcher)] @pytest.mark.parametrize( - "launcher_name, overrides", [("submitit", ["hydra.launcher.params.queue=local"])] + "launcher_name, overrides", + [("submitit_local", ["hydra.launcher.params.timeout_min=2"])], ) class TestSubmititLauncher(LauncherTestSuite): pass @@ -33,51 +38,12 @@ class TestSubmititLauncher(LauncherTestSuite): ( { "defaults": [ - {"hydra/launcher": "submitit"}, - {"hydra/hydra_logging": "hydra_debug"}, - 
{"hydra/job_logging": "disabled"}, - ], - "hydra": { - "launcher": { - "params": { - "queue": "local", - "folder": "${hydra.sweep.dir}/.${hydra.launcher.params.queue}", - "queue_parameters": { - "local": { - "gpus_per_node": 1, - "tasks_per_node": 1, - "timeout_min": 1, - } - }, - }, - } - }, - }, - ["-m"], - ), - # auto queue - ( - { - "defaults": [ - {"hydra/launcher": "submitit"}, + {"hydra/launcher": "submitit_local"}, {"hydra/hydra_logging": "hydra_debug"}, {"hydra/job_logging": "disabled"}, ], "hydra": { - "launcher": { - "params": { - "queue": "auto", - "folder": "${hydra.sweep.dir}/.${hydra.launcher.params.queue}", - "queue_parameters": { - "auto": { - "cluster": "local", - "gpus_per_node": 0, - "tasks_per_node": 1, - "timeout_min": 1, - } - }, - }, - } + "launcher": {"params": {"gpus_per_node": 0, "timeout_min": 1}}, }, }, ["-m"], diff --git a/website/docs/plugins/submitit_launcher.md b/website/docs/plugins/submitit_launcher.md index e9c55f6d06d..fe4878da6f3 100644 --- a/website/docs/plugins/submitit_launcher.md +++ b/website/docs/plugins/submitit_launcher.md @@ -12,135 +12,78 @@ sidebar_label: Submitit Launcher plugin The Submitit Launcher plugin provides a [SLURM ](https://slurm.schedmd.com/documentation.html) Launcher based on [Submitit](https://github.com/facebookincubator/submitit). + ### Installation This plugin requires Hydra 1.0 (Release candidate) ```commandline $ pip install hydra-submitit-launcher --pre ``` + ### Usage -Once installed, add `hydra/launcher=submitit` to your command line. Alternatively, override `hydra/launcher` in your config: +Once installed, add `hydra/launcher=submitit_slurm` to your command line. Alternatively, override `hydra/launcher` in your config: ```yaml defaults: - - hydra/launcher: submitit + - hydra/launcher: submitit_slurm ``` Note that this plugin expects a valid environment in the target host. usually this means a shared file system between the launching host and the target host. 
-Submitit supports 3 types of queues: auto, local and slurm. Its config looks like this -```python -class QueueType(Enum): - auto = "auto" - local = "local" - slurm = "slurm" - - -@dataclass -class SlurmQueueConf: - # Params are used to configure sbatch, for more info check: - # https://github.com/facebookincubator/submitit/blob/master/submitit/slurm/slurm.py - - # maximum time for the job in minutes - time: int = 60 - # number of cpus to use for each task - cpus_per_task: int = 10 - # number of gpus to use on each node - gpus_per_node: int = 1 - # number of tasks to spawn on each node - ntasks_per_node: int = 1 - # number of nodes to use for the job - nodes: int = 1 - # memory to reserve for the job on each node, in GB - mem: str = "${hydra.launcher.mem_limit}GB" - # slurm partition to use on the cluster - partition: Optional[str] = None - # USR1 signal delay before timeout - signal_delay_s: int = 120 - # name of the job - job_name: str = "${hydra.job.name}" - # Maximum number of retries on job timeout. - # Change this only after you confirmed your code can handle re-submission - # by properly resuming from the latest stored checkpoint. - # check the following for more info on slurm_max_num_timeout - # https://github.com/facebookincubator/submitit/blob/master/docs/checkpointing.md - max_num_timeout: int = 0 - - -@dataclass -class LocalQueueConf: - # local executor mocks the behavior of slurm locally - - # maximum time for the job in minutes - timeout_min: int = 60 - # number of gpus to use on each node - gpus_per_node: int = 1 - # number of tasks to spawn on each node (only one node available in local executor) - tasks_per_node: int = 1 - - -@dataclass -class AutoQueueConf: - # auto executor automatically identifies and uses available cluster - # Currently this is only slurm, but local executor can be manually forced - # instead. 
- # Most parameters are shared between clusters, some can be cluster specific - - # cluster to use (currently either "slurm" or "local" are supported, - # None defaults to an available cluster) - cluster: Optional[str] = None - - # maximum time for the job in minutes - timeout_min: int = 60 - # number of cpus to use for each task - cpus_per_task: int = 1 - # number of gpus to use on each node - gpus_per_node: int = 0 - # number of tasks to spawn on each node - tasks_per_node: int = 1 - # memory to reserve for the job on each node (in GB) - mem_gb: int = 4 - # number of nodes to use for the job - nodes: int = 1 - # name of the job - name: str = "${hydra.job.name}" - - # following parameters are SLURM specific - - # Maximum number of retries on job timeout. - # Change this only after you confirmed your code can handle re-submission - # by properly resuming from the latest stored checkpoint. - # check the following for more info on slurm_max_num_timeout - # https://github.com/facebookincubator/submitit/blob/master/docs/checkpointing.md - slurm_max_num_timeout: int = 0 - # USR1 signal delay before timeout for the slurm queue - slurm_signal_delay_s: int = 30 - # slurm partition to use on the cluster - slurm_partition: Optional[str] = None - - -@dataclass -class QueueParams: - slurm: SlurmQueueConf = SlurmQueueConf() - local: LocalQueueConf = LocalQueueConf() - auto: AutoQueueConf = AutoQueueConf() - - -@dataclass -class SubmititConf: - queue: QueueType = QueueType.local - - folder: str = "${hydra.sweep.dir}/.${hydra.launcher.params.queue}" - - queue_parameters: QueueParams = QueueParams() +Submitit actually implements 2 different launchers: `submitit_slurm` to run on a SLURM cluster, and `submitit_local` for basic local tests. 
+ +You can discover the slurm launcher parameters with: +```text +$ python my_app.py hydra/launcher=submitit_slurm --cfg hydra -p hydra.launcher +# @package hydra.launcher +cls: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmSubmititLauncher +params: + submitit_folder: ${hydra.sweep.dir}/.submitit/%j + timeout_min: 60 + cpus_per_task: 1 + gpus_per_node: 0 + tasks_per_node: 1 + mem_gb: 4 + nodes: 1 + name: ${hydra.job.name} + partition: null + comment: null + constraint: null + exclude: null + signal_delay_s: 120 + max_num_timeout: 0 ``` -See [Submitit documentation](https://github.com/facebookincubator/submitit) for full details about the parameters above. +Similarly, you can discover the local launcher parameters with: +```text +$ python my_app.py hydra/launcher=submitit_local --cfg hydra -p hydra.launcher +# @package hydra.launcher +cls: hydra_plugins.hydra_submitit_launcher.submitit_launcher.LocalSubmititLauncher +params: + submitit_folder: ${hydra.sweep.dir}/.submitit/%j + timeout_min: 60 + cpus_per_task: 1 + gpus_per_node: 0 + tasks_per_node: 1 + mem_gb: 4 + nodes: 1 + name: ${hydra.job.name} +``` + +You can set all these parameters in your configuration file and/or override them on the command line: +```text +$ python my_app.py hydra/launcher=submitit_slurm hydra.launcher.params.timeout_min=3 +``` + +For more details, including descriptions for each parameter, check out the [config file](https://github.com/facebookresearch/hydra/blob/master/plugins/hydra_submitit_launcher/hydra_plugins/hydra_submitit_launcher/config.py). You can also check the [Submitit documentation](https://github.com/facebookincubator/submitit). + + +### Example An [example application](https://github.com/facebookresearch/hydra/tree/master/plugins/hydra_submitit_launcher/example) using this launcher is provided in the plugin repository. 
-Starting the app with `python my_app.py task=1,2,3 -m` will launch 3 executions: +Starting the app with `python my_app.py task=1,2,3 -m` will launch 3 executions (you can override the launcher to run locally for testing by adding `hydra/launcher=submitit_local`): ```text $ python my_app.py task=1,2,3 -m @@ -161,7 +104,6 @@ $ tree │   └── my_app.log └── multirun.yaml - $ cat 0/my_app.log [2020-05-28 15:05:23,511][__main__][INFO] - Process ID 15887 executing task 1 ... [2020-05-28 15:05:24,514][submitit][INFO] - Job completed successfully