From 0ca8b0b178568e3f9a4e4ccafc241689bf0dc6c8 Mon Sep 17 00:00:00 2001
From: "Yngve S. Kristiansen"
Date: Fri, 18 Oct 2024 11:51:34 +0200
Subject: [PATCH] Add torque support to Everest

---
 docs/everest/config_generated.rst        | 68 +++++++++++++++++-
 src/everest/config/simulator_config.py   | 39 ++++++++++-
 src/everest/config_keys.py               | 10 +++
 src/everest/detached/__init__.py         |  5 +-
 src/everest/queue_driver/queue_driver.py | 20 +++++-
 tests/everest/test_config_validation.py  |  6 +-
 .../everest/model/snake_oil_torque.yml    | 69 +++++++++++++++++++
 tests/everest/test_res_initialization.py | 44 +++++++++++-
 8 files changed, 253 insertions(+), 8 deletions(-)
 create mode 100644 tests/everest/test_data/snake_oil/everest/model/snake_oil_torque.yml

diff --git a/docs/everest/config_generated.rst b/docs/everest/config_generated.rst
index 3b492e6cd07..004c6e80ead 100644
--- a/docs/everest/config_generated.rst
+++ b/docs/everest/config_generated.rst
@@ -956,7 +956,7 @@ Simulation settings
 
 **queue_system (optional)**
-    Type: *Optional[Literal['lsf', 'local', 'slurm']]*
+    Type: *Optional[Literal['lsf', 'local', 'slurm', 'torque']]*
 
     Defines which queue system the everest server runs on.
 
@@ -1031,6 +1031,72 @@ Simulation settings
     optimizer.
 
 
+**qsub_cmd (optional)**
+    Type: *Optional[str]*
+
+    The submit command
+
+
+**qstat_cmd (optional)**
+    Type: *Optional[str]*
+
+    The query command
+
+
+**qdel_cmd (optional)**
+    Type: *Optional[str]*
+
+    The kill command
+
+
+**qstat_options (optional)**
+    Type: *Optional[str]*
+
+    Options to be supplied to the qstat command. This defaults to -x, which tells the qstat command to include exited processes.
+
+
+**cluster_label (optional)**
+    Type: *Optional[str]*
+
+    The name of the cluster you are running simulations in.
+
+
+**memory_per_job (optional)**
+    Type: *Optional[str]*
+
+    You can specify the amount of memory you will need for running your job. This will ensure that not too many jobs will run on a single shared memory node at once, possibly crashing the compute node if it runs out of memory.
+    You can get an indication of the memory requirement by watching the course of a local run using the htop utility. Whether you should set the peak memory usage as your requirement or a lower figure depends on how many jobs will run simultaneously.
+    The value will be passed as a string in the qsub argument. You must specify the unit, either gb or mb.
+
+
+
+**keep_qsub_output (optional)**
+    Type: *Optional[int]*
+
+    Set to 1 to keep error messages from qsub. Usually only to be used if something is seriously wrong with the queue environment/setup.
+
+
+**submit_sleep (optional)**
+    Type: *Optional[float]*
+
+    To avoid stressing the TORQUE/PBS system you can instruct the driver to sleep between submit requests. The value is the number of seconds to sleep after each submit, and can be a fraction such as 0.5.
+
+
+**queue_query_timeout (optional)**
+    Type: *Optional[int]*
+
+
+    The driver allows the backend TORQUE/PBS system to be flaky, i.e. it may intermittently not respond and give error messages when submitting jobs or asking for job statuses. The timeout (in seconds) determines how long ERT will wait before giving up. It applies to job submission (qsub) and job status queries (qstat). Default is 126 seconds.
+    ERT will do exponential sleeps, starting at 2 seconds, and the provided timeout is a maximum. Choose the timeout as a sum of the series 2+4+8+16+... to be explicit about the number of retries: zero disallows flakiness, 2 allows one re-attempt, 6 (2+4) allows two re-attempts, and the default of 126 (2+4+8+16+32+64) allows six re-attempts.
+
+
+
+**project_code (optional)**
+    Type: *Optional[str]*
+
+    String identifier used to map hardware resource usage to a project or account. The project or account does not have to exist.
+
+
 install_jobs (optional)
 -----------------------
 
diff --git a/src/everest/config/simulator_config.py b/src/everest/config/simulator_config.py
index 9ac570cee98..aac0d2b65df 100644
--- a/src/everest/config/simulator_config.py
+++ b/src/everest/config/simulator_config.py
@@ -65,7 +65,7 @@ class SimulatorConfig(BaseModel, HasErtQueueOptions, extra="forbid"):  # type: i
         Examples to set memory requirement is:
         * rusage[mem=1000]""",
     )
-    queue_system: Optional[Literal["lsf", "local", "slurm"]] = Field(
+    queue_system: Optional[Literal["lsf", "local", "slurm", "torque"]] = Field(
         default="local",
         description="Defines which queue system the everest server runs on.",
     )
@@ -118,3 +118,40 @@ class SimulatorConfig(BaseModel, HasErtQueueOptions, extra="forbid"):  # type: i
         the most common use of a standard optimization with a
         continuous optimizer.""",
     )
+    qsub_cmd: Optional[str] = Field(default="qsub", description="The submit command")
+    qstat_cmd: Optional[str] = Field(default="qstat", description="The query command")
+    qdel_cmd: Optional[str] = Field(default="qdel", description="The kill command")
+    qstat_options: Optional[str] = Field(
+        default="-x",
+        description="Options to be supplied to the qstat command. This defaults to -x, which tells the qstat command to include exited processes.",
+    )
+    cluster_label: Optional[str] = Field(
+        default=None,
+        description="The name of the cluster you are running simulations in.",
+    )
+    memory_per_job: Optional[str] = Field(
+        default=None,
+        description="""You can specify the amount of memory you will need for running your job. This will ensure that not too many jobs will run on a single shared memory node at once, possibly crashing the compute node if it runs out of memory.
+        You can get an indication of the memory requirement by watching the course of a local run using the htop utility. Whether you should set the peak memory usage as your requirement or a lower figure depends on how many jobs will run simultaneously.
+        The value will be passed as a string in the qsub argument. You must specify the unit, either gb or mb.
+        """,
+    )
+    keep_qsub_output: Optional[int] = Field(
+        default=0,
+        description="Set to 1 to keep error messages from qsub. Usually only to be used if something is seriously wrong with the queue environment/setup.",
+    )
+    submit_sleep: Optional[float] = Field(
+        default=0.5,
+        description="To avoid stressing the TORQUE/PBS system you can instruct the driver to sleep between submit requests. The value is the number of seconds to sleep after each submit, and can be a fraction such as 0.5.",
+    )
+    queue_query_timeout: Optional[int] = Field(
+        default=126,
+        description="""
+        The driver allows the backend TORQUE/PBS system to be flaky, i.e. it may intermittently not respond and give error messages when submitting jobs or asking for job statuses. The timeout (in seconds) determines how long ERT will wait before giving up. It applies to job submission (qsub) and job status queries (qstat). Default is 126 seconds.
+        ERT will do exponential sleeps, starting at 2 seconds, and the provided timeout is a maximum. Choose the timeout as a sum of the series 2+4+8+16+... to be explicit about the number of retries: zero disallows flakiness, 2 allows one re-attempt, 6 (2+4) allows two re-attempts, and the default of 126 (2+4+8+16+32+64) allows six re-attempts.
+        """,
+    )
+    project_code: Optional[str] = Field(
+        default=None,
+        description="String identifier used to map hardware resource usage to a project or account. The project or account does not have to exist.",
+    )
diff --git a/src/everest/config_keys.py b/src/everest/config_keys.py
index dc713a32524..646468b6329 100644
--- a/src/everest/config_keys.py
+++ b/src/everest/config_keys.py
@@ -120,6 +120,16 @@ class ConfigKeys:
     TEMPLATE = "template"
     TIME_CORR = "time_correlation"
     TIMES_LIST = "times_list"
+    TORQUE = "torque"
+    TORQUE_QSUB_CMD = "qsub_cmd"
+    TORQUE_QSTAT_CMD = "qstat_cmd"
+    TORQUE_QDEL_CMD = "qdel_cmd"
+    TORQUE_QUEUE_NAME = "name"
+    TORQUE_CLUSTER_LABEL = "cluster_label"
+    TORQUE_MEMORY_PER_JOB = "memory_per_job"
+    TORQUE_KEEP_QSUB_OUTPUT = "keep_qsub_output"
+    TORQUE_SUBMIT_SLEEP = "submit_sleep"
+    TORQUE_PROJECT_CODE = "project_code"
     TYPE = "type"
     UPPER_BOUND = "upper_bound"
     USER_DEFINED_TYPE = "user_defined_type"
diff --git a/src/everest/detached/__init__.py b/src/everest/detached/__init__.py
index 1a446416461..024745a7606 100644
--- a/src/everest/detached/__init__.py
+++ b/src/everest/detached/__init__.py
@@ -385,6 +385,7 @@ def start_monitor(config: EverestConfig, callback, polling_interval=5):
         ],
         "name": "PARTITION",
     },
+    "TORQUE": {"options": [CK.TORQUE_CLUSTER_LABEL, "CLUSTER_LABEL"], "name": "QUEUE"},
 }
 
 
@@ -416,7 +417,7 @@ def _generate_queue_options(
     config: EverestConfig,
     queue_options: List[Tuple[str, str]],
     res_queue_name: str,  # Literal["LSF_QUEUE", "PARTITION"]?
-    queue_system: Literal["LSF", "SLURM"],
+    queue_system: Literal["LSF", "SLURM", "TORQUE"],
 ):
     queue_name_simulator = (
         config.simulator.name if config.simulator is not None else None
@@ -451,7 +452,7 @@ def _generate_queue_options(
 
 
 def _find_res_queue_system(config: EverestConfig):
-    queue_system_simulator: Literal["lsf", "local", "slurm"] = "local"
+    queue_system_simulator: Literal["lsf", "local", "slurm", "torque"] = "local"
     if config.simulator is not None:
         queue_system_simulator = config.simulator.queue_system or queue_system_simulator
diff --git a/src/everest/queue_driver/queue_driver.py b/src/everest/queue_driver/queue_driver.py
index b4454d4f57c..7caf7a06377 100644
--- a/src/everest/queue_driver/queue_driver.py
+++ b/src/everest/queue_driver/queue_driver.py
@@ -27,6 +27,20 @@
     (ConfigKeys.SLURM_INCLUDE_HOST_OPTION, "INCLUDE_HOST"),
 ]
 
+_TORQUE_OPTIONS = [
+    (ConfigKeys.CORES, "MAX_RUNNING"),
+    (ConfigKeys.TORQUE_QSUB_CMD, "QSUB_CMD"),
+    (ConfigKeys.TORQUE_QSTAT_CMD, "QSTAT_CMD"),
+    (ConfigKeys.TORQUE_QDEL_CMD, "QDEL_CMD"),
+    (ConfigKeys.TORQUE_QUEUE_NAME, "QUEUE"),
+    (ConfigKeys.TORQUE_CLUSTER_LABEL, "CLUSTER_LABEL"),
+    (ConfigKeys.CORES_PER_NODE, "NUM_CPUS_PER_NODE"),
+    (ConfigKeys.TORQUE_MEMORY_PER_JOB, "MEMORY_PER_JOB"),
+    (ConfigKeys.TORQUE_KEEP_QSUB_OUTPUT, "KEEP_QSUB_OUTPUT"),
+    (ConfigKeys.TORQUE_SUBMIT_SLEEP, "SUBMIT_SLEEP"),
+    (ConfigKeys.TORQUE_PROJECT_CODE, "PROJECT_CODE"),
+]
+
 
 def _extract_ert_queue_options_from_simulator_config(
     simulator: Optional[SimulatorConfig], queue_system
@@ -46,6 +60,10 @@ def _extract_ert_queue_options_from_simulator_config(
                 simulator.cores or 8,
             )
         ]
+    elif queue_system == ConfigKeys.TORQUE:
+        return simulator.extract_ert_queue_options(
+            queue_system=QueueSystem.TORQUE, everest_to_ert_key_tuples=_TORQUE_OPTIONS
+        )
elif queue_system == ConfigKeys.SLURM: return simulator.extract_ert_queue_options( queue_system=QueueSystem.SLURM, everest_to_ert_key_tuples=_SLURM_OPTIONS @@ -53,7 +71,7 @@ def _extract_ert_queue_options_from_simulator_config( raise KeyError( f"Invalid queue_system: {queue_system}, " - "expected one of: ['lsf', 'local', 'slurm']" + "expected one of: ['lsf', 'local', 'slurm', 'torque']" ) diff --git a/tests/everest/test_config_validation.py b/tests/everest/test_config_validation.py index bb43657325c..fb514b868c4 100644 --- a/tests/everest/test_config_validation.py +++ b/tests/everest/test_config_validation.py @@ -140,11 +140,13 @@ def test_that_invalid_queue_system_errors(): with pytest.raises(ValueError) as e: EverestConfig.with_defaults(simulator={"queue_system": "docal"}) - assert has_error(e.value, match="Input should be 'lsf', 'local' or 'slurm'") - + assert has_error( + e.value, match="Input should be 'lsf', 'local', 'slurm' or 'torque'" + ) EverestConfig.with_defaults(simulator={"queue_system": "local"}) EverestConfig.with_defaults(simulator={"queue_system": "lsf"}) EverestConfig.with_defaults(simulator={"queue_system": "slurm"}) + EverestConfig.with_defaults(simulator={"queue_system": "torque"}) @pytest.mark.parametrize( diff --git a/tests/everest/test_data/snake_oil/everest/model/snake_oil_torque.yml b/tests/everest/test_data/snake_oil/everest/model/snake_oil_torque.yml new file mode 100644 index 00000000000..0e725f1f6e5 --- /dev/null +++ b/tests/everest/test_data/snake_oil/everest/model/snake_oil_torque.yml @@ -0,0 +1,69 @@ +definitions: + eclbase: eclipse/ECL + +wells: + - {name: W1} + - {name: W2} + - {name: W3} + - {name: W4} + +controls: + - + name: group_0 + type: well_control + min: 0 + max: 1 + variables: + - + name: W1 + initial_guess: 0 + - + name: W2 + initial_guess: 0 + - + name: W3 + initial_guess: 1 + - + name: W4 + initial_guess: 1 + +objective_functions: + - + name: snake_oil_nvp + +install_jobs: + - + name: snake_oil_diff + source: ../../jobs/SNAKE_OIL_DIFF + - + name: snake_oil_simulator + source: ../../jobs/SNAKE_OIL_SIMULATOR + - + name: snake_oil_npv + source: ../../jobs/SNAKE_OIL_NPV + +optimization: + algorithm: optpp_q_newton + +simulator: + max_runtime: 3600 + queue_system: torque + name: permanent_8 + qsub_cmd: qsub + qstat_cmd: qstat + qdel_cmd: qdel + memory_per_job: 100mb + keep_qsub_output: 1 + submit_sleep: 0.5 + project_code: snake_oil_pc + +environment: + simulation_folder: simulations + +model: + realizations: [0] + +forward_model: + - snake_oil_simulator + - snake_oil_npv + - snake_oil_diff diff --git a/tests/everest/test_res_initialization.py b/tests/everest/test_res_initialization.py index 94c9e433ccf..2140e362206 100644 --- a/tests/everest/test_res_initialization.py +++ b/tests/everest/test_res_initialization.py @@ -12,7 +12,10 @@ from everest.config.install_job_config import InstallJobConfig from everest.config.well_config import WellConfig from everest.config.workflow_config import WorkflowConfig -from everest.simulator.everest_to_ert import _everest_to_ert_config_dict +from everest.simulator.everest_to_ert import ( + _everest_to_ert_config_dict, + everest_to_ert_config, +) from everest.util.forward_models import collect_forward_models from tests.everest.utils import ( everest_default_jobs, @@ -264,6 +267,45 @@ def test_snake_everest_to_ert_slurm(copy_test_data_to_tmp): ) +def test_snake_everest_to_ert_torque(copy_test_data_to_tmp): + snake_torque_config_path = os.path.join(SNAKE_CONFIG_DIR, "snake_oil_torque.yml") + + ever_config = 
EverestConfig.load_file(snake_torque_config_path) + ert_config_dict = _everest_to_ert_config_dict(ever_config) + + assert ert_config_dict["QUEUE_SYSTEM"] == "TORQUE" + + expected_queue_option_tuples = { + ("TORQUE", "QSUB_CMD", "qsub"), + ("TORQUE", "QSTAT_CMD", "qstat"), + ("TORQUE", "QDEL_CMD", "qdel"), + ("TORQUE", "QUEUE", "permanent_8"), + ("TORQUE", "MEMORY_PER_JOB", "100mb"), + ("TORQUE", "KEEP_QSUB_OUTPUT", 1), + ("TORQUE", "SUBMIT_SLEEP", 0.5), + ("TORQUE", "PROJECT_CODE", "snake_oil_pc"), + } + + assert set(ert_config_dict["QUEUE_OPTION"]) == expected_queue_option_tuples + + ert_config = everest_to_ert_config(ever_config) + + qc = ert_config.queue_config + qo = qc.queue_options + assert qc.queue_system == "TORQUE" + assert {k: v for k, v in qo.driver_options.items() if v is not None} == { + "project_code": "snake_oil_pc", + "qsub_cmd": "qsub", + "qstat_cmd": "qstat", + "qdel_cmd": "qdel", + "memory_per_job": "100mb", + "num_cpus_per_node": 1, + "num_nodes": 1, + "keep_qsub_output": True, + "queue_name": "permanent_8", + } + + @patch.dict("os.environ", {"USER": "NO_USERNAME"}) def test_tutorial_everest_to_ert(copy_test_data_to_tmp): tutorial_config_path = os.path.join(TUTORIAL_CONFIG_DIR, "mocked_test_case.yml")
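
For reference, the flow exercised by the new test can be sketched in a few lines: a simulator section with queue_system torque is validated by SimulatorConfig and then translated into ERT queue options through the _TORQUE_OPTIONS mapping. The sketch below is illustrative only, assuming the patch is applied, that EverestConfig is importable from everest.config as elsewhere in the test suite, and that EverestConfig.with_defaults accepts a simulator dict as in the validation test above; the option values are made up.

    from everest.config import EverestConfig
    from everest.config_keys import ConfigKeys
    from everest.queue_driver.queue_driver import (
        _extract_ert_queue_options_from_simulator_config,
    )

    # Build a config that uses the new torque queue options; with_defaults
    # fills in the remaining required Everest sections.
    config = EverestConfig.with_defaults(
        simulator={
            "queue_system": "torque",
            "name": "permanent_8",      # ends up as ("TORQUE", "QUEUE", "permanent_8")
            "memory_per_job": "100mb",  # ends up as ("TORQUE", "MEMORY_PER_JOB", "100mb")
            "submit_sleep": 0.5,        # ends up as ("TORQUE", "SUBMIT_SLEEP", 0.5)
        }
    )

    # Translate the Everest simulator options into the ERT queue option tuples
    # that populate QUEUE_OPTION, via the _TORQUE_OPTIONS mapping in queue_driver.py.
    print(
        _extract_ert_queue_options_from_simulator_config(
            config.simulator, ConfigKeys.TORQUE
        )
    )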