diff --git a/src/aiidalab_qe/app/submission/__init__.py b/src/aiidalab_qe/app/submission/__init__.py index c62011272..f58559172 100644 --- a/src/aiidalab_qe/app/submission/__init__.py +++ b/src/aiidalab_qe/app/submission/__init__.py @@ -34,13 +34,17 @@ def __init__(self, model: SubmissionModel, qe_auto_setup=True, **kwargs): self._model = model self._model.observe( - self._on_process_change, - "process", + self._on_input_structure_change, + "input_structure", ) self._model.observe( self._on_input_parameters_change, "input_parameters", ) + self._model.observe( + self._on_process_change, + "process", + ) self._model.observe( self._on_submission_blockers_change, [ @@ -73,26 +77,6 @@ def __init__(self, model: SubmissionModel, qe_auto_setup=True, **kwargs): self.qe_auto_setup = qe_auto_setup - self._ALERT_MESSAGE = """ -
- - × - - - × - - {message} -
- """ - plugin_codes: PluginCodes = get_entry_items("aiidalab_qe.properties", "code") plugin_codes.update( { @@ -197,6 +181,12 @@ def render(self): (self.submission_blocker_messages, "value"), ) + self.submission_warning_messages = ipw.HTML() + ipw.dlink( + (self._model, "submission_warning_messages"), + (self.submission_warning_messages, "value"), + ) + self.children = [ ipw.HTML("""
@@ -215,6 +205,7 @@ def render(self): self.sssp_installation, self.qe_setup, self.submission_blocker_messages, + self.submission_warning_messages, ipw.HTML("""

Labeling Your Job

@@ -233,10 +224,20 @@ def render(self): self.rendered = True + # Render and set up default PW code pw_code = self._model.get_code("dft", "pw") pw_code.activate() + pw_code_widget = pw_code.get_setup_widget() + pw_code_widget.num_cpus.observe( + self._on_pw_code_resource_change, + "value", + ) + pw_code_widget.num_nodes.observe( + self._on_pw_code_resource_change, + "value", + ) - # Render any active codes starting with pw + # Render any other active codes self._toggle_code(pw_code) for _, code in self._model.get_codes(flat=True): if code is not pw_code and code.is_active: @@ -251,6 +252,14 @@ def reset(self): def _on_previous_step_state_change(self, _): self._update_state() + def _on_input_structure_change(self, _): + self._model.check_resources() + + def _on_input_parameters_change(self, _): + self._model.update_active_codes() + self._model.update_process_label() + self._model.update_submission_blockers() + def _on_process_change(self, _): with self.hold_trait_notifications(): # TODO why here? Do we not populate traits earlier that would cover this? @@ -258,11 +267,6 @@ def _on_process_change(self, _): self._model.input_structure = self._model.process.inputs.structure self._update_state() - def _on_input_parameters_change(self, _): - self._model.update_active_codes() - self._model.update_process_label() - self._model.update_submission_blockers() - def _on_submission_blockers_change(self, _): self._model.update_submission_blocker_message() self._update_state() @@ -282,6 +286,9 @@ def _on_code_activation_change(self, change): def _on_code_selection_change(self, _): self._model.update_submission_blockers() + def _on_pw_code_resource_change(self, _): + self._model.check_resources() + def _on_submission(self, _): self._model.submit() self._update_state() diff --git a/src/aiidalab_qe/app/submission/model.py b/src/aiidalab_qe/app/submission/model.py index 7dc775ac4..51baf0572 100644 --- a/src/aiidalab_qe/app/submission/model.py +++ b/src/aiidalab_qe/app/submission/model.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os import typing as t from copy import deepcopy @@ -35,6 +36,7 @@ class SubmissionModel(tl.HasTraits): process_description = tl.Unicode("") submission_blocker_messages = tl.Unicode("") + submission_warning_messages = tl.Unicode("") installing_qe = tl.Bool(False) installing_sssp = tl.Bool(False) @@ -55,6 +57,21 @@ class SubmissionModel(tl.HasTraits): code_widgets: dict[str, QEAppComputationalResourcesWidget] = {} + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self._RUN_ON_LOCALHOST_NUM_SITES_WARN_THRESHOLD = 10 + self._RUN_ON_LOCALHOST_VOLUME_WARN_THRESHOLD = 1000 # \AA^3 + + self._ALERT_MESSAGE = """ +
+ + × + + {message} +
+ """ + @property def is_blocked(self): return any( @@ -84,6 +101,89 @@ def submit(self): ) self.process = process + def check_resources(self): + pw_code_model = self.get_code("dft", "pw") + + if not self.input_structure or not pw_code_model.selected: + return # No code selected or no structure, so nothing to do + + pw_code = pw_code_model.get_setup_widget() + num_cpus = pw_code.num_cpus.value * pw_code.num_nodes.value + on_localhost = orm.load_node(pw_code.value).computer.hostname == "localhost" + num_sites = len(self.input_structure.sites) + volume = self.input_structure.get_cell_volume() + + try: + localhost_cpus = len(os.sched_getaffinity(0)) + except Exception: + # Fallback, in some OS os.sched_getaffinity(0) is not supported + # However, not so reliable in containers + localhost_cpus = os.cpu_count() + + large_system = ( + num_sites > self._RUN_ON_LOCALHOST_NUM_SITES_WARN_THRESHOLD + or volume > self._RUN_ON_LOCALHOST_VOLUME_WARN_THRESHOLD + ) + + # Estimated number of CPUs for a run less than 12 hours. + estimated_CPUs = self._estimate_min_cpus(num_sites, volume) + + # List of possible suggestions for warnings: + suggestions = { + "more_resources": f"
  • Increase the resources (total number of CPUs should be equal or more than {min(100,estimated_CPUs)}, if possible)
  • ", + "change_configuration": "
  • Review the configuration (e.g. choosing fast protocol - this will affect precision)
  • ", + "go_remote": "
  • Select a code that runs on a larger machine
  • ", + "avoid_overloading": "
  • Reduce the number of CPUs to avoid the overloading of the local machine
  • ", + } + + alert_message = "" + if large_system and estimated_CPUs > num_cpus: + # This part is in common between Warnings 1 (2): + # (not) on localhost, big system and few cpus + warnings_1_2 = ( + f" Warning: The selected structure is large, with {num_sites} atoms " + f"and a volume of {int(volume)} Å3, " + "making it computationally demanding " + "to run at the localhost. Consider the following: " + if on_localhost + else "to run in a reasonable amount of time. Consider the following: " + ) + # Warning 1: on localhost, big system and few cpus + alert_message += ( + f"{warnings_1_2}" + if on_localhost + else f"{warnings_1_2}" + ) + if on_localhost and num_cpus / localhost_cpus > 0.8: + # Warning-3: on localhost, more than half of the available cpus + alert_message += ( + " Warning: the selected pw.x code will run locally, but " + f"the number of requested CPUs ({num_cpus}) is larger than the 80% of the available resources ({localhost_cpus}). " + "Please be sure that your local " + "environment has enough free CPUs for the calculation. Consider the following: " + "" + ) + + self.submission_warning_messages = ( + "" + if (on_localhost and num_cpus / localhost_cpus) <= 0.8 + and (not large_system or estimated_CPUs <= num_cpus) + else self._ALERT_MESSAGE.format( + alert_class="warning", + message=alert_message, + ) + ) + def update_active_codes(self): for name, code in self.get_codes(flat=True): if name != "pw": @@ -278,3 +378,49 @@ def _check_submission_blockers(self): yield ( f"Error: hi, plugin developer, please use the QEAppComputationalResourcesWidget from aiidalab_qe.common.widgets for code {name}." ) + + def _estimate_min_cpus( + self, + n, + v, + n0=9, + v0=117, + num_cpus0=4, + t0=129.6, + tmax=12 * 60 * 60, + scf_cycles=5, + ): + """Estimate the minimum number of CPUs required to + complete a task within a given time limit. + + Parameters + ---------- + `n` : `int` + The number of atoms in the system. + `v` : `float` + The volume of the system. + `n0` : `int`, optional + Reference number of atoms. Default is 9. + `v0` : `float`, optional + Reference volume. Default is 117. + `num_cpus0` : `int`, optional + Reference number of CPUs. Default is 4. + `t0` : `float`, optional + Reference time. Default is 129.6. + `tmax` : `float`, optional + Maximum time limit. Default is 12 hours. + `scf_cycles` : `int`, optional + Reference number of SCF cycles in a relaxation. Default is 5. + + Returns + ------- + `int` + The estimated minimum number of CPUs required. + """ + import numpy as np + + return int( + np.ceil( + scf_cycles * num_cpus0 * (n / n0) ** 3 * (v / v0) ** 1.5 * t0 / tmax + ) + ) diff --git a/tests/conftest.py b/tests/conftest.py index 64ac89d0b..851c96d46 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -117,6 +117,13 @@ def _generate_structure_data(name="silicon", pbc=(True, True, True)): structure.append_atom(position=(0.0, 0.0, 1.0), symbols="O") structure.append_atom(position=(0.0, 1.0, 0.0), symbols="H") + elif name == "H2O-larger": + cell = [[20.0, 0.0, 0.0], [0.0, 20.0, 0.0], [0.0, 0.0, 20.0]] + structure = orm.StructureData(cell=cell) + structure.append_atom(position=(0.0, 0.0, 0.0), symbols="H") + structure.append_atom(position=(0.0, 0.0, 1.0), symbols="O") + structure.append_atom(position=(0.0, 1.0, 0.0), symbols="H") + structure.pbc = pbc return structure diff --git a/tests/test_submit_qe_workchain.py b/tests/test_submit_qe_workchain.py index 1a5955691..cfa6afe57 100644 --- a/tests/test_submit_qe_workchain.py +++ b/tests/test_submit_qe_workchain.py @@ -129,6 +129,55 @@ def test_create_builder_advanced_settings( ) +@pytest.mark.usefixtures("aiida_profile_clean", "sssp") +def test_warning_messages( + generate_structure_data, + submit_app_generator, +): + """Test the creation of the warning messages. + + For now, we test that the suggestions are indeed there. + We should check the whole message, but this is for now not easy to do: the message is built + on the fly with variables which are not accessible in this namespace. + """ + import os + + suggestions = { + "more_resources": "Increase the resources", + "change_configuration": "Review the configuration", + "go_remote": "Select a code that runs on a larger machine", + "avoid_overloading": "Reduce the number of CPUs to avoid the overloading of the local machine", + } + + app: App = submit_app_generator(properties=["bands", "pdos"]) + submit_step = app.submit_step + submit_model = app.submit_model + + pw_code = submit_model.get_code("dft", "pw").get_setup_widget() + pw_code.num_cpus.value = 1 + submit_model.check_resources() + # no warning: + assert submit_step.submission_warning_messages.value == "" + + # now we increase the resources, so we should have the Warning-3 + pw_code.num_cpus.value = len(os.sched_getaffinity(0)) + submit_model.check_resources() + for suggestion in ["avoid_overloading", "go_remote"]: + assert suggestions[suggestion] in submit_step.submission_warning_messages.value + + # now we use a large structure, so we should have the Warning-1 (and 2 if not on localhost) + structure = generate_structure_data("H2O-larger") + submit_model.input_structure = structure + pw_code.num_cpus.value = 1 + submit_model.check_resources() + num_sites = len(structure.sites) + volume = structure.get_cell_volume() + estimated_CPUs = submit_model._estimate_min_cpus(num_sites, volume) + assert estimated_CPUs == 2 + for suggestion in ["more_resources", "change_configuration"]: + assert suggestions[suggestion] in submit_step.submission_warning_messages.value + + def builder_to_readable_dict(builder): """transverse the builder and return a dictionary with readable values.""" from aiida import orm