Handle aiidalab#840 and aiidalab#862

edan-bainglass · Oct 23, 2024 · 33ae705 · 33ae705
1 parent 4c35f98
commit 33ae705
Show file tree

Hide file tree

Showing 4 changed files with 237 additions and 28 deletions.
diff --git a/src/aiidalab_qe/app/submission/__init__.py b/src/aiidalab_qe/app/submission/__init__.py
@@ -34,13 +34,17 @@ def __init__(self, model: SubmissionModel, qe_auto_setup=True, **kwargs):
 
         self._model = model
         self._model.observe(
-            self._on_process_change,
-            "process",
+            self._on_input_structure_change,
+            "input_structure",
         )
         self._model.observe(
             self._on_input_parameters_change,
             "input_parameters",
         )
+        self._model.observe(
+            self._on_process_change,
+            "process",
+        )
         self._model.observe(
             self._on_submission_blockers_change,
             [
@@ -73,26 +77,6 @@ def __init__(self, model: SubmissionModel, qe_auto_setup=True, **kwargs):
 
         self.qe_auto_setup = qe_auto_setup
 
-        self._ALERT_MESSAGE = """
-            <div class="alert alert-{alert_class} alert-dismissible">
-                <a
-                    href="#"
-                    class="close"
-                    data-dismiss="alert"
-                    aria-label="close"
-                >
-                    &times;
-                </a>
-                <span
-                    class="closebtn"
-                    onclick="this.parentElement.style.display='none';"
-                >
-                    &times;
-                </span>
-                <strong>{message}</strong>
-            </div>
-        """
-
         plugin_codes: PluginCodes = get_entry_items("aiidalab_qe.properties", "code")
         plugin_codes.update(
             {
@@ -197,6 +181,12 @@ def render(self):
             (self.submission_blocker_messages, "value"),
         )
 
+        self.submission_warning_messages = ipw.HTML()
+        ipw.dlink(
+            (self._model, "submission_warning_messages"),
+            (self.submission_warning_messages, "value"),
+        )
+
         self.children = [
             ipw.HTML("""
                 <div style="padding-top: 0px; padding-bottom: 0px">
@@ -215,6 +205,7 @@ def render(self):
             self.sssp_installation,
             self.qe_setup,
             self.submission_blocker_messages,
+            self.submission_warning_messages,
             ipw.HTML("""
                 <div style="padding-top: 0px; padding-bottom: 0px">
                     <h4>Labeling Your Job</h4>
@@ -233,10 +224,20 @@ def render(self):
 
         self.rendered = True
 
+        # Render and set up default PW code
         pw_code = self._model.get_code("dft", "pw")
         pw_code.activate()
+        pw_code_widget = pw_code.get_setup_widget()
+        pw_code_widget.num_cpus.observe(
+            self._on_pw_code_resource_change,
+            "value",
+        )
+        pw_code_widget.num_nodes.observe(
+            self._on_pw_code_resource_change,
+            "value",
+        )
 
-        # Render any active codes starting with pw
+        # Render any other active codes
         self._toggle_code(pw_code)
         for _, code in self._model.get_codes(flat=True):
             if code is not pw_code and code.is_active:
@@ -251,18 +252,21 @@ def reset(self):
     def _on_previous_step_state_change(self, _):
         self._update_state()
 
+    def _on_input_structure_change(self, _):
+        """Re-run the model's resource check when `input_structure` changes."""
+        self._model.check_resources()
+
+    def _on_input_parameters_change(self, _):
+        """Refresh model state derived from `input_parameters` when it changes.
+
+        Updates, in this order, the active codes, the process label and the
+        submission blockers.
+        """
+        self._model.update_active_codes()
+        self._model.update_process_label()
+        self._model.update_submission_blockers()
+
     def _on_process_change(self, _):
         with self.hold_trait_notifications():
             # TODO why here? Do we not populate traits earlier that would cover this?
             if self._model.process is not None:
                 self._model.input_structure = self._model.process.inputs.structure
             self._update_state()
 
-    def _on_input_parameters_change(self, _):
-        self._model.update_active_codes()
-        self._model.update_process_label()
-        self._model.update_submission_blockers()
-
     def _on_submission_blockers_change(self, _):
         self._model.update_submission_blocker_message()
         self._update_state()
@@ -282,6 +286,9 @@ def _on_code_activation_change(self, change):
     def _on_code_selection_change(self, _):
         self._model.update_submission_blockers()
 
+    def _on_pw_code_resource_change(self, _):
+        """Re-run the model's resource check when the pw.x `num_cpus` or `num_nodes` value changes."""
+        self._model.check_resources()
+
     def _on_submission(self, _):
         self._model.submit()
         self._update_state()

diff --git a/src/aiidalab_qe/app/submission/model.py b/src/aiidalab_qe/app/submission/model.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import os
 import typing as t
 from copy import deepcopy
 
@@ -35,6 +36,7 @@ class SubmissionModel(tl.HasTraits):
     process_description = tl.Unicode("")
 
     submission_blocker_messages = tl.Unicode("")
+    submission_warning_messages = tl.Unicode("")
 
     installing_qe = tl.Bool(False)
     installing_sssp = tl.Bool(False)
@@ -55,6 +57,21 @@ class SubmissionModel(tl.HasTraits):
 
     code_widgets: dict[str, QEAppComputationalResourcesWidget] = {}
 
+    def __init__(self, *args, **kwargs):
+        """Initialise the model and the settings used to build resource warnings."""
+        super().__init__(*args, **kwargs)
+
+        # A structure exceeding either threshold is treated as a large system
+        # by `check_resources`.
+        self._RUN_ON_LOCALHOST_NUM_SITES_WARN_THRESHOLD = 10
+        self._RUN_ON_LOCALHOST_VOLUME_WARN_THRESHOLD = 1000  # Å^3
+
+        # HTML template of an alert box with a close link; it is filled with
+        # `.format(alert_class=..., message=...)`.
+        self._ALERT_MESSAGE = """
+            <div class="alert alert-{alert_class} alert-dismissible">
+                <a href="#" class="close" data-dismiss="alert" aria-label="close">
+                    &times;
+                </a>
+                <strong>{message}</strong>
+            </div>
+        """
+
     @property
     def is_blocked(self):
         return any(
@@ -84,6 +101,89 @@ def submit(self):
             )
             self.process = process
 
+    def check_resources(self):
+        """Update `submission_warning_messages` for the current pw.x resources.
+
+        Compares the resources requested on the pw.x code setup widget with
+        the size of the input structure and, when the code runs on localhost,
+        with the number of CPUs usable by this process. Nothing is changed if
+        no input structure is set or no pw.x code is selected; otherwise the
+        trait is set to an HTML warning alert, or to an empty string when no
+        warning applies.
+        """
+        pw_code_model = self.get_code("dft", "pw")
+
+        if not self.input_structure or not pw_code_model.selected:
+            return  # No code selected or no structure, so nothing to do
+
+        pw_code = pw_code_model.get_setup_widget()
+        num_cpus = pw_code.num_cpus.value * pw_code.num_nodes.value
+        on_localhost = orm.load_node(pw_code.value).computer.hostname == "localhost"
+        num_sites = len(self.input_structure.sites)
+        volume = self.input_structure.get_cell_volume()
+
+        try:
+            localhost_cpus = len(os.sched_getaffinity(0))
+        except (AttributeError, OSError):
+            # os.sched_getaffinity is not available on every OS. The fallback
+            # is less reliable in containers and may return None.
+            localhost_cpus = os.cpu_count()
+
+        large_system = (
+            num_sites > self._RUN_ON_LOCALHOST_NUM_SITES_WARN_THRESHOLD
+            or volume > self._RUN_ON_LOCALHOST_VOLUME_WARN_THRESHOLD
+        )
+
+        # Estimated number of CPUs for a run less than 12 hours.
+        estimated_CPUs = self._estimate_min_cpus(num_sites, volume)
+
+        under_resourced = large_system and estimated_CPUs > num_cpus
+        # An unknown CPU count (None) cannot be compared, so the overload
+        # check is skipped in that case instead of failing on the division.
+        overloading_localhost = bool(
+            on_localhost and localhost_cpus and num_cpus / localhost_cpus > 0.8
+        )
+
+        # List of possible suggestions for warnings:
+        suggestions = {
+            "more_resources": f"<li>Increase the resources (total number of CPUs should be equal or more than {min(100, estimated_CPUs)}, if possible) </li>",
+            "change_configuration": "<li>Review the configuration (e.g. choosing <i>fast protocol</i> - this will affect precision) </li>",
+            "go_remote": "<li>Select a code that runs on a larger machine</li>",
+            "avoid_overloading": "<li>Reduce the number of CPUs to avoid the overloading of the local machine </li>",
+        }
+
+        alert_message = ""
+        if under_resourced:
+            # Warnings 1 (2): (not) on localhost, big system and few cpus.
+            # The introduction is shared by both cases; it is built first and
+            # completed afterwards, because a conditional expression applied
+            # to adjacent string literals would select between the *whole*
+            # concatenation and the alternative ending.
+            intro = (
+                f"<span>&#9888;</span> Warning: The selected structure is large, with {num_sites} atoms "
+                f"and a volume of {int(volume)} Å<sup>3</sup>, "
+                "making it computationally demanding "
+            )
+            if on_localhost:
+                intro += "to run at the localhost. Consider the following: "
+                advice = ("more_resources", "change_configuration")
+            else:
+                intro += (
+                    "to run in a reasonable amount of time. Consider the following: "
+                )
+                advice = ("go_remote", "more_resources", "change_configuration")
+            alert_message += (
+                f"{intro}<ul>" + "".join(suggestions[key] for key in advice) + "</ul>"
+            )
+        if overloading_localhost:
+            # Warning-3: on localhost, more than 80% of the available cpus
+            alert_message += (
+                "<span>&#9888;</span> Warning: the selected pw.x code will run locally, but "
+                f"the number of requested CPUs ({num_cpus}) is larger than the 80% of the available resources ({localhost_cpus}). "
+                "Please be sure that your local "
+                "environment has enough free CPUs for the calculation. Consider the following: "
+                "<ul>"
+                + suggestions["avoid_overloading"]
+                + suggestions["go_remote"]
+                + "</ul>"
+            )
+
+        # An empty message means that none of the warnings above applies.
+        self.submission_warning_messages = (
+            self._ALERT_MESSAGE.format(
+                alert_class="warning",
+                message=alert_message,
+            )
+            if alert_message
+            else ""
+        )
+
     def update_active_codes(self):
         for name, code in self.get_codes(flat=True):
             if name != "pw":
@@ -278,3 +378,49 @@ def _check_submission_blockers(self):
                 yield (
                     f"Error: hi, plugin developer, please use the QEAppComputationalResourcesWidget from aiidalab_qe.common.widgets for code {name}."
                 )
+
+    def _estimate_min_cpus(
+        self,
+        n,
+        v,
+        n0=9,
+        v0=117,
+        num_cpus0=4,
+        t0=129.6,
+        tmax=12 * 60 * 60,
+        scf_cycles=5,
+    ):
+        """Return the smallest CPU count expected to finish within `tmax`.
+
+        The reference cost (`scf_cycles` cycles of duration `t0` on
+        `num_cpus0` CPUs, for `n0` atoms in a volume `v0`) is scaled with the
+        cube of the relative number of atoms and with the power 1.5 of the
+        relative volume, divided by the time limit and rounded up.
+
+        Parameters
+        ----------
+        `n` : `int`
+            Number of atoms in the system.
+        `v` : `float`
+            Volume of the system.
+        `n0` : `int`, optional
+            Number of atoms of the reference system. Default is 9.
+        `v0` : `float`, optional
+            Volume of the reference system. Default is 117.
+        `num_cpus0` : `int`, optional
+            Number of CPUs of the reference run. Default is 4.
+        `t0` : `float`, optional
+            Duration of the reference run (presumably in seconds, like
+            `tmax` - to be confirmed). Default is 129.6.
+        `tmax` : `float`, optional
+            Time limit. Default is 12 hours, expressed in seconds.
+        `scf_cycles` : `int`, optional
+            Reference number of SCF cycles in a relaxation. Default is 5.
+
+        Returns
+        -------
+        `int`
+            The estimated minimum number of CPUs.
+        """
+        import numpy as np
+
+        atoms_scaling = (n / n0) ** 3
+        volume_scaling = (v / v0) ** 1.5
+        cpus_needed = (
+            scf_cycles * num_cpus0 * atoms_scaling * volume_scaling * t0 / tmax
+        )
+        return int(np.ceil(cpus_needed))
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -117,6 +117,13 @@ def _generate_structure_data(name="silicon", pbc=(True, True, True)):
             structure.append_atom(position=(0.0, 0.0, 1.0), symbols="O")
             structure.append_atom(position=(0.0, 1.0, 0.0), symbols="H")
 
+        elif name == "H2O-larger":
+            cell = [[20.0, 0.0, 0.0], [0.0, 20.0, 0.0], [0.0, 0.0, 20.0]]
+            structure = orm.StructureData(cell=cell)
+            structure.append_atom(position=(0.0, 0.0, 0.0), symbols="H")
+            structure.append_atom(position=(0.0, 0.0, 1.0), symbols="O")
+            structure.append_atom(position=(0.0, 1.0, 0.0), symbols="H")
+
         structure.pbc = pbc
 
         return structure

diff --git a/tests/test_submit_qe_workchain.py b/tests/test_submit_qe_workchain.py
@@ -129,6 +129,55 @@ def test_create_builder_advanced_settings(
     )
 
 
+@pytest.mark.usefixtures("aiida_profile_clean", "sssp")
+def test_warning_messages(
+    generate_structure_data,
+    submit_app_generator,
+):
+    """Test the creation of the warning messages.
+
+    For now, we test that the suggestions are indeed there.
+    We should check the whole message, but this is for now not easy to do: the message is built
+    on the fly with variables which are not accessible in this namespace.
+    """
+    import os
+
+    suggestions = {
+        "more_resources": "Increase the resources",
+        "change_configuration": "Review the configuration",
+        "go_remote": "Select a code that runs on a larger machine",
+        "avoid_overloading": "Reduce the number of CPUs to avoid the overloading of the local machine",
+    }
+
+    # Detect the CPUs the same way `check_resources` does:
+    # `os.sched_getaffinity` does not exist on every platform.
+    try:
+        localhost_cpus = len(os.sched_getaffinity(0))
+    except AttributeError:
+        localhost_cpus = os.cpu_count()
+
+    app: App = submit_app_generator(properties=["bands", "pdos"])
+    submit_step = app.submit_step
+    submit_model = app.submit_model
+
+    pw_code = submit_model.get_code("dft", "pw").get_setup_widget()
+    pw_code.num_cpus.value = 1
+    submit_model.check_resources()
+    # no warning:
+    # NOTE(review): this presumably requires a host exposing at least two
+    # CPUs, since a single requested CPU out of one is above the 80% limit.
+    assert submit_step.submission_warning_messages.value == ""
+
+    # now we increase the resources, so we should have the Warning-3
+    pw_code.num_cpus.value = localhost_cpus
+    submit_model.check_resources()
+    for suggestion in ["avoid_overloading", "go_remote"]:
+        assert suggestions[suggestion] in submit_step.submission_warning_messages.value
+
+    # now we use a large structure, so we should have the Warning-1 (and 2 if not on localhost)
+    structure = generate_structure_data("H2O-larger")
+    submit_model.input_structure = structure
+    pw_code.num_cpus.value = 1
+    submit_model.check_resources()
+    num_sites = len(structure.sites)
+    volume = structure.get_cell_volume()
+    estimated_CPUs = submit_model._estimate_min_cpus(num_sites, volume)
+    assert estimated_CPUs == 2
+    for suggestion in ["more_resources", "change_configuration"]:
+        assert suggestions[suggestion] in submit_step.submission_warning_messages.value
+
+
 def builder_to_readable_dict(builder):
     """transverse the builder and return a dictionary with readable values."""
     from aiida import orm