Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
edan-bainglass committed Oct 23, 2024
1 parent 4c35f98 commit 33ae705
Show file tree
Hide file tree
Showing 4 changed files with 237 additions and 28 deletions.
63 changes: 35 additions & 28 deletions src/aiidalab_qe/app/submission/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,17 @@ def __init__(self, model: SubmissionModel, qe_auto_setup=True, **kwargs):

self._model = model
self._model.observe(
self._on_process_change,
"process",
self._on_input_structure_change,
"input_structure",
)
self._model.observe(
self._on_input_parameters_change,
"input_parameters",
)
self._model.observe(
self._on_process_change,
"process",
)
self._model.observe(
self._on_submission_blockers_change,
[
Expand Down Expand Up @@ -73,26 +77,6 @@ def __init__(self, model: SubmissionModel, qe_auto_setup=True, **kwargs):

self.qe_auto_setup = qe_auto_setup

self._ALERT_MESSAGE = """
<div class="alert alert-{alert_class} alert-dismissible">
<a
href="#"
class="close"
data-dismiss="alert"
aria-label="close"
>
&times;
</a>
<span
class="closebtn"
onclick="this.parentElement.style.display='none';"
>
&times;
</span>
<strong>{message}</strong>
</div>
"""

plugin_codes: PluginCodes = get_entry_items("aiidalab_qe.properties", "code")
plugin_codes.update(
{
Expand Down Expand Up @@ -197,6 +181,12 @@ def render(self):
(self.submission_blocker_messages, "value"),
)

self.submission_warning_messages = ipw.HTML()
ipw.dlink(
(self._model, "submission_warning_messages"),
(self.submission_warning_messages, "value"),
)

self.children = [
ipw.HTML("""
<div style="padding-top: 0px; padding-bottom: 0px">
Expand All @@ -215,6 +205,7 @@ def render(self):
self.sssp_installation,
self.qe_setup,
self.submission_blocker_messages,
self.submission_warning_messages,
ipw.HTML("""
<div style="padding-top: 0px; padding-bottom: 0px">
<h4>Labeling Your Job</h4>
Expand All @@ -233,10 +224,20 @@ def render(self):

self.rendered = True

# Render and set up default PW code
pw_code = self._model.get_code("dft", "pw")
pw_code.activate()
pw_code_widget = pw_code.get_setup_widget()
pw_code_widget.num_cpus.observe(
self._on_pw_code_resource_change,
"value",
)
pw_code_widget.num_nodes.observe(
self._on_pw_code_resource_change,
"value",
)

# Render any active codes starting with pw
# Render any other active codes
self._toggle_code(pw_code)
for _, code in self._model.get_codes(flat=True):
if code is not pw_code and code.is_active:
Expand All @@ -251,18 +252,21 @@ def reset(self):
def _on_previous_step_state_change(self, _):
    """Observer callback: recompute this step's state.

    The change payload is ignored; the work is delegated to
    ``self._update_state()``.
    """
    self._update_state()

def _on_input_structure_change(self, _):
    """Observer callback: re-run the model's resource check.

    The change payload is ignored; the model reads the current structure
    itself inside ``check_resources()``.
    """
    model = self._model
    model.check_resources()

def _on_input_parameters_change(self, _):
    """Observer callback: refresh everything derived from the input parameters.

    Calls, in this order, the model's ``update_active_codes()``,
    ``update_process_label()`` and ``update_submission_blockers()``.
    The change payload is ignored.
    """
    model = self._model
    model.update_active_codes()
    model.update_process_label()
    model.update_submission_blockers()

def _on_process_change(self, _):
# Observer callback for the model's `process` trait; the change payload is ignored.
# Trait notifications on this step are held while the body of the `with` runs.
with self.hold_trait_notifications():
# TODO why here? Do we not populate traits earlier that would cover this?
# When a process is set, copy its input structure onto the model.
if self._model.process is not None:
self._model.input_structure = self._model.process.inputs.structure
# NOTE(review): indentation is lost in this view, so it is unclear whether
# this call runs inside or after the `with` block - confirm against the file.
self._update_state()

def _on_input_parameters_change(self, _):
    """Observer callback: refresh everything derived from the input parameters.

    Calls, in this order, the model's ``update_active_codes()``,
    ``update_process_label()`` and ``update_submission_blockers()``.
    The change payload is ignored.
    """
    model = self._model
    model.update_active_codes()
    model.update_process_label()
    model.update_submission_blockers()

def _on_submission_blockers_change(self, _):
self._model.update_submission_blocker_message()
self._update_state()
Expand All @@ -282,6 +286,9 @@ def _on_code_activation_change(self, change):
def _on_code_selection_change(self, _):
    """Observer callback: re-evaluate the model's submission blockers.

    The change payload is ignored.
    """
    model = self._model
    model.update_submission_blockers()

def _on_pw_code_resource_change(self, _):
    """Observer callback: re-run the model's resource check.

    The change payload is ignored; ``check_resources()`` reads the current
    widget values itself.
    """
    model = self._model
    model.check_resources()

def _on_submission(self, _):
self._model.submit()
self._update_state()
Expand Down
146 changes: 146 additions & 0 deletions src/aiidalab_qe/app/submission/model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import os
import typing as t
from copy import deepcopy

Expand Down Expand Up @@ -35,6 +36,7 @@ class SubmissionModel(tl.HasTraits):
process_description = tl.Unicode("")

submission_blocker_messages = tl.Unicode("")
submission_warning_messages = tl.Unicode("")

installing_qe = tl.Bool(False)
installing_sssp = tl.Bool(False)
Expand All @@ -55,6 +57,21 @@ class SubmissionModel(tl.HasTraits):

code_widgets: dict[str, QEAppComputationalResourcesWidget] = {}

def __init__(self, *args, **kwargs):
# Forward all arguments to the parent initialiser, then set the
# instance-level constants used when building resource warnings.
super().__init__(*args, **kwargs)

# A structure counts as "large" in `check_resources` when it exceeds
# either of these thresholds (number of sites, cell volume).
self._RUN_ON_LOCALHOST_NUM_SITES_WARN_THRESHOLD = 10
self._RUN_ON_LOCALHOST_VOLUME_WARN_THRESHOLD = 1000 # \AA^3

# HTML template for a dismissible alert box; filled with `str.format`
# using the `alert_class` and `message` placeholders.
self._ALERT_MESSAGE = """
<div class="alert alert-{alert_class} alert-dismissible">
<a href="#" class="close" data-dismiss="alert" aria-label="close">
&times;
</a>
<strong>{message}</strong>
</div>
"""

@property
def is_blocked(self):
return any(
Expand Down Expand Up @@ -84,6 +101,89 @@ def submit(self):
)
self.process = process

def check_resources(self):
    """Update ``submission_warning_messages`` for the selected pw.x resources.

    Two independent warnings can be produced:

    * under-resourced: the structure is "large" (more sites or a larger
      cell volume than the ``_RUN_ON_LOCALHOST_*`` thresholds) and the
      requested CPUs (``num_cpus * num_nodes``) are fewer than the
      estimate from ``_estimate_min_cpus``;
    * overloading: the code runs on ``localhost`` and requests more than
      80% of the CPUs available to this process.

    When neither applies the trait is set to an empty string. If no
    structure is set or no pw code is selected, the method returns early
    and leaves the trait untouched.
    """
    pw_code_model = self.get_code("dft", "pw")

    if not self.input_structure or not pw_code_model.selected:
        return  # No code selected or no structure, so nothing to do

    pw_code = pw_code_model.get_setup_widget()
    num_cpus = pw_code.num_cpus.value * pw_code.num_nodes.value
    on_localhost = orm.load_node(pw_code.value).computer.hostname == "localhost"
    num_sites = len(self.input_structure.sites)
    volume = self.input_structure.get_cell_volume()

    try:
        localhost_cpus = len(os.sched_getaffinity(0))
    except (AttributeError, OSError):
        # os.sched_getaffinity is not available on every OS. os.cpu_count()
        # is less reliable in containers and may return None; fall back to
        # 1 so the ratio below stays well defined (and errs towards warning).
        localhost_cpus = os.cpu_count() or 1

    large_system = (
        num_sites > self._RUN_ON_LOCALHOST_NUM_SITES_WARN_THRESHOLD
        or volume > self._RUN_ON_LOCALHOST_VOLUME_WARN_THRESHOLD
    )

    # Estimated number of CPUs for a run less than 12 hours.
    estimated_CPUs = self._estimate_min_cpus(num_sites, volume)

    under_resourced = large_system and estimated_CPUs > num_cpus
    overloading = on_localhost and num_cpus / localhost_cpus > 0.8

    # List of possible suggestions for warnings:
    suggestions = {
        "more_resources": f"<li>Increase the resources (total number of CPUs should be equal or more than {min(100, estimated_CPUs)}, if possible) </li>",
        "change_configuration": "<li>Review the configuration (e.g. choosing <i>fast protocol</i> - this will affect precision) </li>",
        "go_remote": "<li>Select a code that runs on a larger machine</li>",
        "avoid_overloading": "<li>Reduce the number of CPUs to avoid the overloading of the local machine </li>",
    }

    alert_message = ""
    if under_resourced:
        # Warnings 1 (on localhost) and 2 (remote) share the same opening.
        # Only the tail of the sentence depends on where the code runs, so
        # the conditional is applied to that tail alone; applying it to the
        # whole concatenated literal would drop the opening when remote.
        where = (
            "to run at the localhost. "
            if on_localhost
            else "to run in a reasonable amount of time. "
        )
        warnings_1_2 = (
            f"<span>&#9888;</span> Warning: The selected structure is large, with {num_sites} atoms "
            f"and a volume of {int(volume)} Å<sup>3</sup>, "
            "making it computationally demanding "
            f"{where}Consider the following: "
        )
        # A remote run additionally gets the "larger machine" suggestion.
        items = [] if on_localhost else [suggestions["go_remote"]]
        items.append(suggestions["more_resources"])
        items.append(suggestions["change_configuration"])
        alert_message += f"{warnings_1_2}<ul>{''.join(items)}</ul>"

    if overloading:
        # Warning 3: on localhost, requesting more than 80% of the available CPUs
        alert_message += (
            "<span>&#9888;</span> Warning: the selected pw.x code will run locally, but "
            f"the number of requested CPUs ({num_cpus}) is larger than the 80% of the available resources ({localhost_cpus}). "
            "Please be sure that your local "
            "environment has enough free CPUs for the calculation. Consider the following: "
            "<ul>"
            + suggestions["avoid_overloading"]
            + suggestions["go_remote"]
            + "</ul>"
        )

    # An empty message means neither warning fired, so clear the alert box.
    self.submission_warning_messages = (
        self._ALERT_MESSAGE.format(
            alert_class="warning",
            message=alert_message,
        )
        if alert_message
        else ""
    )

def update_active_codes(self):
for name, code in self.get_codes(flat=True):
if name != "pw":
Expand Down Expand Up @@ -278,3 +378,49 @@ def _check_submission_blockers(self):
yield (
f"Error: hi, plugin developer, please use the QEAppComputationalResourcesWidget from aiidalab_qe.common.widgets for code {name}."
)

def _estimate_min_cpus(
    self,
    n,
    v,
    n0=9,
    v0=117,
    num_cpus0=4,
    t0=129.6,
    tmax=12 * 60 * 60,
    scf_cycles=5,
):
    """Estimate the minimum number of CPUs needed to finish within ``tmax``.

    A reference calculation (``n0`` atoms, volume ``v0``, ``num_cpus0``
    CPUs, time ``t0``) is scaled cubically with the number of atoms and
    with the power 1.5 of the volume, multiplied by the number of SCF
    cycles, and divided by the allowed time.

    Parameters
    ----------
    n : int
        The number of atoms in the system.
    v : float
        The volume of the system.
    n0 : int, optional
        Reference number of atoms. Default is 9.
    v0 : float, optional
        Reference volume. Default is 117.
    num_cpus0 : int, optional
        Reference number of CPUs. Default is 4.
    t0 : float, optional
        Reference time. Default is 129.6.
    tmax : float, optional
        Maximum time limit. Default is 12 hours (``12 * 60 * 60``).
    scf_cycles : int, optional
        Reference number of SCF cycles in a relaxation. Default is 5.

    Returns
    -------
    int
        The estimated minimum number of CPUs required, rounded up.
    """
    import numpy as np

    atom_scaling = (n / n0) ** 3
    volume_scaling = (v / v0) ** 1.5
    # Keep the left-to-right evaluation order of the product unchanged so
    # the floating-point result (and hence the rounding) is identical.
    raw_estimate = (
        scf_cycles * num_cpus0 * atom_scaling * volume_scaling * t0 / tmax
    )
    return int(np.ceil(raw_estimate))
7 changes: 7 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,13 @@ def _generate_structure_data(name="silicon", pbc=(True, True, True)):
structure.append_atom(position=(0.0, 0.0, 1.0), symbols="O")
structure.append_atom(position=(0.0, 1.0, 0.0), symbols="H")

elif name == "H2O-larger":
cell = [[20.0, 0.0, 0.0], [0.0, 20.0, 0.0], [0.0, 0.0, 20.0]]
structure = orm.StructureData(cell=cell)
structure.append_atom(position=(0.0, 0.0, 0.0), symbols="H")
structure.append_atom(position=(0.0, 0.0, 1.0), symbols="O")
structure.append_atom(position=(0.0, 1.0, 0.0), symbols="H")

structure.pbc = pbc

return structure
Expand Down
49 changes: 49 additions & 0 deletions tests/test_submit_qe_workchain.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,55 @@ def test_create_builder_advanced_settings(
)


@pytest.mark.usefixtures("aiida_profile_clean", "sssp")
def test_warning_messages(
    generate_structure_data,
    submit_app_generator,
):
    """Test the creation of the warning messages.

    For now, we only test that the expected suggestions are present.
    Checking the whole message is not easy yet: it is built on the fly
    from variables that are not accessible in this namespace.
    """
    import os

    suggestions = {
        "more_resources": "Increase the resources",
        "change_configuration": "Review the configuration",
        "go_remote": "Select a code that runs on a larger machine",
        "avoid_overloading": "Reduce the number of CPUs to avoid the overloading of the local machine",
    }

    app: App = submit_app_generator(properties=["bands", "pdos"])
    submit_step = app.submit_step
    submit_model = app.submit_model

    pw_code = submit_model.get_code("dft", "pw").get_setup_widget()
    pw_code.num_cpus.value = 1
    submit_model.check_resources()
    # no warning:
    # NOTE(review): presumably this assumes the test machine exposes more
    # than one CPU (1 of 1 CPU would exceed the 80% threshold) - confirm.
    assert submit_step.submission_warning_messages.value == ""

    # now we increase the resources, so we should have the Warning-3
    # Mirror the fallback used by `check_resources`: os.sched_getaffinity
    # does not exist on every platform.
    try:
        available_cpus = len(os.sched_getaffinity(0))
    except AttributeError:
        available_cpus = os.cpu_count()
    pw_code.num_cpus.value = available_cpus
    submit_model.check_resources()
    for suggestion in ["avoid_overloading", "go_remote"]:
        assert suggestions[suggestion] in submit_step.submission_warning_messages.value

    # now we use a large structure, so we should have the Warning-1 (and 2 if not on localhost)
    structure = generate_structure_data("H2O-larger")
    submit_model.input_structure = structure
    pw_code.num_cpus.value = 1
    submit_model.check_resources()
    num_sites = len(structure.sites)
    volume = structure.get_cell_volume()
    estimated_CPUs = submit_model._estimate_min_cpus(num_sites, volume)
    assert estimated_CPUs == 2
    for suggestion in ["more_resources", "change_configuration"]:
        assert suggestions[suggestion] in submit_step.submission_warning_messages.value


def builder_to_readable_dict(builder):
"""transverse the builder and return a dictionary with readable values."""
from aiida import orm
Expand Down

0 comments on commit 33ae705

Please sign in to comment.