Skip to content

Commit

Permalink
PwParser: Keep scheduler parser error unless more specific available
Browse files Browse the repository at this point in the history
When a `CalcJob` is retrieved, the engine will first call the parser of
the scheduler plugin if one is implemented. If that scheduler parser detects
a problem and returns an exit code, it will be set on the `CalcJobNode`. The
`PwParser` is updated to check for the presence of this exit code and
only override it if a more specific exit code can be provided.

Note that we have to manually recreate the `ExitCode` and return it
because returning `None` is currently equivalent to returning
`ExitCode(0)`.
  • Loading branch information
sphuber committed Apr 18, 2023
1 parent ef21642 commit b8d6a3a
Show file tree
Hide file tree
Showing 6 changed files with 346 additions and 2 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ classifiers = [
keywords = ['aiida', 'workflows']
requires-python = '>=3.8'
dependencies = [
'aiida_core[atomic_tools]~=2.1',
'aiida_core[atomic_tools]~=2.3',
'aiida-pseudo~=1.0',
'click~=8.0',
'importlib_resources',
Expand Down
1 change: 1 addition & 0 deletions src/aiida_quantumespresso/calculations/pw.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def xml_filepaths(cls):
@classmethod
def define(cls, spec):
"""Define the process specification."""
# pylint: disable=too-many-statements
# yapf: disable
super().define(spec)
spec.input('metadata.options.parser_name', valid_type=str, default='quantumespresso.pw')
Expand Down
12 changes: 11 additions & 1 deletion src/aiida_quantumespresso/parsers/pw.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from aiida import orm
from aiida.common import exceptions
from aiida.engine import ExitCode
import numpy

from aiida_quantumespresso.utils.mapping import get_logging_container
Expand Down Expand Up @@ -107,7 +108,16 @@ def parse(self, **kwargs):
ignore = ['Error while parsing ethr.', 'DEPRECATED: symmetry with ibrav=0, use correct ibrav instead']
self.emit_logs([logs_stdout, logs_xml], ignore=ignore)

# First check for specific known problems that can cause a pre-mature termination of the calculation
# If either the stdout or XML were incomplete or corrupt, investigate the potential cause
if self.exit_code_stdout or self.exit_code_xml:

# First check whether the scheduler already reported an exit code.
if self.node.exit_status is not None:

# Now it is unlikely we can provide a more specific exit code so we keep the scheduler one.
return ExitCode(self.node.exit_status, self.node.exit_message)

# Check for specific known problems that can cause a premature termination of the calculation
exit_code = self.validate_premature_exit(logs_stdout)
if exit_code:
return self.exit(exit_code)
Expand Down
69 changes: 69 additions & 0 deletions tests/parsers/fixtures/pw/failed_interrupted_relax/aiida.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@

Program PWSCF v.7.1 starts on 24Jan2023 at 16:41:50

... CONTENT REMOVED

Writing config-only to output data dir ./out/aiida.save/
NEW-OLD atomic charge density approx. for the potential
extrapolated charge 8.09399, renormalised to 8.00000

total cpu time spent up to now is 17.9 secs

Self-consistent Calculation

iteration # 1 ecut= 30.00 Ry beta= 0.40
Davidson diagonalization with overlap

---- Real-time Memory Report at c_bands before calling an iterative solver
96 MiB given to the printing process from OS
0 MiB allocation reported by mallinfo(arena+hblkhd)
218088 MiB available memory on the node where the printing process lives
------------------
ethr = 1.00E-06, avg # of iterations = 3.1

total cpu time spent up to now is 18.3 secs

WARNING: integrated charge= 8.00027708, expected= 8.00000000

total energy = -22.83490430 Ry
estimated scf accuracy < 0.00004952 Ry

iteration # 2 ecut= 30.00 Ry beta= 0.40
Davidson diagonalization with overlap

---- Real-time Memory Report at c_bands before calling an iterative solver
96 MiB given to the printing process from OS
0 MiB allocation reported by mallinfo(arena+hblkhd)
218087 MiB available memory on the node where the printing process lives
------------------
ethr = 6.19E-07, avg # of iterations = 2.5

total cpu time spent up to now is 18.7 secs

WARNING: integrated charge= 8.00027539, expected= 8.00000000

total energy = -22.83519985 Ry
estimated scf accuracy < 0.00001579 Ry

iteration # 3 ecut= 30.00 Ry beta= 0.40
Davidson diagonalization with overlap

---- Real-time Memory Report at c_bands before calling an iterative solver
96 MiB given to the printing process from OS
0 MiB allocation reported by mallinfo(arena+hblkhd)
218087 MiB available memory on the node where the printing process lives
------------------
ethr = 1.97E-07, avg # of iterations = 1.0

total cpu time spent up to now is 19.0 secs

WARNING: integrated charge= 8.00028503, expected= 8.00000000

total energy = -22.83519230 Ry
estimated scf accuracy < 0.00000790 Ry

iteration # 4 ecut= 30.00 Ry beta= 0.40
Davidson diagonalization with overlap


... CONTENT REMOVED
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
<?xml version="1.0" encoding="UTF-8"?>
<qes:espresso xsi:schemaLocation="http://www.quantum-espresso.org/ns/qes/qes-1.0 http://www.quantum-espresso.org/ns/qes/qes_220603.xsd" Units="Hartree atomic units" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:qes="http://www.quantum-espresso.org/ns/qes/qes-1.0">
<!--All quantities are in Hartree atomic units unless otherwise specified-->
<general_info>
<xml_format NAME="QEXSD" VERSION="21.11.01">QEXSD_21.11.01</xml_format>
<creator NAME="PWSCF" VERSION="7.1">XML file generated by PWSCF</creator>
<created DATE="24Jan2023" TIME="16:42:24">This run was terminated on: 16:42:24 24 Jan 2023</created>
<job></job>
</general_info>
<parallel_info>
<nprocs>1</nprocs>
<nthreads>1</nthreads>
<ntasks>1</ntasks>
<nbgrp>1</nbgrp>
<npool>1</npool>
<ndiag>1</ndiag>
</parallel_info>
<input>
<control_variables>
<title></title>
<calculation>vc-relax</calculation>
<restart_mode>from_scratch</restart_mode>
<prefix>aiida</prefix>
<pseudo_dir>./pseudo/</pseudo_dir>
<outdir>./out/</outdir>
<stress>true</stress>
<forces>true</forces>
<wf_collect>true</wf_collect>
<disk_io>low</disk_io>
<max_seconds>41040</max_seconds>
<nstep>50</nstep>
<etot_conv_thr>1.000000000000000e-4</etot_conv_thr>
<forc_conv_thr>5.000000000000000e-4</forc_conv_thr>
<press_conv_thr>5.000000000000000e-1</press_conv_thr>
<verbosity>high</verbosity>
<print_every>100000</print_every>
<fcp>false</fcp>
<rism>false</rism>
</control_variables>
<atomic_species ntyp="1">
<species name="Si">
<mass>2.808500000000000e1</mass>
<pseudo_file>Si.pbe-n-rrkjus_psl.1.0.0.UPF</pseudo_file>
</species>
</atomic_species>
<atomic_structure nat="2" alat="7.255773194184e0">
<atomic_positions>
<atom name="Si" index="1">0.000000000000000e0 0.000000000000000e0 0.000000000000000e0</atom>
<atom name="Si" index="2">2.565303214179483e0 2.565303214179483e0 2.565303214179483e0</atom>
</atomic_positions>
<cell>
<a1>0.000000000000000e0 5.130606428358965e0 5.130606428358965e0</a1>
<a2>5.130606428358965e0 0.000000000000000e0 5.130606428358965e0</a2>
<a3>5.130606428358965e0 5.130606428358965e0 0.000000000000000e0</a3>
</cell>
</atomic_structure>
<dft>
<functional>PBE</functional>
</dft>
<spin>
<lsda>false</lsda>
<noncolin>false</noncolin>
<spinorbit>false</spinorbit>
</spin>
<bands>
<smearing degauss="5.000000000000e-3">mv</smearing>
<tot_charge>0.000000000000000e0</tot_charge>
<occupations>smearing</occupations>
</bands>
<basis>
<gamma_only>false</gamma_only>
<ecutwfc>1.500000000000000e1</ecutwfc>
<ecutrho>1.200000000000000e2</ecutrho>
</basis>
<electron_control>
<diagonalization>davidson</diagonalization>
<mixing_mode>plain</mixing_mode>
<mixing_beta>4.000000000000000e-1</mixing_beta>
<conv_thr>4.000000000000000e-10</conv_thr>
<mixing_ndim>8</mixing_ndim>
<max_nstep>80</max_nstep>
<real_space_q>false</real_space_q>
<real_space_beta>false</real_space_beta>
<tq_smoothing>false</tq_smoothing>
<tbeta_smoothing>false</tbeta_smoothing>
<diago_thr_init>0.000000000000000e0</diago_thr_init>
<diago_full_acc>false</diago_full_acc>
<diago_cg_maxiter>20</diago_cg_maxiter>
<diago_ppcg_maxiter>20</diago_ppcg_maxiter>
<diago_rmm_ndim>4</diago_rmm_ndim>
<diago_gs_nblock>16</diago_gs_nblock>
<diago_rmm_conv>false</diago_rmm_conv>
</electron_control>
<k_points_IBZ>
<monkhorst_pack nk1="5" nk2="5" nk3="5" k1="0" k2="0" k3="0">Monkhorst-Pack</monkhorst_pack>
</k_points_IBZ>
<ion_control>
<ion_dynamics>bfgs</ion_dynamics>
<upscale>1.000000000000000e2</upscale>
<remove_rigid_rot>false</remove_rigid_rot>
<refold_pos>false</refold_pos>
<bfgs>
<ndim>1</ndim>
<trust_radius_min>1.000000000000000e-4</trust_radius_min>
<trust_radius_max>8.000000000000000e-1</trust_radius_max>
<trust_radius_init>5.000000000000000e-1</trust_radius_init>
<w1>1.000000000000000e-2</w1>
<w2>5.000000000000000e-1</w2>
</bfgs>
</ion_control>
<cell_control>
<cell_dynamics>bfgs</cell_dynamics>
<pressure>0.000000000000000e0</pressure>
<wmass>5.617000000000001e1</wmass>
<cell_do_free>all</cell_do_free>
</cell_control>
<symmetry_flags>
<nosym>false</nosym>
<nosym_evc>false</nosym_evc>
<noinv>false</noinv>
<no_t_rev>false</no_t_rev>
<force_symmorphic>false</force_symmorphic>
<use_all_frac>false</use_all_frac>
</symmetry_flags>
<free_positions rank="2" dims="3 2">
1 1 1
1 1 1
</free_positions>
</input>
<output>
<convergence_info>
<scf_conv>
<convergence_achieved>true</convergence_achieved>
<n_scf_steps>9</n_scf_steps>
<scf_error>1.011022286698610e-13</scf_error>
</scf_conv>
<opt_conv>
<convergence_achieved>true</convergence_achieved>
<n_opt_steps>2</n_opt_steps>
<grad_norm>3.530839674569442e-33</grad_norm>
</opt_conv>
</convergence_info>
<algorithmic_info>
<real_space_q>false</real_space_q>
<real_space_beta>false</real_space_beta>
<uspp>true</uspp>
<paw>false</paw>
</algorithmic_info>
<atomic_species ntyp="1" pseudo_dir="./pseudo/">
<species name="Si">
<mass>2.808500000000000e1</mass>
<pseudo_file>Si.pbe-n-rrkjus_psl.1.0.0.UPF</pseudo_file>
</species>
</atomic_species>
<atomic_structure nat="2" alat="7.255773194184e0">
<atomic_positions>
<atom name="Si" index="1">1.654282468754416e-32 8.768940425434829e-32 5.987564757578667e-32</atom>
<atom name="Si" index="2">2.585178551024855e0 2.585178551024855e0 2.585178551024855e0</atom>
</atomic_positions>
<cell>
<a1>1.167348011550243e-18 5.170357102049712e0 5.170357102049712e0</a1>
<a2>5.170357102049712e0 -1.567770544918550e-17 5.170357102049712e0</a2>
<a3>5.170357102049712e0 5.170357102049712e0 -4.936781237065698e-18</a3>
</cell>
</atomic_structure>
<basis_set>
<gamma_only>false</gamma_only>
<ecutwfc>1.500000000000000e1</ecutwfc>
<ecutrho>1.200000000000000e2</ecutrho>
<fft_grid nr1="40" nr2="40" nr3="40"></fft_grid>
<fft_smooth nr1="32" nr2="32" nr3="32"></fft_smooth>
<fft_box nr1="40" nr2="40" nr3="40"></fft_box>
<ngm>17261</ngm>
<ngms>6183</ngms>
<npwx>785</npwx>
<reciprocal_lattice>
<b1>-7.016704118277816e-1 7.016704118277816e-1 7.016704118277816e-1</b1>
<b2>7.016704118277816e-1 -7.016704118277816e-1 7.016704118277816e-1</b2>
<b3>7.016704118277816e-1 7.016704118277816e-1 -7.016704118277816e-1</b3>
</reciprocal_lattice>
</basis_set>
<dft>
<functional>PBE</functional>
</dft>
<magnetization>
<lsda>false</lsda>
<noncolin>false</noncolin>
<spinorbit>false</spinorbit>
<absolute>0.000000000000000e0</absolute>
</magnetization>
<total_energy>
<etot>-1.141760135412168e1</etot>
<eband>2.911290566947440e-1</eband>
<ehart>5.654148572292366e-1</ehart>
<vtxc>-3.334728213127106e0</vtxc>
<etxc>-6.143149505066509e0</etxc>
<ewald>-8.334895155792580e0</ewald>
<demet>1.225499808665200e-6</demet>
</total_energy>
<band_structure>
<lsda>false</lsda>
<noncolin>false</noncolin>
<spinorbit>false</spinorbit>
<nbnd>8</nbnd>
<nelec>8.000000000000000e0</nelec>
<num_of_atomic_wfc>8</num_of_atomic_wfc>
<wf_collected>true</wf_collected>
<fermi_energy>2.339433967352145e-1</fermi_energy>
<starting_k_points>
<monkhorst_pack nk1="5" nk2="5" nk3="5" k1="0" k2="0" k3="0">Monkhorst-Pack</monkhorst_pack>
</starting_k_points>
<nks>10</nks>
<occupations_kind>smearing</occupations_kind>
<smearing degauss="5.000000000000e-3">mv</smearing>
</band_structure>
<forces rank="2" dims="3 2">
-7.207296305224787e-34 7.207296305224787e-34 -7.207296305224787e-34
7.207296305224787e-34 -7.207296305224787e-34 7.207296305224787e-34
</forces>
<stress rank="2" dims="3 3">
1.777343942666275e-6 -9.859678771171478e-22 -7.282330695072767e-23
-6.683305218967851e-22 1.777343942666275e-6 -6.022188989846654e-22
1.389349298628474e-22 -6.022188989846654e-22 1.777343942666275e-6
</stress>
</output>
<exit_status>0</exit_status>
<closed DATE="24 Jan 2023" TIME="16:42:24"></closed>
</qes:espresso>
36 changes: 36 additions & 0 deletions tests/parsers/test_pw.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from aiida.common import AttributeDict
import pytest

from aiida_quantumespresso.calculations.pw import PwCalculation


@pytest.fixture
def generate_inputs(generate_structure):
Expand Down Expand Up @@ -496,6 +498,40 @@ def test_pw_failed_interrupted_xml(
data_regression.check(results['output_parameters'].get_dict())


@pytest.mark.parametrize(
    'test_case, expected_exit_code', (
        ('default', None),
        ('failed_interrupted', PwCalculation.exit_codes.ERROR_SCHEDULER_OUT_OF_WALLTIME),
    )
)
def test_pw_failed_interrupted_scheduler(
    fixture_localhost, generate_calc_job_node, generate_parser, generate_inputs, test_case, expected_exit_code
):
    """Check that a scheduler exit code survives parsing unless the parser has something more specific.

    Two scenarios are parametrized: ``default`` (outputs of a successful run) and ``failed_interrupted`` (outputs of
    a run that was cut short, typically because the scheduler killed the job). In both scenarios the status of
    ``ERROR_SCHEDULER_OUT_OF_WALLTIME`` is set on the node before the parser is invoked, mimicking what the scheduler
    parser would have done.

    * ``default``: the pre-set status is expected to be disregarded and the parsing to finish successfully.
    * ``failed_interrupted``: the pre-set scheduler status is expected to be kept, i.e. the parsing finishes with
      the status of ``ERROR_SCHEDULER_OUT_OF_WALLTIME``.
    """
    scheduler_exit_code = PwCalculation.exit_codes.ERROR_SCHEDULER_OUT_OF_WALLTIME

    # Build the calculation node and mark it as if the scheduler parser had already reported a walltime problem.
    node = generate_calc_job_node('quantumespresso.pw', fixture_localhost, test_case, generate_inputs())
    node.set_exit_status(scheduler_exit_code.status)

    _, calcfunction = generate_parser('quantumespresso.pw').parse_from_node(node, store_provenance=False)

    if expected_exit_code is not None:
        # Interrupted run: the parsing must fail with the expected (scheduler) exit status.
        assert not calcfunction.is_finished_ok, calcfunction.exception
        assert calcfunction.exit_status == expected_exit_code.status
        return

    # Successful run: the scheduler status set on the node must not leak into the parsing result.
    assert calcfunction.is_finished_ok, (calcfunction.exit_status, calcfunction.exception)


@pytest.mark.parametrize('filename', ('', '_stdout'))
def test_pw_npools_too_high_error(
fixture_localhost, generate_calc_job_node, generate_parser, generate_inputs, filename
Expand Down

0 comments on commit b8d6a3a

Please sign in to comment.