From b8d6a3a489c8d24a06cfc58956a97b121ad92316 Mon Sep 17 00:00:00 2001 From: Sebastiaan Huber Date: Tue, 24 Jan 2023 07:28:24 +0100 Subject: [PATCH] `PwParser`: Keep scheduler parser error unless more specific available When a `CalcJob` is retrieved, the engine will first call the parser of the scheduler plugin if one is implemented. If that detects a problem and returns an exit code, it will be set on the `CalcJobNode`. The `PwParser` is updated to check for the presence of this exit code and only override it if a more specific exit code can be provided. Note that we have to manually recreate the `ExitCode` and return it because returning `None` is currently equivalent to returning `ExitCode(0)`. --- pyproject.toml | 2 +- src/aiida_quantumespresso/calculations/pw.py | 1 + src/aiida_quantumespresso/parsers/pw.py | 12 +- .../pw/failed_interrupted_relax/aiida.out | 69 ++++++ .../data-file-schema.xml | 228 ++++++++++++++++++ tests/parsers/test_pw.py | 36 +++ 6 files changed, 346 insertions(+), 2 deletions(-) create mode 100644 tests/parsers/fixtures/pw/failed_interrupted_relax/aiida.out create mode 100644 tests/parsers/fixtures/pw/failed_interrupted_relax/data-file-schema.xml diff --git a/pyproject.toml b/pyproject.toml index 2771934f6..5d244d952 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ keywords = ['aiida', 'workflows'] requires-python = '>=3.8' dependencies = [ - 'aiida_core[atomic_tools]~=2.1', + 'aiida_core[atomic_tools]~=2.3', 'aiida-pseudo~=1.0', 'click~=8.0', 'importlib_resources', diff --git a/src/aiida_quantumespresso/calculations/pw.py b/src/aiida_quantumespresso/calculations/pw.py index 511c93b2a..e2eabcd18 100644 --- a/src/aiida_quantumespresso/calculations/pw.py +++ b/src/aiida_quantumespresso/calculations/pw.py @@ -61,6 +61,7 @@ def xml_filepaths(cls): @classmethod def define(cls, spec): """Define the process specification.""" + # pylint: disable=too-many-statements # yapf: disable super().define(spec) 
spec.input('metadata.options.parser_name', valid_type=str, default='quantumespresso.pw') diff --git a/src/aiida_quantumespresso/parsers/pw.py b/src/aiida_quantumespresso/parsers/pw.py index 171c4cdc1..61152a472 100644 --- a/src/aiida_quantumespresso/parsers/pw.py +++ b/src/aiida_quantumespresso/parsers/pw.py @@ -4,6 +4,7 @@ from aiida import orm from aiida.common import exceptions +from aiida.engine import ExitCode import numpy from aiida_quantumespresso.utils.mapping import get_logging_container @@ -107,7 +108,16 @@ def parse(self, **kwargs): ignore = ['Error while parsing ethr.', 'DEPRECATED: symmetry with ibrav=0, use correct ibrav instead'] self.emit_logs([logs_stdout, logs_xml], ignore=ignore) - # First check for specific known problems that can cause a pre-mature termination of the calculation + # If either the stdout or XML were incomplete or corrupt investigate the potential cause + if self.exit_code_stdout or self.exit_code_xml: + + # First check whether the scheduler already reported an exit code. + if self.node.exit_status is not None: + + # Now it is unlikely we can provide a more specific exit code so we keep the scheduler one. + return ExitCode(self.node.exit_status, self.node.exit_message) + + # Check for specific known problems that can cause a pre-mature termination of the calculation exit_code = self.validate_premature_exit(logs_stdout) if exit_code: return self.exit(exit_code) diff --git a/tests/parsers/fixtures/pw/failed_interrupted_relax/aiida.out b/tests/parsers/fixtures/pw/failed_interrupted_relax/aiida.out new file mode 100644 index 000000000..e96a80db6 --- /dev/null +++ b/tests/parsers/fixtures/pw/failed_interrupted_relax/aiida.out @@ -0,0 +1,69 @@ + + Program PWSCF v.7.1 starts on 24Jan2023 at 16:41:50 + +... CONTENT REMOVED + + Writing config-only to output data dir ./out/aiida.save/ + NEW-OLD atomic charge density approx. 
for the potential + extrapolated charge 8.09399, renormalised to 8.00000 + + total cpu time spent up to now is 17.9 secs + + Self-consistent Calculation + + iteration # 1 ecut= 30.00 Ry beta= 0.40 + Davidson diagonalization with overlap + +---- Real-time Memory Report at c_bands before calling an iterative solver + 96 MiB given to the printing process from OS + 0 MiB allocation reported by mallinfo(arena+hblkhd) + 218088 MiB available memory on the node where the printing process lives +------------------ + ethr = 1.00E-06, avg # of iterations = 3.1 + + total cpu time spent up to now is 18.3 secs + + WARNING: integrated charge= 8.00027708, expected= 8.00000000 + + total energy = -22.83490430 Ry + estimated scf accuracy < 0.00004952 Ry + + iteration # 2 ecut= 30.00 Ry beta= 0.40 + Davidson diagonalization with overlap + +---- Real-time Memory Report at c_bands before calling an iterative solver + 96 MiB given to the printing process from OS + 0 MiB allocation reported by mallinfo(arena+hblkhd) + 218087 MiB available memory on the node where the printing process lives +------------------ + ethr = 6.19E-07, avg # of iterations = 2.5 + + total cpu time spent up to now is 18.7 secs + + WARNING: integrated charge= 8.00027539, expected= 8.00000000 + + total energy = -22.83519985 Ry + estimated scf accuracy < 0.00001579 Ry + + iteration # 3 ecut= 30.00 Ry beta= 0.40 + Davidson diagonalization with overlap + +---- Real-time Memory Report at c_bands before calling an iterative solver + 96 MiB given to the printing process from OS + 0 MiB allocation reported by mallinfo(arena+hblkhd) + 218087 MiB available memory on the node where the printing process lives +------------------ + ethr = 1.97E-07, avg # of iterations = 1.0 + + total cpu time spent up to now is 19.0 secs + + WARNING: integrated charge= 8.00028503, expected= 8.00000000 + + total energy = -22.83519230 Ry + estimated scf accuracy < 0.00000790 Ry + + iteration # 4 ecut= 30.00 Ry beta= 0.40 + Davidson diagonalization 
with overlap + + +... CONTENT REMOVED diff --git a/tests/parsers/fixtures/pw/failed_interrupted_relax/data-file-schema.xml b/tests/parsers/fixtures/pw/failed_interrupted_relax/data-file-schema.xml new file mode 100644 index 000000000..1a1e90ca0 --- /dev/null +++ b/tests/parsers/fixtures/pw/failed_interrupted_relax/data-file-schema.xml @@ -0,0 +1,228 @@ + + + + + QEXSD_21.11.01 + XML file generated by PWSCF + This run was terminated on: 16:42:24 24 Jan 2023 + + + + 1 + 1 + 1 + 1 + 1 + 1 + + + + + vc-relax + from_scratch + aiida + ./pseudo/ + ./out/ + true + true + true + low + 41040 + 50 + 1.000000000000000e-4 + 5.000000000000000e-4 + 5.000000000000000e-1 + high + 100000 + false + false + + + + 2.808500000000000e1 + Si.pbe-n-rrkjus_psl.1.0.0.UPF + + + + + 0.000000000000000e0 0.000000000000000e0 0.000000000000000e0 + 2.565303214179483e0 2.565303214179483e0 2.565303214179483e0 + + + 0.000000000000000e0 5.130606428358965e0 5.130606428358965e0 + 5.130606428358965e0 0.000000000000000e0 5.130606428358965e0 + 5.130606428358965e0 5.130606428358965e0 0.000000000000000e0 + + + + PBE + + + false + false + false + + + mv + 0.000000000000000e0 + smearing + + + false + 1.500000000000000e1 + 1.200000000000000e2 + + + davidson + plain + 4.000000000000000e-1 + 4.000000000000000e-10 + 8 + 80 + false + false + false + false + 0.000000000000000e0 + false + 20 + 20 + 4 + 16 + false + + + Monkhorst-Pack + + + bfgs + 1.000000000000000e2 + false + false + + 1 + 1.000000000000000e-4 + 8.000000000000000e-1 + 5.000000000000000e-1 + 1.000000000000000e-2 + 5.000000000000000e-1 + + + + bfgs + 0.000000000000000e0 + 5.617000000000001e1 + all + + + false + false + false + false + false + false + + + 1 1 1 + 1 1 1 + + + + + + true + 9 + 1.011022286698610e-13 + + + true + 2 + 3.530839674569442e-33 + + + + false + false + true + false + + + + 2.808500000000000e1 + Si.pbe-n-rrkjus_psl.1.0.0.UPF + + + + + 1.654282468754416e-32 8.768940425434829e-32 5.987564757578667e-32 + 2.585178551024855e0 
2.585178551024855e0 2.585178551024855e0 + + + 1.167348011550243e-18 5.170357102049712e0 5.170357102049712e0 + 5.170357102049712e0 -1.567770544918550e-17 5.170357102049712e0 + 5.170357102049712e0 5.170357102049712e0 -4.936781237065698e-18 + + + + false + 1.500000000000000e1 + 1.200000000000000e2 + + + + 17261 + 6183 + 785 + + -7.016704118277816e-1 7.016704118277816e-1 7.016704118277816e-1 + 7.016704118277816e-1 -7.016704118277816e-1 7.016704118277816e-1 + 7.016704118277816e-1 7.016704118277816e-1 -7.016704118277816e-1 + + + + PBE + + + false + false + false + 0.000000000000000e0 + + + -1.141760135412168e1 + 2.911290566947440e-1 + 5.654148572292366e-1 + -3.334728213127106e0 + -6.143149505066509e0 + -8.334895155792580e0 + 1.225499808665200e-6 + + + false + false + false + 8 + 8.000000000000000e0 + 8 + true + 2.339433967352145e-1 + + Monkhorst-Pack + + 10 + smearing + mv + + + -7.207296305224787e-34 7.207296305224787e-34 -7.207296305224787e-34 + 7.207296305224787e-34 -7.207296305224787e-34 7.207296305224787e-34 + + + 1.777343942666275e-6 -9.859678771171478e-22 -7.282330695072767e-23 + -6.683305218967851e-22 1.777343942666275e-6 -6.022188989846654e-22 + 1.389349298628474e-22 -6.022188989846654e-22 1.777343942666275e-6 + + + 0 + + diff --git a/tests/parsers/test_pw.py b/tests/parsers/test_pw.py index 98e2f266e..3b1413f9b 100644 --- a/tests/parsers/test_pw.py +++ b/tests/parsers/test_pw.py @@ -5,6 +5,8 @@ from aiida.common import AttributeDict import pytest +from aiida_quantumespresso.calculations.pw import PwCalculation + @pytest.fixture def generate_inputs(generate_structure): @@ -496,6 +498,40 @@ def test_pw_failed_interrupted_xml( data_regression.check(results['output_parameters'].get_dict()) +@pytest.mark.parametrize( + 'test_case, expected_exit_code', ( + ('default', None), + ('failed_interrupted', PwCalculation.exit_codes.ERROR_SCHEDULER_OUT_OF_WALLTIME), + ) +) +def test_pw_failed_interrupted_scheduler( + fixture_localhost, generate_calc_job_node, generate_parser, 
generate_inputs, test_case, expected_exit_code +): + """Test that an exit code set by the scheduler is not overridden unless a more specific error is parsed. + + The test is run twice, once for the ``default`` test case and once for ``failed_interrupted``, which correspond to a + successful run and a run that got interrupted (usually due to scheduler killing the job). Before calling the parser + the ``ERROR_SCHEDULER_OUT_OF_WALLTIME`` is set on the node. In the case of the ``default`` test case, this should be + ignored and the parser should return ``ExitCode(0)``. For the interrupted case, the exit code of the scheduler + should not be overridden so the parser should return ``ERROR_SCHEDULER_OUT_OF_WALLTIME``. + """ + entry_point_calc_job = 'quantumespresso.pw' + entry_point_parser = 'quantumespresso.pw' + + # Generate the node and set an exit status as if it had been set by the scheduler parser + node = generate_calc_job_node(entry_point_calc_job, fixture_localhost, test_case, generate_inputs()) + node.set_exit_status(PwCalculation.exit_codes.ERROR_SCHEDULER_OUT_OF_WALLTIME.status) + + parser = generate_parser(entry_point_parser) + _, calcfunction = parser.parse_from_node(node, store_provenance=False) + + if expected_exit_code is None: + assert calcfunction.is_finished_ok, (calcfunction.exit_status, calcfunction.exception) + else: + assert not calcfunction.is_finished_ok, calcfunction.exception + assert calcfunction.exit_status == expected_exit_code.status + + @pytest.mark.parametrize('filename', ('', '_stdout')) def test_pw_npools_too_high_error( fixture_localhost, generate_calc_job_node, generate_parser, generate_inputs, filename