Skip to content

Commit

Permalink
PwBaseWorkChain: fix bug in validate_resources validator
Browse files Browse the repository at this point in the history
The `validate_resources` outline step would check that the
`metadata.options` namespace included at least the input
`resources.num_machines` and `max_wallclock_seconds` as long as the
automatic parallelization feature was not enabled. This was added when
the workchain was first developed, because the scheduler resources were
not validated on the base `CalcJob` in `aiida-core`.

However, since `aiida-core==1.0.0` the scheduler resources are validated
on the `CalcJob` base class. This means that a `PwBaseWorkChain` will
now not even be instantiated if the resources for the `PwCalculation`
are insufficient, so it is better to leave the validation up to the base
class.

What is worse, the logic in `validate_resources` is actually incorrect
as it assumed that the resources always need to include `num_machines`.
However, this is not the case as it is actually `Scheduler`
implementation specific, and for example for the `SgeScheduler` this
field is even prohibited and will raise when specified. This made that
the `PwBaseWorkChain` was fundamentally incompatible with those
schedulers before this fix.

The `PhBaseWorkChain` suffered from the same bug and has also been
fixed.
  • Loading branch information
sphuber committed May 6, 2021
1 parent bedbd57 commit ec6b189
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 40 deletions.
25 changes: 8 additions & 17 deletions aiida_quantumespresso/workflows/ph/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ def define(cls, spec):
spec.outline(
cls.setup,
cls.validate_parameters,
cls.validate_resources,
while_(cls.should_run_process)(
cls.prepare_process,
cls.run_process,
Expand All @@ -40,7 +39,8 @@ def define(cls, spec):
)
spec.expose_outputs(PwCalculation, exclude=('retrieved_folder',))
spec.exit_code(204, 'ERROR_INVALID_INPUT_RESOURCES_UNDERSPECIFIED',
message='The `metadata.options` did not specify both `resources.num_machines` and `max_wallclock_seconds`.')
message='The `metadata.options` did not specify both `resources.num_machines` and `max_wallclock_seconds`. '
'This exit status has been deprecated as the check it corresponded to was incorrect.')
spec.exit_code(300, 'ERROR_UNRECOVERABLE_FAILURE',
message='The calculation failed with an unrecoverable error.')
# yapf: enable
Expand All @@ -66,21 +66,7 @@ def validate_parameters(self):
if self.inputs.only_initialization.value:
self.ctx.inputs.settings['ONLY_INITIALIZATION'] = True

def validate_resources(self):
"""Validate the inputs related to the resources.
The `metadata.options` should at least contain the options `resources` and `max_wallclock_seconds`, where
`resources` should define the `num_machines`.
"""
num_machines = self.ctx.inputs.metadata.options.get('resources', {}).get('num_machines', None)
max_wallclock_seconds = self.ctx.inputs.metadata.options.get('max_wallclock_seconds', None)

if num_machines is None or max_wallclock_seconds is None:
return self.exit_codes.ERROR_INVALID_INPUT_RESOURCES_UNDERSPECIFIED

self.set_max_seconds(max_wallclock_seconds)

def set_max_seconds(self, max_wallclock_seconds):
def set_max_seconds(self, max_wallclock_seconds: None):
"""Set the `max_seconds` to a fraction of `max_wallclock_seconds` option to prevent out-of-walltime problems.
:param max_wallclock_seconds: the maximum wallclock time that will be set in the scheduler settings.
Expand All @@ -95,6 +81,11 @@ def prepare_process(self):
If a `restart_calc` has been set in the context, its `remote_folder` will be used as the `parent_folder` input
for the next calculation and the `restart_mode` is set to `restart`.
"""
max_seconds = self.ctx.inputs.metadata.options.get('max_wallclock_seconds', None)

if max_seconds is not None:
self.set_max_seconds(max_seconds)

if self.ctx.restart_calc:
self.ctx.inputs.parameters['INPUTPH']['recover'] = True
self.ctx.inputs.parent_folder = self.ctx.restart_calc.outputs.remote_folder
Expand Down
32 changes: 9 additions & 23 deletions aiida_quantumespresso/workflows/pw/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ def define(cls, spec):
cls.validate_parameters,
cls.validate_kpoints,
cls.validate_pseudos,
cls.validate_resources,
if_(cls.should_run_init)(
cls.validate_init_inputs,
cls.run_init,
Expand All @@ -99,9 +98,11 @@ def define(cls, spec):
spec.exit_code(202, 'ERROR_INVALID_INPUT_KPOINTS',
message='Neither the `kpoints` nor the `kpoints_distance` input was specified.')
spec.exit_code(203, 'ERROR_INVALID_INPUT_RESOURCES',
message='Neither the `options` nor `automatic_parallelization` input was specified.')
message='Neither the `options` nor `automatic_parallelization` input was specified. '
'This exit status has been deprecated as the check it corresponded to was incorrect.')
spec.exit_code(204, 'ERROR_INVALID_INPUT_RESOURCES_UNDERSPECIFIED',
message='The `metadata.options` did not specify both `resources.num_machines` and `max_wallclock_seconds`.')
message='The `metadata.options` did not specify both `resources.num_machines` and `max_wallclock_seconds`. '
'This exit status has been deprecated as the check it corresponded to was incorrect.')
spec.exit_code(210, 'ERROR_INVALID_INPUT_AUTOMATIC_PARALLELIZATION_MISSING_KEY',
message='Required key for `automatic_parallelization` was not specified.')
spec.exit_code(211, 'ERROR_INVALID_INPUT_AUTOMATIC_PARALLELIZATION_UNRECOGNIZED_KEY',
Expand Down Expand Up @@ -282,26 +283,6 @@ def validate_pseudos(self):
self.report(f'{exception}')
return self.exit_codes.ERROR_INVALID_INPUT_PSEUDO_POTENTIALS

def validate_resources(self):
"""Validate the inputs related to the resources.
One can omit the normally required `options.resources` input for the `PwCalculation`, as long as the input
`automatic_parallelization` is specified. If this is not the case, the `metadata.options` should at least
contain the options `resources` and `max_wallclock_seconds`, where `resources` should define the `num_machines`.
"""
if 'automatic_parallelization' not in self.inputs and 'options' not in self.ctx.inputs.metadata:
return self.exit_codes.ERROR_INVALID_INPUT_RESOURCES

# If automatic parallelization is not enabled, we better make sure that the options satisfy minimum requirements
if 'automatic_parallelization' not in self.inputs:
num_machines = self.ctx.inputs.metadata.options.get('resources', {}).get('num_machines', None)
max_wallclock_seconds = self.ctx.inputs.metadata.options.get('max_wallclock_seconds', None)

if num_machines is None or max_wallclock_seconds is None:
return self.exit_codes.ERROR_INVALID_INPUT_RESOURCES_UNDERSPECIFIED

self.set_max_seconds(max_wallclock_seconds)

def set_max_seconds(self, max_wallclock_seconds):
"""Set the `max_seconds` to a fraction of `max_wallclock_seconds` option to prevent out-of-walltime problems.
Expand Down Expand Up @@ -419,6 +400,11 @@ def prepare_process(self):
for the next calculation and the `restart_mode` is set to `restart`. Otherwise, no `parent_folder` is used and
`restart_mode` is set to `from_scratch`.
"""
max_seconds = self.ctx.inputs.metadata.options.get('max_wallclock_seconds', None)

if max_seconds is not None:
self.set_max_seconds(max_seconds)

if self.ctx.restart_calc:
self.ctx.inputs.parameters['CONTROL']['restart_mode'] = 'restart'
self.ctx.inputs.parent_folder = self.ctx.restart_calc.outputs.remote_folder
Expand Down

0 comments on commit ec6b189

Please sign in to comment.