PpCalculation: add support for retrieving and parsing multiple files

The implementation of the `PpCalculation` and `PpParser` assumed that the code would always produce only one output file with pre-processed data and one output file with the data formatted in a custom plot format. However, for certain input parameter combinations, such as `INPUTPP.plot_num = 7` where more than a single band is requested, a pair of output files is produced for each band. The filename of the data file has `INPUTPP.filplot` as a prefix, with some kind of suffix to distinguish it from the others. The corresponding plot file will use that filename as a prefix with the `PLOT.fileout` value as a suffix. For example, for the inputs `INPUTPP.filplot = 'aiida.filplot'` and `PLOT.fileout = 'aiida.fileout'`, the data files will be named like `aiida.filplot_K1_B1` and the plot files will be named like `aiida.filplot_K1_B1aiida.fileout`. To support this use case, the `PpCalculation` is updated to not just retrieve a single file, but to also add a directive to the `retrieve_list` or `retrieve_temporary_list` that contains the corresponding globbing pattern. The `PpParser` then simply loops over the content of the retrieved (temporary) folder and parses each file whose filename matches the pattern described above. We assume that if there is more than one file, they all have the exact same format and so can be parsed with the same logic. Since there is now potentially more than one parsed output `ArrayData` node, the `output_data` port, which is not a namespace, cannot be used. To keep backwards compatibility, we add the `output_data_multiple` namespace, which is used if more than one output plot file is parsed.
aiidateam · Jun 27, 2020 · efd41ad · efd41ad
1 parent 4f75b6f
commit efd41ad
Show file tree

Hide file tree

Showing 8 changed files with 244 additions and 68 deletions.
diff --git a/aiida_quantumespresso/calculations/pp.py b/aiida_quantumespresso/calculations/pp.py
@@ -83,6 +83,7 @@ def define(cls, spec):
 
  spec.output('output_parameters', valid_type=orm.Dict)
  spec.output('output_data', valid_type=orm.ArrayData)
+ spec.output_namespace('output_data_multiple', valid_type=orm.ArrayData, dynamic=True)
  spec.default_output_node = 'output_parameters'
 
  # Standard exceptions
@@ -106,11 +107,13 @@ def define(cls, spec):
 
  # Output datafile related exceptions
  spec.exit_code(330, 'ERROR_OUTPUT_DATAFILE_MISSING',
- message='The retrieved folder did not contain the required formatted data output file.')
+ message='The formatted data output file `{filename}` was not present in the retrieved (temporary) folder.')
  spec.exit_code(331, 'ERROR_OUTPUT_DATAFILE_READ',
- message='The formatted data output file could not be read.')
+ message='The formatted data output file `{filename}` could not be read.')
  spec.exit_code(332, 'ERROR_UNSUPPORTED_DATAFILE_FORMAT',
  message='The data file format is not supported by the parser')
+ spec.exit_code(333, 'ERROR_OUTPUT_DATAFILE_PARSE',
+ message='The formatted data output file `{filename}` could not be parsed')
 
  def prepare_for_submission(self, folder): # pylint: disable=too-many-branches,too-many-statements
  """Prepare the calculation job for submission by transforming input nodes into input files.
@@ -210,13 +213,22 @@ def prepare_for_submission(self, folder): # pylint: disable=too-many-branches,t
  calcinfo.local_copy_list = local_copy_list
  calcinfo.remote_copy_list = remote_copy_list
 
- # Retrieve by default the output file and plot file
- calcinfo.retrieve_list = []
+ # Retrieve by default the output file
+ calcinfo.retrieve_list = [self.inputs.metadata.options.output_filename]
  calcinfo.retrieve_temporary_list = []
- calcinfo.retrieve_list.append(self.inputs.metadata.options.output_filename)
+
+ # Depending on the `plot_num` and the corresponding parameters, more than one pair of `filplot` + `fileout`
+ # files may be written. In that case, the data files will have `filplot` as a prefix with some suffix to
+ # distinguish them from one another. The `fileout` filename will be the full data filename with the `fileout`
+ # value as a suffix.
+ retrieve_tuples = [
+ self._FILEOUT,
+ ('{}_*{}'.format(self._FILPLOT, self._FILEOUT), '.', 0)
+ ]
+
  if self.inputs.metadata.options.keep_plot_file:
- calcinfo.retrieve_list.append(self._FILEOUT)
+ calcinfo.retrieve_list.extend(retrieve_tuples)
  else:
- calcinfo.retrieve_temporary_list.append(self._FILEOUT)
+ calcinfo.retrieve_temporary_list.extend(retrieve_tuples)
 
  return calcinfo
diff --git a/aiida_quantumespresso/parsers/pp.py b/aiida_quantumespresso/parsers/pp.py
@@ -1,7 +1,8 @@
 # -*- coding: utf-8 -*-
 """`Parser` implementation for the `PpCalculation` calculation job class."""
+import os
+import re
 import traceback
-import os.path
 
 import numpy as np
 
@@ -45,24 +46,22 @@ def parse(self, **kwargs):
  """
  Parse raw files retrieved from remote dir
  """
-
- temp_folder_path = None
-
- # A retrieved folded is required
  try:
  self.retrieved
  except exceptions.NotExistent:
  return self.exit_codes.ERROR_NO_RETRIEVED_FOLDER
 
+ retrieve_temporary_list = self.node.get_attribute('retrieve_temporary_list', None)
+ filename_stdout = self.node.get_option('output_filename')
+
  # If temporary files were specified, check that we have them
- if self.node.get_attribute('retrieve_temporary_list', None):
+ if retrieve_temporary_list:
  try:
- temp_folder_path = kwargs['retrieved_temporary_folder']
+ retrieved_temporary_folder = kwargs['retrieved_temporary_folder']
  except KeyError:
  return self.exit(self.exit_codes.ERROR_NO_RETRIEVED_TEMPORARY_FOLDER)
 
  # The stdout is required for parsing
- filename_stdout = self.node.get_attribute('output_filename')
  if filename_stdout not in self.retrieved.list_object_names():
  return self.exit_codes.ERROR_OUTPUT_STDOUT_MISSING
 
@@ -71,34 +70,48 @@ def parse(self, **kwargs):
  except (IOError, OSError):
  return self.exit_codes.ERROR_OUTPUT_STDOUT_READ
 
- # The post-processed data should have been written to file, either in the retrieved or temp list
- filename_data = PpCalculation._FILEOUT
- if filename_data in self.retrieved.list_object_names(): # Retrieved list case
- try:
- data_raw = self.retrieved.get_object_content(filename_data)
- except (IOError, OSError):
- return self.exit_codes.ERROR_DATAFILE_READ
- elif temp_folder_path is not None: # Temp list case
- data_file_path = os.path.join(temp_folder_path, filename_data)
- if os.path.isfile(data_file_path):
- try:
- with open(data_file_path, 'r') as fhandle:
- data_raw = fhandle.read()
- except (IOError, OSError):
- return self.exit_codes.ERROR_DATAFILE_READ
- else:
- return self.exit_codes.ERROR_OUTPUT_DATAFILE_MISSING
+ data_raw = []
+
+ # Currently all plot output files should start with the `filplot` as prefix. If only one file was produced the
+ # prefix is the entire filename, but in the case of multiple files, there will be pairs of two files where the
+ # first has the format '{filename_prefix}_{some_random_suffix}' and the second has the same name but with the
+ # `filename_suffix` appended.
+ filename_prefix = PpCalculation._FILPLOT
+ filename_suffix = PpCalculation._FILEOUT
+
+ # How to get the output filenames and how to open them, depends on whether they will have been retrieved in the
+ # `retrieved` output node, or in the `retrieved_temporary_folder`. Instead of having a conditional with almost
+ # the same loop logic in each branch, we apply a somewhat dirty trick to define an `opener` which is a callable
+ # that will open a handle to the output file given a certain filename. This works since it is guaranteed that
+ # these output files (excluding the standard output) will all either be in the retrieved, or in the retrieved
+ # temporary folder.
+ if retrieve_temporary_list:
+ filenames = os.listdir(retrieved_temporary_folder)
+ file_opener = lambda filename: open(os.path.join(retrieved_temporary_folder, filename))
  else:
- return self.exit_codes.ERROR_OUTPUT_DATAFILE_MISSING
+ filenames = self.retrieved.list_object_names()
+ file_opener = self.retrieved.open
+
+ for filename in filenames:
+ if filename.endswith(filename_suffix):
+ try:
+ with file_opener(filename) as handle:
+ data_raw.append((filename, handle.read()))
+ except OSError:
+ return self.exit_codes.ERROR_OUTPUT_DATAFILE_READ.format(filename=filename)
+
+ # If we don't have any parsed files, we exit. Note that this will not catch the case where there should be more
+ # than one file, but the engine did not retrieve all of them. Since often we anyway don't know how many files
+ # should be retrieved there really is no way to check this explicitly.
+ if not data_raw:
+ return self.exit_codes.ERROR_OUTPUT_DATAFILE_MISSING.format(filename=filename_prefix)
 
- # Parse stdout
  try:
  logs, self.output_parameters = self.parse_stdout(stdout_raw)
  except Exception:
  self.logger.error(traceback.format_exc())
  return self.exit_codes.ERROR_UNEXPECTED_PARSER_EXCEPTION
 
- # Print the logs
  self.emit_logs(logs)
 
  # Scan logs for known errors
@@ -107,21 +120,43 @@ def parse(self, **kwargs):
  if 'ERROR_OUTPUT_STDOUT_INCOMPLETE' in logs['error']:
  return self.exit_codes.ERROR_OUTPUT_STDOUT_INCOMPLETE
 
- # Parse the post-processed-data according to what kind of data file was produced
- if self.output_parameters['output_format'] == 'gnuplot':
- if self.output_parameters['plot_type'] == '2D polar on a sphere':
- parsed_data = self.parse_gnuplot_polar(data_raw)
- else:
- parsed_data = self.parse_gnuplot1D(data_raw)
- elif self.output_parameters['output_format'] == 'gnuplot x,y,f':
- parsed_data = self.parse_gnuplot2D(data_raw)
- elif self.output_parameters['output_format'] == 'Gaussian cube':
- parsed_data = self.parse_gaussian(data_raw)
- else:
+ # The following check should in principle always succeed since the iflag should in principle be set by the
+ # `PpCalculation` plugin which only ever sets 0 - 4, but we check in order for the code not to except.
+ iflag = self.node.inputs.parameters.get_attribute('PLOT')['iflag']
+ if iflag not in range(5):
  return self.exit_codes.ERROR_UNSUPPORTED_DATAFILE_FORMAT
 
+ data_parsed = []
+ parsers = {
+ 0: self.parse_gnuplot1D,
+ 1: self.parse_gnuplot1D,
+ 2: self.parse_gnuplot2D,
+ 3: self.parse_gaussian,
+ 4: self.parse_gnuplot_polar,
+ }
+
+ def get_key_from_filename(filename):
+ """Determine the output link label for the output file with the given filename."""
+ if filename == filename_suffix:
+ return filename
+
+ pattern = r'{}_(.*){}'.format(filename_prefix, filename_suffix)
+ matches = re.search(pattern, filename)
+ return matches.group(1)
+
+ for filename, data in data_raw:
+ try:
+ key = get_key_from_filename(filename)
+ data_parsed.append((key, parsers[iflag](data)))
+ except Exception: # pylint: disable=broad-except
+ return self.exit_codes.ERROR_OUTPUT_DATAFILE_PARSE.format(filename=filename)
+
  # Create output nodes
- self.out('output_data', parsed_data)
+ if len(data_parsed) == 1:
+ self.out('output_data', data_parsed[0][1])
+ else:
+ self.out('output_data_multiple', dict(data_parsed))
+
  self.out('output_parameters', orm.Dict(dict=self.output_parameters))
 
  def parse_stdout(self, stdout_str):
@@ -199,8 +234,7 @@ def detect_important_message(logs, line):
  return logs, output_dict
 
  def parse_gnuplot1D(self, data_file_str):
- """
- Parse 1D GNUPlot formatted output
+ """Parse 1D GNUPlot formatted output.
 
  :param data_file_str: the data file read in as a single string
  """
@@ -247,11 +281,10 @@ def parse_gnuplot1D(self, data_file_str):
  return arraydata
 
  def parse_gnuplot_polar(self, data_file_str):
- """
- Parse 2D Polar GNUPlot formatted, single column output
+ """Parse 2D Polar GNUPlot formatted, single column output.
 
-  :param data_file_str: the data file read in as a single string
-  """
+ :param data_file_str: the data file read in as a single string
+ """
  data_lines = data_file_str.splitlines()
  data_lines.pop(0) # First line is a header
 
@@ -267,8 +300,7 @@ def parse_gnuplot_polar(self, data_file_str):
  return arraydata
 
  def parse_gnuplot2D(self, data_file_str):
- """
- Parse 2D GNUPlot formatted output
+ """Parse 2D GNUPlot formatted output.
 
  :param data_file_str: the data file read in as a single string
  """
@@ -297,18 +329,15 @@ def parse_gnuplot2D(self, data_file_str):
  return arraydata
 
  def parse_gaussian(self, data_file_str):
- """
- Parse Gaussian Cube formatted output
+ """Parse Gaussian Cube formatted output.
 
  :param data_file_str: the data file read in as a single string
  """
-
  lines = data_file_str.splitlines()
 
  atoms_line = lines[2].split()
  atoms = int(atoms_line[0]) # The number of atoms listed in the file
- header = lines[:6 + atoms
- ] # The header of the file: comments, the voxel, and the number of atoms and datapoints
+ header = lines[:6 + atoms] # Header of the file: comments, the voxel, and the number of atoms and datapoints
  data_lines = lines[6 + atoms:] # The actual data: atoms and volumetric data
 
  # Parse the declared dimensions of the volumetric data

diff --git a/tests/calculations/test_pp.py b/tests/calculations/test_pp.py
@@ -40,14 +40,16 @@ def test_pp_default(aiida_profile, fixture_sandbox, generate_calc_job, generate_
  calc_info = generate_calc_job(fixture_sandbox, entry_point_name, inputs)
 
  retrieve_list = ['aiida.out']
- retrieve_temporary_list = ['aiida.fileout']
+ retrieve_temporary_list = ['aiida.fileout', ('aiida.filplot_*aiida.fileout', '.', 0)]
  local_copy_list = []
 
  # Check the attributes of the returned `CalcInfo`
  assert isinstance(calc_info, datastructures.CalcInfo)
  assert sorted(calc_info.local_copy_list) == sorted(local_copy_list)
  assert sorted(calc_info.retrieve_list) == sorted(retrieve_list)
- assert sorted(calc_info.retrieve_temporary_list) == sorted(retrieve_temporary_list)
+ assert len(calc_info.retrieve_temporary_list) == 2
+ for element in retrieve_temporary_list:
+ assert element in calc_info.retrieve_temporary_list
 
  with fixture_sandbox.open('aiida.in') as handle:
  input_written = handle.read()
@@ -64,15 +66,17 @@ def test_pp_keep_plot_file(aiida_profile, fixture_sandbox, generate_calc_job, ge
  inputs.metadata.options.keep_plot_file = True
 
  calc_info = generate_calc_job(fixture_sandbox, entry_point_name, inputs)
- retrieve_list = ['aiida.out', 'aiida.fileout']
+ retrieve_list = ['aiida.out', 'aiida.fileout', ('aiida.filplot_*aiida.fileout', '.', 0)]
  retrieve_temporary_list = []
  local_copy_list = []
 
  # Check the attributes of the returned `CalcInfo`, no need to check the input file as it is not affected
  assert isinstance(calc_info, datastructures.CalcInfo)
  assert sorted(calc_info.local_copy_list) == sorted(local_copy_list)
- assert sorted(calc_info.retrieve_list) == sorted(retrieve_list)
  assert sorted(calc_info.retrieve_temporary_list) == sorted(retrieve_temporary_list)
+ assert len(calc_info.retrieve_list) == 3
+ for element in retrieve_list:
+ assert element in calc_info.retrieve_list
 
 
 @pytest.mark.parametrize(

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,9 +1,11 @@
 # -*- coding: utf-8 -*-
 # pylint: disable=redefined-outer-name,too-many-statements
 """Initialise a text database and profile for pytest."""
+import collections
 import io
 import os
-import collections
+import shutil
+
 import pytest
 
 pytest_plugins = ['aiida.manage.tests.pytest_fixtures'] # pylint: disable=invalid-name
@@ -94,15 +96,20 @@ def flatten_inputs(inputs, prefix=''):
  flat_inputs.append((prefix + key, value))
  return flat_inputs
 
- def _generate_calc_job_node(entry_point_name='base', computer=None, test_name=None, inputs=None, attributes=None):
+ def _generate_calc_job_node(
+ entry_point_name='base', computer=None, test_name=None, inputs=None, attributes=None, retrieve_temporary=None
+ ):
  """Fixture to generate a mock `CalcJobNode` for testing parsers.
 
  :param entry_point_name: entry point name of the calculation class
  :param computer: a `Computer` instance
  :param test_name: relative path of directory with test output files in the `fixtures/{entry_point_name}` folder.
  :param inputs: any optional nodes to add as input links to the corrent CalcJobNode
  :param attributes: any optional attributes to set on the node
- :return: `CalcJobNode` instance with an attached `FolderData` as the `retrieved` node
+ :param retrieve_temporary: optional tuple of an absolute filepath of a temporary directory and a list of
+ filenames that should be written to this directory, which will serve as the `retrieved_temporary_folder`.
+ For now this only works with top-level files and does not support files nested in directories.
+ :return: `CalcJobNode` instance with an attached `FolderData` as the `retrieved` node.
  """
  from aiida import orm
  from aiida.common import LinkType
@@ -155,9 +162,20 @@ def _generate_calc_job_node(entry_point_name='base', computer=None, test_name=No
 
  node.store()
 
+ if retrieve_temporary:
+ dirpath, filenames = retrieve_temporary
+ for filename in filenames:
+ shutil.copy(os.path.join(filepath_folder, filename), os.path.join(dirpath, filename))
+
  if filepath_folder:
  retrieved = orm.FolderData()
  retrieved.put_object_from_tree(filepath_folder)
+
+ # Remove files that are supposed to be only present in the retrieved temporary folder
+ if retrieve_temporary:
+ for filename in filenames:
+ retrieved.delete_object(filename)
+
  retrieved.add_incoming(node, link_type=LinkType.CREATE, link_label='retrieved')
  retrieved.store()
 

diff --git a/tests/parsers/fixtures/pp/default_3d_multiple/aiida.filplot_K001_B001aiida.fileout b/tests/parsers/fixtures/pp/default_3d_multiple/aiida.filplot_K001_B001aiida.fileout
@@ -0,0 +1,8 @@
+ Cubefile created from PWScf calculation
+ k_point 1, band 1
+ 1 0.000000 0.000000 0.000000
+ 1 0.000000 0.141231 0.141231
+ 1 0.141231 0.000000 0.141231
+ 1 0.141231 0.141231 0.000000
+ 14 14.000000 10.168616 10.168616 10.168616
+ 0.16567E-01 0.14498E-01 0.96968E-02
diff --git a/tests/parsers/fixtures/pp/default_3d_multiple/aiida.filplot_K001_B002aiida.fileout b/tests/parsers/fixtures/pp/default_3d_multiple/aiida.filplot_K001_B002aiida.fileout
@@ -0,0 +1,8 @@
+ Cubefile created from PWScf calculation
+ k_point 1, band 2
+ 1 0.000000 0.000000 0.000000
+ 1 0.000000 0.141231 0.141231
+ 1 0.141231 0.000000 0.141231
+ 1 0.141231 0.141231 0.000000
+ 14 14.000000 2.542154 2.542154 2.542154
+ 0.22404E-01 0.19330E-01 0.12458E-01