Skip to content

Commit

Permalink
PpCalculation: add support for retrieving and parsing multiple files
Browse files Browse the repository at this point in the history
The implementation of the `PpCalculation` and `PpParser` assumed that
the code would always only produce one output file with pre-processed
data and one output file with the data formatted in a custom plot
format. However, for certain input parameter combinations, such as
`INPUTPP.plot_num = 7` where more than a single band is requested, a
pair of output files is produced for each band. The filename of the data
file has `INPUTPP.filplot` as a prefix, with some kind of suffix to
distinguish it from the others. The corresponding plot file will use
that filename as a prefix with the `PLOT.fileout` value as a suffix.
For example, for the inputs:

    INPUTPP
        filplot = 'aiida.filplot'
    PLOT
        fileout = 'aiida.fileout'

The data files will be named `aiida.filplot_K1_B1` and the plot files
are formatted as `aiida.filplot_K1_B1aiida.fileout`.

To support this use case, the `PpCalculation` is updated to not just
retrieve a single file, but add a directive to the `retrieve_list` or
`retrieve_temporary_list` that contains the corresponding globbing
pattern. The `PpParser` then simply loops over the content of the
retrieved (temporary) folder and parses each file whose filename matches
the pattern described above. We assume that if there is more than one
file, they all have the exact same format and so can be parsed with the
same logic.

Since there is now potentially more than one parsed output `ArrayData`
node, the `output_data` port, which is not a namespace, cannot be used.
To keep backwards compatibility, we add the `output_data_multiple`
namespace, which is used if more than one output plot file is parsed.
  • Loading branch information
sphuber committed Jun 27, 2020
1 parent 4f75b6f commit efd41ad
Show file tree
Hide file tree
Showing 8 changed files with 244 additions and 68 deletions.
26 changes: 19 additions & 7 deletions aiida_quantumespresso/calculations/pp.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def define(cls, spec):

spec.output('output_parameters', valid_type=orm.Dict)
spec.output('output_data', valid_type=orm.ArrayData)
spec.output_namespace('output_data_multiple', valid_type=orm.ArrayData, dynamic=True)
spec.default_output_node = 'output_parameters'

# Standard exceptions
Expand All @@ -106,11 +107,13 @@ def define(cls, spec):

# Output datafile related exceptions
spec.exit_code(330, 'ERROR_OUTPUT_DATAFILE_MISSING',
message='The retrieved folder did not contain the required formatted data output file.')
message='The formatted data output file `{filename}` was not present in the retrieved (temporary) folder.')
spec.exit_code(331, 'ERROR_OUTPUT_DATAFILE_READ',
message='The formatted data output file could not be read.')
message='The formatted data output file `{filename}` could not be read.')
spec.exit_code(332, 'ERROR_UNSUPPORTED_DATAFILE_FORMAT',
message='The data file format is not supported by the parser')
spec.exit_code(333, 'ERROR_OUTPUT_DATAFILE_PARSE',
message='The formatted data output file `{filename}` could not be parsed')

def prepare_for_submission(self, folder): # pylint: disable=too-many-branches,too-many-statements
"""Prepare the calculation job for submission by transforming input nodes into input files.
Expand Down Expand Up @@ -210,13 +213,22 @@ def prepare_for_submission(self, folder): # pylint: disable=too-many-branches,t
calcinfo.local_copy_list = local_copy_list
calcinfo.remote_copy_list = remote_copy_list

# Retrieve by default the output file and plot file
calcinfo.retrieve_list = []
# Retrieve by default the output file
calcinfo.retrieve_list = [self.inputs.metadata.options.output_filename]
calcinfo.retrieve_temporary_list = []
calcinfo.retrieve_list.append(self.inputs.metadata.options.output_filename)

# Depending on the `plot_num` and the corresponding parameters, more than one pair of `filplot` + `fileout`
# files may be written. In that case, the data files will have `filplot` as a prefix with some suffix to
# distinguish them from one another. The `fileout` filename will be the full data filename with the `fileout`
# value as a suffix.
retrieve_tuples = [
self._FILEOUT,
('{}_*{}'.format(self._FILPLOT, self._FILEOUT), '.', 0)
]

if self.inputs.metadata.options.keep_plot_file:
calcinfo.retrieve_list.append(self._FILEOUT)
calcinfo.retrieve_list.extend(retrieve_tuples)
else:
calcinfo.retrieve_temporary_list.append(self._FILEOUT)
calcinfo.retrieve_temporary_list.extend(retrieve_tuples)

return calcinfo
135 changes: 82 additions & 53 deletions aiida_quantumespresso/parsers/pp.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# -*- coding: utf-8 -*-
"""`Parser` implementation for the `PpCalculation` calculation job class."""
import os
import re
import traceback
import os.path

import numpy as np

Expand Down Expand Up @@ -45,24 +46,22 @@ def parse(self, **kwargs):
"""
Parse raw files retrieved from remote dir
"""

temp_folder_path = None

# A retrieved folder is required
try:
self.retrieved
except exceptions.NotExistent:
return self.exit_codes.ERROR_NO_RETRIEVED_FOLDER

retrieve_temporary_list = self.node.get_attribute('retrieve_temporary_list', None)
filename_stdout = self.node.get_option('output_filename')

# If temporary files were specified, check that we have them
if self.node.get_attribute('retrieve_temporary_list', None):
if retrieve_temporary_list:
try:
temp_folder_path = kwargs['retrieved_temporary_folder']
retrieved_temporary_folder = kwargs['retrieved_temporary_folder']
except KeyError:
return self.exit(self.exit_codes.ERROR_NO_RETRIEVED_TEMPORARY_FOLDER)

# The stdout is required for parsing
filename_stdout = self.node.get_attribute('output_filename')
if filename_stdout not in self.retrieved.list_object_names():
return self.exit_codes.ERROR_OUTPUT_STDOUT_MISSING

Expand All @@ -71,34 +70,48 @@ def parse(self, **kwargs):
except (IOError, OSError):
return self.exit_codes.ERROR_OUTPUT_STDOUT_READ

# The post-processed data should have been written to file, either in the retrieved or temp list
filename_data = PpCalculation._FILEOUT
if filename_data in self.retrieved.list_object_names(): # Retrieved list case
try:
data_raw = self.retrieved.get_object_content(filename_data)
except (IOError, OSError):
return self.exit_codes.ERROR_DATAFILE_READ
elif temp_folder_path is not None: # Temp list case
data_file_path = os.path.join(temp_folder_path, filename_data)
if os.path.isfile(data_file_path):
try:
with open(data_file_path, 'r') as fhandle:
data_raw = fhandle.read()
except (IOError, OSError):
return self.exit_codes.ERROR_DATAFILE_READ
else:
return self.exit_codes.ERROR_OUTPUT_DATAFILE_MISSING
data_raw = []

# Currently all plot output files should start with the `filplot` as prefix. If only one file was produced the
# prefix is the entire filename, but in the case of multiple files, there will be pairs of two files where the
# first has the format '{filename_prefix}.{some_random_suffix}' and the second has the same name but with the
# `filename_suffix` appended.
filename_prefix = PpCalculation._FILPLOT
filename_suffix = PpCalculation._FILEOUT

# How to get the output filenames and how to open them, depends on whether they will have been retrieved in the
# `retrieved` output node, or in the `retrieved_temporary_folder`. Instead of having a conditional with almost
# the same loop logic in each branch, we apply a somewhat dirty trick to define an `opener` which is a callable
# that will open a handle to the output file given a certain filename. This works since it is guaranteed that
# these output files (excluding the standard output) will all either be in the retrieved, or in the retrieved
# temporary folder.
if retrieve_temporary_list:
filenames = os.listdir(retrieved_temporary_folder)
file_opener = lambda filename: open(os.path.join(retrieved_temporary_folder, filename))
else:
return self.exit_codes.ERROR_OUTPUT_DATAFILE_MISSING
filenames = self.retrieved.list_object_names()
file_opener = self.retrieved.open

for filename in filenames:
if filename.endswith(filename_suffix):
try:
with file_opener(filename) as handle:
data_raw.append((filename, handle.read()))
except OSError:
return self.exit_codes.ERROR_OUTPUT_DATAFILE_READ.format(filename=filename)

# If we don't have any parsed files, we exit. Note that this will not catch the case where there should be more
# than one file, but the engine did not retrieve all of them. Since often we anyway don't know how many files
# should be retrieved there really is no way to check this explicitly.
if not data_raw:
return self.exit_codes.ERROR_OUTPUT_DATAFILE_MISSING.format(filename=filename_prefix)

# Parse stdout
try:
logs, self.output_parameters = self.parse_stdout(stdout_raw)
except Exception:
self.logger.error(traceback.format_exc())
return self.exit_codes.ERROR_UNEXPECTED_PARSER_EXCEPTION

# Print the logs
self.emit_logs(logs)

# Scan logs for known errors
Expand All @@ -107,21 +120,43 @@ def parse(self, **kwargs):
if 'ERROR_OUTPUT_STDOUT_INCOMPLETE' in logs['error']:
return self.exit_codes.ERROR_OUTPUT_STDOUT_INCOMPLETE

# Parse the post-processed-data according to what kind of data file was produced
if self.output_parameters['output_format'] == 'gnuplot':
if self.output_parameters['plot_type'] == '2D polar on a sphere':
parsed_data = self.parse_gnuplot_polar(data_raw)
else:
parsed_data = self.parse_gnuplot1D(data_raw)
elif self.output_parameters['output_format'] == 'gnuplot x,y,f':
parsed_data = self.parse_gnuplot2D(data_raw)
elif self.output_parameters['output_format'] == 'Gaussian cube':
parsed_data = self.parse_gaussian(data_raw)
else:
# The following check should in principle always succeed since the iflag should in principle be set by the
# `PpCalculation` plugin which only ever sets 0 - 4, but we check in order for the code not to except.
iflag = self.node.inputs.parameters.get_attribute('PLOT')['iflag']
if iflag not in range(5):
return self.exit_codes.ERROR_UNSUPPORTED_DATAFILE_FORMAT

data_parsed = []
parsers = {
0: self.parse_gnuplot1D,
1: self.parse_gnuplot1D,
2: self.parse_gnuplot2D,
3: self.parse_gaussian,
4: self.parse_gnuplot_polar,
}

def get_key_from_filename(filename):
    """Determine the output link label for the output file with the given filename.

    :param filename: name of a retrieved plot file.
    :return: the filename itself if it is exactly `filename_suffix`, otherwise the part of the filename that sits
        between `'{filename_prefix}_'` and `filename_suffix`.
    :raises ValueError: if the filename does not contain the `{filename_prefix}_*{filename_suffix}` pattern.
    """
    if filename == filename_suffix:
        return filename

    # The prefix and suffix are literal filenames and may contain regex metacharacters such as `.`, so they are
    # escaped to make sure they are matched literally instead of being interpreted as wildcards.
    pattern = r'{}_(.*){}'.format(re.escape(filename_prefix), re.escape(filename_suffix))
    matches = re.search(pattern, filename)

    # Fail with an explicit message instead of an `AttributeError` on `None` when the filename does not match.
    if matches is None:
        raise ValueError('filename `{}` does not match the pattern `{}`'.format(filename, pattern))

    return matches.group(1)

for filename, data in data_raw:
try:
key = get_key_from_filename(filename)
data_parsed.append((key, parsers[iflag](data)))
except Exception: # pylint: disable=broad-except
return self.exit_codes.ERROR_OUTPUT_DATAFILE_PARSE.format(filename=filename)

# Create output nodes
self.out('output_data', parsed_data)
if len(data_parsed) == 1:
self.out('output_data', data_parsed[0][1])
else:
self.out('output_data_multiple', dict(data_parsed))

self.out('output_parameters', orm.Dict(dict=self.output_parameters))

def parse_stdout(self, stdout_str):
Expand Down Expand Up @@ -199,8 +234,7 @@ def detect_important_message(logs, line):
return logs, output_dict

def parse_gnuplot1D(self, data_file_str):
"""
Parse 1D GNUPlot formatted output
"""Parse 1D GNUPlot formatted output.
:param data_file_str: the data file read in as a single string
"""
Expand Down Expand Up @@ -247,11 +281,10 @@ def parse_gnuplot1D(self, data_file_str):
return arraydata

def parse_gnuplot_polar(self, data_file_str):
"""
Parse 2D Polar GNUPlot formatted, single column output
"""Parse 2D Polar GNUPlot formatted, single column output.
:param data_file_str: the data file read in as a single string
"""
:param data_file_str: the data file read in as a single string
"""
data_lines = data_file_str.splitlines()
data_lines.pop(0) # First line is a header

Expand All @@ -267,8 +300,7 @@ def parse_gnuplot_polar(self, data_file_str):
return arraydata

def parse_gnuplot2D(self, data_file_str):
"""
Parse 2D GNUPlot formatted output
"""Parse 2D GNUPlot formatted output.
:param data_file_str: the data file read in as a single string
"""
Expand Down Expand Up @@ -297,18 +329,15 @@ def parse_gnuplot2D(self, data_file_str):
return arraydata

def parse_gaussian(self, data_file_str):
"""
Parse Gaussian Cube formatted output
"""Parse Gaussian Cube formatted output.
:param data_file_str: the data file read in as a single string
"""

lines = data_file_str.splitlines()

atoms_line = lines[2].split()
atoms = int(atoms_line[0]) # The number of atoms listed in the file
header = lines[:6 + atoms
] # The header of the file: comments, the voxel, and the number of atoms and datapoints
header = lines[:6 + atoms] # Header of the file: comments, the voxel, and the number of atoms and datapoints
data_lines = lines[6 + atoms:] # The actual data: atoms and volumetric data

# Parse the declared dimensions of the volumetric data
Expand Down
12 changes: 8 additions & 4 deletions tests/calculations/test_pp.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,16 @@ def test_pp_default(aiida_profile, fixture_sandbox, generate_calc_job, generate_
calc_info = generate_calc_job(fixture_sandbox, entry_point_name, inputs)

retrieve_list = ['aiida.out']
retrieve_temporary_list = ['aiida.fileout']
retrieve_temporary_list = ['aiida.fileout', ('aiida.filplot_*aiida.fileout', '.', 0)]
local_copy_list = []

# Check the attributes of the returned `CalcInfo`
assert isinstance(calc_info, datastructures.CalcInfo)
assert sorted(calc_info.local_copy_list) == sorted(local_copy_list)
assert sorted(calc_info.retrieve_list) == sorted(retrieve_list)
assert sorted(calc_info.retrieve_temporary_list) == sorted(retrieve_temporary_list)
assert len(calc_info.retrieve_temporary_list) == 2
for element in retrieve_temporary_list:
assert element in calc_info.retrieve_temporary_list

with fixture_sandbox.open('aiida.in') as handle:
input_written = handle.read()
Expand All @@ -64,15 +66,17 @@ def test_pp_keep_plot_file(aiida_profile, fixture_sandbox, generate_calc_job, ge
inputs.metadata.options.keep_plot_file = True

calc_info = generate_calc_job(fixture_sandbox, entry_point_name, inputs)
retrieve_list = ['aiida.out', 'aiida.fileout']
retrieve_list = ['aiida.out', 'aiida.fileout', ('aiida.filplot_*aiida.fileout', '.', 0)]
retrieve_temporary_list = []
local_copy_list = []

# Check the attributes of the returned `CalcInfo`, no need to check the input file as it is not affected
assert isinstance(calc_info, datastructures.CalcInfo)
assert sorted(calc_info.local_copy_list) == sorted(local_copy_list)
assert sorted(calc_info.retrieve_list) == sorted(retrieve_list)
assert sorted(calc_info.retrieve_temporary_list) == sorted(retrieve_temporary_list)
assert len(calc_info.retrieve_list) == 3
for element in retrieve_list:
assert element in calc_info.retrieve_list


@pytest.mark.parametrize(
Expand Down
24 changes: 21 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# -*- coding: utf-8 -*-
# pylint: disable=redefined-outer-name,too-many-statements
"""Initialise a text database and profile for pytest."""
import collections
import io
import os
import collections
import shutil

import pytest

pytest_plugins = ['aiida.manage.tests.pytest_fixtures'] # pylint: disable=invalid-name
Expand Down Expand Up @@ -94,15 +96,20 @@ def flatten_inputs(inputs, prefix=''):
flat_inputs.append((prefix + key, value))
return flat_inputs

def _generate_calc_job_node(entry_point_name='base', computer=None, test_name=None, inputs=None, attributes=None):
def _generate_calc_job_node(
entry_point_name='base', computer=None, test_name=None, inputs=None, attributes=None, retrieve_temporary=None
):
"""Fixture to generate a mock `CalcJobNode` for testing parsers.
:param entry_point_name: entry point name of the calculation class
:param computer: a `Computer` instance
:param test_name: relative path of directory with test output files in the `fixtures/{entry_point_name}` folder.
:param inputs: any optional nodes to add as input links to the current CalcJobNode
:param attributes: any optional attributes to set on the node
:return: `CalcJobNode` instance with an attached `FolderData` as the `retrieved` node
:param retrieve_temporary: optional tuple of an absolute filepath of a temporary directory and a list of
filenames that should be written to this directory, which will serve as the `retrieved_temporary_folder`.
For now this only works with top-level files and does not support files nested in directories.
:return: `CalcJobNode` instance with an attached `FolderData` as the `retrieved` node.
"""
from aiida import orm
from aiida.common import LinkType
Expand Down Expand Up @@ -155,9 +162,20 @@ def _generate_calc_job_node(entry_point_name='base', computer=None, test_name=No

node.store()

if retrieve_temporary:
dirpath, filenames = retrieve_temporary
for filename in filenames:
shutil.copy(os.path.join(filepath_folder, filename), os.path.join(dirpath, filename))

if filepath_folder:
retrieved = orm.FolderData()
retrieved.put_object_from_tree(filepath_folder)

# Remove files that are supposed to be only present in the retrieved temporary folder
if retrieve_temporary:
for filename in filenames:
retrieved.delete_object(filename)

retrieved.add_incoming(node, link_type=LinkType.CREATE, link_label='retrieved')
retrieved.store()

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Cubefile created from PWScf calculation
k_point 1, band 1
1 0.000000 0.000000 0.000000
1 0.000000 0.141231 0.141231
1 0.141231 0.000000 0.141231
1 0.141231 0.141231 0.000000
14 14.000000 10.168616 10.168616 10.168616
0.16567E-01 0.14498E-01 0.96968E-02
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Cubefile created from PWScf calculation
k_point 1, band 2
1 0.000000 0.000000 0.000000
1 0.000000 0.141231 0.141231
1 0.141231 0.000000 0.141231
1 0.141231 0.141231 0.000000
14 14.000000 2.542154 2.542154 2.542154
0.22404E-01 0.19330E-01 0.12458E-01
Loading

0 comments on commit efd41ad

Please sign in to comment.