diff --git a/atomisticparsers/h5md/README.md b/atomisticparsers/h5md/README.md
new file mode 100644
index 00000000..55858307
--- /dev/null
+++ b/atomisticparsers/h5md/README.md
@@ -0,0 +1,5 @@
+This is a NOMAD parser for [H5MD](http://www.??.org/), an HDF5-based format for molecular data. It reads
+an H5MD (hdf5) file and transforms its content into NOMAD's unified, Metainfo-based Archive format according to the H5MD schema.
+
+
+
diff --git a/atomisticparsers/h5md/__init__.py b/atomisticparsers/h5md/__init__.py
new file mode 100644
index 00000000..e47a4f1d
--- /dev/null
+++ b/atomisticparsers/h5md/__init__.py
@@ -0,0 +1,19 @@
+#
+# Copyright The NOMAD Authors.
+#
+# This file is part of NOMAD.
+# See https://nomad-lab.eu for further info.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from .parser import H5MDParser
diff --git a/atomisticparsers/h5md/__main__.py b/atomisticparsers/h5md/__main__.py
new file mode 100644
index 00000000..05c85dda
--- /dev/null
+++ b/atomisticparsers/h5md/__main__.py
@@ -0,0 +1,31 @@
+#
+# Copyright The NOMAD Authors.
+#
+# This file is part of NOMAD.
+# See https://nomad-lab.eu for further info.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import sys
+import json
+import logging
+
+from nomad.utils import configure_logging
+from nomad.datamodel import EntryArchive
+from atomisticparsers.h5md import H5MDParser
+
+if __name__ == "__main__":
+    configure_logging(console_log_level=logging.DEBUG)
+    archive = EntryArchive()
+    H5MDParser().parse(sys.argv[1], archive, logging)
+    json.dump(archive.m_to_dict(), sys.stdout, indent=2)
diff --git a/atomisticparsers/h5md/metainfo/__init__.py b/atomisticparsers/h5md/metainfo/__init__.py
new file mode 100644
index 00000000..871ee91c
--- /dev/null
+++ b/atomisticparsers/h5md/metainfo/__init__.py
@@ -0,0 +1,24 @@
+#
+# Copyright The NOMAD Authors.
+#
+# This file is part of NOMAD.
+# See https://nomad-lab.eu for further info.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from nomad.metainfo import Environment
+
+from . 
import h5md + +m_env = Environment() +m_env.m_add_sub_section(Environment.packages, h5md.m_package) diff --git a/atomisticparsers/h5md/metainfo/h5md.py b/atomisticparsers/h5md/metainfo/h5md.py new file mode 100644 index 00000000..361ddfd8 --- /dev/null +++ b/atomisticparsers/h5md/metainfo/h5md.py @@ -0,0 +1,190 @@ +# +# Copyright The NOMAD Authors. +# +# This file is part of NOMAD. +# See https://nomad-lab.eu for further info. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import numpy as np # pylint: disable=unused-import +import typing # pylint: disable=unused-import +from nomad.metainfo import ( # pylint: disable=unused-import + MSection, MCategory, Category, Package, Quantity, Section, SubSection, SectionProxy, + Reference +) +from nomad.datamodel.metainfo import simulation + + +m_package = Package() + + +class ParamEntry(MSection): + ''' + Generic section defining a parameter name and value + ''' + + m_def = Section(validate=False) + + kind = Quantity( + type=str, + shape=[], + description=''' + Name of the parameter. + ''') + + value = Quantity( + type=str, + shape=[], + description=''' + Value of the parameter as a string. + ''') + + unit = Quantity( + type=str, + shape=[], + description=''' + Unit of the parameter as a string. + ''') + + # TODO add description quantity + + +class CalcEntry(MSection): + ''' + Section describing a general type of calculation. + ''' + + m_def = Section(validate=False) + + kind = Quantity( + type=str, + shape=[], + description=''' + Kind of the quantity. + ''') + + value = Quantity( + type=np.dtype(np.float64), + shape=[], + description=''' + Value of this contribution. + ''') + + unit = Quantity( + type=str, + shape=[], + description=''' + Unit of the parameter as a string. + ''') + + # TODO add description quantity + +class ForceCalculations(simulation.method.ForceCalculations): + + m_def = Section(validate=False, extends_base_section=True,) + + x_h5md_parameters = SubSection( + sub_section=ParamEntry.m_def, + description=''' + Contains non-normalized force calculation parameters. + ''', + repeats=True) + +class NeighborSearching(simulation.method.NeighborSearching): + + m_def = Section(validate=False, extends_base_section=True,) + + x_h5md_parameters = SubSection( + sub_section=ParamEntry.m_def, + description=''' + Contains non-normalized neighbor searching parameters. + ''', + repeats=True) + +class AtomsGroup(simulation.system.AtomsGroup): + ''' + Describes a group of atoms which may constitute a sub system as in the case of a + molecule. + ''' + + m_def = Section(validate=False, extends_base_section=True,) + + x_h5md_parameters = SubSection( # TODO should this be called parameters or attributes or what? + sub_section=ParamEntry.m_def, + description=''' + Contains additional information about the atom group . 
+            ''',
+        repeats=True)
+
+
+class Calculation(simulation.calculation.Calculation):
+
+    m_def = Section(validate=False, extends_base_section=True,)
+
+    x_h5md_custom_calculations = SubSection(
+        sub_section=ParamEntry.m_def,
+        description='''
+        Contains other generic custom calculations that are not already defined.
+        ''',
+        repeats=True)
+
+
+class Energy(simulation.calculation.Energy):
+
+    m_def = Section(validate=False, extends_base_section=True,)
+
+    x_h5md_energy_contributions = SubSection(
+        sub_section=simulation.calculation.EnergyEntry.m_def,
+        description='''
+        Contains other custom energy contributions that are not already defined.
+        ''',
+        repeats=True)
+
+
+class Author(MSection):
+    '''
+    Contains information about the author who generated the h5md file.
+    '''
+
+    m_def = Section(validate=False)
+
+    name = Quantity(
+        type=str,
+        shape=[],
+        description='''
+        Specifies the name of the author who generated the h5md file.
+        ''',)
+
+    email = Quantity(
+        type=str,
+        shape=[],
+        description='''
+        Author's email.
+        ''',)
+
+
+class Run(simulation.run.Run):
+
+    m_def = Section(validate=False, extends_base_section=True,)
+
+    # TODO Not sure how we are dealing with versioning with H5MD-NOMAD
+    x_h5md_version = Quantity(
+        type=np.dtype(np.int32),
+        shape=[2],
+        description='''
+        Specifies the version of the h5md schema being followed.
+        ''',)
+
+    x_h5md_author = SubSection(sub_section=Author.m_def)
+
+    x_h5md_creator = SubSection(sub_section=simulation.run.Program.m_def)
diff --git a/atomisticparsers/h5md/nomad_plugin.yaml b/atomisticparsers/h5md/nomad_plugin.yaml
new file mode 100644
index 00000000..cb97a11a
--- /dev/null
+++ b/atomisticparsers/h5md/nomad_plugin.yaml
@@ -0,0 +1,19 @@
+code_category: Atomistic code
+# code_homepage: http://www.??.org/
+# TODO is this the correct category
+code_name: H5MD
+metadata:
+  codeCategory: Atomistic code
+  codeLabel: H5MD
+  codeLabelStyle: All in capitals
+  codeName: h5md
+  # codeUrl: http://www.??.org/
+  parserDirName: dependencies/parsers/atomistic/atomisticparsers/h5md/
+  parserGitUrl: https://github.com/nomad-coe/atomistic-parsers.git
+  parserSpecific: ''
+  preamble: ''
+  status: production
+  tableOfFiles: ''
+name: parsers/h5md
+parser_class_name: atomisticparsers.h5md.parser.H5MDParser
+python_package: atomisticparsers.h5md
diff --git a/atomisticparsers/h5md/parser.py b/atomisticparsers/h5md/parser.py
new file mode 100644
index 00000000..d611ab22
--- /dev/null
+++ b/atomisticparsers/h5md/parser.py
@@ -0,0 +1,684 @@
+#
+# Copyright The NOMAD Authors.
+#
+# This file is part of NOMAD.
+# See https://nomad-lab.eu for further info.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# +import os +import numpy as np +import logging +import h5py + +from typing import List, Dict +from h5py import Group + +from nomad.datamodel import EntryArchive +from nomad.metainfo.util import MEnum +from nomad.parsing.file_parser import FileParser +from nomad.datamodel.metainfo.simulation.run import Run, Program, MSection +from nomad.datamodel.metainfo.simulation.method import ( + Method, ForceField, Model, AtomParameters, ForceCalculations, NeighborSearching +) +from nomad.datamodel.metainfo.simulation.system import ( + System, AtomsGroup +) +from nomad.datamodel.metainfo.simulation.calculation import ( + Calculation, Energy, EnergyEntry, BaseCalculation +) +from nomad.datamodel.metainfo.simulation.workflow import ( + EnsembleProperty, EnsemblePropertyValues, CorrelationFunction, CorrelationFunctionValues +) +from atomisticparsers.utils import MDParser, MOL +from .metainfo.h5md import ParamEntry, CalcEntry, Author +from nomad.units import ureg + + +class HDF5Parser(FileParser): + def __init__(self): + super().__init__(None) + + @property + def filehdf5(self): + if self._file_handler is None: + try: + self._file_handler = h5py.File(self.mainfile, 'r') + except Exception: + self.logger.error('Error reading hdf5 file.') + + return self._file_handler + + def apply_unit(self, quantity, unit: str, unit_factor: float): + + if quantity is None: + return + if unit: + unit = ureg(unit) + unit *= unit_factor + quantity *= unit + + return quantity + + def decode_bytes(self, dataset): + if dataset is None: + return None + elif isinstance(dataset, np.ndarray): + if dataset.size == 0: + return None + dataset = [val.decode("utf-8") for val in dataset] if isinstance(dataset[0], bytes) else dataset + dataset = [val.__bool__() for val in dataset] if isinstance(dataset[0], bool) else dataset + elif type(dataset).__name__ == 'bool_': # TODO fix error when using isinstance() here + dataset = dataset.__bool__() + else: + dataset = dataset.decode("utf-8") if isinstance(dataset, bytes) else dataset + return dataset + + def get_attribute(self, group, attribute: str = None, path: str = None, default=None): + ''' + Extracts attribute from group object based on path, and returns default if not defined. + ''' + if path: + section_segments = path.split('.') + for section in section_segments: + try: + value = group.get(section) + group = value + except Exception: + return + value = group.attrs.get(attribute) + value = self.decode_bytes(value) if value is not None else default + + return value if value is not None else default + + def get_value(self, group, path: str, default=None): + ''' + Extracts group or dataset from group object based on path, and returns default if not defined. 
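+        Any 'unit' and 'unit_factor' attributes on the target dataset are applied to the returned value, and byte strings are decoded to str.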
+ ''' + section_segments = path.split('.') + for section in section_segments: + try: + value = group.get(section) + unit = self.get_attribute(group, 'unit', path=section) + unit_factor = self.get_attribute(group, 'unit_factor', path=section, default=1.0) + group = value + except Exception: + return + + if value is None: + value = default + elif isinstance(value, h5py.Dataset): + value = value[()] + value = self.apply_unit(value, unit, unit_factor) + value = self.decode_bytes(value) + + return value if value is not None else default + + def parse(self, path: str = None, **kwargs): + source = kwargs.get('source', self.filehdf5) + isattr = kwargs.get('isattr', False) + value = None + if isattr: + attr_path, attribute = path.rsplit('.', 1) + value = self.get_attribute(source, attribute, path=attr_path) + else: + value = self.get_value(source, path) + self._results[path] = value + +class H5MDParser(MDParser): + def __init__(self): + super().__init__() + self._data_parser = HDF5Parser() + self._n_frames = None + self._n_atoms = None + self._atom_parameters = None + self._system_info = None + self._observable_info = None + self._parameter_info = None + self._time_unit = ureg.picosecond + self._path_group_particles_all = 'particles.all' + self._path_group_positions_all = 'particles.all.position' + self._path_value_positions_all = 'particles.all.position.value' + + self._nomad_to_particles_group_map = { + 'positions': 'position', + 'velocities': 'velocity', + 'forces': 'force', + 'labels': 'species_label', + 'label': 'force_field_label', + 'mass': 'mass', + 'charge': 'charge', + } + + self._nomad_to_box_group_map = { + 'lattice_vectors': 'edges', + 'periodic': 'boundary', + 'dimension': 'dimension' + } + + def parse_atom_parameters(self): + if self._n_atoms is None: + return {} + self._atom_parameters = {} + n_atoms = self._n_atoms[0] # TODO Extend to non-static n_atoms + + atom_parameter_keys = ['label', 'mass', 'charge'] + for key in atom_parameter_keys: + value = self._data_parser.get(f'{self._path_group_particles_all}.{self._nomad_to_particles_group_map[key]}') + if value is not None: + self._atom_parameters[key] = value + else: + continue + if isinstance(self._atom_parameters[key], h5py.Group): + self.logger.warning('Time-dependent atom parameters currently not supported.' + ' Atom parameter values will not be stored.') + continue + elif len(self._atom_parameters[key]) != n_atoms: + self.logger.warning('Inconsistent length of some atom parameters.' + ' Atom parameter values will not be stored.') + continue + + def parse_system_info(self): + self._system_info = {'system': {}, 'calculation': {}} + particles_group = self._data_parser.get(self._path_group_particles_all) + positions = self._data_parser.get(self._path_value_positions_all) + n_frames = self._n_frames + if particles_group is None or positions is None or positions is None: # For now we require that positions are present in the H5MD file to store other particle attributes + self.logger.warning('No positions available in H5MD file.' + ' Other particle attributes will not be stored') + return self._system_info + + def get_value(value, steps, path=''): + if value is None: + return value + if isinstance(value, h5py.Group): + value = self._data_parser.get(f'{path}.value' if path else 'value') + path_step = f'{path}.step' if path else 'step' + attr_steps = self._data_parser.get(path_step) + if value is None or attr_steps is None: + self.logger.warning('Missing values or steps in particle attributes.' 
+ ' These attributes will not be stored.') + return None + elif sorted(attr_steps) != sorted(steps): + self.logger.warning('Distinct trajectory lengths of particle attributes not supported.' + ' These attributes will not be stored.') + return None + else: + return value + else: + return [value] * n_frames + + # get the steps based on the positions + steps = self._data_parser.get(f'{self._path_group_positions_all}.step') + if steps is None: + self.logger.warning('No step information available in H5MD file.' + ' System information cannot be parsed.') + return self._system_info + self.trajectory_steps = steps + + # get the rest of the particle quantities + values_dict = {'system': {}, 'calculation': {}} + times = self._data_parser.get(f'{self._path_group_positions_all}.time') + values_dict['system']['time'] = times + values_dict['calculation']['time'] = times + values_dict['system']['positions'] = positions + values_dict['system']['n_atoms'] = self._n_atoms + system_keys = {'labels': 'system', 'velocities': 'system', 'forces': 'calculation'} + for key, sec_key in system_keys.items(): + path = f'{self._path_group_particles_all}.{self._nomad_to_particles_group_map[key]}' + value = self._data_parser.get(path) + values_dict[sec_key][key] = get_value(value, steps, path=path) + + # get the box quantities + box = self._data_parser.get(f'{self._path_group_particles_all}.box') + if box is not None: + box_attributes = ['dimension', 'periodic'] + for box_key in box_attributes: + value = self._data_parser.get(f'{self._path_group_particles_all}.box.{self._nomad_to_box_group_map[box_key]}', + isattr=True) + values_dict['system'][box_key] = [value] * n_frames if value is not None else None + + box_key = 'lattice_vectors' + path = f'{self._path_group_particles_all}.box.{self._nomad_to_box_group_map[box_key]}' + value = self._data_parser.get(path) + values_dict['system'][box_key] = get_value(value, steps, path=path) + + # populate the dictionary + for i_step, step in enumerate(steps): + self._system_info['system'][step] = {key: val[i_step] for key,val in values_dict['system'].items()} + self._system_info['calculation'][step] = {key: val[i_step] for key,val in values_dict['calculation'].items()} + + def parse_observable_info(self): + self._observable_info = { + 'configurational': {}, + 'ensemble_average': {}, + 'correlation_function': {} + } + thermodynamics_steps = [] + observables_group = self._data_parser.get('observables') + if observables_group is None: + return self._observable_info + + def get_observable_paths(observable_group: Dict, current_path: str) -> List: + paths = [] + for obs_key in observable_group.keys(): + path = f'{obs_key}.' 
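+                # Groups without a 'type' attribute are treated as containers and traversed recursively; groups that carry 'type' are leaf observables.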
+ observable = self._data_parser.get_value(observable_group, obs_key) + observable_type = self._data_parser.get_value(observable_group, obs_key).attrs.get('type') + if not observable_type: + paths.extend(get_observable_paths(observable, f'{current_path}{path}')) + else: + paths.append(current_path + path[:-1]) + + return paths + + observable_paths = get_observable_paths(observables_group, current_path='') + for path in observable_paths: + full_path = f'observables.{path}' + observable = self._data_parser.get_value(observables_group, path) + observable_type = self._data_parser.get_value(observables_group, path).attrs.get('type') + observable_name, observable_label = path.split('.', 1) if len(path.split('.')) > 1 else [path, ''] + if observable_type == 'configurational': + steps = self._data_parser.get(f'{full_path}.step') + if steps is None: + self.logger.warning('Missing step information in some observables.' + ' These will not be stored.') + continue + thermodynamics_steps = set(list(steps) + list(thermodynamics_steps)) + times = self._data_parser.get(f'{full_path}.time') + values = self._data_parser.get(f'{full_path}.value') + if isinstance(values, h5py.Group): + self.logger.warning('Group structures within individual observables not supported.' + ' These will not be stored.') + continue + for i_step, step in enumerate(steps): + if not self._observable_info[observable_type].get(step): + self._observable_info[observable_type][step] = {} + self._observable_info[observable_type][step]['time'] = times[i_step] + observable_key = f'{observable_name}-{observable_label}' if observable_label else f'{observable_name}' + self._observable_info[observable_type][step][observable_key] = values[i_step] + else: + if observable_name not in self._observable_info[observable_type].keys(): + self._observable_info[observable_type][observable_name] = {} + self._observable_info[observable_type][observable_name][observable_label] = {} + for key in observable.keys(): + observable_attribute = self._data_parser.get(f'{full_path}.{key}') + if isinstance(observable_attribute, h5py.Group): + self.logger.warning('Group structures within individual observables not supported.' 
+ ' These will not be stored.') + continue + self._observable_info[observable_type][observable_name][observable_label][key] = observable_attribute + + self.thermodynamics_steps = thermodynamics_steps + + def parse_atomsgroup(self, nomad_sec: System or AtomsGroup, h5md_sec_particlesgroup: Group, path_particlesgroup: str): + for i_key, key in enumerate(h5md_sec_particlesgroup.keys()): + path_particlesgroup_key = f'{path_particlesgroup}.{key}' + particles_group = {group_key: self._data_parser.get(f'{path_particlesgroup_key}.{group_key}') + for group_key in h5md_sec_particlesgroup[key].keys()} + sec_atomsgroup = AtomsGroup() + nomad_sec.atoms_group.append(sec_atomsgroup) + sec_atomsgroup.type = particles_group.pop('type', None) + sec_atomsgroup.index = i_key + sec_atomsgroup.atom_indices = particles_group.pop('indices', None) + sec_atomsgroup.n_atoms = len(sec_atomsgroup.atom_indices) if sec_atomsgroup.atom_indices is not None else None + sec_atomsgroup.is_molecule = particles_group.pop('is_molecule', None) + sec_atomsgroup.label = particles_group.pop('label', None) + sec_atomsgroup.composition_formula = particles_group.pop('formula', None) + particles_subgroup = particles_group.pop('particles_group', None) + # set the remaining attributes + for particles_group_key in particles_group.keys(): + val = particles_group.get(particles_group_key) + units = val.units if hasattr(val, 'units') else None + val = val.magnitude if units is not None else val + sec_atomsgroup.x_h5md_parameters.append(ParamEntry(kind=particles_group_key, value=val, unit=units)) + # get the next atomsgroup + if particles_subgroup: + self.parse_atomsgroup(sec_atomsgroup, particles_subgroup, f'{path_particlesgroup_key}.particles_group') + + def is_valid_key_val(self, metainfo_class: MSection, key: str, val) -> bool: + if hasattr(metainfo_class, key): + quant_type = getattr(metainfo_class, key).get('type') + is_menum = isinstance(quant_type, MEnum) if quant_type else False + return False if is_menum and val not in quant_type._list else True + else: + return False + + def parse_parameter_info(self): + self._parameter_info = { + 'force_calculations': {}, + 'workflow': {} + } + + def get_parameters(parameter_group: Group, path: str) -> Dict: + param_dict = {} + for key, val in parameter_group.items(): + path_key = f'{path}.{key}' + if isinstance(val, h5py.Group): + param_dict[key] = get_parameters(val, path_key) + else: + param_dict[key] = self._data_parser.get(path_key) + if isinstance(param_dict[key], str): + param_dict[key] = param_dict[key].upper() if key == 'thermodynamic_ensemble' else param_dict[key].lower() + elif isinstance(param_dict[key], (int, np.int32, np.int64)): + param_dict[key] = param_dict[key].item() + + return param_dict + + force_calculations_group = self._data_parser.get('parameters.force_calculations') + if force_calculations_group is not None: + self._parameter_info['force_calculations'] = get_parameters(force_calculations_group, 'parameters.force_calculations') + workflow_group = self._data_parser.get('parameters.workflow') + if workflow_group is not None: + self._parameter_info['workflow'] = get_parameters(workflow_group, 'parameters.workflow') + + def parse_calculation(self): + sec_run = self.archive.run[-1] + calculation_info = self._observable_info.get('configurational') + if not calculation_info: # TODO should still create entries for system time link in this case + return + + system_info = self._system_info.get('calculation') # note: it is currently ensured in parse_system() that these have the same 
length as the system_map
+        for step in self.steps:
+            data = {
+                'method_ref': sec_run.method[-1] if sec_run.method else None,
+                'step': step,
+                'energy': {},
+            }
+            data_h5md = {
+                'x_h5md_custom_calculations': [],
+                'x_h5md_energy_contributions': []
+            }
+            data['time'] = calculation_info.get(step, {}).get('time')
+            if not data['time']:
+                data['time'] = system_info.get(step, {}).get('time')
+
+            for key, val in system_info.get(step, {}).items():
+                if key == 'forces':
+                    data[key] = dict(total=dict(value=val))
+                else:
+                    if hasattr(BaseCalculation, key):
+                        data[key] = val
+                    else:
+                        unit = None
+                        if hasattr(val, 'units'):
+                            unit = val.units
+                            val = val.magnitude
+                        data_h5md['x_h5md_custom_calculations'].append(CalcEntry(kind=key, value=val, unit=unit))
+
+            for key, val in calculation_info.get(step).items():
+                key_split = key.split('-')
+                observable_name = key_split[0]
+                observable_label = key_split[1] if len(key_split) > 1 else key_split[0]
+                if 'energ' in observable_name:  # TODO check for energies or energy when matching name
+                    if hasattr(Energy, observable_label):
+                        data['energy'][observable_label] = dict(value=val)
+                    else:
+                        data_h5md['x_h5md_energy_contributions'].append(EnergyEntry(kind=key, value=val))
+                else:
+                    if hasattr(BaseCalculation, observable_label):
+                        data[observable_label] = val
+                    else:
+                        unit = None
+                        if hasattr(val, 'units'):
+                            unit = val.units
+                            val = val.magnitude
+                        data_h5md['x_h5md_custom_calculations'].append(CalcEntry(kind=key, value=val, unit=unit))
+
+            self.parse_thermodynamics_step(data)
+            sec_calc = sec_run.calculation[-1]
+
+            if sec_calc.step != step:  # TODO check this comparison
+                sec_calc = Calculation()
+                sec_run.calculation.append(sec_calc)
+                sec_calc.step = int(step)
+                sec_calc.time = data['time']
+                for calc_entry in data_h5md['x_h5md_custom_calculations']:
+                    sec_calc.x_h5md_custom_calculations.append(calc_entry)
+                sec_energy = sec_calc.energy
+                if not sec_energy:
+                    sec_energy = Energy()
+                    sec_calc.energy = sec_energy
+                for energy_entry in data_h5md['x_h5md_energy_contributions']:
+                    sec_energy.x_h5md_energy_contributions.append(energy_entry)
+
+    def parse_system(self):
+        sec_run = self.archive.run[-1]
+
+        system_info = self._system_info.get('system')
+        if not system_info:
+            self.logger.error('No particle information found in H5MD file.')
+            return
+
+        self._system_time_map = {}
+        for i_step, step in enumerate(self.trajectory_steps):
+            time = system_info[step].pop('time')
+            atoms_dict = system_info[step]
+
+            topology = None
+            if i_step == 0:  # TODO extend to time-dependent bond lists and topologies
+                atoms_dict['bond_list'] = self._data_parser.get('connectivity.bonds')
+                path_topology = 'connectivity.particles_group'
+                topology = self._data_parser.get(path_topology)
+
+            self.parse_trajectory_step({
+                'atoms': atoms_dict
+            })
+
+            if i_step == 0 and topology:  # TODO extend to time-dependent topologies
+                self.parse_atomsgroup(sec_run.system[i_step], topology, path_topology)
+
+    def parse_method(self):
+
+        sec_method = Method()
+        self.archive.run[-1].method.append(sec_method)
+        sec_force_field = ForceField()
+        sec_method.force_field = sec_force_field
+        sec_model = Model()
+        sec_force_field.model.append(sec_model)
+
+        # get the atom parameters
+        n_atoms = self._n_atoms[0] if self._n_atoms is not None else 0  # TODO Extend to non-static n_atoms
+        for n in range(n_atoms):
+            sec_atom = AtomParameters()
+            sec_method.atom_parameters.append(sec_atom)
+
+            for key in self._atom_parameters.keys():
+                sec_atom.m_set(sec_atom.m_get_quantity_definition(key),
+                               self._atom_parameters[key][n])
+
+        # Get the interactions
+        connectivity_group = self._data_parser.get('connectivity')
+        if connectivity_group:
+            atom_labels = self._atom_parameters.get('label')
+            interaction_keys = ['bonds', 'angles', 'dihedrals', 'impropers']
+            interactions_by_type = []
+            for interaction_key in interaction_keys:
+                interaction_list = self._data_parser.get(f'connectivity.{interaction_key}')
+                if interaction_list is None:
+                    continue
+                elif isinstance(interaction_list, h5py.Group):
+                    self.logger.warning('Time-dependent interactions currently not supported.'
+                                        ' These values will not be stored')
+                    continue
+
+                interaction_type_dict = {
+                    'type': interaction_key,
+                    'n_interactions': len(interaction_list),
+                    'n_atoms': len(interaction_list[0]),
+                    'atom_indices': interaction_list,
+                    'atom_labels': np.array(atom_labels)[interaction_list] if atom_labels is not None else None
+                }
+                interactions_by_type.append(interaction_type_dict)
+            self.parse_interactions_by_type(interactions_by_type, sec_model)
+
+        # Get the force calculation parameters
+        force_calculation_parameters = self._parameter_info.get('force_calculations')
+        if force_calculation_parameters:
+            sec_force_calculations = ForceCalculations()
+            sec_force_field.force_calculations = sec_force_calculations
+            sec_neighbor_searching = NeighborSearching()
+            sec_force_calculations.neighbor_searching = sec_neighbor_searching
+
+            for key, val in force_calculation_parameters.items():
+                if not isinstance(val, dict):
+                    if self.is_valid_key_val(ForceCalculations, key, val):
+                        sec_force_calculations.m_set(sec_force_calculations.m_get_quantity_definition(key), val)
+                    else:
+                        units = val.units if hasattr(val, 'units') else None
+                        val = val.magnitude if units is not None else val
+                        sec_force_calculations.x_h5md_parameters.append(ParamEntry(kind=key, value=val, unit=units))
+                elif key == 'neighbor_searching':
+                    for neigh_key, neigh_val in val.items():
+                        if self.is_valid_key_val(NeighborSearching, neigh_key, neigh_val):
+                            sec_neighbor_searching.m_set(sec_neighbor_searching.m_get_quantity_definition(neigh_key), neigh_val)
+                        else:
+                            units = neigh_val.units if hasattr(neigh_val, 'units') else None
+                            neigh_val = neigh_val.magnitude if units is not None else neigh_val
+                            sec_neighbor_searching.x_h5md_parameters.append(ParamEntry(kind=neigh_key, value=neigh_val, unit=units))
+                else:
+                    self.logger.warning('Unknown parameters in force calculations section.'
+ ' These will not be stored.') + + def get_workflow_properties_dict( + self, observables: Dict, property_type_key=None, property_type_value_key=None, + properties_known={}, property_keys_list=[], property_value_keys_list=[]): + + def populate_property_dict(property_dict, val_name, val, flag_known_property=False): + if val is None: + return + value_unit = val.units if hasattr(val, 'units') else None + value_magnitude = val.magnitude if hasattr(val, 'units') else val + if flag_known_property: + property_dict[val_name] = value_magnitude * value_unit if value_unit else value_magnitude + else: + property_dict[f'{val_name}_unit'] = str(value_unit) if value_unit else None + property_dict[f'{val_name}_magnitude'] = value_magnitude + + workflow_properties_dict = {} + for observable_type, observable_dict in observables.items(): + flag_known_property = False + if observable_type in properties_known: + property_type_key = observable_type + property_type_value_key = properties_known[observable_type] + flag_known_property = True + property_dict = {property_type_value_key: []} + property_dict['label'] = observable_type + for key, observable in observable_dict.items(): + property_values_dict = {'label': key} + for quant_name, val in observable.items(): + if quant_name == 'val': + continue + if quant_name == 'bins': + continue + if quant_name in property_keys_list: + property_dict[quant_name] = val + if quant_name in property_value_keys_list: + property_values_dict[quant_name] = val + # TODO Still need to add custom values here. + + val = observable.get('value') + populate_property_dict(property_values_dict, 'value', val, flag_known_property=flag_known_property) + bins = observable.get('bins') + populate_property_dict(property_values_dict, 'bins', bins, flag_known_property=flag_known_property) + property_dict[property_type_value_key].append(property_values_dict) + + if workflow_properties_dict.get(property_type_key): + workflow_properties_dict[property_type_key].append(property_dict) + else: + workflow_properties_dict[property_type_key] = [property_dict] + + return workflow_properties_dict + + def parse_workflow(self): + + workflow_parameters = self._parameter_info.get('workflow').get('molecular_dynamics') + if workflow_parameters is None: + return + + workflow_results = {} + ensemble_average_observables = self._observable_info.get('ensemble_average') + ensemble_property_dict = { + 'property_type_key': 'ensemble_properties', + 'property_type_value_key': 'ensemble_property_values', + 'properties_known': {'radial_distribution_functions': 'radial_distribution_function_values'}, + 'property_keys_list': EnsembleProperty.m_def.all_quantities.keys(), + 'property_value_keys_list': EnsemblePropertyValues.m_def.all_quantities.keys() + } + workflow_results.update( + self.get_workflow_properties_dict(ensemble_average_observables, **ensemble_property_dict) + ) + correlation_function_observables = self._observable_info.get('correlation_function') + correlation_function_dict = { + 'property_type_key': 'correlation_functions', + 'property_type_value_key': 'correlation_function_values', + 'properties_known': {'mean_squared_displacements': 'mean_squared_displacement_values'}, + 'property_keys_list': CorrelationFunction.m_def.all_quantities.keys(), + 'property_value_keys_list': CorrelationFunctionValues.m_def.all_quantities.keys() + } + workflow_results.update( + self.get_workflow_properties_dict(correlation_function_observables, **correlation_function_dict) + ) + self.parse_md_workflow(dict(method=workflow_parameters, 
results=workflow_results)) + + def parse(self, filepath, archive: EntryArchive, logger): + + self.filepath = os.path.abspath(filepath) + self.archive = archive + self.logger = logging.getLogger(__name__) if logger is None else logger + self._maindir = os.path.dirname(self.filepath) + self._h5md_files = os.listdir(self._maindir) + self._basename = os.path.basename(filepath).rsplit('.', 1)[0] + self._data_parser.mainfile = self.filepath + if self._data_parser.filehdf5 is None: + self.logger.warning('hdf5 file missing in H5MD Parser.') + return + + positions = self._data_parser.get(self._path_value_positions_all) + if positions is not None: + self._n_frames = len(positions) if positions is not None else None + self._n_atoms = [len(pos) for pos in positions] if positions is not None else None + + # Parse the hdf5 groups + sec_run = Run() + self.archive.run.append(sec_run) + + group_h5md = self._data_parser.get('h5md') + if group_h5md: + program_name = self._data_parser.get('h5md.program.name', isattr=True) + program_version = self._data_parser.get('h5md.program.version', isattr=True) + sec_run.program = Program(name=program_name, version=program_version) + h5md_version = self._data_parser.get('h5md.version', isattr=True) + sec_run.x_h5md_version = h5md_version + h5md_author_name = self._data_parser.get('h5md.author.name', isattr=True) + h5md_author_email = self._data_parser.get('h5md.author.email', isattr=True) + sec_run.x_h5md_author = Author(name=h5md_author_name, email=h5md_author_email) + h5md_creator_name = self._data_parser.get('h5md.creator.name', isattr=True) + h5md_creator_version = self._data_parser.get('h5md.creator.version', isattr=True) + sec_run.x_h5md_creator = Program(name=h5md_creator_name, version=h5md_creator_version) + else: + self.logger.warning('"h5md" group missing in (H5MD)hdf5 file.' 
+ ' Program and author metadata will be missing!') + + self.parse_atom_parameters() + self.parse_system_info() + self.parse_observable_info() + self.parse_parameter_info() + + # Populate the archive + self.parse_method() + + self.parse_system() + + self.parse_calculation() + + self.parse_workflow() diff --git a/atomisticparsers/utils/__init__.py b/atomisticparsers/utils/__init__.py index ea312945..a753b24e 100644 --- a/atomisticparsers/utils/__init__.py +++ b/atomisticparsers/utils/__init__.py @@ -18,3 +18,4 @@ from .mdanalysis import MDAnalysisParser from .parsers import MDParser +MOL = 6.022140857e+23 diff --git a/atomisticparsers/utils/parsers.py b/atomisticparsers/utils/parsers.py index 75c95808..3fe97c83 100644 --- a/atomisticparsers/utils/parsers.py +++ b/atomisticparsers/utils/parsers.py @@ -234,3 +234,13 @@ def parse_interactions(self, interactions: List[Dict], sec_model: MSection) -> N if not sec_interaction.n_atoms: sec_interaction.n_atoms = len(sec_interaction.get('atom_indices')[0]) if sec_interaction.get('atom_indices') is not None else None + + + def parse_interactions_by_type(self, interactions_by_type: List[Dict], sec_model: MSection) -> None: + + for interaction_type_dict in interactions_by_type: + self.parse_section(interaction_type_dict, sec_model.m_create(Interaction)) + # TODO Shift Gromacs and Lammps parsers to use this function as well if possible + + + diff --git a/tests/data/h5md/openmm/test_traj_openmm_5frames.h5 b/tests/data/h5md/openmm/test_traj_openmm_5frames.h5 new file mode 100644 index 00000000..c686fd15 Binary files /dev/null and b/tests/data/h5md/openmm/test_traj_openmm_5frames.h5 differ diff --git a/tests/test_h5md.py b/tests/test_h5md.py new file mode 100644 index 00000000..a5901918 --- /dev/null +++ b/tests/test_h5md.py @@ -0,0 +1,179 @@ +# +# Copyright The NOMAD Authors. +# +# This file is part of NOMAD. See https://nomad-lab.eu for further info. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pytest +import numpy as np + +from nomad.datamodel import EntryArchive +from atomisticparsers.h5md import H5MDParser + + +def approx(value, abs=0, rel=1e-6): + return pytest.approx(value, abs=abs, rel=rel) + + +@pytest.fixture(scope='module') +def parser(): + return H5MDParser() + + +def test_md(parser): + archive = EntryArchive() + parser.parse('tests/data/h5md/openmm/test_traj_openmm_5frames.h5', archive, None) + + sec_run = archive.run[0] + assert sec_run.program.name == 'OpenMM' + assert sec_run.program.version == '-1.-1.-1' + assert len(sec_run.x_h5md_version) == 2 + assert sec_run.x_h5md_version[1] == 0 + assert sec_run.x_h5md_author.name == 'Joseph F. 
Rudzinski' + assert sec_run.x_h5md_author.email == 'joseph.rudzinski@physik.hu-berlin.de' + assert sec_run.x_h5md_creator.name == 'h5py' + assert sec_run.x_h5md_creator.version == '3.6.0' + + sec_method = sec_run.method + sec_atom_params = sec_method[0].atom_parameters + assert len(sec_atom_params) == 31583 + assert sec_atom_params[164].label == 'O' + assert sec_atom_params[164].mass.to('amu').magnitude == approx(15.999429702758789) + assert sec_atom_params[164].charge.to('e').magnitude == approx(-0.5) + + assert len(sec_method[0].force_field.model[0].contributions) == 3 + assert sec_method[0].force_field.model[0].contributions[1].type == 'angles' + assert sec_method[0].force_field.model[0].contributions[1].n_interactions == 762 + assert sec_method[0].force_field.model[0].contributions[1].n_atoms == 3 + assert sec_method[0].force_field.model[0].contributions[1].atom_labels[10][0] == 'O' + assert sec_method[0].force_field.model[0].contributions[1].atom_indices[100][1] == 51 + + assert sec_method[0].force_field.force_calculations.vdw_cutoff.to('nm').magnitude == approx(1.2) + assert sec_method[0].force_field.force_calculations.coulomb_type == 'particle_mesh_ewald' + assert sec_method[0].force_field.force_calculations.coulomb_cutoff.to('nm').magnitude == approx(1.2) + assert sec_method[0].force_field.force_calculations.neighbor_searching.neighbor_update_frequency == 1 + assert sec_method[0].force_field.force_calculations.neighbor_searching.neighbor_update_cutoff.to('nm').magnitude == approx(1.2) + + sec_systems = sec_run.system + assert len(sec_systems) == 5 + assert np.shape(sec_systems[0].atoms.positions) == (31583, 3) + assert np.shape(sec_systems[0].atoms.velocities) == (31583, 3) + assert sec_systems[0].atoms.n_atoms == 31583 + assert sec_systems[0].atoms.labels[100] == 'H' + + assert sec_systems[2].atoms.positions[800][1].to('angstrom').magnitude == approx(26.860575) + assert sec_systems[2].atoms.velocities[1200][2].to('angstrom/ps').magnitude == approx(400.0) + assert sec_systems[3].atoms.lattice_vectors[2][2].to('angstrom').magnitude == approx(68.22318) + assert sec_systems[0].atoms.bond_list[200][0] == 198 + + sec_atoms_group = sec_systems[0].atoms_group + assert len(sec_atoms_group) == 4 + assert sec_atoms_group[0].label == 'group_1ZNF' + assert sec_atoms_group[0].type == 'molecule_group' + assert sec_atoms_group[0].composition_formula == '1ZNF(1)' + assert sec_atoms_group[0].n_atoms == 423 + assert sec_atoms_group[0].atom_indices[159] == 159 + assert sec_atoms_group[0].is_molecule is False + sec_proteins = sec_atoms_group[0].atoms_group + assert len(sec_proteins) == 1 + assert sec_proteins[0].label == '1ZNF' + assert sec_proteins[0].type == 'molecule' + assert sec_proteins[0].composition_formula == 'ACE(1)TYR(1)LYS(1)CYS(1)GLY(1)LEU(1)CYS(1)GLU(1)ARG(1)SER(1)PHE(1)VAL(1)GLU(1)LYS(1)SER(1)ALA(1)LEU(1)SER(1)ARG(1)HIS(1)GLN(1)ARG(1)VAL(1)HIS(1)LYS(1)ASN(1)NH2(1)' + assert sec_proteins[0].n_atoms == 423 + assert sec_proteins[0].atom_indices[400] == 400 + assert sec_proteins[0].is_molecule is True + sec_res_group = sec_proteins[0].atoms_group + assert len(sec_res_group) == 27 + assert sec_res_group[14].label == 'group_ARG' + assert sec_res_group[14].type == 'monomer_group' + assert sec_res_group[14].composition_formula == 'ARG(1)' + assert sec_res_group[14].n_atoms == 24 + assert sec_res_group[14].atom_indices[2] == 329 + assert sec_res_group[14].is_molecule is False + sec_res = sec_res_group[14].atoms_group + assert len(sec_res) == 1 + assert sec_res[0].label == 'ARG' + assert 
sec_res[0].type == 'monomer' + assert sec_res[0].composition_formula == 'C(1)CA(1)CB(1)CD(1)CG(1)CZ(1)H(1)HA(1)HB2(1)HB3(1)HD2(1)HD3(1)HE(1)HG2(1)HG3(1)HH11(1)HH12(1)HH21(1)HH22(1)N(1)NE(1)NH1(1)NH2(1)O(1)' + assert sec_res[0].n_atoms == 24 + assert sec_res[0].atom_indices[10] == 337 + assert sec_res[0].is_molecule is False + assert sec_res[0].x_h5md_parameters[0].kind == 'hydrophobicity' + assert sec_res[0].x_h5md_parameters[0].value == '0.81' + assert sec_res[0].x_h5md_parameters[0].unit is None + + sec_calc = sec_run.calculation + assert len(sec_calc) == 5 + assert np.shape(sec_calc[1].forces.total.value) == (31583, 3) + assert sec_calc[1].forces.total.value[2100][2].to('newton').magnitude == approx(500.0) + assert sec_calc[2].temperature.to('kelvin').magnitude == approx(300.0) + assert len(sec_calc[1].x_h5md_custom_calculations) == 1 + assert sec_calc[1].x_h5md_custom_calculations[0].kind == 'custom_thermo' + assert sec_calc[1].x_h5md_custom_calculations[0].value == approx(100.0) + assert sec_calc[1].x_h5md_custom_calculations[0].unit == 'newton / angstrom ** 2' + assert sec_calc[2].time.to('ps').magnitude == approx(2.0) + assert sec_calc[2].energy.kinetic.value.to('kilojoule').magnitude == approx(2.0) + assert sec_calc[2].energy.potential.value.to('kilojoule').magnitude == approx(1.0) + assert sec_calc[1].energy.x_h5md_energy_contributions[0].kind == 'energy-custom' + assert sec_calc[1].energy.x_h5md_energy_contributions[0].value.magnitude == approx(3000.0) + + sec_workflow = archive.workflow2 + assert sec_workflow.m_def.name == 'MolecularDynamics' + sec_method = sec_workflow.method + assert sec_method.thermodynamic_ensemble == 'NPT' + assert sec_method.integrator_type == 'langevin_leap_frog' + assert sec_method.integration_timestep.to('ps').magnitude == approx(2.0e-15) + assert sec_method.n_steps == 20000000 + assert sec_method.coordinate_save_frequency == 10000 + assert sec_method.thermostat_parameters[0].thermostat_type == 'langevin_leap_frog' + assert sec_method.thermostat_parameters[0].reference_temperature.to('kelvin').magnitude == approx(300.0) + assert sec_method.thermostat_parameters[0].coupling_constant.to('ps').magnitude == approx(1.0) + assert sec_method.barostat_parameters[0].barostat_type == 'berendsen' + assert sec_method.barostat_parameters[0].coupling_type == 'isotropic' + assert np.all(sec_method.barostat_parameters[0].reference_pressure.to('bar').magnitude == [[1.0, 0., 0.], [0., 1.0, 0.], [0., 0., 1.0]]) + assert np.all(sec_method.barostat_parameters[0].coupling_constant.to('ps').magnitude == [[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]) + assert np.all(sec_method.barostat_parameters[0].compressibility.to('1/bar').magnitude == [[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]) + + sec_workflow_results = sec_workflow.results + assert len(sec_workflow_results.ensemble_properties) == 1 + ensemble_property_0 = sec_workflow_results.ensemble_properties[0] + assert ensemble_property_0.label == 'diffusion_constants' + assert ensemble_property_0.error_type == 'Pearson_correlation_coefficient' + assert len(ensemble_property_0.ensemble_property_values) == 2 + assert ensemble_property_0.ensemble_property_values[1].label == 'MOL2' + assert ensemble_property_0.ensemble_property_values[1].errors == 0.95 + assert ensemble_property_0.ensemble_property_values[1].value_magnitude == 2. 
+ assert ensemble_property_0.ensemble_property_values[1].value_unit == 'nanometer ** 2 / picosecond' + ensemble_property_1 = sec_workflow_results.radial_distribution_functions[0] + assert ensemble_property_1.label == 'radial_distribution_functions' + assert ensemble_property_1.type == 'molecular' + assert len(ensemble_property_1.radial_distribution_function_values) == 3 + assert ensemble_property_1.radial_distribution_function_values[1].label == 'MOL1-MOL2' + assert ensemble_property_1.radial_distribution_function_values[1].n_bins == 651 + assert ensemble_property_1.radial_distribution_function_values[1].frame_start == 0 + assert ensemble_property_1.radial_distribution_function_values[1].frame_end == 4 + assert ensemble_property_1.radial_distribution_function_values[1].bins[51].to('nm').magnitude == approx(0.255) + assert ensemble_property_1.radial_distribution_function_values[1].value[51] == approx(0.284764) + correlation_function_0 = sec_workflow_results.mean_squared_displacements[0] + assert correlation_function_0.type == 'molecular' + assert correlation_function_0.label == 'mean_squared_displacements' + assert correlation_function_0.direction == 'xyz' + assert correlation_function_0.error_type == 'standard_deviation' + assert len(correlation_function_0.mean_squared_displacement_values) == 2 + assert correlation_function_0.mean_squared_displacement_values[0].label == 'MOL1' + assert correlation_function_0.mean_squared_displacement_values[0].n_times == 51 + assert correlation_function_0.mean_squared_displacement_values[0].times[10].to('ps').magnitude == approx(20.0) + assert correlation_function_0.mean_squared_displacement_values[0].value[10].to('nm**2').magnitude == approx(0.679723) + assert correlation_function_0.mean_squared_displacement_values[0].errors[10] == approx(0.0)
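
For a quick manual check outside of pytest, the parser can also be driven programmatically, in the same way `__main__.py` does; the values in the comments follow the assertions above for the packaged OpenMM test file:

```python
from nomad.datamodel import EntryArchive
from atomisticparsers.h5md import H5MDParser

# Parse the 5-frame OpenMM test trajectory shipped with this PR into a fresh archive.
archive = EntryArchive()
H5MDParser().parse('tests/data/h5md/openmm/test_traj_openmm_5frames.h5', archive, None)

sec_run = archive.run[0]
print(sec_run.program.name)             # 'OpenMM'
print(len(sec_run.system))              # 5 trajectory frames
print(sec_run.system[0].atoms.n_atoms)  # 31583
```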
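For preparing input by hand, the following is a minimal, hypothetical sketch of the kind of HDF5 layout the parser queries. The group, dataset, and attribute names are taken from the paths read in `parser.py` (`particles/all/position/{value,step,time}`, the `box` subgroup, and the `h5md` metadata group); the file name and all values are invented placeholders, and the snippet is meant to illustrate the expected group structure rather than produce a complete, validated NOMAD entry (real H5MD files would typically also carry `observables`, `connectivity`, and `parameters` groups):

```python
import h5py
import numpy as np

n_frames, n_atoms = 3, 4
with h5py.File('minimal_h5md_example.h5', 'w') as f:
    # 'h5md' metadata group: version, author, creator and program info are read as attributes
    h5md = f.create_group('h5md')
    h5md.attrs['version'] = [1, 0]
    h5md.create_group('author').attrs['name'] = 'A. Author'
    creator = h5md.create_group('creator')
    creator.attrs['name'] = 'h5py'
    creator.attrs['version'] = h5py.__version__
    program = h5md.create_group('program')
    program.attrs['name'] = 'MyMDProgram'
    program.attrs['version'] = '0.0.1'

    # particles/all/position with value, step and time; units are attached as attributes
    particles = f.create_group('particles/all')
    position = particles.create_group('position')
    value = position.create_dataset('value', data=np.zeros((n_frames, n_atoms, 3)))
    value.attrs['unit'] = 'angstrom'
    position.create_dataset('step', data=np.arange(n_frames))
    time = position.create_dataset('time', data=0.5 * np.arange(n_frames))
    time.attrs['unit'] = 'ps'
    particles.create_dataset('species_label', data=['H', 'O', 'H', 'O'],
                             dtype=h5py.string_dtype())

    # simulation box: 'dimension' and 'boundary' as attributes, edges as a value/step group
    box = particles.create_group('box')
    box.attrs['dimension'] = 3
    box.attrs['boundary'] = ['periodic', 'periodic', 'periodic']
    edges = box.create_group('edges')
    edges_value = edges.create_dataset('value', data=np.tile(5.0 * np.eye(3), (n_frames, 1, 1)))
    edges_value.attrs['unit'] = 'angstrom'
    edges.create_dataset('step', data=np.arange(n_frames))
```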