Add basic parser

nomad-coe · Dec 8, 2023 · 6c351b5 · 6c351b5
1 parent 19dd798
commit 6c351b5
Show file tree

Hide file tree

Showing 8 changed files with 898 additions and 6 deletions.
diff --git a/.github/workflows/python-actions.yaml b/.github/workflows/python-actions.yaml
@@ -15,10 +15,13 @@ jobs:
       run: |
         pip install --upgrade pip
         pip install .[dev]
-        pip install types-PyYAML
     - name: mypy
       run: |
         python -m mypy --ignore-missing-imports --follow-imports=silent --no-strict-optional simulationparsers
+    - name: Test with pytest
+      run: |
+        python -m pytest -sv tests
+
 
   ruff:
     runs-on: ubuntu-latest

diff --git a/pyproject.toml b/pyproject.toml
@@ -10,11 +10,7 @@ readme = "README.md"
 authors = [{ name = "The NOMAD Authors" }]
 license = { file = "LICENSE" }
 dependencies = [
-    "lxml==4.7.1",
-    "pyscf==2.0.1; sys_platform == 'darwin'",
-    "netCDF4==1.5.4",
-    "h5py>=3.6.0",
-    "pyyaml==6.0",
+    "nomad-schema-plugin-run@git+https://github.com/nomad-coe/nomad-schema-plugin-run.git@develop"
 ]
 
 [project.urls]

diff --git a/simulationparsers/utils/__init__.py b/simulationparsers/utils/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2018 Markus Scheidgen
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .basic_parser import BasicParser
+
diff --git a/simulationparsers/utils/basic_parser.py b/simulationparsers/utils/basic_parser.py
@@ -0,0 +1,249 @@
+# Copyright 2018 Markus Scheidgen
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List
+import os
+import logging
+import numpy as np
+import re
+from ase.data import chemical_symbols
+
+from nomad.datamodel import EntryArchive
+from nomad.parsing.file_parser import TextParser, Quantity
+from runschema.run import Run, Program
+from runschema.method import Method
+from runschema.system import System, Atoms
+from runschema.calculation import (
+    Calculation, Energy, EnergyEntry, Forces, ForcesEntry, Thermodynamics)
+
+
+class BasicParser:
+    '''
+    Defines a fairdi parser that parses basic quantities for sections method, system and
+    calculation.
+
+    Arguments:
+        code_name: name of the code
+        units_mapping: dictionary of nomad units for basic quantities such as length;
+            the keys read are 'energy', 'length', 'mass' and 'time'
+        auxilliary_files: re pattern to match auxiliary files from mainfile. If no files
+            are found will match files in the directory of the mainfile. Note that the
+            keyword is spelled with a double l.
+        kwargs: metainfo_key: re pattern pairs used to parse quantity
+    '''
+    def __init__(self, code_name: str, **kwargs):
+        '''
+        Stores the configuration and registers the quantities of the mainfile parser.
+
+        Every keyword whose value is a string is registered as a repeating quantity
+        with that pattern. A tuple value starting with a string is read as
+        (pattern, str_operation). Any other value (e.g. the units_mapping dictionary)
+        is not registered.
+        '''
+        self.code_name = code_name
+        self.units_mapping = kwargs.get('units_mapping', {})
+        self.auxilliary_files = kwargs.get('auxilliary_files', '')
+        self.mainfile_parser = TextParser()
+        for key, pattern in kwargs.items():
+            if isinstance(pattern, str):
+                self.mainfile_parser._quantities.append(
+                    Quantity(key, pattern, repeats=True, flatten=False))
+            elif isinstance(pattern, tuple) and isinstance(pattern[0], str):
+                self.mainfile_parser._quantities.append(
+                    Quantity(key, pattern[0], str_operation=pattern[1], repeats=True))
+        # pattern of a single float, used by parse to split rows of three numbers
+        self._re_float = r'\-*\d+\.\d+E*e*\-*\+*\d*'
+        self.auxilliary_parsers: List[TextParser] = []
+
+    def init_parser(self):
+        '''
+        Initializes the mainfile and auxiliary parsers.
+
+        The auxiliary file names are taken from the 'auxilliary_files' quantity of the
+        mainfile. If that quantity is not found, the files in the directory of the
+        mainfile whose name matches the auxilliary_files pattern are used. One copy of
+        the mainfile parser is created for each existing auxiliary file.
+        '''
+        self.mainfile_parser.mainfile = self.mainfile
+        self.mainfile_parser.logger = self.logger
+
+        auxilliary_files = self.mainfile_parser.get('auxilliary_files', os.listdir(self.maindir))
+        # looked up once here instead of once per candidate file
+        from_mainfile = self.mainfile_parser.get('auxilliary_files') is not None
+        # remove duplicates, maintain order
+        auxilliary_files = list(dict.fromkeys(auxilliary_files))
+        self.auxilliary_parsers = []
+        for filename in auxilliary_files:
+            filename = os.path.basename(filename)
+            if not from_mainfile:
+                # directory listing: keep only the files matching the pattern
+                if not self.auxilliary_files or not re.match(self.auxilliary_files, filename):
+                    continue
+            filename = os.path.join(self.maindir, filename)
+            if not os.path.isfile(filename):
+                continue
+            parser = self.mainfile_parser.copy()
+            parser.mainfile = filename
+            parser.logger = self.logger
+            self.auxilliary_parsers.append(parser)
+
+    def parse(self, mainfile: str, archive: EntryArchive, logger=None, child_archives=None) -> None:
+        '''
+        Triggers parsing of mainfile and writing parsed quantities to archive.
+
+        A Run section with a Program named code_name is appended to the archive. The
+        n-th match of each quantity is written to the n-th Method, System and
+        Calculation section. A quantity that is not found in the mainfile is read from
+        the auxiliary files. Sections left empty are removed at the end.
+
+        Arguments:
+            mainfile: path to the mainfile
+            archive: archive to which the Run section is appended
+            logger: logger to use, the logging module is used if None
+            child_archives: not used by this parser
+        '''
+        self.mainfile = os.path.abspath(mainfile)
+        self.maindir = os.path.dirname(self.mainfile)
+        self.archive = archive
+        self.logger = logger if logger is not None else logging
+
+        self.init_parser()
+
+        def set_value(section, key, value, unit=None, shape=None, dtype=None):
+            '''
+            Sets value as attribute key of section, does nothing if value is None.
+
+            A value having neither an m_def nor a units attribute is converted to
+            dtype (default: its own type), reshaped if shape is given, and multiplied
+            by unit if unit is given.
+            '''
+            if value is None:
+                return
+            dtype = dtype if dtype is not None else type(value)
+            try:
+                if not hasattr(value, 'm_def') and not hasattr(value, 'units'):
+                    value = np.reshape(np.array(
+                        value, dtype=np.dtype(dtype)), shape) if shape is not None else dtype(value)
+                    value = value * unit if unit is not None else value
+                setattr(section, key, value)
+            except Exception:
+                # deliberately best effort: this is called for every parsed key, also
+                # for those that cannot be converted or set on the given section
+                pass
+
+        def get_value(source, pattern, key=None):
+            '''
+            Extracts the matches of pattern from source.
+
+            A str is searched with re.findall and a single match is returned unwrapped,
+            a list is resolved element by element, from a dict the entry key is
+            returned, anything else is returned unchanged.
+            '''
+            if isinstance(source, str):
+                val = re.findall(pattern, source)
+                return val[0] if len(val) == 1 else val
+            elif isinstance(source, list):
+                return [get_value(s, pattern) for s in source]
+            elif isinstance(source, dict):
+                return source.get(key)
+            else:
+                return source
+
+        def remove_empty_section(sections, definition):
+            '''
+            Removes the sections for which m_traverse yields no property definition.
+            '''
+            # backwards, so that the indices of the remaining sections stay valid
+            for n in range(len(sections) - 1, -1, -1):
+                filled = any(
+                    property_def is not None for _, property_def, _, _ in sections[n].m_traverse())
+                if not filled:
+                    sections[n].m_parent.m_remove_sub_section(definition, n)
+
+        sec_run = Run()
+        self.archive.run.append(sec_run)
+        sec_run.program = Program(name=self.code_name)
+
+        energy_unit = self.units_mapping.get('energy', 1.0)
+        length_unit = self.units_mapping.get('length', 1.0)
+        mass_unit = self.units_mapping.get('mass')
+        time_unit = self.units_mapping.get('time')
+
+        re_f = self._re_float
+
+        for key, values in self.mainfile_parser.items():
+            if values is None:
+                # get it from auxiliary files
+                values = []
+                for parser in self.auxilliary_parsers:
+                    values.extend(parser.get(key, []))
+            if values is None or len(values) == 0:
+                continue
+            if 'atom_velocities' in key and time_unit is None:
+                # without a time unit the velocity unit cannot be built (dividing by
+                # None raises a TypeError), the velocities are skipped
+                self.logger.warning('Cannot set atom velocities, time unit missing in units_mapping.')
+            # set header quantities
+            set_value(sec_run, key, values[0])
+            for n, value in enumerate(values):
+                if len(sec_run.method) <= n:
+                    sec_run.m_create(Method)
+                sec_method = sec_run.method[n]
+
+                if len(sec_run.system) <= n:
+                    sec_run.m_create(System)
+                    sec_run.system[-1].m_create(Atoms)
+                sec_system = sec_run.system[n]
+
+                if len(sec_run.calculation) <= n:
+                    sec_run.m_create(Calculation)
+                    sec_run.calculation[-1].m_create(Energy)
+                    sec_run.calculation[-1].m_create(Forces)
+                    sec_run.calculation[-1].m_create(Thermodynamics)
+                sec_scc = sec_run.calculation[n]
+
+                # method related quantities
+                if hasattr(Method, key):
+                    set_value(sec_method, key, value)
+
+                # system related quantities
+                elif hasattr(System, key):
+                    set_value(sec_system, key, value)
+
+                # calculation related quantities
+                elif hasattr(Calculation, key):
+                    set_value(sec_scc, key, value)
+
+                elif hasattr(Thermodynamics, key):
+                    set_value(sec_scc.thermodynamics, key, value)
+
+                # specific quantities that need formatting
+                if 'program' in key:
+                    set_value(sec_run.program, key.replace('program_', ''), value)
+
+                if 'energy' in key:
+                    is_fermi = 'fermi' in key
+                    val = value[-1] if is_fermi else EnergyEntry(value=value * energy_unit)
+                    sub_key = 'fermi' if is_fermi else key.replace('energy_', '').lower()
+                    set_value(sec_scc.energy, sub_key, val, energy_unit, None, np.float64)
+
+                if 'atom_forces' in key:
+                    val = get_value(value, rf'.*({re_f}) +({re_f}) +({re_f}).*', 'atom_forces')
+                    if mass_unit is not None and time_unit is not None:
+                        unit = mass_unit * length_unit / time_unit ** 2
+                    else:
+                        unit = energy_unit / length_unit
+                    sec_scc.forces.total = ForcesEntry()
+                    set_value(sec_scc.forces.total, 'value', val, unit, (np.size(val) // 3, 3), np.float64)
+
+                if 'lattice_vectors' in key:
+                    val = get_value(value, rf'({re_f}) +({re_f}) +({re_f}).*', 'lattice_vectors')
+                    set_value(sec_system.atoms, 'lattice_vectors', val, length_unit, (3, 3), np.float64)
+                    if val is not None:
+                        sec_system.atoms.periodic = [True, True, True]
+
+                if 'atom_positions' in key:
+                    sub_key = 'atom_positions_scaled' if 'atom_positions_scaled' in key else 'atom_positions'
+                    val = get_value(value, rf'({re_f}) +({re_f}) +({re_f}).*', sub_key)
+                    unit = length_unit
+                    if sub_key == 'atom_positions_scaled':
+                        try:
+                            val = np.dot(np.array(val, dtype=np.dtype(np.float64)), sec_system.atoms.lattice_vectors.magnitude)
+                            unit = 1.0
+                        except Exception:
+                            # e.g. lattice vectors not set: the values are kept as
+                            # parsed and the length unit is applied
+                            pass
+                    set_value(sec_system.atoms, 'positions', val, unit, (np.size(val) // 3, 3), np.float64)
+
+                if 'atom_velocities' in key and time_unit is not None:
+                    val = get_value(value, rf'({re_f}) +({re_f}) +({re_f}).*', 'atom_velocities')
+                    set_value(sec_system.atoms, 'velocities', val, length_unit / time_unit, (np.size(val) // 3, 3), np.float64)
+
+                if 'atom_labels' in key:
+                    val = get_value(value, r'([A-Z][a-z]*)\s', 'atom_labels')
+                    val = [val] if isinstance(val, str) else val
+                    set_value(sec_system.atoms, 'labels', val, shape=(len(val)), dtype=str)
+
+                if 'atom_atom_number' in key:
+                    val = get_value(value, r'(\d+)\s', 'atom_atom_number')
+                    val = [val] if isinstance(val, str) else val
+                    set_value(sec_system.atoms, 'atomic_numbers', val, shape=(len(val)), dtype=np.int32)
+                    # set_value is best effort, the numbers are unset if it failed
+                    atomic_numbers = sec_system.atoms.atomic_numbers
+                    if atomic_numbers is not None:
+                        labels = [chemical_symbols[int(number)] for number in atomic_numbers]
+                        set_value(sec_system.atoms, 'labels', labels, shape=(len(val)))
+
+        # remove unfilled sections
+        for system in sec_run.system:
+            if len(system.atoms.values()) == 0:
+                system.m_remove_sub_section(System.atoms, 0)
+        for calculation in sec_run.calculation:
+            if len(calculation.energy.values()) == 0:
+                calculation.m_remove_sub_section(Calculation.energy, 0)
+            if len(calculation.forces.values()) == 0:
+                calculation.m_remove_sub_section(Calculation.forces, 0)
+            if len(calculation.thermodynamics) > 0 and len(calculation.thermodynamics[0].values()) == 0:
+                calculation.m_remove_sub_section(Calculation.thermodynamics, 0)
+        remove_empty_section(sec_run.method, Run.method)
+        remove_empty_section(sec_run.system, Run.system)
+        remove_empty_section(sec_run.calculation, Run.calculation)
diff --git a/simulationparsers/utils/re_patterns.py b/simulationparsers/utils/re_patterns.py
@@ -0,0 +1,20 @@
+#
+# Copyright The NOMAD Authors.
+#
+# This file is part of NOMAD.
+# See https://nomad-lab.eu for further info.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Decimal number: optional sign, digits, at most one decimal point with optional
+# fraction digits, and an optional exponent whose sign may be omitted (e.g. 1.0e5).
+FLOAT = r'[-+]?\d+\.?\d*(?:[Ee][-+]?\d+)?'
diff --git a/tests/data/onetep/fluor/12-difluoroethane.dat b/tests/data/onetep/fluor/12-difluoroethane.dat
@@ -0,0 +1,69 @@
+ !============================================================!
+ ! ~~~~~~~ Quality control test for the ONETEP program ~~~~~~~!
+ !------------------------------------------------------------!
+ ! Molecule: 1,2-difluoroethane                               !
+ ! no. of C   :            2                                  !
+ ! no. of H   :            4                                  ! 
+ ! no. of F   :            2                                  !
+ ! total atoms:            8                                  !
+ !------------------------------------------------------------!
+ ! Aim: test for Hamiltonian diagonalisation and kernel DIIS. !
+ !------------------------------------------------------------!
+ ! Created by Alvaro Ruiz Serrano on 19/11/2010.              !
+ !============================================================!
+
+ print_qc             : TRUE
+ output_detail        : NORMAL
+ timings_level        : 0
+
+ task                 : SINGLEPOINT
+
+ cutoff_energy        : 800 eV
+ kernel_cutoff        : 1000
+ k_zero               : 3.5
+
+ xc_functional        : PBE
+ dispersion           : 1
+
+ kernel_diis_scheme   : dkn_pulay
+ kernel_diis_threshold: 2.0e-6
+
+ maxit_pen            : 0
+ ngwf_threshold_orig  : 2.0e-4
+
+ write_denskern       : FALSE
+ write_tightbox_ngwfs : FALSE
+ write_xyz            : FALSE
+
+
+ %block lattice_cart
+      24.5       0.0       0.0
+       0.0      24.5       0.0
+       0.0       0.0      24.5
+ %endblock lattice_cart
+
+
+ %block positions_abs
+ C      11.03265426     9.90071429     8.78389204
+ C       8.44374314     8.72153141     8.78389204
+ F       8.17540344     7.00000000    10.78510145
+ F      12.50852257     8.97286366    10.78510145
+ H      10.90604328    11.96995347     8.99365054
+ H      12.00963751     9.43773383     7.00000000
+ H       8.09792508     7.68974639     7.00755886
+ H       7.00000000    10.20873802     9.02199628
+ %endblock positions_abs
+
+
+ %block species
+ C   C   6 4 7.0
+ F   F   9 9 7.0
+ H   H   1 1 7.0
+ %endblock species
+
+
+ %block species_pot
+ H     "../../pseudo/hydrogen.recpot"
+ C     "../../pseudo/carbon.recpot"
+ F     "../../pseudo/fluorine.recpot"
+ %endblock species_pot