Commit
commit 5015a2e
Merge: dab04ef 3aa2796
Author: Cristian Gabellini <30401800+FNTwin@users.noreply.github.com>
Date:   Fri Aug 30 12:42:39 2024 -0400

    Merge pull request #112 from valence-labs/3bpa

    3bpa dataset

commit 3aa2796
Author: FNTwin <cristian@valencelabs.com>
Date:   Fri Aug 30 09:38:30 2024 -0600

    BPA docstrings

commit dab04ef
Author: FNTwin <cristian@valencelabs.com>
Date:   Thu Aug 8 14:08:19 2024 -0600

    Correct regex parsing + binary strings dec

commit a1061a8
Author: FNTwin <cristian@valencelabs.com>
Date:   Fri Aug 2 13:02:48 2024 -0600

    MACEOFF docstrings

commit 737b81e
Author: FNTwin <cristian@valencelabs.com>
Date:   Fri Aug 2 12:51:14 2024 -0600

    WIP

commit d63cb55
Author: FNTwin <cristian@valencelabs.com>
Date:   Fri Aug 2 12:03:31 2024 -0600

    Splits in MACEOFF

commit b613fb0
Author: Hatem Helal <hatem@valencelabs.com>
Date:   Fri Aug 2 10:33:04 2024 -0600

    fix download and parsing

commit b16a410
Author: Hatem Helal <hatem@valencelabs.com>
Date:   Fri Aug 2 07:00:53 2024 -0600

    initial scaffolding for BPA dataset

commit 09c75a7
Author: FNTwin <cristian@valencelabs.com>
Date:   Mon Jul 29 09:47:40 2024 -0600

    MaceOff dataset
Showing 5 changed files with 217 additions and 5 deletions.
@@ -0,0 +1,74 @@
from typing import Any, Dict, List

import numpy as np
from ase.atoms import Atoms

from openqdc import BaseDataset
from openqdc.methods import PotentialMethod


def read_bpa_record(subset: str, atoms: Atoms) -> Dict[str, Any]:
    # Convert a single ASE Atoms frame into the flat record layout used by BaseDataset.
    return dict(
        name=np.array([str(atoms.symbols)]),
        subset=subset,
        energies=np.array([atoms.get_potential_energy()], dtype=np.float64),
        forces=atoms.get_forces().reshape(-1, 3, 1).astype(np.float32),
        atomic_inputs=np.column_stack((atoms.numbers, atoms.get_initial_charges(), atoms.positions)).astype(np.float32),
        n_atoms=np.array([len(atoms)], dtype=np.int32),
        # The split is the prefix of the subset/file name (before the first underscore).
        split=np.array([subset.item().split("_")[0]]),
    )


class BPA(BaseDataset):
""" | ||
BPA (or 3BPA) dataset is a dataset consisting of a flexible druglike | ||
molecule 3-(benzyloxy)pyridin-2-amine. This dataset features | ||
complex dihedral potential energy surface with many local minima, | ||
which can be challenging to approximate using classical or ML force fields. | ||
The configuration were sampled from short (0.5 ps) MD simulations using the ANI-1x force field to | ||
perturb the toward lower potential energies. Furthermore, long 25 ps MD simulation were performed at | ||
three different temperatures (300, 600, and 1200 K) using the Langevin thermostat and a 1 fs time step. | ||
The final configurations were re-evaluated using ORCA at the DFT level of | ||
theory using the ωB97X exchange correlation functional and the 6-31G(d) basis set. | ||
Usage: | ||
```python | ||
from openqdc.datasets import BPA | ||
dataset = BPA() | ||
``` | ||
References: | ||
https://pubs.acs.org/doi/10.1021/acs.jctc.1c00647 | ||
""" | ||

    __name__ = "BPA"
    __energy_unit__ = "ev"
    __forces_unit__ = "ev/ang"
    __distance_unit__ = "ang"
    __force_mask__ = [True]
    __energy_methods__ = [PotentialMethod.WB97X_6_31G_D]
    __links__ = {"BPA.zip": "https://figshare.com/ndownloader/files/31325990"}

    def read_raw_entries(self) -> List[Dict]:
        import os.path as osp
        from glob import glob

        from ase.io import iread

        # Collect every .xyz file of the extracted archive except the isolated-atom reference file.
        files = glob(osp.join(self.root, "dataset_3BPA", "*.xyz"))
        files = [f for f in files if "iso_atoms.xyz" not in f]
        all_records = []

        for file in files:
            # The subset label is the source file name without its extension.
            subset = np.array([osp.basename(file).split(".")[0]])

            for atoms in iread(file, format="extxyz"):
                all_records.append(read_bpa_record(subset, atoms))

        return all_records

    def __getitem__(self, idx):
        # Expose the per-record train/test split alongside the fields returned by BaseDataset.
        data = super().__getitem__(idx)
        data.__setattr__("split", self._convert_array(self.data["split"][idx]))
        return data
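A minimal usage sketch for the split field attached by the overridden `__getitem__` above, assuming the standard openqdc `BaseDataset` interface (`len()` and integer indexing); the per-split counting loop is purely illustrative:

```python
from collections import Counter

from openqdc.datasets import BPA

dataset = BPA()

# One configuration; "split" is the extra per-record label ("train" or "test")
# derived from the prefix of the source .xyz file name.
sample = dataset[0]
print(sample.split)

# Illustrative bookkeeping: count how many configurations fall into each split.
counts = Counter(str(dataset[i].split) for i in range(len(dataset)))
print(counts)
```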
@@ -0,0 +1,133 @@
import re
from functools import partial
from os.path import join as p_join

import datamol as dm
import numpy as np

from openqdc.datasets.base import BaseDataset
from openqdc.methods import PotentialMethod
from openqdc.utils.constants import ATOMIC_NUMBERS
from openqdc.utils.molecule import get_atomic_number_and_charge


def parse_mace_xyz(xyzpath):
    # Stream an extxyz file frame by frame: each frame is an atom-count line,
    # a properties line (energy, config_type, optional smiles), then one line per atom.
    energy_re = re.compile(r"energy=(\S+)")
    smiles_re = re.compile(r"smiles=(\S+)")
    subset_re = re.compile(r"config_type=([^;]+)\ MACE_energy")
    with open(xyzpath, "r") as f:
        n_atoms = None
        counter = 0
        positions = []
        numbers = []
        forces = []
        energy = None
        for line in f:
            if n_atoms is None:
                # Start of a new frame: this line holds the atom count.
                n_atoms = int(line)
                positions = []
                numbers = []
                forces = []
                energy = None
                counter = 1
                continue
            if counter == 1:
                # Properties line of the frame.
                props = line
                energy = float(energy_re.search(props).group(1))
                subset = subset_re.search(props).group(1)
                try:
                    smiles = smiles_re.search(props).group(1)
                except AttributeError:  # water and qmugs subsets do not have smiles
                    smiles = ""
                counter = 2
                continue
            # Per-atom line: element, position, force, plus three trailing columns that are ignored.
            el, x, y, z, fx, fy, fz, _, _, _ = line.split()
            numbers.append(ATOMIC_NUMBERS[el])
            positions.append([float(x), float(y), float(z)])
            forces.append([float(fx), float(fy), float(fz)])
            smiles = smiles.replace('"', "")
            subset = subset.replace('"', "")
            counter += 1
            if counter == n_atoms + 2:
                # Frame complete: emit it and reset for the next one.
                n_atoms = None
                yield energy, numbers, positions, forces, smiles, subset


def build_data_object(data, split):
    energy, numbers, positions, forces, smiles, subset = data
    if smiles == "":
        # No SMILES available (water/qmugs frames): fall back to atomic numbers with zero charges.
        x = np.concatenate((np.array(numbers)[:, None], np.zeros((len(numbers), 1))), axis=-1)
    else:
        x = get_atomic_number_and_charge(dm.to_mol(smiles, remove_hs=False, ordered=True))
    res = dict(
        name=np.array([smiles]),
        subset=np.array([subset]),
        energies=np.array([[energy]], dtype=np.float64),
        forces=np.array(forces, dtype=np.float32).reshape(-1, 3, 1),
        atomic_inputs=np.concatenate((x, np.array(positions)), axis=-1, dtype=np.float32).reshape(-1, 5),
        n_atoms=np.array([x.shape[0]], dtype=np.int32),
        split=np.array([split]),
    )
    return res


class MACEOFF(BaseDataset):
""" | ||
MACEOFF dataset core of the dataset consist in the Spice V1 dataset. | ||
95% of the data are used for training and validation under the "train" split, | ||
and 5% for testing. The dataset uses the Spice level of theory | ||
ωB97M-D3(BJ)/def2-TZVPPD as implemented in the PSI4 software. | ||
MACEOFF uses a subset of SPICE that contains the ten chemical elements | ||
H, C, N, O, F, P, S, Cl, Br, and I, and has a neutral formal charge. | ||
MACEOFF doesn't contain ion pairs. To facilitate the learning of intramolecular | ||
non-bonded interactions, MACEOFF dataset contains larger 50–90 atom molecules | ||
randomly selected from the QMugs dataset. | ||
MACEOFF contains a number of water clusters carved out of molecular dynamics simulations | ||
of liquid water, with sizes of up to 50 water molecules and part of the | ||
COMP6 tripeptide geometry dataset. | ||
Usage: | ||
```python | ||
from openqdc.datasets import MACEOFF | ||
dataset = MACEOFF() | ||
``` | ||
Species: | ||
[H, C, N, O, F, P, S, Cl, Br, I] | ||
References: | ||
https://arxiv.org/pdf/2312.15211\n | ||
https://doi.org/10.17863/CAM.107498 | ||
""" | ||

    __name__ = "maceoff"

    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]
    __force_mask__ = [True]
    __energy_unit__ = "ev"
    __distance_unit__ = "ang"
    __forces_unit__ = "ev/ang"

    energy_target_names = ["dft_total_energy"]
    force_target_names = ["dft_total_gradient"]

    __links__ = {
        "train_large_neut_no_bad_clean.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content",  # noqa: E501
        "test_large_neut_all.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/cb8351dd-f09c-413f-921c-67a702a7f0c5/content",  # noqa: E501
    }

    def read_raw_entries(self):
        entries = []
        for filename in self.__links__:
            # "train_large_neut_no_bad_clean.tar.gz" -> xyz file "train_large_neut_no_bad_clean.xyz";
            # the leading token of the name ("train"/"test") becomes the split label.
            filename = filename.split(".")[0]
            xyzpath = p_join(self.root, f"{filename}.xyz")
            split = filename.split("_")[0]
            structure_iterator = parse_mace_xyz(xyzpath)
            func = partial(build_data_object, split=split)
            entries.extend(dm.utils.parallelized(func, structure_iterator))
        return entries

    def __getitem__(self, idx):
        data = super().__getitem__(idx)
        data.__setattr__("split", self._convert_array(self.data["split"][idx]))
        return data
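`parse_mace_xyz` is a small state machine over extxyz frames: an atom-count line, a properties line carrying `energy=`, `config_type=... MACE_energy`, and an optional `smiles=`, then one line per atom. A hedged sketch with hypothetical values, assuming `parse_mace_xyz` from the file above is in scope:

```python
import tempfile

# A single two-atom frame in the layout the parser expects; the three trailing
# columns on each atom line are read but discarded.
frame = (
    "2\n"
    'energy=-10.5 config_type="example" MACE_energy=-10.4 smiles="O"\n'
    "O 0.0 0.0 0.0 0.1 0.0 0.0 0 0 0\n"
    "H 0.0 0.0 1.0 -0.1 0.0 0.0 0 0 0\n"
)

with tempfile.NamedTemporaryFile("w", suffix=".xyz", delete=False) as tmp:
    tmp.write(frame)

for energy, numbers, positions, forces, smiles, subset in parse_mace_xyz(tmp.name):
    print(energy, numbers, smiles, subset)  # -10.5 [8, 1] O example
```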