Skip to content

Commit

Permalink
Squashed commit of the following:
Browse files Browse the repository at this point in the history
commit 5015a2e
Merge: dab04ef 3aa2796
Author: Cristian Gabellini <30401800+FNTwin@users.noreply.github.com>
Date:   Fri Aug 30 12:42:39 2024 -0400

    Merge pull request #112 from valence-labs/3bpa

    3bpa dataset

commit 3aa2796
Author: FNTwin <cristian@valencelabs.com>
Date:   Fri Aug 30 09:38:30 2024 -0600

    BPA docstrings

commit dab04ef
Author: FNTwin <cristian@valencelabs.com>
Date:   Thu Aug 8 14:08:19 2024 -0600

    Correct regex parsing + binary strings dec

commit a1061a8
Author: FNTwin <cristian@valencelabs.com>
Date:   Fri Aug 2 13:02:48 2024 -0600

    MACEOFF docstrings

commit 737b81e
Author: FNTwin <cristian@valencelabs.com>
Date:   Fri Aug 2 12:51:14 2024 -0600

    WIP

commit d63cb55
Author: FNTwin <cristian@valencelabs.com>
Date:   Fri Aug 2 12:03:31 2024 -0600

    Splits in MACEOFF

commit b613fb0
Author: Hatem Helal <hatem@valencelabs.com>
Date:   Fri Aug 2 10:33:04 2024 -0600

    fix download and parsing

commit b16a410
Author: Hatem Helal <hatem@valencelabs.com>
Date:   Fri Aug 2 07:00:53 2024 -0600

    initial scaffolding for BPA dataset

commit 09c75a7
Author: FNTwin <cristian@valencelabs.com>
Date:   Mon Jul 29 09:47:40 2024 -0600

    MaceOff dataset
  • Loading branch information
FNTwin committed Aug 30, 2024
1 parent e052bf6 commit 8847998
Show file tree
Hide file tree
Showing 5 changed files with 217 additions and 5 deletions.
4 changes: 4 additions & 0 deletions openqdc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def get_project_root():
"ANI1CCX_V2": "openqdc.datasets.potential.ani",
"ANI1X": "openqdc.datasets.potential.ani",
"ANI2X": "openqdc.datasets.potential.ani",
"BPA": "openqdc.datasets.potential.bpa",
"Spice": "openqdc.datasets.potential.spice",
"SpiceV2": "openqdc.datasets.potential.spice",
"SpiceVL2": "openqdc.datasets.potential.spice",
Expand All @@ -31,6 +32,7 @@ def get_project_root():
"COMP6": "openqdc.datasets.potential.comp6",
"GDML": "openqdc.datasets.potential.gdml",
"Molecule3D": "openqdc.datasets.potential.molecule3d",
"MACEOFF": "openqdc.datasets.potential.maceoff",
"OrbnetDenali": "openqdc.datasets.potential.orbnet_denali",
"SN2RXN": "openqdc.datasets.potential.sn2_rxn",
"QM7X": "openqdc.datasets.potential.qm7x",
Expand Down Expand Up @@ -117,11 +119,13 @@ def __dir__():
# POTENTIAL
from .datasets.potential.alchemy import Alchemy
from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X
from .datasets.potential.bpa import BPA
from .datasets.potential.comp6 import COMP6
from .datasets.potential.dummy import Dummy, PredefinedDataset
from .datasets.potential.gdml import GDML
from .datasets.potential.geom import GEOM
from .datasets.potential.iso_17 import ISO17
from .datasets.potential.maceoff import MACEOFF
from .datasets.potential.md22 import MD22
from .datasets.potential.molecule3d import Molecule3D
from .datasets.potential.multixcqm9 import MultixcQM9, MultixcQM9_V2
Expand Down
4 changes: 4 additions & 0 deletions openqdc/datasets/potential/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from .alchemy import Alchemy
from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X
from .bpa import BPA
from .comp6 import COMP6
from .dummy import Dummy, PredefinedDataset
from .gdml import GDML
from .geom import GEOM
from .iso_17 import ISO17
from .maceoff import MACEOFF
from .md22 import MD22
from .molecule3d import Molecule3D
from .multixcqm9 import MultixcQM9, MultixcQM9_V2
Expand Down Expand Up @@ -33,11 +35,13 @@
"ANI1CCX_V2": ANI1CCX_V2,
"ANI1X": ANI1X,
"ANI2X": ANI2X,
"BPA": BPA,
"COMP6": COMP6,
"GDML": GDML,
"GEOM": GEOM,
"ISO17": ISO17,
"Molecule3D": Molecule3D,
"MACEOFF": MACEOFF,
"NablaDFT": NablaDFT,
"OrbnetDenali": OrbnetDenali,
"PCQM_B3LYP": PCQM_B3LYP,
Expand Down
7 changes: 2 additions & 5 deletions openqdc/datasets/potential/ani.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def convert_forces(self, x):
return super().convert_forces(x) * 0.529177249 # correct the Dataset error

def __smiles_converter__(self, x):
return x
return "-".join(x.decode("ascii").split("-")[:-1])


class ANI1CCX(ANI1):
Expand Down Expand Up @@ -195,10 +195,7 @@ class ANI1CCX(ANI1):
__links__ = {"ani1x.hdf5.gz": "https://zenodo.org/record/4081694/files/292.hdf5.gz"}

def __smiles_converter__(self, x):
"""util function to convert string to smiles: useful if the smiles is
encoded in a different format than its display format
"""
return x
return x.decode("ascii")


class ANI1CCX_V2(ANI1CCX):
Expand Down
74 changes: 74 additions & 0 deletions openqdc/datasets/potential/bpa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from typing import Any, Dict, List

import numpy as np
from ase.atoms import Atoms

from openqdc import BaseDataset
from openqdc.methods import PotentialMethod


def read_bpa_record(subset: "np.ndarray | str", atoms: "Atoms") -> Dict[str, Any]:
    """Convert one ASE structure into a raw BPA record dictionary.

    Args:
        subset: Name of the subset the structure belongs to, given either as
            a plain string or as a one-element array (the form
            ``BPA.read_raw_entries`` passes). The text before the first
            underscore of that name is used as the split label.
        atoms: Structure exposing ``symbols``, ``numbers``, ``positions``,
            ``get_potential_energy()``, ``get_forces()`` and
            ``get_initial_charges()``.

    Returns:
        A dictionary with the keys ``name``, ``subset``, ``energies``,
        ``forces``, ``atomic_inputs``, ``n_atoms`` and ``split``.
    """
    # Normalise to a one-element array: the previous code called
    # ``subset.item()`` unconditionally, which fails for the ``str`` the
    # signature advertised. An array argument passes through unchanged.
    subset = np.atleast_1d(np.asarray(subset))
    subset_name = str(subset.item())

    return dict(
        name=np.array([str(atoms.symbols)]),
        subset=subset,
        energies=np.array([atoms.get_potential_energy()], dtype=np.float64),
        # One trailing axis of length 1 is appended to the per-atom forces.
        forces=atoms.get_forces().reshape(-1, 3, 1).astype(np.float32),
        # Columns: atomic number, initial charge, x, y, z.
        atomic_inputs=np.column_stack((atoms.numbers, atoms.get_initial_charges(), atoms.positions)).astype(np.float32),
        n_atoms=np.array([len(atoms)], dtype=np.int32),
        split=np.array([subset_name.split("_")[0]]),
    )


class BPA(BaseDataset):
    """
    The BPA (or 3BPA) dataset consists of configurations of a flexible druglike
    molecule, 3-(benzyloxy)pyridin-2-amine. The dataset features a
    complex dihedral potential energy surface with many local minima,
    which can be challenging to approximate using classical or ML force fields.
    The configurations were sampled from short (0.5 ps) MD simulations using the ANI-1x force field to
    perturb them toward lower potential energies. Furthermore, long 25 ps MD simulations were performed at
    three different temperatures (300, 600, and 1200 K) using the Langevin thermostat and a 1 fs time step.
    The final configurations were re-evaluated using ORCA at the DFT level of
    theory using the ωB97X exchange correlation functional and the 6-31G(d) basis set.
    Usage:
    ```python
    from openqdc.datasets import BPA
    dataset = BPA()
    ```
    References:
        https://pubs.acs.org/doi/10.1021/acs.jctc.1c00647
    """

    __name__ = "BPA"
    __energy_unit__ = "ev"
    __forces_unit__ = "ev/ang"
    __distance_unit__ = "ang"
    __force_mask__ = [True]
    __energy_methods__ = [PotentialMethod.WB97X_6_31G_D]
    __links__ = {"BPA.zip": "https://figshare.com/ndownloader/files/31325990"}

    def read_raw_entries(self) -> List[Dict]:
        """Read every ``*.xyz`` file under ``<root>/dataset_3BPA`` into records.

        Each file's base name (text before the first ``.``) is used as the
        subset name of all structures it contains. Any path containing
        ``iso_atoms.xyz`` is skipped.

        Returns:
            One dictionary per structure, as built by ``read_bpa_record``.
        """
        import os.path as osp
        from glob import glob

        from ase.io import iread

        # glob() yields paths in arbitrary, filesystem-dependent order; sorting
        # makes the order of the produced records reproducible across machines.
        files = sorted(glob(osp.join(self.root, "dataset_3BPA", "*.xyz")))
        # NOTE(review): iso_atoms.xyz presumably holds isolated-atom reference
        # data rather than molecular configurations -- confirm.
        files = [f for f in files if "iso_atoms.xyz" not in f]
        all_records = []

        for file in files:
            subset = np.array([osp.basename(file).split(".")[0]])

            for atoms in iread(file, format="extxyz"):
                all_records.append(read_bpa_record(subset, atoms))

        return all_records

    def __getitem__(self, idx):
        """Return the base-class item for ``idx`` with a ``split`` attribute added."""
        data = super().__getitem__(idx)
        # Plain attribute assignment instead of calling the dunder directly.
        data.split = self._convert_array(self.data["split"][idx])
        return data
133 changes: 133 additions & 0 deletions openqdc/datasets/potential/maceoff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import re
from functools import partial
from os.path import join as p_join

import datamol as dm
import numpy as np

from openqdc.datasets.base import BaseDataset
from openqdc.methods import PotentialMethod
from openqdc.utils.constants import ATOMIC_NUMBERS
from openqdc.utils.molecule import get_atomic_number_and_charge


def parse_mace_xyz(xyzpath):
    """Lazily parse a multi-frame MACE-OFF xyz file.

    Every frame is expected to consist of an atom-count line, a property
    line, and one line per atom holding ten whitespace-separated fields.
    The first seven fields are read as element symbol, x, y, z, fx, fy, fz;
    the last three are discarded.

    Args:
        xyzpath: Path of the xyz file to read.

    Yields:
        ``(energy, numbers, positions, forces, smiles, subset)`` for each
        frame. ``smiles`` is an empty string when the property line has no
        ``smiles=`` entry.

    Raises:
        AttributeError: If a property line lacks the ``energy=`` or
            ``config_type=`` entry.
        ValueError: If an atom-count line is not an integer or an atom line
            does not have exactly ten fields.
    """
    energy_re = re.compile(r"energy=(\S+)")
    smiles_re = re.compile(r"smiles=(\S+)")
    subset_re = re.compile(r"config_type=([^;]+)\ MACE_energy")
    with open(xyzpath, "r") as f:
        n_atoms = None
        counter = 0
        for line in f:
            if n_atoms is None:
                # Tolerate blank lines between frames and at the end of the
                # file; int() would otherwise raise ValueError on them.
                if not line.strip():
                    continue
                n_atoms = int(line)
                positions = []
                numbers = []
                forces = []
                counter = 1
                continue
            if counter == 1:
                energy = float(energy_re.search(line).group(1))
                # Quotes are stripped once here rather than on every atom line.
                subset = subset_re.search(line).group(1).replace('"', "")
                smiles_match = smiles_re.search(line)
                # water and qmugs subsets do not have smiles
                smiles = smiles_match.group(1).replace('"', "") if smiles_match else ""
                counter = 2
                continue
            el, x, y, z, fx, fy, fz, _, _, _ = line.split()
            numbers.append(ATOMIC_NUMBERS[el])
            positions.append([float(x), float(y), float(z)])
            forces.append([float(fx), float(fy), float(fz)])
            counter += 1
            # counter starts at 2 on the first atom line, so the frame is
            # complete once it reaches n_atoms + 2.
            if counter == n_atoms + 2:
                n_atoms = None
                yield energy, numbers, positions, forces, smiles, subset


def build_data_object(data, split):
    """Assemble the raw record dictionary for one parsed MACE-OFF frame.

    Args:
        data: Tuple ``(energy, numbers, positions, forces, smiles, subset)``
            as yielded by ``parse_mace_xyz``.
        split: Split label stored with the record.

    Returns:
        A dictionary with the keys ``name``, ``subset``, ``energies``,
        ``forces``, ``atomic_inputs``, ``n_atoms`` and ``split``.
    """
    energy, numbers, positions, forces, smiles, subset = data

    if smiles != "":
        # Atomic numbers and charges are taken from the molecule built from
        # the SMILES string.
        mol = dm.to_mol(smiles, remove_hs=False, ordered=True)
        species_and_charge = get_atomic_number_and_charge(mol)
    else:
        # Without a SMILES string, fall back to the parsed atomic numbers
        # paired with a zero charge column.
        z_column = np.array(numbers)[:, None]
        zero_charges = np.zeros((len(numbers), 1))
        species_and_charge = np.concatenate((z_column, zero_charges), axis=-1)

    coordinates = np.array(positions)
    atomic_inputs = np.concatenate((species_and_charge, coordinates), axis=-1, dtype=np.float32)

    return {
        "name": np.array([smiles]),
        "subset": np.array([subset]),
        "energies": np.array([[energy]], dtype=np.float64),
        "forces": np.array(forces, dtype=np.float32).reshape(-1, 3, 1),
        "atomic_inputs": atomic_inputs.reshape(-1, 5),
        "n_atoms": np.array([species_and_charge.shape[0]], dtype=np.int32),
        "split": np.array([split]),
    }


class MACEOFF(BaseDataset):
    """
    The core of the MACEOFF dataset is the Spice V1 dataset.
    95% of the data are used for training and validation under the "train" split,
    and 5% for testing. The dataset uses the Spice level of theory,
    ωB97M-D3(BJ)/def2-TZVPPD, as implemented in the PSI4 software.
    MACEOFF uses the subset of SPICE that contains the ten chemical elements
    H, C, N, O, F, P, S, Cl, Br, and I, and has a neutral formal charge.
    MACEOFF does not contain ion pairs. To facilitate the learning of intramolecular
    non-bonded interactions, MACEOFF contains larger 50–90 atom molecules
    randomly selected from the QMugs dataset.
    MACEOFF also contains a number of water clusters carved out of molecular dynamics simulations
    of liquid water, with sizes of up to 50 water molecules, and part of the
    COMP6 tripeptide geometry dataset.
    Usage:
    ```python
    from openqdc.datasets import MACEOFF
    dataset = MACEOFF()
    ```
    Species:
        [H, C, N, O, F, P, S, Cl, Br, I]
    References:
        https://arxiv.org/pdf/2312.15211\n
        https://doi.org/10.17863/CAM.107498
    """

    __name__ = "maceoff"

    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]
    __force_mask__ = [True]
    __energy_unit__ = "ev"
    __distance_unit__ = "ang"
    __forces_unit__ = "ev/ang"

    energy_target_names = ["dft_total_energy"]
    force_target_names = ["dft_total_gradient"]

    __links__ = {
        "train_large_neut_no_bad_clean.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content",  # noqa: E501
        "test_large_neut_all.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/cb8351dd-f09c-413f-921c-67a702a7f0c5/content",  # noqa: E501
    }

    def read_raw_entries(self):
        """Parse the xyz file matching each archive in ``__links__``.

        For every archive name, the text before the first ``.`` gives the
        stem of the xyz file expected under ``self.root``, and the text
        before the first ``_`` of that stem gives the split label.

        Returns:
            A list with one record dictionary per parsed frame.
        """
        records = []
        for archive_name in self.__links__:
            stem = archive_name.split(".")[0]
            split_label = stem.split("_")[0]
            frames = parse_mace_xyz(p_join(self.root, f"{stem}.xyz"))
            make_record = partial(build_data_object, split=split_label)
            records.extend(dm.utils.parallelized(make_record, frames))
        return records

    def __getitem__(self, idx):
        """Return the base-class item for ``idx`` with a ``split`` attribute added."""
        item = super().__getitem__(idx)
        item.split = self._convert_array(self.data["split"][idx])
        return item

0 comments on commit 8847998

Please sign in to comment.