Commit
commit 5015a2e
Merge: dab04ef 3aa2796
Author: Cristian Gabellini <30401800+FNTwin@users.noreply.github.com>
Date:   Fri Aug 30 12:42:39 2024 -0400

    Merge pull request #112 from valence-labs/3bpa

    3bpa dataset

commit 3aa2796
Author: FNTwin <cristian@valencelabs.com>
Date:   Fri Aug 30 09:38:30 2024 -0600

    BPA docstrings

commit dab04ef
Author: FNTwin <cristian@valencelabs.com>
Date:   Thu Aug 8 14:08:19 2024 -0600

    Correct regex parsing + binary strings dec

commit a1061a8
Author: FNTwin <cristian@valencelabs.com>
Date:   Fri Aug 2 13:02:48 2024 -0600

    MACEOFF docstrings

commit 737b81e
Author: FNTwin <cristian@valencelabs.com>
Date:   Fri Aug 2 12:51:14 2024 -0600

    WIP

commit d63cb55
Author: FNTwin <cristian@valencelabs.com>
Date:   Fri Aug 2 12:03:31 2024 -0600

    Splits in MACEOFF

commit b613fb0
Author: Hatem Helal <hatem@valencelabs.com>
Date:   Fri Aug 2 10:33:04 2024 -0600

    fix download and parsing

commit b16a410
Author: Hatem Helal <hatem@valencelabs.com>
Date:   Fri Aug 2 07:00:53 2024 -0600

    initial scaffolding for BPA dataset

commit 09c75a7
Author: FNTwin <cristian@valencelabs.com>
Date:   Mon Jul 29 09:47:40 2024 -0600

    MaceOff dataset
Showing 5 changed files with 217 additions and 5 deletions.
@@ -0,0 +1,74 @@
from typing import Any, Dict, List

import numpy as np
from ase.atoms import Atoms

from openqdc import BaseDataset
from openqdc.methods import PotentialMethod


def read_bpa_record(subset: str, atoms: Atoms) -> Dict[str, Any]:
    # Convert a single ASE Atoms frame into the flat record layout used by BaseDataset.
    return dict(
        name=np.array([str(atoms.symbols)]),
        subset=subset,
        energies=np.array([atoms.get_potential_energy()], dtype=np.float64),
        forces=atoms.get_forces().reshape(-1, 3, 1).astype(np.float32),
        atomic_inputs=np.column_stack((atoms.numbers, atoms.get_initial_charges(), atoms.positions)).astype(np.float32),
        n_atoms=np.array([len(atoms)], dtype=np.int32),
        # The split is the prefix of the subset/file name (before the first underscore).
        split=np.array([subset.item().split("_")[0]]),
    )


class BPA(BaseDataset):
""" | ||
BPA (or 3BPA) dataset is a dataset consisting of a flexible druglike | ||
molecule 3-(benzyloxy)pyridin-2-amine. This dataset features | ||
complex dihedral potential energy surface with many local minima, | ||
which can be challenging to approximate using classical or ML force fields. | ||
The configuration were sampled from short (0.5 ps) MD simulations using the ANI-1x force field to | ||
perturb the toward lower potential energies. Furthermore, long 25 ps MD simulation were performed at | ||
three different temperatures (300, 600, and 1200 K) using the Langevin thermostat and a 1 fs time step. | ||
The final configurations were re-evaluated using ORCA at the DFT level of | ||
theory using the ωB97X exchange correlation functional and the 6-31G(d) basis set. | ||
Usage: | ||
```python | ||
from openqdc.datasets import BPA | ||
dataset = BPA() | ||
``` | ||
References: | ||
https://pubs.acs.org/doi/10.1021/acs.jctc.1c00647 | ||
""" | ||

    __name__ = "BPA"
    __energy_unit__ = "ev"
    __forces_unit__ = "ev/ang"
    __distance_unit__ = "ang"
    __force_mask__ = [True]
    __energy_methods__ = [PotentialMethod.WB97X_6_31G_D]
    __links__ = {"BPA.zip": "https://figshare.com/ndownloader/files/31325990"}

    def read_raw_entries(self) -> List[Dict]:
        import os.path as osp
        from glob import glob

        from ase.io import iread

        # Collect every .xyz file of the extracted archive except the isolated-atom reference file.
        files = glob(osp.join(self.root, "dataset_3BPA", "*.xyz"))
        files = [f for f in files if "iso_atoms.xyz" not in f]
        all_records = []

        for file in files:
            # The subset label is the source file name without its extension.
            subset = np.array([osp.basename(file).split(".")[0]])

            for atoms in iread(file, format="extxyz"):
                all_records.append(read_bpa_record(subset, atoms))

        return all_records

    def __getitem__(self, idx):
        # Expose the per-record train/test split alongside the fields returned by BaseDataset.
        data = super().__getitem__(idx)
        data.__setattr__("split", self._convert_array(self.data["split"][idx]))
        return data
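A minimal usage sketch for the split field attached by the overridden `__getitem__` above, assuming the standard openqdc `BaseDataset` interface (`len()` and integer indexing); the per-split counting loop is purely illustrative:

```python
from collections import Counter

from openqdc.datasets import BPA

dataset = BPA()

# One configuration; "split" is the extra per-record label ("train" or "test")
# derived from the prefix of the source .xyz file name.
sample = dataset[0]
print(sample.split)

# Illustrative bookkeeping: count how many configurations fall into each split.
counts = Counter(str(dataset[i].split) for i in range(len(dataset)))
print(counts)
```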
@@ -0,0 +1,133 @@
import re
from functools import partial
from os.path import join as p_join

import datamol as dm
import numpy as np

from openqdc.datasets.base import BaseDataset
from openqdc.methods import PotentialMethod
from openqdc.utils.constants import ATOMIC_NUMBERS
from openqdc.utils.molecule import get_atomic_number_and_charge


def parse_mace_xyz(xyzpath):
    # Stream an extxyz file frame by frame: each frame is an atom-count line,
    # a properties line (energy, config_type, optional smiles), then one line per atom.
    energy_re = re.compile(r"energy=(\S+)")
    smiles_re = re.compile(r"smiles=(\S+)")
    subset_re = re.compile(r"config_type=([^;]+)\ MACE_energy")
    with open(xyzpath, "r") as f:
        n_atoms = None
        counter = 0
        positions = []
        numbers = []
        forces = []
        energy = None
        for line in f:
            if n_atoms is None:
                # Start of a new frame: this line holds the atom count.
                n_atoms = int(line)
                positions = []
                numbers = []
                forces = []
                energy = None
                counter = 1
                continue
            if counter == 1:
                # Properties line of the frame.
                props = line
                energy = float(energy_re.search(props).group(1))
                subset = subset_re.search(props).group(1)
                try:
                    smiles = smiles_re.search(props).group(1)
                except AttributeError:  # water and qmugs subsets do not have smiles
                    smiles = ""
                counter = 2
                continue
            # Per-atom line: element, position, force, plus three trailing columns that are ignored.
            el, x, y, z, fx, fy, fz, _, _, _ = line.split()
            numbers.append(ATOMIC_NUMBERS[el])
            positions.append([float(x), float(y), float(z)])
            forces.append([float(fx), float(fy), float(fz)])
            smiles = smiles.replace('"', "")
            subset = subset.replace('"', "")
            counter += 1
            if counter == n_atoms + 2:
                # Frame complete: emit it and reset for the next one.
                n_atoms = None
                yield energy, numbers, positions, forces, smiles, subset


def build_data_object(data, split):
    energy, numbers, positions, forces, smiles, subset = data
    if smiles == "":
        # No SMILES available (water/qmugs frames): fall back to atomic numbers with zero charges.
        x = np.concatenate((np.array(numbers)[:, None], np.zeros((len(numbers), 1))), axis=-1)
    else:
        x = get_atomic_number_and_charge(dm.to_mol(smiles, remove_hs=False, ordered=True))
    res = dict(
        name=np.array([smiles]),
        subset=np.array([subset]),
        energies=np.array([[energy]], dtype=np.float64),
        forces=np.array(forces, dtype=np.float32).reshape(-1, 3, 1),
        atomic_inputs=np.concatenate((x, np.array(positions)), axis=-1, dtype=np.float32).reshape(-1, 5),
        n_atoms=np.array([x.shape[0]], dtype=np.int32),
        split=np.array([split]),
    )
    return res


class MACEOFF(BaseDataset):
""" | ||
MACEOFF dataset core of the dataset consist in the Spice V1 dataset. | ||
95% of the data are used for training and validation under the "train" split, | ||
and 5% for testing. The dataset uses the Spice level of theory | ||
ωB97M-D3(BJ)/def2-TZVPPD as implemented in the PSI4 software. | ||
MACEOFF uses a subset of SPICE that contains the ten chemical elements | ||
H, C, N, O, F, P, S, Cl, Br, and I, and has a neutral formal charge. | ||
MACEOFF doesn't contain ion pairs. To facilitate the learning of intramolecular | ||
non-bonded interactions, MACEOFF dataset contains larger 50–90 atom molecules | ||
randomly selected from the QMugs dataset. | ||
MACEOFF contains a number of water clusters carved out of molecular dynamics simulations | ||
of liquid water, with sizes of up to 50 water molecules and part of the | ||
COMP6 tripeptide geometry dataset. | ||
Usage: | ||
```python | ||
from openqdc.datasets import MACEOFF | ||
dataset = MACEOFF() | ||
``` | ||
Species: | ||
[H, C, N, O, F, P, S, Cl, Br, I] | ||
References: | ||
https://arxiv.org/pdf/2312.15211\n | ||
https://doi.org/10.17863/CAM.107498 | ||
""" | ||

    __name__ = "maceoff"

    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]
    __force_mask__ = [True]
    __energy_unit__ = "ev"
    __distance_unit__ = "ang"
    __forces_unit__ = "ev/ang"

    energy_target_names = ["dft_total_energy"]
    force_target_names = ["dft_total_gradient"]

    __links__ = {
        "train_large_neut_no_bad_clean.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content",  # noqa: E501
        "test_large_neut_all.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/cb8351dd-f09c-413f-921c-67a702a7f0c5/content",  # noqa: E501
    }

    def read_raw_entries(self):
        entries = []
        for filename in self.__links__:
            # "train_large_neut_no_bad_clean.tar.gz" -> xyz file "train_large_neut_no_bad_clean.xyz";
            # the leading token of the name ("train"/"test") becomes the split label.
            filename = filename.split(".")[0]
            xyzpath = p_join(self.root, f"{filename}.xyz")
            split = filename.split("_")[0]
            structure_iterator = parse_mace_xyz(xyzpath)
            func = partial(build_data_object, split=split)
            entries.extend(dm.utils.parallelized(func, structure_iterator))
        return entries

    def __getitem__(self, idx):
        data = super().__getitem__(idx)
        data.__setattr__("split", self._convert_array(self.data["split"][idx]))
        return data
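`parse_mace_xyz` is a small state machine over extxyz frames: an atom-count line, a properties line carrying `energy=`, `config_type=... MACE_energy`, and an optional `smiles=`, then one line per atom. A hedged sketch with hypothetical values, assuming `parse_mace_xyz` from the file above is in scope:

```python
import tempfile

# A single two-atom frame in the layout the parser expects; the three trailing
# columns on each atom line are read but discarded.
frame = (
    "2\n"
    'energy=-10.5 config_type="example" MACE_energy=-10.4 smiles="O"\n'
    "O 0.0 0.0 0.0 0.1 0.0 0.0 0 0 0\n"
    "H 0.0 0.0 1.0 -0.1 0.0 0.0 0 0 0\n"
)

with tempfile.NamedTemporaryFile("w", suffix=".xyz", delete=False) as tmp:
    tmp.write(frame)

for energy, numbers, positions, forces, smiles, subset in parse_mace_xyz(tmp.name):
    print(energy, numbers, smiles, subset)  # -10.5 [8, 1] O example
```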