From 821eecc7dfb6212c83f8fab67ea07b708f6fbe7e Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Wed, 4 Oct 2023 14:31:43 +0000 Subject: [PATCH 01/12] Added ani docs --- src/openqdc/datasets/ani.py | 47 +++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py index f0b3335..aaa2f8b 100644 --- a/src/openqdc/datasets/ani.py +++ b/src/openqdc/datasets/ani.py @@ -9,6 +9,24 @@ class ANI1(BaseDataset): + """ + The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small + organic molecules with energy labels calculated using DFT. The molecules + contain 4 distinct atoms, C, N, O and H. + + Usage + ```python + from openqdc.datasets import ANI1 + dataset = ANI1() + ``` + + References: + - ANI-1x: https://doi.org/10.1063/1.5023802 + - ANI-1ccx: https://doi.org/10.1038/s41467-019-10827-4 + - wB97x/def2-TZVPP data: https://doi.org/10.1126/sciadv.aav6490 + - Github: https://github.com/aiqm/ANI1x_datasets + """ + __name__ = "ani1" # Energy in hartree, all zeros by default @@ -42,6 +60,20 @@ def read_raw_entries(self): class ANI1CCX(ANI1): + """ + + + Usage + ```python + from openqdc.datasets import ANI1CCX + dataset = ANI1CCX() + ``` + + References: + - ANI-1ccx: https://doi.org/10.1038/s41467-019-10827-4 + - Github: https://github.com/aiqm/ANI1x_datasets + """ + __name__ = "ani1ccx" # Energy in hartree, all zeros by default @@ -69,6 +101,21 @@ def __init__(self) -> None: class ANI1X(ANI1): + """ + The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning which leads to + a total of 5,496,771 conformers with 63,865 unique molecules. 
+ + Usage + ```python + from openqdc.datasets import ANI1X + dataset = ANI1X() + ``` + + References: + - ANI-1x: https://doi.org/10.1063/1.5023802 + - Github: https://github.com/aiqm/ANI1x_datasets + """ + __name__ = "ani1x" # Energy in hartree, all zeros by default From 5df2d5e2ed4f5b54cb34f6315e480c86f7e8ea54 Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Wed, 4 Oct 2023 14:54:58 +0000 Subject: [PATCH 02/12] Added docs for ani, comp6 and gdml --- src/openqdc/datasets/ani.py | 4 +--- src/openqdc/datasets/comp6.py | 16 ++++++++++++++++ src/openqdc/datasets/gdml.py | 23 +++++++++++++++++++++++ 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py index aaa2f8b..ff911ff 100644 --- a/src/openqdc/datasets/ani.py +++ b/src/openqdc/datasets/ani.py @@ -21,9 +21,7 @@ class ANI1(BaseDataset): ``` References: - - ANI-1x: https://doi.org/10.1063/1.5023802 - - ANI-1ccx: https://doi.org/10.1038/s41467-019-10827-4 - - wB97x/def2-TZVPP data: https://doi.org/10.1126/sciadv.aav6490 + - ANI-1: https://www.nature.com/articles/sdata2017193 - Github: https://github.com/aiqm/ANI1x_datasets """ diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/comp6.py index 96811c9..0fc8cd3 100644 --- a/src/openqdc/datasets/comp6.py +++ b/src/openqdc/datasets/comp6.py @@ -7,6 +7,22 @@ class COMP6(BaseDataset): + """ + COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space + developed for testing the ANI-1x potential. It is curated from 6 benchmark sets: + S66x8, ANI Molecular Dynamics, GDB7to9, GDB10to13, DrugBank, and Tripeptides. 
+ + Usage + ```python + from openqdc.datasets import COMP6 + dataset = COMP6() + ``` + + References: + - https://aip.scitation.org/doi/abs/10.1063/1.5023802 + - Github: https://github.com/isayev/COMP6 + """ + __name__ = "comp6" # Energy in hartree, all zeros by default diff --git a/src/openqdc/datasets/gdml.py b/src/openqdc/datasets/gdml.py index c0ca093..ab38b90 100644 --- a/src/openqdc/datasets/gdml.py +++ b/src/openqdc/datasets/gdml.py @@ -7,6 +7,29 @@ class GDML(BaseDataset): + """ + Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio + molecular dynamics (AIMD) trajectories. The dataset consists of, + - Benzene: 627000 samples + - Uracil: 133000 samples + - Naphthalene: 326000 samples + - Aspirin: 211000 samples + - Salicylic Acid: 320000 samples + - Malonaldehyde: 993000 samples + - Ethanol: 555000 samples + - Toluene: 100000 samples + + Usage + ```python + from openqdc.datasets import GDML + dataset = GDML() + ``` + + References: + - https://www.science.org/doi/10.1126/sciadv.1603015 + - http://www.sgdml.org/#datasets + """ + __name__ = "gdml" # Energy in hartree, all zeros by default From 7045e6e7abff1db2c442aa2286eae52d503510ed Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Wed, 4 Oct 2023 14:58:21 +0000 Subject: [PATCH 03/12] Added ani docs --- src/openqdc/datasets/ani.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py index ff911ff..ee4bea5 100644 --- a/src/openqdc/datasets/ani.py +++ b/src/openqdc/datasets/ani.py @@ -59,7 +59,8 @@ def read_raw_entries(self): class ANI1CCX(ANI1): """ - + ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset. The selected + conformations are then labelled using a high accuracy CCSD(T)*/CBS method. 
Usage ```python From 99a3506edfbb34805b5422ac40be2584052a7f50 Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Thu, 5 Oct 2023 18:15:45 +0000 Subject: [PATCH 04/12] Updated docs for geom, molecule3d, orbnet_denali, qmugs --- src/openqdc/datasets/base.py | 22 +++++++------- src/openqdc/datasets/geom.py | 30 ++++++++++++++++---- src/openqdc/datasets/molecule3d.py | 41 +++++++++++++++++++++++++-- src/openqdc/datasets/orbnet_denali.py | 27 ++++++++++++------ src/openqdc/datasets/qmugs.py | 16 +++++++++++ 5 files changed, 111 insertions(+), 25 deletions(-) diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 392144d..96e0f0c 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -1,7 +1,9 @@ import os from os.path import join as p_join +from typing import Dict, List, Optional import numpy as np +import pandas as pd import torch from loguru import logger from sklearn.utils import Bunch @@ -18,7 +20,13 @@ from openqdc.utils.molecule import atom_table -def extract_entry(df, i, subset, energy_target_names, force_target_names=None): +def extract_entry( + df: pd.DataFrame, + i: int, + subset: str, + energy_target_names: List[str], + force_target_names: Optional[List[str]] = None, +) -> Dict[str, np.ndarray]: x = np.array([atom_table.GetAtomicNumber(s) for s in df["symbols"][i]]) xs = np.stack((x, np.zeros_like(x)), axis=-1) positions = df["geometry"][i].reshape((-1, 3)) @@ -42,18 +50,12 @@ def extract_entry(df, i, subset, energy_target_names, force_target_names=None): return res -def read_qc_archive_h5(raw_path, subset, energy_target_names, force_target_names): +def read_qc_archive_h5( + raw_path: str, subset: str, energy_target_names: List[str], force_target_names: List[str] +) -> List[Dict[str, np.ndarray]]: data = load_hdf5_file(raw_path) data_t = {k2: data[k1][k2][:] for k1 in data.keys() for k2 in data[k1].keys()} n = len(data_t["molecule_id"]) - # print(f"Reading {n} entries from {raw_path}") - # for k in data_t: - # 
print(f"Loaded {k} with shape {data_t[k].shape}, dtype {data_t[k].dtype}") - # if "Energy" in k: - # print(np.isnan(data_t[k]).mean(), f"{data_t[k][0]}") - - # print('\n'*3) - # exit() samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))] return samples diff --git a/src/openqdc/datasets/geom.py b/src/openqdc/datasets/geom.py index 6af826e..eebcc66 100644 --- a/src/openqdc/datasets/geom.py +++ b/src/openqdc/datasets/geom.py @@ -1,4 +1,5 @@ from os.path import join as p_join +from typing import Dict import datamol as dm import numpy as np @@ -9,7 +10,7 @@ from openqdc.utils.molecule import get_atomic_number_and_charge -def read_mol(mol_id, mol_dict, base_path, partition): +def read_mol(mol_id: str, mol_dict, base_path: str, partition: str) -> Dict[str, np.ndarray]: """Read molecule from pickle file and return dict with conformers and energies Parameters @@ -20,15 +21,18 @@ def read_mol(mol_id, mol_dict, base_path, partition): Dictionary containing the pickle_path and smiles of the molecule base_path: str Path to the folder containing the pickle files + partition: str + Name of the dataset partition, one of ['qm9', 'drugs'] Returns ------- res: dict Dictionary containing the following keys: - - atomic_inputs: flatten np.ndarray of shape (M, 4) containing the atomic numbers and positions - - smiles: np.ndarray of shape (N,) containing the smiles of the molecule - - energies: np.ndarray of shape (N,1) containing the energies of the conformers - - n_atoms: np.ndarray of shape (N,) containing the number of atoms in each conformer + - atomic_inputs: flatten np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions + - smiles: np.ndarray of shape (N,) containing the smiles of the molecule + - energies: np.ndarray of shape (N,1) containing the energies of the conformers + - n_atoms: np.ndarray of shape (N,) containing the number of atoms in each conformer + - subset: np.ndarray of shape (N,) 
containing the name of the dataset partition """ try: @@ -56,6 +60,22 @@ def read_mol(mol_id, mol_dict, base_path, partition): class GEOM(BaseDataset): + """ + The Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules + from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, + and physical chemistry. The dataset is generated using the GFN2-xTB semi-empirical method. + + Usage: + ```python + from openqdc.datasets import GEOM + dataset = GEOM() + ``` + + References: + - https://www.nature.com/articles/s41597-022-01288-4 + - https://github.com/learningmatter-mit/geom + """ + __name__ = "geom" __energy_methods__ = ["gfn2_xtb"] diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/molecule3d.py index 0d59400..e5870ca 100644 --- a/src/openqdc/datasets/molecule3d.py +++ b/src/openqdc/datasets/molecule3d.py @@ -1,5 +1,6 @@ from glob import glob from os.path import join as p_join +from typing import Dict, List import datamol as dm import numpy as np @@ -12,7 +13,26 @@ from openqdc.utils.molecule import get_atomic_number_and_charge -def read_mol(mol, energy): +def read_mol(mol: Chem.rdchem.Mol, energy: float) -> Dict[str, np.ndarray]: + """Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies + + Parameters + ---------- + mol: Chem.rdchem.Mol + RDKit molecule + energy: float + Energy of the molecule + + Returns + ------- + res: dict + Dictionary containing the following keys: + - name: np.ndarray of shape (N,) containing the smiles of the molecule + - atomic_inputs: flatten np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions + - energies: np.ndarray of shape (1,) containing the energy of the conformer + - n_atoms: np.ndarray of shape (1) containing the number of atoms in the conformer + - subset: np.ndarray of shape (1) containing "molecule3d" + """ smiles = dm.to_smiles(mol, explicit_hs=False) # subset = 
dm.to_smiles(dm.to_scaffold_murcko(mol, make_generic=True), explicit_hs=False) x = get_atomic_number_and_charge(mol) @@ -29,7 +49,8 @@ def read_mol(mol, energy): return res -def _read_sdf(sdf_path, properties_path): +def _read_sdf(sdf_path: str, properties_path: str) -> List[Dict[str, np.ndarray]]: + """Reads the sdf path and properties file.""" properties = pd.read_csv(properties_path, dtype={"cid": str}) properties.drop_duplicates(subset="cid", inplace=True, keep="first") xys = properties[["cid", "scf energy"]] @@ -45,6 +66,22 @@ def _read_sdf(sdf_path, properties_path): class Molecule3D(BaseDataset): + """ + Molecule3D dataset consists of 3,899,647 molecules with ground state geometries and energies + calculated at B3LYP/6-31G* level of theory. The molecules are extracted from the + PubChem database and cleaned by removing invalid molecule files. + + Usage: + ```python + from openqdc.datasets import Molecule3D + dataset = Molecule3D() + ``` + + References: + - https://arxiv.org/abs/2110.01717 + - https://github.com/divelab/MoleculeX + """ + __name__ = "molecule3d" __energy_methods__ = ["b3lyp_6-31g*"] diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/orbnet_denali.py index 452cce1..2d8b093 100644 --- a/src/openqdc/datasets/orbnet_denali.py +++ b/src/openqdc/datasets/orbnet_denali.py @@ -1,4 +1,5 @@ from os.path import join as p_join +from typing import Dict, List import datamol as dm import numpy as np @@ -9,7 +10,7 @@ from openqdc.utils.molecule import atom_table -def read_mol(mol_id, conf_dict, base_path, energy_target_names): +def read_mol(mol_id, conf_dict, base_path, energy_target_names: List[str]) -> Dict[str, np.ndarray]: res = [] for conf_id, conf_label in conf_dict.items(): try: @@ -34,6 +35,23 @@ def read_mol(mol_id, conf_dict, base_path, energy_target_names): class OrbnetDenali(BaseDataset): + """ + Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. 
It performs + DFT (ωB97X-D3/def2-TZVP) calculations on molecules and geometries consisting of organic molecules + and chemistries, with protonation and tautomeric states, non-covalent interactions, common salts, + and counterions, spanning the most common elements in bio and organic chemistry. + + Usage: + ```python + from openqdc.datasets import OrbnetDenali + dataset = OrbnetDenali() + ``` + + References: + - https://arxiv.org/pdf/2107.00299.pdf + - https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867 + """ + __name__ = "orbnet_denali" __energy_methods__ = ["wb97x-d3_tz", "gfn1_xtb"] @@ -53,13 +71,6 @@ def read_raw_entries(self): for mol_id, group in df.groupby("mol_id") } - # print(df.head()) - # tmp = df.to_dict('index') - # for i, k in enumerate(tmp): - # print(k, tmp[k]) - # if i > 10: - # break - # exit() fn = lambda x: read_mol(x[0], x[1], self.root, self.energy_target_names) res = dm.parallelized(fn, list(labels.items()), scheduler="threads", n_jobs=-1, progress=True) samples = sum(res, []) diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py index b528f42..d15d83b 100644 --- a/src/openqdc/datasets/qmugs.py +++ b/src/openqdc/datasets/qmugs.py @@ -36,6 +36,22 @@ def read_mol(mol_dir): class QMugs(BaseDataset): + """ + The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules + extracted from the ChEMBL database. The atomic and molecular properties are calculated using both, + semi-empirical methods (GFN2-xTB) and DFT method (ωB97X-D/def2-SVP). 
+ + Usage: + ```python + from openqdc.datasets import QMugs + dataset = QMugs() + ``` + + References: + - https://www.nature.com/articles/s41597-022-01390-7#ethics + - https://www.research-collection.ethz.ch/handle/20.500.11850/482129 + """ + __name__ = "qmugs" __energy_methods__ = ["gfn2_xtb", "b3lyp/6-31g*"] From bf3c08a970332ecaaf7041a8142dc2654fabc01a Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Thu, 5 Oct 2023 19:14:48 +0000 Subject: [PATCH 05/12] Updated docs for spice, iso17, nabladft --- src/openqdc/datasets/__init__.py | 25 +++++++++++++++++++++++++ src/openqdc/datasets/iso_17.py | 17 +++++++++++++++++ src/openqdc/datasets/nabladft.py | 19 ++++++++++++++++++- src/openqdc/datasets/spice.py | 17 +++++++++++++++++ 4 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 src/openqdc/datasets/__init__.py diff --git a/src/openqdc/datasets/__init__.py b/src/openqdc/datasets/__init__.py new file mode 100644 index 0000000..7e1811e --- /dev/null +++ b/src/openqdc/datasets/__init__.py @@ -0,0 +1,25 @@ +from .comp6 import COMP6 +from .gdml import GDML +from .geom import GEOM +from .iso_17 import ISO17 +from .molecule3d import Molecule3D +from .nabladft import NablaDFT +from .orbnet_denali import OrbnetDenali +from .qmugs import QMugs +from .sn2_rxn import SN2RXN +from .spice import Spice + +__all__ = [ + "Spice", + "GEOM", + "QMugs", + "NablaDFT", + "ISO17", + "COMP6", + "GDML", + "Molecule3D", + "NablaDFT", + "OrbnetDenali", + "QMugs", + "SN2RXN", +] diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/iso_17.py index 55f395c..a26f382 100644 --- a/src/openqdc/datasets/iso_17.py +++ b/src/openqdc/datasets/iso_17.py @@ -7,6 +7,23 @@ class ISO17(BaseDataset): + """ + ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed + composition of atoms (C7O2H10) arranged in different chemically valid structures. 
It consists + of 129 molecules each containing 5,000 conformational geometries, energies and forces with a resolution + of 1 femtosecond in the molecular dynamics trajectories. The simulations were carried out using the + Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der Waals correction method. + + Usage: + ```python + from openqdc.datasets import ISO17 + dataset = ISO17() + ``` + + References: + - https://paperswithcode.com/dataset/iso17 + """ + __name__ = "iso_17" # Energy in hartree, all zeros by default diff --git a/src/openqdc/datasets/nabladft.py b/src/openqdc/datasets/nabladft.py index 3234011..d5e55d2 100644 --- a/src/openqdc/datasets/nabladft.py +++ b/src/openqdc/datasets/nabladft.py @@ -1,5 +1,6 @@ import os from os.path import join as p_join +from typing import Dict import datamol as dm import numpy as np @@ -10,7 +11,7 @@ from openqdc.utils.constants import MAX_ATOMIC_NUMBER -def to_mol(entry): +def to_mol(entry) -> Dict[str, np.ndarray]: Z, R, E, F = entry[:4] C = np.zeros_like(Z) @@ -37,6 +38,22 @@ def read_chunk_from_db(raw_path, start_idx, stop_idx, step_size=1000): class NablaDFT(BaseDataset): + """ + NablaDFT is a dataset constructed from a subset of the + [Molecular Sets (MOSES) dataset](https://github.com/molecularsets/moses) consisting of 1 million molecules + with 5,340,152 unique conformations generated using ωB97X-D/def2-SVP level of theory. 
+ + Usage: + ```python + from openqdc.datasets import NablaDFT + dataset = NablaDFT() + ``` + + References: + - https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D + - https://github.com/AIRI-Institute/nablaDFT + """ + __name__ = "nabladft" __energy_methods__ = ["wb97x-d_svp"] diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py index 88af6dc..e273690 100644 --- a/src/openqdc/datasets/spice.py +++ b/src/openqdc/datasets/spice.py @@ -32,8 +32,25 @@ def read_record(r): class Spice(BaseDataset): + """ + Spice Dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of + small molecules, dimers, dipeptides, and solvated amino acids. It consists of both forces and energies calculated + at ωB97M-D3(BJ)/def2-TZVPPD level of theory. + + Usage: + ```python + from openqdc.datasets import Spice + dataset = Spice() + ``` + + References: + - https://arxiv.org/abs/2209.10702 + - https://github.com/openmm/spice-dataset + """ + __name__ = "spice" __energy_methods__ = ["wb97x_tz"] + __force_methods__ = ["wb97x_tz"] energy_target_names = ["dft_total_energy"] From 7eb107f3b7d23f70f022e9670a934554574112d7 Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Thu, 5 Oct 2023 19:17:03 +0000 Subject: [PATCH 06/12] Added README from other branch --- README.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/README.md b/README.md index 9300362..5ac528b 100644 --- a/README.md +++ b/README.md @@ -19,3 +19,30 @@ You can run tests locally with: ```bash pytest ``` + +# Overview of Datasets + + + +We provide support for the following publicly available QM Datasets. 
+ +| Dataset | Description | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | +| --- | --- | --- | --- | --- | --- | --- | --- | +| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) | | 450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | +| [Molecule3D](https://arxiv.org/abs/2110.01717) | | 3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | +| [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) | | 1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | +| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) | | 665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | +| [Spice](https://arxiv.org/abs/2209.10702) | | 19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | +| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | | 57,462 | 348 | 20,000,000 | No | 4 | ωB97x:6-31G(d) | +| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | | 86,665 | | | No | | TPSSh-D3BJ/def2-SVP | +| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | +| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | +| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB +| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | Probes chemical reactions of methyl halides with halide anions of the kind $X^- + H_3C-Y \to X-CH_3 + Y^{-1}$| 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | From ee19191cc5c25f6df76ef1c6184bbb79dff43b23 Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Thu, 5 Oct 2023 19:18:43 +0000 Subject: [PATCH 07/12] Updated README --- README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 5ac528b..ac969a2 100644 --- a/README.md +++ b/README.md @@ -33,16 +33,16 @@ pytest We 
provide support for the following publicly available QM Datasets. -| Dataset | Description | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | -| --- | --- | --- | --- | --- | --- | --- | --- | -| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) | | 450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | -| [Molecule3D](https://arxiv.org/abs/2110.01717) | | 3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | -| [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) | | 1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | -| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) | | 665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | -| [Spice](https://arxiv.org/abs/2209.10702) | | 19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | -| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | | 57,462 | 348 | 20,000,000 | No | 4 | ωB97x:6-31G(d) | -| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | | 86,665 | | | No | | TPSSh-D3BJ/def2-SVP | -| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | -| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | -| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB -| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | Probes chemical reactions of methyl halides with halide anions of the kind $X^- + H_3C-Y \to X-CH_3 + Y^{-1}$| 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | +| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | +| --- | --- | --- | --- | --- | --- | --- | +| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) | 450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | +| 
[Molecule3D](https://arxiv.org/abs/2110.01717) | 3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | +| [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) | 1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | +| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) | 665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | +| [Spice](https://arxiv.org/abs/2209.10702) | 19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | +| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | 57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | +| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | 86,665 | | No | | TPSSh-D3BJ/def2-SVP | +| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | +| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | +| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB +| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | From a665a132d248d6d37378fdf5c51e3359d6b962af Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Thu, 5 Oct 2023 19:30:42 +0000 Subject: [PATCH 08/12] Added qm7x to the readme --- README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index ac969a2..560a613 100644 --- a/README.md +++ b/README.md @@ -33,16 +33,16 @@ pytest We provide support for the following publicly available QM Datasets. 
-| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | -| --- | --- | --- | --- | --- | --- | --- | -| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) | 450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | -| [Molecule3D](https://arxiv.org/abs/2110.01717) | 3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | -| [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) | 1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | -| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) | 665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | -| [Spice](https://arxiv.org/abs/2209.10702) | 19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | -| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | 57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | -| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | 86,665 | | No | | TPSSh-D3BJ/def2-SVP | -| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | -| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | -| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB -| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | +| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations| +| --- | --- | --- | --- | --- | --- | --- | --- | +| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) | 450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | | +| [Molecule3D](https://arxiv.org/abs/2110.01717) | 3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | | +| [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) | 1,000,000 | 5,000,000 | 5 | No | 6 | 
ωB97X-D/def2-SVP | | +| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) | 665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | | +| [Spice](https://arxiv.org/abs/2209.10702) | 19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | | +| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | 57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | | +| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | 86,665 | No | | | | TPSSh-D3BJ/def2-SVP | | +| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | | +| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | | +| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | | +| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | | From a639f20f2495e57a54b832497451050181ec3a08 Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Thu, 5 Oct 2023 19:32:09 +0000 Subject: [PATCH 09/12] Added qm7x to the readme --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 560a613..05ae11d 100644 --- a/README.md +++ b/README.md @@ -35,14 +35,15 @@ We provide support for the following publicly available QM Datasets. 
| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations| | --- | --- | --- | --- | --- | --- | --- | --- | -| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) | 450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | | -| [Molecule3D](https://arxiv.org/abs/2110.01717) | 3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | | +| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) | 450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | No | +| [Molecule3D](https://arxiv.org/abs/2110.01717) | 3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | No | | [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) | 1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | | | [QMugs](https://www.nature.com/articles/s41597-022-01390-7) | 665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | | -| [Spice](https://arxiv.org/abs/2209.10702) | 19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | | -| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | 57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | | -| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | 86,665 | No | | | | TPSSh-D3BJ/def2-SVP | | +| [Spice](https://arxiv.org/abs/2209.10702) | 19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | Yes | +| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | 57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | Yes | +| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | 86,665 | | | No | | TPSSh-D3BJ/def2-SVP | | | [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | | | [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | | -| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | | +| [OrbNet 
Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes | | [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | | +| [QM7X](https://www.nature.com/articles/s41597-021-00812-2) | 6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes | From aff4de1160bff6c468bf928aa0655a98d13450a1 Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Thu, 5 Oct 2023 19:39:14 +0000 Subject: [PATCH 10/12] Updated README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 05ae11d..b3f769a 100644 --- a/README.md +++ b/README.md @@ -38,12 +38,12 @@ We provide support for the following publicly available QM Datasets. | [GEOM](https://www.nature.com/articles/s41597-022-01288-4) | 450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | No | | [Molecule3D](https://arxiv.org/abs/2110.01717) | 3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | No | | [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) | 1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | | -| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) | 665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | | +| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) | 665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | No | | [Spice](https://arxiv.org/abs/2209.10702) | 19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | Yes | | [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | 57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | Yes | | [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | 86,665 | | | No | | TPSSh-D3BJ/def2-SVP | | -| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | | -| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | | +| 
[DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | Yes | +| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | Yes | | [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes | | [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | | | [QM7X](https://www.nature.com/articles/s41597-021-00812-2) | 6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes | From 3fae2e370105322db27f96e2c84ede6ee1aa58b1 Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Sun, 8 Oct 2023 23:46:38 +0000 Subject: [PATCH 11/12] fixed 2 bugs in code --- src/openqdc/datasets/spice.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py index e273690..e00cdbc 100644 --- a/src/openqdc/datasets/spice.py +++ b/src/openqdc/datasets/spice.py @@ -18,10 +18,10 @@ def read_record(r): positions = r["conformations"][:] * BOHR2ANG res = dict( - smiles=np.array([smiles] * n_confs), + name=np.array([smiles] * n_confs), subset=np.array([Spice.subset_mapping[subset]] * n_confs), energies=r[Spice.energy_target_names[0]][:][:, None].astype(np.float32), - forces=r[Spice.force_target_names[0]][:].reshape(-1, 3, 1) / BOHR2ANG, + forces=r[Spice.force_target_names[0]][:].reshape(-1, 3, 1) / BOHR2ANG * (-1.0), # forces -ve of energy gradient atomic_inputs=np.concatenate( (x[None, ...].repeat(n_confs, axis=0), positions), axis=-1, dtype=np.float32 ).reshape(-1, 5), From dae4dae8b57bcf1e0f9ffc467c4c2f98a01c430e Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Wed, 11 Oct 2023 22:31:55 +0000 Subject: [PATCH 12/12] Added __init__ and qmugs remove hs fix --- src/openqdc/__init__.py | 0 src/openqdc/datasets/__init__.py | 7 ++++--- src/openqdc/datasets/qmugs.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) 
create mode 100644 src/openqdc/__init__.py diff --git a/src/openqdc/__init__.py b/src/openqdc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/openqdc/datasets/__init__.py b/src/openqdc/datasets/__init__.py index 7e1811e..9c17922 100644 --- a/src/openqdc/datasets/__init__.py +++ b/src/openqdc/datasets/__init__.py @@ -1,24 +1,25 @@ +from .ani import ANI1, ANI1CCX, ANI1X from .comp6 import COMP6 from .gdml import GDML from .geom import GEOM from .iso_17 import ISO17 from .molecule3d import Molecule3D -from .nabladft import NablaDFT from .orbnet_denali import OrbnetDenali from .qmugs import QMugs from .sn2_rxn import SN2RXN from .spice import Spice __all__ = [ + "ANI1", + "ANI1CCX", + "ANI1X", "Spice", "GEOM", "QMugs", - "NablaDFT", "ISO17", "COMP6", "GDML", "Molecule3D", - "NablaDFT", "OrbnetDenali", "QMugs", "SN2RXN", diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py index d15d83b..62bc3b0 100644 --- a/src/openqdc/datasets/qmugs.py +++ b/src/openqdc/datasets/qmugs.py @@ -12,7 +12,7 @@ def read_mol(mol_dir): filenames = glob(p_join(mol_dir, "*.sdf")) - mols = [dm.read_sdf(f)[0] for f in filenames] + mols = [dm.read_sdf(f, remove_hs=False)[0] for f in filenames] n_confs = len(mols) if len(mols) == 0: