diff --git a/README.md b/README.md index 9300362..b3f769a 100644 --- a/README.md +++ b/README.md @@ -19,3 +19,31 @@ You can run tests locally with: ```bash pytest ``` + +# Overview of Datasets + + + +We provide support for the following publicly available QM Datasets. + +| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations| +| --- | --- | --- | --- | --- | --- | --- | --- | +| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) | 450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | No | +| [Molecule3D](https://arxiv.org/abs/2110.01717) | 3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | No | +| [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) | 1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | | +| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) | 665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | No | +| [Spice](https://arxiv.org/abs/2209.10702) | 19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | Yes | +| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | 57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | Yes | +| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | 86,665 | | | No | | TPSSh-D3BJ/def2-SVP | | +| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | Yes | +| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | Yes | +| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes | +| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | | +| [QM7X](https://www.nature.com/articles/s41597-021-00812-2) | 6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes | diff --git a/src/openqdc/__init__.py b/src/openqdc/__init__.py new 
file mode 100644 index 0000000..e69de29 diff --git a/src/openqdc/datasets/__init__.py b/src/openqdc/datasets/__init__.py new file mode 100644 index 0000000..9c17922 --- /dev/null +++ b/src/openqdc/datasets/__init__.py @@ -0,0 +1,26 @@ +from .ani import ANI1, ANI1CCX, ANI1X +from .comp6 import COMP6 +from .gdml import GDML +from .geom import GEOM +from .iso_17 import ISO17 +from .molecule3d import Molecule3D +from .orbnet_denali import OrbnetDenali +from .qmugs import QMugs +from .sn2_rxn import SN2RXN +from .spice import Spice + +__all__ = [ + "ANI1", + "ANI1CCX", + "ANI1X", + "Spice", + "GEOM", + "QMugs", + "ISO17", + "COMP6", + "GDML", + "Molecule3D", + "OrbnetDenali", + "QMugs", + "SN2RXN", +] diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py index f0b3335..ee4bea5 100644 --- a/src/openqdc/datasets/ani.py +++ b/src/openqdc/datasets/ani.py @@ -9,6 +9,22 @@ class ANI1(BaseDataset): + """ + The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small + organic molecules with energy labels calculated using DFT. The molecules + contain 4 distinct atoms, C, N, O and H. + + Usage + ```python + from openqdc.datasets import ANI1 + dataset = ANI1() + ``` + + References: + - ANI-1: https://www.nature.com/articles/sdata2017193 + - Github: https://github.com/aiqm/ANI1x_datasets + """ + __name__ = "ani1" # Energy in hartree, all zeros by default @@ -42,6 +58,21 @@ def read_raw_entries(self): class ANI1CCX(ANI1): + """ + ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset. The selected + conformations are then labelled using a high accuracy CCSD(T)*/CBS method. 
+ + Usage + ```python + from openqdc.datasets import ANI1CCX + dataset = ANI1CCX() + ``` + + References: + - ANI-1ccx: https://doi.org/10.1038/s41467-019-10827-4 + - Github: https://github.com/aiqm/ANI1x_datasets + """ + __name__ = "ani1ccx" # Energy in hartree, all zeros by default @@ -69,6 +100,21 @@ def __init__(self) -> None: class ANI1X(ANI1): + """ + The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning which leads to + a total of 5,496,771 conformers with 63,865 unique molecules. + + Usage + ```python + from openqdc.datasets import ANI1X + dataset = ANI1X() + ``` + + References: + - ANI-1x: https://doi.org/10.1063/1.5023802 + - Github: https://github.com/aiqm/ANI1x_datasets + """ + __name__ = "ani1x" # Energy in hartree, all zeros by default diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 392144d..96e0f0c 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -1,7 +1,9 @@ import os from os.path import join as p_join +from typing import Dict, List, Optional import numpy as np +import pandas as pd import torch from loguru import logger from sklearn.utils import Bunch @@ -18,7 +20,13 @@ from openqdc.utils.molecule import atom_table -def extract_entry(df, i, subset, energy_target_names, force_target_names=None): +def extract_entry( + df: pd.DataFrame, + i: int, + subset: str, + energy_target_names: List[str], + force_target_names: Optional[List[str]] = None, +) -> Dict[str, np.ndarray]: x = np.array([atom_table.GetAtomicNumber(s) for s in df["symbols"][i]]) xs = np.stack((x, np.zeros_like(x)), axis=-1) positions = df["geometry"][i].reshape((-1, 3)) @@ -42,18 +50,12 @@ def extract_entry(df, i, subset, energy_target_names, force_target_names=None): return res -def read_qc_archive_h5(raw_path, subset, energy_target_names, force_target_names): +def read_qc_archive_h5( + raw_path: str, subset: str, energy_target_names: List[str], force_target_names: List[str] +) -> 
List[Dict[str, np.ndarray]]: data = load_hdf5_file(raw_path) data_t = {k2: data[k1][k2][:] for k1 in data.keys() for k2 in data[k1].keys()} n = len(data_t["molecule_id"]) - # print(f"Reading {n} entries from {raw_path}") - # for k in data_t: - # print(f"Loaded {k} with shape {data_t[k].shape}, dtype {data_t[k].dtype}") - # if "Energy" in k: - # print(np.isnan(data_t[k]).mean(), f"{data_t[k][0]}") - - # print('\n'*3) - # exit() samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))] return samples diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/comp6.py index 96811c9..0fc8cd3 100644 --- a/src/openqdc/datasets/comp6.py +++ b/src/openqdc/datasets/comp6.py @@ -7,6 +7,22 @@ class COMP6(BaseDataset): + """ + COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space + developed for testing the ANI-1x potential. It is curated from 6 benchmark sets: + S66x8, ANI Molecular Dynamics, GDB7to9, GDB10to13, DrugBank, and Tripeptides. + + Usage + ```python + from openqdc.datasets import COMP6 + dataset = COMP6() + ``` + + References: + - https://aip.scitation.org/doi/abs/10.1063/1.5023802 + - Github: https://github.com/isayev/COMP6 + """ + __name__ = "comp6" # Energy in hartree, all zeros by default diff --git a/src/openqdc/datasets/gdml.py b/src/openqdc/datasets/gdml.py index c0ca093..ab38b90 100644 --- a/src/openqdc/datasets/gdml.py +++ b/src/openqdc/datasets/gdml.py @@ -7,6 +7,29 @@ class GDML(BaseDataset): + """ + Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio + molecular dynamics (AIMD) trajectories. 
The dataset consists of: + - Benzene: 627000 samples + - Uracil: 133000 samples + - Naphthalene: 326000 samples + - Aspirin: 211000 samples + - Salicylic Acid: 320000 samples + - Malonaldehyde: 993000 samples + - Ethanol: 555000 samples + - Toluene: 100000 samples + + Usage + ```python + from openqdc.datasets import GDML + dataset = GDML() + ``` + + References: + - https://www.science.org/doi/10.1126/sciadv.1603015 + - http://www.sgdml.org/#datasets + """ + __name__ = "gdml" # Energy in hartree, all zeros by default diff --git a/src/openqdc/datasets/geom.py b/src/openqdc/datasets/geom.py index 6af826e..eebcc66 100644 --- a/src/openqdc/datasets/geom.py +++ b/src/openqdc/datasets/geom.py @@ -1,4 +1,5 @@ from os.path import join as p_join +from typing import Dict import datamol as dm import numpy as np @@ -9,7 +10,7 @@ from openqdc.utils.molecule import get_atomic_number_and_charge -def read_mol(mol_id, mol_dict, base_path, partition): +def read_mol(mol_id: str, mol_dict, base_path: str, partition: str) -> Dict[str, np.ndarray]: """Read molecule from pickle file and return dict with conformers and energies Parameters @@ -20,15 +21,18 @@ def read_mol(mol_id, mol_dict, base_path, partition): Dictionary containing the pickle_path and smiles of the molecule base_path: str Path to the folder containing the pickle files + partition: str + Name of the dataset partition, one of ['qm9', 'drugs'] Returns ------- res: dict Dictionary containing the following keys: - - atomic_inputs: flatten np.ndarray of shape (M, 4) containing the atomic numbers and positions - - smiles: np.ndarray of shape (N,) containing the smiles of the molecule - - energies: np.ndarray of shape (N,1) containing the energies of the conformers - - n_atoms: np.ndarray of shape (N,) containing the number of atoms in each conformer + - atomic_inputs: flatten np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions + - smiles: np.ndarray of shape (N,) containing the smiles of the molecule + 
- energies: np.ndarray of shape (N,1) containing the energies of the conformers + - n_atoms: np.ndarray of shape (N,) containing the number of atoms in each conformer + - subset: np.ndarray of shape (N,) containing the name of the dataset partition """ try: @@ -56,6 +60,22 @@ def read_mol(mol_id, mol_dict, base_path, partition): class GEOM(BaseDataset): + """ + The Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules + from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, + and physical chemistry. The dataset is generated using the GFN2-xTB semi-empirical method. + + Usage: + ```python + from openqdc.datasets import GEOM + dataset = GEOM() + ``` + + References: + - https://www.nature.com/articles/s41597-022-01288-4 + - https://github.com/learningmatter-mit/geom + """ + __name__ = "geom" __energy_methods__ = ["gfn2_xtb"] diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/iso_17.py index 55f395c..a26f382 100644 --- a/src/openqdc/datasets/iso_17.py +++ b/src/openqdc/datasets/iso_17.py @@ -7,6 +7,23 @@ class ISO17(BaseDataset): + """ + ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed + composition of atoms (C7O2H10) arranged in different chemically valid structures. It consists + of 129 molecules each containing 5,000 conformational geometries, energies and forces with a resolution + of 1 femtosecond in the molecular dynamics trajectories. The simulations were carried out using the + Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der Waals correction method. 
+ + Usage: + ```python + from openqdc.datasets import ISO17 + dataset = ISO17() + ``` + + References: + - https://paperswithcode.com/dataset/iso17 + """ + __name__ = "iso_17" # Energy in hartree, all zeros by default diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/molecule3d.py index 0d59400..e5870ca 100644 --- a/src/openqdc/datasets/molecule3d.py +++ b/src/openqdc/datasets/molecule3d.py @@ -1,5 +1,6 @@ from glob import glob from os.path import join as p_join +from typing import Dict, List import datamol as dm import numpy as np @@ -12,7 +13,26 @@ from openqdc.utils.molecule import get_atomic_number_and_charge -def read_mol(mol, energy): +def read_mol(mol: Chem.rdchem.Mol, energy: float) -> Dict[str, np.ndarray]: + """Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies + + Parameters + ---------- + mol: Chem.rdchem.Mol + RDKit molecule + energy: float + Energy of the molecule + + Returns + ------- + res: dict + Dictionary containing the following keys: + - name: np.ndarray of shape (N,) containing the smiles of the molecule + - atomic_inputs: flatten np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions + - energies: np.ndarray of shape (1,) containing the energy of the conformer + - n_atoms: np.ndarray of shape (1) containing the number of atoms in the conformer + - subset: np.ndarray of shape (1) containing "molecule3d" + """ smiles = dm.to_smiles(mol, explicit_hs=False) # subset = dm.to_smiles(dm.to_scaffold_murcko(mol, make_generic=True), explicit_hs=False) x = get_atomic_number_and_charge(mol) @@ -29,7 +49,8 @@ def read_mol(mol, energy): return res -def _read_sdf(sdf_path, properties_path): +def _read_sdf(sdf_path: str, properties_path: str) -> List[Dict[str, np.ndarray]]: + """Reads the sdf path and properties file.""" properties = pd.read_csv(properties_path, dtype={"cid": str}) properties.drop_duplicates(subset="cid", inplace=True, keep="first") xys = 
properties[["cid", "scf energy"]] @@ -45,6 +66,22 @@ def _read_sdf(sdf_path, properties_path): class Molecule3D(BaseDataset): + """ + Molecule3D dataset consists of 3,899,647 molecules with ground state geometries and energies + calculated at B3LYP/6-31G* level of theory. The molecules are extracted from the + PubChem database and cleaned by removing invalid molecule files. + + Usage: + ```python + from openqdc.datasets import Molecule3D + dataset = Molecule3D() + ``` + + References: + - https://arxiv.org/abs/2110.01717 + - https://github.com/divelab/MoleculeX + """ + __name__ = "molecule3d" __energy_methods__ = ["b3lyp_6-31g*"] diff --git a/src/openqdc/datasets/nabladft.py b/src/openqdc/datasets/nabladft.py index 3234011..d5e55d2 100644 --- a/src/openqdc/datasets/nabladft.py +++ b/src/openqdc/datasets/nabladft.py @@ -1,5 +1,6 @@ import os from os.path import join as p_join +from typing import Dict import datamol as dm import numpy as np @@ -10,7 +11,7 @@ from openqdc.utils.constants import MAX_ATOMIC_NUMBER -def to_mol(entry): +def to_mol(entry) -> Dict[str, np.ndarray]: Z, R, E, F = entry[:4] C = np.zeros_like(Z) @@ -37,6 +38,22 @@ def read_chunk_from_db(raw_path, start_idx, stop_idx, step_size=1000): class NablaDFT(BaseDataset): + """ + NablaDFT is a dataset constructed from a subset of the + [Molecular Sets (MOSES) dataset](https://github.com/molecularsets/moses) consisting of 1 million molecules + with 5,340,152 unique conformations generated using ωB97X-D/def2-SVP level of theory. 
+ + Usage: + ```python + from openqdc.datasets import NablaDFT + dataset = NablaDFT() + ``` + + References: + - https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D + - https://github.com/AIRI-Institute/nablaDFT + """ + __name__ = "nabladft" __energy_methods__ = ["wb97x-d_svp"] diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/orbnet_denali.py index 5e44263..1656e3b 100644 --- a/src/openqdc/datasets/orbnet_denali.py +++ b/src/openqdc/datasets/orbnet_denali.py @@ -1,4 +1,5 @@ from os.path import join as p_join +from typing import Dict, List import datamol as dm import numpy as np @@ -9,7 +10,7 @@ from openqdc.utils.molecule import atom_table -def read_archive(mol_id, conf_dict, base_path, energy_target_names): +def read_archive(mol_id, conf_dict, base_path, energy_target_names: List[str]) -> Dict[str, np.ndarray]: res = [] for conf_id, conf_label in conf_dict.items(): try: @@ -34,6 +35,23 @@ def read_archive(mol_id, conf_dict, base_path, energy_target_names): class OrbnetDenali(BaseDataset): + """ + Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. It performs + DFT (ωB97X-D3/def2-TZVP) calculations on molecules and geometries consisting of organic molecules + and chemistries, with protonation and tautomeric states, non-covalent interactions, common salts, + and counterions, spanning the most common elements in bio and organic chemistry. 
+ + Usage: + ```python + from openqdc.datasets import OrbnetDenali + dataset = OrbnetDenali() + ``` + + References: + - https://arxiv.org/pdf/2107.00299.pdf + - https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867 + """ + __name__ = "orbnet_denali" __energy_methods__ = ["wb97x-d3_tz", "gfn1_xtb"] diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py index 6fc468b..62bc3b0 100644 --- a/src/openqdc/datasets/qmugs.py +++ b/src/openqdc/datasets/qmugs.py @@ -36,6 +36,22 @@ def read_mol(mol_dir): class QMugs(BaseDataset): + """ + The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules + extracted from the ChEMBL database. The atomic and molecular properties are calculated using both + a semi-empirical method (GFN2-xTB) and a DFT method (ωB97X-D/def2-SVP). + + Usage: + ```python + from openqdc.datasets import QMugs + dataset = QMugs() + ``` + + References: + - https://www.nature.com/articles/s41597-022-01390-7#ethics + - https://www.research-collection.ethz.ch/handle/20.500.11850/482129 + """ + __name__ = "qmugs" __energy_methods__ = ["gfn2_xtb", "b3lyp/6-31g*"] diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py index 88af6dc..e00cdbc 100644 --- a/src/openqdc/datasets/spice.py +++ b/src/openqdc/datasets/spice.py @@ -18,10 +18,10 @@ def read_record(r): positions = r["conformations"][:] * BOHR2ANG res = dict( - smiles=np.array([smiles] * n_confs), + name=np.array([smiles] * n_confs), subset=np.array([Spice.subset_mapping[subset]] * n_confs), energies=r[Spice.energy_target_names[0]][:][:, None].astype(np.float32), - forces=r[Spice.force_target_names[0]][:].reshape(-1, 3, 1) / BOHR2ANG, + forces=r[Spice.force_target_names[0]][:].reshape(-1, 3, 1) / BOHR2ANG * (-1.0), # forces -ve of energy gradient atomic_inputs=np.concatenate( (x[None, ...].repeat(n_confs, axis=0), positions), axis=-1, dtype=np.float32 ).reshape(-1, 5), @@ -32,8 +32,25 @@ def 
read_record(r): class Spice(BaseDataset): + """ + Spice Dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of + small molecules, dimers, dipeptides, and solvated amino acids. It consists of both forces and energies calculated + at the ωB97M-D3(BJ)/def2-TZVPPD level of theory. + + Usage: + ```python + from openqdc.datasets import Spice + dataset = Spice() + ``` + + References: + - https://arxiv.org/abs/2209.10702 + - https://github.com/openmm/spice-dataset + """ + __name__ = "spice" __energy_methods__ = ["wb97x_tz"] + __force_methods__ = ["wb97x_tz"] energy_target_names = ["dft_total_energy"]