From 821eecc7dfb6212c83f8fab67ea07b708f6fbe7e Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com>
Date: Wed, 4 Oct 2023 14:31:43 +0000
Subject: [PATCH 01/12] Added ani docs

---
 src/openqdc/datasets/ani.py | 47 +++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py
index f0b3335..aaa2f8b 100644
--- a/src/openqdc/datasets/ani.py
+++ b/src/openqdc/datasets/ani.py
@@ -9,6 +9,24 @@
 
 
 class ANI1(BaseDataset):
+    """
+    The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small
+    organic molecules with energy labels calculated using DFT. The molecules
+    contain 4 distinct atoms, C, N, O and H.
+
+    Usage
+    ```python
+    from openqdc.datasets import ANI1
+    dataset = ANI1()
+    ```
+
+    References:
+    - ANI-1x: https://doi.org/10.1063/1.5023802
+    - ANI-1ccx: https://doi.org/10.1038/s41467-019-10827-4
+    - wB97x/def2-TZVPP data: https://doi.org/10.1126/sciadv.aav6490
+    - Github: https://github.com/aiqm/ANI1x_datasets
+    """
+
     __name__ = "ani1"
 
     # Energy in hartree, all zeros by default
@@ -42,6 +60,20 @@ def read_raw_entries(self):
 
 
 class ANI1CCX(ANI1):
+    """
+
+
+    Usage
+    ```python
+    from openqdc.datasets import ANI1CCX
+    dataset = ANI1CCX()
+    ```
+
+    References:
+    - ANI-1ccx: https://doi.org/10.1038/s41467-019-10827-4
+    - Github: https://github.com/aiqm/ANI1x_datasets
+    """
+
     __name__ = "ani1ccx"
 
     # Energy in hartree, all zeros by default
@@ -69,6 +101,21 @@ def __init__(self) -> None:
 
 
 class ANI1X(ANI1):
+    """
+    The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning which leads to
+    a total of 5,496,771 conformers with 63,865 unique molecules.
+
+    Usage
+    ```python
+    from openqdc.datasets import ANI1X
+    dataset = ANI1X()
+    ```
+
+    References:
+    - ANI-1x: https://doi.org/10.1063/1.5023802
+    - Github: https://github.com/aiqm/ANI1x_datasets
+    """
+
     __name__ = "ani1x"
 
     # Energy in hartree, all zeros by default

From 5df2d5e2ed4f5b54cb34f6315e480c86f7e8ea54 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com>
Date: Wed, 4 Oct 2023 14:54:58 +0000
Subject: [PATCH 02/12] Added docs for ani, comp6 and gdml

---
 src/openqdc/datasets/ani.py   |  4 +---
 src/openqdc/datasets/comp6.py | 16 ++++++++++++++++
 src/openqdc/datasets/gdml.py  | 23 +++++++++++++++++++++++
 3 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py
index aaa2f8b..ff911ff 100644
--- a/src/openqdc/datasets/ani.py
+++ b/src/openqdc/datasets/ani.py
@@ -21,9 +21,7 @@ class ANI1(BaseDataset):
     ```
 
     References:
-    - ANI-1x: https://doi.org/10.1063/1.5023802
-    - ANI-1ccx: https://doi.org/10.1038/s41467-019-10827-4
-    - wB97x/def2-TZVPP data: https://doi.org/10.1126/sciadv.aav6490
+    - ANI-1: https://www.nature.com/articles/sdata2017193
     - Github: https://github.com/aiqm/ANI1x_datasets
     """
 
diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/comp6.py
index 96811c9..0fc8cd3 100644
--- a/src/openqdc/datasets/comp6.py
+++ b/src/openqdc/datasets/comp6.py
@@ -7,6 +7,22 @@
 
 
 class COMP6(BaseDataset):
+    """
+    COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space
+    developed for testing the ANI-1x potential. It is curated from 6 benchmark sets:
+    S66x8, ANI Molecular Dynamics, GDB7to9, GDB10to13, DrugBank, and Tripeptides.
+
+    Usage
+    ```python
+    from openqdc.datasets import COMP6
+    dataset = COMP6()
+    ```
+
+    References:
+    - https://aip.scitation.org/doi/abs/10.1063/1.5023802
+    - Github: https://github.com/isayev/COMP6
+    """
+
     __name__ = "comp6"
 
     # Energy in hartree, all zeros by default
diff --git a/src/openqdc/datasets/gdml.py b/src/openqdc/datasets/gdml.py
index c0ca093..ab38b90 100644
--- a/src/openqdc/datasets/gdml.py
+++ b/src/openqdc/datasets/gdml.py
@@ -7,6 +7,29 @@
 
 
 class GDML(BaseDataset):
+    """
+    Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio
+    molecular dynamics (AIMD) trajectories. The dataset consists of:
+    - Benzene: 627000 samples
+    - Uracil: 133000 samples
+    - Naphthalene: 326000 samples
+    - Aspirin: 211000 samples
+    - Salicylic Acid: 320000 samples
+    - Malonaldehyde: 993000 samples
+    - Ethanol: 555000 samples
+    - Toluene: 100000 samples
+
+    Usage
+    ```python
+    from openqdc.datasets import GDML
+    dataset = GDML()
+    ```
+
+    References:
+    - https://www.science.org/doi/10.1126/sciadv.1603015
+    - http://www.sgdml.org/#datasets
+    """
+
     __name__ = "gdml"
 
     # Energy in hartree, all zeros by default

From 7045e6e7abff1db2c442aa2286eae52d503510ed Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com>
Date: Wed, 4 Oct 2023 14:58:21 +0000
Subject: [PATCH 03/12] Added ani docs

---
 src/openqdc/datasets/ani.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py
index ff911ff..ee4bea5 100644
--- a/src/openqdc/datasets/ani.py
+++ b/src/openqdc/datasets/ani.py
@@ -59,7 +59,8 @@ def read_raw_entries(self):
 
 class ANI1CCX(ANI1):
     """
-
+    ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of the ANI-1X dataset. The selected
+    conformations are then labelled using a high accuracy CCSD(T)*/CBS method.
 
     Usage
     ```python

From 99a3506edfbb34805b5422ac40be2584052a7f50 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com>
Date: Thu, 5 Oct 2023 18:15:45 +0000
Subject: [PATCH 04/12] Updated docs for geom, molecule3d, orbnet_denali, qmugs

---
 src/openqdc/datasets/base.py          | 22 +++++++-------
 src/openqdc/datasets/geom.py          | 30 ++++++++++++++++----
 src/openqdc/datasets/molecule3d.py    | 41 +++++++++++++++++++++++++--
 src/openqdc/datasets/orbnet_denali.py | 27 ++++++++++++------
 src/openqdc/datasets/qmugs.py         | 16 +++++++++++
 5 files changed, 111 insertions(+), 25 deletions(-)

diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py
index 392144d..96e0f0c 100644
--- a/src/openqdc/datasets/base.py
+++ b/src/openqdc/datasets/base.py
@@ -1,7 +1,9 @@
 import os
 from os.path import join as p_join
+from typing import Dict, List, Optional
 
 import numpy as np
+import pandas as pd
 import torch
 from loguru import logger
 from sklearn.utils import Bunch
@@ -18,7 +20,13 @@
 from openqdc.utils.molecule import atom_table
 
 
-def extract_entry(df, i, subset, energy_target_names, force_target_names=None):
+def extract_entry(
+    df: pd.DataFrame,
+    i: int,
+    subset: str,
+    energy_target_names: List[str],
+    force_target_names: Optional[List[str]] = None,
+) -> Dict[str, np.ndarray]:
     x = np.array([atom_table.GetAtomicNumber(s) for s in df["symbols"][i]])
     xs = np.stack((x, np.zeros_like(x)), axis=-1)
     positions = df["geometry"][i].reshape((-1, 3))
@@ -42,18 +50,12 @@ def extract_entry(df, i, subset, energy_target_names, force_target_names=None):
     return res
 
 
-def read_qc_archive_h5(raw_path, subset, energy_target_names, force_target_names):
+def read_qc_archive_h5(
+    raw_path: str, subset: str, energy_target_names: List[str], force_target_names: List[str]
+) -> List[Dict[str, np.ndarray]]:
     data = load_hdf5_file(raw_path)
     data_t = {k2: data[k1][k2][:] for k1 in data.keys() for k2 in data[k1].keys()}
     n = len(data_t["molecule_id"])
-    # print(f"Reading {n} entries from {raw_path}")
-    # for k in data_t:
-    #     print(f"Loaded {k} with shape {data_t[k].shape}, dtype {data_t[k].dtype}")
-    #     if "Energy" in k:
-    #         print(np.isnan(data_t[k]).mean(), f"{data_t[k][0]}")
-
-    # print('\n'*3)
-    # exit()
 
     samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))]
     return samples
diff --git a/src/openqdc/datasets/geom.py b/src/openqdc/datasets/geom.py
index 6af826e..eebcc66 100644
--- a/src/openqdc/datasets/geom.py
+++ b/src/openqdc/datasets/geom.py
@@ -1,4 +1,5 @@
 from os.path import join as p_join
+from typing import Dict
 
 import datamol as dm
 import numpy as np
@@ -9,7 +10,7 @@
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
 
-def read_mol(mol_id, mol_dict, base_path, partition):
+def read_mol(mol_id: str, mol_dict, base_path: str, partition: str) -> Dict[str, np.ndarray]:
     """Read molecule from pickle file and return dict with conformers and energies
 
     Parameters
@@ -20,15 +21,18 @@ def read_mol(mol_id, mol_dict, base_path, partition):
         Dictionary containing the pickle_path and smiles of the molecule
     base_path: str
         Path to the folder containing the pickle files
+    partition: str
+        Name of the dataset partition, one of ['qm9', 'drugs']
 
     Returns
     -------
     res: dict
         Dictionary containing the following keys:
-            - atomic_inputs: flatten np.ndarray of shape (M, 4) containing the atomic numbers and positions
-            - smiles: np.ndarray of shape (N,) containing the smiles of the molecule
-            - energies: np.ndarray of shape (N,1) containing the energies of the conformers
-            - n_atoms: np.ndarray of shape (N,) containing the number of atoms in each conformer
+        - atomic_inputs: flattened np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions
+        - smiles: np.ndarray of shape (N,) containing the smiles of the molecule
+        - energies: np.ndarray of shape (N,1) containing the energies of the conformers
+        - n_atoms: np.ndarray of shape (N,) containing the number of atoms in each conformer
+        - subset: np.ndarray of shape (N,) containing the name of the dataset partition
     """
 
     try:
@@ -56,6 +60,22 @@ def read_mol(mol_id, mol_dict, base_path, partition):
 
 
 class GEOM(BaseDataset):
+    """
+    The Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules
+    from QM9, and 317,000 molecules with experimental data related to biophysics, physiology,
+    and physical chemistry. The dataset is generated using the GFN2-xTB semi-empirical method.
+
+    Usage:
+    ```python
+    from openqdc.datasets import GEOM
+    dataset = GEOM()
+    ```
+
+    References:
+    - https://www.nature.com/articles/s41597-022-01288-4
+    - https://github.com/learningmatter-mit/geom
+    """
+
     __name__ = "geom"
     __energy_methods__ = ["gfn2_xtb"]
 
diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/molecule3d.py
index 0d59400..e5870ca 100644
--- a/src/openqdc/datasets/molecule3d.py
+++ b/src/openqdc/datasets/molecule3d.py
@@ -1,5 +1,6 @@
 from glob import glob
 from os.path import join as p_join
+from typing import Dict, List
 
 import datamol as dm
 import numpy as np
@@ -12,7 +13,26 @@
 from openqdc.utils.molecule import get_atomic_number_and_charge
 
 
-def read_mol(mol, energy):
+def read_mol(mol: Chem.rdchem.Mol, energy: float) -> Dict[str, np.ndarray]:
+    """Read molecule (Chem.rdchem.Mol) and energy (float) and return dict with conformers and energies
+
+    Parameters
+    ----------
+    mol: Chem.rdchem.Mol
+        RDKit molecule
+    energy: float
+        Energy of the molecule
+
+    Returns
+    -------
+    res: dict
+        Dictionary containing the following keys:
+        - name: np.ndarray of shape (N,) containing the smiles of the molecule
+        - atomic_inputs: flattened np.ndarray of shape (M, 5) containing the atomic numbers, charges and positions
+        - energies: np.ndarray of shape (1,) containing the energy of the conformer
+        - n_atoms: np.ndarray of shape (1,) containing the number of atoms in the conformer
+        - subset: np.ndarray of shape (1,) containing "molecule3d"
+    """
     smiles = dm.to_smiles(mol, explicit_hs=False)
     # subset = dm.to_smiles(dm.to_scaffold_murcko(mol, make_generic=True), explicit_hs=False)
     x = get_atomic_number_and_charge(mol)
@@ -29,7 +49,8 @@ def read_mol(mol, energy):
     return res
 
 
-def _read_sdf(sdf_path, properties_path):
+def _read_sdf(sdf_path: str, properties_path: str) -> List[Dict[str, np.ndarray]]:
+    """Reads the sdf path and properties file."""
     properties = pd.read_csv(properties_path, dtype={"cid": str})
     properties.drop_duplicates(subset="cid", inplace=True, keep="first")
     xys = properties[["cid", "scf energy"]]
@@ -45,6 +66,22 @@ def _read_sdf(sdf_path, properties_path):
 
 
 class Molecule3D(BaseDataset):
+    """
+    Molecule3D dataset consists of 3,899,647 molecules with ground state geometries and energies
+    calculated at B3LYP/6-31G* level of theory. The molecules are extracted from the
+    PubChem database and cleaned by removing invalid molecule files.
+
+    Usage:
+    ```python
+    from openqdc.datasets import Molecule3D
+    dataset = Molecule3D()
+    ```
+
+    References:
+    - https://arxiv.org/abs/2110.01717
+    - https://github.com/divelab/MoleculeX
+    """
+
     __name__ = "molecule3d"
     __energy_methods__ = ["b3lyp_6-31g*"]
 
diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/orbnet_denali.py
index 452cce1..2d8b093 100644
--- a/src/openqdc/datasets/orbnet_denali.py
+++ b/src/openqdc/datasets/orbnet_denali.py
@@ -1,4 +1,5 @@
 from os.path import join as p_join
+from typing import Dict, List
 
 import datamol as dm
 import numpy as np
@@ -9,7 +10,7 @@
 from openqdc.utils.molecule import atom_table
 
 
-def read_mol(mol_id, conf_dict, base_path, energy_target_names):
+def read_mol(mol_id, conf_dict, base_path, energy_target_names: List[str]) -> Dict[str, np.ndarray]:
     res = []
     for conf_id, conf_label in conf_dict.items():
         try:
@@ -34,6 +35,23 @@ def read_mol(mol_id, conf_dict, base_path, energy_target_names):
 
 
 class OrbnetDenali(BaseDataset):
+    """
+    Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. It performs
+    DFT (ωB97X-D3/def2-TZVP) calculations on molecules and geometries consisting of organic molecules
+    and chemistries, with protonation and tautomeric states, non-covalent interactions, common salts,
+    and counterions, spanning the most common elements in bio and organic chemistry.
+
+    Usage:
+    ```python
+    from openqdc.datasets import OrbnetDenali
+    dataset = OrbnetDenali()
+    ```
+
+    References:
+    - https://arxiv.org/pdf/2107.00299.pdf
+    - https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867
+    """
+
     __name__ = "orbnet_denali"
     __energy_methods__ = ["wb97x-d3_tz", "gfn1_xtb"]
 
@@ -53,13 +71,6 @@ def read_raw_entries(self):
             for mol_id, group in df.groupby("mol_id")
         }
 
-        # print(df.head())
-        # tmp = df.to_dict('index')
-        # for i, k in enumerate(tmp):
-        #     print(k, tmp[k])
-        #     if i > 10:
-        #         break
-        # exit()
         fn = lambda x: read_mol(x[0], x[1], self.root, self.energy_target_names)
         res = dm.parallelized(fn, list(labels.items()), scheduler="threads", n_jobs=-1, progress=True)
         samples = sum(res, [])
diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py
index b528f42..d15d83b 100644
--- a/src/openqdc/datasets/qmugs.py
+++ b/src/openqdc/datasets/qmugs.py
@@ -36,6 +36,22 @@ def read_mol(mol_dir):
 
 
 class QMugs(BaseDataset):
+    """
+    The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules
+    extracted from the ChEMBL database. The atomic and molecular properties are calculated using both
+    semi-empirical methods (GFN2-xTB) and DFT method (ωB97X-D/def2-SVP).
+
+    Usage:
+    ```python
+    from openqdc.datasets import QMugs
+    dataset = QMugs()
+    ```
+
+    References:
+    - https://www.nature.com/articles/s41597-022-01390-7
+    - https://www.research-collection.ethz.ch/handle/20.500.11850/482129
+    """
+
     __name__ = "qmugs"
     __energy_methods__ = ["gfn2_xtb", "b3lyp/6-31g*"]
 

From bf3c08a970332ecaaf7041a8142dc2654fabc01a Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com>
Date: Thu, 5 Oct 2023 19:14:48 +0000
Subject: [PATCH 05/12] Updated docs for spice, iso17, nabladft

---
 src/openqdc/datasets/__init__.py | 25 +++++++++++++++++++++++++
 src/openqdc/datasets/iso_17.py   | 17 +++++++++++++++++
 src/openqdc/datasets/nabladft.py | 19 ++++++++++++++++++-
 src/openqdc/datasets/spice.py    | 17 +++++++++++++++++
 4 files changed, 77 insertions(+), 1 deletion(-)
 create mode 100644 src/openqdc/datasets/__init__.py

diff --git a/src/openqdc/datasets/__init__.py b/src/openqdc/datasets/__init__.py
new file mode 100644
index 0000000..7e1811e
--- /dev/null
+++ b/src/openqdc/datasets/__init__.py
@@ -0,0 +1,25 @@
+from .comp6 import COMP6
+from .gdml import GDML
+from .geom import GEOM
+from .iso_17 import ISO17
+from .molecule3d import Molecule3D
+from .nabladft import NablaDFT
+from .orbnet_denali import OrbnetDenali
+from .qmugs import QMugs
+from .sn2_rxn import SN2RXN
+from .spice import Spice
+
+__all__ = [
+    "Spice",
+    "GEOM",
+    "QMugs",
+    "NablaDFT",
+    "ISO17",
+    "COMP6",
+    "GDML",
+    "Molecule3D",
+    "NablaDFT",
+    "OrbnetDenali",
+    "QMugs",
+    "SN2RXN",
+]
diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/iso_17.py
index 55f395c..a26f382 100644
--- a/src/openqdc/datasets/iso_17.py
+++ b/src/openqdc/datasets/iso_17.py
@@ -7,6 +7,23 @@
 
 
 class ISO17(BaseDataset):
+    """
+    ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed
+    composition of atoms (C7O2H10) arranged in different chemically valid structures. It consists
+    of 129 molecules each containing 5,000 conformational geometries, energies and forces with a resolution
+    of 1 femtosecond in the molecular dynamics trajectories. The simulations were carried out using the
+    Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der Waals correction method.
+
+    Usage:
+    ```python
+    from openqdc.datasets import ISO17
+    dataset = ISO17()
+    ```
+
+    References:
+    - https://paperswithcode.com/dataset/iso17
+    """
+
     __name__ = "iso_17"
 
     # Energy in hartree, all zeros by default
diff --git a/src/openqdc/datasets/nabladft.py b/src/openqdc/datasets/nabladft.py
index 3234011..d5e55d2 100644
--- a/src/openqdc/datasets/nabladft.py
+++ b/src/openqdc/datasets/nabladft.py
@@ -1,5 +1,6 @@
 import os
 from os.path import join as p_join
+from typing import Dict
 
 import datamol as dm
 import numpy as np
@@ -10,7 +11,7 @@
 from openqdc.utils.constants import MAX_ATOMIC_NUMBER
 
 
-def to_mol(entry):
+def to_mol(entry) -> Dict[str, np.ndarray]:
     Z, R, E, F = entry[:4]
     C = np.zeros_like(Z)
 
@@ -37,6 +38,22 @@ def read_chunk_from_db(raw_path, start_idx, stop_idx, step_size=1000):
 
 
 class NablaDFT(BaseDataset):
+    """
+    NablaDFT is a dataset constructed from a subset of the
+    [Molecular Sets (MOSES) dataset](https://github.com/molecularsets/moses) consisting of 1 million molecules
+    with 5,340,152 unique conformations generated using ωB97X-D/def2-SVP level of theory.
+
+    Usage:
+    ```python
+    from openqdc.datasets import NablaDFT
+    dataset = NablaDFT()
+    ```
+
+    References:
+    - https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D
+    - https://github.com/AIRI-Institute/nablaDFT
+    """
+
     __name__ = "nabladft"
     __energy_methods__ = ["wb97x-d_svp"]
 
diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py
index 88af6dc..e273690 100644
--- a/src/openqdc/datasets/spice.py
+++ b/src/openqdc/datasets/spice.py
@@ -32,8 +32,25 @@ def read_record(r):
 
 
 class Spice(BaseDataset):
+    """
+    Spice Dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of
+    small molecules, dimers, dipeptides, and solvated amino acids. It consists of both forces and energies calculated
+    at ωB97M-D3(BJ)/def2-TZVPPD level of theory.
+
+    Usage:
+    ```python
+    from openqdc.datasets import Spice
+    dataset = Spice()
+    ```
+
+    References:
+    - https://arxiv.org/abs/2209.10702
+    - https://github.com/openmm/spice-dataset
+    """
+
     __name__ = "spice"
     __energy_methods__ = ["wb97x_tz"]
+    __force_methods__ = ["wb97x_tz"]
 
     energy_target_names = ["dft_total_energy"]
 

From 7eb107f3b7d23f70f022e9670a934554574112d7 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com>
Date: Thu, 5 Oct 2023 19:17:03 +0000
Subject: [PATCH 06/12] Added README from other branch

---
 README.md | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/README.md b/README.md
index 9300362..5ac528b 100644
--- a/README.md
+++ b/README.md
@@ -19,3 +19,30 @@ You can run tests locally with:
 ```bash
 pytest
 ```
+
+# Overview of Datasets
+
+<!-- Create a table with the following columns
+1. Name of Dataset (with reference of paper) [Dataset Name](paper link)
+2. Number of Molecules
+3. Number of Conformers
+4. Average Conformer to Molecule Ratio (in 2 lines)
+5. Labels
+6. QM Level of Theory
+ -->
+
+We provide support for the following publicly available QM Datasets.
+
+| Dataset | Description | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) | |  450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB |
+| [Molecule3D](https://arxiv.org/abs/2110.01717) | |  3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* |
+| [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) |  |  1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP |
+| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) | |  665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP |
+| [Spice](https://arxiv.org/abs/2209.10702) | |  19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD |
+| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | |  57,462 | 348 | 20,000,000 | No | 4 | ωB97x:6-31G(d) |
+| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | |  86,665 | | | No |  | TPSSh-D3BJ/def2-SVP |
+| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) |
+| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 |
+| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB
+| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | Probes chemical reactions of methyl halides with halide anions of the kind $X^- + H_3C-Y \to X-CH_3 + Y^{-1}$| 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP |

From ee19191cc5c25f6df76ef1c6184bbb79dff43b23 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com>
Date: Thu, 5 Oct 2023 19:18:43 +0000
Subject: [PATCH 07/12] Updated README

---
 README.md | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 5ac528b..ac969a2 100644
--- a/README.md
+++ b/README.md
@@ -33,16 +33,16 @@ pytest
 
 We provide support for the following publicly available QM Datasets.
 
-| Dataset | Description | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory |
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) | |  450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB |
-| [Molecule3D](https://arxiv.org/abs/2110.01717) | |  3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* |
-| [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) |  |  1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP |
-| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) | |  665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP |
-| [Spice](https://arxiv.org/abs/2209.10702) | |  19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD |
-| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | |  57,462 | 348 | 20,000,000 | No | 4 | ωB97x:6-31G(d) |
-| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | |  86,665 | | | No |  | TPSSh-D3BJ/def2-SVP |
-| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) |
-| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 |
-| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB
-| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | Probes chemical reactions of methyl halides with halide anions of the kind $X^- + H_3C-Y \to X-CH_3 + Y^{-1}$| 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP |
+| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory |
+| --- | --- | --- | --- | --- | --- | --- |
+| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) |  450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB |
+| [Molecule3D](https://arxiv.org/abs/2110.01717) |  3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* |
+| [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) |  1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP |
+| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) |  665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP |
+| [Spice](https://arxiv.org/abs/2209.10702) |  19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD |
+| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) |  57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) |
+| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) |  86,665 | | No |  | TPSSh-D3BJ/def2-SVP |
+| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) |
+| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 |
+| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB
+| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP |

From a665a132d248d6d37378fdf5c51e3359d6b962af Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com>
Date: Thu, 5 Oct 2023 19:30:42 +0000
Subject: [PATCH 08/12] Added qm7x to the readme

---
 README.md | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index ac969a2..560a613 100644
--- a/README.md
+++ b/README.md
@@ -33,16 +33,16 @@ pytest
 
 We provide support for the following publicly available QM Datasets.
 
-| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory |
-| --- | --- | --- | --- | --- | --- | --- |
-| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) |  450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB |
-| [Molecule3D](https://arxiv.org/abs/2110.01717) |  3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* |
-| [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) |  1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP |
-| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) |  665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP |
-| [Spice](https://arxiv.org/abs/2209.10702) |  19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD |
-| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) |  57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) |
-| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) |  86,665 | | No |  | TPSSh-D3BJ/def2-SVP |
-| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) |
-| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 |
-| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB
-| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP |
+| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations|
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) |  450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB |  |
+| [Molecule3D](https://arxiv.org/abs/2110.01717) |  3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | |
+| [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) |  1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | |
+| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) |  665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | |
+| [Spice](https://arxiv.org/abs/2209.10702) |  19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | |
+| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) |  57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | |
+| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) |  86,665 | No |  |  | | TPSSh-D3BJ/def2-SVP | |
+| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | |
+| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | |
+| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | |
+| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | |

From a639f20f2495e57a54b832497451050181ec3a08 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com>
Date: Thu, 5 Oct 2023 19:32:09 +0000
Subject: [PATCH 09/12] Added qm7x to the readme

---
 README.md | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 560a613..05ae11d 100644
--- a/README.md
+++ b/README.md
@@ -35,14 +35,15 @@ We provide support for the following publicly available QM Datasets.
 
 | Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations|
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) |  450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB |  |
-| [Molecule3D](https://arxiv.org/abs/2110.01717) |  3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | |
+| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) |  450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | No |
+| [Molecule3D](https://arxiv.org/abs/2110.01717) |  3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | No |
 | [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) |  1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | |
 | [QMugs](https://www.nature.com/articles/s41597-022-01390-7) |  665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | |
-| [Spice](https://arxiv.org/abs/2209.10702) |  19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | |
-| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) |  57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | |
-| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) |  86,665 | No |  |  | | TPSSh-D3BJ/def2-SVP | |
+| [Spice](https://arxiv.org/abs/2209.10702) |  19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | Yes |
+| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) |  57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | Yes |
+| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) |  86,665 | |  | No | | TPSSh-D3BJ/def2-SVP | |
 | [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | |
 | [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | |
-| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | |
+| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes |
 | [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | |
+| [QM7X](https://www.nature.com/articles/s41597-021-00812-2) |  6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes |

From aff4de1160bff6c468bf928aa0655a98d13450a1 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com>
Date: Thu, 5 Oct 2023 19:39:14 +0000
Subject: [PATCH 10/12] Fill off-equilibrium column for QMugs, DES370K, DES5M

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 05ae11d..b3f769a 100644
--- a/README.md
+++ b/README.md
@@ -38,12 +38,12 @@ We provide support for the following publicly available QM Datasets.
 | [GEOM](https://www.nature.com/articles/s41597-022-01288-4) |  450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | No |
 | [Molecule3D](https://arxiv.org/abs/2110.01717) |  3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | No |
 | [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) |  1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | |
-| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) |  665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | |
+| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) |  665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | No |
 | [Spice](https://arxiv.org/abs/2209.10702) |  19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | Yes |
 | [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) |  57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | Yes |
 | [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) |  86,665 | |  | No | | TPSSh-D3BJ/def2-SVP | |
-| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | |
-| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | |
+| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | Yes |
+| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | Yes |
 | [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes |
 | [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | |
 | [QM7X](https://www.nature.com/articles/s41597-021-00812-2) |  6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes |

From 3fae2e370105322db27f96e2c84ede6ee1aa58b1 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com>
Date: Sun, 8 Oct 2023 23:46:38 +0000
Subject: [PATCH 11/12] Fix Spice record key (smiles -> name) and force sign

---
 src/openqdc/datasets/spice.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py
index e273690..e00cdbc 100644
--- a/src/openqdc/datasets/spice.py
+++ b/src/openqdc/datasets/spice.py
@@ -18,10 +18,10 @@ def read_record(r):
     positions = r["conformations"][:] * BOHR2ANG
 
     res = dict(
-        smiles=np.array([smiles] * n_confs),
+        name=np.array([smiles] * n_confs),
         subset=np.array([Spice.subset_mapping[subset]] * n_confs),
         energies=r[Spice.energy_target_names[0]][:][:, None].astype(np.float32),
-        forces=r[Spice.force_target_names[0]][:].reshape(-1, 3, 1) / BOHR2ANG,
+        forces=r[Spice.force_target_names[0]][:].reshape(-1, 3, 1) / BOHR2ANG * (-1.0),  # forces -ve of energy gradient
         atomic_inputs=np.concatenate(
             (x[None, ...].repeat(n_confs, axis=0), positions), axis=-1, dtype=np.float32
         ).reshape(-1, 5),

From dae4dae8b57bcf1e0f9ffc467c4c2f98a01c430e Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com>
Date: Wed, 11 Oct 2023 22:31:55 +0000
Subject: [PATCH 12/12] Add __init__, export ANI, drop NablaDFT, keep QMugs Hs

---
 src/openqdc/__init__.py          | 0
 src/openqdc/datasets/__init__.py | 7 ++++---
 src/openqdc/datasets/qmugs.py    | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)
 create mode 100644 src/openqdc/__init__.py

diff --git a/src/openqdc/__init__.py b/src/openqdc/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/openqdc/datasets/__init__.py b/src/openqdc/datasets/__init__.py
index 7e1811e..9c17922 100644
--- a/src/openqdc/datasets/__init__.py
+++ b/src/openqdc/datasets/__init__.py
@@ -1,24 +1,25 @@
+from .ani import ANI1, ANI1CCX, ANI1X
 from .comp6 import COMP6
 from .gdml import GDML
 from .geom import GEOM
 from .iso_17 import ISO17
 from .molecule3d import Molecule3D
-from .nabladft import NablaDFT
 from .orbnet_denali import OrbnetDenali
 from .qmugs import QMugs
 from .sn2_rxn import SN2RXN
 from .spice import Spice
 
 __all__ = [
+    "ANI1",
+    "ANI1CCX",
+    "ANI1X",
     "Spice",
     "GEOM",
     "QMugs",
-    "NablaDFT",
     "ISO17",
     "COMP6",
     "GDML",
     "Molecule3D",
-    "NablaDFT",
     "OrbnetDenali",
     "QMugs",
     "SN2RXN",
diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py
index d15d83b..62bc3b0 100644
--- a/src/openqdc/datasets/qmugs.py
+++ b/src/openqdc/datasets/qmugs.py
@@ -12,7 +12,7 @@
 
 def read_mol(mol_dir):
     filenames = glob(p_join(mol_dir, "*.sdf"))
-    mols = [dm.read_sdf(f)[0] for f in filenames]
+    mols = [dm.read_sdf(f, remove_hs=False)[0] for f in filenames]
     n_confs = len(mols)
 
     if len(mols) == 0: