valence-labs · mcneela · Mar 14, 2024 · Mar 1, 2024 · Mar 1, 2024 · Mar 1, 2024
diff --git a/README.md b/README.md
@@ -53,7 +53,7 @@ openqdc download Spice QMugs
 6. QM Level of Theory
  -->
 
-We provide support for the following publicly available QM Datasets.
+We provide support for the following publicly available QM Potential Energy Datasets.
 
 # Potential Energy
 
@@ -78,7 +78,15 @@ We provide support for the following publicly available QM Datasets.
 
 # Interaction energy
 
-| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations|
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | Yes |
-| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | Yes |
+We also provide support for the following publicly available QM Noncovalent Interaction Energy Datasets.
+
+| Dataset |
+| --- |
+| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) |
+| [DES5M](https://www.nature.com/articles/s41597-021-00833-x)   |
+| [Metcalf](https://pubs.aip.org/aip/jcp/article/152/7/074103/1059677/Approaches-for-machine-learning-intermolecular) |
+| [DESS66](https://www.nature.com/articles/s41597-021-00833-x) |
+| [DESS66x8](https://www.nature.com/articles/s41597-021-00833-x) |
+| [Splinter](https://www.nature.com/articles/s41597-023-02443-1) |
+| [X40](https://pubs.acs.org/doi/10.1021/ct300647k) |
+| [L7](https://pubs.acs.org/doi/10.1021/ct400036b)  |
diff --git a/openqdc/datasets/__init__.py b/openqdc/datasets/__init__.py
@@ -1,25 +1,4 @@
-from .base import BaseDataset  # noqa
 from .interaction import AVAILABLE_INTERACTION_DATASETS  # noqa
-from .interaction import DES  # noqa
 from .potential import AVAILABLE_POTENTIAL_DATASETS  # noqa
-from .potential.ani import ANI1, ANI1CCX, ANI1X  # noqa
-from .potential.comp6 import COMP6  # noqa
-from .potential.dummy import Dummy  # noqa
-from .potential.gdml import GDML  # noqa
-from .potential.geom import GEOM  # noqa
-from .potential.iso_17 import ISO17  # noqa
-from .potential.molecule3d import Molecule3D  # noqa
-from .potential.multixcqm9 import MultixcQM9  # noqa
-from .potential.nabladft import NablaDFT  # noqa
-from .potential.orbnet_denali import OrbnetDenali  # noqa
-from .potential.pcqm import PCQM_B3LYP, PCQM_PM6  # noqa
-from .potential.qm7x import QM7X  # noqa
-from .potential.qmugs import QMugs  # noqa
-from .potential.sn2_rxn import SN2RXN  # noqa
-from .potential.solvated_peptides import SolvatedPeptides  # noqa
-from .potential.spice import Spice  # noqa
-from .potential.tmqm import TMQM  # noqa
-from .potential.transition1x import Transition1X  # noqa
-from .potential.waterclusters3_30 import WaterClusters  # noqa
 
 AVAILABLE_DATASETS = {**AVAILABLE_POTENTIAL_DATASETS, **AVAILABLE_INTERACTION_DATASETS}
diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py
@@ -0,0 +1,121 @@
+import os
+from typing import Dict, List
+
+import numpy as np
+import yaml
+from loguru import logger
+
+from openqdc.datasets.interaction.base import BaseInteractionDataset
+from openqdc.utils.molecule import atom_table
+
+
+class DataItemYAMLObj:  # plain attribute holder; data_item_constructor fills it from a YAML mapping
+    def __init__(self, name, shortname, geometry, reference_value, setup, group, tags):
+        self.name = name
+        self.shortname = shortname  # L7.read_raw_entries stores this as the entry name
+        self.geometry = geometry  # L7.read_raw_entries takes the xyz file stem from the part after ":"
+        self.reference_value = reference_value  # becomes the first value of the entry's energies
+        self.setup = setup  # L7 reads setup["molecule_a"]["selection"] to size the first molecule
+        self.group = group  # stored as the entry's subset
+        self.tags = tags
+
+
+class DataSetYAMLObj:  # plain attribute holder; dataset_constructor fills it from a YAML mapping
+    def __init__(self, name, references, text, method_energy, groups_by, groups, global_setup):
+        self.name = name
+        self.references = references
+        self.text = text
+        self.method_energy = method_energy
+        self.groups_by = groups_by
+        self.groups = groups
+        self.global_setup = global_setup  # readers take the molecule_a / molecule_b charges from here
+
+
+def data_item_constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode):
+    """Build a DataItemYAMLObj from the key/value pairs of the given YAML mapping node."""
+    return DataItemYAMLObj(**loader.construct_mapping(node))
+
+
+def dataset_constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode):
+    """Build a DataSetYAMLObj from the key/value pairs of the given YAML mapping node."""
+    return DataSetYAMLObj(**loader.construct_mapping(node))
+
+
+def get_loader():
+    """Return a SafeLoader subclass that can construct the two ``!ruby/object:ProtocolDataset`` tags."""
+    loader = type("ProtocolDatasetLoader", (yaml.SafeLoader,), {})  # subclass: leave yaml.SafeLoader untouched
+    loader.add_constructor("!ruby/object:ProtocolDataset::DataSetItem", data_item_constructor)
+    loader.add_constructor("!ruby/object:ProtocolDataset::DataSetDescription", dataset_constructor)
+    return loader
+
+
+class L7(BaseInteractionDataset):
+    """
+    The L7 interaction energy dataset as described in:
+
+    Accuracy of Quantum Chemical Methods for Large Noncovalent Complexes
+    Robert Sedlak, Tomasz Janowski, Michal Pitoňák, Jan Řezáč, Peter Pulay, and Pavel Hobza
+    Journal of Chemical Theory and Computation 2013 9 (8), 3364-3374
+    DOI: 10.1021/ct400036b
+
+    Data was downloaded and extracted from:
+    http://cuby4.molecular.cz/dataset_l7.html
+    """
+
+    __name__ = "L7"
+    __energy_unit__ = "hartree"  # NOTE(review): confirm this matches the unit used in l7.yaml
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __energy_methods__ = [
+        "CSD(T) | QCISD(T)",  # NOTE(review): possibly meant "CCSD(T) | QCISD(T)" - confirm before changing
+        "DLPNO-CCSD(T)",
+        "MP2/CBS",
+        "MP2C/CBS",
+        "fixed",
+        "DLPNO-CCSD(T0)",
+        "LNO-CCSD(T)",
+        "FN-DMC",
+    ]
+
+    energy_target_names = []
+
+    def read_raw_entries(self) -> List[Dict]:
+        """Read ``l7.yaml`` and the per-item xyz files found under ``self.root``.
+
+        Returns one dict per item with keys energies, subset, n_atoms, n_atoms_first, atomic_inputs, name.
+        """
+        logger.info(f"Reading L7 interaction data from {self.root}")
+        with open(os.path.join(self.root, "l7.yaml"), "r") as yaml_file:  # closed as soon as parsing is done
+            data_dict = yaml.load(yaml_file, Loader=get_loader())
+        charge0 = int(data_dict["description"].global_setup["molecule_a"]["charge"])
+        charge1 = int(data_dict["description"].global_setup["molecule_b"]["charge"])
+
+        data = []
+        for idx, item in enumerate(data_dict["items"]):
+            fname = item.geometry.split(":")[1]
+            with open(os.path.join(self.root, f"{fname}.xyz"), "r") as xyz_file:
+                lines = [line.strip().split() for line in xyz_file]
+            lines.pop(1)  # discard the file's second line; lines[0] holds the atom count, the rest are atom rows
+            n_atoms = np.array([int(lines[0][0])], dtype=np.int32)
+            n_atoms_first = np.array([int(item.setup["molecule_a"]["selection"].split("-")[1])], dtype=np.int32)
+            # Reference value first, then the idx-th value of every alternative reference.
+            energies = [item.reference_value] + [float(val[idx]) for val in data_dict["alternative_reference"].values()]
+            atoms = np.array(lines[1:])  # column 0 = element symbol, remaining columns = coordinates
+            pos = atoms[:, 1:].astype(np.float32)
+            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in atoms[:, 0]]), axis=1)
+            natoms0 = n_atoms_first[0]
+            natoms1 = n_atoms[0] - natoms0
+            # Per-atom charge column: charge0 for the first natoms0 atoms, charge1 for the remaining ones.
+            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+            data.append(
+                dict(
+                    energies=np.array([energies], dtype=np.float32),
+                    subset=np.array([item.group]),
+                    n_atoms=n_atoms,
+                    n_atoms_first=n_atoms_first,
+                    atomic_inputs=atomic_inputs,
+                    name=np.array([item.shortname]),
+                )
+            )
+        return data
diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py
@@ -0,0 +1,80 @@
+import os
+from typing import Dict, List
+
+import numpy as np
+import yaml
+from loguru import logger
+
+from openqdc.datasets.interaction.base import BaseInteractionDataset
+from openqdc.datasets.interaction.L7 import get_loader
+from openqdc.utils.molecule import atom_table
+
+
+class X40(BaseInteractionDataset):
+    """
+    X40 interaction dataset of 40 dimer pairs as
+    introduced in the following paper:
+
+    Benchmark Calculations of Noncovalent Interactions of Halogenated Molecules
+    Jan Řezáč, Kevin E. Riley, and Pavel Hobza
+    Journal of Chemical Theory and Computation 2012 8 (11), 4285-4292
+    DOI: 10.1021/ct300647k
+
+    Dataset retrieved and processed from:
+    http://cuby4.molecular.cz/dataset_x40.html
+    """
+
+    __name__ = "X40"
+    __energy_unit__ = "hartree"  # NOTE(review): confirm this matches the unit used in x40.yaml
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __energy_methods__ = [
+        "CCSD(T)/CBS",
+        "MP2/CBS",
+        "dCCSD(T)/haDZ",
+        "dCCSD(T)/haTZ",
+        "MP2.5/CBS(aDZ)",
+    ]
+
+    energy_target_names = []
+
+    def read_raw_entries(self) -> List[Dict]:
+        """Read ``x40.yaml`` and the per-item xyz files found under ``self.root``.
+
+        Returns one dict per item with keys energies, subset, n_atoms, n_atoms_first, atomic_inputs, name.
+        """
+        logger.info(f"Reading X40 interaction data from {self.root}")
+        with open(os.path.join(self.root, "x40.yaml"), "r") as yaml_file:  # closed as soon as parsing is done
+            data_dict = yaml.load(yaml_file, Loader=get_loader())
+        charge0 = int(data_dict["description"].global_setup["molecule_a"]["charge"])
+        charge1 = int(data_dict["description"].global_setup["molecule_b"]["charge"])
+
+        data = []
+        for idx, item in enumerate(data_dict["items"]):
+            with open(os.path.join(self.root, f"{item.shortname}.xyz"), "r") as xyz_file:
+                lines = [line.strip().split() for line in xyz_file]
+            setup = lines.pop(1)  # second line of the file; lines[0] holds the atom count, the rest are atom rows
+            # The number after the "-" in the first token of that line is used as the first molecule's atom count.
+            n_atoms_first = np.array([int(setup[0].split("-")[1])], dtype=np.int32)
+            n_atoms = np.array([int(lines[0][0])], dtype=np.int32)
+            # Reference value first, then the idx-th value of every alternative reference.
+            energies = [float(item.reference_value)] + [float(val[idx]) for val in data_dict["alternative_reference"].values()]
+            atoms = np.array(lines[1:])  # column 0 = element symbol, remaining columns = coordinates
+            pos = atoms[:, 1:].astype(np.float32)
+            atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in atoms[:, 0]]), axis=1)
+            natoms0 = n_atoms_first[0]
+            natoms1 = n_atoms[0] - natoms0
+            # Per-atom charge column: charge0 for the first natoms0 atoms, charge1 for the remaining ones.
+            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+            data.append(
+                dict(
+                    energies=np.array([energies], dtype=np.float32),
+                    subset=np.array([item.group]),
+                    n_atoms=n_atoms,
+                    n_atoms_first=n_atoms_first,
+                    atomic_inputs=atomic_inputs,
+                    name=np.array([item.shortname]),
+                )
+            )
+        return data
diff --git a/openqdc/datasets/interaction/__init__.py b/openqdc/datasets/interaction/__init__.py
@@ -1,3 +1,21 @@
-from .des import DES
+from .base import BaseInteractionDataset
+from .des5m import DES5M
+from .des370k import DES370K
+from .dess66 import DESS66
+from .dess66x8 import DESS66x8
+from .L7 import L7
+from .metcalf import Metcalf
+from .splinter import Splinter
+from .X40 import X40
 
-AVAILABLE_INTERACTION_DATASETS = {"des": DES}
+AVAILABLE_INTERACTION_DATASETS = {  # lowercase lookup key -> interaction dataset class
+    "base": BaseInteractionDataset,  # NOTE(review): base class is listed next to its subclasses - confirm intended
+    "des5m": DES5M,
+    "des370k": DES370K,
+    "dess66": DESS66,
+    "dess66x8": DESS66x8,
+    "l7": L7,
+    "metcalf": Metcalf,
+    "splinter": Splinter,
+    "x40": X40,
+}