From 31beb71aed0497125a76e727710f014e2e76746a Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com.com>
Date: Tue, 26 Mar 2024 19:57:25 +0000
Subject: [PATCH 01/27] refactor interaction and initial testing

---
 openqdc/datasets/base.py                 |   8 +-
 openqdc/datasets/interaction/L7.py       |   2 +-
 openqdc/datasets/interaction/X40.py      |   2 +-
 openqdc/datasets/interaction/__init__.py |   2 +
 openqdc/datasets/interaction/base.py     |  76 ++---------------
 openqdc/datasets/interaction/dummy.py    | 100 +++++++++++++++++++++++
 openqdc/datasets/interaction/splinter.py |   2 +-
 tests/test_interaction.py                |  19 +++++
 8 files changed, 139 insertions(+), 72 deletions(-)
 create mode 100644 openqdc/datasets/interaction/dummy.py
 create mode 100644 tests/test_interaction.py

diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py
index 94150e1..2c1d2fa 100644
--- a/openqdc/datasets/base.py
+++ b/openqdc/datasets/base.py
@@ -345,6 +345,10 @@ def data_keys(self):
             keys.remove("forces")
         return keys
 
+    @property
+    def pkl_data_keys(self):
+        return ["name", "subset", "n_atoms"]
+
     @property
     def data_types(self):
         return {
@@ -465,7 +469,7 @@ def save_preprocess(self, data_dict):
 
         # save smiles and subset
         local_path = p_join(self.preprocess_path, "props.pkl")
-        for key in ["name", "subset"]:
+        for key in self.pkl_data_keys:
             data_dict[key] = np.unique(data_dict[key], return_inverse=True)
 
         with open(local_path, "wb") as f:
@@ -502,7 +506,7 @@ def read_preprocess(self, overwrite_local_cache=False):
         pull_locally(filename, overwrite=overwrite_local_cache)
         with open(filename, "rb") as f:
             tmp = pkl.load(f)
-            for key in ["name", "subset", "n_atoms"]:
+            for key in self.pkl_data_keys:
                 x = tmp.pop(key)
                 if len(x) == 2:
                     self.data[key] = x[0][x[1]]
diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py
index 987df39..0454ce2 100644
--- a/openqdc/datasets/interaction/L7.py
+++ b/openqdc/datasets/interaction/L7.py
@@ -76,7 +76,7 @@ class L7(BaseInteractionDataset):
         "FN-DMC",
     ]
 
-    energy_target_names = []
+    energy_target_names = __energy_methods__
 
     def read_raw_entries(self) -> List[Dict]:
         yaml_fpath = os.path.join(self.root, "l7.yaml")
diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py
index 08f4037..3f23c6b 100644
--- a/openqdc/datasets/interaction/X40.py
+++ b/openqdc/datasets/interaction/X40.py
@@ -36,7 +36,7 @@ class X40(BaseInteractionDataset):
         "MP2.5/CBS(aDZ)",
     ]
 
-    energy_target_names = []
+    energy_target_names = __energy_methods__
 
     def read_raw_entries(self) -> List[Dict]:
         yaml_fpath = os.path.join(self.root, "x40.yaml")
diff --git a/openqdc/datasets/interaction/__init__.py b/openqdc/datasets/interaction/__init__.py
index 82154a5..ccabcfb 100644
--- a/openqdc/datasets/interaction/__init__.py
+++ b/openqdc/datasets/interaction/__init__.py
@@ -3,6 +3,7 @@
 from .des370k import DES370K
 from .dess66 import DESS66
 from .dess66x8 import DESS66x8
+from .dummy import DummyInteraction
 from .L7 import L7
 from .metcalf import Metcalf
 from .splinter import Splinter
@@ -10,6 +11,7 @@
 
 AVAILABLE_INTERACTION_DATASETS = {
     "base": BaseInteractionDataset,
+    "dummy": DummyInteraction,
     "des5m": DES5M,
     "des370k": DES370K,
     "dess66": DESS66,
diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py
index ed7fcf7..25b3d9c 100644
--- a/openqdc/datasets/interaction/base.py
+++ b/openqdc/datasets/interaction/base.py
@@ -1,6 +1,6 @@
 import pickle as pkl
 from os.path import join as p_join
-from typing import Dict, List, Optional
+from typing import Dict, List
 
 import numpy as np
 from loguru import logger
@@ -8,24 +8,13 @@
 
 from openqdc.datasets.base import BaseDataset
 from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory
-from openqdc.utils.constants import NB_ATOMIC_FEATURES
-from openqdc.utils.io import pull_locally, push_remote
+from openqdc.utils.io import push_remote
 
 
 class BaseInteractionDataset(BaseDataset):
-    def __init__(
-        self,
-        energy_unit: Optional[str] = None,
-        distance_unit: Optional[str] = None,
-        overwrite_local_cache: bool = False,
-        cache_dir: Optional[str] = None,
-    ) -> None:
-        super().__init__(
-            energy_unit=energy_unit,
-            distance_unit=distance_unit,
-            overwrite_local_cache=overwrite_local_cache,
-            cache_dir=cache_dir,
-        )
+    @property
+    def pkl_data_keys(self):
+        return ["name", "subset", "n_atoms", "n_atoms_first"]
 
     def collate_list(self, list_entries: List[Dict]):
         # concatenate entries
@@ -42,24 +31,6 @@ def collate_list(self, list_entries: List[Dict]):
 
         return res
 
-    @property
-    def data_shapes(self):
-        return {
-            "atomic_inputs": (-1, NB_ATOMIC_FEATURES),
-            "position_idx_range": (-1, 2),
-            "energies": (-1, len(self.__energy_methods__)),
-            "forces": (-1, 3, len(self.force_target_names)),
-        }
-
-    @property
-    def data_types(self):
-        return {
-            "atomic_inputs": np.float32,
-            "position_idx_range": np.int32,
-            "energies": np.float32,
-            "forces": np.float32,
-        }
-
     def __getitem__(self, idx: int):
         shift = IsolatedAtomEnergyFactory.max_charge
         p_start, p_end = self.data["position_idx_range"][idx]
@@ -102,40 +73,11 @@ def save_preprocess(self, data_dict):
 
         # save all other keys in props.pkl
         local_path = p_join(self.preprocess_path, "props.pkl")
-        for key in data_dict:
-            if key not in self.data_keys:
-                x = data_dict[key]
-                x[x == None] = -1
-                data_dict[key] = np.unique(x, return_inverse=True)
+        for key in self.pkl_data_keys:
+            x = data_dict[key]
+            x[x == None] = -1  # noqa
+            data_dict[key] = np.unique(x, return_inverse=True)
 
         with open(local_path, "wb") as f:
             pkl.dump(data_dict, f)
         push_remote(local_path, overwrite=True)
-
-    def read_preprocess(self, overwrite_local_cache=False):
-        logger.info("Reading preprocessed data.")
-        logger.info(
-            f"Dataset {self.__name__} with the following units:\n\
-                     Energy: {self.energy_unit},\n\
-                     Distance: {self.distance_unit},\n\
-                     Forces: {self.force_unit if self.__force_methods__ else 'None'}"
-        )
-        self.data = {}
-        for key in self.data_keys:
-            filename = p_join(self.preprocess_path, f"{key}.mmap")
-            pull_locally(filename, overwrite=overwrite_local_cache)
-            self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(self.data_shapes[key])
-
-        filename = p_join(self.preprocess_path, "props.pkl")
-        pull_locally(filename, overwrite=overwrite_local_cache)
-        with open(filename, "rb") as f:
-            tmp = pkl.load(f)
-            for key in set(tmp.keys()) - set(self.data_keys):
-                x = tmp.pop(key)
-                if len(x) == 2:
-                    self.data[key] = x[0][x[1]]
-                else:
-                    self.data[key] = x
-
-        for key in self.data:
-            logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")
diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py
new file mode 100644
index 0000000..9e22703
--- /dev/null
+++ b/openqdc/datasets/interaction/dummy.py
@@ -0,0 +1,100 @@
+import numpy as np
+
+from openqdc.datasets.interaction.base import BaseDataset
+from openqdc.utils.constants import NOT_DEFINED
+
+
+class DummyInteraction(BaseDataset):
+    """
+    Dummy Interaction Dataset for Testing
+    """
+
+    __name__ = "dummy"
+    __energy_methods__ = ["Method1", "Method2"]
+    __force_mask__ = [False, True]
+    __energy_unit__ = "kcal/mol"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "kcal/mol/ang"
+
+    energy_target_names = [f"energy{i}" for i in range(len(__energy_methods__))]
+
+    force_target_names = [f"forces{i}" for i in range(len(__force_mask__))]
+    __isolated_atom_energies__ = []
+    __average_n_atoms__ = None
+
+    def __init__(
+        self,
+        energy_unit=None,
+        distance_unit=None,
+        cache_dir=None,
+    ) -> None:
+        try:
+            super().__init__(energy_unit=energy_unit, distance_unit=distance_unit, cache_dir=cache_dir)
+
+        except:  # noqa
+            pass
+        self._set_isolated_atom_energies()
+        self.setup_dummy()
+
+    @property
+    def _stats(self):
+        return {
+            "formation": {
+                "energy": {
+                    "mean": np.array([[-12.94348027, -9.83037297]]),
+                    "std": np.array([[4.39971409, 3.3574188]]),
+                },
+                "forces": NOT_DEFINED,
+            },
+            "total": {
+                "energy": {
+                    "mean": np.array([[-89.44242, -1740.5336]]),
+                    "std": np.array([[29.599571, 791.48663]]),
+                },
+                "forces": NOT_DEFINED,
+            },
+        }
+
+    def setup_dummy(self):
+        n_atoms = np.array([np.random.randint(10, 30) for _ in range(len(self))])
+        n_atoms_first = np.array([np.random.randint(1, 10) for _ in range(len(self))])
+        position_idx_range = np.concatenate([[0], np.cumsum(n_atoms)]).repeat(2)[1:-1].reshape(-1, 2)
+        atomic_inputs = np.concatenate(
+            [
+                np.concatenate(
+                    [
+                        # z, c, x, y, z
+                        np.random.randint(1, 100, size=(size, 1)),
+                        np.random.randint(-1, 2, size=(size, 1)),
+                        np.random.randn(size, 3),
+                    ],
+                    axis=1,
+                )
+                for size in n_atoms
+            ],
+            axis=0,
+        )  # (sum(n_atoms), 5)
+        name = [f"dummy_{i}" for i in range(len(self))]
+        subset = ["dummy" for i in range(len(self))]
+        energies = np.random.rand(len(self), len(self.energy_methods))
+        forces = np.concatenate([np.random.randn(size, 3, len(self.force_methods)) * 100 for size in n_atoms])
+        self.data = dict(
+            n_atoms=n_atoms,
+            position_idx_range=position_idx_range,
+            name=name,
+            atomic_inputs=atomic_inputs,
+            subset=subset,
+            energies=energies,
+            n_atoms_first=n_atoms_first,
+            forces=forces,
+        )
+        self.__average_nb_atoms__ = self.data["n_atoms"].mean()
+
+    def is_preprocessed(self):
+        return True
+
+    def read_raw_entries(self):
+        pass
+
+    def __len__(self):
+        return 9999
diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py
index fd7f08f..c1fd5df 100644
--- a/openqdc/datasets/interaction/splinter.py
+++ b/openqdc/datasets/interaction/splinter.py
@@ -44,7 +44,7 @@ class Splinter(BaseInteractionDataset):
         "sapt0/aug-cc-pV(D+d)Z_disp_scaled",
     ]
 
-    energy_target_names = []
+    energy_target_names = __energy_methods__
 
     def read_raw_entries(self) -> List[Dict]:
         logger.info(f"Reading Splinter interaction data from {self.root}")
diff --git a/tests/test_interaction.py b/tests/test_interaction.py
new file mode 100644
index 0000000..8f1cc2f
--- /dev/null
+++ b/tests/test_interaction.py
@@ -0,0 +1,19 @@
+try:
+    from openqdc.datasets.interaction import DummyInteraction
+
+    dummy_loaded = True
+except:  # noqa
+    dummy_loaded = False
+
+
+def test_import():
+    assert dummy_loaded
+
+
+def test_init():
+    DummyInteraction()
+
+
+def test_len():
+    ds = DummyInteraction()
+    assert len(ds) == 9999

From dccf67646fbb79e0d985333460cfbfc7bd841f9a Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com.com>
Date: Tue, 26 Mar 2024 20:25:46 +0000
Subject: [PATCH 02/27] unregister DummyInteraction, align dummy inits, merge dummy tests

---
 openqdc/datasets/interaction/__init__.py |  2 -
 openqdc/datasets/interaction/dummy.py    | 15 ++++++--
 openqdc/datasets/potential/dummy.py      | 22 +++++++++--
 tests/test_dummy.py                      | 47 +++++++++++++++++++++---
 tests/test_interaction.py                | 19 ----------
 5 files changed, 72 insertions(+), 33 deletions(-)
 delete mode 100644 tests/test_interaction.py

diff --git a/openqdc/datasets/interaction/__init__.py b/openqdc/datasets/interaction/__init__.py
index ccabcfb..82154a5 100644
--- a/openqdc/datasets/interaction/__init__.py
+++ b/openqdc/datasets/interaction/__init__.py
@@ -3,7 +3,6 @@
 from .des370k import DES370K
 from .dess66 import DESS66
 from .dess66x8 import DESS66x8
-from .dummy import DummyInteraction
 from .L7 import L7
 from .metcalf import Metcalf
 from .splinter import Splinter
@@ -11,7 +10,6 @@
 
 AVAILABLE_INTERACTION_DATASETS = {
     "base": BaseInteractionDataset,
-    "dummy": DummyInteraction,
     "des5m": DES5M,
     "des370k": DES370K,
     "dess66": DESS66,
diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py
index 9e22703..af57e27 100644
--- a/openqdc/datasets/interaction/dummy.py
+++ b/openqdc/datasets/interaction/dummy.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
 import numpy as np
 
 from openqdc.datasets.interaction.base import BaseDataset
@@ -24,9 +26,16 @@ class DummyInteraction(BaseDataset):
 
     def __init__(
         self,
-        energy_unit=None,
-        distance_unit=None,
-        cache_dir=None,
+        energy_unit: Optional[str] = None,
+        distance_unit: Optional[str] = None,
+        overwrite_local_cache: bool = False,
+        cache_dir: Optional[str] = None,
+        recompute_statistics: bool = False,
+        regressor_kwargs={
+            "solver_type": "linear",
+            "sub_sample": None,
+            "stride": 1,
+        },
     ) -> None:
         try:
             super().__init__(energy_unit=energy_unit, distance_unit=distance_unit, cache_dir=cache_dir)
diff --git a/openqdc/datasets/potential/dummy.py b/openqdc/datasets/potential/dummy.py
index 48ed3b2..5563544 100644
--- a/openqdc/datasets/potential/dummy.py
+++ b/openqdc/datasets/potential/dummy.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
 import numpy as np
 
 from openqdc.datasets.base import BaseDataset
@@ -43,12 +45,24 @@ def _stats(self):
 
     def __init__(
         self,
-        energy_unit=None,
-        distance_unit=None,
-        cache_dir=None,
+        energy_unit: Optional[str] = None,
+        distance_unit: Optional[str] = None,
+        overwrite_local_cache: bool = False,
+        cache_dir: Optional[str] = None,
+        recompute_statistics: bool = False,
+        regressor_kwargs={
+            "solver_type": "linear",
+            "sub_sample": None,
+            "stride": 1,
+        },
     ) -> None:
         try:
-            super().__init__(energy_unit=energy_unit, distance_unit=distance_unit, cache_dir=cache_dir)
+            super().__init__(
+                energy_unit=energy_unit,
+                distance_unit=distance_unit,
+                cache_dir=cache_dir,
+                recompute_statistics=recompute_statistics,
+            )
 
         except:  # noqa
             pass
diff --git a/tests/test_dummy.py b/tests/test_dummy.py
index f82376c..b20c899 100644
--- a/tests/test_dummy.py
+++ b/tests/test_dummy.py
@@ -1,5 +1,8 @@
 """Path hack to make tests work."""
 
+import pytest
+
+from openqdc.datasets.interaction.dummy import DummyInteraction  # noqa: E402
 from openqdc.datasets.potential.dummy import Dummy  # noqa: E402
 from openqdc.utils.atomization_energies import (
     ISOLATED_ATOM_ENERGIES,
@@ -7,13 +10,47 @@
 )
 
 
-def test_dummy():
-    ds = Dummy()
-    assert len(ds) > 10
-    assert ds[100]
+@pytest.fixture
+def dummy():
+    return Dummy()
+
+
+@pytest.fixture
+def dummy_interaction():
+    return DummyInteraction()
+
+
+@pytest.mark.parametrize("cls", ["dummy", "dummy_interaction"])
+def test_basic(cls, request):
+    # init
+    ds = request.getfixturevalue(cls)
+
+    # len
+    assert len(ds) == 9999
+
+    # __getitem__
+    assert ds[0]
+
+
+@pytest.mark.parametrize("cls", ["dummy", "dummy_interaction"])
+@pytest.mark.parametrize(
+    "normalization",
+    [
+        "formation",
+        "total",
+        # "residual_regression",
+        # "per_atom_formation",
+        # "per_atom_residual_regression"
+    ],
+)
+def test_stats(cls, normalization, request):
+    ds = request.getfixturevalue(cls)
+
+    stats = ds.get_statistics(normalization=normalization)
+    assert stats is not None
 
 
-def test_is_at_factory():
+def test_isolated_atom_factory():
     res = IsolatedAtomEnergyFactory.get("mp2/cc-pvdz")
     assert len(res) == len(ISOLATED_ATOM_ENERGIES["mp2"]["cc-pvdz"])
     res = IsolatedAtomEnergyFactory.get("PM6")
diff --git a/tests/test_interaction.py b/tests/test_interaction.py
deleted file mode 100644
index 8f1cc2f..0000000
--- a/tests/test_interaction.py
+++ /dev/null
@@ -1,19 +0,0 @@
-try:
-    from openqdc.datasets.interaction import DummyInteraction
-
-    dummy_loaded = True
-except:  # noqa
-    dummy_loaded = False
-
-
-def test_import():
-    assert dummy_loaded
-
-
-def test_init():
-    DummyInteraction()
-
-
-def test_len():
-    ds = DummyInteraction()
-    assert len(ds) == 9999

From 2ab64aaf4d3ce367d86bd3981701ff87237bf453 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com.com>
Date: Tue, 26 Mar 2024 20:30:12 +0000
Subject: [PATCH 03/27] set up dummy datasets in _post_init instead of a custom __init__

---
 openqdc/datasets/interaction/dummy.py | 26 +++++-----------------
 openqdc/datasets/potential/dummy.py   | 31 +++++----------------------
 2 files changed, 10 insertions(+), 47 deletions(-)

diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py
index af57e27..b88e623 100644
--- a/openqdc/datasets/interaction/dummy.py
+++ b/openqdc/datasets/interaction/dummy.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
 import numpy as np
 
 from openqdc.datasets.interaction.base import BaseDataset
@@ -24,26 +22,9 @@ class DummyInteraction(BaseDataset):
     __isolated_atom_energies__ = []
     __average_n_atoms__ = None
 
-    def __init__(
-        self,
-        energy_unit: Optional[str] = None,
-        distance_unit: Optional[str] = None,
-        overwrite_local_cache: bool = False,
-        cache_dir: Optional[str] = None,
-        recompute_statistics: bool = False,
-        regressor_kwargs={
-            "solver_type": "linear",
-            "sub_sample": None,
-            "stride": 1,
-        },
-    ) -> None:
-        try:
-            super().__init__(energy_unit=energy_unit, distance_unit=distance_unit, cache_dir=cache_dir)
-
-        except:  # noqa
-            pass
-        self._set_isolated_atom_energies()
+    def _post_init(self, overwrite_local_cache, energy_unit, distance_unit) -> None:
         self.setup_dummy()
+        return super()._post_init(overwrite_local_cache, energy_unit, distance_unit)
 
     @property
     def _stats(self):
@@ -99,6 +80,9 @@ def setup_dummy(self):
         )
         self.__average_nb_atoms__ = self.data["n_atoms"].mean()
 
+    def read_preprocess(self, overwrite_local_cache=False):
+        return
+
     def is_preprocessed(self):
         return True
 
diff --git a/openqdc/datasets/potential/dummy.py b/openqdc/datasets/potential/dummy.py
index 5563544..f5b3aa1 100644
--- a/openqdc/datasets/potential/dummy.py
+++ b/openqdc/datasets/potential/dummy.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
 import numpy as np
 
 from openqdc.datasets.base import BaseDataset
@@ -43,31 +41,9 @@ def _stats(self):
             },
         }
 
-    def __init__(
-        self,
-        energy_unit: Optional[str] = None,
-        distance_unit: Optional[str] = None,
-        overwrite_local_cache: bool = False,
-        cache_dir: Optional[str] = None,
-        recompute_statistics: bool = False,
-        regressor_kwargs={
-            "solver_type": "linear",
-            "sub_sample": None,
-            "stride": 1,
-        },
-    ) -> None:
-        try:
-            super().__init__(
-                energy_unit=energy_unit,
-                distance_unit=distance_unit,
-                cache_dir=cache_dir,
-                recompute_statistics=recompute_statistics,
-            )
-
-        except:  # noqa
-            pass
-        self._set_isolated_atom_energies()
+    def _post_init(self, overwrite_local_cache, energy_unit, distance_unit) -> None:
         self.setup_dummy()
+        return super()._post_init(overwrite_local_cache, energy_unit, distance_unit)
 
     def setup_dummy(self):
         n_atoms = np.array([np.random.randint(1, 100) for _ in range(len(self))])
@@ -102,6 +78,9 @@ def setup_dummy(self):
         )
         self.__average_nb_atoms__ = self.data["n_atoms"].mean()
 
+    def read_preprocess(self, overwrite_local_cache=False):
+        return
+
     def is_preprocessed(self):
         return True
 

From 189ab90d6d2364daf47cc53c0a8a9c608aa26892 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com.com>
Date: Fri, 29 Mar 2024 01:32:59 +0000
Subject: [PATCH 04/27] revert energy_target_names in interaction datasets;
 size data_shapes by energy/force methods

---
 openqdc/datasets/base.py                 | 4 ++--
 openqdc/datasets/interaction/L7.py       | 2 +-
 openqdc/datasets/interaction/X40.py      | 2 +-
 openqdc/datasets/interaction/splinter.py | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py
index 2c1d2fa..7b486a5 100644
--- a/openqdc/datasets/base.py
+++ b/openqdc/datasets/base.py
@@ -363,8 +363,8 @@ def data_shapes(self):
         return {
             "atomic_inputs": (-1, NB_ATOMIC_FEATURES),
             "position_idx_range": (-1, 2),
-            "energies": (-1, len(self.energy_target_names)),
-            "forces": (-1, 3, len(self.force_target_names)),
+            "energies": (-1, len(self.energy_methods)),
+            "forces": (-1, 3, len(self.force_methods)),
         }
 
     @property
diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py
index 0454ce2..987df39 100644
--- a/openqdc/datasets/interaction/L7.py
+++ b/openqdc/datasets/interaction/L7.py
@@ -76,7 +76,7 @@ class L7(BaseInteractionDataset):
         "FN-DMC",
     ]
 
-    energy_target_names = __energy_methods__
+    energy_target_names = []
 
     def read_raw_entries(self) -> List[Dict]:
         yaml_fpath = os.path.join(self.root, "l7.yaml")
diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py
index 3f23c6b..08f4037 100644
--- a/openqdc/datasets/interaction/X40.py
+++ b/openqdc/datasets/interaction/X40.py
@@ -36,7 +36,7 @@ class X40(BaseInteractionDataset):
         "MP2.5/CBS(aDZ)",
     ]
 
-    energy_target_names = __energy_methods__
+    energy_target_names = []
 
     def read_raw_entries(self) -> List[Dict]:
         yaml_fpath = os.path.join(self.root, "x40.yaml")
diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py
index c1fd5df..fd7f08f 100644
--- a/openqdc/datasets/interaction/splinter.py
+++ b/openqdc/datasets/interaction/splinter.py
@@ -44,7 +44,7 @@ class Splinter(BaseInteractionDataset):
         "sapt0/aug-cc-pV(D+d)Z_disp_scaled",
     ]
 
-    energy_target_names = __energy_methods__
+    energy_target_names = []
 
     def read_raw_entries(self) -> List[Dict]:
         logger.info(f"Reading Splinter interaction data from {self.root}")

From 282dc919ee3b10afd9f89ce9196d66269cac4b13 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com.com>
Date: Tue, 2 Apr 2024 00:50:05 +0000
Subject: [PATCH 05/27] changed super class to BaseInteractionDataset

---
 openqdc/datasets/interaction/dummy.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py
index b88e623..cfab609 100644
--- a/openqdc/datasets/interaction/dummy.py
+++ b/openqdc/datasets/interaction/dummy.py
@@ -1,10 +1,10 @@
 import numpy as np
 
-from openqdc.datasets.interaction.base import BaseDataset
+from openqdc.datasets.interaction.base import BaseInteractionDataset
 from openqdc.utils.constants import NOT_DEFINED
 
 
-class DummyInteraction(BaseDataset):
+class DummyInteraction(BaseInteractionDataset):
     """
     Dummy Interaction Dataset for Testing
     """

From afea05302b514e937688254200a9d526ec524f0d Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com.com>
Date: Wed, 3 Apr 2024 20:14:20 +0000
Subject: [PATCH 06/27] further simplified and rebase

---
 openqdc/datasets/base.py              | 14 +++++++++++---
 openqdc/datasets/interaction/base.py  | 25 +------------------------
 openqdc/datasets/interaction/dummy.py |  3 ++-
 3 files changed, 14 insertions(+), 28 deletions(-)

diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py
index 2ade62e..c12e5c4 100644
--- a/openqdc/datasets/base.py
+++ b/openqdc/datasets/base.py
@@ -424,8 +424,13 @@ def save_preprocess(self, data_dict):
 
         # save smiles and subset
         local_path = p_join(self.preprocess_path, "props.pkl")
-        for key in self.pkl_data_keys:
-            data_dict[key] = np.unique(data_dict[key], return_inverse=True)
+        # assert that required keys are present in data_dict
+        assert all([key in data_dict for key in self.pkl_data_keys])
+        for key in data_dict:
+            if key not in self.data_keys:
+                x = data_dict[key]
+                x[x == None] = -1  # noqa
+                data_dict[key] = np.unique(data_dict[key], return_inverse=True)
 
         with open(local_path, "wb") as f:
             pkl.dump(data_dict, f)
@@ -461,7 +466,10 @@ def read_preprocess(self, overwrite_local_cache=False):
         pull_locally(filename, overwrite=overwrite_local_cache)
         with open(filename, "rb") as f:
             tmp = pkl.load(f)
-            for key in self.pkl_data_keys:
+            all_pkl_keys = set(tmp.keys()) - set(self.data_keys)
+            # assert required pkl_keys are present in all_pkl_keys
+            assert all([key in all_pkl_keys for key in self.pkl_data_keys])
+            for key in all_pkl_keys:
                 x = tmp.pop(key)
                 if len(x) == 2:
                     self.data[key] = x[0][x[1]]
diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py
index 987340a..627cec4 100644
--- a/openqdc/datasets/interaction/base.py
+++ b/openqdc/datasets/interaction/base.py
@@ -1,16 +1,14 @@
 import os
-import pickle as pkl
 from os.path import join as p_join
 from typing import Dict, List, Optional
 
 import numpy as np
 from ase.io.extxyz import write_extxyz
-from loguru import logger
 from sklearn.utils import Bunch
 
 from openqdc.datasets.base import BaseDataset
 from openqdc.utils.constants import MAX_CHARGE
-from openqdc.utils.io import push_remote, to_atoms
+from openqdc.utils.io import to_atoms
 
 
 class BaseInteractionDataset(BaseDataset):
@@ -65,27 +63,6 @@ def __getitem__(self, idx: int):
             n_atoms_first=n_atoms_first,
         )
 
-    def save_preprocess(self, data_dict):
-        # save memmaps
-        logger.info("Preprocessing data and saving it to cache.")
-        for key in self.data_keys:
-            local_path = p_join(self.preprocess_path, f"{key}.mmap")
-            out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape)
-            out[:] = data_dict.pop(key)[:]
-            out.flush()
-            push_remote(local_path, overwrite=True)
-
-        # save all other keys in props.pkl
-        local_path = p_join(self.preprocess_path, "props.pkl")
-        for key in self.pkl_data_keys:
-            x = data_dict[key]
-            x[x == None] = -1  # noqa
-            data_dict[key] = np.unique(x, return_inverse=True)
-
-        with open(local_path, "wb") as f:
-            pkl.dump(data_dict, f)
-        push_remote(local_path, overwrite=True)
-
     def get_ase_atoms(self, idx: int):
         entry = self[idx]
         at = to_atoms(entry["positions"], entry["atomic_numbers"])
diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py
index cfab609..48e92a9 100644
--- a/openqdc/datasets/interaction/dummy.py
+++ b/openqdc/datasets/interaction/dummy.py
@@ -1,6 +1,7 @@
 import numpy as np
 
 from openqdc.datasets.interaction.base import BaseInteractionDataset
+from openqdc.methods import InteractionMethod
 from openqdc.utils.constants import NOT_DEFINED
 
 
@@ -10,7 +11,7 @@ class DummyInteraction(BaseInteractionDataset):
     """
 
     __name__ = "dummy"
-    __energy_methods__ = ["Method1", "Method2"]
+    __energy_methods__ = [InteractionMethod.SAPT0_AUG_CC_PVDDZ, InteractionMethod.CCSD_T_CC_PVDZ]
     __force_mask__ = [False, True]
     __energy_unit__ = "kcal/mol"
     __distance_unit__ = "ang"

From ebc2adfd595e534bb5a630f69a8d882c9087e9f5 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com.com>
Date: Fri, 5 Apr 2024 21:13:38 +0000
Subject: [PATCH 07/27] fixes

---
 openqdc/datasets/base.py                 | 17 +++--
 openqdc/datasets/interaction/L7.py       | 79 +++++++++++++-----------
 openqdc/datasets/interaction/X40.py      | 10 +--
 openqdc/datasets/interaction/base.py     |  9 ++-
 openqdc/datasets/interaction/splinter.py |  2 +-
 openqdc/raws/config_factory.py           | 43 +++++++++++--
 openqdc/utils/preprocess.py              |  2 +-
 7 files changed, 107 insertions(+), 55 deletions(-)

diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py
index c12e5c4..9dc3204 100644
--- a/openqdc/datasets/base.py
+++ b/openqdc/datasets/base.py
@@ -310,7 +310,11 @@ def data_keys(self):
 
     @property
     def pkl_data_keys(self):
-        return ["name", "subset", "n_atoms"]
+        return list(self.pkl_data_types.keys())
+
+    @property
+    def pkl_data_types(self):
+        return {"name": str, "subset": str, "n_atoms": np.int32}
 
     @property
     def data_types(self):
@@ -424,12 +428,13 @@ def save_preprocess(self, data_dict):
 
         # save smiles and subset
         local_path = p_join(self.preprocess_path, "props.pkl")
+
         # assert that required keys are present in data_dict
-        assert all([key in data_dict for key in self.pkl_data_keys])
-        for key in data_dict:
-            if key not in self.data_keys:
-                x = data_dict[key]
-                x[x == None] = -1  # noqa
+        assert all([key in self.pkl_data_keys for key in data_dict.keys()])
+
+        # store unique and inverse indices for str-based pkl keys
+        for key in self.pkl_data_keys:
+            if self.pkl_data_types[key] == str:
                 data_dict[key] = np.unique(data_dict[key], return_inverse=True)
 
         with open(local_path, "wb") as f:
diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py
index fa16509..c72e2c1 100644
--- a/openqdc/datasets/interaction/L7.py
+++ b/openqdc/datasets/interaction/L7.py
@@ -1,5 +1,7 @@
 import os
-from typing import Dict, List
+from dataclasses import dataclass
+from functools import partial
+from typing import Dict, List, Optional
 
 import numpy as np
 import yaml
@@ -10,42 +12,49 @@
 from openqdc.utils.constants import ATOM_TABLE
 
 
-class DataItemYAMLObj:
-    def __init__(self, name, shortname, geometry, reference_value, setup, group, tags):
-        self.name = name
-        self.shortname = shortname
-        self.geometry = geometry
-        self.reference_value = reference_value
-        self.setup = setup
-        self.group = group
-        self.tags = tags
-
-
-class DataSetYAMLObj:
-    def __init__(self, name, references, text, method_energy, groups_by, groups, global_setup, method_geometry=None):
-        self.name = name
-        self.references = references
-        self.text = text
-        self.method_energy = method_energy
-        self.method_geometry = method_geometry
-        self.groups_by = groups_by
-        self.groups = groups
-        self.global_setup = global_setup
-
-
-def data_item_constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode):
-    return DataItemYAMLObj(**loader.construct_mapping(node))
+@dataclass
+class DataSet:
+    description: Dict
+    items: List[Dict]
+    alternative_reference: Dict
 
 
-def dataset_constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode):
-    return DataSetYAMLObj(**loader.construct_mapping(node))
+@dataclass
+class DataItemYAMLObj:
+    name: str
+    shortname: str
+    geometry: str
+    reference_value: float
+    setup: Dict
+    group: str
+    tags: str
+
+
+@dataclass
+class DataSetDescription:
+    name: Dict
+    references: str
+    text: str
+    groups_by: str
+    groups: List[str]
+    global_setup: Dict
+    method_energy: str
+    method_geometry: Optional[str] = None
 
 
 def get_loader():
     """Add constructors to PyYAML loader."""
+
+    def constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode, cls):
+        return cls(**loader.construct_mapping(node))
+
     loader = yaml.SafeLoader
-    loader.add_constructor("!ruby/object:ProtocolDataset::DataSetItem", data_item_constructor)
-    loader.add_constructor("!ruby/object:ProtocolDataset::DataSetDescription", dataset_constructor)
+
+    loader.add_constructor("!ruby/object:ProtocolDataset::DataSet", partial(constructor, cls=DataSet))
+    loader.add_constructor("!ruby/object:ProtocolDataset::DataSetItem", partial(constructor, cls=DataItemYAMLObj))
+    loader.add_constructor(
+        "!ruby/object:ProtocolDataset::DataSetDescription", partial(constructor, cls=DataSetDescription)
+    )
     return loader
 
 
@@ -62,7 +71,7 @@ class L7(BaseInteractionDataset):
     http://cuby4.molecular.cz/dataset_l7.html
     """
 
-    __name__ = "L7"
+    __name__ = "l7"
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
     __forces_unit__ = "hartree/ang"
@@ -87,10 +96,10 @@ def read_raw_entries(self) -> List[Dict]:
         yaml_file = open(yaml_fpath, "r")
         data = []
         data_dict = yaml.load(yaml_file, Loader=get_loader())
-        charge0 = int(data_dict["description"].global_setup["molecule_a"]["charge"])
-        charge1 = int(data_dict["description"].global_setup["molecule_b"]["charge"])
+        charge0 = int(data_dict.description.global_setup["molecule_a"]["charge"])
+        charge1 = int(data_dict.description.global_setup["molecule_b"]["charge"])
 
-        for idx, item in enumerate(data_dict["items"]):
+        for idx, item in enumerate(data_dict.items):
             energies = []
             name = np.array([item.shortname])
             fname = item.geometry.split(":")[1]
@@ -101,7 +110,7 @@ def read_raw_entries(self) -> List[Dict]:
             n_atoms = np.array([int(lines[0][0])], dtype=np.int32)
             n_atoms_first = np.array([int(item.setup["molecule_a"]["selection"].split("-")[1])], dtype=np.int32)
             subset = np.array([item.group])
-            energies += [float(val[idx]) for val in list(data_dict["alternative_reference"].values())]
+            energies += [float(val[idx]) for val in list(data_dict.alternative_reference.values())]
             energies = np.array([energies], dtype=np.float32)
             pos = np.array(lines[1:])[:, 1:].astype(np.float32)
             elems = np.array(lines[1:])[:, 0]
diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py
index 98a9d67..dfb43d0 100644
--- a/openqdc/datasets/interaction/X40.py
+++ b/openqdc/datasets/interaction/X40.py
@@ -25,7 +25,7 @@ class X40(BaseInteractionDataset):
     http://cuby4.molecular.cz/dataset_x40.html
     """
 
-    __name__ = "X40"
+    __name__ = "x40"
     __energy_unit__ = "hartree"
     __distance_unit__ = "ang"
     __forces_unit__ = "hartree/ang"
@@ -48,10 +48,10 @@ def read_raw_entries(self) -> List[Dict]:
         yaml_file = open(yaml_fpath, "r")
         data = []
         data_dict = yaml.load(yaml_file, Loader=get_loader())
-        charge0 = int(data_dict["description"].global_setup["molecule_a"]["charge"])
-        charge1 = int(data_dict["description"].global_setup["molecule_b"]["charge"])
+        charge0 = int(data_dict.description.global_setup["molecule_a"]["charge"])
+        charge1 = int(data_dict.description.global_setup["molecule_b"]["charge"])
 
-        for idx, item in enumerate(data_dict["items"]):
+        for idx, item in enumerate(data_dict.items):
             energies = []
             name = np.array([item.shortname])
             energies.append(float(item.reference_value))
@@ -62,7 +62,7 @@ def read_raw_entries(self) -> List[Dict]:
             n_atoms_first = setup[0].split("-")[1]
             n_atoms_first = np.array([int(n_atoms_first)], dtype=np.int32)
             subset = np.array([item.group])
-            energies += [float(val[idx]) for val in list(data_dict["alternative_reference"].values())]
+            energies += [float(val[idx]) for val in list(data_dict.alternative_reference.values())]
             energies = np.array([energies], dtype=np.float32)
             pos = np.array(lines[1:])[:, 1:].astype(np.float32)
             elems = np.array(lines[1:])[:, 0]
diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py
index 627cec4..0c801fa 100644
--- a/openqdc/datasets/interaction/base.py
+++ b/openqdc/datasets/interaction/base.py
@@ -15,8 +15,13 @@ class BaseInteractionDataset(BaseDataset):
     __energy_type__ = []
 
     @property
-    def pkl_data_keys(self):
-        return ["name", "subset", "n_atoms", "n_atoms_first"]
+    def pkl_data_types(self):
+        return {
+            "name": str,
+            "subset": str,
+            "n_atoms": np.int32,
+            "n_atoms_first": np.int32,
+        }
 
     def collate_list(self, list_entries: List[Dict]):
         # concatenate entries
diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py
index a57275d..39e930b 100644
--- a/openqdc/datasets/interaction/splinter.py
+++ b/openqdc/datasets/interaction/splinter.py
@@ -130,7 +130,7 @@ def read_raw_entries(self) -> List[Dict]:
                             index,
                             _,
                         ) = metadata[0].split("_")
-                        r, theta_P, tau_P, theta_L, tau_L, tau_PL = [None] * 6
+                        r, theta_P, tau_P, theta_L, tau_L, tau_PL = [-1] * 6
                     energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32)
                     n_atoms_first = np.array([int(metadata[-1])], dtype=np.int32)
                     total_charge, charge0, charge1 = list(map(int, metadata[1:4]))
diff --git a/openqdc/raws/config_factory.py b/openqdc/raws/config_factory.py
index b4784ed..e7750c5 100644
--- a/openqdc/raws/config_factory.py
+++ b/openqdc/raws/config_factory.py
@@ -51,6 +51,14 @@ class DataConfigFactory:
         links={"rdkit_folder.tar.gz": "https://dataverse.harvard.edu/api/access/datafile/4327252"},
     )
 
+    l7 = dict(
+        dataset_name="l7",
+        links={
+            "l7.yaml": "http://cuby4.molecular.cz/download_datasets/l7.yaml",
+            "geometries.tar.gz": "http://cuby4.molecular.cz/download_geometries/L7.tar",
+        },
+    )
+
     molecule3d = dict(
         dataset_name="molecule3d",
         links={"molecule3d.zip": "https://drive.google.com/uc?id=1C_KRf8mX-gxny7kL9ACNCEV4ceu_fUGy"},
@@ -86,6 +94,28 @@ class DataConfigFactory:
         links={"spice-2.0.0.hdf5": "https://zenodo.org/records/10835749/files/SPICE-2.0.0.hdf5?download=1"},
     )
 
+    splinter = dict(
+        dataset_name="splinter",
+        links={
+            "dimerpairs.0.tar.gz": "https://figshare.com/ndownloader/files/39449167",
+            "dimerpairs.1.tar.gz": "https://figshare.com/ndownloader/files/40271983",
+            "dimerpairs.2.tar.gz": "https://figshare.com/ndownloader/files/40271989",
+            "dimerpairs.3.tar.gz": "https://figshare.com/ndownloader/files/40272001",
+            "dimerpairs.4.tar.gz": "https://figshare.com/ndownloader/files/40272022",
+            "dimerpairs.5.tar.gz": "https://figshare.com/ndownloader/files/40552931",
+            "dimerpairs.6.tar.gz": "https://figshare.com/ndownloader/files/40272040",
+            "dimerpairs.7.tar.gz": "https://figshare.com/ndownloader/files/40272052",
+            "dimerpairs.8.tar.gz": "https://figshare.com/ndownloader/files/40272061",
+            "dimerpairs.9.tar.gz": "https://figshare.com/ndownloader/files/40272064",
+            "dimerpairs_nonstandard.tar.gz": "https://figshare.com/ndownloader/files/40272067",
+            "lig_interaction_sites.sdf": "https://figshare.com/ndownloader/files/40272070",
+            "lig_monomers.sdf": "https://figshare.com/ndownloader/files/40272073",
+            "prot_interaction_sites.sdf": "https://figshare.com/ndownloader/files/40272076",
+            "prot_monomers.sdf": "https://figshare.com/ndownloader/files/40272079",
+            "merge_monomers.py": "https://figshare.com/ndownloader/files/41807682",
+        },
+    )
+
     dess = dict(
         dataset_name="dess5m",
         links={
@@ -161,11 +191,6 @@ class DataConfigFactory:
         links={"Transition1x.h5": "https://figshare.com/ndownloader/files/36035789"},
     )
 
-    # l7 = dict(
-    #     dataset_name="l7",
-    #     links={"l7.zip": "http://www.begdb.org/moldown.php?id=40"}
-    # )
-
     des_s66 = dict(
         dataset_name="des_s66",
         links={"DESS66.zip": "https://zenodo.org/records/5676284/files/DESS66.zip?download=1"},
@@ -180,6 +205,14 @@ class DataConfigFactory:
         links={"revmd17.zip": "https://figshare.com/ndownloader/articles/12672038/versions/3"},
     )
 
+    x40 = dict(
+        dataset_name="x40",
+        links={
+            "x40.yaml": "http://cuby4.molecular.cz/download_datasets/x40.yaml",
+            "geometries.tar.gz": "http://cuby4.molecular.cz/download_geometries/X40.tar",
+        },
+    )
+
     available_datasets = [k for k in locals().keys() if not k.startswith("__")]
 
     def __init__(self):
diff --git a/openqdc/utils/preprocess.py b/openqdc/utils/preprocess.py
index a7dd9c7..0fee22b 100644
--- a/openqdc/utils/preprocess.py
+++ b/openqdc/utils/preprocess.py
@@ -7,7 +7,7 @@
 from openqdc import AVAILABLE_DATASETS
 
 options = list(AVAILABLE_DATASETS.values())
-options_map = {d.__name__: d for d in options}
+options_map = {d.__name__.lower(): d for d in options}
 
 
 @click.command()

From ed8e264c688c79adfe872d6115b72e48042feebb Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com.com>
Date: Fri, 5 Apr 2024 23:42:02 +0000
Subject: [PATCH 08/27] Updated metcalf

---
 openqdc/datasets/interaction/metcalf.py | 48 +++++++++++++++++++++++++
 openqdc/raws/config_factory.py          |  5 +++
 2 files changed, 53 insertions(+)

diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py
index 819d5dc..1905918 100644
--- a/openqdc/datasets/interaction/metcalf.py
+++ b/openqdc/datasets/interaction/metcalf.py
@@ -2,12 +2,58 @@
 from typing import Dict, List
 
 import numpy as np
+from loguru import logger
 
 from openqdc.datasets.interaction.base import BaseInteractionDataset
 from openqdc.methods import InteractionMethod, InterEnergyType
 from openqdc.utils.constants import ATOM_TABLE
 
 
+def extract_raw_tar_gz(folder):
+    # archives expected in `folder`, grouped by split (train / val / test)
+    logger.info(f"Extracting all tar.gz files in {folder}")
+    expected_tar_files = {
+        "train": [
+            "TRAINING-2073-ssi-neutral.tar.gz",
+            "TRAINING-2610-donors-perturbed.tar.gz",
+            "TRAINING-4795-acceptors-perturbed.tar.gz",
+        ],
+        "val": ["VALIDATION-125-donors.tar.gz", "VALIDATION-254-acceptors.tar.gz"],
+        "test": [
+            "TEST-Acc--3-methylbutan-2-one_Don--NMe-acetamide-PLDB.tar.gz",
+            "TEST-Acc--Cyclohexanone_Don--NMe-acetamide-PLDB.tar.gz",
+            "TEST-Acc--Isoquinolone_NMe-acetamide.tar.gz",
+            "TEST-Acc--NMe-acetamide_Don--Aniline-CSD.tar.gz",
+            "TEST-Acc--NMe-acetamide_Don--Aniline-PLDB.tar.gz",
+            "TEST-Acc--NMe-acetamide_Don--N-isopropylacetamide-PLDB.tar.gz",
+            "TEST-Acc--NMe-acetamide_Don--N-phenylbenzamide-PLDB.tar.gz",
+            "TEST-Acc--NMe-acetamide_Don--Naphthalene-1H-PLDB.tar.gz",
+            "TEST-Acc--NMe-acetamide_Don--Uracil-PLDB.tar.gz",
+            "TEST-Acc--Tetrahydro-2H-pyran-2-one_NMe-acetamide-PLDB.tar.gz",
+            "TEST-NMe-acetamide_Don--Benzimidazole-PLDB.tar.gz",
+        ],
+    }
+
+    # create a folder with the same name as the tar.gz file
+    for subset in expected_tar_files:
+        for tar_file in expected_tar_files[subset]:
+            logger.info(f"Extracting {tar_file}")
+            tar_file_path = os.path.join(folder, tar_file)
+
+            # check if tar file exists
+            if not os.path.exists(tar_file_path):
+                raise FileNotFoundError(f"File {tar_file_path} not found")
+
+            # skip if extracted folder exists
+            if os.path.exists(os.path.join(folder, tar_file.replace(".tar.gz", ""))):
+                logger.info(f"Skipping {tar_file}")
+                continue
+
+            tar_folder_path = tar_file_path.replace(".tar.gz", "")
+            os.mkdir(tar_folder_path)
+            os.system(f"tar -xzf {tar_file_path} -C {tar_folder_path}")
+
+
 class Metcalf(BaseInteractionDataset):
     """
     Hydrogen-bonded dimers of NMA with 126 molecules as described in:
@@ -53,6 +99,8 @@ class Metcalf(BaseInteractionDataset):
     ]
 
     def read_raw_entries(self) -> List[Dict]:
+        # unpack each raw tar.gz archive into its own folder under self.root before reading
+        extract_raw_tar_gz(self.root)
         data = []
         for dirname in os.listdir(self.root):
             xyz_dir = os.path.join(self.root, dirname)
diff --git a/openqdc/raws/config_factory.py b/openqdc/raws/config_factory.py
index 16d8ee1..9f8c6c1 100644
--- a/openqdc/raws/config_factory.py
+++ b/openqdc/raws/config_factory.py
@@ -299,6 +299,11 @@ class DataConfigFactory:
         },
     )
 
+    metcalf = dict(
+        dataset_name="metcalf",
+        links={"model-data.tar.gz": "https://zenodo.org/records/10934211/files/model-data.tar?download=1"},
+    )
+
     misato = dict(
         dataset_name="misato",
         links={

From 18bc79c4b1cf368d75029e3431b0761022940f02 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com.com>
Date: Sat, 6 Apr 2024 00:46:11 +0000
Subject: [PATCH 09/27] bug fix and simplifying interaction dataset

---
 openqdc/datasets/base.py             |  4 ++--
 openqdc/datasets/interaction/base.py | 17 +----------------
 2 files changed, 3 insertions(+), 18 deletions(-)

diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py
index 469a033..b5bc43b 100644
--- a/openqdc/datasets/base.py
+++ b/openqdc/datasets/base.py
@@ -341,8 +341,8 @@ def save_preprocess(self, data_dict):
         # save smiles and subset
         local_path = p_join(self.preprocess_path, "props.pkl")
 
-        # assert that required keys are present in data_dict
-        assert all([key in self.pkl_data_keys for key in data_dict.keys()])
+        # assert that (required) pkl keys are present in data_dict
+        assert all([key in data_dict.keys() for key in self.pkl_data_keys])
 
         # store unique and inverse indices for str-based pkl keys
         for key in self.pkl_data_keys:
diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py
index 18b6a1e..96f39c1 100644
--- a/openqdc/datasets/interaction/base.py
+++ b/openqdc/datasets/interaction/base.py
@@ -1,6 +1,6 @@
 import os
 from os.path import join as p_join
-from typing import Dict, List, Optional
+from typing import Optional
 
 import numpy as np
 from ase.io.extxyz import write_extxyz
@@ -23,21 +23,6 @@ def pkl_data_types(self):
             "n_atoms_first": np.int32,
         }
 
-    def collate_list(self, list_entries: List[Dict]):
-        # concatenate entries
-        res = {
-            key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0)
-            for key in list_entries[0]
-            if not isinstance(list_entries[0][key], dict)
-        }
-
-        csum = np.cumsum(res.get("n_atoms"))
-        x = np.zeros((csum.shape[0], 2), dtype=np.int32)
-        x[1:, 0], x[:, 1] = csum[:-1], csum
-        res["position_idx_range"] = x
-
-        return res
-
     def __getitem__(self, idx: int):
         shift = MAX_CHARGE
         p_start, p_end = self.data["position_idx_range"][idx]

From 2a6e3ef3c1e8b47fbde1e9bcea0570675324f854 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com.com>
Date: Sat, 6 Apr 2024 01:14:56 +0000
Subject: [PATCH 10/27] Updated tests for interaction datasets

---
 openqdc/datasets/interaction/base.py  |  3 ++
 openqdc/datasets/interaction/dummy.py |  2 +-
 tests/test_dummy.py                   | 40 +++++++++++++++++++--------
 3 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py
index 96f39c1..8a8e2ea 100644
--- a/openqdc/datasets/interaction/base.py
+++ b/openqdc/datasets/interaction/base.py
@@ -42,6 +42,7 @@ def __getitem__(self, idx: int):
             forces = self._convert_array(np.array(self.data["forces"][p_start:p_end], dtype=np.float32))
 
         e0 = self._convert_array(np.array(self.__isolated_atom_energies__[..., z, c + shift].T, dtype=np.float32))
+        formation_energies = energies - e0.sum(axis=0)
 
         bunch = Bunch(
             positions=positions,
@@ -49,6 +50,8 @@ def __getitem__(self, idx: int):
             charges=c,
             e0=e0,
             energies=energies,
+            formation_energies=formation_energies,
+            per_atom_formation_energies=formation_energies / len(z),
             name=name,
             subset=subset,
             forces=forces,
diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py
index 48e92a9..71bf5ee 100644
--- a/openqdc/datasets/interaction/dummy.py
+++ b/openqdc/datasets/interaction/dummy.py
@@ -10,7 +10,7 @@ class DummyInteraction(BaseInteractionDataset):
     Dummy Interaction Dataset for Testing
     """
 
-    __name__ = "dummy"
+    __name__ = "dummy_interaction"
     __energy_methods__ = [InteractionMethod.SAPT0_AUG_CC_PVDDZ, InteractionMethod.CCSD_T_CC_PVDZ]
     __force_mask__ = [False, True]
     __energy_unit__ = "kcal/mol"
diff --git a/tests/test_dummy.py b/tests/test_dummy.py
index 08ee127..a241384 100644
--- a/tests/test_dummy.py
+++ b/tests/test_dummy.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pytest
 
+from openqdc.datasets.interaction.dummy import DummyInteraction  # noqa: E402
 from openqdc.datasets.potential.dummy import Dummy  # noqa: E402
 from openqdc.utils.io import get_local_cache
 from openqdc.utils.package_utils import has_package
@@ -12,6 +13,7 @@
 # start by removing any cached data
 cache_dir = get_local_cache()
 os.system(f"rm -rf {cache_dir}/dummy")
+os.system(f"rm -rf {cache_dir}/dummy_interaction")
 
 
 if has_package("torch"):
@@ -28,22 +30,30 @@
 
 
 @pytest.fixture
-def ds():
+def dummy():
     return Dummy()
 
 
-def test_dummy(ds):
+@pytest.fixture
+def dummy_interaction():
+    return DummyInteraction()
+
+
+@pytest.mark.parametrize("ds", ["dummy", "dummy_interaction"])
+def test_dummy(ds, request):
+    ds = request.getfixturevalue(ds)
     assert ds is not None
     assert len(ds) == 9999
     assert ds[100]
 
 
+@pytest.mark.parametrize("interaction_ds", [False, True])
 @pytest.mark.parametrize("format", ["numpy", "torch", "jax"])
-def test_array_format(format):
+def test_dummy_array_format(interaction_ds, format):
     if not has_package(format):
         pytest.skip(f"{format} is not installed, skipping test")
 
-    ds = Dummy(array_format=format)
+    ds = DummyInteraction(array_format=format) if interaction_ds else Dummy(array_format=format)
 
     keys = [
         "positions",
@@ -61,13 +71,14 @@ def test_array_format(format):
         assert isinstance(data[key], format_to_type[format])
 
 
-def test_transform():
+@pytest.mark.parametrize("interaction_ds", [False, True])
+def test_transform(interaction_ds):
     def custom_fn(bunch):
         # create new name
         bunch.new_key = bunch.name + bunch.subset
         return bunch
 
-    ds = Dummy(transform=custom_fn)
+    ds = DummyInteraction(transform=custom_fn) if interaction_ds else Dummy(transform=custom_fn)
 
     data = ds[0]
 
@@ -75,14 +86,18 @@ def custom_fn(bunch):
     assert data["new_key"] == data["name"] + data["subset"]
 
 
-def test_get_statistics(ds):
+@pytest.mark.parametrize("ds", ["dummy", "dummy_interaction"])
+def test_get_statistics(ds, request):
+    ds = request.getfixturevalue(ds)
     stats = ds.get_statistics()
 
     keys = ["ForcesCalculatorStats", "FormationEnergyStats", "PerAtomFormationEnergyStats", "TotalEnergyStats"]
     assert all(k in stats for k in keys)
 
 
-def test_energy_statistics_shapes(ds):
+@pytest.mark.parametrize("ds", ["dummy", "dummy_interaction"])
+def test_energy_statistics_shapes(ds, request):
+    ds = request.getfixturevalue(ds)
     stats = ds.get_statistics()
 
     num_methods = len(ds.energy_methods)
@@ -100,7 +115,9 @@ def test_energy_statistics_shapes(ds):
     assert total_energy_stats["std"].shape == (1, num_methods)
 
 
-def test_force_statistics_shapes(ds):
+@pytest.mark.parametrize("ds", ["dummy", "dummy_interaction"])
+def test_force_statistics_shapes(ds, request):
+    ds = request.getfixturevalue(ds)
     stats = ds.get_statistics()
     num_force_methods = len(ds.force_methods)
 
@@ -115,12 +132,13 @@ def test_force_statistics_shapes(ds):
     assert forces_stats["component_rms"].shape == (3, num_force_methods)
 
 
+@pytest.mark.parametrize("interaction_ds", [False, True])
 @pytest.mark.parametrize("format", ["numpy", "torch", "jax"])
-def test_stats_array_format(format):
+def test_stats_array_format(interaction_ds, format):
     if not has_package(format):
         pytest.skip(f"{format} is not installed, skipping test")
 
-    ds = Dummy(array_format=format)
+    ds = DummyInteraction(array_format=format) if interaction_ds else Dummy(array_format=format)
     stats = ds.get_statistics()
 
     for key in stats.keys():

From 749327386a6e34d79ae39e0474036aa09fdbba02 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com.com>
Date: Sat, 6 Apr 2024 01:21:56 +0000
Subject: [PATCH 11/27] removed stale stats in dummy interaction

---
 openqdc/datasets/interaction/dummy.py | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py
index 71bf5ee..085b732 100644
--- a/openqdc/datasets/interaction/dummy.py
+++ b/openqdc/datasets/interaction/dummy.py
@@ -2,7 +2,6 @@
 
 from openqdc.datasets.interaction.base import BaseInteractionDataset
 from openqdc.methods import InteractionMethod
-from openqdc.utils.constants import NOT_DEFINED
 
 
 class DummyInteraction(BaseInteractionDataset):
@@ -27,25 +26,6 @@ def _post_init(self, overwrite_local_cache, energy_unit, distance_unit) -> None:
         self.setup_dummy()
         return super()._post_init(overwrite_local_cache, energy_unit, distance_unit)
 
-    @property
-    def _stats(self):
-        return {
-            "formation": {
-                "energy": {
-                    "mean": np.array([[-12.94348027, -9.83037297]]),
-                    "std": np.array([[4.39971409, 3.3574188]]),
-                },
-                "forces": NOT_DEFINED,
-            },
-            "total": {
-                "energy": {
-                    "mean": np.array([[-89.44242, -1740.5336]]),
-                    "std": np.array([[29.599571, 791.48663]]),
-                },
-                "forces": NOT_DEFINED,
-            },
-        }
-
     def setup_dummy(self):
         n_atoms = np.array([np.random.randint(10, 30) for _ in range(len(self))])
         n_atoms_first = np.array([np.random.randint(1, 10) for _ in range(len(self))])

From ed73e7d97ca434fb707a3804427f833009119348 Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com.com>
Date: Sat, 6 Apr 2024 16:39:01 +0000
Subject: [PATCH 12/27] changes based on comments

---
 openqdc/datasets/interaction/base.py     |  3 ---
 openqdc/datasets/interaction/des370k.py  |  7 +-----
 openqdc/datasets/interaction/des5m.py    |  2 +-
 openqdc/datasets/interaction/dess66.py   |  8 +-----
 openqdc/datasets/interaction/dess66x8.py |  6 -----
 openqdc/datasets/interaction/dummy.py    |  5 +---
 openqdc/datasets/statistics.py           |  3 ++-
 tests/test_dummy.py                      | 31 ++++++++++++++++--------
 8 files changed, 27 insertions(+), 38 deletions(-)

diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py
index 8a8e2ea..96f39c1 100644
--- a/openqdc/datasets/interaction/base.py
+++ b/openqdc/datasets/interaction/base.py
@@ -42,7 +42,6 @@ def __getitem__(self, idx: int):
             forces = self._convert_array(np.array(self.data["forces"][p_start:p_end], dtype=np.float32))
 
         e0 = self._convert_array(np.array(self.__isolated_atom_energies__[..., z, c + shift].T, dtype=np.float32))
-        formation_energies = energies - e0.sum(axis=0)
 
         bunch = Bunch(
             positions=positions,
@@ -50,8 +49,6 @@ def __getitem__(self, idx: int):
             charges=c,
             e0=e0,
             energies=energies,
-            formation_energies=formation_energies,
-            per_atom_formation_energies=formation_energies / len(z),
             name=name,
             subset=subset,
             forces=forces,
diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py
index 250d42d..5d6e966 100644
--- a/openqdc/datasets/interaction/des370k.py
+++ b/openqdc/datasets/interaction/des370k.py
@@ -101,22 +101,17 @@ def _read_raw_entries(cls) -> List[Dict]:
         logger.info(f"Reading {cls._name} interaction data from {filepath}")
         df = pd.read_csv(filepath)
         data = []
-        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
+        for _, row in tqdm(df.iterrows(), total=df.shape[0]):
             smiles0, smiles1 = row["smiles0"], row["smiles1"]
             charge0, charge1 = row["charge0"], row["charge1"]
             natoms0, natoms1 = row["natoms0"], row["natoms1"]
             pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
 
             elements = row["elements"].split()
-
             atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1)
-
             charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
-
             atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
-
             energies = np.array(row[cls.energy_target_names].values).astype(np.float32)[None, :]
-
             name = np.array([smiles0 + "." + smiles1])
 
             subsets = []
diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py
index 979909c..49c3f4a 100644
--- a/openqdc/datasets/interaction/des5m.py
+++ b/openqdc/datasets/interaction/des5m.py
@@ -75,4 +75,4 @@ class DES5M(DES370K):
     __forces_unit__ = "kcal/mol/ang"
 
     def read_raw_entries(self) -> List[Dict]:
-        return DES5M._read_raw_entries()
+        return super()._read_raw_entries()
diff --git a/openqdc/datasets/interaction/dess66.py b/openqdc/datasets/interaction/dess66.py
index c10811b..e608adb 100644
--- a/openqdc/datasets/interaction/dess66.py
+++ b/openqdc/datasets/interaction/dess66.py
@@ -96,24 +96,18 @@ def read_raw_entries(self) -> List[Dict]:
         logger.info(f"Reading DESS66 interaction data from {self.filepath}")
         df = pd.read_csv(self.filepath)
         data = []
-        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
+        for _, row in tqdm(df.iterrows(), total=df.shape[0]):
             smiles0, smiles1 = row["smiles0"], row["smiles1"]
             charge0, charge1 = row["charge0"], row["charge1"]
             natoms0, natoms1 = row["natoms0"], row["natoms1"]
             pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
 
             elements = row["elements"].split()
-
             atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1)
-
             charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
-
             atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
-
             energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
-
             name = np.array([smiles0 + "." + smiles1])
-
             subset = row["system_name"]
 
             item = dict(
diff --git a/openqdc/datasets/interaction/dess66x8.py b/openqdc/datasets/interaction/dess66x8.py
index 709620a..8467eef 100644
--- a/openqdc/datasets/interaction/dess66x8.py
+++ b/openqdc/datasets/interaction/dess66x8.py
@@ -104,17 +104,11 @@ def read_raw_entries(self) -> List[Dict]:
             pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
 
             elements = row["elements"].split()
-
             atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1)
-
             charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
-
             atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
-
             energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
-
             name = np.array([smiles0 + "." + smiles1])
-
             subset = row["system_name"]
 
             item = dict(
diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py
index 085b732..4dcb8a3 100644
--- a/openqdc/datasets/interaction/dummy.py
+++ b/openqdc/datasets/interaction/dummy.py
@@ -11,14 +11,13 @@ class DummyInteraction(BaseInteractionDataset):
 
     __name__ = "dummy_interaction"
     __energy_methods__ = [InteractionMethod.SAPT0_AUG_CC_PVDDZ, InteractionMethod.CCSD_T_CC_PVDZ]
-    __force_mask__ = [False, True]
+    __force_mask__ = [False, False]
     __energy_unit__ = "kcal/mol"
     __distance_unit__ = "ang"
     __forces_unit__ = "kcal/mol/ang"
 
     energy_target_names = [f"energy{i}" for i in range(len(__energy_methods__))]
 
-    force_target_names = [f"forces{i}" for i in range(len(__force_mask__))]
     __isolated_atom_energies__ = []
     __average_n_atoms__ = None
 
@@ -48,7 +47,6 @@ def setup_dummy(self):
         name = [f"dummy_{i}" for i in range(len(self))]
         subset = ["dummy" for i in range(len(self))]
         energies = np.random.rand(len(self), len(self.energy_methods))
-        forces = np.concatenate([np.random.randn(size, 3, len(self.force_methods)) * 100 for size in n_atoms])
         self.data = dict(
             n_atoms=n_atoms,
             position_idx_range=position_idx_range,
@@ -57,7 +55,6 @@ def setup_dummy(self):
             subset=subset,
             energies=energies,
             n_atoms_first=n_atoms_first,
-            forces=forces,
         )
         self.__average_nb_atoms__ = self.data["n_atoms"].mean()
 
diff --git a/openqdc/datasets/statistics.py b/openqdc/datasets/statistics.py
index e4fe9e5..2122271 100644
--- a/openqdc/datasets/statistics.py
+++ b/openqdc/datasets/statistics.py
@@ -21,7 +21,8 @@ def to_dict(self):
 
     def transform(self, func):
         for k, v in self.to_dict().items():
-            setattr(self, k, func(v))
+            if v is not None:
+                setattr(self, k, func(v))
 
 
 @dataclass
diff --git a/tests/test_dummy.py b/tests/test_dummy.py
index a241384..e38a6dc 100644
--- a/tests/test_dummy.py
+++ b/tests/test_dummy.py
@@ -10,10 +10,15 @@
 from openqdc.utils.io import get_local_cache
 from openqdc.utils.package_utils import has_package
 
+
 # start by removing any cached data
-cache_dir = get_local_cache()
-os.system(f"rm -rf {cache_dir}/dummy")
-os.system(f"rm -rf {cache_dir}/dummy_interaction")
+@pytest.fixture(autouse=True)
+def clean_before_run():
+    # start by removing any cached data
+    cache_dir = get_local_cache()
+    os.system(f"rm -rf {cache_dir}/dummy")
+    os.system(f"rm -rf {cache_dir}/dummy_interaction")
+    yield
 
 
 if has_package("torch"):
@@ -62,12 +67,15 @@ def test_dummy_array_format(interaction_ds, format):
         "energies",
         "forces",
         "e0",
-        "formation_energies",
-        "per_atom_formation_energies",
     ]
+    if not interaction_ds:
+        # additional keys returned from the potential dataset
+        keys.extend(["formation_energies", "per_atom_formation_energies"])
 
     data = ds[0]
     for key in keys:
+        if data[key] is None:
+            continue
         assert isinstance(data[key], format_to_type[format])
 
 
@@ -125,11 +133,12 @@ def test_force_statistics_shapes(ds, request):
     keys = ["mean", "std", "component_mean", "component_std", "component_rms"]
     assert all(k in forces_stats for k in keys)
 
-    assert forces_stats["mean"].shape == (1, num_force_methods)
-    assert forces_stats["std"].shape == (1, num_force_methods)
-    assert forces_stats["component_mean"].shape == (3, num_force_methods)
-    assert forces_stats["component_std"].shape == (3, num_force_methods)
-    assert forces_stats["component_rms"].shape == (3, num_force_methods)
+    if len(ds.force_methods) > 0:
+        assert forces_stats["mean"].shape == (1, num_force_methods)
+        assert forces_stats["std"].shape == (1, num_force_methods)
+        assert forces_stats["component_mean"].shape == (3, num_force_methods)
+        assert forces_stats["component_std"].shape == (3, num_force_methods)
+        assert forces_stats["component_rms"].shape == (3, num_force_methods)
 
 
 @pytest.mark.parametrize("interaction_ds", [False, True])
@@ -143,4 +152,6 @@ def test_stats_array_format(interaction_ds, format):
 
     for key in stats.keys():
         for k, v in stats[key].items():
+            if v is None:
+                continue
             assert isinstance(v, format_to_type[format])

From 03590229872d72c798bd1f8a5e44455287a806da Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Sat, 6 Apr 2024 12:49:53 -0400
Subject: [PATCH 13/27] Clean metcalf

---
 openqdc/datasets/interaction/metcalf.py | 144 ++++++++++++------------
 tests/test_dummy.py                     |  12 +-
 2 files changed, 81 insertions(+), 75 deletions(-)

diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py
index 1905918..34da7ef 100644
--- a/openqdc/datasets/interaction/metcalf.py
+++ b/openqdc/datasets/interaction/metcalf.py
@@ -1,57 +1,85 @@
 import os
+from glob import glob
+from io import StringIO
+from os.path import join as p_join
 from typing import Dict, List
 
 import numpy as np
 from loguru import logger
+from tqdm import tqdm
 
 from openqdc.datasets.interaction.base import BaseInteractionDataset
 from openqdc.methods import InteractionMethod, InterEnergyType
+from openqdc.raws.config_factory import decompress_tar_gz
 from openqdc.utils.constants import ATOM_TABLE
 
+EXPECTED_TAR_FILES = {
+    "train": [
+        "TRAINING-2073-ssi-neutral.tar.gz",
+        "TRAINING-2610-donors-perturbed.tar.gz",
+        "TRAINING-4795-acceptors-perturbed.tar.gz",
+    ],
+    "val": ["VALIDATION-125-donors.tar.gz", "VALIDATION-254-acceptors.tar.gz"],
+    "test": [
+        "TEST-Acc--3-methylbutan-2-one_Don--NMe-acetamide-PLDB.tar.gz",
+        "TEST-Acc--Cyclohexanone_Don--NMe-acetamide-PLDB.tar.gz",
+        "TEST-Acc--Isoquinolone_NMe-acetamide.tar.gz",
+        "TEST-Acc--NMe-acetamide_Don--Aniline-CSD.tar.gz",
+        "TEST-Acc--NMe-acetamide_Don--Aniline-PLDB.tar.gz",
+        "TEST-Acc--NMe-acetamide_Don--N-isopropylacetamide-PLDB.tar.gz",
+        "TEST-Acc--NMe-acetamide_Don--N-phenylbenzamide-PLDB.tar.gz",
+        "TEST-Acc--NMe-acetamide_Don--Naphthalene-1H-PLDB.tar.gz",
+        "TEST-Acc--NMe-acetamide_Don--Uracil-PLDB.tar.gz",
+        "TEST-Acc--Tetrahydro-2H-pyran-2-one_NMe-acetamide-PLDB.tar.gz",
+        "TEST-NMe-acetamide_Don--Benzimidazole-PLDB.tar.gz",
+    ],
+}
+
 
 def extract_raw_tar_gz(folder):
-    # go over all files
     logger.info(f"Extracting all tar.gz files in {folder}")
-    expected_tar_files = {
-        "train": [
-            "TRAINING-2073-ssi-neutral.tar.gz",
-            "TRAINING-2610-donors-perturbed.tar.gz",
-            "TRAINING-4795-acceptors-perturbed.tar.gz",
-        ],
-        "val": ["VALIDATION-125-donors.tar.gz", "VALIDATION-254-acceptors.tar.gz"],
-        "test": [
-            "TEST-Acc--3-methylbutan-2-one_Don--NMe-acetamide-PLDB.tar.gz",
-            "TEST-Acc--Cyclohexanone_Don--NMe-acetamide-PLDB.tar.gz",
-            "TEST-Acc--Isoquinolone_NMe-acetamide.tar.gz",
-            "TEST-Acc--NMe-acetamide_Don--Aniline-CSD.tar.gz",
-            "TEST-Acc--NMe-acetamide_Don--Aniline-PLDB.tar.gz",
-            "TEST-Acc--NMe-acetamide_Don--N-isopropylacetamide-PLDB.tar.gz",
-            "TEST-Acc--NMe-acetamide_Don--N-phenylbenzamide-PLDB.tar.gz",
-            "TEST-Acc--NMe-acetamide_Don--Naphthalene-1H-PLDB.tar.gz",
-            "TEST-Acc--NMe-acetamide_Don--Uracil-PLDB.tar.gz",
-            "TEST-Acc--Tetrahydro-2H-pyran-2-one_NMe-acetamide-PLDB.tar.gz",
-            "TEST-NMe-acetamide_Don--Benzimidazole-PLDB.tar.gz",
-        ],
-    }
-
-    # create a folder with the same name as the tar.gz file
-    for subset in expected_tar_files:
-        for tar_file in expected_tar_files[subset]:
-            logger.info(f"Extracting {tar_file}")
-            tar_file_path = os.path.join(folder, tar_file)
-
-            # check if tar file exists
-            if not os.path.exists(tar_file_path):
-                raise FileNotFoundError(f"File {tar_file_path} not found")
-
-            # skip if extracted folder exists
-            if os.path.exists(os.path.join(folder, tar_file.replace(".tar.gz", ""))):
-                logger.info(f"Skipping {tar_file}")
-                continue
-
-            tar_folder_path = tar_file_path.replace(".tar.gz", "")
-            os.mkdir(tar_folder_path)
-            os.system(f"tar -xzf {tar_file_path} -C {tar_folder_path}")
+    for subset in EXPECTED_TAR_FILES:
+        for tar_file in EXPECTED_TAR_FILES[subset]:
+            tar_file_path = p_join(folder, tar_file)
+            try:
+                decompress_tar_gz(tar_file_path)
+            except FileNotFoundError as e:
+                raise FileNotFoundError(f"File {tar_file_path} not found") from e
+
+
+def content_to_xyz(content, subset):
+    try:
+        num_atoms = np.array([int(content.split("\n")[0])])
+        tmp = content.split("\n")[1].split(",")
+        name = tmp[0]
+        e = tmp[1:-1]
+    except Exception as e:
+        logger.warning(f"Encountered exception in {content} : {e}")
+        return None
+
+    s = StringIO(content)
+    d = np.loadtxt(s, skiprows=2, dtype="str")
+    z, positions = d[:, 0], d[:, 1:].astype(np.float32)
+    z = np.array([ATOM_TABLE.GetAtomicNumber(s) for s in z])
+    xs = np.stack((z, np.zeros_like(z)), axis=-1)
+
+    item = dict(
+        n_atoms=num_atoms,
+        subset=np.array([subset]),
+        energies=e,
+        atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32),
+        name=np.array([name]),
+        n_atoms_first=np.array([-1]),
+    )
+
+    return item
+
+
+def read_xyz(fname, subset):
+    with open(fname, "r") as f:
+        contents = f.read().split("\n\n")
+    res = [content_to_xyz(content, subset) for content in tqdm(contents)]
+    return res
 
 
 class Metcalf(BaseInteractionDataset):
@@ -102,35 +130,9 @@ def read_raw_entries(self) -> List[Dict]:
         # extract in folders
         extract_raw_tar_gz(self.root)
         data = []
-        for dirname in os.listdir(self.root):
-            xyz_dir = os.path.join(self.root, dirname)
-            if not os.path.isdir(xyz_dir):
-                continue
+        for _, dirname, _ in os.walk(self.root):
+            xyz_dir = p_join(self.root, dirname)
             subset = np.array([dirname.split("-")[0].lower()])  # training, validation, or test
-            for filename in os.listdir(xyz_dir):
-                if not filename.endswith(".xyz"):
-                    continue
-                lines = list(map(lambda x: x.strip(), open(os.path.join(xyz_dir, filename), "r").readlines()))
-                line_two = lines[1].split(",")
-                energies = np.array([line_two[1:6]], dtype=np.float32)
-                num_atoms = np.array([int(lines[0])])
-
-                elem_xyz = np.array([x.split() for x in lines[2:]])
-                elements = elem_xyz[:, 0]
-                xyz = elem_xyz[:, 1:].astype(np.float32)
-                atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1)
-                charges = np.expand_dims(np.array([0] * num_atoms[0]), axis=1)
-
-                atomic_inputs = np.concatenate((atomic_nums, charges, xyz), axis=-1, dtype=np.float32)
-
-                item = dict(
-                    n_atoms=num_atoms,
-                    subset=subset,
-                    energies=energies,
-                    positions=xyz,
-                    atomic_inputs=atomic_inputs,
-                    name=np.array([""]),
-                    n_atoms_first=np.array([-1]),
-                )
-                data.append(item)
+            for filename in glob(xyz_dir + f"{os.sep}*.xyz"):
+                data.append(read_xyz(filename, subset))
         return data
diff --git a/tests/test_dummy.py b/tests/test_dummy.py
index a241384..7efbc18 100644
--- a/tests/test_dummy.py
+++ b/tests/test_dummy.py
@@ -10,10 +10,14 @@
 from openqdc.utils.io import get_local_cache
 from openqdc.utils.package_utils import has_package
 
-# start by removing any cached data
-cache_dir = get_local_cache()
-os.system(f"rm -rf {cache_dir}/dummy")
-os.system(f"rm -rf {cache_dir}/dummy_interaction")
+
+@pytest.fixture(autouse=True)
+def clean_before_run():
+    # start by removing any cached data
+    cache_dir = get_local_cache()
+    os.system(f"rm -rf {cache_dir}/dummy")
+    os.system(f"rm -rf {cache_dir}/dummy_interaction")
+    yield
 
 
 if has_package("torch"):

From 33fa342b87f8f72be0170247474f0a7fa79a1f78 Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Sat, 6 Apr 2024 12:58:32 -0400
Subject: [PATCH 14/27] Simplification

---
 openqdc/datasets/interaction/metcalf.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py
index 34da7ef..99da5b0 100644
--- a/openqdc/datasets/interaction/metcalf.py
+++ b/openqdc/datasets/interaction/metcalf.py
@@ -130,9 +130,6 @@ def read_raw_entries(self) -> List[Dict]:
         # extract in folders
         extract_raw_tar_gz(self.root)
         data = []
-        for _, dirname, _ in os.walk(self.root):
-            xyz_dir = p_join(self.root, dirname)
-            subset = np.array([dirname.split("-")[0].lower()])  # training, validation, or test
-            for filename in glob(xyz_dir + f"{os.sep}*.xyz"):
-                data.append(read_xyz(filename, subset))
+        for filename in glob(self.root + f"{os.sep}*.xyz"):
+            data.append(read_xyz(filename, self.__name__))
         return data

From cd486a885719a638d15889f8832cefc18342c73f Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Sat, 6 Apr 2024 13:15:03 -0400
Subject: [PATCH 15/27] cleaned des

---
 openqdc/datasets/interaction/des370k.py | 96 ++++++++++++++-----------
 openqdc/datasets/interaction/des5m.py   |  2 +-
 2 files changed, 57 insertions(+), 41 deletions(-)

diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py
index 250d42d..ee72923 100644
--- a/openqdc/datasets/interaction/des370k.py
+++ b/openqdc/datasets/interaction/des370k.py
@@ -13,6 +13,58 @@
 from openqdc.utils.molecule import molecule_groups
 
 
+def parse_des_df(row, energy_target_names):
+    smiles0, smiles1 = row["smiles0"], row["smiles1"]
+    charge0, charge1 = row["charge0"], row["charge1"]
+    natoms0, natoms1 = row["natoms0"], row["natoms1"]
+    pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
+    elements = row["elements"].split()
+    atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1)
+    charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+    atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+    energies = np.array(row[energy_target_names].values).astype(np.float32)[None, :]
+    name = np.array([smiles0 + "." + smiles1])
+    return {
+        "energies": energies,
+        "n_atoms": np.array([natoms0 + natoms1], dtype=np.int32),
+        "name": name,
+        "atomic_inputs": atomic_inputs,
+        "charges": charges,
+        "atomic_nums": atomic_nums,
+        "elements": elements,
+        "natoms0": natoms0,
+        "natoms1": natoms1,
+        "smiles0": smiles0,
+        "smiles1": smiles1,
+        "charge0": charge0,
+        "charge1": charge1,
+    }
+
+
+def create_subset(smiles0, smiles1):
+    subsets = []
+    for smiles in [smiles0, smiles1]:
+        found = False
+        for functional_group, smiles_set in molecule_groups.items():
+            if smiles in smiles_set:
+                subsets.append(functional_group)
+                found = True
+        if not found:
+            logger.info(f"molecule group lookup failed for {smiles}")
+    return subsets
+
+
+def convert_to_record(item):
+    return dict(
+        energies=item["energies"],
+        subset=np.array([item["subsets"]]),
+        n_atoms=np.array([item["natoms0"] + item["natoms1"]], dtype=np.int32),
+        n_atoms_first=np.array([item["natoms0"]], dtype=np.int32),
+        atomic_inputs=item["atomic_inputs"],
+        name=item["name"],
+    )
+
+
 class DES370K(BaseInteractionDataset):
     """
     DE Shaw Research interaction energy of over 370K
@@ -95,50 +147,14 @@ class DES370K(BaseInteractionDataset):
     def _root(cls):
         return os.path.join(get_local_cache(), cls._name)
 
-    @classmethod
-    def _read_raw_entries(cls) -> List[Dict]:
+    def read_raw_entries(cls) -> List[Dict]:
         filepath = os.path.join(cls._root(), cls._filename)
         logger.info(f"Reading {cls._name} interaction data from {filepath}")
         df = pd.read_csv(filepath)
         data = []
         for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
-            smiles0, smiles1 = row["smiles0"], row["smiles1"]
-            charge0, charge1 = row["charge0"], row["charge1"]
-            natoms0, natoms1 = row["natoms0"], row["natoms1"]
-            pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
-
-            elements = row["elements"].split()
-
-            atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1)
-
-            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
-
-            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
-
-            energies = np.array(row[cls.energy_target_names].values).astype(np.float32)[None, :]
-
-            name = np.array([smiles0 + "." + smiles1])
-
-            subsets = []
-            for smiles in [smiles0, smiles1]:
-                found = False
-                for functional_group, smiles_set in molecule_groups.items():
-                    if smiles in smiles_set:
-                        subsets.append(functional_group)
-                        found = True
-                if not found:
-                    logger.info(f"molecule group lookup failed for {smiles}")
-
-            item = dict(
-                energies=energies,
-                subset=np.array([subsets]),
-                n_atoms=np.array([natoms0 + natoms1], dtype=np.int32),
-                n_atoms_first=np.array([natoms0], dtype=np.int32),
-                atomic_inputs=atomic_inputs,
-                name=name,
-            )
+            item = parse_des_df(row, cls.energy_target_names)
+            item["subset"] = create_subset(item["smiles0"], item["smiles1"])
+            item = convert_to_record(item)
             data.append(item)
         return data
-
-    def read_raw_entries(self) -> List[Dict]:
-        return DES370K._read_raw_entries()
diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py
index 979909c..e274ba8 100644
--- a/openqdc/datasets/interaction/des5m.py
+++ b/openqdc/datasets/interaction/des5m.py
@@ -75,4 +75,4 @@ class DES5M(DES370K):
     __forces_unit__ = "kcal/mol/ang"
 
     def read_raw_entries(self) -> List[Dict]:
-        return DES5M._read_raw_entries()
+        return super().read_raw_entries()

From 80d7371823a875db92e641a6d6eb33d44a92b1be Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Sat, 6 Apr 2024 13:31:42 -0400
Subject: [PATCH 16/27] Simplified des dataset

---
 openqdc/datasets/interaction/__init__.py      |   6 +-
 .../interaction/{des370k.py => des.py}        |   0
 openqdc/datasets/interaction/des5m.py         | 161 +++++++++++++++++-
 openqdc/datasets/interaction/dess66.py        |  59 ++++---
 openqdc/datasets/interaction/dess66x8.py      | 129 --------------
 5 files changed, 190 insertions(+), 165 deletions(-)
 rename openqdc/datasets/interaction/{des370k.py => des.py} (100%)
 delete mode 100644 openqdc/datasets/interaction/dess66x8.py

diff --git a/openqdc/datasets/interaction/__init__.py b/openqdc/datasets/interaction/__init__.py
index fa3bebd..bf8c834 100644
--- a/openqdc/datasets/interaction/__init__.py
+++ b/openqdc/datasets/interaction/__init__.py
@@ -1,8 +1,6 @@
 from .base import BaseInteractionDataset  # noqa
-from .des5m import DES5M
-from .des370k import DES370K
-from .dess66 import DESS66
-from .dess66x8 import DESS66x8
+from .des import DES5M, DES370K
+from .dess66 import DESS66, DESS66x8
 from .L7 import L7
 from .metcalf import Metcalf
 from .splinter import Splinter
diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des.py
similarity index 100%
rename from openqdc/datasets/interaction/des370k.py
rename to openqdc/datasets/interaction/des.py
diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py
index e274ba8..710fa39 100644
--- a/openqdc/datasets/interaction/des5m.py
+++ b/openqdc/datasets/interaction/des5m.py
@@ -1,7 +1,163 @@
+import os
 from typing import Dict, List
 
-from openqdc.datasets.interaction.des370k import DES370K
+import numpy as np
+import pandas as pd
+from loguru import logger
+from tqdm import tqdm
+
+from openqdc.datasets.interaction.base import BaseInteractionDataset
 from openqdc.methods import InteractionMethod, InterEnergyType
+from openqdc.utils.constants import ATOM_TABLE
+from openqdc.utils.io import get_local_cache
+from openqdc.utils.molecule import molecule_groups
+
+
+def parse_des_df(row, energy_target_names):
+    smiles0, smiles1 = row["smiles0"], row["smiles1"]
+    charge0, charge1 = row["charge0"], row["charge1"]
+    natoms0, natoms1 = row["natoms0"], row["natoms1"]
+    pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
+    elements = row["elements"].split()
+    atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1)
+    charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
+    atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+    energies = np.array(row[energy_target_names].values).astype(np.float32)[None, :]
+    name = np.array([smiles0 + "." + smiles1])
+    return {
+        "energies": energies,
+        "n_atoms": np.array([natoms0 + natoms1], dtype=np.int32),
+        "name": name,
+        "atomic_inputs": atomic_inputs,
+        "charges": charges,
+        "atomic_nums": atomic_nums,
+        "elements": elements,
+        "natoms0": natoms0,
+        "natoms1": natoms1,
+        "smiles0": smiles0,
+        "smiles1": smiles1,
+        "charge0": charge0,
+        "charge1": charge1,
+    }
+
+
+def create_subset(smiles0, smiles1):
+    subsets = []
+    for smiles in [smiles0, smiles1]:
+        found = False
+        for functional_group, smiles_set in molecule_groups.items():
+            if smiles in smiles_set:
+                subsets.append(functional_group)
+                found = True
+        if not found:
+            logger.info(f"molecule group lookup failed for {smiles}")
+    return subsets
+
+
+def convert_to_record(item):
+    return dict(
+        energies=item["energies"],
+        subset=np.array([item["subsets"]]),
+        n_atoms=np.array([item["natoms0"] + item["natoms1"]], dtype=np.int32),
+        n_atoms_first=np.array([item["natoms0"]], dtype=np.int32),
+        atomic_inputs=item["atomic_inputs"],
+        name=item["name"],
+    )
+
+
+class DES370K(BaseInteractionDataset):
+    """
+    DE Shaw Research interaction energy of over 370K
+    small molecule dimers as described in the paper:
+
+    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
+    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
+    Sci Data 8, 55 (2021).
+    https://doi.org/10.1038/s41597-021-00833-x
+    """
+
+    __name__ = "des370k_interaction"
+    __energy_unit__ = "kcal/mol"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "kcal/mol/ang"
+    __energy_methods__ = [
+        InteractionMethod.MP2_CC_PVDZ,
+        InteractionMethod.MP2_CC_PVQZ,
+        InteractionMethod.MP2_CC_PVTZ,
+        InteractionMethod.MP2_CBS,
+        InteractionMethod.CCSD_T_CC_PVDZ,
+        InteractionMethod.CCSD_T_CBS,
+        InteractionMethod.CCSD_T_NN,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+    ]
+
+    __energy_type__ = [
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.ES,
+        InterEnergyType.EX,
+        InterEnergyType.EX_S2,
+        InterEnergyType.IND,
+        InterEnergyType.EX_IND,
+        InterEnergyType.DISP,
+        InterEnergyType.EX_DISP_OS,
+        InterEnergyType.EX_DISP_SS,
+        InterEnergyType.DELTA_HF,
+    ]
+
+    energy_target_names = [
+        "cc_MP2_all",
+        "qz_MP2_all",
+        "tz_MP2_all",
+        "cbs_MP2_all",
+        "cc_CCSD(T)_all",
+        "cbs_CCSD(T)_all",
+        "nn_CCSD(T)_all",
+        "sapt_all",
+        "sapt_es",
+        "sapt_ex",
+        "sapt_exs2",
+        "sapt_ind",
+        "sapt_exind",
+        "sapt_disp",
+        "sapt_exdisp_os",
+        "sapt_exdisp_ss",
+        "sapt_delta_HF",
+    ]
+
+    _filename = "DES370K.csv"
+    _name = "des370k_interaction"
+
+    @classmethod
+    def _root(cls):
+        return os.path.join(get_local_cache(), cls._name)
+
+    def read_raw_entries(cls) -> List[Dict]:
+        filepath = os.path.join(cls._root(), cls._filename)
+        logger.info(f"Reading {cls._name} interaction data from {filepath}")
+        df = pd.read_csv(filepath)
+        data = []
+        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
+            item = parse_des_df(row, cls.energy_target_names)
+            item["subset"] = create_subset(item["smiles0"], item["smiles1"])
+            item = convert_to_record(item)
+            data.append(item)
+        return data
 
 
 class DES5M(DES370K):
@@ -73,6 +229,3 @@ class DES5M(DES370K):
     __energy_unit__ = "kcal/mol"
     __distance_unit__ = "ang"
     __forces_unit__ = "kcal/mol/ang"
-
-    def read_raw_entries(self) -> List[Dict]:
-        return super().read_raw_entries()
diff --git a/openqdc/datasets/interaction/dess66.py b/openqdc/datasets/interaction/dess66.py
index c10811b..45bf6bd 100644
--- a/openqdc/datasets/interaction/dess66.py
+++ b/openqdc/datasets/interaction/dess66.py
@@ -1,14 +1,18 @@
 import os
 from typing import Dict, List
 
-import numpy as np
 import pandas as pd
 from loguru import logger
 from tqdm import tqdm
 
 from openqdc.datasets.interaction.base import BaseInteractionDataset
+from openqdc.datasets.interaction.des370k import convert_to_record, parse_des_df
 from openqdc.methods import InteractionMethod, InterEnergyType
-from openqdc.utils.constants import ATOM_TABLE
+
+CSV_NAME = {
+    "des_s66": "DESS66.csv",
+    "des_s66x8": "DESS66x8.csv",
+}
 
 
 class DESS66(BaseInteractionDataset):
@@ -91,38 +95,37 @@ class DESS66(BaseInteractionDataset):
         "sapt_delta_HF",
     ]
 
+    @property
+    def csv_path(self):
+        return os.path.join(self.root, CSV_NAME[self.__name__])
+
     def read_raw_entries(self) -> List[Dict]:
-        self.filepath = os.path.join(self.root, "DESS66.csv")
-        logger.info(f"Reading DESS66 interaction data from {self.filepath}")
-        df = pd.read_csv(self.filepath)
+        filepath = self.csv_path
+        logger.info(f"Reading DESS66 interaction data from {filepath}")
+        df = pd.read_csv(filepath)
         data = []
         for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
-            smiles0, smiles1 = row["smiles0"], row["smiles1"]
-            charge0, charge1 = row["charge0"], row["charge1"]
-            natoms0, natoms1 = row["natoms0"], row["natoms1"]
-            pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
-
-            elements = row["elements"].split()
-
-            atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1)
+            item = parse_des_df(row)
+            item["subset"] = row["system_name"]
+            data.append(convert_to_record(item))
+        return data
 
-            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
 
-            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
+class DESS66x8(DESS66):
+    """
+    DE Shaw Research interaction energy
+    estimates of all 528 conformers from
+    the original S66x8 dataset as described
+    in the paper:
 
-            energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
+    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
+    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
+    Sci Data 8, 55 (2021).
+    https://doi.org/10.1038/s41597-021-00833-x
 
-            name = np.array([smiles0 + "." + smiles1])
+    Data was downloaded from Zenodo:
 
-            subset = row["system_name"]
+    https://zenodo.org/records/5676284
+    """
 
-            item = dict(
-                energies=energies,
-                subset=np.array([subset]),
-                n_atoms=np.array([natoms0 + natoms1], dtype=np.int32),
-                n_atoms_first=np.array([natoms0], dtype=np.int32),
-                atomic_inputs=atomic_inputs,
-                name=name,
-            )
-            data.append(item)
-        return data
+    __name__ = "des_s66x8"
diff --git a/openqdc/datasets/interaction/dess66x8.py b/openqdc/datasets/interaction/dess66x8.py
deleted file mode 100644
index 709620a..0000000
--- a/openqdc/datasets/interaction/dess66x8.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import os
-from typing import Dict, List
-
-import numpy as np
-import pandas as pd
-from loguru import logger
-from tqdm import tqdm
-
-from openqdc.datasets.interaction.base import BaseInteractionDataset
-from openqdc.methods import InteractionMethod, InterEnergyType
-from openqdc.utils.constants import ATOM_TABLE
-
-
-class DESS66x8(BaseInteractionDataset):
-    """
-    DE Shaw Research interaction energy
-    estimates of all 528 conformers from
-    the original S66x8 dataset as described
-    in the paper:
-
-    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
-    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
-    Sci Data 8, 55 (2021).
-    https://doi.org/10.1038/s41597-021-00833-x
-
-    Data was downloaded from Zenodo:
-
-    https://zenodo.org/records/5676284
-    """
-
-    __name__ = "des_s66x8"
-    __energy_unit__ = "kcal/mol"
-    __distance_unit__ = "ang"
-    __forces_unit__ = "kcal/mol/ang"
-    __energy_methods__ = [
-        InteractionMethod.MP2_CC_PVDZ,
-        InteractionMethod.MP2_CC_PVQZ,
-        InteractionMethod.MP2_CC_PVTZ,
-        InteractionMethod.MP2_CBS,
-        InteractionMethod.CCSD_T_CC_PVDZ,
-        InteractionMethod.CCSD_T_CBS,
-        InteractionMethod.CCSD_T_NN,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-    ]
-
-    __energy_type__ = [
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.ES,
-        InterEnergyType.EX,
-        InterEnergyType.EX_S2,
-        InterEnergyType.IND,
-        InterEnergyType.EX_IND,
-        InterEnergyType.DISP,
-        InterEnergyType.EX_DISP_OS,
-        InterEnergyType.EX_DISP_SS,
-        InterEnergyType.DELTA_HF,
-    ]
-
-    energy_target_names = [
-        "cc_MP2_all",
-        "qz_MP2_all",
-        "tz_MP2_all",
-        "cbs_MP2_all",
-        "cc_CCSD(T)_all",
-        "cbs_CCSD(T)_all",
-        "nn_CCSD(T)_all",
-        "sapt_all",
-        "sapt_es",
-        "sapt_ex",
-        "sapt_exs2",
-        "sapt_ind",
-        "sapt_exind",
-        "sapt_disp",
-        "sapt_exdisp_os",
-        "sapt_exdisp_ss",
-        "sapt_delta_HF",
-    ]
-
-    def read_raw_entries(self) -> List[Dict]:
-        self.filepath = os.path.join(self.root, "DESS66x8.csv")
-        logger.info(f"Reading DESS66x8 interaction data from {self.filepath}")
-        df = pd.read_csv(self.filepath)
-        data = []
-        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
-            smiles0, smiles1 = row["smiles0"], row["smiles1"]
-            charge0, charge1 = row["charge0"], row["charge1"]
-            natoms0, natoms1 = row["natoms0"], row["natoms1"]
-            pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
-
-            elements = row["elements"].split()
-
-            atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1)
-
-            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
-
-            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
-
-            energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]
-
-            name = np.array([smiles0 + "." + smiles1])
-
-            subset = row["system_name"]
-
-            item = dict(
-                energies=energies,
-                subset=np.array([subset]),
-                n_atoms=np.array([natoms0 + natoms1], dtype=np.int32),
-                n_atoms_first=np.array([natoms0], dtype=np.int32),
-                atomic_inputs=atomic_inputs,
-                name=name,
-            )
-            data.append(item)
-        return data

From f3d205ccca65f695bc4beb38d8d5755ebfce31b0 Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Sat, 6 Apr 2024 13:34:21 -0400
Subject: [PATCH 17/27] removed redundant dataset files

---
 openqdc/datasets/interaction/des.py   |  71 ++++++++
 openqdc/datasets/interaction/des5m.py | 231 --------------------------
 2 files changed, 71 insertions(+), 231 deletions(-)
 delete mode 100644 openqdc/datasets/interaction/des5m.py

diff --git a/openqdc/datasets/interaction/des.py b/openqdc/datasets/interaction/des.py
index ee72923..710fa39 100644
--- a/openqdc/datasets/interaction/des.py
+++ b/openqdc/datasets/interaction/des.py
@@ -158,3 +158,74 @@ def read_raw_entries(cls) -> List[Dict]:
             item = convert_to_record(item)
             data.append(item)
         return data
+
+
+class DES5M(DES370K):
+    """
+    DE Shaw Research interaction energy calculations for
+    over 5M small molecule dimers as described in the paper:
+
+    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
+    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
+    Sci Data 8, 55 (2021).
+    https://doi.org/10.1038/s41597-021-00833-x
+    """
+
+    __name__ = "des5m_interaction"
+    __energy_methods__ = [
+        InteractionMethod.MP2_CC_PVQZ,
+        InteractionMethod.MP2_CC_PVTZ,
+        InteractionMethod.MP2_CBS,
+        InteractionMethod.CCSD_T_NN,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
+    ]
+
+    __energy_type__ = [
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.TOTAL,
+        InterEnergyType.ES,
+        InterEnergyType.EX,
+        InterEnergyType.EX_S2,
+        InterEnergyType.IND,
+        InterEnergyType.EX_IND,
+        InterEnergyType.DISP,
+        InterEnergyType.EX_DISP_OS,
+        InterEnergyType.EX_DISP_SS,
+        InterEnergyType.DELTA_HF,
+    ]
+
+    energy_target_names = [
+        "qz_MP2_all",
+        "tz_MP2_all",
+        "cbs_MP2_all",
+        "nn_CCSD(T)_all",
+        "sapt_all",
+        "sapt_es",
+        "sapt_ex",
+        "sapt_exs2",
+        "sapt_ind",
+        "sapt_exind",
+        "sapt_disp",
+        "sapt_exdisp_os",
+        "sapt_exdisp_ss",
+        "sapt_delta_HF",
+    ]
+
+    _filename = "DES5M.csv"
+    _name = "des5m_interaction"
+
+    __energy_unit__ = "kcal/mol"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "kcal/mol/ang"
diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py
deleted file mode 100644
index 710fa39..0000000
--- a/openqdc/datasets/interaction/des5m.py
+++ /dev/null
@@ -1,231 +0,0 @@
-import os
-from typing import Dict, List
-
-import numpy as np
-import pandas as pd
-from loguru import logger
-from tqdm import tqdm
-
-from openqdc.datasets.interaction.base import BaseInteractionDataset
-from openqdc.methods import InteractionMethod, InterEnergyType
-from openqdc.utils.constants import ATOM_TABLE
-from openqdc.utils.io import get_local_cache
-from openqdc.utils.molecule import molecule_groups
-
-
-def parse_des_df(row, energy_target_names):
-    smiles0, smiles1 = row["smiles0"], row["smiles1"]
-    charge0, charge1 = row["charge0"], row["charge1"]
-    natoms0, natoms1 = row["natoms0"], row["natoms1"]
-    pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
-    elements = row["elements"].split()
-    atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1)
-    charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
-    atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
-    energies = np.array(row[energy_target_names].values).astype(np.float32)[None, :]
-    name = np.array([smiles0 + "." + smiles1])
-    return {
-        "energies": energies,
-        "n_atoms": np.array([natoms0 + natoms1], dtype=np.int32),
-        "name": name,
-        "atomic_inputs": atomic_inputs,
-        "charges": charges,
-        "atomic_nums": atomic_nums,
-        "elements": elements,
-        "natoms0": natoms0,
-        "natoms1": natoms1,
-        "smiles0": smiles0,
-        "smiles1": smiles1,
-        "charge0": charge0,
-        "charge1": charge1,
-    }
-
-
-def create_subset(smiles0, smiles1):
-    subsets = []
-    for smiles in [smiles0, smiles1]:
-        found = False
-        for functional_group, smiles_set in molecule_groups.items():
-            if smiles in smiles_set:
-                subsets.append(functional_group)
-                found = True
-        if not found:
-            logger.info(f"molecule group lookup failed for {smiles}")
-    return subsets
-
-
-def convert_to_record(item):
-    return dict(
-        energies=item["energies"],
-        subset=np.array([item["subsets"]]),
-        n_atoms=np.array([item["natoms0"] + item["natoms1"]], dtype=np.int32),
-        n_atoms_first=np.array([item["natoms0"]], dtype=np.int32),
-        atomic_inputs=item["atomic_inputs"],
-        name=item["name"],
-    )
-
-
-class DES370K(BaseInteractionDataset):
-    """
-    DE Shaw Research interaction energy of over 370K
-    small molecule dimers as described in the paper:
-
-    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
-    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
-    Sci Data 8, 55 (2021).
-    https://doi.org/10.1038/s41597-021-00833-x
-    """
-
-    __name__ = "des370k_interaction"
-    __energy_unit__ = "kcal/mol"
-    __distance_unit__ = "ang"
-    __forces_unit__ = "kcal/mol/ang"
-    __energy_methods__ = [
-        InteractionMethod.MP2_CC_PVDZ,
-        InteractionMethod.MP2_CC_PVQZ,
-        InteractionMethod.MP2_CC_PVTZ,
-        InteractionMethod.MP2_CBS,
-        InteractionMethod.CCSD_T_CC_PVDZ,
-        InteractionMethod.CCSD_T_CBS,
-        InteractionMethod.CCSD_T_NN,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-    ]
-
-    __energy_type__ = [
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.ES,
-        InterEnergyType.EX,
-        InterEnergyType.EX_S2,
-        InterEnergyType.IND,
-        InterEnergyType.EX_IND,
-        InterEnergyType.DISP,
-        InterEnergyType.EX_DISP_OS,
-        InterEnergyType.EX_DISP_SS,
-        InterEnergyType.DELTA_HF,
-    ]
-
-    energy_target_names = [
-        "cc_MP2_all",
-        "qz_MP2_all",
-        "tz_MP2_all",
-        "cbs_MP2_all",
-        "cc_CCSD(T)_all",
-        "cbs_CCSD(T)_all",
-        "nn_CCSD(T)_all",
-        "sapt_all",
-        "sapt_es",
-        "sapt_ex",
-        "sapt_exs2",
-        "sapt_ind",
-        "sapt_exind",
-        "sapt_disp",
-        "sapt_exdisp_os",
-        "sapt_exdisp_ss",
-        "sapt_delta_HF",
-    ]
-
-    _filename = "DES370K.csv"
-    _name = "des370k_interaction"
-
-    @classmethod
-    def _root(cls):
-        return os.path.join(get_local_cache(), cls._name)
-
-    def read_raw_entries(cls) -> List[Dict]:
-        filepath = os.path.join(cls._root(), cls._filename)
-        logger.info(f"Reading {cls._name} interaction data from {filepath}")
-        df = pd.read_csv(filepath)
-        data = []
-        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
-            item = parse_des_df(row, cls.energy_target_names)
-            item["subset"] = create_subset(item["smiles0"], item["smiles1"])
-            item = convert_to_record(item)
-            data.append(item)
-        return data
-
-
-class DES5M(DES370K):
-    """
-    DE Shaw Research interaction energy calculations for
-    over 5M small molecule dimers as described in the paper:
-
-    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
-    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
-    Sci Data 8, 55 (2021).
-    https://doi.org/10.1038/s41597-021-00833-x
-    """
-
-    __name__ = "des5m_interaction"
-    __energy_methods__ = [
-        InteractionMethod.MP2_CC_PVQZ,
-        InteractionMethod.MP2_CC_PVTZ,
-        InteractionMethod.MP2_CBS,
-        InteractionMethod.CCSD_T_NN,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-    ]
-
-    __energy_type__ = [
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.ES,
-        InterEnergyType.EX,
-        InterEnergyType.EX_S2,
-        InterEnergyType.IND,
-        InterEnergyType.EX_IND,
-        InterEnergyType.DISP,
-        InterEnergyType.EX_DISP_OS,
-        InterEnergyType.EX_DISP_SS,
-        InterEnergyType.DELTA_HF,
-    ]
-
-    energy_target_names = [
-        "qz_MP2_all",
-        "tz_MP2_all",
-        "cbs_MP2_all",
-        "nn_CCSD(T)_all",
-        "sapt_all",
-        "sapt_es",
-        "sapt_ex",
-        "sapt_exs2",
-        "sapt_ind",
-        "sapt_exind",
-        "sapt_disp",
-        "sapt_exdisp_os",
-        "sapt_exdisp_ss",
-        "sapt_delta_HF",
-    ]
-
-    _filename = "DES5M.csv"
-    _name = "des5m_interaction"
-
-    __energy_unit__ = "kcal/mol"
-    __distance_unit__ = "ang"
-    __forces_unit__ = "kcal/mol/ang"

From da4fece39cb29df0a8c4ef6636e929f8a44dee49 Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Sat, 6 Apr 2024 13:46:48 -0400
Subject: [PATCH 18/27] DES inheritance

---
 openqdc/datasets/interaction/__init__.py |   3 +-
 openqdc/datasets/interaction/des.py      | 133 ++++++++++++-----------
 openqdc/datasets/interaction/dess66.py   | 131 ----------------------
 3 files changed, 69 insertions(+), 198 deletions(-)
 delete mode 100644 openqdc/datasets/interaction/dess66.py

diff --git a/openqdc/datasets/interaction/__init__.py b/openqdc/datasets/interaction/__init__.py
index bf8c834..814a367 100644
--- a/openqdc/datasets/interaction/__init__.py
+++ b/openqdc/datasets/interaction/__init__.py
@@ -1,6 +1,5 @@
 from .base import BaseInteractionDataset  # noqa
-from .des import DES5M, DES370K
-from .dess66 import DESS66, DESS66x8
+from .des import DES5M, DES370K, DESS66, DESS66x8
 from .L7 import L7
 from .metcalf import Metcalf
 from .splinter import Splinter
diff --git a/openqdc/datasets/interaction/des.py b/openqdc/datasets/interaction/des.py
index 710fa39..6ca1bda 100644
--- a/openqdc/datasets/interaction/des.py
+++ b/openqdc/datasets/interaction/des.py
@@ -1,4 +1,5 @@
 import os
+from abc import ABC, abstractmethod
 from typing import Dict, List
 
 import numpy as np
@@ -9,7 +10,6 @@
 from openqdc.datasets.interaction.base import BaseInteractionDataset
 from openqdc.methods import InteractionMethod, InterEnergyType
 from openqdc.utils.constants import ATOM_TABLE
-from openqdc.utils.io import get_local_cache
 from openqdc.utils.molecule import molecule_groups
 
 
@@ -65,7 +65,13 @@ def convert_to_record(item):
     )
 
 
-class DES370K(BaseInteractionDataset):
+class IDES(ABC):
+    @abstractmethod
+    def _create_subsets(self, **kwargs):
+        raise NotImplementedError
+
+
+class DES370K(BaseInteractionDataset, IDES):
     """
     DE Shaw Research interaction energy of over 370K
     small molecule dimers as described in the paper:
@@ -77,6 +83,7 @@ class DES370K(BaseInteractionDataset):
     """
 
     __name__ = "des370k_interaction"
+    __filename__ = "DES370K.csv"
     __energy_unit__ = "kcal/mol"
     __distance_unit__ = "ang"
     __forces_unit__ = "kcal/mol/ang"
@@ -140,21 +147,21 @@ class DES370K(BaseInteractionDataset):
         "sapt_delta_HF",
     ]
 
-    _filename = "DES370K.csv"
-    _name = "des370k_interaction"
+    @property
+    def csv_path(self):
+        return os.path.join(self.root, self.__filename__)
 
-    @classmethod
-    def _root(cls):
-        return os.path.join(get_local_cache(), cls._name)
+    def _create_subsets(self, **kwargs):
+        return create_subset(kwargs["smiles0"], kwargs["smiles1"])
 
-    def read_raw_entries(cls) -> List[Dict]:
-        filepath = os.path.join(cls._root(), cls._filename)
-        logger.info(f"Reading {cls._name} interaction data from {filepath}")
+    def read_raw_entries(self) -> List[Dict]:
+        filepath = self.csv_path
+        logger.info(f"Reading {self.__name__} interaction data from {filepath}")
         df = pd.read_csv(filepath)
         data = []
         for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
-            item = parse_des_df(row, cls.energy_target_names)
-            item["subset"] = create_subset(item["smiles0"], item["smiles1"])
+            item = parse_des_df(row, self.energy_target_names)
+            item["subset"] = self._create_subset(**item)
             item = convert_to_record(item)
             data.append(item)
         return data
@@ -172,60 +179,56 @@ class DES5M(DES370K):
     """
 
     __name__ = "des5m_interaction"
-    __energy_methods__ = [
-        InteractionMethod.MP2_CC_PVQZ,
-        InteractionMethod.MP2_CC_PVTZ,
-        InteractionMethod.MP2_CBS,
-        InteractionMethod.CCSD_T_NN,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-    ]
+    __filename__ = "DES5M.csv"
 
-    __energy_type__ = [
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.ES,
-        InterEnergyType.EX,
-        InterEnergyType.EX_S2,
-        InterEnergyType.IND,
-        InterEnergyType.EX_IND,
-        InterEnergyType.DISP,
-        InterEnergyType.EX_DISP_OS,
-        InterEnergyType.EX_DISP_SS,
-        InterEnergyType.DELTA_HF,
-    ]
 
-    energy_target_names = [
-        "qz_MP2_all",
-        "tz_MP2_all",
-        "cbs_MP2_all",
-        "nn_CCSD(T)_all",
-        "sapt_all",
-        "sapt_es",
-        "sapt_ex",
-        "sapt_exs2",
-        "sapt_ind",
-        "sapt_exind",
-        "sapt_disp",
-        "sapt_exdisp_os",
-        "sapt_exdisp_ss",
-        "sapt_delta_HF",
-    ]
+class DESS66(DES370K):
+    """
+    DE Shaw Research interaction energy
+    estimates of all 66 conformers from
+    the original S66 dataset as described
+    in the paper:
+
+    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
+    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
+    Sci Data 8, 55 (2021).
+    https://doi.org/10.1038/s41597-021-00833-x
 
-    _filename = "DES5M.csv"
-    _name = "des5m_interaction"
+    Data was downloaded from Zenodo:
+    https://zenodo.org/records/5676284
+    """
 
-    __energy_unit__ = "kcal/mol"
-    __distance_unit__ = "ang"
-    __forces_unit__ = "kcal/mol/ang"
+    __name__ = "des_s66"
+    __filename__ = "DESS66.csv"
+
+    # def read_raw_entries(self) -> List[Dict]:
+    #    filepath = self.csv_path
+    #    logger.info(f"Reading DESS66 interaction data from {filepath}")
+    #    df = pd.read_csv(filepath)
+    #    data = []
+    #    for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
+    #        item = parse_des_df(row)
+    #        item["subset"] = row["system_name"]
+    #        data.append(convert_to_record(item))
+    #    return data
+
+
+class DESS66x8(DESS66):
+    """
+    DE Shaw Research interaction energy
+    estimates of all 528 conformers from
+    the original S66x8 dataset as described
+    in the paper:
+
+    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
+    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
+    Sci Data 8, 55 (2021).
+    https://doi.org/10.1038/s41597-021-00833-x
+
+    Data was downloaded from Zenodo:
+
+    https://zenodo.org/records/5676284
+    """
+
+    __name__ = "des_s66x8"
+    __filename__ = "DESS66x8.csv"
diff --git a/openqdc/datasets/interaction/dess66.py b/openqdc/datasets/interaction/dess66.py
deleted file mode 100644
index 45bf6bd..0000000
--- a/openqdc/datasets/interaction/dess66.py
+++ /dev/null
@@ -1,131 +0,0 @@
-import os
-from typing import Dict, List
-
-import pandas as pd
-from loguru import logger
-from tqdm import tqdm
-
-from openqdc.datasets.interaction.base import BaseInteractionDataset
-from openqdc.datasets.interaction.des370k import convert_to_record, parse_des_df
-from openqdc.methods import InteractionMethod, InterEnergyType
-
-CSV_NAME = {
-    "des_s66": "DESS66.csv",
-    "des_s66x8": "DESS66x8.csv",
-}
-
-
-class DESS66(BaseInteractionDataset):
-    """
-    DE Shaw Research interaction energy
-    estimates of all 66 conformers from
-    the original S66 dataset as described
-    in the paper:
-
-    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
-    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
-    Sci Data 8, 55 (2021).
-    https://doi.org/10.1038/s41597-021-00833-x
-
-    Data was downloaded from Zenodo:
-    https://zenodo.org/records/5676284
-    """
-
-    __name__ = "des_s66"
-    __energy_unit__ = "kcal/mol"
-    __distance_unit__ = "ang"
-    __forces_unit__ = "kcal/mol/ang"
-    __energy_methods__ = [
-        InteractionMethod.MP2_CC_PVDZ,
-        InteractionMethod.MP2_CC_PVQZ,
-        InteractionMethod.MP2_CC_PVTZ,
-        InteractionMethod.MP2_CBS,
-        InteractionMethod.CCSD_T_CC_PVDZ,
-        InteractionMethod.CCSD_T_CBS,
-        InteractionMethod.CCSD_T_NN,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-        InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
-    ]
-
-    __energy_type__ = [
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.TOTAL,
-        InterEnergyType.ES,
-        InterEnergyType.EX,
-        InterEnergyType.EX_S2,
-        InterEnergyType.IND,
-        InterEnergyType.EX_IND,
-        InterEnergyType.DISP,
-        InterEnergyType.EX_DISP_OS,
-        InterEnergyType.EX_DISP_SS,
-        InterEnergyType.DELTA_HF,
-    ]
-
-    energy_target_names = [
-        "cc_MP2_all",
-        "qz_MP2_all",
-        "tz_MP2_all",
-        "cbs_MP2_all",
-        "cc_CCSD(T)_all",
-        "cbs_CCSD(T)_all",
-        "nn_CCSD(T)_all",
-        "sapt_all",
-        "sapt_es",
-        "sapt_ex",
-        "sapt_exs2",
-        "sapt_ind",
-        "sapt_exind",
-        "sapt_disp",
-        "sapt_exdisp_os",
-        "sapt_exdisp_ss",
-        "sapt_delta_HF",
-    ]
-
-    @property
-    def csv_path(self):
-        return os.path.join(self.root, CSV_NAME[self.__name__])
-
-    def read_raw_entries(self) -> List[Dict]:
-        filepath = self.csv_path
-        logger.info(f"Reading DESS66 interaction data from {filepath}")
-        df = pd.read_csv(filepath)
-        data = []
-        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
-            item = parse_des_df(row)
-            item["subset"] = row["system_name"]
-            data.append(convert_to_record(item))
-        return data
-
-
-class DESS66x8(DESS66):
-    """
-    DE Shaw Research interaction energy
-    estimates of all 528 conformers from
-    the original S66x8 dataset as described
-    in the paper:
-
-    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
-    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
-    Sci Data 8, 55 (2021).
-    https://doi.org/10.1038/s41597-021-00833-x
-
-    Data was downloaded from Zenodo:
-
-    https://zenodo.org/records/5676284
-    """
-
-    __name__ = "des_s66x8"

From 71ff741a4fadabfc61e36f5d1cb534dd5d041f30 Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Sat, 6 Apr 2024 15:56:39 -0400
Subject: [PATCH 19/27] Removed des and improved des naming

---
 openqdc/raws/config_factory.py | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/openqdc/raws/config_factory.py b/openqdc/raws/config_factory.py
index 9f8c6c1..26e0f2c 100644
--- a/openqdc/raws/config_factory.py
+++ b/openqdc/raws/config_factory.py
@@ -269,22 +269,14 @@ class DataConfigFactory:
         },
     )
 
-    dess = dict(
-        dataset_name="dess5m",
-        links={
-            "DESS5M.zip": "https://zenodo.org/record/5706002/files/DESS5M.zip",
-            "DESS370.zip": "https://zenodo.org/record/5676266/files/DES370K.zip",
-        },
-    )
-
-    des370k_interaction = dict(
+    des370k = dict(
         dataset_name="des370k_interaction",
         links={
             "DES370K.zip": "https://zenodo.org/record/5676266/files/DES370K.zip",
         },
     )
 
-    des5m_interaction = dict(
+    des5m = dict(
         dataset_name="des5m_interaction",
         links={
             "DES5M.zip": "https://zenodo.org/records/5706002/files/DESS5M.zip?download=1",
@@ -349,12 +341,12 @@ class DataConfigFactory:
         links={"Transition1x.h5": "https://figshare.com/ndownloader/files/36035789"},
     )
 
-    des_s66 = dict(
+    dess66 = dict(
         dataset_name="des_s66",
         links={"DESS66.zip": "https://zenodo.org/records/5676284/files/DESS66.zip?download=1"},
     )
 
-    des_s66x8 = dict(
+    dess66x8 = dict(
         dataset_name="des_s66x8",
         links={"DESS66x8.zip": "https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1"},
     )

From f6e12e13a6166bab47377cdc193866b0ee8c877d Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Sat, 6 Apr 2024 15:57:18 -0400
Subject: [PATCH 20/27] DES fixes

---
 openqdc/datasets/interaction/des.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/openqdc/datasets/interaction/des.py b/openqdc/datasets/interaction/des.py
index 6ca1bda..9dbb5d8 100644
--- a/openqdc/datasets/interaction/des.py
+++ b/openqdc/datasets/interaction/des.py
@@ -57,7 +57,7 @@ def create_subset(smiles0, smiles1):
 def convert_to_record(item):
     return dict(
         energies=item["energies"],
-        subset=np.array([item["subsets"]]),
+        subset=np.array([item["subset"]]),
         n_atoms=np.array([item["natoms0"] + item["natoms1"]], dtype=np.int32),
         n_atoms_first=np.array([item["natoms0"]], dtype=np.int32),
         atomic_inputs=item["atomic_inputs"],
@@ -161,7 +161,7 @@ def read_raw_entries(self) -> List[Dict]:
         data = []
         for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
             item = parse_des_df(row, self.energy_target_names)
-            item["subset"] = self._create_subset(**item)
+            item["subset"] = self._create_subsets(**item)
             item = convert_to_record(item)
             data.append(item)
         return data

From 3328a65593547765097936a3cad9383fb0770e4a Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Sat, 6 Apr 2024 15:58:41 -0400
Subject: [PATCH 21/27] Removed comments

---
 openqdc/datasets/interaction/des.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/openqdc/datasets/interaction/des.py b/openqdc/datasets/interaction/des.py
index 9dbb5d8..7c542e2 100644
--- a/openqdc/datasets/interaction/des.py
+++ b/openqdc/datasets/interaction/des.py
@@ -201,17 +201,6 @@ class DESS66(DES370K):
     __name__ = "des_s66"
     __filename__ = "DESS66.csv"
 
-    # def read_raw_entries(self) -> List[Dict]:
-    #    filepath = self.csv_path
-    #    logger.info(f"Reading DESS66 interaction data from {filepath}")
-    #    df = pd.read_csv(filepath)
-    #    data = []
-    #    for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
-    #        item = parse_des_df(row)
-    #        item["subset"] = row["system_name"]
-    #        data.append(convert_to_record(item))
-    #    return data
-
 
 class DESS66x8(DESS66):
     """

From 8b28d59e7f02bca523230203d327073fcd682720 Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Sat, 6 Apr 2024 16:36:06 -0400
Subject: [PATCH 22/27] X40 and L7

---
 openqdc/datasets/interaction/L7.py  | 88 +++++++++++++++++++----------
 openqdc/datasets/interaction/X40.py | 58 ++-----------------
 2 files changed, 62 insertions(+), 84 deletions(-)

diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py
index d7c7361..a7434c2 100644
--- a/openqdc/datasets/interaction/L7.py
+++ b/openqdc/datasets/interaction/L7.py
@@ -1,6 +1,7 @@
 import os
 from dataclasses import dataclass
 from functools import partial
+from os.path import join as p_join
 from typing import Dict, List, Optional
 
 import numpy as np
@@ -58,6 +59,51 @@ def constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode, cls):
     return loader
 
 
+def read_xyz_file(xyz_path):
+    with open(xyz_path, "r") as xyz_file:  # avoid not closing the file
+        lines = list(map(lambda x: x.strip().split(), xyz_file.readlines()))
+        lines.pop(1)
+        n_atoms = np.array([int(lines[0][0])], dtype=np.int32)
+        pos = np.array(lines[1:])[:, 1:].astype(np.float32)
+        elems = np.array(lines[1:])[:, 0]
+        atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1)
+    return n_atoms, pos, atomic_nums
+
+
+def convert_to_record(item):
+    return dict(
+        energies=item["energies"],
+        subset=np.array([item["subset"]]),
+        n_atoms=np.array([item["natoms0"] + item["natoms1"]], dtype=np.int32),
+        n_atoms_first=np.array([item["natoms0"]], dtype=np.int32),
+        atomic_inputs=item["atomic_inputs"],
+        name=item["name"],
+    )
+
+
+def build_item(item, charge0, charge1, idx, data_dict, root, filename):
+    datum = {
+        "energies": [],
+    }
+    datum["name"] = np.array([item.shortname])
+    datum["energies"].append(item.reference_value)
+    datum["subset"] = np.array([item.group])
+    datum["energies"] += [float(val[idx]) for val in list(data_dict.alternative_reference.values())]
+    datum["energies"] = np.array([datum["energies"]], dtype=np.float32)
+    n_atoms, pos, atomic_nums = read_xyz_file(p_join(root, f"{filename}.xyz"))
+    datum["n_atoms"] = n_atoms
+    datum["pos"] = pos
+    datum["atomic_nums"] = atomic_nums
+    datum["n_atoms_first"] = np.array([int(item.setup["molecule_a"]["selection"].split("-")[1])], dtype=np.int32)
+    datum["natoms0"] = datum["n_atoms_first"][0]
+    datum["natoms1"] = datum["n_atoms"][0] - datum["natoms0"]
+    datum["charges"] = np.expand_dims(np.array([charge0] * datum["natoms0"] + [charge1] * datum["natoms1"]), axis=1)
+    datum["atomic_inputs"] = np.concatenate(
+        (datum["atomic_nums"], datum["charges"], datum["pos"]), axis=-1, dtype=np.float32
+    )
+    return datum
+
+
 class L7(BaseInteractionDataset):
     """
     The L7 interaction energy dataset as described in:
@@ -90,43 +136,25 @@ class L7(BaseInteractionDataset):
 
     energy_target_names = []
 
+    @property
+    def yaml_path(self):
+        return os.path.join(self.root, self.__name__ + ".yaml")
+
     def read_raw_entries(self) -> List[Dict]:
-        yaml_fpath = os.path.join(self.root, "l7.yaml")
-        logger.info(f"Reading L7 interaction data from {self.root}")
+        yaml_fpath = self.yaml_path
+        logger.info(f"Reading {self.__name__} interaction data from {self.root}")
         yaml_file = open(yaml_fpath, "r")
         data = []
         data_dict = yaml.load(yaml_file, Loader=get_loader())
+
         charge0 = int(data_dict.description.global_setup["molecule_a"]["charge"])
         charge1 = int(data_dict.description.global_setup["molecule_b"]["charge"])
 
         for idx, item in enumerate(data_dict.items):
-            energies = []
-            name = np.array([item.shortname])
-            fname = item.geometry.split(":")[1]
-            energies.append(item.reference_value)
-            xyz_file = open(os.path.join(self.root, f"{fname}.xyz"), "r")
-            lines = list(map(lambda x: x.strip().split(), xyz_file.readlines()))
-            lines.pop(1)
-            n_atoms = np.array([int(lines[0][0])], dtype=np.int32)
-            n_atoms_first = np.array([int(item.setup["molecule_a"]["selection"].split("-")[1])], dtype=np.int32)
-            subset = np.array([item.group])
-            energies += [float(val[idx]) for val in list(data_dict.alternative_reference.values())]
-            energies = np.array([energies], dtype=np.float32)
-            pos = np.array(lines[1:])[:, 1:].astype(np.float32)
-            elems = np.array(lines[1:])[:, 0]
-            atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1)
-            natoms0 = n_atoms_first[0]
-            natoms1 = n_atoms[0] - natoms0
-            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
-            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
-
-            item = dict(
-                energies=energies,
-                subset=subset,
-                n_atoms=n_atoms,
-                n_atoms_first=n_atoms_first,
-                atomic_inputs=atomic_inputs,
-                name=name,
-            )
+            tmp_item = build_item(item, charge0, charge1, idx, data_dict, self.root, self._process_name(item))
+            item = convert_to_record(tmp_item)
             data.append(item)
         return data
+
+    def _process_name(self, item):
+        return item.geometry.split(":")[1]
diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py
index dfb43d0..a42d36c 100644
--- a/openqdc/datasets/interaction/X40.py
+++ b/openqdc/datasets/interaction/X40.py
@@ -1,17 +1,8 @@
-import os
-from typing import Dict, List
-
-import numpy as np
-import yaml
-from loguru import logger
-
-from openqdc.datasets.interaction.base import BaseInteractionDataset
-from openqdc.datasets.interaction.L7 import get_loader
+from openqdc.datasets.interaction.L7 import L7
 from openqdc.methods import InteractionMethod, InterEnergyType
-from openqdc.utils.constants import ATOM_TABLE
 
 
-class X40(BaseInteractionDataset):
+class X40(L7):
     """
     X40 interaction dataset of 40 dimer pairs as
     introduced in the following paper:
@@ -26,9 +17,6 @@ class X40(BaseInteractionDataset):
     """
 
     __name__ = "x40"
-    __energy_unit__ = "hartree"
-    __distance_unit__ = "ang"
-    __forces_unit__ = "hartree/ang"
     __energy_methods__ = [
         InteractionMethod.CCSD_T_CBS,  # "CCSD(T)/CBS",
         InteractionMethod.MP2_CBS,  # "MP2/CBS",
@@ -42,43 +30,5 @@ class X40(BaseInteractionDataset):
 
     energy_target_names = []
 
-    def read_raw_entries(self) -> List[Dict]:
-        yaml_fpath = os.path.join(self.root, "x40.yaml")
-        logger.info(f"Reading X40 interaction data from {self.root}")
-        yaml_file = open(yaml_fpath, "r")
-        data = []
-        data_dict = yaml.load(yaml_file, Loader=get_loader())
-        charge0 = int(data_dict.description.global_setup["molecule_a"]["charge"])
-        charge1 = int(data_dict.description.global_setup["molecule_b"]["charge"])
-
-        for idx, item in enumerate(data_dict.items):
-            energies = []
-            name = np.array([item.shortname])
-            energies.append(float(item.reference_value))
-            xyz_file = open(os.path.join(self.root, f"{item.shortname}.xyz"), "r")
-            lines = list(map(lambda x: x.strip().split(), xyz_file.readlines()))
-            setup = lines.pop(1)
-            n_atoms = np.array([int(lines[0][0])], dtype=np.int32)
-            n_atoms_first = setup[0].split("-")[1]
-            n_atoms_first = np.array([int(n_atoms_first)], dtype=np.int32)
-            subset = np.array([item.group])
-            energies += [float(val[idx]) for val in list(data_dict.alternative_reference.values())]
-            energies = np.array([energies], dtype=np.float32)
-            pos = np.array(lines[1:])[:, 1:].astype(np.float32)
-            elems = np.array(lines[1:])[:, 0]
-            atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1)
-            natoms0 = n_atoms_first[0]
-            natoms1 = n_atoms[0] - natoms0
-            charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
-            atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
-
-            item = dict(
-                energies=energies,
-                subset=subset,
-                n_atoms=n_atoms,
-                n_atoms_first=n_atoms_first,
-                atomic_inputs=atomic_inputs,
-                name=name,
-            )
-            data.append(item)
-        return data
+    def _process_name(self, item):
+        return item.shortname

From 8595fd888df1cde542b637902e51b19e7503c3d5 Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Sat, 6 Apr 2024 16:40:04 -0400
Subject: [PATCH 23/27] Safe opening

---
 openqdc/datasets/interaction/L7.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py
index a7434c2..fc354e5 100644
--- a/openqdc/datasets/interaction/L7.py
+++ b/openqdc/datasets/interaction/L7.py
@@ -143,10 +143,9 @@ def yaml_path(self):
     def read_raw_entries(self) -> List[Dict]:
         yaml_fpath = self.yaml_path
         logger.info(f"Reading {self.__name__} interaction data from {self.root}")
-        yaml_file = open(yaml_fpath, "r")
+        with open(yaml_fpath, "r") as yaml_file:
+            data_dict = yaml.load(yaml_file, Loader=get_loader())
         data = []
-        data_dict = yaml.load(yaml_file, Loader=get_loader())
-
         charge0 = int(data_dict.description.global_setup["molecule_a"]["charge"])
         charge1 = int(data_dict.description.global_setup["molecule_b"]["charge"])
 

From ca1b4aff6bd60fe72f36bdcdf6b7a7bd17f83a90 Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Sat, 6 Apr 2024 16:43:44 -0400
Subject: [PATCH 24/27] Moved X40 in L7 and removed x40.py

---
 openqdc/datasets/interaction/X40.py           | 34 -------------------
 openqdc/datasets/interaction/__init__.py      |  3 +-
 .../datasets/interaction/{L7.py => l7x40.py}  | 32 +++++++++++++++++
 3 files changed, 33 insertions(+), 36 deletions(-)
 delete mode 100644 openqdc/datasets/interaction/X40.py
 rename openqdc/datasets/interaction/{L7.py => l7x40.py} (85%)

diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py
deleted file mode 100644
index a42d36c..0000000
--- a/openqdc/datasets/interaction/X40.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from openqdc.datasets.interaction.L7 import L7
-from openqdc.methods import InteractionMethod, InterEnergyType
-
-
-class X40(L7):
-    """
-    X40 interaction dataset of 40 dimer pairs as
-    introduced in the following paper:
-
-    Benchmark Calculations of Noncovalent Interactions of Halogenated Molecules
-    Jan Řezáč, Kevin E. Riley, and Pavel Hobza
-    Journal of Chemical Theory and Computation 2012 8 (11), 4285-4292
-    DOI: 10.1021/ct300647k
-
-    Dataset retrieved and processed from:
-    http://cuby4.molecular.cz/dataset_x40.html
-    """
-
-    __name__ = "x40"
-    __energy_methods__ = [
-        InteractionMethod.CCSD_T_CBS,  # "CCSD(T)/CBS",
-        InteractionMethod.MP2_CBS,  # "MP2/CBS",
-        InteractionMethod.DCCSDT_HA_DZ,  # "dCCSD(T)/haDZ",
-        InteractionMethod.DCCSDT_HA_TZ,  # "dCCSD(T)/haTZ",
-        InteractionMethod.MP2_5_CBS_ADZ,  # "MP2.5/CBS(aDZ)",
-    ]
-    __energy_type__ = [
-        InterEnergyType.TOTAL,
-    ] * 5
-
-    energy_target_names = []
-
-    def _process_name(self, item):
-        return item.shortname
diff --git a/openqdc/datasets/interaction/__init__.py b/openqdc/datasets/interaction/__init__.py
index 814a367..eca842d 100644
--- a/openqdc/datasets/interaction/__init__.py
+++ b/openqdc/datasets/interaction/__init__.py
@@ -1,9 +1,8 @@
 from .base import BaseInteractionDataset  # noqa
 from .des import DES5M, DES370K, DESS66, DESS66x8
-from .L7 import L7
+from .l7x40 import L7, X40
 from .metcalf import Metcalf
 from .splinter import Splinter
-from .X40 import X40
 
 AVAILABLE_INTERACTION_DATASETS = {
     "des5m": DES5M,
diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/l7x40.py
similarity index 85%
rename from openqdc/datasets/interaction/L7.py
rename to openqdc/datasets/interaction/l7x40.py
index fc354e5..12b5316 100644
--- a/openqdc/datasets/interaction/L7.py
+++ b/openqdc/datasets/interaction/l7x40.py
@@ -157,3 +157,35 @@ def read_raw_entries(self) -> List[Dict]:
 
     def _process_name(self, item):
         return item.geometry.split(":")[1]
+
+
+class X40(L7):
+    """
+    X40 interaction dataset of 40 dimer pairs as
+    introduced in the following paper:
+
+    Benchmark Calculations of Noncovalent Interactions of Halogenated Molecules
+    Jan Řezáč, Kevin E. Riley, and Pavel Hobza
+    Journal of Chemical Theory and Computation 2012 8 (11), 4285-4292
+    DOI: 10.1021/ct300647k
+
+    Dataset retrieved and processed from:
+    http://cuby4.molecular.cz/dataset_x40.html
+    """
+
+    __name__ = "x40"
+    __energy_methods__ = [
+        InteractionMethod.CCSD_T_CBS,  # "CCSD(T)/CBS",
+        InteractionMethod.MP2_CBS,  # "MP2/CBS",
+        InteractionMethod.DCCSDT_HA_DZ,  # "dCCSD(T)/haDZ",
+        InteractionMethod.DCCSDT_HA_TZ,  # "dCCSD(T)/haTZ",
+        InteractionMethod.MP2_5_CBS_ADZ,  # "MP2.5/CBS(aDZ)",
+    ]
+    __energy_type__ = [
+        InterEnergyType.TOTAL,
+    ] * 5
+
+    energy_target_names = []
+
+    def _process_name(self, item):
+        return item.shortname

From 4bec82de940e11f35516ca6462c1bb570ca857d5 Mon Sep 17 00:00:00 2001
From: FNTwin <cris.gabellini@gmail.com>
Date: Sat, 6 Apr 2024 20:47:24 -0400
Subject: [PATCH 25/27] Moved Yaml utils to _utils.py, L7 + X40 interface

---
 .../interaction/{l7x40.py => _utils.py}       | 67 ++-----------------
 openqdc/datasets/interaction/l7.py            | 32 +++++++++
 openqdc/datasets/interaction/x40.py           | 29 ++++++++
 3 files changed, 68 insertions(+), 60 deletions(-)
 rename openqdc/datasets/interaction/{l7x40.py => _utils.py} (67%)
 create mode 100644 openqdc/datasets/interaction/l7.py
 create mode 100644 openqdc/datasets/interaction/x40.py

diff --git a/openqdc/datasets/interaction/l7x40.py b/openqdc/datasets/interaction/_utils.py
similarity index 67%
rename from openqdc/datasets/interaction/l7x40.py
rename to openqdc/datasets/interaction/_utils.py
index 12b5316..3df948e 100644
--- a/openqdc/datasets/interaction/l7x40.py
+++ b/openqdc/datasets/interaction/_utils.py
@@ -1,4 +1,5 @@
 import os
+from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from functools import partial
 from os.path import join as p_join
@@ -9,7 +10,7 @@
 from loguru import logger
 
 from openqdc.datasets.interaction.base import BaseInteractionDataset
-from openqdc.methods import InteractionMethod, InterEnergyType
+from openqdc.methods import InterEnergyType
 from openqdc.utils.constants import ATOM_TABLE
 
 
@@ -104,37 +105,14 @@ def build_item(item, charge0, charge1, idx, data_dict, root, filename):
     return datum
 
 
-class L7(BaseInteractionDataset):
-    """
-    The L7 interaction energy dataset as described in:
-
-    Accuracy of Quantum Chemical Methods for Large Noncovalent Complexes
-    Robert Sedlak, Tomasz Janowski, Michal Pitoňák, Jan Řezáč, Peter Pulay, and Pavel Hobza
-    Journal of Chemical Theory and Computation 2013 9 (8), 3364-3374
-    DOI: 10.1021/ct400036b
-
-    Data was downloaded and extracted from:
-    http://cuby4.molecular.cz/dataset_l7.html
-    """
-
+class YamlDataset(BaseInteractionDataset, ABC):
     __name__ = "l7"
     __energy_unit__ = "kcal/mol"
     __distance_unit__ = "ang"
     __forces_unit__ = "kcal/mol/ang"
-    __energy_methods__ = [
-        InteractionMethod.QCISDT_CBS,  # "QCISD(T)/CBS",
-        InteractionMethod.DLPNO_CCSDT,  # "DLPNO-CCSD(T)",
-        InteractionMethod.MP2_CBS,  # "MP2/CBS",
-        InteractionMethod.MP2C_CBS,  # "MP2C/CBS",
-        InteractionMethod.FIXED,  # "fixed", TODO: we should remove this level of theory because unless we have a pro
-        InteractionMethod.DLPNO_CCSDT0,  # "DLPNO-CCSD(T0)",
-        InteractionMethod.LNO_CCSDT,  # "LNO-CCSD(T)",
-        InteractionMethod.FN_DMC,  # "FN-DMC",
-    ]
-
-    __energy_type__ = [InterEnergyType.TOTAL] * 8
-
     energy_target_names = []
+    __energy_methods__ = []
+    __energy_type__ = [InterEnergyType.TOTAL] * len(__energy_methods__)
 
     @property
     def yaml_path(self):
@@ -155,37 +133,6 @@ def read_raw_entries(self) -> List[Dict]:
             data.append(item)
         return data
 
+    @abstractmethod
     def _process_name(self, item):
-        return item.geometry.split(":")[1]
-
-
-class X40(L7):
-    """
-    X40 interaction dataset of 40 dimer pairs as
-    introduced in the following paper:
-
-    Benchmark Calculations of Noncovalent Interactions of Halogenated Molecules
-    Jan Řezáč, Kevin E. Riley, and Pavel Hobza
-    Journal of Chemical Theory and Computation 2012 8 (11), 4285-4292
-    DOI: 10.1021/ct300647k
-
-    Dataset retrieved and processed from:
-    http://cuby4.molecular.cz/dataset_x40.html
-    """
-
-    __name__ = "x40"
-    __energy_methods__ = [
-        InteractionMethod.CCSD_T_CBS,  # "CCSD(T)/CBS",
-        InteractionMethod.MP2_CBS,  # "MP2/CBS",
-        InteractionMethod.DCCSDT_HA_DZ,  # "dCCSD(T)/haDZ",
-        InteractionMethod.DCCSDT_HA_TZ,  # "dCCSD(T)/haTZ",
-        InteractionMethod.MP2_5_CBS_ADZ,  # "MP2.5/CBS(aDZ)",
-    ]
-    __energy_type__ = [
-        InterEnergyType.TOTAL,
-    ] * 5
-
-    energy_target_names = []
-
-    def _process_name(self, item):
-        return item.shortname
+        raise NotImplementedError
diff --git a/openqdc/datasets/interaction/l7.py b/openqdc/datasets/interaction/l7.py
new file mode 100644
index 0000000..22e3141
--- /dev/null
+++ b/openqdc/datasets/interaction/l7.py
@@ -0,0 +1,37 @@
+from openqdc.methods import InteractionMethod, InterEnergyType
+
+from ._utils import YamlDataset
+
+
+class L7(YamlDataset):
+    """
+    The L7 interaction energy dataset as described in:
+
+    Accuracy of Quantum Chemical Methods for Large Noncovalent Complexes
+    Robert Sedlak, Tomasz Janowski, Michal Pitoňák, Jan Řezáč, Peter Pulay, and Pavel Hobza
+    Journal of Chemical Theory and Computation 2013 9 (8), 3364-3374
+    DOI: 10.1021/ct400036b
+
+    Data was downloaded and extracted from:
+    http://cuby4.molecular.cz/dataset_l7.html
+    """
+
+    __name__ = "l7"
+    __energy_methods__ = [
+        InteractionMethod.QCISDT_CBS,  # "QCISD(T)/CBS",
+        InteractionMethod.DLPNO_CCSDT,  # "DLPNO-CCSD(T)",
+        InteractionMethod.MP2_CBS,  # "MP2/CBS",
+        InteractionMethod.MP2C_CBS,  # "MP2C/CBS",
+        InteractionMethod.FIXED,  # "fixed", TODO: we should remove this level of theory because unless we have a pro
+        InteractionMethod.DLPNO_CCSDT0,  # "DLPNO-CCSD(T0)",
+        InteractionMethod.LNO_CCSDT,  # "LNO-CCSD(T)",
+        InteractionMethod.FN_DMC,  # "FN-DMC",
+    ]
+
+    # YamlDataset evaluates __energy_type__ against its own empty method list when the
+    # base class body runs, so the inherited value is []; redeclare it for these methods.
+    __energy_type__ = [InterEnergyType.TOTAL] * len(__energy_methods__)
+
+    def _process_name(self, item):
+        """Return the second ":"-separated field of ``item.geometry`` as the entry name."""
+        return item.geometry.split(":")[1]
diff --git a/openqdc/datasets/interaction/x40.py b/openqdc/datasets/interaction/x40.py
new file mode 100644
index 0000000..1b5148c
--- /dev/null
+++ b/openqdc/datasets/interaction/x40.py
@@ -0,0 +1,34 @@
+from openqdc.datasets.interaction._utils import YamlDataset
+from openqdc.methods import InteractionMethod, InterEnergyType
+
+
+class X40(YamlDataset):
+    """
+    X40 interaction dataset of 40 dimer pairs as
+    introduced in the following paper:
+
+    Benchmark Calculations of Noncovalent Interactions of Halogenated Molecules
+    Jan Řezáč, Kevin E. Riley, and Pavel Hobza
+    Journal of Chemical Theory and Computation 2012 8 (11), 4285-4292
+    DOI: 10.1021/ct300647k
+
+    Dataset retrieved and processed from:
+    http://cuby4.molecular.cz/dataset_x40.html
+    """
+
+    __name__ = "x40"
+    __energy_methods__ = [
+        InteractionMethod.CCSD_T_CBS,  # "CCSD(T)/CBS",
+        InteractionMethod.MP2_CBS,  # "MP2/CBS",
+        InteractionMethod.DCCSDT_HA_DZ,  # "dCCSD(T)/haDZ",
+        InteractionMethod.DCCSDT_HA_TZ,  # "dCCSD(T)/haTZ",
+        InteractionMethod.MP2_5_CBS_ADZ,  # "MP2.5/CBS(aDZ)",
+    ]
+
+    # YamlDataset evaluates __energy_type__ against its own empty method list when the
+    # base class body runs, so the inherited value is []; redeclare it for these methods.
+    __energy_type__ = [InterEnergyType.TOTAL] * len(__energy_methods__)
+
+    def _process_name(self, item):
+        """Return ``item.shortname`` as the entry name."""
+        return item.shortname

From 3303f95b9e6e5b4a10edc649dc7099cdabd6111b Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com.com>
Date: Fri, 12 Apr 2024 01:04:37 +0000
Subject: [PATCH 26/27] better convert function and n_atoms_first to n_atoms_ptr

---
 openqdc/datasets/base.py                 | 5 +++++
 openqdc/datasets/interaction/_utils.py   | 6 +++---
 openqdc/datasets/interaction/base.py     | 8 ++++----
 openqdc/datasets/interaction/des.py      | 2 +-
 openqdc/datasets/interaction/dummy.py    | 4 ++--
 openqdc/datasets/interaction/metcalf.py  | 2 +-
 openqdc/datasets/interaction/splinter.py | 6 +++---
 openqdc/datasets/potential/dummy.py      | 4 ++--
 8 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py
index b5bc43b..fabdfc1 100644
--- a/openqdc/datasets/base.py
+++ b/openqdc/datasets/base.py
@@ -49,11 +49,15 @@
 
 @requires_package("torch")
 def to_torch(x: np.ndarray):
+    if isinstance(x, torch.Tensor):
+        return x
     return torch.from_numpy(x)
 
 
 @requires_package("jax")
 def to_jax(x: np.ndarray):
+    if isinstance(x, jnp.ndarray):
+        return x
     return jnp.array(x)
 
 
@@ -166,6 +170,7 @@ def _precompute_statistics(self, overwrite_local_cache: bool = False):
             PerAtomFormationEnergyStats,
         )
         self.statistics.run_calculators()  # run the calculators
+        self._compute_average_nb_atoms()
 
     @classmethod
     def no_init(cls):
diff --git a/openqdc/datasets/interaction/_utils.py b/openqdc/datasets/interaction/_utils.py
index 3df948e..0d2915b 100644
--- a/openqdc/datasets/interaction/_utils.py
+++ b/openqdc/datasets/interaction/_utils.py
@@ -76,7 +76,7 @@ def convert_to_record(item):
         energies=item["energies"],
         subset=np.array([item["subset"]]),
         n_atoms=np.array([item["natoms0"] + item["natoms1"]], dtype=np.int32),
-        n_atoms_first=np.array([item["natoms0"]], dtype=np.int32),
+        n_atoms_ptr=np.array([item["natoms0"]], dtype=np.int32),
         atomic_inputs=item["atomic_inputs"],
         name=item["name"],
     )
@@ -95,8 +95,8 @@ def build_item(item, charge0, charge1, idx, data_dict, root, filename):
     datum["n_atoms"] = n_atoms
     datum["pos"] = pos
     datum["atomic_nums"] = atomic_nums
-    datum["n_atoms_first"] = np.array([int(item.setup["molecule_a"]["selection"].split("-")[1])], dtype=np.int32)
-    datum["natoms0"] = datum["n_atoms_first"][0]
+    datum["n_atoms_ptr"] = np.array([int(item.setup["molecule_a"]["selection"].split("-")[1])], dtype=np.int32)
+    datum["natoms0"] = datum["n_atoms_ptr"][0]
     datum["natoms1"] = datum["n_atoms"][0] - datum["natoms0"]
     datum["charges"] = np.expand_dims(np.array([charge0] * datum["natoms0"] + [charge1] * datum["natoms1"]), axis=1)
     datum["atomic_inputs"] = np.concatenate(
diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py
index 96f39c1..2ce5481 100644
--- a/openqdc/datasets/interaction/base.py
+++ b/openqdc/datasets/interaction/base.py
@@ -20,7 +20,7 @@ def pkl_data_types(self):
             "name": str,
             "subset": str,
             "n_atoms": np.int32,
-            "n_atoms_first": np.int32,
+            "n_atoms_ptr": np.int32,
         }
 
     def __getitem__(self, idx: int):
@@ -35,7 +35,7 @@ def __getitem__(self, idx: int):
         )
         name = self.__smiles_converter__(self.data["name"][idx])
         subset = self.data["subset"][idx]
-        n_atoms_first = self.data["n_atoms_first"][idx]
+        n_atoms_ptr = self.data["n_atoms_ptr"][idx]
 
         forces = None
         if "forces" in self.data:
@@ -52,7 +52,7 @@ def __getitem__(self, idx: int):
             name=name,
             subset=subset,
             forces=forces,
-            n_atoms_first=n_atoms_first,
+            n_atoms_ptr=n_atoms_ptr,
         )
 
         if self.transform is not None:
@@ -63,7 +63,7 @@ def __getitem__(self, idx: int):
     def get_ase_atoms(self, idx: int):
         entry = self[idx]
         at = to_atoms(entry["positions"], entry["atomic_numbers"])
-        at.info["n_atoms"] = entry["n_atoms_first"]
+        at.info["n_atoms"] = entry["n_atoms_ptr"]
         return at
 
     def save_xyz(self, idx: int, path: Optional[str] = None):
diff --git a/openqdc/datasets/interaction/des.py b/openqdc/datasets/interaction/des.py
index 7c542e2..a292fc3 100644
--- a/openqdc/datasets/interaction/des.py
+++ b/openqdc/datasets/interaction/des.py
@@ -59,7 +59,7 @@ def convert_to_record(item):
         energies=item["energies"],
         subset=np.array([item["subset"]]),
         n_atoms=np.array([item["natoms0"] + item["natoms1"]], dtype=np.int32),
-        n_atoms_first=np.array([item["natoms0"]], dtype=np.int32),
+        n_atoms_ptr=np.array([item["natoms0"]], dtype=np.int32),
         atomic_inputs=item["atomic_inputs"],
         name=item["name"],
     )
diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py
index 4dcb8a3..7f19154 100644
--- a/openqdc/datasets/interaction/dummy.py
+++ b/openqdc/datasets/interaction/dummy.py
@@ -27,7 +27,7 @@ def _post_init(self, overwrite_local_cache, energy_unit, distance_unit) -> None:
 
     def setup_dummy(self):
         n_atoms = np.array([np.random.randint(10, 30) for _ in range(len(self))])
-        n_atoms_first = np.array([np.random.randint(1, 10) for _ in range(len(self))])
+        n_atoms_ptr = np.array([np.random.randint(1, 10) for _ in range(len(self))])
         position_idx_range = np.concatenate([[0], np.cumsum(n_atoms)]).repeat(2)[1:-1].reshape(-1, 2)
         atomic_inputs = np.concatenate(
             [
@@ -54,7 +54,7 @@ def setup_dummy(self):
             atomic_inputs=atomic_inputs,
             subset=subset,
             energies=energies,
-            n_atoms_first=n_atoms_first,
+            n_atoms_ptr=n_atoms_ptr,
         )
         self.__average_nb_atoms__ = self.data["n_atoms"].mean()
 
diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py
index 99da5b0..60298c4 100644
--- a/openqdc/datasets/interaction/metcalf.py
+++ b/openqdc/datasets/interaction/metcalf.py
@@ -69,7 +69,7 @@ def content_to_xyz(content, subset):
         energies=e,
         atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32),
         name=np.array([name]),
-        n_atoms_first=np.array([-1]),
+        n_atoms_ptr=np.array([-1]),
     )
 
     return item
diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py
index 60cb503..72e808a 100644
--- a/openqdc/datasets/interaction/splinter.py
+++ b/openqdc/datasets/interaction/splinter.py
@@ -136,13 +136,13 @@ def read_raw_entries(self) -> List[Dict]:
                         ) = metadata[0].split("_")
                         r, theta_P, tau_P, theta_L, tau_L, tau_PL = [-1] * 6
                     energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32)
-                    n_atoms_first = np.array([int(metadata[-1])], dtype=np.int32)
+                    n_atoms_ptr = np.array([int(metadata[-1])], dtype=np.int32)
                     total_charge, charge0, charge1 = list(map(int, metadata[1:4]))
                     lines = list(map(lambda x: x.split(), lines[2:]))
                     pos = np.array(lines)[:, 1:].astype(np.float32)
                     elems = np.array(lines)[:, 0]
                     atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1)
-                    natoms0 = n_atoms_first[0]
+                    natoms0 = n_atoms_ptr[0]
                     natoms1 = n_atoms[0] - natoms0
                     charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
                     atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
@@ -152,7 +152,7 @@ def read_raw_entries(self) -> List[Dict]:
                         energies=energies,
                         subset=subset,
                         n_atoms=n_atoms,
-                        n_atoms_first=n_atoms_first,
+                        n_atoms_ptr=n_atoms_ptr,
                         atomic_inputs=atomic_inputs,
                         protein_monomer_name=np.array([protein_monomer_name]),
                         protein_interaction_site_type=np.array([protein_interaction_site_type]),
diff --git a/openqdc/datasets/potential/dummy.py b/openqdc/datasets/potential/dummy.py
index 1c7a61c..b485d40 100644
--- a/openqdc/datasets/potential/dummy.py
+++ b/openqdc/datasets/potential/dummy.py
@@ -14,7 +14,7 @@ class Dummy(BaseDataset):
     """
 
     __name__ = "dummy"
-    __energy_methods__ = [PotentialMethod.SVWN_DEF2_TZVP, PotentialMethod.PM6, PotentialMethod.GFN2_XTB]
+    __energy_methods__ = [PotentialMethod.GFN2_XTB, PotentialMethod.WB97X_D_DEF2_SVP, PotentialMethod.GFN2_XTB]
     __force_mask__ = [False, True, True]
     __energy_unit__ = "kcal/mol"
     __distance_unit__ = "ang"
@@ -31,7 +31,7 @@ def _post_init(self, overwrite_local_cache, energy_unit, distance_unit) -> None:
         return super()._post_init(overwrite_local_cache, energy_unit, distance_unit)
 
     def setup_dummy(self):
-        n_atoms = np.array([np.random.randint(1, 100) for _ in range(len(self))])
+        n_atoms = np.array([np.random.randint(2, 100) for _ in range(len(self))])
         position_idx_range = np.concatenate([[0], np.cumsum(n_atoms)]).repeat(2)[1:-1].reshape(-1, 2)
         atomic_inputs = np.concatenate(
             [

From 6f033cf579b48af74642e3ab7215a78dcfd9469e Mon Sep 17 00:00:00 2001
From: Nikhil Shenoy <nikhilshenoy98@gmail.com.com>
Date: Mon, 15 Apr 2024 16:43:36 +0000
Subject: [PATCH 27/27] Updated splinter reading from -1 to nan

---
 openqdc/datasets/interaction/splinter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py
index 72e808a..a793624 100644
--- a/openqdc/datasets/interaction/splinter.py
+++ b/openqdc/datasets/interaction/splinter.py
@@ -134,7 +134,7 @@ def read_raw_entries(self) -> List[Dict]:
                             index,
                             _,
                         ) = metadata[0].split("_")
-                        r, theta_P, tau_P, theta_L, tau_L, tau_PL = [-1] * 6
+                        r, theta_P, tau_P, theta_L, tau_L, tau_PL = [np.nan] * 6
                     energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32)
                     n_atoms_ptr = np.array([int(metadata[-1])], dtype=np.int32)
                     total_charge, charge0, charge1 = list(map(int, metadata[1:4]))