From 31beb71aed0497125a76e727710f014e2e76746a Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Tue, 26 Mar 2024 19:57:25 +0000 Subject: [PATCH 01/27] refactor interaction and initial testing --- openqdc/datasets/base.py | 8 +- openqdc/datasets/interaction/L7.py | 2 +- openqdc/datasets/interaction/X40.py | 2 +- openqdc/datasets/interaction/__init__.py | 2 + openqdc/datasets/interaction/base.py | 76 ++--------------- openqdc/datasets/interaction/dummy.py | 100 +++++++++++++++++++++++ openqdc/datasets/interaction/splinter.py | 2 +- tests/test_interaction.py | 19 +++++ 8 files changed, 139 insertions(+), 72 deletions(-) create mode 100644 openqdc/datasets/interaction/dummy.py create mode 100644 tests/test_interaction.py diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py index 94150e1..2c1d2fa 100644 --- a/openqdc/datasets/base.py +++ b/openqdc/datasets/base.py @@ -345,6 +345,10 @@ def data_keys(self): keys.remove("forces") return keys + @property + def pkl_data_keys(self): + return ["name", "subset", "n_atoms"] + @property def data_types(self): return { @@ -465,7 +469,7 @@ def save_preprocess(self, data_dict): # save smiles and subset local_path = p_join(self.preprocess_path, "props.pkl") - for key in ["name", "subset"]: + for key in self.pkl_data_keys: data_dict[key] = np.unique(data_dict[key], return_inverse=True) with open(local_path, "wb") as f: @@ -502,7 +506,7 @@ def read_preprocess(self, overwrite_local_cache=False): pull_locally(filename, overwrite=overwrite_local_cache) with open(filename, "rb") as f: tmp = pkl.load(f) - for key in ["name", "subset", "n_atoms"]: + for key in self.pkl_data_keys: x = tmp.pop(key) if len(x) == 2: self.data[key] = x[0][x[1]] diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py index 987df39..0454ce2 100644 --- a/openqdc/datasets/interaction/L7.py +++ b/openqdc/datasets/interaction/L7.py @@ -76,7 +76,7 @@ class L7(BaseInteractionDataset): "FN-DMC", ] - energy_target_names = [] + energy_target_names = __energy_methods__ def read_raw_entries(self) -> List[Dict]: yaml_fpath = os.path.join(self.root, "l7.yaml") diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py index 08f4037..3f23c6b 100644 --- a/openqdc/datasets/interaction/X40.py +++ b/openqdc/datasets/interaction/X40.py @@ -36,7 +36,7 @@ class X40(BaseInteractionDataset): "MP2.5/CBS(aDZ)", ] - energy_target_names = [] + energy_target_names = __energy_methods__ def read_raw_entries(self) -> List[Dict]: yaml_fpath = os.path.join(self.root, "x40.yaml") diff --git a/openqdc/datasets/interaction/__init__.py b/openqdc/datasets/interaction/__init__.py index 82154a5..ccabcfb 100644 --- a/openqdc/datasets/interaction/__init__.py +++ b/openqdc/datasets/interaction/__init__.py @@ -3,6 +3,7 @@ from .des370k import DES370K from .dess66 import DESS66 from .dess66x8 import DESS66x8 +from .dummy import DummyInteraction from .L7 import L7 from .metcalf import Metcalf from .splinter import Splinter @@ -10,6 +11,7 @@ AVAILABLE_INTERACTION_DATASETS = { "base": BaseInteractionDataset, + "dummy": DummyInteraction, "des5m": DES5M, "des370k": DES370K, "dess66": DESS66, diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py index ed7fcf7..25b3d9c 100644 --- a/openqdc/datasets/interaction/base.py +++ b/openqdc/datasets/interaction/base.py @@ -1,6 +1,6 @@ import pickle as pkl from os.path import join as p_join -from typing import Dict, List, Optional +from typing import Dict, List import numpy as np from loguru import logger @@ -8,24 +8,13 @@ from openqdc.datasets.base import BaseDataset from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory -from openqdc.utils.constants import NB_ATOMIC_FEATURES -from openqdc.utils.io import pull_locally, push_remote +from openqdc.utils.io import push_remote class BaseInteractionDataset(BaseDataset): - def __init__( - self, - energy_unit: Optional[str] = None, - distance_unit: Optional[str] = None, - overwrite_local_cache: bool = False, - cache_dir: Optional[str] = None, - ) -> None: - super().__init__( - energy_unit=energy_unit, - distance_unit=distance_unit, - overwrite_local_cache=overwrite_local_cache, - cache_dir=cache_dir, - ) + @property + def pkl_data_keys(self): + return ["name", "subset", "n_atoms", "n_atoms_first"] def collate_list(self, list_entries: List[Dict]): # concatenate entries @@ -42,24 +31,6 @@ def collate_list(self, list_entries: List[Dict]): return res - @property - def data_shapes(self): - return { - "atomic_inputs": (-1, NB_ATOMIC_FEATURES), - "position_idx_range": (-1, 2), - "energies": (-1, len(self.__energy_methods__)), - "forces": (-1, 3, len(self.force_target_names)), - } - - @property - def data_types(self): - return { - "atomic_inputs": np.float32, - "position_idx_range": np.int32, - "energies": np.float32, - "forces": np.float32, - } - def __getitem__(self, idx: int): shift = IsolatedAtomEnergyFactory.max_charge p_start, p_end = self.data["position_idx_range"][idx] @@ -102,40 +73,11 @@ def save_preprocess(self, data_dict): # save all other keys in props.pkl local_path = p_join(self.preprocess_path, "props.pkl") - for key in data_dict: - if key not in self.data_keys: - x = data_dict[key] - x[x == None] = -1 - data_dict[key] = np.unique(x, return_inverse=True) + for key in self.pkl_data_keys: + x = data_dict[key] + x[x == None] = -1 # noqa + data_dict[key] = np.unique(x, return_inverse=True) with open(local_path, "wb") as f: pkl.dump(data_dict, f) push_remote(local_path, overwrite=True) - - def read_preprocess(self, overwrite_local_cache=False): - logger.info("Reading preprocessed data.") - logger.info( - f"Dataset {self.__name__} with the following units:\n\ - Energy: {self.energy_unit},\n\ - Distance: {self.distance_unit},\n\ - Forces: {self.force_unit if self.__force_methods__ else 'None'}" - ) - self.data = {} - for key in self.data_keys: - filename = p_join(self.preprocess_path, f"{key}.mmap") - pull_locally(filename, overwrite=overwrite_local_cache) - self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(self.data_shapes[key]) - - filename = p_join(self.preprocess_path, "props.pkl") - pull_locally(filename, overwrite=overwrite_local_cache) - with open(filename, "rb") as f: - tmp = pkl.load(f) - for key in set(tmp.keys()) - set(self.data_keys): - x = tmp.pop(key) - if len(x) == 2: - self.data[key] = x[0][x[1]] - else: - self.data[key] = x - - for key in self.data: - logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}") diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py new file mode 100644 index 0000000..9e22703 --- /dev/null +++ b/openqdc/datasets/interaction/dummy.py @@ -0,0 +1,100 @@ +import numpy as np + +from openqdc.datasets.interaction.base import BaseDataset +from openqdc.utils.constants import NOT_DEFINED + + +class DummyInteraction(BaseDataset): + """ + Dummy Interaction Dataset for Testing + """ + + __name__ = "dummy" + __energy_methods__ = ["Method1", "Method2"] + __force_mask__ = [False, True] + __energy_unit__ = "kcal/mol" + __distance_unit__ = "ang" + __forces_unit__ = "kcal/mol/ang" + + energy_target_names = [f"energy{i}" for i in range(len(__energy_methods__))] + + force_target_names = [f"forces{i}" for i in range(len(__force_mask__))] + __isolated_atom_energies__ = [] + __average_n_atoms__ = None + + def __init__( + self, + energy_unit=None, + distance_unit=None, + cache_dir=None, + ) -> None: + try: + super().__init__(energy_unit=energy_unit, distance_unit=distance_unit, cache_dir=cache_dir) + + except: # noqa + pass + self._set_isolated_atom_energies() + self.setup_dummy() + + @property + def _stats(self): + return { + "formation": { + "energy": { + "mean": np.array([[-12.94348027, -9.83037297]]), + "std": np.array([[4.39971409, 3.3574188]]), + }, + "forces": NOT_DEFINED, + }, + "total": { + "energy": { + "mean": np.array([[-89.44242, -1740.5336]]), + "std": np.array([[29.599571, 791.48663]]), + }, + "forces": NOT_DEFINED, + }, + } + + def setup_dummy(self): + n_atoms = np.array([np.random.randint(10, 30) for _ in range(len(self))]) + n_atoms_first = np.array([np.random.randint(1, 10) for _ in range(len(self))]) + position_idx_range = np.concatenate([[0], np.cumsum(n_atoms)]).repeat(2)[1:-1].reshape(-1, 2) + atomic_inputs = np.concatenate( + [ + np.concatenate( + [ + # z, c, x, y, z + np.random.randint(1, 100, size=(size, 1)), + np.random.randint(-1, 2, size=(size, 1)), + np.random.randn(size, 3), + ], + axis=1, + ) + for size in n_atoms + ], + axis=0, + ) # (sum(n_atoms), 5) + name = [f"dummy_{i}" for i in range(len(self))] + subset = ["dummy" for i in range(len(self))] + energies = np.random.rand(len(self), len(self.energy_methods)) + forces = np.concatenate([np.random.randn(size, 3, len(self.force_methods)) * 100 for size in n_atoms]) + self.data = dict( + n_atoms=n_atoms, + position_idx_range=position_idx_range, + name=name, + atomic_inputs=atomic_inputs, + subset=subset, + energies=energies, + n_atoms_first=n_atoms_first, + forces=forces, + ) + self.__average_nb_atoms__ = self.data["n_atoms"].mean() + + def is_preprocessed(self): + return True + + def read_raw_entries(self): + pass + + def __len__(self): + return 9999 diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py index fd7f08f..c1fd5df 100644 --- a/openqdc/datasets/interaction/splinter.py +++ b/openqdc/datasets/interaction/splinter.py @@ -44,7 +44,7 @@ class Splinter(BaseInteractionDataset): "sapt0/aug-cc-pV(D+d)Z_disp_scaled", ] - energy_target_names = [] + energy_target_names = __energy_methods__ def read_raw_entries(self) -> List[Dict]: logger.info(f"Reading Splinter interaction data from {self.root}") diff --git a/tests/test_interaction.py b/tests/test_interaction.py new file mode 100644 index 0000000..8f1cc2f --- /dev/null +++ b/tests/test_interaction.py @@ -0,0 +1,19 @@ +try: + from openqdc.datasets.interaction import DummyInteraction + + dummy_loaded = True +except: # noqa + dummy_loaded = False + + +def test_import(): + assert dummy_loaded + + +def test_init(): + DummyInteraction() + + +def test_len(): + ds = DummyInteraction() + assert len(ds) == 9999 From dccf67646fbb79e0d985333460cfbfc7bd841f9a Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Tue, 26 Mar 2024 20:25:46 +0000 Subject: [PATCH 02/27] minor changes --- openqdc/datasets/interaction/__init__.py | 2 - openqdc/datasets/interaction/dummy.py | 15 ++++++-- openqdc/datasets/potential/dummy.py | 22 +++++++++-- tests/test_dummy.py | 47 +++++++++++++++++++++--- tests/test_interaction.py | 19 ---------- 5 files changed, 72 insertions(+), 33 deletions(-) delete mode 100644 tests/test_interaction.py diff --git a/openqdc/datasets/interaction/__init__.py b/openqdc/datasets/interaction/__init__.py index ccabcfb..82154a5 100644 --- a/openqdc/datasets/interaction/__init__.py +++ b/openqdc/datasets/interaction/__init__.py @@ -3,7 +3,6 @@ from .des370k import DES370K from .dess66 import DESS66 from .dess66x8 import DESS66x8 -from .dummy import DummyInteraction from .L7 import L7 from .metcalf import Metcalf from .splinter import Splinter @@ -11,7 +10,6 @@ AVAILABLE_INTERACTION_DATASETS = { "base": BaseInteractionDataset, - "dummy": DummyInteraction, "des5m": DES5M, "des370k": DES370K, "dess66": DESS66, diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py index 9e22703..af57e27 100644 --- a/openqdc/datasets/interaction/dummy.py +++ b/openqdc/datasets/interaction/dummy.py @@ -1,3 +1,5 @@ +from typing import Optional + import numpy as np from openqdc.datasets.interaction.base import BaseDataset @@ -24,9 +26,16 @@ class DummyInteraction(BaseDataset): def __init__( self, - energy_unit=None, - distance_unit=None, - cache_dir=None, + energy_unit: Optional[str] = None, + distance_unit: Optional[str] = None, + overwrite_local_cache: bool = False, + cache_dir: Optional[str] = None, + recompute_statistics: bool = False, + regressor_kwargs={ + "solver_type": "linear", + "sub_sample": None, + "stride": 1, + }, ) -> None: try: super().__init__(energy_unit=energy_unit, distance_unit=distance_unit, cache_dir=cache_dir) diff --git a/openqdc/datasets/potential/dummy.py b/openqdc/datasets/potential/dummy.py index 48ed3b2..5563544 100644 --- a/openqdc/datasets/potential/dummy.py +++ b/openqdc/datasets/potential/dummy.py @@ -1,3 +1,5 @@ +from typing import Optional + import numpy as np from openqdc.datasets.base import BaseDataset @@ -43,12 +45,24 @@ def _stats(self): def __init__( self, - energy_unit=None, - distance_unit=None, - cache_dir=None, + energy_unit: Optional[str] = None, + distance_unit: Optional[str] = None, + overwrite_local_cache: bool = False, + cache_dir: Optional[str] = None, + recompute_statistics: bool = False, + regressor_kwargs={ + "solver_type": "linear", + "sub_sample": None, + "stride": 1, + }, ) -> None: try: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit, cache_dir=cache_dir) + super().__init__( + energy_unit=energy_unit, + distance_unit=distance_unit, + cache_dir=cache_dir, + recompute_statistics=recompute_statistics, + ) except: # noqa pass diff --git a/tests/test_dummy.py b/tests/test_dummy.py index f82376c..b20c899 100644 --- a/tests/test_dummy.py +++ b/tests/test_dummy.py @@ -1,5 +1,8 @@ """Path hack to make tests work.""" +import pytest + +from openqdc.datasets.interaction.dummy import DummyInteraction # noqa: E402 from openqdc.datasets.potential.dummy import Dummy # noqa: E402 from openqdc.utils.atomization_energies import ( ISOLATED_ATOM_ENERGIES, @@ -7,13 +10,47 @@ ) -def test_dummy(): - ds = Dummy() - assert len(ds) > 10 - assert ds[100] +@pytest.fixture +def dummy(): + return Dummy() + + +@pytest.fixture +def dummy_interaction(): + return DummyInteraction() + + +@pytest.mark.parametrize("cls", ["dummy", "dummy_interaction"]) +def test_basic(cls, request): + # init + ds = request.getfixturevalue(cls) + + # len + assert len(ds) == 9999 + + # __getitem__ + assert ds[0] + + +@pytest.mark.parametrize("cls", ["dummy", "dummy_interaction"]) +@pytest.mark.parametrize( + "normalization", + [ + "formation", + "total", + # "residual_regression", + # "per_atom_formation", + # "per_atom_residual_regression" + ], +) +def test_stats(cls, normalization, request): + ds = request.getfixturevalue(cls) + + stats = ds.get_statistics(normalization=normalization) + assert stats is not None -def test_is_at_factory(): +def test_isolated_atom_factory(): res = IsolatedAtomEnergyFactory.get("mp2/cc-pvdz") assert len(res) == len(ISOLATED_ATOM_ENERGIES["mp2"]["cc-pvdz"]) res = IsolatedAtomEnergyFactory.get("PM6") diff --git a/tests/test_interaction.py b/tests/test_interaction.py deleted file mode 100644 index 8f1cc2f..0000000 --- a/tests/test_interaction.py +++ /dev/null @@ -1,19 +0,0 @@ -try: - from openqdc.datasets.interaction import DummyInteraction - - dummy_loaded = True -except: # noqa - dummy_loaded = False - - -def test_import(): - assert dummy_loaded - - -def test_init(): - DummyInteraction() - - -def test_len(): - ds = DummyInteraction() - assert len(ds) == 9999 From 2ab64aaf4d3ce367d86bd3981701ff87237bf453 Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Tue, 26 Mar 2024 20:30:12 +0000 Subject: [PATCH 03/27] dummy modification --- openqdc/datasets/interaction/dummy.py | 26 +++++----------------- openqdc/datasets/potential/dummy.py | 31 +++++---------------------- 2 files changed, 10 insertions(+), 47 deletions(-) diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py index af57e27..b88e623 100644 --- a/openqdc/datasets/interaction/dummy.py +++ b/openqdc/datasets/interaction/dummy.py @@ -1,5 +1,3 @@ -from typing import Optional - import numpy as np from openqdc.datasets.interaction.base import BaseDataset @@ -24,26 +22,9 @@ class DummyInteraction(BaseDataset): __isolated_atom_energies__ = [] __average_n_atoms__ = None - def __init__( - self, - energy_unit: Optional[str] = None, - distance_unit: Optional[str] = None, - overwrite_local_cache: bool = False, - cache_dir: Optional[str] = None, - recompute_statistics: bool = False, - regressor_kwargs={ - "solver_type": "linear", - "sub_sample": None, - "stride": 1, - }, - ) -> None: - try: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit, cache_dir=cache_dir) - - except: # noqa - pass - self._set_isolated_atom_energies() + def _post_init(self, overwrite_local_cache, energy_unit, distance_unit) -> None: self.setup_dummy() + return super()._post_init(overwrite_local_cache, energy_unit, distance_unit) @property def _stats(self): @@ -99,6 +80,9 @@ def setup_dummy(self): ) self.__average_nb_atoms__ = self.data["n_atoms"].mean() + def read_preprocess(self, overwrite_local_cache=False): + return + def is_preprocessed(self): return True diff --git a/openqdc/datasets/potential/dummy.py b/openqdc/datasets/potential/dummy.py index 5563544..f5b3aa1 100644 --- a/openqdc/datasets/potential/dummy.py +++ b/openqdc/datasets/potential/dummy.py @@ -1,5 +1,3 @@ -from typing import Optional - import numpy as np from openqdc.datasets.base import BaseDataset @@ -43,31 +41,9 @@ def _stats(self): }, } - def __init__( - self, - energy_unit: Optional[str] = None, - distance_unit: Optional[str] = None, - overwrite_local_cache: bool = False, - cache_dir: Optional[str] = None, - recompute_statistics: bool = False, - regressor_kwargs={ - "solver_type": "linear", - "sub_sample": None, - "stride": 1, - }, - ) -> None: - try: - super().__init__( - energy_unit=energy_unit, - distance_unit=distance_unit, - cache_dir=cache_dir, - recompute_statistics=recompute_statistics, - ) - - except: # noqa - pass - self._set_isolated_atom_energies() + def _post_init(self, overwrite_local_cache, energy_unit, distance_unit) -> None: self.setup_dummy() + return super()._post_init(overwrite_local_cache, energy_unit, distance_unit) def setup_dummy(self): n_atoms = np.array([np.random.randint(1, 100) for _ in range(len(self))]) @@ -102,6 +78,9 @@ def setup_dummy(self): ) self.__average_nb_atoms__ = self.data["n_atoms"].mean() + def read_preprocess(self, overwrite_local_cache=False): + return + def is_preprocessed(self): return True From 189ab90d6d2364daf47cc53c0a8a9c608aa26892 Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Fri, 29 Mar 2024 01:32:59 +0000 Subject: [PATCH 04/27] undo changes in interaction dataset, and minor change in shape --- openqdc/datasets/base.py | 4 ++-- openqdc/datasets/interaction/L7.py | 2 +- openqdc/datasets/interaction/X40.py | 2 +- openqdc/datasets/interaction/splinter.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py index 2c1d2fa..7b486a5 100644 --- a/openqdc/datasets/base.py +++ b/openqdc/datasets/base.py @@ -363,8 +363,8 @@ def data_shapes(self): return { "atomic_inputs": (-1, NB_ATOMIC_FEATURES), "position_idx_range": (-1, 2), - "energies": (-1, len(self.energy_target_names)), - "forces": (-1, 3, len(self.force_target_names)), + "energies": (-1, len(self.energy_methods)), + "forces": (-1, 3, len(self.force_methods)), } @property diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py index 0454ce2..987df39 100644 --- a/openqdc/datasets/interaction/L7.py +++ b/openqdc/datasets/interaction/L7.py @@ -76,7 +76,7 @@ class L7(BaseInteractionDataset): "FN-DMC", ] - energy_target_names = __energy_methods__ + energy_target_names = [] def read_raw_entries(self) -> List[Dict]: yaml_fpath = os.path.join(self.root, "l7.yaml") diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py index 3f23c6b..08f4037 100644 --- a/openqdc/datasets/interaction/X40.py +++ b/openqdc/datasets/interaction/X40.py @@ -36,7 +36,7 @@ class X40(BaseInteractionDataset): "MP2.5/CBS(aDZ)", ] - energy_target_names = __energy_methods__ + energy_target_names = [] def read_raw_entries(self) -> List[Dict]: yaml_fpath = os.path.join(self.root, "x40.yaml") diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py index c1fd5df..fd7f08f 100644 --- a/openqdc/datasets/interaction/splinter.py +++ b/openqdc/datasets/interaction/splinter.py @@ -44,7 +44,7 @@ class Splinter(BaseInteractionDataset): "sapt0/aug-cc-pV(D+d)Z_disp_scaled", ] - energy_target_names = __energy_methods__ + energy_target_names = [] def read_raw_entries(self) -> List[Dict]: logger.info(f"Reading Splinter interaction data from {self.root}") From 282dc919ee3b10afd9f89ce9196d66269cac4b13 Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Tue, 2 Apr 2024 00:50:05 +0000 Subject: [PATCH 05/27] changed super class to BaseInteractionDataset --- openqdc/datasets/interaction/dummy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py index b88e623..cfab609 100644 --- a/openqdc/datasets/interaction/dummy.py +++ b/openqdc/datasets/interaction/dummy.py @@ -1,10 +1,10 @@ import numpy as np -from openqdc.datasets.interaction.base import BaseDataset +from openqdc.datasets.interaction.base import BaseInteractionDataset from openqdc.utils.constants import NOT_DEFINED -class DummyInteraction(BaseDataset): +class DummyInteraction(BaseInteractionDataset): """ Dummy Interaction Dataset for Testing """ From afea05302b514e937688254200a9d526ec524f0d Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Wed, 3 Apr 2024 20:14:20 +0000 Subject: [PATCH 06/27] further simplified and rebase --- openqdc/datasets/base.py | 14 +++++++++++--- openqdc/datasets/interaction/base.py | 25 +------------------------ openqdc/datasets/interaction/dummy.py | 3 ++- 3 files changed, 14 insertions(+), 28 deletions(-) diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py index 2ade62e..c12e5c4 100644 --- a/openqdc/datasets/base.py +++ b/openqdc/datasets/base.py @@ -424,8 +424,13 @@ def save_preprocess(self, data_dict): # save smiles and subset local_path = p_join(self.preprocess_path, "props.pkl") - for key in self.pkl_data_keys: - data_dict[key] = np.unique(data_dict[key], return_inverse=True) + # assert that required keys are present in data_dict + assert all([key in data_dict for key in self.pkl_data_keys]) + for key in data_dict: + if key not in self.data_keys: + x = data_dict[key] + x[x == None] = -1 # noqa + data_dict[key] = np.unique(data_dict[key], return_inverse=True) with open(local_path, "wb") as f: pkl.dump(data_dict, f) @@ -461,7 +466,10 @@ def read_preprocess(self, overwrite_local_cache=False): pull_locally(filename, overwrite=overwrite_local_cache) with open(filename, "rb") as f: tmp = pkl.load(f) - for key in self.pkl_data_keys: + all_pkl_keys = set(tmp.keys()) - set(self.data_keys) + # assert required pkl_keys are present in all_pkl_keys + assert all([key in all_pkl_keys for key in self.pkl_data_keys]) + for key in all_pkl_keys: x = tmp.pop(key) if len(x) == 2: self.data[key] = x[0][x[1]] diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py index 987340a..627cec4 100644 --- a/openqdc/datasets/interaction/base.py +++ b/openqdc/datasets/interaction/base.py @@ -1,16 +1,14 @@ import os -import pickle as pkl from os.path import join as p_join from typing import Dict, List, Optional import numpy as np from ase.io.extxyz import write_extxyz -from loguru import logger from sklearn.utils import Bunch from openqdc.datasets.base import BaseDataset from openqdc.utils.constants import MAX_CHARGE -from openqdc.utils.io import push_remote, to_atoms +from openqdc.utils.io import to_atoms class BaseInteractionDataset(BaseDataset): @@ -65,27 +63,6 @@ def __getitem__(self, idx: int): n_atoms_first=n_atoms_first, ) - def save_preprocess(self, data_dict): - # save memmaps - logger.info("Preprocessing data and saving it to cache.") - for key in self.data_keys: - local_path = p_join(self.preprocess_path, f"{key}.mmap") - out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape) - out[:] = data_dict.pop(key)[:] - out.flush() - push_remote(local_path, overwrite=True) - - # save all other keys in props.pkl - local_path = p_join(self.preprocess_path, "props.pkl") - for key in self.pkl_data_keys: - x = data_dict[key] - x[x == None] = -1 # noqa - data_dict[key] = np.unique(x, return_inverse=True) - - with open(local_path, "wb") as f: - pkl.dump(data_dict, f) - push_remote(local_path, overwrite=True) - def get_ase_atoms(self, idx: int): entry = self[idx] at = to_atoms(entry["positions"], entry["atomic_numbers"]) diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py index cfab609..48e92a9 100644 --- a/openqdc/datasets/interaction/dummy.py +++ b/openqdc/datasets/interaction/dummy.py @@ -1,6 +1,7 @@ import numpy as np from openqdc.datasets.interaction.base import BaseInteractionDataset +from openqdc.methods import InteractionMethod from openqdc.utils.constants import NOT_DEFINED @@ -10,7 +11,7 @@ class DummyInteraction(BaseInteractionDataset): """ __name__ = "dummy" - __energy_methods__ = ["Method1", "Method2"] + __energy_methods__ = [InteractionMethod.SAPT0_AUG_CC_PVDDZ, InteractionMethod.CCSD_T_CC_PVDZ] __force_mask__ = [False, True] __energy_unit__ = "kcal/mol" __distance_unit__ = "ang" From ebc2adfd595e534bb5a630f69a8d882c9087e9f5 Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Fri, 5 Apr 2024 21:13:38 +0000 Subject: [PATCH 07/27] fixes --- openqdc/datasets/base.py | 17 +++-- openqdc/datasets/interaction/L7.py | 79 +++++++++++++----------- openqdc/datasets/interaction/X40.py | 10 +-- openqdc/datasets/interaction/base.py | 9 ++- openqdc/datasets/interaction/splinter.py | 2 +- openqdc/raws/config_factory.py | 43 +++++++++++-- openqdc/utils/preprocess.py | 2 +- 7 files changed, 107 insertions(+), 55 deletions(-) diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py index c12e5c4..9dc3204 100644 --- a/openqdc/datasets/base.py +++ b/openqdc/datasets/base.py @@ -310,7 +310,11 @@ def data_keys(self): @property def pkl_data_keys(self): - return ["name", "subset", "n_atoms"] + return list(self.pkl_data_types.keys()) + + @property + def pkl_data_types(self): + return {"name": str, "subset": str, "n_atoms": np.int32} @property def data_types(self): @@ -424,12 +428,13 @@ def save_preprocess(self, data_dict): # save smiles and subset local_path = p_join(self.preprocess_path, "props.pkl") + # assert that required keys are present in data_dict - assert all([key in data_dict for key in self.pkl_data_keys]) - for key in data_dict: - if key not in self.data_keys: - x = data_dict[key] - x[x == None] = -1 # noqa + assert all([key in self.pkl_data_keys for key in data_dict.keys()]) + + # store unique and inverse indices for str-based pkl keys + for key in self.pkl_data_keys: + if self.pkl_data_types[key] == str: data_dict[key] = np.unique(data_dict[key], return_inverse=True) with open(local_path, "wb") as f: diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py index fa16509..c72e2c1 100644 --- a/openqdc/datasets/interaction/L7.py +++ b/openqdc/datasets/interaction/L7.py @@ -1,5 +1,7 @@ import os -from typing import Dict, List +from dataclasses import dataclass +from functools import partial +from typing import Dict, List, Optional import numpy as np import yaml @@ -10,42 +12,49 @@ from openqdc.utils.constants import ATOM_TABLE -class DataItemYAMLObj: - def __init__(self, name, shortname, geometry, reference_value, setup, group, tags): - self.name = name - self.shortname = shortname - self.geometry = geometry - self.reference_value = reference_value - self.setup = setup - self.group = group - self.tags = tags - - -class DataSetYAMLObj: - def __init__(self, name, references, text, method_energy, groups_by, groups, global_setup, method_geometry=None): - self.name = name - self.references = references - self.text = text - self.method_energy = method_energy - self.method_geometry = method_geometry - self.groups_by = groups_by - self.groups = groups - self.global_setup = global_setup - - -def data_item_constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode): - return DataItemYAMLObj(**loader.construct_mapping(node)) +@dataclass +class DataSet: + description: Dict + items: List[Dict] + alternative_reference: Dict -def dataset_constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode): - return DataSetYAMLObj(**loader.construct_mapping(node)) +@dataclass +class DataItemYAMLObj: + name: str + shortname: str + geometry: str + reference_value: float + setup: Dict + group: str + tags: str + + +@dataclass +class DataSetDescription: + name: Dict + references: str + text: str + groups_by: str + groups: List[str] + global_setup: Dict + method_energy: str + method_geometry: Optional[str] = None def get_loader(): """Add constructors to PyYAML loader.""" + + def constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode, cls): + return cls(**loader.construct_mapping(node)) + loader = yaml.SafeLoader - loader.add_constructor("!ruby/object:ProtocolDataset::DataSetItem", data_item_constructor) - loader.add_constructor("!ruby/object:ProtocolDataset::DataSetDescription", dataset_constructor) + + loader.add_constructor("!ruby/object:ProtocolDataset::DataSet", partial(constructor, cls=DataSet)) + loader.add_constructor("!ruby/object:ProtocolDataset::DataSetItem", partial(constructor, cls=DataItemYAMLObj)) + loader.add_constructor( + "!ruby/object:ProtocolDataset::DataSetDescription", partial(constructor, cls=DataSetDescription) + ) return loader @@ -62,7 +71,7 @@ class L7(BaseInteractionDataset): http://cuby4.molecular.cz/dataset_l7.html """ - __name__ = "L7" + __name__ = "l7" __energy_unit__ = "hartree" __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" @@ -87,10 +96,10 @@ def read_raw_entries(self) -> List[Dict]: yaml_file = open(yaml_fpath, "r") data = [] data_dict = yaml.load(yaml_file, Loader=get_loader()) - charge0 = int(data_dict["description"].global_setup["molecule_a"]["charge"]) - charge1 = int(data_dict["description"].global_setup["molecule_b"]["charge"]) + charge0 = int(data_dict.description.global_setup["molecule_a"]["charge"]) + charge1 = int(data_dict.description.global_setup["molecule_b"]["charge"]) - for idx, item in enumerate(data_dict["items"]): + for idx, item in enumerate(data_dict.items): energies = [] name = np.array([item.shortname]) fname = item.geometry.split(":")[1] @@ -101,7 +110,7 @@ def read_raw_entries(self) -> List[Dict]: n_atoms = np.array([int(lines[0][0])], dtype=np.int32) n_atoms_first = np.array([int(item.setup["molecule_a"]["selection"].split("-")[1])], dtype=np.int32) subset = np.array([item.group]) - energies += [float(val[idx]) for val in list(data_dict["alternative_reference"].values())] + energies += [float(val[idx]) for val in list(data_dict.alternative_reference.values())] energies = np.array([energies], dtype=np.float32) pos = np.array(lines[1:])[:, 1:].astype(np.float32) elems = np.array(lines[1:])[:, 0] diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py index 98a9d67..dfb43d0 100644 --- a/openqdc/datasets/interaction/X40.py +++ b/openqdc/datasets/interaction/X40.py @@ -25,7 +25,7 @@ class X40(BaseInteractionDataset): http://cuby4.molecular.cz/dataset_x40.html """ - __name__ = "X40" + __name__ = "x40" __energy_unit__ = "hartree" __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" @@ -48,10 +48,10 @@ def read_raw_entries(self) -> List[Dict]: yaml_file = open(yaml_fpath, "r") data = [] data_dict = yaml.load(yaml_file, Loader=get_loader()) - charge0 = int(data_dict["description"].global_setup["molecule_a"]["charge"]) - charge1 = int(data_dict["description"].global_setup["molecule_b"]["charge"]) + charge0 = int(data_dict.description.global_setup["molecule_a"]["charge"]) + charge1 = int(data_dict.description.global_setup["molecule_b"]["charge"]) - for idx, item in enumerate(data_dict["items"]): + for idx, item in enumerate(data_dict.items): energies = [] name = np.array([item.shortname]) energies.append(float(item.reference_value)) @@ -62,7 +62,7 @@ def read_raw_entries(self) -> List[Dict]: n_atoms_first = setup[0].split("-")[1] n_atoms_first = np.array([int(n_atoms_first)], dtype=np.int32) subset = np.array([item.group]) - energies += [float(val[idx]) for val in list(data_dict["alternative_reference"].values())] + energies += [float(val[idx]) for val in list(data_dict.alternative_reference.values())] energies = np.array([energies], dtype=np.float32) pos = np.array(lines[1:])[:, 1:].astype(np.float32) elems = np.array(lines[1:])[:, 0] diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py index 627cec4..0c801fa 100644 --- a/openqdc/datasets/interaction/base.py +++ b/openqdc/datasets/interaction/base.py @@ -15,8 +15,13 @@ class BaseInteractionDataset(BaseDataset): __energy_type__ = [] @property - def pkl_data_keys(self): - return ["name", "subset", "n_atoms", "n_atoms_first"] + def pkl_data_types(self): + return { + "name": str, + "subset": str, + "n_atoms": np.int32, + "n_atoms_first": np.int32, + } def collate_list(self, list_entries: List[Dict]): # concatenate entries diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py index a57275d..39e930b 100644 --- a/openqdc/datasets/interaction/splinter.py +++ b/openqdc/datasets/interaction/splinter.py @@ -130,7 +130,7 @@ def read_raw_entries(self) -> List[Dict]: index, _, ) = metadata[0].split("_") - r, theta_P, tau_P, theta_L, tau_L, tau_PL = [None] * 6 + r, theta_P, tau_P, theta_L, tau_L, tau_PL = [-1] * 6 energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32) n_atoms_first = np.array([int(metadata[-1])], dtype=np.int32) total_charge, charge0, charge1 = list(map(int, metadata[1:4])) diff --git a/openqdc/raws/config_factory.py b/openqdc/raws/config_factory.py index b4784ed..e7750c5 100644 --- a/openqdc/raws/config_factory.py +++ b/openqdc/raws/config_factory.py @@ -51,6 +51,14 @@ class DataConfigFactory: links={"rdkit_folder.tar.gz": "https://dataverse.harvard.edu/api/access/datafile/4327252"}, ) + l7 = dict( + dataset_name="l7", + links={ + "l7.yaml": "http://cuby4.molecular.cz/download_datasets/l7.yaml", + "geometries.tar.gz": "http://cuby4.molecular.cz/download_geometries/L7.tar", + }, + ) + molecule3d = dict( dataset_name="molecule3d", links={"molecule3d.zip": "https://drive.google.com/uc?id=1C_KRf8mX-gxny7kL9ACNCEV4ceu_fUGy"}, @@ -86,6 +94,28 @@ class DataConfigFactory: links={"spice-2.0.0.hdf5": "https://zenodo.org/records/10835749/files/SPICE-2.0.0.hdf5?download=1"}, ) + splinter = dict( + dataset_name="splinter", + links={ + "dimerpairs.0.tar.gz": "https://figshare.com/ndownloader/files/39449167", + "dimerpairs.1.tar.gz": "https://figshare.com/ndownloader/files/40271983", + "dimerpairs.2.tar.gz": "https://figshare.com/ndownloader/files/40271989", + "dimerpairs.3.tar.gz": "https://figshare.com/ndownloader/files/40272001", + "dimerpairs.4.tar.gz": "https://figshare.com/ndownloader/files/40272022", + "dimerpairs.5.tar.gz": "https://figshare.com/ndownloader/files/40552931", + "dimerpairs.6.tar.gz": "https://figshare.com/ndownloader/files/40272040", + "dimerpairs.7.tar.gz": "https://figshare.com/ndownloader/files/40272052", + "dimerpairs.8.tar.gz": "https://figshare.com/ndownloader/files/40272061", + "dimerpairs.9.tar.gz": "https://figshare.com/ndownloader/files/40272064", + "dimerpairs_nonstandard.tar.gz": "https://figshare.com/ndownloader/files/40272067", + "lig_interaction_sites.sdf": "https://figshare.com/ndownloader/files/40272070", + "lig_monomers.sdf": "https://figshare.com/ndownloader/files/40272073", + "prot_interaction_sites.sdf": "https://figshare.com/ndownloader/files/40272076", + "prot_monomers.sdf": "https://figshare.com/ndownloader/files/40272079", + "merge_monomers.py": "https://figshare.com/ndownloader/files/41807682", + }, + ) + dess = dict( dataset_name="dess5m", links={ @@ -161,11 +191,6 @@ class DataConfigFactory: links={"Transition1x.h5": "https://figshare.com/ndownloader/files/36035789"}, ) - # l7 = dict( - # dataset_name="l7", - # links={"l7.zip": "http://www.begdb.org/moldown.php?id=40"} - # ) - des_s66 = dict( dataset_name="des_s66", links={"DESS66.zip": "https://zenodo.org/records/5676284/files/DESS66.zip?download=1"}, @@ -180,6 +205,14 @@ class DataConfigFactory: links={"revmd17.zip": "https://figshare.com/ndownloader/articles/12672038/versions/3"}, ) + x40 = dict( + dataset_name="x40", + links={ + "x40.yaml": "http://cuby4.molecular.cz/download_datasets/x40.yaml", + "geometries.tar.gz": "http://cuby4.molecular.cz/download_geometries/X40.tar", + }, + ) + available_datasets = [k for k in locals().keys() if not k.startswith("__")] def __init__(self): diff --git a/openqdc/utils/preprocess.py b/openqdc/utils/preprocess.py index a7dd9c7..0fee22b 100644 --- a/openqdc/utils/preprocess.py +++ b/openqdc/utils/preprocess.py @@ -7,7 +7,7 @@ from openqdc import AVAILABLE_DATASETS options = list(AVAILABLE_DATASETS.values()) -options_map = {d.__name__: d for d in options} +options_map = {d.__name__.lower(): d for d in options} @click.command() From ed8e264c688c79adfe872d6115b72e48042feebb Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Fri, 5 Apr 2024 23:42:02 +0000 Subject: [PATCH 08/27] Updated metcalf --- openqdc/datasets/interaction/metcalf.py | 48 +++++++++++++++++++++++++ openqdc/raws/config_factory.py | 5 +++ 2 files changed, 53 insertions(+) diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py index 819d5dc..1905918 100644 --- a/openqdc/datasets/interaction/metcalf.py +++ b/openqdc/datasets/interaction/metcalf.py @@ -2,12 +2,58 @@ from typing import Dict, List import numpy as np +from loguru import logger from openqdc.datasets.interaction.base import BaseInteractionDataset from openqdc.methods import InteractionMethod, InterEnergyType from openqdc.utils.constants import ATOM_TABLE +def extract_raw_tar_gz(folder): + # go over all files + logger.info(f"Extracting all tar.gz files in {folder}") + expected_tar_files = { + "train": [ + "TRAINING-2073-ssi-neutral.tar.gz", + "TRAINING-2610-donors-perturbed.tar.gz", + "TRAINING-4795-acceptors-perturbed.tar.gz", + ], + "val": ["VALIDATION-125-donors.tar.gz", "VALIDATION-254-acceptors.tar.gz"], + "test": [ + "TEST-Acc--3-methylbutan-2-one_Don--NMe-acetamide-PLDB.tar.gz", + "TEST-Acc--Cyclohexanone_Don--NMe-acetamide-PLDB.tar.gz", + "TEST-Acc--Isoquinolone_NMe-acetamide.tar.gz", + "TEST-Acc--NMe-acetamide_Don--Aniline-CSD.tar.gz", + "TEST-Acc--NMe-acetamide_Don--Aniline-PLDB.tar.gz", + "TEST-Acc--NMe-acetamide_Don--N-isopropylacetamide-PLDB.tar.gz", + "TEST-Acc--NMe-acetamide_Don--N-phenylbenzamide-PLDB.tar.gz", + "TEST-Acc--NMe-acetamide_Don--Naphthalene-1H-PLDB.tar.gz", + "TEST-Acc--NMe-acetamide_Don--Uracil-PLDB.tar.gz", + "TEST-Acc--Tetrahydro-2H-pyran-2-one_NMe-acetamide-PLDB.tar.gz", + "TEST-NMe-acetamide_Don--Benzimidazole-PLDB.tar.gz", + ], + } + + # create a folder with the same name as the tar.gz file + for subset in expected_tar_files: + for tar_file in expected_tar_files[subset]: + logger.info(f"Extracting {tar_file}") + tar_file_path = os.path.join(folder, tar_file) + + # check if tar file exists + if not os.path.exists(tar_file_path): + raise FileNotFoundError(f"File {tar_file_path} not found") + + # skip if extracted folder exists + if os.path.exists(os.path.join(folder, tar_file.replace(".tar.gz", ""))): + logger.info(f"Skipping {tar_file}") + continue + + tar_folder_path = tar_file_path.replace(".tar.gz", "") + os.mkdir(tar_folder_path) + os.system(f"tar -xzf {tar_file_path} -C {tar_folder_path}") + + class Metcalf(BaseInteractionDataset): """ Hydrogen-bonded dimers of NMA with 126 molecules as described in: @@ -53,6 +99,8 @@ class Metcalf(BaseInteractionDataset): ] def read_raw_entries(self) -> List[Dict]: + # extract in folders + extract_raw_tar_gz(self.root) data = [] for dirname in os.listdir(self.root): xyz_dir = os.path.join(self.root, dirname) diff --git a/openqdc/raws/config_factory.py b/openqdc/raws/config_factory.py index 16d8ee1..9f8c6c1 100644 --- a/openqdc/raws/config_factory.py +++ b/openqdc/raws/config_factory.py @@ -299,6 +299,11 @@ class DataConfigFactory: }, ) + metcalf = dict( + dataset_name="metcalf", + links={"model-data.tar.gz": "https://zenodo.org/records/10934211/files/model-data.tar?download=1"}, + ) + misato = dict( dataset_name="misato", links={ From 18bc79c4b1cf368d75029e3431b0761022940f02 Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Sat, 6 Apr 2024 00:46:11 +0000 Subject: [PATCH 09/27] bug fix and simplifying interaction dataset --- openqdc/datasets/base.py | 4 ++-- openqdc/datasets/interaction/base.py | 17 +---------------- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py index 469a033..b5bc43b 100644 --- a/openqdc/datasets/base.py +++ b/openqdc/datasets/base.py @@ -341,8 +341,8 @@ def save_preprocess(self, data_dict): # save smiles and subset local_path = p_join(self.preprocess_path, "props.pkl") - # assert that required keys are present in data_dict - assert all([key in self.pkl_data_keys for key in data_dict.keys()]) + # assert that (required) pkl keys are present in data_dict + assert all([key in data_dict.keys() for key in self.pkl_data_keys]) # store unique and inverse indices for str-based pkl keys for key in self.pkl_data_keys: diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py index 18b6a1e..96f39c1 100644 --- a/openqdc/datasets/interaction/base.py +++ b/openqdc/datasets/interaction/base.py @@ -1,6 +1,6 @@ import os from os.path import join as p_join -from typing import Dict, List, Optional +from typing import Optional import numpy as np from ase.io.extxyz import write_extxyz @@ -23,21 +23,6 @@ def pkl_data_types(self): "n_atoms_first": np.int32, } - def collate_list(self, list_entries: List[Dict]): - # concatenate entries - res = { - key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) - for key in list_entries[0] - if not isinstance(list_entries[0][key], dict) - } - - csum = np.cumsum(res.get("n_atoms")) - x = np.zeros((csum.shape[0], 2), dtype=np.int32) - x[1:, 0], x[:, 1] = csum[:-1], csum - res["position_idx_range"] = x - - return res - def __getitem__(self, idx: int): shift = MAX_CHARGE p_start, p_end = self.data["position_idx_range"][idx] From 2a6e3ef3c1e8b47fbde1e9bcea0570675324f854 Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Sat, 6 Apr 2024 01:14:56 +0000 Subject: [PATCH 10/27] Updated tests for interaction datasets --- openqdc/datasets/interaction/base.py | 3 ++ openqdc/datasets/interaction/dummy.py | 2 +- tests/test_dummy.py | 40 +++++++++++++++++++-------- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py index 96f39c1..8a8e2ea 100644 --- a/openqdc/datasets/interaction/base.py +++ b/openqdc/datasets/interaction/base.py @@ -42,6 +42,7 @@ def __getitem__(self, idx: int): forces = self._convert_array(np.array(self.data["forces"][p_start:p_end], dtype=np.float32)) e0 = self._convert_array(np.array(self.__isolated_atom_energies__[..., z, c + shift].T, dtype=np.float32)) + formation_energies = energies - e0.sum(axis=0) bunch = Bunch( positions=positions, @@ -49,6 +50,8 @@ def __getitem__(self, idx: int): charges=c, e0=e0, energies=energies, + formation_energies=formation_energies, + per_atom_formation_energies=formation_energies / len(z), name=name, subset=subset, forces=forces, diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py index 48e92a9..71bf5ee 100644 --- a/openqdc/datasets/interaction/dummy.py +++ b/openqdc/datasets/interaction/dummy.py @@ -10,7 +10,7 @@ class DummyInteraction(BaseInteractionDataset): Dummy Interaction Dataset for Testing """ - __name__ = "dummy" + __name__ = "dummy_interaction" __energy_methods__ = [InteractionMethod.SAPT0_AUG_CC_PVDDZ, InteractionMethod.CCSD_T_CC_PVDZ] __force_mask__ = [False, True] __energy_unit__ = "kcal/mol" diff --git a/tests/test_dummy.py b/tests/test_dummy.py index 08ee127..a241384 100644 --- a/tests/test_dummy.py +++ b/tests/test_dummy.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from openqdc.datasets.interaction.dummy import DummyInteraction # noqa: E402 from openqdc.datasets.potential.dummy import Dummy # noqa: E402 from openqdc.utils.io import get_local_cache from openqdc.utils.package_utils import has_package @@ -12,6 +13,7 @@ # start by removing any cached data cache_dir = get_local_cache() os.system(f"rm -rf {cache_dir}/dummy") +os.system(f"rm -rf {cache_dir}/dummy_interaction") if has_package("torch"): @@ -28,22 +30,30 @@ @pytest.fixture -def ds(): +def dummy(): return Dummy() -def test_dummy(ds): +@pytest.fixture +def dummy_interaction(): + return DummyInteraction() + + +@pytest.mark.parametrize("ds", ["dummy", "dummy_interaction"]) +def test_dummy(ds, request): + ds = request.getfixturevalue(ds) assert ds is not None assert len(ds) == 9999 assert ds[100] +@pytest.mark.parametrize("interaction_ds", [False, True]) @pytest.mark.parametrize("format", ["numpy", "torch", "jax"]) -def test_array_format(format): +def test_dummy_array_format(interaction_ds, format): if not has_package(format): pytest.skip(f"{format} is not installed, skipping test") - ds = Dummy(array_format=format) + ds = DummyInteraction(array_format=format) if interaction_ds else Dummy(array_format=format) keys = [ "positions", @@ -61,13 +71,14 @@ def test_array_format(format): assert isinstance(data[key], format_to_type[format]) -def test_transform(): +@pytest.mark.parametrize("interaction_ds", [False, True]) +def test_transform(interaction_ds): def custom_fn(bunch): # create new name bunch.new_key = bunch.name + bunch.subset return bunch - ds = Dummy(transform=custom_fn) + ds = DummyInteraction(transform=custom_fn) if interaction_ds else Dummy(transform=custom_fn) data = ds[0] @@ -75,14 +86,18 @@ def custom_fn(bunch): assert data["new_key"] == data["name"] + data["subset"] -def test_get_statistics(ds): +@pytest.mark.parametrize("ds", ["dummy", "dummy_interaction"]) +def test_get_statistics(ds, request): + ds = request.getfixturevalue(ds) stats = ds.get_statistics() keys = ["ForcesCalculatorStats", "FormationEnergyStats", "PerAtomFormationEnergyStats", "TotalEnergyStats"] assert all(k in stats for k in keys) -def test_energy_statistics_shapes(ds): +@pytest.mark.parametrize("ds", ["dummy", "dummy_interaction"]) +def test_energy_statistics_shapes(ds, request): + ds = request.getfixturevalue(ds) stats = ds.get_statistics() num_methods = len(ds.energy_methods) @@ -100,7 +115,9 @@ def test_energy_statistics_shapes(ds): assert total_energy_stats["std"].shape == (1, num_methods) -def test_force_statistics_shapes(ds): +@pytest.mark.parametrize("ds", ["dummy", "dummy_interaction"]) +def test_force_statistics_shapes(ds, request): + ds = request.getfixturevalue(ds) stats = ds.get_statistics() num_force_methods = len(ds.force_methods) @@ -115,12 +132,13 @@ def test_force_statistics_shapes(ds): assert forces_stats["component_rms"].shape == (3, num_force_methods) +@pytest.mark.parametrize("interaction_ds", [False, True]) @pytest.mark.parametrize("format", ["numpy", "torch", "jax"]) -def test_stats_array_format(format): +def test_stats_array_format(interaction_ds, format): if not has_package(format): pytest.skip(f"{format} is not installed, skipping test") - ds = Dummy(array_format=format) + ds = DummyInteraction(array_format=format) if interaction_ds else Dummy(array_format=format) stats = ds.get_statistics() for key in stats.keys(): From 749327386a6e34d79ae39e0474036aa09fdbba02 Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Sat, 6 Apr 2024 01:21:56 +0000 Subject: [PATCH 11/27] removed stale stats in dummy interaction --- openqdc/datasets/interaction/dummy.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py index 71bf5ee..085b732 100644 --- a/openqdc/datasets/interaction/dummy.py +++ b/openqdc/datasets/interaction/dummy.py @@ -2,7 +2,6 @@ from openqdc.datasets.interaction.base import BaseInteractionDataset from openqdc.methods import InteractionMethod -from openqdc.utils.constants import NOT_DEFINED class DummyInteraction(BaseInteractionDataset): @@ -27,25 +26,6 @@ def _post_init(self, overwrite_local_cache, energy_unit, distance_unit) -> None: self.setup_dummy() return super()._post_init(overwrite_local_cache, energy_unit, distance_unit) - @property - def _stats(self): - return { - "formation": { - "energy": { - "mean": np.array([[-12.94348027, -9.83037297]]), - "std": np.array([[4.39971409, 3.3574188]]), - }, - "forces": NOT_DEFINED, - }, - "total": { - "energy": { - "mean": np.array([[-89.44242, -1740.5336]]), - "std": np.array([[29.599571, 791.48663]]), - }, - "forces": NOT_DEFINED, - }, - } - def setup_dummy(self): n_atoms = np.array([np.random.randint(10, 30) for _ in range(len(self))]) n_atoms_first = np.array([np.random.randint(1, 10) for _ in range(len(self))]) From ed73e7d97ca434fb707a3804427f833009119348 Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Sat, 6 Apr 2024 16:39:01 +0000 Subject: [PATCH 12/27] changes based on comments --- openqdc/datasets/interaction/base.py | 3 --- openqdc/datasets/interaction/des370k.py | 7 +----- openqdc/datasets/interaction/des5m.py | 2 +- openqdc/datasets/interaction/dess66.py | 8 +----- openqdc/datasets/interaction/dess66x8.py | 6 ----- openqdc/datasets/interaction/dummy.py | 5 +--- openqdc/datasets/statistics.py | 3 ++- tests/test_dummy.py | 31 ++++++++++++++++-------- 8 files changed, 27 insertions(+), 38 deletions(-) diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py index 8a8e2ea..96f39c1 100644 --- a/openqdc/datasets/interaction/base.py +++ b/openqdc/datasets/interaction/base.py @@ -42,7 +42,6 @@ def __getitem__(self, idx: int): forces = self._convert_array(np.array(self.data["forces"][p_start:p_end], dtype=np.float32)) e0 = self._convert_array(np.array(self.__isolated_atom_energies__[..., z, c + shift].T, dtype=np.float32)) - formation_energies = energies - e0.sum(axis=0) bunch = Bunch( positions=positions, @@ -50,8 +49,6 @@ def __getitem__(self, idx: int): charges=c, e0=e0, energies=energies, - formation_energies=formation_energies, - per_atom_formation_energies=formation_energies / len(z), name=name, subset=subset, forces=forces, diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py index 250d42d..5d6e966 100644 --- a/openqdc/datasets/interaction/des370k.py +++ b/openqdc/datasets/interaction/des370k.py @@ -101,22 +101,17 @@ def _read_raw_entries(cls) -> List[Dict]: logger.info(f"Reading {cls._name} interaction data from {filepath}") df = pd.read_csv(filepath) data = [] - for idx, row in tqdm(df.iterrows(), total=df.shape[0]): + for _, row in tqdm(df.iterrows(), total=df.shape[0]): smiles0, smiles1 = row["smiles0"], row["smiles1"] charge0, charge1 = row["charge0"], row["charge1"] natoms0, natoms1 = row["natoms0"], row["natoms1"] pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) elements = row["elements"].split() - atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1) - charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) - atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) - energies = np.array(row[cls.energy_target_names].values).astype(np.float32)[None, :] - name = np.array([smiles0 + "." + smiles1]) subsets = [] diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py index 979909c..49c3f4a 100644 --- a/openqdc/datasets/interaction/des5m.py +++ b/openqdc/datasets/interaction/des5m.py @@ -75,4 +75,4 @@ class DES5M(DES370K): __forces_unit__ = "kcal/mol/ang" def read_raw_entries(self) -> List[Dict]: - return DES5M._read_raw_entries() + return super()._read_raw_entries() diff --git a/openqdc/datasets/interaction/dess66.py b/openqdc/datasets/interaction/dess66.py index c10811b..e608adb 100644 --- a/openqdc/datasets/interaction/dess66.py +++ b/openqdc/datasets/interaction/dess66.py @@ -96,24 +96,18 @@ def read_raw_entries(self) -> List[Dict]: logger.info(f"Reading DESS66 interaction data from {self.filepath}") df = pd.read_csv(self.filepath) data = [] - for idx, row in tqdm(df.iterrows(), total=df.shape[0]): + for _, row in tqdm(df.iterrows(), total=df.shape[0]): smiles0, smiles1 = row["smiles0"], row["smiles1"] charge0, charge1 = row["charge0"], row["charge1"] natoms0, natoms1 = row["natoms0"], row["natoms1"] pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) elements = row["elements"].split() - atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1) - charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) - atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) - energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] - name = np.array([smiles0 + "." + smiles1]) - subset = row["system_name"] item = dict( diff --git a/openqdc/datasets/interaction/dess66x8.py b/openqdc/datasets/interaction/dess66x8.py index 709620a..8467eef 100644 --- a/openqdc/datasets/interaction/dess66x8.py +++ b/openqdc/datasets/interaction/dess66x8.py @@ -104,17 +104,11 @@ def read_raw_entries(self) -> List[Dict]: pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) elements = row["elements"].split() - atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1) - charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) - atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) - energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] - name = np.array([smiles0 + "." + smiles1]) - subset = row["system_name"] item = dict( diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py index 085b732..4dcb8a3 100644 --- a/openqdc/datasets/interaction/dummy.py +++ b/openqdc/datasets/interaction/dummy.py @@ -11,14 +11,13 @@ class DummyInteraction(BaseInteractionDataset): __name__ = "dummy_interaction" __energy_methods__ = [InteractionMethod.SAPT0_AUG_CC_PVDDZ, InteractionMethod.CCSD_T_CC_PVDZ] - __force_mask__ = [False, True] + __force_mask__ = [False, False] __energy_unit__ = "kcal/mol" __distance_unit__ = "ang" __forces_unit__ = "kcal/mol/ang" energy_target_names = [f"energy{i}" for i in range(len(__energy_methods__))] - force_target_names = [f"forces{i}" for i in range(len(__force_mask__))] __isolated_atom_energies__ = [] __average_n_atoms__ = None @@ -48,7 +47,6 @@ def setup_dummy(self): name = [f"dummy_{i}" for i in range(len(self))] subset = ["dummy" for i in range(len(self))] energies = np.random.rand(len(self), len(self.energy_methods)) - forces = np.concatenate([np.random.randn(size, 3, len(self.force_methods)) * 100 for size in n_atoms]) self.data = dict( n_atoms=n_atoms, position_idx_range=position_idx_range, @@ -57,7 +55,6 @@ def setup_dummy(self): subset=subset, energies=energies, n_atoms_first=n_atoms_first, - forces=forces, ) self.__average_nb_atoms__ = self.data["n_atoms"].mean() diff --git a/openqdc/datasets/statistics.py b/openqdc/datasets/statistics.py index e4fe9e5..2122271 100644 --- a/openqdc/datasets/statistics.py +++ b/openqdc/datasets/statistics.py @@ -21,7 +21,8 @@ def to_dict(self): def transform(self, func): for k, v in self.to_dict().items(): - setattr(self, k, func(v)) + if v is not None: + setattr(self, k, func(v)) @dataclass diff --git a/tests/test_dummy.py b/tests/test_dummy.py index a241384..e38a6dc 100644 --- a/tests/test_dummy.py +++ b/tests/test_dummy.py @@ -10,10 +10,15 @@ from openqdc.utils.io import get_local_cache from openqdc.utils.package_utils import has_package + # start by removing any cached data -cache_dir = get_local_cache() -os.system(f"rm -rf {cache_dir}/dummy") -os.system(f"rm -rf {cache_dir}/dummy_interaction") +@pytest.fixture(autouse=True) +def clean_before_run(): + # start by removing any cached data + cache_dir = get_local_cache() + os.system(f"rm -rf {cache_dir}/dummy") + os.system(f"rm -rf {cache_dir}/dummy_interaction") + yield if has_package("torch"): @@ -62,12 +67,15 @@ def test_dummy_array_format(interaction_ds, format): "energies", "forces", "e0", - "formation_energies", - "per_atom_formation_energies", ] + if not interaction_ds: + # additional keys returned from the potential dataset + keys.extend(["formation_energies", "per_atom_formation_energies"]) data = ds[0] for key in keys: + if data[key] is None: + continue assert isinstance(data[key], format_to_type[format]) @@ -125,11 +133,12 @@ def test_force_statistics_shapes(ds, request): keys = ["mean", "std", "component_mean", "component_std", "component_rms"] assert all(k in forces_stats for k in keys) - assert forces_stats["mean"].shape == (1, num_force_methods) - assert forces_stats["std"].shape == (1, num_force_methods) - assert forces_stats["component_mean"].shape == (3, num_force_methods) - assert forces_stats["component_std"].shape == (3, num_force_methods) - assert forces_stats["component_rms"].shape == (3, num_force_methods) + if len(ds.force_methods) > 0: + assert forces_stats["mean"].shape == (1, num_force_methods) + assert forces_stats["std"].shape == (1, num_force_methods) + assert forces_stats["component_mean"].shape == (3, num_force_methods) + assert forces_stats["component_std"].shape == (3, num_force_methods) + assert forces_stats["component_rms"].shape == (3, num_force_methods) @pytest.mark.parametrize("interaction_ds", [False, True]) @@ -143,4 +152,6 @@ def test_stats_array_format(interaction_ds, format): for key in stats.keys(): for k, v in stats[key].items(): + if v is None: + continue assert isinstance(v, format_to_type[format]) From 03590229872d72c798bd1f8a5e44455287a806da Mon Sep 17 00:00:00 2001 From: FNTwin Date: Sat, 6 Apr 2024 12:49:53 -0400 Subject: [PATCH 13/27] Clean metcalf --- openqdc/datasets/interaction/metcalf.py | 144 ++++++++++++------------ tests/test_dummy.py | 12 +- 2 files changed, 81 insertions(+), 75 deletions(-) diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py index 1905918..34da7ef 100644 --- a/openqdc/datasets/interaction/metcalf.py +++ b/openqdc/datasets/interaction/metcalf.py @@ -1,57 +1,85 @@ import os +from glob import glob +from io import StringIO +from os.path import join as p_join from typing import Dict, List import numpy as np from loguru import logger +from tqdm import tqdm from openqdc.datasets.interaction.base import BaseInteractionDataset from openqdc.methods import InteractionMethod, InterEnergyType +from openqdc.raws.config_factory import decompress_tar_gz from openqdc.utils.constants import ATOM_TABLE +EXPECTED_TAR_FILES = { + "train": [ + "TRAINING-2073-ssi-neutral.tar.gz", + "TRAINING-2610-donors-perturbed.tar.gz", + "TRAINING-4795-acceptors-perturbed.tar.gz", + ], + "val": ["VALIDATION-125-donors.tar.gz", "VALIDATION-254-acceptors.tar.gz"], + "test": [ + "TEST-Acc--3-methylbutan-2-one_Don--NMe-acetamide-PLDB.tar.gz", + "TEST-Acc--Cyclohexanone_Don--NMe-acetamide-PLDB.tar.gz", + "TEST-Acc--Isoquinolone_NMe-acetamide.tar.gz", + "TEST-Acc--NMe-acetamide_Don--Aniline-CSD.tar.gz", + "TEST-Acc--NMe-acetamide_Don--Aniline-PLDB.tar.gz", + "TEST-Acc--NMe-acetamide_Don--N-isopropylacetamide-PLDB.tar.gz", + "TEST-Acc--NMe-acetamide_Don--N-phenylbenzamide-PLDB.tar.gz", + "TEST-Acc--NMe-acetamide_Don--Naphthalene-1H-PLDB.tar.gz", + "TEST-Acc--NMe-acetamide_Don--Uracil-PLDB.tar.gz", + "TEST-Acc--Tetrahydro-2H-pyran-2-one_NMe-acetamide-PLDB.tar.gz", + "TEST-NMe-acetamide_Don--Benzimidazole-PLDB.tar.gz", + ], +} + def extract_raw_tar_gz(folder): - # go over all files logger.info(f"Extracting all tar.gz files in {folder}") - expected_tar_files = { - "train": [ - "TRAINING-2073-ssi-neutral.tar.gz", - "TRAINING-2610-donors-perturbed.tar.gz", - "TRAINING-4795-acceptors-perturbed.tar.gz", - ], - "val": ["VALIDATION-125-donors.tar.gz", "VALIDATION-254-acceptors.tar.gz"], - "test": [ - "TEST-Acc--3-methylbutan-2-one_Don--NMe-acetamide-PLDB.tar.gz", - "TEST-Acc--Cyclohexanone_Don--NMe-acetamide-PLDB.tar.gz", - "TEST-Acc--Isoquinolone_NMe-acetamide.tar.gz", - "TEST-Acc--NMe-acetamide_Don--Aniline-CSD.tar.gz", - "TEST-Acc--NMe-acetamide_Don--Aniline-PLDB.tar.gz", - "TEST-Acc--NMe-acetamide_Don--N-isopropylacetamide-PLDB.tar.gz", - "TEST-Acc--NMe-acetamide_Don--N-phenylbenzamide-PLDB.tar.gz", - "TEST-Acc--NMe-acetamide_Don--Naphthalene-1H-PLDB.tar.gz", - "TEST-Acc--NMe-acetamide_Don--Uracil-PLDB.tar.gz", - "TEST-Acc--Tetrahydro-2H-pyran-2-one_NMe-acetamide-PLDB.tar.gz", - "TEST-NMe-acetamide_Don--Benzimidazole-PLDB.tar.gz", - ], - } - - # create a folder with the same name as the tar.gz file - for subset in expected_tar_files: - for tar_file in expected_tar_files[subset]: - logger.info(f"Extracting {tar_file}") - tar_file_path = os.path.join(folder, tar_file) - - # check if tar file exists - if not os.path.exists(tar_file_path): - raise FileNotFoundError(f"File {tar_file_path} not found") - - # skip if extracted folder exists - if os.path.exists(os.path.join(folder, tar_file.replace(".tar.gz", ""))): - logger.info(f"Skipping {tar_file}") - continue - - tar_folder_path = tar_file_path.replace(".tar.gz", "") - os.mkdir(tar_folder_path) - os.system(f"tar -xzf {tar_file_path} -C {tar_folder_path}") + for subset in EXPECTED_TAR_FILES: + for tar_file in EXPECTED_TAR_FILES[subset]: + tar_file_path = p_join(folder, tar_file) + try: + decompress_tar_gz(tar_file_path) + except FileNotFoundError as e: + raise FileNotFoundError(f"File {tar_file_path} not found") from e + + +def content_to_xyz(content, subset): + try: + num_atoms = np.array([int(content.split("\n")[0])]) + tmp = content.split("\n")[1].split(",") + name = tmp[0] + e = tmp[1:-1] + except Exception as e: + logger.warning(f"Encountered exception in {content} : {e}") + return None + + s = StringIO(content) + d = np.loadtxt(s, skiprows=2, dtype="str") + z, positions = d[:, 0], d[:, 1:].astype(np.float32) + z = np.array([ATOM_TABLE.GetAtomicNumber(s) for s in z]) + xs = np.stack((z, np.zeros_like(z)), axis=-1) + + item = dict( + n_atoms=num_atoms, + subset=np.array([subset]), + energies=e, + atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32), + name=np.array([name]), + n_atoms_first=np.array([-1]), + ) + + return item + + +def read_xyz(fname, subset): + with open(fname, "r") as f: + contents = f.read().split("\n\n") + res = [content_to_xyz(content, subset) for content in tqdm(contents)] + return res class Metcalf(BaseInteractionDataset): @@ -102,35 +130,9 @@ def read_raw_entries(self) -> List[Dict]: # extract in folders extract_raw_tar_gz(self.root) data = [] - for dirname in os.listdir(self.root): - xyz_dir = os.path.join(self.root, dirname) - if not os.path.isdir(xyz_dir): - continue + for _, dirname, _ in os.walk(self.root): + xyz_dir = p_join(self.root, dirname) subset = np.array([dirname.split("-")[0].lower()]) # training, validation, or test - for filename in os.listdir(xyz_dir): - if not filename.endswith(".xyz"): - continue - lines = list(map(lambda x: x.strip(), open(os.path.join(xyz_dir, filename), "r").readlines())) - line_two = lines[1].split(",") - energies = np.array([line_two[1:6]], dtype=np.float32) - num_atoms = np.array([int(lines[0])]) - - elem_xyz = np.array([x.split() for x in lines[2:]]) - elements = elem_xyz[:, 0] - xyz = elem_xyz[:, 1:].astype(np.float32) - atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1) - charges = np.expand_dims(np.array([0] * num_atoms[0]), axis=1) - - atomic_inputs = np.concatenate((atomic_nums, charges, xyz), axis=-1, dtype=np.float32) - - item = dict( - n_atoms=num_atoms, - subset=subset, - energies=energies, - positions=xyz, - atomic_inputs=atomic_inputs, - name=np.array([""]), - n_atoms_first=np.array([-1]), - ) - data.append(item) + for filename in glob(xyz_dir + f"{os.sep}*.xyz"): + data.append(read_xyz(filename, subset)) return data diff --git a/tests/test_dummy.py b/tests/test_dummy.py index a241384..7efbc18 100644 --- a/tests/test_dummy.py +++ b/tests/test_dummy.py @@ -10,10 +10,14 @@ from openqdc.utils.io import get_local_cache from openqdc.utils.package_utils import has_package -# start by removing any cached data -cache_dir = get_local_cache() -os.system(f"rm -rf {cache_dir}/dummy") -os.system(f"rm -rf {cache_dir}/dummy_interaction") + +@pytest.fixture(autouse=True) +def clean_before_run(): + # start by removing any cached data + cache_dir = get_local_cache() + os.system(f"rm -rf {cache_dir}/dummy") + os.system(f"rm -rf {cache_dir}/dummy_interaction") + yield if has_package("torch"): From 33fa342b87f8f72be0170247474f0a7fa79a1f78 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Sat, 6 Apr 2024 12:58:32 -0400 Subject: [PATCH 14/27] Simplification --- openqdc/datasets/interaction/metcalf.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py index 34da7ef..99da5b0 100644 --- a/openqdc/datasets/interaction/metcalf.py +++ b/openqdc/datasets/interaction/metcalf.py @@ -130,9 +130,6 @@ def read_raw_entries(self) -> List[Dict]: # extract in folders extract_raw_tar_gz(self.root) data = [] - for _, dirname, _ in os.walk(self.root): - xyz_dir = p_join(self.root, dirname) - subset = np.array([dirname.split("-")[0].lower()]) # training, validation, or test - for filename in glob(xyz_dir + f"{os.sep}*.xyz"): - data.append(read_xyz(filename, subset)) + for filename in glob(self.root + f"{os.sep}*.xyz"): + data.append(read_xyz(filename, self.__name__)) return data From cd486a885719a638d15889f8832cefc18342c73f Mon Sep 17 00:00:00 2001 From: FNTwin Date: Sat, 6 Apr 2024 13:15:03 -0400 Subject: [PATCH 15/27] cleaned des --- openqdc/datasets/interaction/des370k.py | 96 ++++++++++++++----------- openqdc/datasets/interaction/des5m.py | 2 +- 2 files changed, 57 insertions(+), 41 deletions(-) diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des370k.py index 250d42d..ee72923 100644 --- a/openqdc/datasets/interaction/des370k.py +++ b/openqdc/datasets/interaction/des370k.py @@ -13,6 +13,58 @@ from openqdc.utils.molecule import molecule_groups +def parse_des_df(row, energy_target_names): + smiles0, smiles1 = row["smiles0"], row["smiles1"] + charge0, charge1 = row["charge0"], row["charge1"] + natoms0, natoms1 = row["natoms0"], row["natoms1"] + pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) + elements = row["elements"].split() + atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1) + charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) + atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) + energies = np.array(row[energy_target_names].values).astype(np.float32)[None, :] + name = np.array([smiles0 + "." + smiles1]) + return { + "energies": energies, + "n_atoms": np.array([natoms0 + natoms1], dtype=np.int32), + "name": name, + "atomic_inputs": atomic_inputs, + "charges": charges, + "atomic_nums": atomic_nums, + "elements": elements, + "natoms0": natoms0, + "natoms1": natoms1, + "smiles0": smiles0, + "smiles1": smiles1, + "charge0": charge0, + "charge1": charge1, + } + + +def create_subset(smiles0, smiles1): + subsets = [] + for smiles in [smiles0, smiles1]: + found = False + for functional_group, smiles_set in molecule_groups.items(): + if smiles in smiles_set: + subsets.append(functional_group) + found = True + if not found: + logger.info(f"molecule group lookup failed for {smiles}") + return subsets + + +def convert_to_record(item): + return dict( + energies=item["energies"], + subset=np.array([item["subsets"]]), + n_atoms=np.array([item["natoms0"] + item["natoms1"]], dtype=np.int32), + n_atoms_first=np.array([item["natoms0"]], dtype=np.int32), + atomic_inputs=item["atomic_inputs"], + name=item["name"], + ) + + class DES370K(BaseInteractionDataset): """ DE Shaw Research interaction energy of over 370K @@ -95,50 +147,14 @@ class DES370K(BaseInteractionDataset): def _root(cls): return os.path.join(get_local_cache(), cls._name) - @classmethod - def _read_raw_entries(cls) -> List[Dict]: + def read_raw_entries(cls) -> List[Dict]: filepath = os.path.join(cls._root(), cls._filename) logger.info(f"Reading {cls._name} interaction data from {filepath}") df = pd.read_csv(filepath) data = [] for idx, row in tqdm(df.iterrows(), total=df.shape[0]): - smiles0, smiles1 = row["smiles0"], row["smiles1"] - charge0, charge1 = row["charge0"], row["charge1"] - natoms0, natoms1 = row["natoms0"], row["natoms1"] - pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) - - elements = row["elements"].split() - - atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1) - - charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) - - atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) - - energies = np.array(row[cls.energy_target_names].values).astype(np.float32)[None, :] - - name = np.array([smiles0 + "." + smiles1]) - - subsets = [] - for smiles in [smiles0, smiles1]: - found = False - for functional_group, smiles_set in molecule_groups.items(): - if smiles in smiles_set: - subsets.append(functional_group) - found = True - if not found: - logger.info(f"molecule group lookup failed for {smiles}") - - item = dict( - energies=energies, - subset=np.array([subsets]), - n_atoms=np.array([natoms0 + natoms1], dtype=np.int32), - n_atoms_first=np.array([natoms0], dtype=np.int32), - atomic_inputs=atomic_inputs, - name=name, - ) + item = parse_des_df(row, cls.energy_target_names) + item["subset"] = create_subset(item["smiles0"], item["smiles1"]) + item = convert_to_record(item) data.append(item) return data - - def read_raw_entries(self) -> List[Dict]: - return DES370K._read_raw_entries() diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py index 979909c..e274ba8 100644 --- a/openqdc/datasets/interaction/des5m.py +++ b/openqdc/datasets/interaction/des5m.py @@ -75,4 +75,4 @@ class DES5M(DES370K): __forces_unit__ = "kcal/mol/ang" def read_raw_entries(self) -> List[Dict]: - return DES5M._read_raw_entries() + return super().read_raw_entries() From 80d7371823a875db92e641a6d6eb33d44a92b1be Mon Sep 17 00:00:00 2001 From: FNTwin Date: Sat, 6 Apr 2024 13:31:42 -0400 Subject: [PATCH 16/27] Simplified des dataset --- openqdc/datasets/interaction/__init__.py | 6 +- .../interaction/{des370k.py => des.py} | 0 openqdc/datasets/interaction/des5m.py | 161 +++++++++++++++++- openqdc/datasets/interaction/dess66.py | 59 ++++--- openqdc/datasets/interaction/dess66x8.py | 129 -------------- 5 files changed, 190 insertions(+), 165 deletions(-) rename openqdc/datasets/interaction/{des370k.py => des.py} (100%) delete mode 100644 openqdc/datasets/interaction/dess66x8.py diff --git a/openqdc/datasets/interaction/__init__.py b/openqdc/datasets/interaction/__init__.py index fa3bebd..bf8c834 100644 --- a/openqdc/datasets/interaction/__init__.py +++ b/openqdc/datasets/interaction/__init__.py @@ -1,8 +1,6 @@ from .base import BaseInteractionDataset # noqa -from .des5m import DES5M -from .des370k import DES370K -from .dess66 import DESS66 -from .dess66x8 import DESS66x8 +from .des import DES5M, DES370K +from .dess66 import DESS66, DESS66x8 from .L7 import L7 from .metcalf import Metcalf from .splinter import Splinter diff --git a/openqdc/datasets/interaction/des370k.py b/openqdc/datasets/interaction/des.py similarity index 100% rename from openqdc/datasets/interaction/des370k.py rename to openqdc/datasets/interaction/des.py diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py index e274ba8..710fa39 100644 --- a/openqdc/datasets/interaction/des5m.py +++ b/openqdc/datasets/interaction/des5m.py @@ -1,7 +1,163 @@ +import os from typing import Dict, List -from openqdc.datasets.interaction.des370k import DES370K +import numpy as np +import pandas as pd +from loguru import logger +from tqdm import tqdm + +from openqdc.datasets.interaction.base import BaseInteractionDataset from openqdc.methods import InteractionMethod, InterEnergyType +from openqdc.utils.constants import ATOM_TABLE +from openqdc.utils.io import get_local_cache +from openqdc.utils.molecule import molecule_groups + + +def parse_des_df(row, energy_target_names): + smiles0, smiles1 = row["smiles0"], row["smiles1"] + charge0, charge1 = row["charge0"], row["charge1"] + natoms0, natoms1 = row["natoms0"], row["natoms1"] + pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) + elements = row["elements"].split() + atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1) + charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) + atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) + energies = np.array(row[energy_target_names].values).astype(np.float32)[None, :] + name = np.array([smiles0 + "." + smiles1]) + return { + "energies": energies, + "n_atoms": np.array([natoms0 + natoms1], dtype=np.int32), + "name": name, + "atomic_inputs": atomic_inputs, + "charges": charges, + "atomic_nums": atomic_nums, + "elements": elements, + "natoms0": natoms0, + "natoms1": natoms1, + "smiles0": smiles0, + "smiles1": smiles1, + "charge0": charge0, + "charge1": charge1, + } + + +def create_subset(smiles0, smiles1): + subsets = [] + for smiles in [smiles0, smiles1]: + found = False + for functional_group, smiles_set in molecule_groups.items(): + if smiles in smiles_set: + subsets.append(functional_group) + found = True + if not found: + logger.info(f"molecule group lookup failed for {smiles}") + return subsets + + +def convert_to_record(item): + return dict( + energies=item["energies"], + subset=np.array([item["subsets"]]), + n_atoms=np.array([item["natoms0"] + item["natoms1"]], dtype=np.int32), + n_atoms_first=np.array([item["natoms0"]], dtype=np.int32), + atomic_inputs=item["atomic_inputs"], + name=item["name"], + ) + + +class DES370K(BaseInteractionDataset): + """ + DE Shaw Research interaction energy of over 370K + small molecule dimers as described in the paper: + + Quantum chemical benchmark databases of gold-standard dimer interaction energies. + Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. + Sci Data 8, 55 (2021). + https://doi.org/10.1038/s41597-021-00833-x + """ + + __name__ = "des370k_interaction" + __energy_unit__ = "kcal/mol" + __distance_unit__ = "ang" + __forces_unit__ = "kcal/mol/ang" + __energy_methods__ = [ + InteractionMethod.MP2_CC_PVDZ, + InteractionMethod.MP2_CC_PVQZ, + InteractionMethod.MP2_CC_PVTZ, + InteractionMethod.MP2_CBS, + InteractionMethod.CCSD_T_CC_PVDZ, + InteractionMethod.CCSD_T_CBS, + InteractionMethod.CCSD_T_NN, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + ] + + __energy_type__ = [ + InterEnergyType.TOTAL, + InterEnergyType.TOTAL, + InterEnergyType.TOTAL, + InterEnergyType.TOTAL, + InterEnergyType.TOTAL, + InterEnergyType.TOTAL, + InterEnergyType.TOTAL, + InterEnergyType.TOTAL, + InterEnergyType.ES, + InterEnergyType.EX, + InterEnergyType.EX_S2, + InterEnergyType.IND, + InterEnergyType.EX_IND, + InterEnergyType.DISP, + InterEnergyType.EX_DISP_OS, + InterEnergyType.EX_DISP_SS, + InterEnergyType.DELTA_HF, + ] + + energy_target_names = [ + "cc_MP2_all", + "qz_MP2_all", + "tz_MP2_all", + "cbs_MP2_all", + "cc_CCSD(T)_all", + "cbs_CCSD(T)_all", + "nn_CCSD(T)_all", + "sapt_all", + "sapt_es", + "sapt_ex", + "sapt_exs2", + "sapt_ind", + "sapt_exind", + "sapt_disp", + "sapt_exdisp_os", + "sapt_exdisp_ss", + "sapt_delta_HF", + ] + + _filename = "DES370K.csv" + _name = "des370k_interaction" + + @classmethod + def _root(cls): + return os.path.join(get_local_cache(), cls._name) + + def read_raw_entries(cls) -> List[Dict]: + filepath = os.path.join(cls._root(), cls._filename) + logger.info(f"Reading {cls._name} interaction data from {filepath}") + df = pd.read_csv(filepath) + data = [] + for idx, row in tqdm(df.iterrows(), total=df.shape[0]): + item = parse_des_df(row, cls.energy_target_names) + item["subset"] = create_subset(item["smiles0"], item["smiles1"]) + item = convert_to_record(item) + data.append(item) + return data class DES5M(DES370K): @@ -73,6 +229,3 @@ class DES5M(DES370K): __energy_unit__ = "kcal/mol" __distance_unit__ = "ang" __forces_unit__ = "kcal/mol/ang" - - def read_raw_entries(self) -> List[Dict]: - return super().read_raw_entries() diff --git a/openqdc/datasets/interaction/dess66.py b/openqdc/datasets/interaction/dess66.py index c10811b..45bf6bd 100644 --- a/openqdc/datasets/interaction/dess66.py +++ b/openqdc/datasets/interaction/dess66.py @@ -1,14 +1,18 @@ import os from typing import Dict, List -import numpy as np import pandas as pd from loguru import logger from tqdm import tqdm from openqdc.datasets.interaction.base import BaseInteractionDataset +from openqdc.datasets.interaction.des370k import convert_to_record, parse_des_df from openqdc.methods import InteractionMethod, InterEnergyType -from openqdc.utils.constants import ATOM_TABLE + +CSV_NAME = { + "des_s66": "DESS66.csv", + "des_s66x8": "DESS66x8.csv", +} class DESS66(BaseInteractionDataset): @@ -91,38 +95,37 @@ class DESS66(BaseInteractionDataset): "sapt_delta_HF", ] + @property + def csv_path(self): + return os.path.join(self.root, CSV_NAME[self.__name__]) + def read_raw_entries(self) -> List[Dict]: - self.filepath = os.path.join(self.root, "DESS66.csv") - logger.info(f"Reading DESS66 interaction data from {self.filepath}") - df = pd.read_csv(self.filepath) + filepath = self.csv_path + logger.info(f"Reading DESS66 interaction data from {filepath}") + df = pd.read_csv(filepath) data = [] for idx, row in tqdm(df.iterrows(), total=df.shape[0]): - smiles0, smiles1 = row["smiles0"], row["smiles1"] - charge0, charge1 = row["charge0"], row["charge1"] - natoms0, natoms1 = row["natoms0"], row["natoms1"] - pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) - - elements = row["elements"].split() - - atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1) + item = parse_des_df(row) + item["subset"] = row["system_name"] + data.append(convert_to_record(item)) + return data - charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) - atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) +class DESS66x8(DESS66): + """ + DE Shaw Research interaction energy + estimates of all 528 conformers from + the original S66x8 dataset as described + in the paper: - energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] + Quantum chemical benchmark databases of gold-standard dimer interaction energies. + Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. + Sci Data 8, 55 (2021). + https://doi.org/10.1038/s41597-021-00833-x - name = np.array([smiles0 + "." + smiles1]) + Data was downloaded from Zenodo: - subset = row["system_name"] + https://zenodo.org/records/5676284 + """ - item = dict( - energies=energies, - subset=np.array([subset]), - n_atoms=np.array([natoms0 + natoms1], dtype=np.int32), - n_atoms_first=np.array([natoms0], dtype=np.int32), - atomic_inputs=atomic_inputs, - name=name, - ) - data.append(item) - return data + __name__ = "des_s66x8" diff --git a/openqdc/datasets/interaction/dess66x8.py b/openqdc/datasets/interaction/dess66x8.py deleted file mode 100644 index 709620a..0000000 --- a/openqdc/datasets/interaction/dess66x8.py +++ /dev/null @@ -1,129 +0,0 @@ -import os -from typing import Dict, List - -import numpy as np -import pandas as pd -from loguru import logger -from tqdm import tqdm - -from openqdc.datasets.interaction.base import BaseInteractionDataset -from openqdc.methods import InteractionMethod, InterEnergyType -from openqdc.utils.constants import ATOM_TABLE - - -class DESS66x8(BaseInteractionDataset): - """ - DE Shaw Research interaction energy - estimates of all 528 conformers from - the original S66x8 dataset as described - in the paper: - - Quantum chemical benchmark databases of gold-standard dimer interaction energies. - Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. - Sci Data 8, 55 (2021). - https://doi.org/10.1038/s41597-021-00833-x - - Data was downloaded from Zenodo: - - https://zenodo.org/records/5676284 - """ - - __name__ = "des_s66x8" - __energy_unit__ = "kcal/mol" - __distance_unit__ = "ang" - __forces_unit__ = "kcal/mol/ang" - __energy_methods__ = [ - InteractionMethod.MP2_CC_PVDZ, - InteractionMethod.MP2_CC_PVQZ, - InteractionMethod.MP2_CC_PVTZ, - InteractionMethod.MP2_CBS, - InteractionMethod.CCSD_T_CC_PVDZ, - InteractionMethod.CCSD_T_CBS, - InteractionMethod.CCSD_T_NN, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - ] - - __energy_type__ = [ - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.ES, - InterEnergyType.EX, - InterEnergyType.EX_S2, - InterEnergyType.IND, - InterEnergyType.EX_IND, - InterEnergyType.DISP, - InterEnergyType.EX_DISP_OS, - InterEnergyType.EX_DISP_SS, - InterEnergyType.DELTA_HF, - ] - - energy_target_names = [ - "cc_MP2_all", - "qz_MP2_all", - "tz_MP2_all", - "cbs_MP2_all", - "cc_CCSD(T)_all", - "cbs_CCSD(T)_all", - "nn_CCSD(T)_all", - "sapt_all", - "sapt_es", - "sapt_ex", - "sapt_exs2", - "sapt_ind", - "sapt_exind", - "sapt_disp", - "sapt_exdisp_os", - "sapt_exdisp_ss", - "sapt_delta_HF", - ] - - def read_raw_entries(self) -> List[Dict]: - self.filepath = os.path.join(self.root, "DESS66x8.csv") - logger.info(f"Reading DESS66x8 interaction data from {self.filepath}") - df = pd.read_csv(self.filepath) - data = [] - for idx, row in tqdm(df.iterrows(), total=df.shape[0]): - smiles0, smiles1 = row["smiles0"], row["smiles1"] - charge0, charge1 = row["charge0"], row["charge1"] - natoms0, natoms1 = row["natoms0"], row["natoms1"] - pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) - - elements = row["elements"].split() - - atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1) - - charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) - - atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) - - energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :] - - name = np.array([smiles0 + "." + smiles1]) - - subset = row["system_name"] - - item = dict( - energies=energies, - subset=np.array([subset]), - n_atoms=np.array([natoms0 + natoms1], dtype=np.int32), - n_atoms_first=np.array([natoms0], dtype=np.int32), - atomic_inputs=atomic_inputs, - name=name, - ) - data.append(item) - return data From f3d205ccca65f695bc4beb38d8d5755ebfce31b0 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Sat, 6 Apr 2024 13:34:21 -0400 Subject: [PATCH 17/27] removed redundant dataset files --- openqdc/datasets/interaction/des.py | 71 ++++++++ openqdc/datasets/interaction/des5m.py | 231 -------------------------- 2 files changed, 71 insertions(+), 231 deletions(-) delete mode 100644 openqdc/datasets/interaction/des5m.py diff --git a/openqdc/datasets/interaction/des.py b/openqdc/datasets/interaction/des.py index ee72923..710fa39 100644 --- a/openqdc/datasets/interaction/des.py +++ b/openqdc/datasets/interaction/des.py @@ -158,3 +158,74 @@ def read_raw_entries(cls) -> List[Dict]: item = convert_to_record(item) data.append(item) return data + + +class DES5M(DES370K): + """ + DE Shaw Research interaction energy calculations for + over 5M small molecule dimers as described in the paper: + + Quantum chemical benchmark databases of gold-standard dimer interaction energies. + Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. + Sci Data 8, 55 (2021). + https://doi.org/10.1038/s41597-021-00833-x + """ + + __name__ = "des5m_interaction" + __energy_methods__ = [ + InteractionMethod.MP2_CC_PVQZ, + InteractionMethod.MP2_CC_PVTZ, + InteractionMethod.MP2_CBS, + InteractionMethod.CCSD_T_NN, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + InteractionMethod.SAPT0_AUG_CC_PWCVXZ, + ] + + __energy_type__ = [ + InterEnergyType.TOTAL, + InterEnergyType.TOTAL, + InterEnergyType.TOTAL, + InterEnergyType.TOTAL, + InterEnergyType.TOTAL, + InterEnergyType.ES, + InterEnergyType.EX, + InterEnergyType.EX_S2, + InterEnergyType.IND, + InterEnergyType.EX_IND, + InterEnergyType.DISP, + InterEnergyType.EX_DISP_OS, + InterEnergyType.EX_DISP_SS, + InterEnergyType.DELTA_HF, + ] + + energy_target_names = [ + "qz_MP2_all", + "tz_MP2_all", + "cbs_MP2_all", + "nn_CCSD(T)_all", + "sapt_all", + "sapt_es", + "sapt_ex", + "sapt_exs2", + "sapt_ind", + "sapt_exind", + "sapt_disp", + "sapt_exdisp_os", + "sapt_exdisp_ss", + "sapt_delta_HF", + ] + + _filename = "DES5M.csv" + _name = "des5m_interaction" + + __energy_unit__ = "kcal/mol" + __distance_unit__ = "ang" + __forces_unit__ = "kcal/mol/ang" diff --git a/openqdc/datasets/interaction/des5m.py b/openqdc/datasets/interaction/des5m.py deleted file mode 100644 index 710fa39..0000000 --- a/openqdc/datasets/interaction/des5m.py +++ /dev/null @@ -1,231 +0,0 @@ -import os -from typing import Dict, List - -import numpy as np -import pandas as pd -from loguru import logger -from tqdm import tqdm - -from openqdc.datasets.interaction.base import BaseInteractionDataset -from openqdc.methods import InteractionMethod, InterEnergyType -from openqdc.utils.constants import ATOM_TABLE -from openqdc.utils.io import get_local_cache -from openqdc.utils.molecule import molecule_groups - - -def parse_des_df(row, energy_target_names): - smiles0, smiles1 = row["smiles0"], row["smiles1"] - charge0, charge1 = row["charge0"], row["charge1"] - natoms0, natoms1 = row["natoms0"], row["natoms1"] - pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3) - elements = row["elements"].split() - atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1) - charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) - atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) - energies = np.array(row[energy_target_names].values).astype(np.float32)[None, :] - name = np.array([smiles0 + "." + smiles1]) - return { - "energies": energies, - "n_atoms": np.array([natoms0 + natoms1], dtype=np.int32), - "name": name, - "atomic_inputs": atomic_inputs, - "charges": charges, - "atomic_nums": atomic_nums, - "elements": elements, - "natoms0": natoms0, - "natoms1": natoms1, - "smiles0": smiles0, - "smiles1": smiles1, - "charge0": charge0, - "charge1": charge1, - } - - -def create_subset(smiles0, smiles1): - subsets = [] - for smiles in [smiles0, smiles1]: - found = False - for functional_group, smiles_set in molecule_groups.items(): - if smiles in smiles_set: - subsets.append(functional_group) - found = True - if not found: - logger.info(f"molecule group lookup failed for {smiles}") - return subsets - - -def convert_to_record(item): - return dict( - energies=item["energies"], - subset=np.array([item["subsets"]]), - n_atoms=np.array([item["natoms0"] + item["natoms1"]], dtype=np.int32), - n_atoms_first=np.array([item["natoms0"]], dtype=np.int32), - atomic_inputs=item["atomic_inputs"], - name=item["name"], - ) - - -class DES370K(BaseInteractionDataset): - """ - DE Shaw Research interaction energy of over 370K - small molecule dimers as described in the paper: - - Quantum chemical benchmark databases of gold-standard dimer interaction energies. - Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. - Sci Data 8, 55 (2021). - https://doi.org/10.1038/s41597-021-00833-x - """ - - __name__ = "des370k_interaction" - __energy_unit__ = "kcal/mol" - __distance_unit__ = "ang" - __forces_unit__ = "kcal/mol/ang" - __energy_methods__ = [ - InteractionMethod.MP2_CC_PVDZ, - InteractionMethod.MP2_CC_PVQZ, - InteractionMethod.MP2_CC_PVTZ, - InteractionMethod.MP2_CBS, - InteractionMethod.CCSD_T_CC_PVDZ, - InteractionMethod.CCSD_T_CBS, - InteractionMethod.CCSD_T_NN, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - ] - - __energy_type__ = [ - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.ES, - InterEnergyType.EX, - InterEnergyType.EX_S2, - InterEnergyType.IND, - InterEnergyType.EX_IND, - InterEnergyType.DISP, - InterEnergyType.EX_DISP_OS, - InterEnergyType.EX_DISP_SS, - InterEnergyType.DELTA_HF, - ] - - energy_target_names = [ - "cc_MP2_all", - "qz_MP2_all", - "tz_MP2_all", - "cbs_MP2_all", - "cc_CCSD(T)_all", - "cbs_CCSD(T)_all", - "nn_CCSD(T)_all", - "sapt_all", - "sapt_es", - "sapt_ex", - "sapt_exs2", - "sapt_ind", - "sapt_exind", - "sapt_disp", - "sapt_exdisp_os", - "sapt_exdisp_ss", - "sapt_delta_HF", - ] - - _filename = "DES370K.csv" - _name = "des370k_interaction" - - @classmethod - def _root(cls): - return os.path.join(get_local_cache(), cls._name) - - def read_raw_entries(cls) -> List[Dict]: - filepath = os.path.join(cls._root(), cls._filename) - logger.info(f"Reading {cls._name} interaction data from {filepath}") - df = pd.read_csv(filepath) - data = [] - for idx, row in tqdm(df.iterrows(), total=df.shape[0]): - item = parse_des_df(row, cls.energy_target_names) - item["subset"] = create_subset(item["smiles0"], item["smiles1"]) - item = convert_to_record(item) - data.append(item) - return data - - -class DES5M(DES370K): - """ - DE Shaw Research interaction energy calculations for - over 5M small molecule dimers as described in the paper: - - Quantum chemical benchmark databases of gold-standard dimer interaction energies. - Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. - Sci Data 8, 55 (2021). - https://doi.org/10.1038/s41597-021-00833-x - """ - - __name__ = "des5m_interaction" - __energy_methods__ = [ - InteractionMethod.MP2_CC_PVQZ, - InteractionMethod.MP2_CC_PVTZ, - InteractionMethod.MP2_CBS, - InteractionMethod.CCSD_T_NN, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - ] - - __energy_type__ = [ - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.ES, - InterEnergyType.EX, - InterEnergyType.EX_S2, - InterEnergyType.IND, - InterEnergyType.EX_IND, - InterEnergyType.DISP, - InterEnergyType.EX_DISP_OS, - InterEnergyType.EX_DISP_SS, - InterEnergyType.DELTA_HF, - ] - - energy_target_names = [ - "qz_MP2_all", - "tz_MP2_all", - "cbs_MP2_all", - "nn_CCSD(T)_all", - "sapt_all", - "sapt_es", - "sapt_ex", - "sapt_exs2", - "sapt_ind", - "sapt_exind", - "sapt_disp", - "sapt_exdisp_os", - "sapt_exdisp_ss", - "sapt_delta_HF", - ] - - _filename = "DES5M.csv" - _name = "des5m_interaction" - - __energy_unit__ = "kcal/mol" - __distance_unit__ = "ang" - __forces_unit__ = "kcal/mol/ang" From da4fece39cb29df0a8c4ef6636e929f8a44dee49 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Sat, 6 Apr 2024 13:46:48 -0400 Subject: [PATCH 18/27] DES inerithance --- openqdc/datasets/interaction/__init__.py | 3 +- openqdc/datasets/interaction/des.py | 133 ++++++++++++----------- openqdc/datasets/interaction/dess66.py | 131 ---------------------- 3 files changed, 69 insertions(+), 198 deletions(-) delete mode 100644 openqdc/datasets/interaction/dess66.py diff --git a/openqdc/datasets/interaction/__init__.py b/openqdc/datasets/interaction/__init__.py index bf8c834..814a367 100644 --- a/openqdc/datasets/interaction/__init__.py +++ b/openqdc/datasets/interaction/__init__.py @@ -1,6 +1,5 @@ from .base import BaseInteractionDataset # noqa -from .des import DES5M, DES370K -from .dess66 import DESS66, DESS66x8 +from .des import DES5M, DES370K, DESS66, DESS66x8 from .L7 import L7 from .metcalf import Metcalf from .splinter import Splinter diff --git a/openqdc/datasets/interaction/des.py b/openqdc/datasets/interaction/des.py index 710fa39..6ca1bda 100644 --- a/openqdc/datasets/interaction/des.py +++ b/openqdc/datasets/interaction/des.py @@ -1,4 +1,5 @@ import os +from abc import ABC, abstractmethod from typing import Dict, List import numpy as np @@ -9,7 +10,6 @@ from openqdc.datasets.interaction.base import BaseInteractionDataset from openqdc.methods import InteractionMethod, InterEnergyType from openqdc.utils.constants import ATOM_TABLE -from openqdc.utils.io import get_local_cache from openqdc.utils.molecule import molecule_groups @@ -65,7 +65,13 @@ def convert_to_record(item): ) -class DES370K(BaseInteractionDataset): +class IDES(ABC): + @abstractmethod + def _create_subsets(self, **kwargs): + raise NotImplementedError + + +class DES370K(BaseInteractionDataset, IDES): """ DE Shaw Research interaction energy of over 370K small molecule dimers as described in the paper: @@ -77,6 +83,7 @@ class DES370K(BaseInteractionDataset): """ __name__ = "des370k_interaction" + __filename__ = "DES370K.csv" __energy_unit__ = "kcal/mol" __distance_unit__ = "ang" __forces_unit__ = "kcal/mol/ang" @@ -140,21 +147,21 @@ class DES370K(BaseInteractionDataset): "sapt_delta_HF", ] - _filename = "DES370K.csv" - _name = "des370k_interaction" + @property + def csv_path(self): + return os.path.join(self.root, self.__filename__) - @classmethod - def _root(cls): - return os.path.join(get_local_cache(), cls._name) + def _create_subsets(self, **kwargs): + return create_subset(kwargs["smiles0"], kwargs["smiles1"]) - def read_raw_entries(cls) -> List[Dict]: - filepath = os.path.join(cls._root(), cls._filename) - logger.info(f"Reading {cls._name} interaction data from {filepath}") + def read_raw_entries(self) -> List[Dict]: + filepath = self.csv_path + logger.info(f"Reading {self.__name__} interaction data from {filepath}") df = pd.read_csv(filepath) data = [] for idx, row in tqdm(df.iterrows(), total=df.shape[0]): - item = parse_des_df(row, cls.energy_target_names) - item["subset"] = create_subset(item["smiles0"], item["smiles1"]) + item = parse_des_df(row, self.energy_target_names) + item["subset"] = self._create_subset(**item) item = convert_to_record(item) data.append(item) return data @@ -172,60 +179,56 @@ class DES5M(DES370K): """ __name__ = "des5m_interaction" - __energy_methods__ = [ - InteractionMethod.MP2_CC_PVQZ, - InteractionMethod.MP2_CC_PVTZ, - InteractionMethod.MP2_CBS, - InteractionMethod.CCSD_T_NN, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - ] + __filename__ = "DES5M.csv" - __energy_type__ = [ - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.ES, - InterEnergyType.EX, - InterEnergyType.EX_S2, - InterEnergyType.IND, - InterEnergyType.EX_IND, - InterEnergyType.DISP, - InterEnergyType.EX_DISP_OS, - InterEnergyType.EX_DISP_SS, - InterEnergyType.DELTA_HF, - ] - energy_target_names = [ - "qz_MP2_all", - "tz_MP2_all", - "cbs_MP2_all", - "nn_CCSD(T)_all", - "sapt_all", - "sapt_es", - "sapt_ex", - "sapt_exs2", - "sapt_ind", - "sapt_exind", - "sapt_disp", - "sapt_exdisp_os", - "sapt_exdisp_ss", - "sapt_delta_HF", - ] +class DESS66(DES370K): + """ + DE Shaw Research interaction energy + estimates of all 66 conformers from + the original S66 dataset as described + in the paper: + + Quantum chemical benchmark databases of gold-standard dimer interaction energies. + Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. + Sci Data 8, 55 (2021). + https://doi.org/10.1038/s41597-021-00833-x - _filename = "DES5M.csv" - _name = "des5m_interaction" + Data was downloaded from Zenodo: + https://zenodo.org/records/5676284 + """ - __energy_unit__ = "kcal/mol" - __distance_unit__ = "ang" - __forces_unit__ = "kcal/mol/ang" + __name__ = "des_s66" + __filename__ = "DESS66.csv" + + # def read_raw_entries(self) -> List[Dict]: + # filepath = self.csv_path + # logger.info(f"Reading DESS66 interaction data from {filepath}") + # df = pd.read_csv(filepath) + # data = [] + # for idx, row in tqdm(df.iterrows(), total=df.shape[0]): + # item = parse_des_df(row) + # item["subset"] = row["system_name"] + # data.append(convert_to_record(item)) + # return data + + +class DESS66x8(DESS66): + """ + DE Shaw Research interaction energy + estimates of all 528 conformers from + the original S66x8 dataset as described + in the paper: + + Quantum chemical benchmark databases of gold-standard dimer interaction energies. + Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. + Sci Data 8, 55 (2021). + https://doi.org/10.1038/s41597-021-00833-x + + Data was downloaded from Zenodo: + + https://zenodo.org/records/5676284 + """ + + __name__ = "des_s66x8" + __filename__ = "DESS66x8.csv" diff --git a/openqdc/datasets/interaction/dess66.py b/openqdc/datasets/interaction/dess66.py deleted file mode 100644 index 45bf6bd..0000000 --- a/openqdc/datasets/interaction/dess66.py +++ /dev/null @@ -1,131 +0,0 @@ -import os -from typing import Dict, List - -import pandas as pd -from loguru import logger -from tqdm import tqdm - -from openqdc.datasets.interaction.base import BaseInteractionDataset -from openqdc.datasets.interaction.des370k import convert_to_record, parse_des_df -from openqdc.methods import InteractionMethod, InterEnergyType - -CSV_NAME = { - "des_s66": "DESS66.csv", - "des_s66x8": "DESS66x8.csv", -} - - -class DESS66(BaseInteractionDataset): - """ - DE Shaw Research interaction energy - estimates of all 66 conformers from - the original S66 dataset as described - in the paper: - - Quantum chemical benchmark databases of gold-standard dimer interaction energies. - Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. - Sci Data 8, 55 (2021). - https://doi.org/10.1038/s41597-021-00833-x - - Data was downloaded from Zenodo: - https://zenodo.org/records/5676284 - """ - - __name__ = "des_s66" - __energy_unit__ = "kcal/mol" - __distance_unit__ = "ang" - __forces_unit__ = "kcal/mol/ang" - __energy_methods__ = [ - InteractionMethod.MP2_CC_PVDZ, - InteractionMethod.MP2_CC_PVQZ, - InteractionMethod.MP2_CC_PVTZ, - InteractionMethod.MP2_CBS, - InteractionMethod.CCSD_T_CC_PVDZ, - InteractionMethod.CCSD_T_CBS, - InteractionMethod.CCSD_T_NN, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - InteractionMethod.SAPT0_AUG_CC_PWCVXZ, - ] - - __energy_type__ = [ - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.TOTAL, - InterEnergyType.ES, - InterEnergyType.EX, - InterEnergyType.EX_S2, - InterEnergyType.IND, - InterEnergyType.EX_IND, - InterEnergyType.DISP, - InterEnergyType.EX_DISP_OS, - InterEnergyType.EX_DISP_SS, - InterEnergyType.DELTA_HF, - ] - - energy_target_names = [ - "cc_MP2_all", - "qz_MP2_all", - "tz_MP2_all", - "cbs_MP2_all", - "cc_CCSD(T)_all", - "cbs_CCSD(T)_all", - "nn_CCSD(T)_all", - "sapt_all", - "sapt_es", - "sapt_ex", - "sapt_exs2", - "sapt_ind", - "sapt_exind", - "sapt_disp", - "sapt_exdisp_os", - "sapt_exdisp_ss", - "sapt_delta_HF", - ] - - @property - def csv_path(self): - return os.path.join(self.root, CSV_NAME[self.__name__]) - - def read_raw_entries(self) -> List[Dict]: - filepath = self.csv_path - logger.info(f"Reading DESS66 interaction data from {filepath}") - df = pd.read_csv(filepath) - data = [] - for idx, row in tqdm(df.iterrows(), total=df.shape[0]): - item = parse_des_df(row) - item["subset"] = row["system_name"] - data.append(convert_to_record(item)) - return data - - -class DESS66x8(DESS66): - """ - DE Shaw Research interaction energy - estimates of all 528 conformers from - the original S66x8 dataset as described - in the paper: - - Quantum chemical benchmark databases of gold-standard dimer interaction energies. - Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. - Sci Data 8, 55 (2021). - https://doi.org/10.1038/s41597-021-00833-x - - Data was downloaded from Zenodo: - - https://zenodo.org/records/5676284 - """ - - __name__ = "des_s66x8" From 71ff741a4fadabfc61e36f5d1cb534dd5d041f30 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Sat, 6 Apr 2024 15:56:39 -0400 Subject: [PATCH 19/27] Removed des and improved des naming --- openqdc/raws/config_factory.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/openqdc/raws/config_factory.py b/openqdc/raws/config_factory.py index 9f8c6c1..26e0f2c 100644 --- a/openqdc/raws/config_factory.py +++ b/openqdc/raws/config_factory.py @@ -269,22 +269,14 @@ class DataConfigFactory: }, ) - dess = dict( - dataset_name="dess5m", - links={ - "DESS5M.zip": "https://zenodo.org/record/5706002/files/DESS5M.zip", - "DESS370.zip": "https://zenodo.org/record/5676266/files/DES370K.zip", - }, - ) - - des370k_interaction = dict( + des370k = dict( dataset_name="des370k_interaction", links={ "DES370K.zip": "https://zenodo.org/record/5676266/files/DES370K.zip", }, ) - des5m_interaction = dict( + des5m = dict( dataset_name="des5m_interaction", links={ "DES5M.zip": "https://zenodo.org/records/5706002/files/DESS5M.zip?download=1", @@ -349,12 +341,12 @@ class DataConfigFactory: links={"Transition1x.h5": "https://figshare.com/ndownloader/files/36035789"}, ) - des_s66 = dict( + dess66 = dict( dataset_name="des_s66", links={"DESS66.zip": "https://zenodo.org/records/5676284/files/DESS66.zip?download=1"}, ) - des_s66x8 = dict( + dess66x8 = dict( dataset_name="des_s66x8", links={"DESS66x8.zip": "https://zenodo.org/records/5676284/files/DESS66x8.zip?download=1"}, ) From f6e12e13a6166bab47377cdc193866b0ee8c877d Mon Sep 17 00:00:00 2001 From: FNTwin Date: Sat, 6 Apr 2024 15:57:18 -0400 Subject: [PATCH 20/27] DES fixes --- openqdc/datasets/interaction/des.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openqdc/datasets/interaction/des.py b/openqdc/datasets/interaction/des.py index 6ca1bda..9dbb5d8 100644 --- a/openqdc/datasets/interaction/des.py +++ b/openqdc/datasets/interaction/des.py @@ -57,7 +57,7 @@ def create_subset(smiles0, smiles1): def convert_to_record(item): return dict( energies=item["energies"], - subset=np.array([item["subsets"]]), + subset=np.array([item["subset"]]), n_atoms=np.array([item["natoms0"] + item["natoms1"]], dtype=np.int32), n_atoms_first=np.array([item["natoms0"]], dtype=np.int32), atomic_inputs=item["atomic_inputs"], @@ -161,7 +161,7 @@ def read_raw_entries(self) -> List[Dict]: data = [] for idx, row in tqdm(df.iterrows(), total=df.shape[0]): item = parse_des_df(row, self.energy_target_names) - item["subset"] = self._create_subset(**item) + item["subset"] = self._create_subsets(**item) item = convert_to_record(item) data.append(item) return data From 3328a65593547765097936a3cad9383fb0770e4a Mon Sep 17 00:00:00 2001 From: FNTwin Date: Sat, 6 Apr 2024 15:58:41 -0400 Subject: [PATCH 21/27] Removed comments --- openqdc/datasets/interaction/des.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/openqdc/datasets/interaction/des.py b/openqdc/datasets/interaction/des.py index 9dbb5d8..7c542e2 100644 --- a/openqdc/datasets/interaction/des.py +++ b/openqdc/datasets/interaction/des.py @@ -201,17 +201,6 @@ class DESS66(DES370K): __name__ = "des_s66" __filename__ = "DESS66.csv" - # def read_raw_entries(self) -> List[Dict]: - # filepath = self.csv_path - # logger.info(f"Reading DESS66 interaction data from {filepath}") - # df = pd.read_csv(filepath) - # data = [] - # for idx, row in tqdm(df.iterrows(), total=df.shape[0]): - # item = parse_des_df(row) - # item["subset"] = row["system_name"] - # data.append(convert_to_record(item)) - # return data - class DESS66x8(DESS66): """ From 8b28d59e7f02bca523230203d327073fcd682720 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Sat, 6 Apr 2024 16:36:06 -0400 Subject: [PATCH 22/27] X40 and L70 --- openqdc/datasets/interaction/L7.py | 88 +++++++++++++++++++---------- openqdc/datasets/interaction/X40.py | 58 ++----------------- 2 files changed, 62 insertions(+), 84 deletions(-) diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py index d7c7361..a7434c2 100644 --- a/openqdc/datasets/interaction/L7.py +++ b/openqdc/datasets/interaction/L7.py @@ -1,6 +1,7 @@ import os from dataclasses import dataclass from functools import partial +from os.path import join as p_join from typing import Dict, List, Optional import numpy as np @@ -58,6 +59,51 @@ def constructor(loader: yaml.SafeLoader, node: yaml.nodes.MappingNode, cls): return loader +def read_xyz_file(xyz_path): + with open(xyz_path, "r") as xyz_file: # avoid not closing the file + lines = list(map(lambda x: x.strip().split(), xyz_file.readlines())) + lines.pop(1) + n_atoms = np.array([int(lines[0][0])], dtype=np.int32) + pos = np.array(lines[1:])[:, 1:].astype(np.float32) + elems = np.array(lines[1:])[:, 0] + atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1) + return n_atoms, pos, atomic_nums + + +def convert_to_record(item): + return dict( + energies=item["energies"], + subset=np.array([item["subset"]]), + n_atoms=np.array([item["natoms0"] + item["natoms1"]], dtype=np.int32), + n_atoms_first=np.array([item["natoms0"]], dtype=np.int32), + atomic_inputs=item["atomic_inputs"], + name=item["name"], + ) + + +def build_item(item, charge0, charge1, idx, data_dict, root, filename): + datum = { + "energies": [], + } + datum["name"] = np.array([item.shortname]) + datum["energies"].append(item.reference_value) + datum["subset"] = np.array([item.group]) + datum["energies"] += [float(val[idx]) for val in list(data_dict.alternative_reference.values())] + datum["energies"] = np.array([datum["energies"]], dtype=np.float32) + n_atoms, pos, atomic_nums = read_xyz_file(p_join(root, f"{filename}.xyz")) + datum["n_atoms"] = n_atoms + datum["pos"] = pos + datum["atomic_nums"] = atomic_nums + datum["n_atoms_first"] = np.array([int(item.setup["molecule_a"]["selection"].split("-")[1])], dtype=np.int32) + datum["natoms0"] = datum["n_atoms_first"][0] + datum["natoms1"] = datum["n_atoms"][0] - datum["natoms0"] + datum["charges"] = np.expand_dims(np.array([charge0] * datum["natoms0"] + [charge1] * datum["natoms1"]), axis=1) + datum["atomic_inputs"] = np.concatenate( + (datum["atomic_nums"], datum["charges"], datum["pos"]), axis=-1, dtype=np.float32 + ) + return datum + + class L7(BaseInteractionDataset): """ The L7 interaction energy dataset as described in: @@ -90,43 +136,25 @@ class L7(BaseInteractionDataset): energy_target_names = [] + @property + def yaml_path(self): + return os.path.join(self.root, self.__name__ + ".yaml") + def read_raw_entries(self) -> List[Dict]: - yaml_fpath = os.path.join(self.root, "l7.yaml") - logger.info(f"Reading L7 interaction data from {self.root}") + yaml_fpath = self.yaml_path + logger.info(f"Reading {self.__name__} interaction data from {self.root}") yaml_file = open(yaml_fpath, "r") data = [] data_dict = yaml.load(yaml_file, Loader=get_loader()) + charge0 = int(data_dict.description.global_setup["molecule_a"]["charge"]) charge1 = int(data_dict.description.global_setup["molecule_b"]["charge"]) for idx, item in enumerate(data_dict.items): - energies = [] - name = np.array([item.shortname]) - fname = item.geometry.split(":")[1] - energies.append(item.reference_value) - xyz_file = open(os.path.join(self.root, f"{fname}.xyz"), "r") - lines = list(map(lambda x: x.strip().split(), xyz_file.readlines())) - lines.pop(1) - n_atoms = np.array([int(lines[0][0])], dtype=np.int32) - n_atoms_first = np.array([int(item.setup["molecule_a"]["selection"].split("-")[1])], dtype=np.int32) - subset = np.array([item.group]) - energies += [float(val[idx]) for val in list(data_dict.alternative_reference.values())] - energies = np.array([energies], dtype=np.float32) - pos = np.array(lines[1:])[:, 1:].astype(np.float32) - elems = np.array(lines[1:])[:, 0] - atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1) - natoms0 = n_atoms_first[0] - natoms1 = n_atoms[0] - natoms0 - charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) - atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) - - item = dict( - energies=energies, - subset=subset, - n_atoms=n_atoms, - n_atoms_first=n_atoms_first, - atomic_inputs=atomic_inputs, - name=name, - ) + tmp_item = build_item(item, charge0, charge1, idx, data_dict, self.root, self._process_name(item)) + item = convert_to_record(tmp_item) data.append(item) return data + + def _process_name(self, item): + return item.geometry.split(":")[1] diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py index dfb43d0..a42d36c 100644 --- a/openqdc/datasets/interaction/X40.py +++ b/openqdc/datasets/interaction/X40.py @@ -1,17 +1,8 @@ -import os -from typing import Dict, List - -import numpy as np -import yaml -from loguru import logger - -from openqdc.datasets.interaction.base import BaseInteractionDataset -from openqdc.datasets.interaction.L7 import get_loader +from openqdc.datasets.interaction.L7 import L7 from openqdc.methods import InteractionMethod, InterEnergyType -from openqdc.utils.constants import ATOM_TABLE -class X40(BaseInteractionDataset): +class X40(L7): """ X40 interaction dataset of 40 dimer pairs as introduced in the following paper: @@ -26,9 +17,6 @@ class X40(BaseInteractionDataset): """ __name__ = "x40" - __energy_unit__ = "hartree" - __distance_unit__ = "ang" - __forces_unit__ = "hartree/ang" __energy_methods__ = [ InteractionMethod.CCSD_T_CBS, # "CCSD(T)/CBS", InteractionMethod.MP2_CBS, # "MP2/CBS", @@ -42,43 +30,5 @@ class X40(BaseInteractionDataset): energy_target_names = [] - def read_raw_entries(self) -> List[Dict]: - yaml_fpath = os.path.join(self.root, "x40.yaml") - logger.info(f"Reading X40 interaction data from {self.root}") - yaml_file = open(yaml_fpath, "r") - data = [] - data_dict = yaml.load(yaml_file, Loader=get_loader()) - charge0 = int(data_dict.description.global_setup["molecule_a"]["charge"]) - charge1 = int(data_dict.description.global_setup["molecule_b"]["charge"]) - - for idx, item in enumerate(data_dict.items): - energies = [] - name = np.array([item.shortname]) - energies.append(float(item.reference_value)) - xyz_file = open(os.path.join(self.root, f"{item.shortname}.xyz"), "r") - lines = list(map(lambda x: x.strip().split(), xyz_file.readlines())) - setup = lines.pop(1) - n_atoms = np.array([int(lines[0][0])], dtype=np.int32) - n_atoms_first = setup[0].split("-")[1] - n_atoms_first = np.array([int(n_atoms_first)], dtype=np.int32) - subset = np.array([item.group]) - energies += [float(val[idx]) for val in list(data_dict.alternative_reference.values())] - energies = np.array([energies], dtype=np.float32) - pos = np.array(lines[1:])[:, 1:].astype(np.float32) - elems = np.array(lines[1:])[:, 0] - atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1) - natoms0 = n_atoms_first[0] - natoms1 = n_atoms[0] - natoms0 - charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) - atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) - - item = dict( - energies=energies, - subset=subset, - n_atoms=n_atoms, - n_atoms_first=n_atoms_first, - atomic_inputs=atomic_inputs, - name=name, - ) - data.append(item) - return data + def _process_name(self, item): + return item.shortname From 8595fd888df1cde542b637902e51b19e7503c3d5 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Sat, 6 Apr 2024 16:40:04 -0400 Subject: [PATCH 23/27] Safe opening --- openqdc/datasets/interaction/L7.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/L7.py index a7434c2..fc354e5 100644 --- a/openqdc/datasets/interaction/L7.py +++ b/openqdc/datasets/interaction/L7.py @@ -143,10 +143,9 @@ def yaml_path(self): def read_raw_entries(self) -> List[Dict]: yaml_fpath = self.yaml_path logger.info(f"Reading {self.__name__} interaction data from {self.root}") - yaml_file = open(yaml_fpath, "r") + with open(yaml_fpath, "r") as yaml_file: + data_dict = yaml.load(yaml_file, Loader=get_loader()) data = [] - data_dict = yaml.load(yaml_file, Loader=get_loader()) - charge0 = int(data_dict.description.global_setup["molecule_a"]["charge"]) charge1 = int(data_dict.description.global_setup["molecule_b"]["charge"]) From ca1b4aff6bd60fe72f36bdcdf6b7a7bd17f83a90 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Sat, 6 Apr 2024 16:43:44 -0400 Subject: [PATCH 24/27] Moved X40 in L7 and removed x40.py --- openqdc/datasets/interaction/X40.py | 34 ------------------- openqdc/datasets/interaction/__init__.py | 3 +- .../datasets/interaction/{L7.py => l7x40.py} | 32 +++++++++++++++++ 3 files changed, 33 insertions(+), 36 deletions(-) delete mode 100644 openqdc/datasets/interaction/X40.py rename openqdc/datasets/interaction/{L7.py => l7x40.py} (85%) diff --git a/openqdc/datasets/interaction/X40.py b/openqdc/datasets/interaction/X40.py deleted file mode 100644 index a42d36c..0000000 --- a/openqdc/datasets/interaction/X40.py +++ /dev/null @@ -1,34 +0,0 @@ -from openqdc.datasets.interaction.L7 import L7 -from openqdc.methods import InteractionMethod, InterEnergyType - - -class X40(L7): - """ - X40 interaction dataset of 40 dimer pairs as - introduced in the following paper: - - Benchmark Calculations of Noncovalent Interactions of Halogenated Molecules - Jan Řezáč, Kevin E. Riley, and Pavel Hobza - Journal of Chemical Theory and Computation 2012 8 (11), 4285-4292 - DOI: 10.1021/ct300647k - - Dataset retrieved and processed from: - http://cuby4.molecular.cz/dataset_x40.html - """ - - __name__ = "x40" - __energy_methods__ = [ - InteractionMethod.CCSD_T_CBS, # "CCSD(T)/CBS", - InteractionMethod.MP2_CBS, # "MP2/CBS", - InteractionMethod.DCCSDT_HA_DZ, # "dCCSD(T)/haDZ", - InteractionMethod.DCCSDT_HA_TZ, # "dCCSD(T)/haTZ", - InteractionMethod.MP2_5_CBS_ADZ, # "MP2.5/CBS(aDZ)", - ] - __energy_type__ = [ - InterEnergyType.TOTAL, - ] * 5 - - energy_target_names = [] - - def _process_name(self, item): - return item.shortname diff --git a/openqdc/datasets/interaction/__init__.py b/openqdc/datasets/interaction/__init__.py index 814a367..eca842d 100644 --- a/openqdc/datasets/interaction/__init__.py +++ b/openqdc/datasets/interaction/__init__.py @@ -1,9 +1,8 @@ from .base import BaseInteractionDataset # noqa from .des import DES5M, DES370K, DESS66, DESS66x8 -from .L7 import L7 +from .l7x40 import L7, X40 from .metcalf import Metcalf from .splinter import Splinter -from .X40 import X40 AVAILABLE_INTERACTION_DATASETS = { "des5m": DES5M, diff --git a/openqdc/datasets/interaction/L7.py b/openqdc/datasets/interaction/l7x40.py similarity index 85% rename from openqdc/datasets/interaction/L7.py rename to openqdc/datasets/interaction/l7x40.py index fc354e5..12b5316 100644 --- a/openqdc/datasets/interaction/L7.py +++ b/openqdc/datasets/interaction/l7x40.py @@ -157,3 +157,35 @@ def read_raw_entries(self) -> List[Dict]: def _process_name(self, item): return item.geometry.split(":")[1] + + +class X40(L7): + """ + X40 interaction dataset of 40 dimer pairs as + introduced in the following paper: + + Benchmark Calculations of Noncovalent Interactions of Halogenated Molecules + Jan Řezáč, Kevin E. Riley, and Pavel Hobza + Journal of Chemical Theory and Computation 2012 8 (11), 4285-4292 + DOI: 10.1021/ct300647k + + Dataset retrieved and processed from: + http://cuby4.molecular.cz/dataset_x40.html + """ + + __name__ = "x40" + __energy_methods__ = [ + InteractionMethod.CCSD_T_CBS, # "CCSD(T)/CBS", + InteractionMethod.MP2_CBS, # "MP2/CBS", + InteractionMethod.DCCSDT_HA_DZ, # "dCCSD(T)/haDZ", + InteractionMethod.DCCSDT_HA_TZ, # "dCCSD(T)/haTZ", + InteractionMethod.MP2_5_CBS_ADZ, # "MP2.5/CBS(aDZ)", + ] + __energy_type__ = [ + InterEnergyType.TOTAL, + ] * 5 + + energy_target_names = [] + + def _process_name(self, item): + return item.shortname From 4bec82de940e11f35516ca6462c1bb570ca857d5 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Sat, 6 Apr 2024 20:47:24 -0400 Subject: [PATCH 25/27] Moved Yaml utils to _utils.py, L7 + X40 interface --- .../interaction/{l7x40.py => _utils.py} | 67 ++----------------- openqdc/datasets/interaction/l7.py | 32 +++++++++ openqdc/datasets/interaction/x40.py | 29 ++++++++ 3 files changed, 68 insertions(+), 60 deletions(-) rename openqdc/datasets/interaction/{l7x40.py => _utils.py} (67%) create mode 100644 openqdc/datasets/interaction/l7.py create mode 100644 openqdc/datasets/interaction/x40.py diff --git a/openqdc/datasets/interaction/l7x40.py b/openqdc/datasets/interaction/_utils.py similarity index 67% rename from openqdc/datasets/interaction/l7x40.py rename to openqdc/datasets/interaction/_utils.py index 12b5316..3df948e 100644 --- a/openqdc/datasets/interaction/l7x40.py +++ b/openqdc/datasets/interaction/_utils.py @@ -1,4 +1,5 @@ import os +from abc import ABC, abstractmethod from dataclasses import dataclass from functools import partial from os.path import join as p_join @@ -9,7 +10,7 @@ from loguru import logger from openqdc.datasets.interaction.base import BaseInteractionDataset -from openqdc.methods import InteractionMethod, InterEnergyType +from openqdc.methods import InterEnergyType from openqdc.utils.constants import ATOM_TABLE @@ -104,37 +105,14 @@ def build_item(item, charge0, charge1, idx, data_dict, root, filename): return datum -class L7(BaseInteractionDataset): - """ - The L7 interaction energy dataset as described in: - - Accuracy of Quantum Chemical Methods for Large Noncovalent Complexes - Robert Sedlak, Tomasz Janowski, Michal Pitoňák, Jan Řezáč, Peter Pulay, and Pavel Hobza - Journal of Chemical Theory and Computation 2013 9 (8), 3364-3374 - DOI: 10.1021/ct400036b - - Data was downloaded and extracted from: - http://cuby4.molecular.cz/dataset_l7.html - """ - +class YamlDataset(BaseInteractionDataset, ABC): __name__ = "l7" __energy_unit__ = "kcal/mol" __distance_unit__ = "ang" __forces_unit__ = "kcal/mol/ang" - __energy_methods__ = [ - InteractionMethod.QCISDT_CBS, # "QCISD(T)/CBS", - InteractionMethod.DLPNO_CCSDT, # "DLPNO-CCSD(T)", - InteractionMethod.MP2_CBS, # "MP2/CBS", - InteractionMethod.MP2C_CBS, # "MP2C/CBS", - InteractionMethod.FIXED, # "fixed", TODO: we should remove this level of theory because unless we have a pro - InteractionMethod.DLPNO_CCSDT0, # "DLPNO-CCSD(T0)", - InteractionMethod.LNO_CCSDT, # "LNO-CCSD(T)", - InteractionMethod.FN_DMC, # "FN-DMC", - ] - - __energy_type__ = [InterEnergyType.TOTAL] * 8 - energy_target_names = [] + __energy_methods__ = [] + __energy_type__ = [InterEnergyType.TOTAL] * len(__energy_methods__) @property def yaml_path(self): @@ -155,37 +133,6 @@ def read_raw_entries(self) -> List[Dict]: data.append(item) return data + @abstractmethod def _process_name(self, item): - return item.geometry.split(":")[1] - - -class X40(L7): - """ - X40 interaction dataset of 40 dimer pairs as - introduced in the following paper: - - Benchmark Calculations of Noncovalent Interactions of Halogenated Molecules - Jan Řezáč, Kevin E. Riley, and Pavel Hobza - Journal of Chemical Theory and Computation 2012 8 (11), 4285-4292 - DOI: 10.1021/ct300647k - - Dataset retrieved and processed from: - http://cuby4.molecular.cz/dataset_x40.html - """ - - __name__ = "x40" - __energy_methods__ = [ - InteractionMethod.CCSD_T_CBS, # "CCSD(T)/CBS", - InteractionMethod.MP2_CBS, # "MP2/CBS", - InteractionMethod.DCCSDT_HA_DZ, # "dCCSD(T)/haDZ", - InteractionMethod.DCCSDT_HA_TZ, # "dCCSD(T)/haTZ", - InteractionMethod.MP2_5_CBS_ADZ, # "MP2.5/CBS(aDZ)", - ] - __energy_type__ = [ - InterEnergyType.TOTAL, - ] * 5 - - energy_target_names = [] - - def _process_name(self, item): - return item.shortname + raise NotImplementedError diff --git a/openqdc/datasets/interaction/l7.py b/openqdc/datasets/interaction/l7.py new file mode 100644 index 0000000..22e3141 --- /dev/null +++ b/openqdc/datasets/interaction/l7.py @@ -0,0 +1,32 @@ +from openqdc.methods import InteractionMethod + +from ._utils import YamlDataset + + +class L7(YamlDataset): + """ + The L7 interaction energy dataset as described in: + + Accuracy of Quantum Chemical Methods for Large Noncovalent Complexes + Robert Sedlak, Tomasz Janowski, Michal Pitoňák, Jan Řezáč, Peter Pulay, and Pavel Hobza + Journal of Chemical Theory and Computation 2013 9 (8), 3364-3374 + DOI: 10.1021/ct400036b + + Data was downloaded and extracted from: + http://cuby4.molecular.cz/dataset_l7.html + """ + + __name__ = "l7" + __energy_methods__ = [ + InteractionMethod.QCISDT_CBS, # "QCISD(T)/CBS", + InteractionMethod.DLPNO_CCSDT, # "DLPNO-CCSD(T)", + InteractionMethod.MP2_CBS, # "MP2/CBS", + InteractionMethod.MP2C_CBS, # "MP2C/CBS", + InteractionMethod.FIXED, # "fixed", TODO: we should remove this level of theory because unless we have a pro + InteractionMethod.DLPNO_CCSDT0, # "DLPNO-CCSD(T0)", + InteractionMethod.LNO_CCSDT, # "LNO-CCSD(T)", + InteractionMethod.FN_DMC, # "FN-DMC", + ] + + def _process_name(self, item): + return item.geometry.split(":")[1] diff --git a/openqdc/datasets/interaction/x40.py b/openqdc/datasets/interaction/x40.py new file mode 100644 index 0000000..1b5148c --- /dev/null +++ b/openqdc/datasets/interaction/x40.py @@ -0,0 +1,29 @@ +from openqdc.datasets.interaction._utils import YamlDataset +from openqdc.methods import InteractionMethod + + +class X40(YamlDataset): + """ + X40 interaction dataset of 40 dimer pairs as + introduced in the following paper: + + Benchmark Calculations of Noncovalent Interactions of Halogenated Molecules + Jan Řezáč, Kevin E. Riley, and Pavel Hobza + Journal of Chemical Theory and Computation 2012 8 (11), 4285-4292 + DOI: 10.1021/ct300647k + + Dataset retrieved and processed from: + http://cuby4.molecular.cz/dataset_x40.html + """ + + __name__ = "x40" + __energy_methods__ = [ + InteractionMethod.CCSD_T_CBS, # "CCSD(T)/CBS", + InteractionMethod.MP2_CBS, # "MP2/CBS", + InteractionMethod.DCCSDT_HA_DZ, # "dCCSD(T)/haDZ", + InteractionMethod.DCCSDT_HA_TZ, # "dCCSD(T)/haTZ", + InteractionMethod.MP2_5_CBS_ADZ, # "MP2.5/CBS(aDZ)", + ] + + def _process_name(self, item): + return item.shortname From 3303f95b9e6e5b4a10edc649dc7099cdabd6111b Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Fri, 12 Apr 2024 01:04:37 +0000 Subject: [PATCH 26/27] better convert function and n_body_first to ptr --- openqdc/datasets/base.py | 5 +++++ openqdc/datasets/interaction/_utils.py | 6 +++--- openqdc/datasets/interaction/base.py | 8 ++++---- openqdc/datasets/interaction/des.py | 2 +- openqdc/datasets/interaction/dummy.py | 4 ++-- openqdc/datasets/interaction/metcalf.py | 2 +- openqdc/datasets/interaction/splinter.py | 6 +++--- openqdc/datasets/potential/dummy.py | 4 ++-- 8 files changed, 21 insertions(+), 16 deletions(-) diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py index b5bc43b..fabdfc1 100644 --- a/openqdc/datasets/base.py +++ b/openqdc/datasets/base.py @@ -49,11 +49,15 @@ @requires_package("torch") def to_torch(x: np.ndarray): + if isinstance(x, torch.Tensor): + return x return torch.from_numpy(x) @requires_package("jax") def to_jax(x: np.ndarray): + if isinstance(x, jnp.ndarray): + return x return jnp.array(x) @@ -166,6 +170,7 @@ def _precompute_statistics(self, overwrite_local_cache: bool = False): PerAtomFormationEnergyStats, ) self.statistics.run_calculators() # run the calculators + self._compute_average_nb_atoms() @classmethod def no_init(cls): diff --git a/openqdc/datasets/interaction/_utils.py b/openqdc/datasets/interaction/_utils.py index 3df948e..0d2915b 100644 --- a/openqdc/datasets/interaction/_utils.py +++ b/openqdc/datasets/interaction/_utils.py @@ -76,7 +76,7 @@ def convert_to_record(item): energies=item["energies"], subset=np.array([item["subset"]]), n_atoms=np.array([item["natoms0"] + item["natoms1"]], dtype=np.int32), - n_atoms_first=np.array([item["natoms0"]], dtype=np.int32), + n_atoms_ptr=np.array([item["natoms0"]], dtype=np.int32), atomic_inputs=item["atomic_inputs"], name=item["name"], ) @@ -95,8 +95,8 @@ def build_item(item, charge0, charge1, idx, data_dict, root, filename): datum["n_atoms"] = n_atoms datum["pos"] = pos datum["atomic_nums"] = atomic_nums - datum["n_atoms_first"] = np.array([int(item.setup["molecule_a"]["selection"].split("-")[1])], dtype=np.int32) - datum["natoms0"] = datum["n_atoms_first"][0] + datum["n_atoms_ptr"] = np.array([int(item.setup["molecule_a"]["selection"].split("-")[1])], dtype=np.int32) + datum["natoms0"] = datum["n_atoms_ptr"][0] datum["natoms1"] = datum["n_atoms"][0] - datum["natoms0"] datum["charges"] = np.expand_dims(np.array([charge0] * datum["natoms0"] + [charge1] * datum["natoms1"]), axis=1) datum["atomic_inputs"] = np.concatenate( diff --git a/openqdc/datasets/interaction/base.py b/openqdc/datasets/interaction/base.py index 96f39c1..2ce5481 100644 --- a/openqdc/datasets/interaction/base.py +++ b/openqdc/datasets/interaction/base.py @@ -20,7 +20,7 @@ def pkl_data_types(self): "name": str, "subset": str, "n_atoms": np.int32, - "n_atoms_first": np.int32, + "n_atoms_ptr": np.int32, } def __getitem__(self, idx: int): @@ -35,7 +35,7 @@ def __getitem__(self, idx: int): ) name = self.__smiles_converter__(self.data["name"][idx]) subset = self.data["subset"][idx] - n_atoms_first = self.data["n_atoms_first"][idx] + n_atoms_ptr = self.data["n_atoms_ptr"][idx] forces = None if "forces" in self.data: @@ -52,7 +52,7 @@ def __getitem__(self, idx: int): name=name, subset=subset, forces=forces, - n_atoms_first=n_atoms_first, + n_atoms_ptr=n_atoms_ptr, ) if self.transform is not None: @@ -63,7 +63,7 @@ def __getitem__(self, idx: int): def get_ase_atoms(self, idx: int): entry = self[idx] at = to_atoms(entry["positions"], entry["atomic_numbers"]) - at.info["n_atoms"] = entry["n_atoms_first"] + at.info["n_atoms"] = entry["n_atoms_ptr"] return at def save_xyz(self, idx: int, path: Optional[str] = None): diff --git a/openqdc/datasets/interaction/des.py b/openqdc/datasets/interaction/des.py index 7c542e2..a292fc3 100644 --- a/openqdc/datasets/interaction/des.py +++ b/openqdc/datasets/interaction/des.py @@ -59,7 +59,7 @@ def convert_to_record(item): energies=item["energies"], subset=np.array([item["subset"]]), n_atoms=np.array([item["natoms0"] + item["natoms1"]], dtype=np.int32), - n_atoms_first=np.array([item["natoms0"]], dtype=np.int32), + n_atoms_ptr=np.array([item["natoms0"]], dtype=np.int32), atomic_inputs=item["atomic_inputs"], name=item["name"], ) diff --git a/openqdc/datasets/interaction/dummy.py b/openqdc/datasets/interaction/dummy.py index 4dcb8a3..7f19154 100644 --- a/openqdc/datasets/interaction/dummy.py +++ b/openqdc/datasets/interaction/dummy.py @@ -27,7 +27,7 @@ def _post_init(self, overwrite_local_cache, energy_unit, distance_unit) -> None: def setup_dummy(self): n_atoms = np.array([np.random.randint(10, 30) for _ in range(len(self))]) - n_atoms_first = np.array([np.random.randint(1, 10) for _ in range(len(self))]) + n_atoms_ptr = np.array([np.random.randint(1, 10) for _ in range(len(self))]) position_idx_range = np.concatenate([[0], np.cumsum(n_atoms)]).repeat(2)[1:-1].reshape(-1, 2) atomic_inputs = np.concatenate( [ @@ -54,7 +54,7 @@ def setup_dummy(self): atomic_inputs=atomic_inputs, subset=subset, energies=energies, - n_atoms_first=n_atoms_first, + n_atoms_ptr=n_atoms_ptr, ) self.__average_nb_atoms__ = self.data["n_atoms"].mean() diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py index 99da5b0..60298c4 100644 --- a/openqdc/datasets/interaction/metcalf.py +++ b/openqdc/datasets/interaction/metcalf.py @@ -69,7 +69,7 @@ def content_to_xyz(content, subset): energies=e, atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32), name=np.array([name]), - n_atoms_first=np.array([-1]), + n_atoms_ptr=np.array([-1]), ) return item diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py index 60cb503..72e808a 100644 --- a/openqdc/datasets/interaction/splinter.py +++ b/openqdc/datasets/interaction/splinter.py @@ -136,13 +136,13 @@ def read_raw_entries(self) -> List[Dict]: ) = metadata[0].split("_") r, theta_P, tau_P, theta_L, tau_L, tau_PL = [-1] * 6 energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32) - n_atoms_first = np.array([int(metadata[-1])], dtype=np.int32) + n_atoms_ptr = np.array([int(metadata[-1])], dtype=np.int32) total_charge, charge0, charge1 = list(map(int, metadata[1:4])) lines = list(map(lambda x: x.split(), lines[2:])) pos = np.array(lines)[:, 1:].astype(np.float32) elems = np.array(lines)[:, 0] atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1) - natoms0 = n_atoms_first[0] + natoms0 = n_atoms_ptr[0] natoms1 = n_atoms[0] - natoms0 charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1) atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32) @@ -152,7 +152,7 @@ def read_raw_entries(self) -> List[Dict]: energies=energies, subset=subset, n_atoms=n_atoms, - n_atoms_first=n_atoms_first, + n_atoms_ptr=n_atoms_ptr, atomic_inputs=atomic_inputs, protein_monomer_name=np.array([protein_monomer_name]), protein_interaction_site_type=np.array([protein_interaction_site_type]), diff --git a/openqdc/datasets/potential/dummy.py b/openqdc/datasets/potential/dummy.py index 1c7a61c..b485d40 100644 --- a/openqdc/datasets/potential/dummy.py +++ b/openqdc/datasets/potential/dummy.py @@ -14,7 +14,7 @@ class Dummy(BaseDataset): """ __name__ = "dummy" - __energy_methods__ = [PotentialMethod.SVWN_DEF2_TZVP, PotentialMethod.PM6, PotentialMethod.GFN2_XTB] + __energy_methods__ = [PotentialMethod.GFN2_XTB, PotentialMethod.WB97X_D_DEF2_SVP, PotentialMethod.GFN2_XTB] __force_mask__ = [False, True, True] __energy_unit__ = "kcal/mol" __distance_unit__ = "ang" @@ -31,7 +31,7 @@ def _post_init(self, overwrite_local_cache, energy_unit, distance_unit) -> None: return super()._post_init(overwrite_local_cache, energy_unit, distance_unit) def setup_dummy(self): - n_atoms = np.array([np.random.randint(1, 100) for _ in range(len(self))]) + n_atoms = np.array([np.random.randint(2, 100) for _ in range(len(self))]) position_idx_range = np.concatenate([[0], np.cumsum(n_atoms)]).repeat(2)[1:-1].reshape(-1, 2) atomic_inputs = np.concatenate( [ From 6f033cf579b48af74642e3ab7215a78dcfd9469e Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Mon, 15 Apr 2024 16:43:36 +0000 Subject: [PATCH 27/27] Updated splinter reading from -1 to nan --- openqdc/datasets/interaction/splinter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py index 72e808a..a793624 100644 --- a/openqdc/datasets/interaction/splinter.py +++ b/openqdc/datasets/interaction/splinter.py @@ -134,7 +134,7 @@ def read_raw_entries(self) -> List[Dict]: index, _, ) = metadata[0].split("_") - r, theta_P, tau_P, theta_L, tau_L, tau_PL = [-1] * 6 + r, theta_P, tau_P, theta_L, tau_L, tau_PL = [np.nan] * 6 energies = np.array([list(map(float, metadata[4:-1]))]).astype(np.float32) n_atoms_ptr = np.array([int(metadata[-1])], dtype=np.int32) total_charge, charge0, charge1 = list(map(int, metadata[1:4]))