diff --git a/src/openqdc/__init__.py b/src/openqdc/__init__.py index 1432923..e77bc9e 100644 --- a/src/openqdc/__init__.py +++ b/src/openqdc/__init__.py @@ -9,7 +9,7 @@ _lazy_imports_obj = {} -_lazy_imports_mod = {"datasets": "openqdc.datamodule", "utils": "openqdc.utils"} +_lazy_imports_mod = {"datasets": "openqdc.datasets", "utils": "openqdc.utils"} def __getattr__(name): diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py index 913fb8a..3f1b92b 100644 --- a/src/openqdc/datasets/ani.py +++ b/src/openqdc/datasets/ani.py @@ -39,6 +39,12 @@ class ANI1(BaseDataset): def root(self): return p_join(get_local_cache(), "ani") + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return "-".join(x.decode("ascii").split("-")[:-1]) + @property def preprocess_path(self): path = p_join(self.root, "preprocessed", self.__name__) @@ -89,6 +95,12 @@ class ANI1CCX(ANI1): __force_methods__ = [] force_target_names = [] + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return x + class ANI1X(ANI1): """ @@ -145,3 +157,9 @@ class ANI1X(ANI1): def convert_forces(self, x): return super().convert_forces(x) * 0.529177249 # correct the Dataset error + + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return x diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 1de6ff1..995b297 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -1,4 +1,5 @@ import os +import pickle as pkl from os.path import join as p_join from typing import Dict, List, Optional, Union @@ -24,7 +25,7 @@ push_remote, set_cache_dir, ) -from openqdc.utils.molecule import atom_table +from openqdc.utils.molecule import atom_table, z_to_formula from openqdc.utils.package_utils import requires_package from openqdc.utils.units import get_conversion @@ -43,7 +44,7 @@ def extract_entry( res = dict( name=np.array([df["name"][i]]), - subset=np.array([subset]), + subset=np.array([subset if subset is not None else z_to_formula(x)]), energies=energies.reshape((1, -1)).astype(np.float32), atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32), n_atoms=np.array([x.shape[0]], dtype=np.int32), @@ -64,8 +65,8 @@ def read_qc_archive_h5( ) -> List[Dict[str, np.ndarray]]: data = load_hdf5_file(raw_path) data_t = {k2: data[k1][k2][:] for k1 in data.keys() for k2 in data[k1].keys()} - n = len(data_t["molecule_id"]) + n = len(data_t["molecule_id"]) samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))] return samples @@ -96,9 +97,6 @@ def __init__( self._set_units(energy_unit, distance_unit) if not self.is_preprocessed(): logger.info("This dataset not available. Please open an issue on Github for the team to look into it.") - # entries = self.read_raw_entries() - # res = self.collate_list(entries) - # self.save_preprocess(res) else: self.read_preprocess(overwrite_local_cache=overwrite_local_cache) self._set_isolated_atom_energies() @@ -107,12 +105,12 @@ def __init__( def numbers(self): if hasattr(self, "_numbers"): return self._numbers - self._numbers = np.array(list(set(self.data["atomic_inputs"][..., 0])), dtype=np.int32) + self._numbers = np.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32) return self._numbers @property def chemical_species(self): - return [chemical_symbols[z] for z in self.numbers] + return np.array(chemical_symbols)[self.numbers] @property def energy_unit(self): @@ -211,10 +209,11 @@ def collate_list(self, list_entries): # concatenate entries res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]} - csum = np.cumsum(res.pop("n_atoms")) + csum = np.cumsum(res.get("n_atoms")) x = np.zeros((csum.shape[0], 2), dtype=np.int32) x[1:, 0], x[:, 1] = csum[:-1], csum res["position_idx_range"] = x + return res def save_preprocess(self, data_dict): @@ -228,12 +227,13 @@ def save_preprocess(self, data_dict): push_remote(local_path, overwrite=True) # save smiles and subset + local_path = p_join(self.preprocess_path, "props.pkl") for key in ["name", "subset"]: - local_path = p_join(self.preprocess_path, f"{key}.npz") - uniques, inv_indices = np.unique(data_dict[key], return_inverse=True) - with open(local_path, "wb") as f: - np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices) - push_remote(local_path) + data_dict[key] = np.unique(data_dict[key], return_inverse=True) + + with open(local_path, "wb") as f: + pkl.dump(data_dict, f) + push_remote(local_path, overwrite=True) def read_preprocess(self, overwrite_local_cache=False): logger.info("Reading preprocessed data") @@ -247,32 +247,29 @@ def read_preprocess(self, overwrite_local_cache=False): for key in self.data_keys: filename = p_join(self.preprocess_path, f"{key}.mmap") pull_locally(filename, overwrite=overwrite_local_cache) - self.data[key] = np.memmap( - filename, - mode="r", - dtype=self.data_types[key], - ).reshape(self.data_shapes[key]) + self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(self.data_shapes[key]) + + filename = p_join(self.preprocess_path, "props.pkl") + pull_locally(filename, overwrite=overwrite_local_cache) + with open(filename, "rb") as f: + tmp = pkl.load(f) + for key in ["name", "subset", "n_atoms"]: + x = tmp.pop(key) + if len(x) == 2: + self.data[key] = x[0][x[1]] + else: + self.data[key] = x for key in self.data: print(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}") - for key in ["name", "subset"]: - filename = p_join(self.preprocess_path, f"{key}.npz") - pull_locally(filename) - self.data[key] = dict() - with open(filename, "rb") as f: - tmp = np.load(f) - for k in tmp: - self.data[key][k] = tmp[k] - print(f"Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}") - def is_preprocessed(self): predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys] - predicats += [copy_exists(p_join(self.preprocess_path, f"{x}.npz")) for x in ["name", "subset"]] + predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl"))] return all(predicats) - def preprocess(self): - if not self.is_preprocessed(): + def preprocess(self, overwrite=False): + if overwrite or not self.is_preprocessed(): entries = self.read_raw_entries() res = self.collate_list(entries) self.save_preprocess(res) @@ -305,7 +302,7 @@ def get_ase_atoms(self, idx: int, ext=True): @requires_package("dscribe") @requires_package("datamol") - def chemical_space( + def soap_descriptors( self, n_samples: Optional[Union[List[int], int]] = None, return_idxs: bool = True, @@ -350,7 +347,7 @@ def chemical_space( idxs = list(range(len(self))) elif isinstance(n_samples, int): idxs = np.random.choice(len(self), size=n_samples, replace=False) - elif isinstance(n_samples, list): + else: # list, set, np.ndarray idxs = n_samples datum = {} r_cut = soap_kwargs.pop("r_cut", 5.0) @@ -383,7 +380,7 @@ def wrapper(idx): entry = self.get_ase_atoms(idx, ext=False) return soap.create(entry, centers=entry.positions) - descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler="threads") + descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler="threads", n_jobs=-1) datum["soap"] = np.vstack(descr) if return_idxs: datum["idxs"] = idxs @@ -392,6 +389,12 @@ def wrapper(idx): def __len__(self): return self.data["energies"].shape[0] + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return x + def __getitem__(self, idx: int): shift = IsolatedAtomEnergyFactory.max_charge p_start, p_end = self.data["position_idx_range"][idx] @@ -402,8 +405,8 @@ def __getitem__(self, idx: int): self.convert_distance(np.array(input[:, -3:], dtype=np.float32)), self.convert_energy(np.array(self.data["energies"][idx], dtype=np.float32)), ) - name = self.data["name"]["uniques"][self.data["name"]["inv_indices"][idx]] - subset = self.data["subset"]["uniques"][self.data["subset"]["inv_indices"][idx]] + name = self.__smiles_converter__(self.data["name"][idx]) + subset = self.data["subset"][idx] if "forces" in self.data: forces = self.convert_forces(np.array(self.data["forces"][p_start:p_end], dtype=np.float32)) diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/comp6.py index c95ec17..7b6890b 100644 --- a/src/openqdc/datasets/comp6.py +++ b/src/openqdc/datasets/comp6.py @@ -35,8 +35,8 @@ class COMP6(BaseDataset): "pbe-d3bj/def2-tzvp", "pbe/def2-tzvp", "svwn/def2-tzvp", - "wb97m-d3bj/def2-tzvp", - "wb97m/def2-tzvp", + # "wb97m-d3bj/def2-tzvp", + # "wb97m/def2-tzvp", ] energy_target_names = [ @@ -47,8 +47,8 @@ class COMP6(BaseDataset): "PBE-D3M(BJ):def2-tzvp", "PBE:def2-tzvp", "SVWN:def2-tzvp", - "WB97M-D3(BJ):def2-tzvp", - "WB97M:def2-tzvp", + # "WB97M-D3(BJ):def2-tzvp", + # "WB97M:def2-tzvp", ] __force_methods__ = [ @@ -59,6 +59,12 @@ class COMP6(BaseDataset): "Gradient", ] + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return "-".join(x.decode("ascii").split("_")[:-1]) + def read_raw_entries(self): samples = [] for subset in ["ani_md", "drugbank", "gdb7_9", "gdb10_13", "s66x8", "tripeptides"]: diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/iso_17.py index 735ae67..4553ec1 100644 --- a/src/openqdc/datasets/iso_17.py +++ b/src/openqdc/datasets/iso_17.py @@ -43,6 +43,12 @@ class ISO17(BaseDataset): __distance_unit__ = "bohr" # bohr __forces_unit__ = "ev/bohr" + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return "-".join(x.decode("ascii").split("_")[:-1]) + def read_raw_entries(self): raw_path = p_join(self.root, "iso_17.h5") samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, self.force_target_names) diff --git a/src/openqdc/datasets/nabladft.py b/src/openqdc/datasets/nabladft.py index e7d9eb8..0555cdc 100644 --- a/src/openqdc/datasets/nabladft.py +++ b/src/openqdc/datasets/nabladft.py @@ -4,30 +4,32 @@ import datamol as dm import numpy as np -from tqdm import tqdm +import pandas as pd from openqdc.datasets.base import BaseDataset +from openqdc.utils.molecule import z_to_formula from openqdc.utils.package_utils import requires_package -def to_mol(entry) -> Dict[str, np.ndarray]: +def to_mol(entry, metadata) -> Dict[str, np.ndarray]: Z, R, E, F = entry[:4] C = np.zeros_like(Z) + E[0] = metadata["DFT TOTAL ENERGY"] res = dict( atomic_inputs=np.concatenate((Z[:, None], C[:, None], R), axis=-1).astype(np.float32), - name=np.array([""]), + name=np.array([metadata["SMILES"]]), energies=E[:, None].astype(np.float32), forces=F[:, :, None].astype(np.float32), n_atoms=np.array([Z.shape[0]], dtype=np.int32), - subset=np.array(["nabla"]), + subset=np.array([z_to_formula(Z)]), ) return res @requires_package("nablaDFT") -def read_chunk_from_db(raw_path, start_idx, stop_idx, step_size=1000): +def read_chunk_from_db(raw_path, start_idx, stop_idx, labels, step_size=1000): from nablaDFT.dataset import HamiltonianDatabase print(f"Loading from {start_idx} to {stop_idx}") @@ -35,7 +37,13 @@ def read_chunk_from_db(raw_path, start_idx, stop_idx, step_size=1000): idxs = list(np.arange(start_idx, stop_idx)) n, s = len(idxs), step_size - samples = [to_mol(entry) for i in tqdm(range(0, n, s)) for entry in db[idxs[i : i + s]]] + cursor = db._get_connection().cursor() + data_idxs = cursor.execute("""SELECT * FROM dataset_ids WHERE id IN (""" + str(idxs)[1:-1] + ")").fetchall() + c_idxs = [tuple(x[1:]) for x in data_idxs] + + samples = [ + to_mol(entry, labels[c_idxs[i + j]]) for i in range(0, n, s) for j, entry in enumerate(db[idxs[i : i + s]]) + ] return samples @@ -68,12 +76,16 @@ class NablaDFT(BaseDataset): def read_raw_entries(self): from nablaDFT.dataset import HamiltonianDatabase + label_path = p_join(self.root, "summary.csv") + df = pd.read_csv(label_path, usecols=["MOSES id", "CONFORMER id", "SMILES", "DFT TOTAL ENERGY"]) + labels = df.set_index(keys=["MOSES id", "CONFORMER id"]).to_dict("index") + raw_path = p_join(self.root, "dataset_full.db") train = HamiltonianDatabase(raw_path) n, c = len(train), 20 step_size = int(np.ceil(n / os.cpu_count())) - fn = lambda i: read_chunk_from_db(raw_path, i * step_size, min((i + 1) * step_size, n)) + fn = lambda i: read_chunk_from_db(raw_path, i * step_size, min((i + 1) * step_size, n), labels=labels) samples = dm.parallelized( fn, list(range(c)), n_jobs=c, progress=False, scheduler="threads" ) # don't use more than 1 job diff --git a/src/openqdc/datasets/pcqm.py b/src/openqdc/datasets/pcqm.py index 505eef1..d1a344c 100644 --- a/src/openqdc/datasets/pcqm.py +++ b/src/openqdc/datasets/pcqm.py @@ -11,7 +11,7 @@ from loguru import logger from openqdc.datasets.base import BaseDataset -from openqdc.utils.io import get_local_cache +from openqdc.utils.io import get_local_cache, push_remote def flatten_dict(d, sep: str = "."): @@ -80,27 +80,83 @@ def __init__(self, energy_unit=None, distance_unit=None) -> None: def root(self): return p_join(get_local_cache(), "pubchemqc") - def collate_list(self, list_entries, partial=False): - # default partial=False is necessary for compatibility with the base class - if partial: - predicat = list_entries is not None and len(list_entries) > 0 - list_entries = [x for x in list_entries if x is not None] - return super().collate_list(list_entries) if predicat else None + @property + def preprocess_path(self): + path = p_join(self.root, "preprocessed", self.__name__) + os.makedirs(path, exist_ok=True) + return path + + def collate_list(self, list_entries): + predicat = list_entries is not None and len(list_entries) > 0 + list_entries = [x for x in list_entries if x is not None] + if predicat: + res = super().collate_list(list_entries) else: - n = 0 - for i in range(len(list_entries)): - list_entries[i]["position_idx_range"] += n - n += list_entries[i]["position_idx_range"].max() - res = {key: np.concatenate([r[key] for r in list_entries], axis=0) for key in list_entries[0]} - return res + res = None + return res def read_raw_entries(self): arxiv_paths = glob(p_join(self.root, f"{self.__energy_methods__[0]}", "*.pkl")) - f = lambda x: self.collate_list(read_preprocessed_archive(x), partial=True) + f = lambda x: self.collate_list(read_preprocessed_archive(x)) samples = dm.parallelized(f, arxiv_paths, n_jobs=1, progress=True) samples = [x for x in samples if x is not None] return samples + def preprocess(self, overwrite=False): + if overwrite or not self.is_preprocessed(): + logger.info("Preprocessing data and saving it to cache.") + logger.info( + f"Dataset {self.__name__} data with the following units:\n" + f"Energy: {self.energy_unit}, Distance: {self.distance_unit}, " + f"Forces: {self.force_unit if self.__force_methods__ else 'None'}" + ) + entries = self.read_raw_entries() + self.collate_and_save_list(entries) + + def collate_and_save_list(self, list_entries): + n_molecules, n_atoms = 0, 0 + for i in range(len(list_entries)): + list_entries[i]["position_idx_range"] += n_atoms + n_atoms += list_entries[i]["position_idx_range"].max() + n_molecules += list_entries[i]["position_idx_range"].shape[0] + + for key in self.data_keys: + first = list_entries[0][key] + shape = (n_molecules, *first.shape[1:]) + local_path = p_join(self.preprocess_path, f"{key}.mmap") + out = np.memmap(local_path, mode="w+", dtype=first.dtype, shape=shape) + + start = 0 + for i in range(len(list_entries)): + x = list_entries[i].pop(key) + n = x.shape[0] + out[start : start + n] = x + out.flush() + push_remote(local_path, overwrite=True) + + # save smiles and subset + tmp, n = dict(name=[]), len(list_entries) + local_path = p_join(self.preprocess_path, "props.pkl") + names = [list_entries[i].pop("name") for i in range(n)] + f = lambda xs: [dm.to_inchikey(x) for x in xs] + res = dm.parallelized(f, names, n_jobs=-1, progress=False) + for x in res: + tmp["name"] += x + for key in ["subset", "n_atoms"]: + tmp[key] = [] + for i in range(n): + tmp[key] += list(list_entries[i].pop(key)) + with open(local_path, "wb") as f: + pkl.dump(tmp, f) + push_remote(local_path, overwrite=True) + # for key in ["name", "subset"]: + # local_path = p_join(self.preprocess_path, f"{key}.npz") + # x = [el for i in range(len(list_entries)) for el in list_entries[i].pop(key)] + # uniques, inv_indices = np.unique(x, return_inverse=True) + # with open(local_path, "wb") as f: + # np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices) + # push_remote(local_path, overwrite=True) + class PCQM_B3LYP(PCQM_PM6): __name__ = "pubchemqc_b3lyp" diff --git a/src/openqdc/datasets/sn2_rxn.py b/src/openqdc/datasets/sn2_rxn.py index 3e75e91..abcbd62 100644 --- a/src/openqdc/datasets/sn2_rxn.py +++ b/src/openqdc/datasets/sn2_rxn.py @@ -25,8 +25,38 @@ class SN2RXN(BaseDataset): "DSD-BLYP-D3(BJ):def2-TZVP Gradient", ] + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return "-".join(x.decode("ascii").split("_")[:-1]) + def read_raw_entries(self): raw_path = p_join(self.root, "sn2_rxn.h5") + + # raw_path = p_join(self.root, "sn2_reactions.npz") + # data = np.load(raw_path) + + # # as example for accessing individual entries, print the data for entry idx=0 + # idx = 0 + # print("Data for entry " + str(idx)+":") + # print("Number of atoms") + # print(data["N"][idx]) + # print("Energy [eV]") + # print(data["E"][idx]) + # print("Total charge") + # print(data["Q"][idx]) + # print("Dipole moment vector (with respect to [0.0 0.0 0.0]) [eA]") + # print(data["D"][idx,:]) + # print("Nuclear charges") + # print(data["Z"][idx,:data["N"][idx]]) + # print("Cartesian coordinates [A]") + # print(data["R"][idx,:data["N"][idx],:]) + # print("Forces [eV/A]") + # print(data["F"][idx,:data["N"][idx],:]) + + # exit() + samples = read_qc_archive_h5(raw_path, "sn2_rxn", self.energy_target_names, self.force_target_names) return samples diff --git a/src/openqdc/datasets/solvated_peptides.py b/src/openqdc/datasets/solvated_peptides.py index 9846bdf..216ecdd 100644 --- a/src/openqdc/datasets/solvated_peptides.py +++ b/src/openqdc/datasets/solvated_peptides.py @@ -27,6 +27,12 @@ class SolvatedPeptides(BaseDataset): __distance_unit__ = "bohr" __forces_unit__ = "hartree/bohr" + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return "_".join(x.decode("ascii").split("_")[:-1]) + def read_raw_entries(self): raw_path = p_join(self.root, "solvated_peptides.h5") samples = read_qc_archive_h5(raw_path, "solvated_peptides", self.energy_target_names, self.force_target_names) diff --git a/src/openqdc/raws/config_factory.py b/src/openqdc/raws/config_factory.py index 38bec86..c8dddba 100644 --- a/src/openqdc/raws/config_factory.py +++ b/src/openqdc/raws/config_factory.py @@ -37,7 +37,7 @@ class DataConfigFactory: sn2_rxn = dict( dataset_name="sn2_rxn", - links={"sn2_rxn.hdf5.gz": "https://zenodo.org/record/3585800/files/212.hdf5.gz"}, + links={"sn2_rxn.hdf5.gz": "https://zenodo.org/records/2605341/files/sn2_reactions.npz"}, ) # FROM: https://sites.uw.edu/wdbase/database-of-water-clusters/ diff --git a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py index 40d0d13..6a1a638 100644 --- a/src/openqdc/utils/atomization_energies.py +++ b/src/openqdc/utils/atomization_energies.py @@ -2,124 +2,126 @@ import numpy as np from loguru import logger +from rdkit import Chem from openqdc.utils.constants import MAX_ATOMIC_NUMBER +atom_table = Chem.GetPeriodicTable() + __all__ = ["chemical_symbols", "atomic_numbers", "IsolatedAtomEnergyFactory"] EF_KEY: TypeAlias = Tuple[str, int] -ATOM_SPECIES = "H", "Li", "B", "C", "N", "O", "F", "Na", "Mg", "Si", "P", "S", "Cl", "K", "Ca", "Br", "I" -# Energy in atomic unit/ Hartree / Ang - # didn t calculate for Pd, Pt, Mo, Ni, Fe, Cu, see DESS atomic_numbers = {} -chemical_symbols = [ - "X", - "H", - "He", - "Li", - "Be", - "B", - "C", - "N", - "O", - "F", - "Ne", - "Na", - "Mg", - "Al", - "Si", - "P", - "S", - "Cl", - "Ar", - "K", - "Ca", - "Sc", - "Ti", - "V", - "Cr", - "Mn", - "Fe", - "Co", - "Ni", - "Cu", - "Zn", - "Ga", - "Ge", - "As", - "Se", - "Br", - "Kr", - "Rb", - "Sr", - "Y", - "Zr", - "Nb", - "Mo", - "Tc", - "Ru", - "Rh", - "Pd", - "Ag", - "Cd", - "In", - "Sn", - "Sb", - "Te", - "I", - "Xe", - "Cs", - "Ba", - "La", - "Ce", - "Pr", - "Nd", - "Pm", - "Sm", - "Eu", - "Gd", - "Tb", - "Dy", - "Ho", - "Er", - "Tm", - "Yb", - "Lu", - "Hf", - "Ta", - "W", - "Re", - "Os", - "Ir", - "Pt", - "Au", - "Hg", - "Tl", - "Pb", - "Bi", - "Po", - "At", - "Rn", - "Fr", - "Ra", - "Ac", - "Th", - "Pa", - "U", - "Np", - "Pu", - "Am", - "Cm", - "Bk", - "Cf", - "Es", - "Fm", - "Md", - "No", - "Lr", -] +chemical_symbols = np.array( + [ + "X", + "H", + "He", + "Li", + "Be", + "B", + "C", + "N", + "O", + "F", + "Ne", + "Na", + "Mg", + "Al", + "Si", + "P", + "S", + "Cl", + "Ar", + "K", + "Ca", + "Sc", + "Ti", + "V", + "Cr", + "Mn", + "Fe", + "Co", + "Ni", + "Cu", + "Zn", + "Ga", + "Ge", + "As", + "Se", + "Br", + "Kr", + "Rb", + "Sr", + "Y", + "Zr", + "Nb", + "Mo", + "Tc", + "Ru", + "Rh", + "Pd", + "Ag", + "Cd", + "In", + "Sn", + "Sb", + "Te", + "I", + "Xe", + "Cs", + "Ba", + "La", + "Ce", + "Pr", + "Nd", + "Pm", + "Sm", + "Eu", + "Gd", + "Tb", + "Dy", + "Ho", + "Er", + "Tm", + "Yb", + "Lu", + "Hf", + "Ta", + "W", + "Re", + "Os", + "Ir", + "Pt", + "Au", + "Hg", + "Tl", + "Pb", + "Bi", + "Po", + "At", + "Rn", + "Fr", + "Ra", + "Ac", + "Th", + "Pa", + "U", + "Np", + "Pu", + "Am", + "Cm", + "Bk", + "Cf", + "Es", + "Fm", + "Md", + "No", + "Lr", + ] +) for Z, symbol in enumerate(chemical_symbols): @@ -131,7 +133,7 @@ class IsolatedAtomEnergyFactory: Factory method to get the isolated atom energies for a given level of theory. """ - max_charge = 4 + max_charge = 6 def __init__(self): pass @@ -207,7 +209,14 @@ def get_matrix(level_of_theory: str) -> np.ndarray: if tuple_hashmap is None: return matrix for key in tuple_hashmap.keys(): - matrix[atomic_numbers[key[0]], key[1] + shift] = tuple_hashmap[key] + try: + matrix[atomic_numbers[key[0]], key[1] + shift] = tuple_hashmap[key] + except KeyError: + print(key, list(tuple_hashmap.items())) + print(key[0], "?", key[1], "?", shift) + print(matrix.shape, atomic_numbers[key[0]], key[1] + shift) + logger.warning(f"Isolated atom energies not found for {key} and level of theory {level_of_theory}") + matrix[atomic_numbers[key[0]], key[1] + shift] = 0 return matrix diff --git a/src/openqdc/utils/molecule.py b/src/openqdc/utils/molecule.py index cd2290f..82a58d2 100644 --- a/src/openqdc/utils/molecule.py +++ b/src/openqdc/utils/molecule.py @@ -1,9 +1,22 @@ +from typing import Any + import numpy as np +from numpy import ndarray from rdkit import Chem +from openqdc.utils.atomization_energies import chemical_symbols + atom_table = Chem.GetPeriodicTable() +def z_to_formula(z): + u, c = np.unique(z, return_counts=True) + idxs = np.argsort(u) + u, c = u[idxs], c[idxs] + + return "".join([f"{chemical_symbols[u[i]]}{c[i] if c[i] > 1 else ''}" for i in range(len(u))]) + + def get_atomic_number(mol: Chem.Mol): """Returns atomic numbers for rdkit molecule""" return np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()]) @@ -17,3 +30,76 @@ def get_atomic_charge(mol: Chem.Mol): def get_atomic_number_and_charge(mol: Chem.Mol): """Returns atoms number and charge for rdkit molecule""" return np.array([[atom.GetAtomicNum(), atom.GetFormalCharge()] for atom in mol.GetAtoms()]) + + +def rmsd(P: ndarray, Q: ndarray, **kwargs) -> float: + """ + Calculate Root-mean-square deviation from two sets of vectors V and W. + + Parameters + ---------- + V : array + (N,D) matrix, where N is points and D is dimension. + W : array + (N,D) matrix, where N is points and D is dimension. + + Returns + ------- + rmsd : float + Root-mean-square deviation between the two vectors + """ + diff = P - Q + return np.sqrt((diff * diff).sum() / P.shape[0]) + + +def kabsch_rmsd( + P: ndarray, + Q: ndarray, + translate: bool = False, + **kwargs: Any, +) -> float: + """ + Rotate matrix P unto Q using Kabsch algorithm and calculate the RMSD. + + Parameters + ---------- + P : array + (N,D) matrix, where N is points and D is dimension. + Q : array + (N,D) matrix, where N is points and D is dimension. + translate : bool + Use centroids to translate vector P and Q unto each other. + + Returns + ------- + rmsd : float + root-mean squared deviation + """ + + if translate: + Q = Q - Q.mean(axis=0) + P = P - P.mean(axis=0) + + # Computation of the covariance matrix + C = np.dot(np.transpose(P), Q) + + # Computation of the optimal rotation matrix + # This can be done using singular value decomposition (SVD) + # Getting the sign of the det(V)*(W) to decide + # whether we need to correct our rotation matrix to ensure a + # right-handed coordinate system. + # And finally calculating the optimal rotation matrix U + # see http://en.wikipedia.org/wiki/Kabsch_algorithm + V, S, W = np.linalg.svd(C) + d = (np.linalg.det(V) * np.linalg.det(W)) < 0.0 + + if d: + S[-1] = -S[-1] + V[:, -1] = -V[:, -1] + + # Create Rotation matrix U + U = np.dot(V, W) + + # Rotate P + P_prime = np.dot(P, U) + return rmsd(P_prime, Q) diff --git a/src/openqdc/utils/preprocess.py b/src/openqdc/utils/preprocess.py index 1142dca..b34499e 100644 --- a/src/openqdc/utils/preprocess.py +++ b/src/openqdc/utils/preprocess.py @@ -36,9 +36,11 @@ def preprocess(dataset): if dataset not in options_map: dataset_id = int(dataset) + data_class = options[dataset_id] + else: + data_class = options_map[dataset] - data_class = options[dataset_id] - data_class().preprocess() + data_class().preprocess(overwrite=False) data = data_class() logger.info(f"Preprocessing {data.__name__}") @@ -47,7 +49,7 @@ def preprocess(dataset): x = data[i] print(x.name, x.subset, end=" ") for k in x: - if x[k] is not None: + if isinstance(x[k], np.ndarray): print(k, x[k].shape, end=" ") print() diff --git a/src/openqdc/utils/units.py b/src/openqdc/utils/units.py index fb895ce..69c8972 100644 --- a/src/openqdc/utils/units.py +++ b/src/openqdc/utils/units.py @@ -73,3 +73,6 @@ def get_conversion(in_unit: str, out_unit: str): Conversion("hartree/ang", "kcal/mol/ang", lambda x: get_conversion("hartree", "kcal/mol")(x)) Conversion("hartree/ang", "hartree/bohr", lambda x: get_conversion("bohr", "ang")(x)) Conversion("hartree/bohr", "hartree/ang", lambda x: get_conversion("ang", "bohr")(x)) +Conversion("ev/bohr", "kcal/mol/ang", lambda x: get_conversion("ang", "bohr")(get_conversion("ev", "kcal/mol")(x))) +Conversion("kcal/mol/bohr", "kcal/mol/ang", lambda x: get_conversion("ang", "bohr")(x)) +Conversion("ev/ang", "kcal/mol/ang", lambda x: get_conversion("ev", "kcal/mol")(x))