diff --git a/src/openqdc/__init__.py b/src/openqdc/__init__.py
index 1432923..e77bc9e 100644
--- a/src/openqdc/__init__.py
+++ b/src/openqdc/__init__.py
@@ -9,7 +9,7 @@
 
 _lazy_imports_obj = {}
 
-_lazy_imports_mod = {"datasets": "openqdc.datamodule", "utils": "openqdc.utils"}
+_lazy_imports_mod = {"datasets": "openqdc.datasets", "utils": "openqdc.utils"}
 
 
 def __getattr__(name):
diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py
index 913fb8a..3f1b92b 100644
--- a/src/openqdc/datasets/ani.py
+++ b/src/openqdc/datasets/ani.py
@@ -39,6 +39,12 @@ class ANI1(BaseDataset):
     def root(self):
         return p_join(get_local_cache(), "ani")
 
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return "-".join(x.decode("ascii").split("-")[:-1])
+
     @property
     def preprocess_path(self):
         path = p_join(self.root, "preprocessed", self.__name__)
@@ -89,6 +95,12 @@ class ANI1CCX(ANI1):
     __force_methods__ = []
     force_target_names = []
 
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return x
+
 
 class ANI1X(ANI1):
     """
@@ -145,3 +157,9 @@ class ANI1X(ANI1):
 
     def convert_forces(self, x):
         return super().convert_forces(x) * 0.529177249  # correct the Dataset error
+
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return x
diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py
index 1de6ff1..995b297 100644
--- a/src/openqdc/datasets/base.py
+++ b/src/openqdc/datasets/base.py
@@ -1,4 +1,5 @@
 import os
+import pickle as pkl
 from os.path import join as p_join
 from typing import Dict, List, Optional, Union
 
@@ -24,7 +25,7 @@
     push_remote,
     set_cache_dir,
 )
-from openqdc.utils.molecule import atom_table
+from openqdc.utils.molecule import atom_table, z_to_formula
 from openqdc.utils.package_utils import requires_package
 from openqdc.utils.units import get_conversion
 
@@ -43,7 +44,7 @@ def extract_entry(
 
     res = dict(
         name=np.array([df["name"][i]]),
-        subset=np.array([subset]),
+        subset=np.array([subset if subset is not None else z_to_formula(x)]),
         energies=energies.reshape((1, -1)).astype(np.float32),
         atomic_inputs=np.concatenate((xs, positions), axis=-1, dtype=np.float32),
         n_atoms=np.array([x.shape[0]], dtype=np.int32),
@@ -64,8 +65,8 @@ def read_qc_archive_h5(
 ) -> List[Dict[str, np.ndarray]]:
     data = load_hdf5_file(raw_path)
     data_t = {k2: data[k1][k2][:] for k1 in data.keys() for k2 in data[k1].keys()}
-    n = len(data_t["molecule_id"])
 
+    n = len(data_t["molecule_id"])
     samples = [extract_entry(data_t, i, subset, energy_target_names, force_target_names) for i in tqdm(range(n))]
     return samples
 
@@ -96,9 +97,6 @@ def __init__(
         self._set_units(energy_unit, distance_unit)
         if not self.is_preprocessed():
             logger.info("This dataset not available. Please open an issue on Github for the team to look into it.")
-            # entries = self.read_raw_entries()
-            # res = self.collate_list(entries)
-            # self.save_preprocess(res)
         else:
             self.read_preprocess(overwrite_local_cache=overwrite_local_cache)
             self._set_isolated_atom_energies()
@@ -107,12 +105,12 @@ def __init__(
     def numbers(self):
         if hasattr(self, "_numbers"):
             return self._numbers
-        self._numbers = np.array(list(set(self.data["atomic_inputs"][..., 0])), dtype=np.int32)
+        self._numbers = np.unique(self.data["atomic_inputs"][..., 0]).astype(np.int32)
         return self._numbers
 
     @property
     def chemical_species(self):
-        return [chemical_symbols[z] for z in self.numbers]
+        return np.array(chemical_symbols)[self.numbers]
 
     @property
     def energy_unit(self):
@@ -211,10 +209,11 @@ def collate_list(self, list_entries):
         # concatenate entries
         res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}
 
-        csum = np.cumsum(res.pop("n_atoms"))
+        csum = np.cumsum(res.get("n_atoms"))
         x = np.zeros((csum.shape[0], 2), dtype=np.int32)
         x[1:, 0], x[:, 1] = csum[:-1], csum
         res["position_idx_range"] = x
+
         return res
 
     def save_preprocess(self, data_dict):
@@ -228,12 +227,13 @@ def save_preprocess(self, data_dict):
             push_remote(local_path, overwrite=True)
 
         # save smiles and subset
+        local_path = p_join(self.preprocess_path, "props.pkl")
         for key in ["name", "subset"]:
-            local_path = p_join(self.preprocess_path, f"{key}.npz")
-            uniques, inv_indices = np.unique(data_dict[key], return_inverse=True)
-            with open(local_path, "wb") as f:
-                np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices)
-            push_remote(local_path)
+            data_dict[key] = np.unique(data_dict[key], return_inverse=True)
+
+        with open(local_path, "wb") as f:
+            pkl.dump(data_dict, f)
+        push_remote(local_path, overwrite=True)
 
     def read_preprocess(self, overwrite_local_cache=False):
         logger.info("Reading preprocessed data")
@@ -247,32 +247,29 @@ def read_preprocess(self, overwrite_local_cache=False):
         for key in self.data_keys:
             filename = p_join(self.preprocess_path, f"{key}.mmap")
             pull_locally(filename, overwrite=overwrite_local_cache)
-            self.data[key] = np.memmap(
-                filename,
-                mode="r",
-                dtype=self.data_types[key],
-            ).reshape(self.data_shapes[key])
+            self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(self.data_shapes[key])
+
+        filename = p_join(self.preprocess_path, "props.pkl")
+        pull_locally(filename, overwrite=overwrite_local_cache)
+        with open(filename, "rb") as f:
+            tmp = pkl.load(f)
+            for key in ["name", "subset", "n_atoms"]:
+                x = tmp.pop(key)
+                if len(x) == 2:
+                    self.data[key] = x[0][x[1]]
+                else:
+                    self.data[key] = x
 
         for key in self.data:
             print(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")
 
-        for key in ["name", "subset"]:
-            filename = p_join(self.preprocess_path, f"{key}.npz")
-            pull_locally(filename)
-            self.data[key] = dict()
-            with open(filename, "rb") as f:
-                tmp = np.load(f)
-                for k in tmp:
-                    self.data[key][k] = tmp[k]
-                    print(f"Loaded {key}_{k} with shape {self.data[key][k].shape}, dtype {self.data[key][k].dtype}")
-
     def is_preprocessed(self):
         predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys]
-        predicats += [copy_exists(p_join(self.preprocess_path, f"{x}.npz")) for x in ["name", "subset"]]
+        predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl"))]
         return all(predicats)
 
-    def preprocess(self):
-        if not self.is_preprocessed():
+    def preprocess(self, overwrite=False):
+        if overwrite or not self.is_preprocessed():
             entries = self.read_raw_entries()
             res = self.collate_list(entries)
             self.save_preprocess(res)
@@ -305,7 +302,7 @@ def get_ase_atoms(self, idx: int, ext=True):
 
     @requires_package("dscribe")
     @requires_package("datamol")
-    def chemical_space(
+    def soap_descriptors(
         self,
         n_samples: Optional[Union[List[int], int]] = None,
         return_idxs: bool = True,
@@ -350,7 +347,7 @@ def chemical_space(
             idxs = list(range(len(self)))
         elif isinstance(n_samples, int):
             idxs = np.random.choice(len(self), size=n_samples, replace=False)
-        elif isinstance(n_samples, list):
+        else:  # list, set, np.ndarray
             idxs = n_samples
         datum = {}
         r_cut = soap_kwargs.pop("r_cut", 5.0)
@@ -383,7 +380,7 @@ def wrapper(idx):
             entry = self.get_ase_atoms(idx, ext=False)
             return soap.create(entry, centers=entry.positions)
 
-        descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler="threads")
+        descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler="threads", n_jobs=-1)
         datum["soap"] = np.vstack(descr)
         if return_idxs:
             datum["idxs"] = idxs
@@ -392,6 +389,12 @@ def wrapper(idx):
     def __len__(self):
         return self.data["energies"].shape[0]
 
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return x
+
     def __getitem__(self, idx: int):
         shift = IsolatedAtomEnergyFactory.max_charge
         p_start, p_end = self.data["position_idx_range"][idx]
@@ -402,8 +405,8 @@ def __getitem__(self, idx: int):
             self.convert_distance(np.array(input[:, -3:], dtype=np.float32)),
             self.convert_energy(np.array(self.data["energies"][idx], dtype=np.float32)),
         )
-        name = self.data["name"]["uniques"][self.data["name"]["inv_indices"][idx]]
-        subset = self.data["subset"]["uniques"][self.data["subset"]["inv_indices"][idx]]
+        name = self.__smiles_converter__(self.data["name"][idx])
+        subset = self.data["subset"][idx]
 
         if "forces" in self.data:
             forces = self.convert_forces(np.array(self.data["forces"][p_start:p_end], dtype=np.float32))
diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/comp6.py
index c95ec17..7b6890b 100644
--- a/src/openqdc/datasets/comp6.py
+++ b/src/openqdc/datasets/comp6.py
@@ -35,8 +35,8 @@ class COMP6(BaseDataset):
         "pbe-d3bj/def2-tzvp",
         "pbe/def2-tzvp",
         "svwn/def2-tzvp",
-        "wb97m-d3bj/def2-tzvp",
-        "wb97m/def2-tzvp",
+        # "wb97m-d3bj/def2-tzvp",
+        # "wb97m/def2-tzvp",
     ]
 
     energy_target_names = [
@@ -47,8 +47,8 @@ class COMP6(BaseDataset):
         "PBE-D3M(BJ):def2-tzvp",
         "PBE:def2-tzvp",
         "SVWN:def2-tzvp",
-        "WB97M-D3(BJ):def2-tzvp",
-        "WB97M:def2-tzvp",
+        # "WB97M-D3(BJ):def2-tzvp",
+        # "WB97M:def2-tzvp",
     ]
 
     __force_methods__ = [
@@ -59,6 +59,12 @@ class COMP6(BaseDataset):
         "Gradient",
     ]
 
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return "-".join(x.decode("ascii").split("_")[:-1])
+
     def read_raw_entries(self):
         samples = []
         for subset in ["ani_md", "drugbank", "gdb7_9", "gdb10_13", "s66x8", "tripeptides"]:
diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/iso_17.py
index 735ae67..4553ec1 100644
--- a/src/openqdc/datasets/iso_17.py
+++ b/src/openqdc/datasets/iso_17.py
@@ -43,6 +43,12 @@ class ISO17(BaseDataset):
     __distance_unit__ = "bohr"  # bohr
     __forces_unit__ = "ev/bohr"
 
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return "-".join(x.decode("ascii").split("_")[:-1])
+
     def read_raw_entries(self):
         raw_path = p_join(self.root, "iso_17.h5")
         samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, self.force_target_names)
diff --git a/src/openqdc/datasets/nabladft.py b/src/openqdc/datasets/nabladft.py
index e7d9eb8..0555cdc 100644
--- a/src/openqdc/datasets/nabladft.py
+++ b/src/openqdc/datasets/nabladft.py
@@ -4,30 +4,32 @@
 
 import datamol as dm
 import numpy as np
-from tqdm import tqdm
+import pandas as pd
 
 from openqdc.datasets.base import BaseDataset
+from openqdc.utils.molecule import z_to_formula
 from openqdc.utils.package_utils import requires_package
 
 
-def to_mol(entry) -> Dict[str, np.ndarray]:
+def to_mol(entry, metadata) -> Dict[str, np.ndarray]:
     Z, R, E, F = entry[:4]
     C = np.zeros_like(Z)
+    E[0] = metadata["DFT TOTAL ENERGY"]
 
     res = dict(
         atomic_inputs=np.concatenate((Z[:, None], C[:, None], R), axis=-1).astype(np.float32),
-        name=np.array([""]),
+        name=np.array([metadata["SMILES"]]),
         energies=E[:, None].astype(np.float32),
         forces=F[:, :, None].astype(np.float32),
         n_atoms=np.array([Z.shape[0]], dtype=np.int32),
-        subset=np.array(["nabla"]),
+        subset=np.array([z_to_formula(Z)]),
     )
 
     return res
 
 
 @requires_package("nablaDFT")
-def read_chunk_from_db(raw_path, start_idx, stop_idx, step_size=1000):
+def read_chunk_from_db(raw_path, start_idx, stop_idx, labels, step_size=1000):
     from nablaDFT.dataset import HamiltonianDatabase
 
     print(f"Loading from {start_idx} to {stop_idx}")
@@ -35,7 +37,13 @@ def read_chunk_from_db(raw_path, start_idx, stop_idx, step_size=1000):
     idxs = list(np.arange(start_idx, stop_idx))
     n, s = len(idxs), step_size
 
-    samples = [to_mol(entry) for i in tqdm(range(0, n, s)) for entry in db[idxs[i : i + s]]]
+    cursor = db._get_connection().cursor()
+    data_idxs = cursor.execute("""SELECT * FROM dataset_ids WHERE id IN (""" + str(idxs)[1:-1] + ")").fetchall()
+    c_idxs = [tuple(x[1:]) for x in data_idxs]
+
+    samples = [
+        to_mol(entry, labels[c_idxs[i + j]]) for i in range(0, n, s) for j, entry in enumerate(db[idxs[i : i + s]])
+    ]
     return samples
 
 
@@ -68,12 +76,16 @@ class NablaDFT(BaseDataset):
     def read_raw_entries(self):
         from nablaDFT.dataset import HamiltonianDatabase
 
+        label_path = p_join(self.root, "summary.csv")
+        df = pd.read_csv(label_path, usecols=["MOSES id", "CONFORMER id", "SMILES", "DFT TOTAL ENERGY"])
+        labels = df.set_index(keys=["MOSES id", "CONFORMER id"]).to_dict("index")
+
         raw_path = p_join(self.root, "dataset_full.db")
         train = HamiltonianDatabase(raw_path)
         n, c = len(train), 20
         step_size = int(np.ceil(n / os.cpu_count()))
 
-        fn = lambda i: read_chunk_from_db(raw_path, i * step_size, min((i + 1) * step_size, n))
+        fn = lambda i: read_chunk_from_db(raw_path, i * step_size, min((i + 1) * step_size, n), labels=labels)
         samples = dm.parallelized(
             fn, list(range(c)), n_jobs=c, progress=False, scheduler="threads"
         )  # don't use more than 1 job
diff --git a/src/openqdc/datasets/pcqm.py b/src/openqdc/datasets/pcqm.py
index 505eef1..d1a344c 100644
--- a/src/openqdc/datasets/pcqm.py
+++ b/src/openqdc/datasets/pcqm.py
@@ -11,7 +11,7 @@
 from loguru import logger
 
 from openqdc.datasets.base import BaseDataset
-from openqdc.utils.io import get_local_cache
+from openqdc.utils.io import get_local_cache, push_remote
 
 
 def flatten_dict(d, sep: str = "."):
@@ -80,27 +80,83 @@ def __init__(self, energy_unit=None, distance_unit=None) -> None:
     def root(self):
         return p_join(get_local_cache(), "pubchemqc")
 
-    def collate_list(self, list_entries, partial=False):
-        # default partial=False is necessary for compatibility with the base class
-        if partial:
-            predicat = list_entries is not None and len(list_entries) > 0
-            list_entries = [x for x in list_entries if x is not None]
-            return super().collate_list(list_entries) if predicat else None
+    @property
+    def preprocess_path(self):
+        path = p_join(self.root, "preprocessed", self.__name__)
+        os.makedirs(path, exist_ok=True)
+        return path
+
+    def collate_list(self, list_entries):
+        predicat = list_entries is not None and len(list_entries) > 0
+        list_entries = [x for x in list_entries if x is not None]
+        if predicat:
+            res = super().collate_list(list_entries)
         else:
-            n = 0
-            for i in range(len(list_entries)):
-                list_entries[i]["position_idx_range"] += n
-                n += list_entries[i]["position_idx_range"].max()
-            res = {key: np.concatenate([r[key] for r in list_entries], axis=0) for key in list_entries[0]}
-            return res
+            res = None
+        return res
 
     def read_raw_entries(self):
         arxiv_paths = glob(p_join(self.root, f"{self.__energy_methods__[0]}", "*.pkl"))
-        f = lambda x: self.collate_list(read_preprocessed_archive(x), partial=True)
+        f = lambda x: self.collate_list(read_preprocessed_archive(x))
         samples = dm.parallelized(f, arxiv_paths, n_jobs=1, progress=True)
         samples = [x for x in samples if x is not None]
         return samples
 
+    def preprocess(self, overwrite=False):
+        if overwrite or not self.is_preprocessed():
+            logger.info("Preprocessing data and saving it to cache.")
+            logger.info(
+                f"Dataset {self.__name__} data with the following units:\n"
+                f"Energy: {self.energy_unit}, Distance: {self.distance_unit}, "
+                f"Forces: {self.force_unit if self.__force_methods__ else 'None'}"
+            )
+            entries = self.read_raw_entries()
+            self.collate_and_save_list(entries)
+
+    def collate_and_save_list(self, list_entries):
+        n_molecules, n_atoms = 0, 0
+        for i in range(len(list_entries)):
+            list_entries[i]["position_idx_range"] += n_atoms
+            n_atoms += list_entries[i]["position_idx_range"].max()
+            n_molecules += list_entries[i]["position_idx_range"].shape[0]
+
+        for key in self.data_keys:
+            first = list_entries[0][key]
+            shape = (n_molecules, *first.shape[1:])
+            local_path = p_join(self.preprocess_path, f"{key}.mmap")
+            out = np.memmap(local_path, mode="w+", dtype=first.dtype, shape=shape)
+
+            start = 0
+            for i in range(len(list_entries)):
+                x = list_entries[i].pop(key)
+                n = x.shape[0]
+                out[start : start + n] = x
+                out.flush()
+            push_remote(local_path, overwrite=True)
+
+        # save smiles and subset
+        tmp, n = dict(name=[]), len(list_entries)
+        local_path = p_join(self.preprocess_path, "props.pkl")
+        names = [list_entries[i].pop("name") for i in range(n)]
+        f = lambda xs: [dm.to_inchikey(x) for x in xs]
+        res = dm.parallelized(f, names, n_jobs=-1, progress=False)
+        for x in res:
+            tmp["name"] += x
+        for key in ["subset", "n_atoms"]:
+            tmp[key] = []
+            for i in range(n):
+                tmp[key] += list(list_entries[i].pop(key))
+        with open(local_path, "wb") as f:
+            pkl.dump(tmp, f)
+        push_remote(local_path, overwrite=True)
+        # for key in ["name", "subset"]:
+        #     local_path = p_join(self.preprocess_path, f"{key}.npz")
+        #     x = [el for i in range(len(list_entries)) for el in list_entries[i].pop(key)]
+        #     uniques, inv_indices = np.unique(x, return_inverse=True)
+        #     with open(local_path, "wb") as f:
+        #         np.savez_compressed(f, uniques=uniques, inv_indices=inv_indices)
+        #     push_remote(local_path, overwrite=True)
+
 
 class PCQM_B3LYP(PCQM_PM6):
     __name__ = "pubchemqc_b3lyp"
diff --git a/src/openqdc/datasets/sn2_rxn.py b/src/openqdc/datasets/sn2_rxn.py
index 3e75e91..abcbd62 100644
--- a/src/openqdc/datasets/sn2_rxn.py
+++ b/src/openqdc/datasets/sn2_rxn.py
@@ -25,8 +25,38 @@ class SN2RXN(BaseDataset):
         "DSD-BLYP-D3(BJ):def2-TZVP Gradient",
     ]
 
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return "-".join(x.decode("ascii").split("_")[:-1])
+
     def read_raw_entries(self):
         raw_path = p_join(self.root, "sn2_rxn.h5")
+
+        # raw_path = p_join(self.root, "sn2_reactions.npz")
+        # data = np.load(raw_path)
+
+        # # as example for accessing individual entries, print the data for entry idx=0
+        # idx = 0
+        # print("Data for entry " + str(idx)+":")
+        # print("Number of atoms")
+        # print(data["N"][idx])
+        # print("Energy [eV]")
+        # print(data["E"][idx])
+        # print("Total charge")
+        # print(data["Q"][idx])
+        # print("Dipole moment vector (with respect to [0.0 0.0 0.0]) [eA]")
+        # print(data["D"][idx,:])
+        # print("Nuclear charges")
+        # print(data["Z"][idx,:data["N"][idx]])
+        # print("Cartesian coordinates [A]")
+        # print(data["R"][idx,:data["N"][idx],:])
+        # print("Forces [eV/A]")
+        # print(data["F"][idx,:data["N"][idx],:])
+
+        # exit()
+
         samples = read_qc_archive_h5(raw_path, "sn2_rxn", self.energy_target_names, self.force_target_names)
 
         return samples
diff --git a/src/openqdc/datasets/solvated_peptides.py b/src/openqdc/datasets/solvated_peptides.py
index 9846bdf..216ecdd 100644
--- a/src/openqdc/datasets/solvated_peptides.py
+++ b/src/openqdc/datasets/solvated_peptides.py
@@ -27,6 +27,12 @@ class SolvatedPeptides(BaseDataset):
     __distance_unit__ = "bohr"
     __forces_unit__ = "hartree/bohr"
 
+    def __smiles_converter__(self, x):
+        """util function to convert string to smiles: useful if the smiles is
+        encoded in a different format than its display format
+        """
+        return "_".join(x.decode("ascii").split("_")[:-1])
+
     def read_raw_entries(self):
         raw_path = p_join(self.root, "solvated_peptides.h5")
         samples = read_qc_archive_h5(raw_path, "solvated_peptides", self.energy_target_names, self.force_target_names)
diff --git a/src/openqdc/raws/config_factory.py b/src/openqdc/raws/config_factory.py
index 38bec86..c8dddba 100644
--- a/src/openqdc/raws/config_factory.py
+++ b/src/openqdc/raws/config_factory.py
@@ -37,7 +37,7 @@ class DataConfigFactory:
 
     sn2_rxn = dict(
         dataset_name="sn2_rxn",
-        links={"sn2_rxn.hdf5.gz": "https://zenodo.org/record/3585800/files/212.hdf5.gz"},
+        links={"sn2_rxn.hdf5.gz": "https://zenodo.org/records/2605341/files/sn2_reactions.npz"},
     )
 
     # FROM: https://sites.uw.edu/wdbase/database-of-water-clusters/
diff --git a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py
index 40d0d13..6a1a638 100644
--- a/src/openqdc/utils/atomization_energies.py
+++ b/src/openqdc/utils/atomization_energies.py
@@ -2,124 +2,126 @@
 
 import numpy as np
 from loguru import logger
+from rdkit import Chem
 
 from openqdc.utils.constants import MAX_ATOMIC_NUMBER
 
+atom_table = Chem.GetPeriodicTable()
+
 __all__ = ["chemical_symbols", "atomic_numbers", "IsolatedAtomEnergyFactory"]
 
 EF_KEY: TypeAlias = Tuple[str, int]
 
-ATOM_SPECIES = "H", "Li", "B", "C", "N", "O", "F", "Na", "Mg", "Si", "P", "S", "Cl", "K", "Ca", "Br", "I"
-# Energy in atomic unit/ Hartree / Ang
-
 # didn t calculate for Pd, Pt, Mo, Ni, Fe, Cu, see DESS
 atomic_numbers = {}
-chemical_symbols = [
-    "X",
-    "H",
-    "He",
-    "Li",
-    "Be",
-    "B",
-    "C",
-    "N",
-    "O",
-    "F",
-    "Ne",
-    "Na",
-    "Mg",
-    "Al",
-    "Si",
-    "P",
-    "S",
-    "Cl",
-    "Ar",
-    "K",
-    "Ca",
-    "Sc",
-    "Ti",
-    "V",
-    "Cr",
-    "Mn",
-    "Fe",
-    "Co",
-    "Ni",
-    "Cu",
-    "Zn",
-    "Ga",
-    "Ge",
-    "As",
-    "Se",
-    "Br",
-    "Kr",
-    "Rb",
-    "Sr",
-    "Y",
-    "Zr",
-    "Nb",
-    "Mo",
-    "Tc",
-    "Ru",
-    "Rh",
-    "Pd",
-    "Ag",
-    "Cd",
-    "In",
-    "Sn",
-    "Sb",
-    "Te",
-    "I",
-    "Xe",
-    "Cs",
-    "Ba",
-    "La",
-    "Ce",
-    "Pr",
-    "Nd",
-    "Pm",
-    "Sm",
-    "Eu",
-    "Gd",
-    "Tb",
-    "Dy",
-    "Ho",
-    "Er",
-    "Tm",
-    "Yb",
-    "Lu",
-    "Hf",
-    "Ta",
-    "W",
-    "Re",
-    "Os",
-    "Ir",
-    "Pt",
-    "Au",
-    "Hg",
-    "Tl",
-    "Pb",
-    "Bi",
-    "Po",
-    "At",
-    "Rn",
-    "Fr",
-    "Ra",
-    "Ac",
-    "Th",
-    "Pa",
-    "U",
-    "Np",
-    "Pu",
-    "Am",
-    "Cm",
-    "Bk",
-    "Cf",
-    "Es",
-    "Fm",
-    "Md",
-    "No",
-    "Lr",
-]
+chemical_symbols = np.array(
+    [
+        "X",
+        "H",
+        "He",
+        "Li",
+        "Be",
+        "B",
+        "C",
+        "N",
+        "O",
+        "F",
+        "Ne",
+        "Na",
+        "Mg",
+        "Al",
+        "Si",
+        "P",
+        "S",
+        "Cl",
+        "Ar",
+        "K",
+        "Ca",
+        "Sc",
+        "Ti",
+        "V",
+        "Cr",
+        "Mn",
+        "Fe",
+        "Co",
+        "Ni",
+        "Cu",
+        "Zn",
+        "Ga",
+        "Ge",
+        "As",
+        "Se",
+        "Br",
+        "Kr",
+        "Rb",
+        "Sr",
+        "Y",
+        "Zr",
+        "Nb",
+        "Mo",
+        "Tc",
+        "Ru",
+        "Rh",
+        "Pd",
+        "Ag",
+        "Cd",
+        "In",
+        "Sn",
+        "Sb",
+        "Te",
+        "I",
+        "Xe",
+        "Cs",
+        "Ba",
+        "La",
+        "Ce",
+        "Pr",
+        "Nd",
+        "Pm",
+        "Sm",
+        "Eu",
+        "Gd",
+        "Tb",
+        "Dy",
+        "Ho",
+        "Er",
+        "Tm",
+        "Yb",
+        "Lu",
+        "Hf",
+        "Ta",
+        "W",
+        "Re",
+        "Os",
+        "Ir",
+        "Pt",
+        "Au",
+        "Hg",
+        "Tl",
+        "Pb",
+        "Bi",
+        "Po",
+        "At",
+        "Rn",
+        "Fr",
+        "Ra",
+        "Ac",
+        "Th",
+        "Pa",
+        "U",
+        "Np",
+        "Pu",
+        "Am",
+        "Cm",
+        "Bk",
+        "Cf",
+        "Es",
+        "Fm",
+        "Md",
+        "No",
+        "Lr",
+    ]
+)
 
 
 for Z, symbol in enumerate(chemical_symbols):
@@ -131,7 +133,7 @@ class IsolatedAtomEnergyFactory:
     Factory method to get the isolated atom energies for a given level of theory.
     """
 
-    max_charge = 4
+    max_charge = 6
 
     def __init__(self):
         pass
@@ -207,7 +209,14 @@ def get_matrix(level_of_theory: str) -> np.ndarray:
         if tuple_hashmap is None:
             return matrix
         for key in tuple_hashmap.keys():
-            matrix[atomic_numbers[key[0]], key[1] + shift] = tuple_hashmap[key]
+            try:
+                matrix[atomic_numbers[key[0]], key[1] + shift] = tuple_hashmap[key]
+            except KeyError:
+                print(key, list(tuple_hashmap.items()))
+                print(key[0], "?", key[1], "?", shift)
+                print(matrix.shape, atomic_numbers[key[0]], key[1] + shift)
+                logger.warning(f"Isolated atom energies not found for {key} and level of theory {level_of_theory}")
+                matrix[atomic_numbers[key[0]], key[1] + shift] = 0
         return matrix
 
 
diff --git a/src/openqdc/utils/molecule.py b/src/openqdc/utils/molecule.py
index cd2290f..82a58d2 100644
--- a/src/openqdc/utils/molecule.py
+++ b/src/openqdc/utils/molecule.py
@@ -1,9 +1,22 @@
+from typing import Any
+
 import numpy as np
+from numpy import ndarray
 from rdkit import Chem
 
+from openqdc.utils.atomization_energies import chemical_symbols
+
 atom_table = Chem.GetPeriodicTable()
 
 
+def z_to_formula(z):
+    u, c = np.unique(z, return_counts=True)
+    idxs = np.argsort(u)
+    u, c = u[idxs], c[idxs]
+
+    return "".join([f"{chemical_symbols[u[i]]}{c[i] if c[i] > 1 else ''}" for i in range(len(u))])
+
+
 def get_atomic_number(mol: Chem.Mol):
     """Returns atomic numbers for rdkit molecule"""
     return np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()])
@@ -17,3 +30,76 @@ def get_atomic_charge(mol: Chem.Mol):
 def get_atomic_number_and_charge(mol: Chem.Mol):
     """Returns atoms number and charge for rdkit molecule"""
     return np.array([[atom.GetAtomicNum(), atom.GetFormalCharge()] for atom in mol.GetAtoms()])
+
+
+def rmsd(P: ndarray, Q: ndarray, **kwargs) -> float:
+    """
+    Calculate Root-mean-square deviation from two sets of vectors V and W.
+
+    Parameters
+    ----------
+    V : array
+        (N,D) matrix, where N is points and D is dimension.
+    W : array
+        (N,D) matrix, where N is points and D is dimension.
+
+    Returns
+    -------
+    rmsd : float
+        Root-mean-square deviation between the two vectors
+    """
+    diff = P - Q
+    return np.sqrt((diff * diff).sum() / P.shape[0])
+
+
+def kabsch_rmsd(
+    P: ndarray,
+    Q: ndarray,
+    translate: bool = False,
+    **kwargs: Any,
+) -> float:
+    """
+    Rotate matrix P unto Q using Kabsch algorithm and calculate the RMSD.
+
+    Parameters
+    ----------
+    P : array
+        (N,D) matrix, where N is points and D is dimension.
+    Q : array
+        (N,D) matrix, where N is points and D is dimension.
+    translate : bool
+        Use centroids to translate vector P and Q unto each other.
+
+    Returns
+    -------
+    rmsd : float
+        root-mean squared deviation
+    """
+
+    if translate:
+        Q = Q - Q.mean(axis=0)
+        P = P - P.mean(axis=0)
+
+    # Computation of the covariance matrix
+    C = np.dot(np.transpose(P), Q)
+
+    # Computation of the optimal rotation matrix
+    # This can be done using singular value decomposition (SVD)
+    # Getting the sign of the det(V)*(W) to decide
+    # whether we need to correct our rotation matrix to ensure a
+    # right-handed coordinate system.
+    # And finally calculating the optimal rotation matrix U
+    # see http://en.wikipedia.org/wiki/Kabsch_algorithm
+    V, S, W = np.linalg.svd(C)
+    d = (np.linalg.det(V) * np.linalg.det(W)) < 0.0
+
+    if d:
+        S[-1] = -S[-1]
+        V[:, -1] = -V[:, -1]
+
+    # Create Rotation matrix U
+    U = np.dot(V, W)
+
+    # Rotate P
+    P_prime = np.dot(P, U)
+    return rmsd(P_prime, Q)
diff --git a/src/openqdc/utils/preprocess.py b/src/openqdc/utils/preprocess.py
index 1142dca..b34499e 100644
--- a/src/openqdc/utils/preprocess.py
+++ b/src/openqdc/utils/preprocess.py
@@ -36,9 +36,11 @@
 def preprocess(dataset):
     if dataset not in options_map:
         dataset_id = int(dataset)
+        data_class = options[dataset_id]
+    else:
+        data_class = options_map[dataset]
 
-    data_class = options[dataset_id]
-    data_class().preprocess()
+    data_class().preprocess(overwrite=False)
     data = data_class()
     logger.info(f"Preprocessing {data.__name__}")
 
@@ -47,7 +49,7 @@ def preprocess(dataset):
         x = data[i]
         print(x.name, x.subset, end=" ")
         for k in x:
-            if x[k] is not None:
+            if isinstance(x[k], np.ndarray):
                 print(k, x[k].shape, end=" ")
         print()
 
diff --git a/src/openqdc/utils/units.py b/src/openqdc/utils/units.py
index fb895ce..69c8972 100644
--- a/src/openqdc/utils/units.py
+++ b/src/openqdc/utils/units.py
@@ -73,3 +73,6 @@ def get_conversion(in_unit: str, out_unit: str):
 Conversion("hartree/ang", "kcal/mol/ang", lambda x: get_conversion("hartree", "kcal/mol")(x))
 Conversion("hartree/ang", "hartree/bohr", lambda x: get_conversion("bohr", "ang")(x))
 Conversion("hartree/bohr", "hartree/ang", lambda x: get_conversion("ang", "bohr")(x))
+Conversion("ev/bohr", "kcal/mol/ang", lambda x: get_conversion("ang", "bohr")(get_conversion("ev", "kcal/mol")(x)))
+Conversion("kcal/mol/bohr", "kcal/mol/ang", lambda x: get_conversion("ang", "bohr")(x))
+Conversion("ev/ang", "kcal/mol/ang", lambda x: get_conversion("ev", "kcal/mol")(x))