AstraZeneca · benedekrozemberczki · Jan 25, 2022 · Jan 24, 2022 · Jan 24, 2022 · Jan 24, 2022
diff --git a/chemicalx/data/__init__.py b/chemicalx/data/__init__.py
@@ -8,6 +8,8 @@
     DrugbankDDI,
     DrugComb,
     DrugCombDB,
+    LocalDatasetLoader,
+    OncoPolyPharmacology,
     RemoteDatasetLoader,
     TwoSides,
 )
@@ -20,13 +22,17 @@
     "DrugFeatureSet",
     "DrugPairBatch",
     "LabeledTriples",
-    # Datasets
+    # Abstract datasets
     "dataset_resolver",
     "DatasetLoader",
+    "RemoteDatasetLoader",
+    "LocalDatasetLoader",
+    # Datasets
     "DrugbankDDI",
     "TwoSides",
     "DrugComb",
     "DrugCombDB",
+    "OncoPolyPharmacology",
 ]
 
-dataset_resolver = Resolver.from_subclasses(base=DatasetLoader, skip={RemoteDatasetLoader})
+dataset_resolver = Resolver.from_subclasses(base=DatasetLoader, skip={RemoteDatasetLoader, LocalDatasetLoader})
diff --git a/chemicalx/data/contextfeatureset.py b/chemicalx/data/contextfeatureset.py
@@ -1,7 +1,7 @@
 """A module for the context feature set class."""
 
 from collections import UserDict
-from typing import Iterable, Mapping
+from typing import Iterable, Mapping, Sequence
 
 import torch
 
@@ -13,6 +13,11 @@
 class ContextFeatureSet(UserDict, Mapping[str, torch.FloatTensor]):
     """Context feature set for biological/chemical context feature vectors."""
 
+    @classmethod
+    def from_dict(cls, data: Mapping[str, Sequence[float]]) -> "ContextFeatureSet":
+        """Build a context feature set whose values are ``(1, n)`` float tensors made from the given vectors."""
+        return cls({name: torch.FloatTensor(vector).view(1, -1) for name, vector in data.items()})
+
     def get_feature_matrix(self, contexts: Iterable[str]) -> torch.FloatTensor:
         """Get the feature matrix for a list of contexts.
 

diff --git a/chemicalx/data/datasetloader.py b/chemicalx/data/datasetloader.py
@@ -1,30 +1,37 @@
 """A module for dataset loaders."""
 
+import csv
 import io
 import json
 import urllib.request
 from abc import ABC, abstractmethod
 from functools import lru_cache
+from itertools import chain
+from pathlib import Path
 from textwrap import dedent
-from typing import Dict, Optional, Tuple, cast
+from typing import ClassVar, Dict, Mapping, Optional, Sequence, Tuple, cast
 
 import numpy as np
 import pandas as pd
+import pystow as pystow
 import torch
 
 from .batchgenerator import BatchGenerator
 from .contextfeatureset import ContextFeatureSet
 from .drugfeatureset import DrugFeatureSet
 from .labeledtriples import LabeledTriples
+from .utils import get_features, get_tdc_synergy
 
 __all__ = [
     "DatasetLoader",
     "RemoteDatasetLoader",
+    "LocalDatasetLoader",
     # Actual datasets
     "DrugCombDB",
     "DrugComb",
     "TwoSides",
     "DrugbankDDI",
+    "OncoPolyPharmacology",
 ]
 
 
@@ -125,6 +132,7 @@ def drug_channels(self) -> int:
         """Get the number of features for each drug."""
         return next(iter(self.get_drug_features().values()))["features"].shape[1]
 
+    @abstractmethod
     def get_labeled_triples(self) -> LabeledTriples:
         """
         Get the labeled triples file from the storage.
@@ -155,7 +163,7 @@ def summarize(self) -> None:
 
 
 class RemoteDatasetLoader(DatasetLoader):
-    """General dataset loader for the integrated drug pair scoring datasets."""
+    """A dataset loader for remote data."""
 
     def __init__(self, dataset_name: str):
         """Instantiate the dataset loader.
@@ -278,3 +286,113 @@ class DrugbankDDI(RemoteDatasetLoader):
     def __init__(self):
         """Instantiate the Drugbank DDI dataset loader."""
         super().__init__("drugbankddi")
+
+
+class LocalDatasetLoader(DatasetLoader, ABC):
+    """A dataset loader that processes and caches data locally as tab-separated files."""
+
+    structures_name: ClassVar[str] = "structures.tsv"
+    features_name: ClassVar[str] = "features.tsv"
+    contexts_name: ClassVar[str] = "contexts.tsv"
+    labels_name: ClassVar[str] = "labels.tsv"
+
+    def __init__(self, directory: Optional[Path] = None):
+        """Instantiate the loader in ``directory`` (default: a pystow folder), preprocessing on a cache miss."""
+        self.directory = Path(directory) if directory else pystow.join("chemicalx", self.__class__.__name__.lower())
+        self.directory.mkdir(parents=True, exist_ok=True)  # a user-supplied directory may not exist yet
+        self.drug_structures_path = self.directory.joinpath(self.structures_name)
+        self.drug_features_path = self.directory.joinpath(self.features_name)
+        self.contexts_path = self.directory.joinpath(self.contexts_name)
+        self.labels_path = self.directory.joinpath(self.labels_name)
+
+        cached = (self.drug_features_path, self.drug_structures_path, self.contexts_path, self.labels_path)
+        if not all(path.exists() for path in cached):
+            self.preprocess()
+
+    @abstractmethod
+    def preprocess(self) -> None:
+        """Download and preprocess the dataset.
+
+        The implementation of this function should write all four of ``self.drug_structures_path``,
+        ``self.drug_features_path``, ``self.contexts_path``, and ``self.labels_path``. The first two are
+        written together by :func:`write_drugs`, the other two by :func:`write_contexts` and
+        :func:`write_labels`, respectively.
+        """
+
+    @lru_cache(maxsize=1)
+    def get_drug_features(self) -> DrugFeatureSet:
+        """Get the drug feature set by pairing the cached structure and feature rows."""
+        with self.drug_structures_path.open() as struct_file, self.drug_features_path.open() as feat_file:
+            struct_reader = csv.reader(struct_file, delimiter="\t")
+            feat_reader = csv.reader(feat_file, delimiter="\t")
+            # write_drugs emits both files in the same order, so rows are paired positionally.
+            data = {
+                drug: {"smiles": smiles, "features": [float(f) for f in features]}
+                for (drug, smiles), (_, *features) in zip(struct_reader, feat_reader)
+            }
+            return DrugFeatureSet.from_dict(data)
+
+    def write_drugs(self, drugs: Mapping[str, str]) -> None:
+        """Write the SMILES strings and computed features of the drugs, one row per drug in sorted order."""
+        with self.drug_structures_path.open("w") as struct_file, self.drug_features_path.open("w") as feat_file:
+            for drug, smiles in sorted(drugs.items()):
+                print(drug, smiles, sep="\t", file=struct_file)
+                print(drug, *get_features(smiles), sep="\t", file=feat_file)
+
+    @lru_cache(maxsize=1)
+    def get_context_features(self) -> ContextFeatureSet:
+        """Get the context feature set from the cached contexts file (key first, then its float values)."""
+        with self.contexts_path.open() as file:
+            return ContextFeatureSet.from_dict(
+                {key: [float(v) for v in values] for key, *values in csv.reader(file, delimiter="\t")}
+            )
+
+    def write_contexts(self, contexts: Mapping[str, Sequence[float]]) -> None:
+        """Write the context feature set, one tab-separated row per context."""
+        with self.contexts_path.open("w") as file:
+            for key, values in contexts.items():
+                print(key, *values, sep="\t", file=file)
+
+    @lru_cache(maxsize=1)
+    def get_labeled_triples(self) -> LabeledTriples:
+        """Get the labeled triples from the cached tab-separated labels file."""
+        return LabeledTriples(pd.read_csv(self.labels_path, sep="\t"))
+
+    def write_labels(self, df: pd.DataFrame) -> None:
+        """Write the labeled triples dataframe as a tab-separated file without the index."""
+        df.to_csv(self.labels_path, index=False, sep="\t")
+
+
+class OncoPolyPharmacology(LocalDatasetLoader):
+    """A large-scale oncology screen of drug-drug synergy from [oneil2016]_.
+
+    .. [oneil2016] O’Neil, J., *et al.* (2016). `An Unbiased Oncology Compound Screen to Identify Novel
+       Combination Strategies <https://doi.org/10.1158/1535-7163.MCT-15-0843>`_. *Molecular Cancer
+       Therapeutics*, 15(6), 1155–1162.
+    """
+
+    def preprocess(self) -> None:
+        """Fetch the TDC release of the screen and cache its drugs, contexts, and labels as TSV files."""
+        tdc_directory = get_tdc_synergy("OncoPolyPharmacology")
+        # NOTE(review): unpickling runs arbitrary code, so this trusts the file downloaded by TDC.
+        df = pd.read_pickle(tdc_directory.joinpath("oncopolypharmacology.pkl"))
+
+        drugs = {}
+        for id_column, smiles_column in (("Drug1_ID", "Drug1"), ("Drug2_ID", "Drug2")):
+            drugs.update(df[[id_column, smiles_column]].values)  # each row is an (identifier, SMILES) pair
+        self.write_drugs(drugs)
+
+        contexts = {}
+        for cell_line, vector in df[["Cell_Line_ID", "Cell_Line"]].values:
+            contexts[cell_line] = vector.round(4).tolist()
+        self.write_contexts(contexts)
+
+        # Select the triple columns and rename them to the names written to the labels file.
+        columns = {
+            "Drug1_ID": "drug_1",
+            "Drug2_ID": "drug_2",
+            "Cell_Line_ID": "context",
+            "Y": "label",
+        }
+        labels_df = df[list(columns)].rename(columns=columns)
+        self.write_labels(labels_df)
diff --git a/chemicalx/data/drugfeatureset.py b/chemicalx/data/drugfeatureset.py
@@ -20,7 +20,7 @@ def from_dict(cls, data: Dict[str, Dict]) -> "DrugFeatureSet":
         return cls(
             {
                 key: {
-                    "features": torch.FloatTensor(features["features"]),
+                    "features": torch.FloatTensor(features["features"]).view(1, -1),
                     "molecule": Molecule.from_smiles(features["smiles"]),
                 }
                 for key, features in data.items()

diff --git a/chemicalx/data/utils.py b/chemicalx/data/utils.py
@@ -0,0 +1,72 @@
+"""Dataset processing utilities."""
+
+import json
+from pathlib import Path
+from typing import Mapping, Sequence
+
+import numpy as np
+import pandas as pd
+import pystow
+import rdkit
+from rdkit.Chem import AllChem, DataStructs
+from tdc.multi_pred import DDI, DrugSyn
+
+__all__ = [
+    "get_tdc_synergy",
+    "get_tdc_ddi",
+    "get_features",
+    "write_drugs_json",
+    "write_triples",
+    "write_contexts_json",
+]
+
+DRUG_FILE_NAME = "drug_set.json"
+CONTEXT_FILE_NAME = "context_set.json"
+LABELS_FILE_NAME = "labeled_triples.tsv"
+
+
+def get_tdc_synergy(name: str) -> Path:
+    """Fetch the named synergy dataset through TDC and return the standardized directory it was stored in."""
+    target = pystow.join("tdc", DrugSyn.__name__.lower())
+    DrugSyn(name=name, path=target.as_posix())  # the loader object itself is not needed, so it is discarded
+    return target
+
+
+def get_tdc_ddi(name: str) -> Path:
+    """Fetch the named DDI dataset through TDC and return the standardized directory it was stored in."""
+    target = pystow.join("tdc", DDI.__name__.lower())
+    DDI(name=name, path=target.as_posix())  # the loader object itself is not needed, so it is discarded
+    return target
+
+
+def get_features(smiles: str):
+    """Compute a hashed Morgan fingerprint (``nBits=256``) for a SMILES string and return it as a list."""
+    mol = rdkit.Chem.MolFromSmiles(smiles)  # NOTE(review): presumably None for unparsable SMILES — confirm
+    fingerprint = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=256)
+    vector = np.zeros((0,), dtype=np.int8)  # presumably resized by the conversion below — TODO confirm
+    DataStructs.ConvertToNumpyArray(fingerprint, vector)
+    return vector.tolist()
+
+
+def write_drugs_json(drugs_raw: Mapping[str, str], output_directory: Path) -> Path:
+    """Serialize every drug's SMILES string and fingerprint features to the drug set JSON file."""
+    target = output_directory / DRUG_FILE_NAME
+    records = {drug: dict(smiles=smiles, features=get_features(smiles)) for drug, smiles in drugs_raw.items()}
+    with target.open("w") as handle:
+        handle.write(json.dumps(records))
+    return target
+
+
+def write_contexts_json(context_set: Mapping[str, Sequence[float]], output_directory: Path) -> Path:
+    """Serialize the context feature vectors to the context set JSON file and return its path."""
+    target = output_directory / CONTEXT_FILE_NAME
+    with target.open("w") as handle:
+        handle.write(json.dumps(context_set))
+    return target
+
+
+def write_triples(df: pd.DataFrame, output_directory: Path, sep: str = "\t") -> Path:
+    """Save the labeled triples dataframe, without its index, to the labels file and return its path."""
+    target = output_directory / LABELS_FILE_NAME
+    df.to_csv(target, sep=sep, index=False)
+    return target
diff --git a/chemicalx/pipeline.py b/chemicalx/pipeline.py
@@ -4,12 +4,13 @@
 import time
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, List, Mapping, Optional, Type, Union
+from typing import Any, List, Mapping, Optional, Sequence, Type, Union
 
 import pandas as pd
 import torch
-from class_resolver import HintOrType
-from sklearn.metrics import roc_auc_score
+from class_resolver import FunctionResolver, HintOrType
+from sklearn.metrics import mean_absolute_error, mean_squared_error, roc_auc_score
+from tabulate import tabulate
 from torch.nn.modules.loss import _Loss
 from torch.optim.optimizer import Optimizer
 from tqdm import trange
@@ -23,6 +24,11 @@
     "pipeline",
 ]
 
+metric_resolver = FunctionResolver([])  # starts empty; the sklearn metrics below are registered with synonyms
+metric_resolver.register(roc_auc_score, synonyms={"roc_auc", "auc_roc", "auroc"})
+metric_resolver.register(mean_squared_error, synonyms={"mse"})
+metric_resolver.register(mean_absolute_error, synonyms={"mae"})
+
 
 @dataclass
 class Result:
@@ -33,11 +39,11 @@ class Result:
     losses: List[float]
     train_time: float
     evaluation_time: float
-    roc_auc: float
+    metrics: Mapping[str, float]
 
     def summarize(self) -> None:
         """Print results to the console."""
-        print(f"AUC-ROC: {self.roc_auc:0.3f}")
+        print(tabulate(sorted(self.metrics.items()), headers=["Metric", "Value"]))
 
     def save(self, directory: Union[str, Path]) -> None:
         """Save the results to a directory."""
@@ -50,9 +56,7 @@ def save(self, directory: Union[str, Path]) -> None:
         directory.joinpath("results.json").write_text(
             json.dumps(
                 {
-                    "evaluation": {
-                        "auc_roc": self.roc_auc,
-                    },
+                    "evaluation": self.metrics,
                     "losses": self.losses,
                     "training_time": self.train_time,
                     "evaluation_time": self.evaluation_time,
@@ -79,6 +83,7 @@ def pipeline(
     drug_molecules: bool,
     train_size: Optional[float] = None,
     random_state: Optional[int] = None,
+    metrics: Optional[Sequence[str]] = None,
 ) -> Result:
     """Run the training and evaluation pipeline.
 
@@ -118,6 +123,8 @@ def pipeline(
         The ratio of training triples. Default is 0.8 if None is passed.
     :param random_state:
         The random seed for splitting the triples. Default is 42. Set to none for no fixed seed.
+    :param metrics:
+        The names of the metrics to compute, looked up in ``metric_resolver``. Only ROC-AUC is used if None.
     :returns:
         A result object with the trained model and evaluation results
     """
@@ -165,11 +172,18 @@ def pipeline(
 
     predictions_df = pd.concat(predictions)
 
+    if metrics is None:
+        metric_dict = {"roc_auc": roc_auc_score}
+    else:
+        metric_dict = {name: metric_resolver.lookup(name) for name in metrics}
+
     return Result(
         model=model,
         predictions=predictions_df,
         losses=losses,
         train_time=train_time,
         evaluation_time=evaluation_time,
-        roc_auc=roc_auc_score(predictions_df["label"], predictions_df["prediction"]),
+        metrics={
+            name: func(predictions_df["label"], predictions_df["prediction"]) for name, func in metric_dict.items()
+        },
     )