Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional datasets #61

Merged
merged 15 commits into from
Jan 25, 2022
10 changes: 8 additions & 2 deletions chemicalx/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
DrugbankDDI,
DrugComb,
DrugCombDB,
LocalDatasetLoader,
OncoPolyPharmacology,
RemoteDatasetLoader,
TwoSides,
)
Expand All @@ -20,13 +22,17 @@
"DrugFeatureSet",
"DrugPairBatch",
"LabeledTriples",
# Datasets
# Abstract datasets
"dataset_resolver",
"DatasetLoader",
"RemoteDatasetLoader",
"LocalDatasetLoader",
# Datasets
"DrugbankDDI",
"TwoSides",
"DrugComb",
"DrugCombDB",
"OncoPolyPharmacology",
]

# Resolver used to look up dataset loader classes among the subclasses of ``DatasetLoader``.
# NOTE(review): the two assignments below look like the before/after pair of a diff hunk.
# When both run, only the second one takes effect because it rebinds the same name; it
# additionally skips ``LocalDatasetLoader``.
dataset_resolver = Resolver.from_subclasses(base=DatasetLoader, skip={RemoteDatasetLoader})
dataset_resolver = Resolver.from_subclasses(base=DatasetLoader, skip={RemoteDatasetLoader, LocalDatasetLoader})
7 changes: 6 additions & 1 deletion chemicalx/data/contextfeatureset.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""A module for the context feature set class."""

from collections import UserDict
from typing import Iterable, Mapping
from typing import Iterable, Mapping, Sequence

import torch

Expand All @@ -13,6 +13,11 @@
class ContextFeatureSet(UserDict, Mapping[str, torch.FloatTensor]):
"""Context feature set for biological/chemical context feature vectors."""

@classmethod
def from_dict(cls, data: Mapping[str, Sequence[float]]) -> "ContextFeatureSet":
"""Generate a context feature set from a data dictionary."""
return cls({key: torch.FloatTensor(values).view(1, -1) for key, values in data.items()})

def get_feature_matrix(self, contexts: Iterable[str]) -> torch.FloatTensor:
"""Get the feature matrix for a list of contexts.

Expand Down
122 changes: 120 additions & 2 deletions chemicalx/data/datasetloader.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,37 @@
"""A module for dataset loaders."""

import csv
import io
import json
import urllib.request
from abc import ABC, abstractmethod
from functools import lru_cache
from itertools import chain
from pathlib import Path
from textwrap import dedent
from typing import Dict, Optional, Tuple, cast
from typing import ClassVar, Dict, Mapping, Optional, Sequence, Tuple, cast

import numpy as np
import pandas as pd
import pystow as pystow
import torch

from .batchgenerator import BatchGenerator
from .contextfeatureset import ContextFeatureSet
from .drugfeatureset import DrugFeatureSet
from .labeledtriples import LabeledTriples
from .utils import get_features, get_tdc_synergy

# Public names of this module (what ``from .datasetloader import *`` exposes).
__all__ = [
    # Loader base classes
    "DatasetLoader",
    "RemoteDatasetLoader",
    "LocalDatasetLoader",
    # Actual datasets
    "DrugCombDB",
    "DrugComb",
    "TwoSides",
    "DrugbankDDI",
    "OncoPolyPharmacology",
]


Expand Down Expand Up @@ -125,6 +132,7 @@ def drug_channels(self) -> int:
"""Get the number of features for each drug."""
return next(iter(self.get_drug_features().values()))["features"].shape[1]

@abstractmethod
def get_labeled_triples(self) -> LabeledTriples:
"""
Get the labeled triples file from the storage.
Expand Down Expand Up @@ -155,7 +163,7 @@ def summarize(self) -> None:


class RemoteDatasetLoader(DatasetLoader):
"""General dataset loader for the integrated drug pair scoring datasets."""
"""A dataset loader for remote data."""

def __init__(self, dataset_name: str):
"""Instantiate the dataset loader.
Expand Down Expand Up @@ -278,3 +286,113 @@ class DrugbankDDI(RemoteDatasetLoader):
def __init__(self):
"""Instantiate the Drugbank DDI dataset loader."""
super().__init__("drugbankddi")


class LocalDatasetLoader(DatasetLoader, ABC):
    """A dataset loader that processes and caches data locally.

    The dataset is stored as four tab-separated files inside ``self.directory``:

    - ``structures.tsv``: one ``drug<TAB>smiles`` row per drug
    - ``features.tsv``: one row per drug, the drug identifier followed by its feature values
    - ``contexts.tsv``: one row per context, the context key followed by its feature values
    - ``labels.tsv``: the labeled triples table, written with a header row

    If any of these files is missing when the loader is instantiated,
    :meth:`preprocess` is called to create them.
    """

    structures_name: ClassVar[str] = "structures.tsv"
    features_name: ClassVar[str] = "features.tsv"
    contexts_name: ClassVar[str] = "contexts.tsv"
    labels_name: ClassVar[str] = "labels.tsv"

    def __init__(self, directory: Optional[Path] = None):
        """Instantiate the local dataset loader.

        :param directory:
            The directory in which the processed files are cached. If not given,
            a :mod:`pystow` directory named after the lowercased class name is used.
        """
        self.directory = Path(directory) if directory else pystow.join("chemicalx", self.__class__.__name__.lower())
        # A caller-supplied directory may not exist yet; without it the first write would fail.
        self.directory.mkdir(parents=True, exist_ok=True)
        self.drug_structures_path = self.directory.joinpath(self.structures_name)
        self.drug_features_path = self.directory.joinpath(self.features_name)
        self.contexts_path = self.directory.joinpath(self.contexts_name)
        self.labels_path = self.directory.joinpath(self.labels_name)

        # Loaded data is memoized per instance. A ``functools.lru_cache`` on the methods
        # would share one cache slot between all loaders and keep the instances alive.
        self._drug_features: Optional[DrugFeatureSet] = None
        self._context_features: Optional[ContextFeatureSet] = None
        self._labeled_triples: Optional[LabeledTriples] = None

        if any(
            not path.exists()
            for path in (self.drug_features_path, self.drug_structures_path, self.contexts_path, self.labels_path)
        ):
            self.preprocess()

    @abstractmethod
    def preprocess(self) -> None:
        """Download and preprocess the dataset.

        The implementation of this function should write to all four of
        ``self.drug_structures_path``, ``self.drug_features_path``, ``self.contexts_path``,
        and ``self.labels_path``. :meth:`write_drugs` writes the first two files,
        :meth:`write_contexts` the third, and :meth:`write_labels` the last.
        """

    def get_drug_features(self) -> DrugFeatureSet:
        """Get the drug feature set.

        :returns: The drug feature set built from the structures and features files.
            The result is loaded once and then reused for this instance.
        :raises ValueError: If the two files do not describe the same drugs in the same order.
        """
        if self._drug_features is None:
            with self.drug_structures_path.open() as struct_file, self.drug_features_path.open() as feat_file:
                struct_rows = list(csv.reader(struct_file, delimiter="\t"))
                feat_rows = list(csv.reader(feat_file, delimiter="\t"))
            # The two files are joined positionally, so a length or identifier mismatch
            # would silently attach features to the wrong molecule.
            if len(struct_rows) != len(feat_rows):
                raise ValueError(
                    f"{self.drug_structures_path} has {len(struct_rows)} rows "
                    f"but {self.drug_features_path} has {len(feat_rows)}"
                )
            data: Dict[str, Dict] = {}
            for (drug, smiles), (feat_drug, *features) in zip(struct_rows, feat_rows):
                if drug != feat_drug:
                    raise ValueError(f"drug identifier mismatch between cached files: {drug!r} != {feat_drug!r}")
                data[drug] = {"smiles": smiles, "features": [float(f) for f in features]}
            self._drug_features = DrugFeatureSet.from_dict(data)
        return self._drug_features

    def write_drugs(self, drugs: Mapping[str, str]) -> None:
        """Write the drug data.

        :param drugs: A mapping from drug identifiers to SMILES strings. The drugs are
            written in sorted order to both the structures and the features file.
        """
        # Featurize everything before opening the files, so a failure on one molecule
        # cannot leave half-written files behind that a later run would treat as complete.
        rows = [(drug, smiles, get_features(smiles)) for drug, smiles in sorted(drugs.items())]
        with self.drug_structures_path.open("w") as struct_file, self.drug_features_path.open("w") as feat_file:
            for drug, smiles, features in rows:
                print(drug, smiles, sep="\t", file=struct_file)
                print(drug, *features, sep="\t", file=feat_file)
        self._drug_features = None  # the memoized value no longer matches the files

    def get_context_features(self) -> ContextFeatureSet:
        """Get the context feature set.

        :returns: The context feature set built from the contexts file. The result is
            loaded once and then reused for this instance.
        """
        if self._context_features is None:
            with self.contexts_path.open() as file:
                self._context_features = ContextFeatureSet.from_dict(
                    {key: [float(v) for v in values] for key, *values in csv.reader(file, delimiter="\t")}
                )
        return self._context_features

    def write_contexts(self, contexts: Mapping[str, Sequence[float]]) -> None:
        """Write the context feature set.

        :param contexts: A mapping from context keys to their feature values.
        """
        with self.contexts_path.open("w") as file:
            for key, values in contexts.items():
                print(key, *values, sep="\t", file=file)
        self._context_features = None  # the memoized value no longer matches the file

    def get_labeled_triples(self) -> LabeledTriples:
        """Get the labeled triples dataframe.

        :returns: The labeled triples read from the labels file. The result is loaded
            once and then reused for this instance.
        """
        if self._labeled_triples is None:
            self._labeled_triples = LabeledTriples(pd.read_csv(self.labels_path, sep="\t"))
        return self._labeled_triples

    def write_labels(self, df: pd.DataFrame) -> None:
        """Write the labeled triples dataframe.

        :param df: The dataframe to store as a tab-separated file (without the index).
        """
        df.to_csv(self.labels_path, index=False, sep="\t")
        self._labeled_triples = None  # the memoized value no longer matches the file


class OncoPolyPharmacology(LocalDatasetLoader):
    """Loader for the large-scale oncology screen of drug-drug synergy from [oneil2016]_.

    .. [oneil2016] O’Neil, J., *et al.* (2016). `An Unbiased Oncology Compound Screen to Identify Novel
        Combination Strategies <https://doi.org/10.1158/1535-7163.MCT-15-0843>`_. *Molecular Cancer
        Therapeutics*, 15(6), 1155–1162.
    """

    def preprocess(self) -> None:
        """Fetch the OncoPolyPharmacology data via TDC and write the local cache files.

        Drugs, contexts (cell lines), and labeled triples are extracted from the
        pickled dataframe and handed to the corresponding ``write_*`` methods.
        """
        source_directory = get_tdc_synergy("OncoPolyPharmacology")
        # NOTE(review): unpickling runs arbitrary code if the file is tampered with;
        # this relies on the TDC download being trustworthy.
        frame = pd.read_pickle(source_directory.joinpath("oncopolypharmacology.pkl"))

        # Both drug columns contribute (identifier, SMILES) pairs to one mapping.
        first_drugs = frame[["Drug1_ID", "Drug1"]].values
        second_drugs = frame[["Drug2_ID", "Drug2"]].values
        self.write_drugs(dict(chain(first_drugs, second_drugs)))

        # Each cell line's feature values are rounded to 4 decimals before being stored.
        contexts = {}
        for cell_line, vector in frame[["Cell_Line_ID", "Cell_Line"]].values:
            contexts[cell_line] = vector.round(4).tolist()
        self.write_contexts(contexts)

        # Select the triple columns and rename them to the names used for labeled triples.
        column_names = {
            "Drug1_ID": "drug_1",
            "Drug2_ID": "drug_2",
            "Cell_Line_ID": "context",
            "Y": "label",
        }
        self.write_labels(frame[list(column_names)].rename(columns=column_names))
2 changes: 1 addition & 1 deletion chemicalx/data/drugfeatureset.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def from_dict(cls, data: Dict[str, Dict]) -> "DrugFeatureSet":
return cls(
{
key: {
"features": torch.FloatTensor(features["features"]),
"features": torch.FloatTensor(features["features"]).view(1, -1),
"molecule": Molecule.from_smiles(features["smiles"]),
}
for key, features in data.items()
Expand Down
71 changes: 71 additions & 0 deletions chemicalx/data/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""Dataset processing utilities."""

import json
from pathlib import Path
from typing import Mapping, Sequence

import numpy as np
import pandas as pd
import pystow
import rdkit
from rdkit.Chem import AllChem, DataStructs
from tdc.multi_pred import DDI, DrugSyn

# Public names of this module. ``get_tdc_ddi`` is listed next to its sibling
# ``get_tdc_synergy`` so that both TDC download helpers are exported.
__all__ = [
    "get_tdc_synergy",
    "get_tdc_ddi",
    "get_features",
    "write_drugs_json",
    "write_triples",
    "write_contexts_json",
]

# File names used by the ``write_*`` helpers, relative to their output directory.
DRUG_FILE_NAME = "drug_set.json"
CONTEXT_FILE_NAME = "context_set.json"
LABELS_FILE_NAME = "labeled_triples.tsv"


def get_tdc_synergy(name: str) -> Path:
    """Fetch a synergy dataset through TDC and return the directory it was stored in.

    :param name: The name of the synergy dataset, passed on to TDC's ``DrugSyn``.
    :returns: The :mod:`pystow` directory handed to TDC as its storage path.
    """
    target = pystow.join("tdc", DrugSyn.__name__.lower())
    # The loader is instantiated only for its side effect; the object itself is discarded.
    DrugSyn(path=target.as_posix(), name=name)
    return target


def get_tdc_ddi(name: str) -> Path:
    """Fetch a DDI dataset through TDC and return the directory it was stored in.

    :param name: The name of the DDI dataset, passed on to TDC's ``DDI``.
    :returns: The :mod:`pystow` directory handed to TDC as its storage path.
    """
    target = pystow.join("tdc", DDI.__name__.lower())
    # The loader is instantiated only for its side effect; the object itself is discarded.
    DDI(path=target.as_posix(), name=name)
    return target


def get_features(smiles: str, radius: int = 2, n_bits: int = 256) -> list:
    """Get a hashed Morgan fingerprint vector for the given molecule.

    :param smiles: The SMILES string of the molecule.
    :param radius: The radius of the Morgan fingerprint. Defaults to 2.
    :param n_bits: The length the fingerprint is hashed to. Defaults to 256.
    :returns: The fingerprint as a plain list of integers.
    :raises ValueError: If RDKit cannot parse the SMILES string.
    """
    molecule = rdkit.Chem.MolFromSmiles(smiles)
    # RDKit signals a parse failure by returning None instead of raising; fail here with
    # a readable message rather than with an argument error inside the fingerprint call.
    if molecule is None:
        raise ValueError(f"could not parse SMILES string: {smiles!r}")
    fingerprint = AllChem.GetHashedMorganFingerprint(molecule, radius, nBits=n_bits)
    # ConvertToNumpyArray resizes the destination in place, so an empty placeholder suffices.
    array = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fingerprint, array)
    return array.tolist()


def write_drugs_json(drugs_raw: Mapping[str, str], output_directory: Path) -> Path:
    """Featurize the drugs and dump them as JSON into the output directory.

    :param drugs_raw: A mapping from drug identifiers to SMILES strings.
    :param output_directory: The directory in which ``DRUG_FILE_NAME`` is written.
    :returns: The path of the written file.
    """
    payload = {}
    for drug, smiles in drugs_raw.items():
        payload[drug] = {"smiles": smiles, "features": get_features(smiles)}
    destination = output_directory.joinpath(DRUG_FILE_NAME)
    with destination.open("w") as handle:
        json.dump(payload, handle)
    return destination


def write_contexts_json(context_set: Mapping[str, Sequence[float]], output_directory: Path) -> Path:
    """Dump the context features as JSON into the output directory.

    :param context_set: A mapping from context keys to their feature values.
    :param output_directory: The directory in which ``CONTEXT_FILE_NAME`` is written.
    :returns: The path of the written file.
    """
    destination = output_directory.joinpath(CONTEXT_FILE_NAME)
    with destination.open("w") as handle:
        json.dump(context_set, handle)
    return destination


def write_triples(df: pd.DataFrame, output_directory: Path, sep: str = "\t") -> Path:
    """Store the labeled triples table as a delimited text file.

    :param df: The dataframe of labeled triples; its index is not written.
    :param output_directory: The directory in which ``LABELS_FILE_NAME`` is written.
    :param sep: The column separator. Defaults to a tab.
    :returns: The path of the written file.
    """
    destination = output_directory.joinpath(LABELS_FILE_NAME)
    df.to_csv(destination, sep=sep, index=False)
    return destination
32 changes: 23 additions & 9 deletions chemicalx/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any, List, Mapping, Optional, Type, Union
from typing import Any, List, Mapping, Optional, Sequence, Type, Union

import pandas as pd
import torch
from class_resolver import HintOrType
from sklearn.metrics import roc_auc_score
from class_resolver import FunctionResolver, HintOrType
from sklearn.metrics import mean_absolute_error, mean_squared_error, roc_auc_score
from tabulate import tabulate
from torch.nn.modules.loss import _Loss
from torch.optim.optimizer import Optimizer
from tqdm import trange
Expand All @@ -23,6 +24,11 @@
"pipeline",
]

# Registry of evaluation metrics. It starts empty and each scikit-learn metric function is
# registered together with the short synonyms given below.
metric_resolver = FunctionResolver([])
metric_resolver.register(roc_auc_score, synonyms={"roc_auc", "auc_roc", "auroc"})
metric_resolver.register(mean_squared_error, synonyms={"mse"})
metric_resolver.register(mean_absolute_error, synonyms={"mae"})


@dataclass
class Result:
Expand All @@ -33,11 +39,11 @@ class Result:
losses: List[float]
train_time: float
evaluation_time: float
roc_auc: float
metrics: Mapping[str, float]

def summarize(self) -> None:
"""Print results to the console."""
print(f"AUC-ROC: {self.roc_auc:0.3f}")
print(tabulate(sorted(self.metrics.items()), headers=["Metric", "Value"]))

def save(self, directory: Union[str, Path]) -> None:
"""Save the results to a directory."""
Expand All @@ -50,9 +56,7 @@ def save(self, directory: Union[str, Path]) -> None:
directory.joinpath("results.json").write_text(
json.dumps(
{
"evaluation": {
"auc_roc": self.roc_auc,
},
"evaluation": self.metrics,
"losses": self.losses,
"training_time": self.train_time,
"evaluation_time": self.evaluation_time,
Expand All @@ -79,6 +83,7 @@ def pipeline(
drug_molecules: bool,
train_size: Optional[float] = None,
random_state: Optional[int] = None,
metrics: Optional[Sequence[str]] = None,
) -> Result:
"""Run the training and evaluation pipeline.

Expand Down Expand Up @@ -118,6 +123,8 @@ def pipeline(
The ratio of training triples. Default is 0.8 if None is passed.
:param random_state:
The random seed for splitting the triples. Default is 42. Set to none for no fixed seed.
:param metrics:
The list of metrics to use.
:returns:
A result object with the trained model and evaluation results
"""
Expand Down Expand Up @@ -165,11 +172,18 @@ def pipeline(

predictions_df = pd.concat(predictions)

if metrics is None:
metric_dict = {"roc_auc": roc_auc_score}
else:
metric_dict = {name: metric_resolver.lookup(name) for name in metrics}

return Result(
model=model,
predictions=predictions_df,
losses=losses,
train_time=train_time,
evaluation_time=evaluation_time,
roc_auc=roc_auc_score(predictions_df["label"], predictions_df["prediction"]),
metrics={
name: func(predictions_df["label"], predictions_df["prediction"]) for name, func in metric_dict.items()
},
)
Loading