# preprocess_csg.py

from collections import OrderedDict
from pathlib import Path
from pubchempy import *
from rdkit import Chem
from rdkit.Chem import MolFromSmiles
from utils import *
import argparse
import candle
import csv
import h5py
import json, pickle
import math
import matplotlib.pyplot as plt
import networkx as nx
import numbers
import numpy as np
import os
import pandas as pd
import pickle
import random
import sys


# Absolute path of the directory that contains this file.
fdir = os.path.abspath(os.path.dirname(__file__))

# Relative directory (trailing slash included) that is prepended to the raw
# input file names opened by the loaders in this module.
folder = "data/"

"""
The following code will convert the SMILES format into onehot format (comment from tCNN)
The following code will convert the SMILES format into graph format (relevant comment)
"""

def atom_features(atom):
    """Build the feature vector of a single atom.

    The vector is the concatenation of five segments, in this order:
      1. one-hot of the element symbol (44 slots, the last one is 'Unknown'
         and is used for any symbol not in the list),
      2. one-hot of the atom degree (0..10; a degree outside this range
         raises, because ``one_of_k_encoding`` is strict),
      3. one-hot of the total number of hydrogens (0..10, out-of-range values
         fall into the last slot),
      4. one-hot of the implicit valence (0..10, out-of-range values fall
         into the last slot),
      5. a single aromaticity flag.

    Args:
        atom: object exposing GetSymbol(), GetDegree(), GetTotalNumHs(),
            GetImplicitValence() and GetIsAromatic() (presumably an RDKit
            atom; see the caller).

    Returns:
        1-D numpy array with 44 + 11 + 11 + 11 + 1 = 78 entries.
    """
    symbols = [
        'C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe',
        'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd', 'Co',
        'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr',
        'Cr', 'Pt', 'Hg', 'Pb', 'Unknown'
    ]
    counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

    symbol_part = one_of_k_encoding_unk(atom.GetSymbol(), symbols)
    degree_part = one_of_k_encoding(atom.GetDegree(), counts)
    hydrogen_part = one_of_k_encoding_unk(atom.GetTotalNumHs(), counts)
    valence_part = one_of_k_encoding_unk(atom.GetImplicitValence(), counts)
    aromatic_part = [atom.GetIsAromatic()]

    return np.array(
        symbol_part + degree_part + hydrogen_part + valence_part + aromatic_part)


def one_of_k_encoding(x, allowable_set):
    """Strict one-hot encoding of ``x`` over ``allowable_set``.

    Args:
        x: value to encode.
        allowable_set: sequence of admissible values.

    Returns:
        List of booleans, one per element of ``allowable_set``; an entry is
        True where that element equals ``x``.

    Raises:
        Exception: if ``x`` is not contained in ``allowable_set``.
    """
    if x in allowable_set:
        return [x == candidate for candidate in allowable_set]
    raise Exception("input {0} not in allowable set{1}:".format(x, allowable_set))


def one_of_k_encoding_unk(x, allowable_set):
    """One-hot encode ``x``, routing unknown values to the last slot.

    Args:
        x: value to encode.
        allowable_set: sequence of admissible values; its last element acts
            as the catch-all for anything not listed.

    Returns:
        List of booleans, one per element of ``allowable_set``.
    """
    target = x if x in allowable_set else allowable_set[-1]
    return [target == candidate for candidate in allowable_set]


def smile_to_graph(smile):
    """Convert a SMILES string into a graph description of the molecule.

    Args:
        smile: SMILES string of one molecule.

    Returns:
        Tuple ``(c_size, features, edge_index)`` where
          * ``c_size`` is the number of atoms in the molecule,
          * ``features`` is a list with one feature vector per atom, each
            divided by the sum of its entries,
          * ``edge_index`` is a list of ``[source, target]`` atom-index pairs.

    Raises:
        ValueError: if RDKit cannot parse ``smile``.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending string instead of an AttributeError further down.
        raise ValueError(f"RDKit could not parse SMILES string: {smile!r}")

    c_size = mol.GetNumAtoms()  # num atoms in molecule

    features = []  # one normalized feature vector per atom
    for atom in mol.GetAtoms():
        feature = atom_features(atom)
        features.append(feature / sum(feature))

    # One [begin, end] pair per bond.
    edges = [[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()]
             for bond in mol.GetBonds()]

    # to_directed() represents every undirected bond as two directed edges
    # (a->b and b->a), which is why edge_index has twice as many entries as
    # edges.
    g = nx.Graph(edges).to_directed()
    edge_index = [[e1, e2] for e1, e2 in g.edges]

    return c_size, features, edge_index


def load_drug_smile():
    """Load drug names and SMILES from ``drug_smiles.csv`` and build graphs.

    The file is read from ``folder``; the first line is skipped as a header.
    Column 0 is used as the drug name and column 2 as the SMILES string.

    Returns:
        Tuple ``(drug_dict, drug_smile, smile_graph)`` where
          * ``drug_dict`` maps each drug name to a running index assigned in
            order of first appearance,
          * ``drug_smile`` is the list of SMILES strings, one per data row,
          * ``smile_graph`` maps each SMILES string to the
            ``(c_size, features, edge_index)`` tuple from ``smile_to_graph``.
    """
    drug_dict = {}   # {drug name: index in order of first appearance}
    drug_smile = []  # SMILES of every data row, in file order

    # Context manager so the file handle is closed (the previous version
    # leaked it).
    with open(folder + "drug_smiles.csv") as f:  # generated by download_smiles()
        reader = csv.reader(f)
        next(reader, None)  # skip the header row, tolerate an empty file

        for item in reader:
            name = item[0]   # Drug name
            smile = item[2]  # Drug canonical SMILES

            if name not in drug_dict:
                drug_dict[name] = len(drug_dict)
            # NOTE(review): a SMILES is appended for every row, so
            # drug_smile[drug_dict[name]] is only the right SMILES when drug
            # names do not repeat in the file -- confirm against the data.
            drug_smile.append(smile)

    smile_graph = {}
    for smile in drug_smile:
        if smile not in smile_graph:  # convert each distinct SMILES once
            smile_graph[smile] = smile_to_graph(smile)

    return drug_dict, drug_smile, smile_graph


"""
The following part will prepare the mutation features for the cell.
"""


def save_cell_mut_matrix():
    """Build a binary cell-line x genetic-feature matrix.

    Reads ``PANCANCER_Genetic_feature.csv`` from ``folder`` (first line is
    skipped as a header). For every row, column 1 is used as the cell-line
    id, column 5 as the genetic feature name and column 6 as the integer
    "is mutated" flag. Row and column indices are assigned in order of first
    appearance in the file.

    Despite its name, this function does not write anything to disk.

    Returns:
        Tuple ``(cell_dict, cell_feature)`` where ``cell_dict`` maps each
        cell-line id to its row index and ``cell_feature`` is a float numpy
        array of shape ``(n_cells, n_features)`` holding 1 where the flag
        equals 1 and 0 elsewhere.
    """
    cell_dict = {}    # {cell_id: row index in the matrix}
    mut_dict = {}     # {genetic feature: column index in the matrix}
    matrix_list = []  # (row, col) coordinates where the flag is set

    # Context manager so the file handle is closed (the previous version
    # leaked it).
    with open(folder + "PANCANCER_Genetic_feature.csv") as f:
        reader = csv.reader(f)
        next(reader)  # skip header

        for item in reader:
            cell_id = item[1]          # CCL ID (cosmic_sample_id)
            mut = item[5]              # mutation (genetic_feature)
            is_mutated = int(item[6])  # whether it's mutated (is_mutated)

            # setdefault returns the existing index, or stores and returns
            # the next free one (len() is evaluated before the insertion).
            col = mut_dict.setdefault(mut, len(mut_dict))
            row = cell_dict.setdefault(cell_id, len(cell_dict))

            if is_mutated == 1:
                matrix_list.append((row, col))

    # 2-D array [cells, genetic features], all zeros to start with
    cell_feature = np.zeros((len(cell_dict), len(mut_dict)))

    if matrix_list:
        rows, cols = zip(*matrix_list)
        cell_feature[list(rows), list(cols)] = 1

    return cell_dict, cell_feature


"""
This part extracts the drug-cell interaction strength. It contains IC50, AUC, Max conc, RMSE, and Z_score.
"""


"""
The functions below generate datasets for CSG (data from July 2020) - Start
"""


def read_df(fpath, sep="\t"):
    """Read a table from disk into a DataFrame.

    Files whose *name* contains "parquet" are read with ``pd.read_parquet``;
    everything else is read with ``pd.read_csv``.

    Args:
        fpath: path (str or Path) of the file to read.
        sep: column separator for the CSV reader (ignored for parquet).

    Returns:
        The loaded DataFrame.

    Raises:
        FileNotFoundError: if ``fpath`` does not exist. (This used to be an
            ``assert``, which is skipped when Python runs with ``-O``.)
    """
    path = Path(fpath)
    if not path.exists():
        raise FileNotFoundError(f"File {fpath} was not found.")

    # Only look at the file name: a directory that merely has "parquet" in
    # its name must not turn a CSV file into a parquet read.
    if "parquet" in path.name:
        return pd.read_parquet(fpath)

    # NOTE(review): `na_values` is not defined in the visible part of this
    # module; presumably it comes from `from utils import *` -- confirm.
    return pd.read_csv(fpath, sep=sep, na_values=na_values)


def scale_fea(xdata, scaler_name='stnd', dtype=np.float32, verbose=False):
    """Return a scaled copy of a feature DataFrame.

    Args:
        xdata: DataFrame of features to scale.
        scaler_name: 'stnd' (StandardScaler), 'minmax' (MinMaxScaler),
            'rbst' (RobustScaler) or None. None and unsupported names leave
            the data untouched.
        dtype: dtype of the returned DataFrame.
        verbose: if True, print a message when ``scaler_name`` is None.

    Returns:
        ``xdata`` itself when no scaling is applied; otherwise a new
        DataFrame with the same columns and the same index as ``xdata``.
        Keeping the index means the result still aligns with other columns
        of the source frame, e.g. in ``pd.concat(..., axis=1)``.
    """
    if scaler_name is None:
        if verbose:
            print('Scaler is None (not scaling).')
        return xdata

    if scaler_name not in ('stnd', 'minmax', 'rbst'):
        print(f'The specified scaler {scaler_name} is not supported (not scaling).')
        return xdata

    # Imported only when a scaler is really needed, so the pass-through
    # paths above do not require scikit-learn.
    from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
    scalers = {
        'stnd': StandardScaler,
        'minmax': MinMaxScaler,
        'rbst': RobustScaler,
    }
    scaler = scalers[scaler_name]()

    return pd.DataFrame(scaler.fit_transform(xdata), index=xdata.index,
                        columns=xdata.columns, dtype=dtype)


def _first_match(datadir, pattern, exclude=None):
    """Return the first file in `datadir` matching `pattern` (optionally skipping paths containing `exclude`)."""
    matches = [p for p in Path(datadir).glob(pattern)
               if exclude is None or exclude not in str(p)]
    if not matches:
        raise FileNotFoundError(
            f"No file matching '{pattern}' was found in {datadir}.")
    return matches[0]


def _read_id_file(path):
    """Read one integer id per non-blank line from `path`."""
    with open(path) as f:
        return [int(line) for line in f if line.strip()]


def _extract_data_vars(df, d_dict, c_dict, d_smile, c_feature):
    """Turn (drug, cell, response) rows of `df` into aligned arrays (xd, xc, y)."""
    xd = []            # SMILES string per response row
    xc = []            # cell feature vector per response row
    y = []             # response value per response row
    nan_rsp_list = []  # rows whose response is NaN (reported, not dropped)
    miss_cell = []     # cells without features
    miss_drug = []     # drugs without SMILES

    for i, (drug, cell, rsp) in enumerate(df.values.tolist()):
        if i > 0 and (i % 15000 == 0):
            print(i)  # progress indicator
        if np.isnan(rsp):
            nan_rsp_list.append([drug, cell, rsp])
        # If drug and cell features are available
        if drug in d_dict and cell in c_dict:
            xd.append(d_smile[d_dict[drug]])
            xc.append(c_feature[c_dict[cell]])
            y.append(rsp)
        elif cell not in c_dict:
            miss_cell.append(cell)
        else:
            miss_drug.append(drug)

    if miss_cell or miss_drug:
        # The caller selects train/val/test samples by *position* in these
        # arrays, so silently skipping a row would shift every later sample.
        # (The previous version stopped in an ipdb breakpoint here.)
        raise ValueError(
            f"{len(miss_cell)} response rows refer to cells without features "
            f"and {len(miss_drug)} to drugs without SMILES; "
            f"first missing cells: {miss_cell[:5]}, "
            f"first missing drugs: {miss_drug[:5]}.")
    if nan_rsp_list:
        print(f"Warning: {len(nan_rsp_list)} response rows have a NaN response.")

    return np.asarray(xd), np.asarray(xc), np.asarray(y)


def _prepare_cs_parts(args):
    """Load the cross-study ("cs") data and return (root, smile_graph, parts)."""
    datadir = Path(args.datadir)
    root = datadir/f"data_split_{args.split}"
    os.makedirs(root, exist_ok=True)

    # Response
    rsp_df = pd.read_csv(_first_match(datadir, "rsp*.csv", exclude="full"))
    rsp_df = rsp_df[["DrugID", "CancID", "AUC"]]
    print(rsp_df[["CancID", "DrugID"]].nunique())

    # Drugs
    smi = pd.read_csv(_first_match(datadir, "smiles*.csv"))
    d_dict = {v: i for i, v in enumerate(smi["DrugID"].values)}  # {drug id: row}
    d_smile = smi["SMILES"].values
    smile_graph = {}  # {SMILES: (c_size, features, edge_index)}
    for smile in d_smile:
        if smile not in smile_graph:  # convert each distinct SMILES once
            smile_graph[smile] = smile_to_graph(smile)

    print("Unique drugs: {}".format(len(d_dict)))
    print("Unique smiles: {}".format(len(smile_graph)))

    # Cells (gene expression); the first column is expected to be "CancID"
    ge = read_df(_first_match(datadir, "ge*.parquet"))

    # Use landmark genes
    use_lincs = True
    if use_lincs:
        with open(datadir/"../landmark_genes") as f:
            genes = ["ge_" + str(line.rstrip()) for line in f]
        available = set(ge.columns[1:])
        # Keep the order of the landmark file (duplicates removed). The
        # previous list(set(...)) produced a column order that could differ
        # from one run to the next.
        genes = [g for g in dict.fromkeys(genes) if g in available]
        print("Genes count: {}".format(len(genes)))
        ge = ge[["CancID"] + genes]

    # A positional index guarantees that the scaled features and the CancID
    # column line up row by row in the concat below.
    ge = ge.reset_index(drop=True)

    # Scale
    ge_xdata = ge.iloc[:, 1:]
    ge_xdata_scaled = scale_fea(ge_xdata, scaler_name='stnd', dtype=np.float32, verbose=False)
    ge = pd.concat([ge[["CancID"]], ge_xdata_scaled], axis=1)

    c_dict = {v: i for i, v in enumerate(ge["CancID"].values)}  # {cell id: row}
    c_feature = ge.iloc[:, 1:].values

    # Data splits
    splitdir = Path(os.path.join(args.datadir))/"splits"
    tr_id = _read_id_file(splitdir/f'split_{args.split}_tr_id')
    te_id = _read_id_file(splitdir/f'split_{args.split}_te_id')

    # Val data is carved out of the train ids.
    # NOTE(review): no random_state is passed, so the train/val partition
    # changes from run to run -- confirm whether that is intended.
    from sklearn.model_selection import train_test_split
    tr_id, vl_id = train_test_split(tr_id, test_size=0.11)

    tr_data = rsp_df.loc[tr_id]
    vl_data = rsp_df.loc[vl_id]
    te_data = rsp_df.loc[te_id]
    print("All  ", rsp_df.shape)
    print("Train", tr_data.shape)
    print("Val  ", vl_data.shape)
    print("Test ", te_data.shape)

    # Features for every response row, then positional selection per split.
    xd_all, xc_all, y_all = _extract_data_vars(rsp_df, d_dict, c_dict, d_smile, c_feature)
    xd_train, xc_train, y_train = np.take(xd_all, tr_id, axis=0), np.take(xc_all, tr_id, axis=0), np.take(y_all, tr_id, axis=0)
    xd_val,   xc_val,   y_val   = np.take(xd_all, vl_id, axis=0), np.take(xc_all, vl_id, axis=0), np.take(y_all, vl_id, axis=0)
    xd_test,  xc_test,  y_test  = np.take(xd_all, te_id, axis=0), np.take(xc_all, te_id, axis=0), np.take(y_all, te_id, axis=0)

    print("xd_all  ", xd_all.shape,   "xc_all  ", xc_all.shape,   "y_all  ", y_all.shape)
    print("xd_train", xd_train.shape, "xc_train", xc_train.shape, "y_train", y_train.shape)
    print("xd_val  ", xd_val.shape,   "xc_val  ", xc_val.shape,   "y_val  ", y_val.shape)
    print("xd_test ", xd_test.shape,  "xc_test ", xc_test.shape,  "y_test ", y_test.shape)

    # Save dfs
    rsp_df.to_csv(root/"all_rsp.csv", index=False)
    tr_data.to_csv(root/"train_rsp.csv", index=False)
    vl_data.to_csv(root/"val_rsp.csv", index=False)
    te_data.to_csv(root/"test_rsp.csv", index=False)

    parts = {
        "train": (xd_train, xc_train, y_train),
        "val": (xd_val, xc_val, y_val),
        "test": (xd_test, xc_test, y_test),
        "all": (xd_all, xc_all, y_all),
    }
    return root, smile_graph, parts


def _prepare_pancancer_parts(args):
    """Load the PANCANCER IC50 data and return (root, smile_graph, parts)."""
    # Read (drug, cell, normalized IC50) triples; the file is closed as soon
    # as it has been consumed (the previous version leaked the handle).
    temp_data = []
    with open(folder + "PANCANCER_IC.csv") as f:
        reader = csv.reader(f)
        next(reader)  # skip header
        for item in reader:
            drug = item[0]  # Drug name
            cell = item[3]  # Cosmic sample id
            # Squash the raw value into (0, 1): 1 / (1 + exp(ic50) ** -0.1)
            ic50 = 1 / (1 + pow(math.exp(float(item[8])), -0.1))
            temp_data.append((drug, cell, ic50))

    root = os.path.join(args.datadir)

    # cell_dict: {cell_id/COSMIC: index/row in the array}
    cell_dict, cell_feature = save_cell_mut_matrix()
    # drug_dict: {drug name: index in the array}
    drug_dict, drug_smile, smile_graph = load_drug_smile()
    print("Unique drugs: {}".format(len(drug_dict)))
    print("Unique smiles: {}".format(len(smile_graph)))

    with open('drug_dict', 'wb') as fp:
        pickle.dump(drug_dict, fp)

    xd = []
    xc = []
    y = []
    random.shuffle(temp_data)  # shuffle cell-drug combinations
    for drug, cell, ic50 in temp_data:
        # Keep only pairs for which both drug and cell features are available
        if drug in drug_dict and cell in cell_dict:
            xd.append(drug_smile[drug_dict[drug]])
            xc.append(cell_feature[cell_dict[cell]])
            y.append(ic50)

    xd, xc, y = np.asarray(xd), np.asarray(xc), np.asarray(y)
    print("xd", xd.shape)
    print("xc", xc.shape)
    print("y", y.shape)
    print("smile_graph", len(smile_graph))
    sm = list(smile_graph.keys())[0]
    print("c_size", smile_graph[sm][0])  # c_size

    # 80% train / 10% val / 10% test over the shuffled samples. The previous
    # version computed these sizes but never created the splits, so this
    # branch always ended in a NameError.
    size = int(xd.shape[0] * 0.8)
    size1 = int(xd.shape[0] * 0.9)
    parts = {
        "train": (xd[:size], xc[:size], y[:size]),
        "val": (xd[size:size1], xc[size:size1], y[size:size1]),
        "test": (xd[size1:], xc[size1:], y[size1:]),
        "all": (xd, xc, y),
    }
    return root, smile_graph, parts


def gen_cs_data(args):
    """Build the train / val / test / all datasets for one data split.

    Args:
        args: namespace with the attributes
            * ``which_data``: "cs" selects the cross-study data found in
              ``args.datadir``; any other value selects the PANCANCER IC50
              data found in ``folder``,
            * ``datadir``: directory with the input tables (also the output
              root in the non-"cs" case),
            * ``split``: id of the pre-computed split (used for "cs" only).

    Side effects:
        For "cs", writes ``all_rsp.csv``, ``train_rsp.csv``, ``val_rsp.csv``
        and ``test_rsp.csv`` into ``<datadir>/data_split_<split>``. For the
        other case, pickles the drug dictionary to ``drug_dict`` in the
        current working directory. In both cases four ``TestbedDataset``
        objects are constructed (train, val, test, all); whatever they
        persist is defined outside this file.

    Returns:
        None.

    Raises:
        FileNotFoundError: if an expected "cs" input file is missing.
        ValueError: if a "cs" response row refers to a drug or cell for
            which no features are available.
    """
    if args.which_data == "cs":
        root, smile_graph, parts = _prepare_cs_parts(args)
    else:
        root, smile_graph, parts = _prepare_pancancer_parts(args)

    # Create PyTorch datasets
    dataset = "data"
    print('preparing ', dataset + '_train.pt in pytorch format!')

    # Train, val, and test datasets, followed by the all-samples dataset.
    # TestbedDataset is presumably provided by `from utils import *`.
    for prefix in ("train", "val", "test", "all"):
        xd_part, xc_part, y_part = parts[prefix]
        TestbedDataset(
            root=root,
            dataset=prefix + '_' + dataset,
            xd=xd_part,
            xt=xc_part,
            y=y_part,
            smile_graph=smile_graph)


"""
The functions below generate datasets for CSG (data from July 2020) - End
"""


if __name__ == "__main__":
    # Directory of this script; the file list and the download target are
    # resolved relative to it.
    fdir = Path(__file__).resolve().parent

    # Parse the command line first, so that --help and invalid arguments
    # exit before any file is downloaded.
    parser = argparse.ArgumentParser(description='prepare dataset to train model')
    parser.add_argument(
        '--choice',
        type=int,
        required=False,
        default=0,
        help='0.mix test, 1.saliency value, 2.drug blind, 3.cell blind')
    parser.add_argument(
        '--outdir',
        type=str,
        required=False,
        default="data_processed",
        help='Data dir name to store the preprocessed data.')
    # -------------------
    # That's for CSG analysis
    parser.add_argument(
        '--datadir',
        type=str,
        required=False,
        default="data_processed",
        help='Main data dir.')
    parser.add_argument(
        '--which_data',
        type=str,
        required=False,
        default="cs",
        help='Which data to process: "cs" for the cross-study data; '
             'any other value selects the PANCANCER IC50 data.')
    parser.add_argument(
        '--split',
        type=int,
        required=False,
        default=0,
        help='Split id.')
    # -------------------

    args = parser.parse_args()

    # One file name per line; blank lines are ignored.
    ftp_fname = fdir/"ftp_file_list"
    with open(ftp_fname, "r") as f:
        data_file_list = [line.strip() for line in f if line.strip()]

    # Original GraphDRP data lives at
    # https://ftp.mcs.anl.gov/pub/candle/public/improve/model_curation_data/GraphDRP/data
    # CSG
    ftp_origin = "https://ftp.mcs.anl.gov/pub/candle/public/improve/cross_study_gen/July2020"

    datadir = fdir/"data"
    for fname in data_file_list:
        # URLs always use "/", independent of the local path separator.
        candle.get_file(fname=fname,
                        origin=f"{ftp_origin}/{fname}",
                        unpack=False, md5_hash=None,
                        datadir=datadir,
                        cache_subdir="common")

    # NOTE(review): the functions used for choices 0-3 are not defined in
    # the visible part of this file; presumably they come from
    # `from utils import *` -- confirm, otherwise only choice 4 can run.
    choice = args.choice
    if choice == 0:
        # save mix test dataset
        save_mix_drug_cell_matrix(args)
    elif choice == 1:
        # save saliency map dataset
        save_best_individual_drug_cell_matrix(args)
    elif choice == 2:
        # save blind drug dataset
        save_blind_drug_matrix(args)
    elif choice == 3:
        # save blind cell dataset
        save_blind_cell_matrix(args)
    elif choice == 4:
        # cross-study (CSG) dataset
        gen_cs_data(args)
    else:
        print("Invalid option, choose 0 -> 4")

    print("Finished pre-processing.")