GPU Implementation #53

Open · wants to merge 23 commits into base: master
dsm/contrib/dcm_api.py · 24 changes: 22 additions & 2 deletions
@@ -3,7 +3,7 @@
 import numpy as np
 
 from dsm.contrib.dcm_torch import DeepCoxMixturesTorch
-from dsm.contrib.dcm_utilities import train_dcm, predict_survival
+from dsm.contrib.dcm_utilities import *
 
 
 class DeepCoxMixtures():
@@ -178,4 +178,24 @@ def predict_survival(self, x, t):
     else:
       raise Exception("The model has not been fitted yet. Please fit the " +
                       "model using the `fit` method on some training data " +
-                      "before calling `predict_survival`.")
\ No newline at end of file
+                      "before calling `predict_survival`.")
+
+  def compute_nll(self, x, t, e):
+    if not self.fitted:
+      raise Exception("The model has not been fitted yet. Please fit the " +
+                      "model using the `fit` method on some training data " +
+                      "before calling `compute_nll`.")
+    processed_data = self._preprocess_training_data(x, t, e, 0, None, 0)
+    _, _, _, x_val, t_val, e_val = processed_data
+    with torch.no_grad():
+      return - get_posteriors(repair_probs(get_likelihood(self.torch_model[0], self.torch_model[1], x_val, t_val, e_val))).sum().item()
+
+  def predict_alphas(self, x):
+    x = self._preprocess_test_data(x)
+    if self.fitted:
+      alphas, _ = self.torch_model[0](x)
+      return alphas.detach().exp().cpu().numpy()
+    else:
+      raise Exception("The model has not been fitted yet. Please fit the " +
+                      "model using the `fit` method on some training data " +
+                      "before calling `predict_alphas`.")
dsm/datasets.py · 85 changes: 54 additions & 31 deletions
@@ -54,7 +54,7 @@ def increase_censoring(e, t, p):
 
   return e, t
 
-def _load_framingham_dataset(sequential):
+def _load_framingham_dataset(sequential, competing = False):
   """Helper function to load and preprocess the Framingham dataset.
 
   The Framingham Dataset is a subset of 4,434 participants of the well known,
@@ -80,33 +80,42 @@ def _load_framingham_dataset(sequential):
   data = pkgutil.get_data(__name__, 'datasets/framingham.csv')
   data = pd.read_csv(io.BytesIO(data))
 
+  if not sequential:
+    # Consider only first event
+    data = data.groupby('RANDID').first()
+
   dat_cat = data[['SEX', 'CURSMOKE', 'DIABETES', 'BPMEDS',
                   'educ', 'PREVCHD', 'PREVAP', 'PREVMI',
                   'PREVSTRK', 'PREVHYP']]
   dat_num = data[['TOTCHOL', 'AGE', 'SYSBP', 'DIABP',
                   'CIGPDAY', 'BMI', 'HEARTRTE', 'GLUCOSE']]
 
-  x1 = pd.get_dummies(dat_cat).values
-  x2 = dat_num.values
-  x = np.hstack([x1, x2])
+  x1 = pd.get_dummies(dat_cat)
+  x2 = dat_num
+  x = np.hstack([x1.values, x2.values])
 
   time = (data['TIMEDTH'] - data['TIME']).values
   event = data['DEATH'].values
 
-  x = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x)
-  x_ = StandardScaler().fit_transform(x)
+  if competing:
+    time_cvd = (data['TIMECVD'] - data['TIME']).values
+    event[data['CVD'] == 1] = 2
+    time[data['CVD'] == 1] = time_cvd[data['CVD'] == 1]
+
+  x_ = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x)
 
   if not sequential:
-    return x_, time, event
+    return x_, time + 1, event, np.concatenate([x1.columns, x2.columns])
   else:
+    x_, data, time, event = x_[time > 0], data[time > 0], time[time > 0], event[time > 0]
     x, t, e = [], [], []
     for id_ in sorted(list(set(data['RANDID']))):
       x.append(x_[data['RANDID'] == id_])
-      t.append(time[data['RANDID'] == id_])
+      t.append(time[data['RANDID'] == id_] + 1)
       e.append(event[data['RANDID'] == id_])
-    return x, t, e
+    return (x, x_), t, e, np.concatenate([x1.columns, x2.columns])
 
-def _load_pbc_dataset(sequential):
+def _load_pbc_dataset(sequential, competing = False):
   """Helper function to load and preprocess the PBC dataset
 
   The Primary biliary cirrhosis (PBC) Dataset [1] is well known
@@ -130,33 +139,38 @@ def _load_pbc_dataset(sequential):
   data = pkgutil.get_data(__name__, 'datasets/pbc2.csv')
   data = pd.read_csv(io.BytesIO(data))
 
+  if not sequential:
+    # Consider only first event
+    data = data.groupby('id').first()
+
   data['histologic'] = data['histologic'].astype(str)
   dat_cat = data[['drug', 'sex', 'ascites', 'hepatomegaly',
                   'spiders', 'edema', 'histologic']]
   dat_num = data[['serBilir', 'serChol', 'albumin', 'alkaline',
                   'SGOT', 'platelets', 'prothrombin']]
   age = data['age'] + data['years']
 
-  x1 = pd.get_dummies(dat_cat).values
-  x2 = dat_num.values
-  x3 = age.values.reshape(-1, 1)
-  x = np.hstack([x1, x2, x3])
+  x1 = pd.get_dummies(dat_cat)
+  x2 = dat_num
+  x3 = age
+  x = np.hstack([x1.values, x2.values, x3.values.reshape(-1, 1)])
 
   time = (data['years'] - data['year']).values
-  event = data['status2'].values
+  event = (data['status'] == 'dead').values.astype(int)
+  if competing:
+    event[data['status'] == 'transplanted'] = 2
 
-  x = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x)
-  x_ = StandardScaler().fit_transform(x)
+  x_ = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x)
 
   if not sequential:
-    return x_, time, event
+    return x_, time + 1, event, x1.columns.tolist() + x2.columns.tolist() + [x3.name]
   else:
     x, t, e = [], [], []
     for id_ in sorted(list(set(data['id']))):
       x.append(x_[data['id'] == id_])
-      t.append(time[data['id'] == id_])
+      t.append(time[data['id'] == id_] + 1)
       e.append(event[data['id'] == id_])
-    return x, t, e
+    return (x, x_), t, e, x1.columns.tolist() + x2.columns.tolist() + [x3.name]
 
 def _load_support_dataset():
   """Helper function to load and preprocess the SUPPORT dataset.
@@ -189,10 +203,9 @@ def _load_support_dataset():
   e = data['death'].values
 
   x = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x)
-  x = StandardScaler().fit_transform(x)
 
   remove = ~np.isnan(t)
-  return x[remove], t[remove], e[remove]
+  return x[remove], t[remove] + 1, e[remove], np.concatenate([x1.columns, x2.columns])
 
 def _load_mnist():
   """Helper function to load and preprocess the MNIST dataset.
@@ -222,9 +235,9 @@
 
   e, t = increase_censoring(np.ones(t.shape), t, p=.5)
 
-  return x, t, e
+  return x, t + 1, e, train.data.columns
 
-def load_dataset(dataset='SUPPORT', **kwargs):
+def load_dataset(dataset='SUPPORT', normalize = True, **kwargs):
   """Helper function to load datasets to test Survival Analysis models.
 
   Currently implemented datasets include:
@@ -273,14 +286,24 @@
 
   """
   sequential = kwargs.get('sequential', False)
+  competing = kwargs.get('competing', False)
 
   if dataset == 'SUPPORT':
-    return _load_support_dataset()
-  if dataset == 'PBC':
-    return _load_pbc_dataset(sequential)
-  if dataset == 'FRAMINGHAM':
-    return _load_framingham_dataset(sequential)
-  if dataset == 'MNIST':
-    return _load_mnist()
+    x, t, e, covariates = _load_support_dataset()
+  elif dataset == 'PBC':
+    x, t, e, covariates = _load_pbc_dataset(sequential, competing)
+  elif dataset == 'FRAMINGHAM':
+    x, t, e, covariates = _load_framingham_dataset(sequential, competing)
+  elif dataset == 'MNIST':
+    x, t, e, covariates = _load_mnist()
   else:
     raise NotImplementedError('Dataset '+dataset+' not implemented.')
+
+  if isinstance(x, tuple):
+    (x, x_all) = x
+    if normalize:
+      scaler = StandardScaler().fit(x_all)
+      x = [scaler.transform(x_) for x_ in x]
+  elif normalize:
+    x = StandardScaler().fit_transform(x)
+  return x, t, e, covariates
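Note the interface change: every loader now returns the covariate names as a fourth value, event times are shifted by one so they are strictly positive, and standardization moves out of the individual loaders into `load_dataset`, where it can be disabled or, for sequential data, fitted on the pooled matrix and applied per subject. A usage sketch of the new signature (flag combinations are illustrative):

# Sketch of the reworked load_dataset API: loaders return (x, t, e, covariates).
from dsm import datasets

# Competing risks: e == 2 marks the competing event
# (transplantation for PBC, CVD for FRAMINGHAM).
x, t, e, covariates = datasets.load_dataset('PBC', competing=True)

# Sequential data: x is a list of per-subject arrays; the scaler is fit on
# the pooled matrix and then applied to each sequence.
xs, ts, es, covariates = datasets.load_dataset('PBC', sequential=True)

# Skip normalization, e.g. to fit a scaler on a training split only.
xr, tr, er, covariates = datasets.load_dataset('SUPPORT', normalize=False)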
dsm/dsm_api.py · 49 changes: 42 additions & 7 deletions
@@ -52,13 +52,14 @@ class DSMBase():
   """Base Class for all DSM models"""
 
   def __init__(self, k=3, layers=None, distribution="Weibull",
-               temp=1000., discount=1.0):
+               temp=1000., discount=1.0, cuda=False):
     self.k = k
     self.layers = layers
     self.dist = distribution
     self.temp = temp
     self.discount = discount
     self.fitted = False
+    self.cuda = cuda # Two levels: 1 full GPU, 2 batch GPU (prefer 1 if the data fits in memory)
 
   def _gen_torch_model(self, inputdim, optimizer, risks):
     """Helper function to return a torch model."""
@@ -122,13 +123,17 @@ def fit(self, x, t, e, vsize=0.15, val_data=None,
 
     maxrisk = int(np.nanmax(e_train.cpu().numpy()))
     model = self._gen_torch_model(inputdim, optimizer, risks=maxrisk)
+
+    if self.cuda:
+      model = model.cuda()
+
     model, _ = train_dsm(model,
                          x_train, t_train, e_train,
                          x_val, t_val, e_val,
                          n_iter=iters,
                          lr=learning_rate,
                          elbo=elbo,
-                         bs=batch_size)
+                         bs=batch_size, cuda=self.cuda==2)
 
     self.torch_model = model.eval()
     self.fitted = True
@@ -163,15 +168,23 @@ def compute_nll(self, x, t, e):
     x_val, t_val, e_val = x_val,\
         _reshape_tensor_with_nans(t_val),\
         _reshape_tensor_with_nans(e_val)
+
+    if self.cuda == 2:
+      # Data needs to be on the GPU if the loss is computed there
+      x_val, t_val, e_val = x_val.cuda(), t_val.cuda(), e_val.cuda()
+
     loss = 0
     for r in range(self.torch_model.risks):
       loss += float(losses.conditional_loss(self.torch_model,
                     x_val, t_val, e_val, elbo=False,
-                    risk=str(r+1)).detach().numpy())
+                    risk=str(r+1)).item())
     return loss
 
   def _preprocess_test_data(self, x):
-    return torch.from_numpy(x)
+    data = torch.from_numpy(x)
+    if self.cuda:
+      data = data.cuda()
+    return data
 
   def _preprocess_training_data(self, x, t, e, vsize, val_data, random_state):
 
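The switch from `.detach().numpy()` to `.item()` matters once the loss can live on the GPU: calling `.numpy()` on a CUDA tensor raises an error, while `.item()` copies the scalar back to the host from either device. A two-line illustration (hypothetical tensor; requires a CUDA device):

# Why .item(): .numpy() raises on CUDA tensors, .item() works on both devices.
loss = torch.tensor(1.5, device='cuda')  # hypothetical GPU scalar
val = loss.item()                        # fine; loss.numpy() would raise a TypeError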
@@ -201,7 +214,12 @@ def _preprocess_training_data(self, x, t, e, vsize, val_data, random_state):
     t_val = torch.from_numpy(t_val).double()
     e_val = torch.from_numpy(e_val).double()
 
-    return (x_train, t_train, e_train, x_val, t_val, e_val)
+    if self.cuda == 1:
+      x_train, t_train, e_train = x_train.cuda(), t_train.cuda(), e_train.cuda()
+      x_val, t_val, e_val = x_val.cuda(), t_val.cuda(), e_val.cuda()
+
+    return (x_train, t_train, e_train,
+            x_val, t_val, e_val)
 
 
   def predict_mean(self, x, risk=1):
@@ -300,6 +318,15 @@ def predict_pdf(self, x, t, risk=1):
                       "model using the `fit` method on some training data " +
                       "before calling `predict_survival`.")
 
+  def predict_alphas(self, x):
+    x = self._preprocess_test_data(x)
+    if self.fitted:
+      _, _, alphas = self.torch_model(x)
+      return torch.softmax(alphas, dim = 1).detach().cpu().numpy()
+    else:
+      raise Exception("The model has not been fitted yet. Please fit the " +
+                      "model using the `fit` method on some training data " +
+                      "before calling `predict_alphas`.")
 
 class DeepSurvivalMachines(DSMBase):
   """A Deep Survival Machines model.
@@ -394,7 +421,10 @@ def _gen_torch_model(self, inputdim, optimizer, risks):
                                               risks=risks)
 
   def _preprocess_test_data(self, x):
-    return torch.from_numpy(_get_padded_features(x))
+    data = torch.from_numpy(_get_padded_features(x))
+    if self.cuda:
+      data = data.cuda()
+    return data
 
   def _preprocess_training_data(self, x, t, e, vsize, val_data, random_state):
     """RNNs require different preprocessing for variable length sequences"""
@@ -435,7 +465,12 @@ def _preprocess_training_data(self, x, t, e, vsize, val_data, random_state):
     t_val = torch.from_numpy(t_val).double()
     e_val = torch.from_numpy(e_val).double()
 
-    return (x_train, t_train, e_train, x_val, t_val, e_val)
+    if self.cuda == 1:
+      x_train, t_train, e_train = x_train.cuda(), t_train.cuda(), e_train.cuda()
+      x_val, t_val, e_val = x_val.cuda(), t_val.cuda(), e_val.cuda()
+
+    return (x_train, t_train, e_train,
+            x_val, t_val, e_val)
 
 
 class DeepConvolutionalSurvivalMachines(DSMBase):
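End to end, the feature is driven from the model constructor; a sketch with illustrative hyperparameters:

# Sketch: fitting DSM on GPU with the new constructor flag.
from dsm import DeepSurvivalMachines
from dsm import datasets

x, t, e, covariates = datasets.load_dataset('SUPPORT')

model = DeepSurvivalMachines(k=3, layers=[100], cuda=1)  # cuda=2 for batch-wise transfer
model.fit(x, t, e, iters=100, learning_rate=1e-3)

nll = model.compute_nll(x, t, e)  # loss computed on the GPU when cuda is set
alphas = model.predict_alphas(x)  # softmax mixture weights, shape (n, k)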