GPU Implementation #53

Open · wants to merge 23 commits into base: master
dsm/contrib/dcm_api.py · 24 changes: 22 additions & 2 deletions
@@ -3,7 +3,7 @@
 import numpy as np
 
 from dsm.contrib.dcm_torch import DeepCoxMixturesTorch
-from dsm.contrib.dcm_utilities import train_dcm, predict_survival
+from dsm.contrib.dcm_utilities import *
 
 
 class DeepCoxMixtures():
@@ -178,4 +178,24 @@ def predict_survival(self, x, t):
     else:
       raise Exception("The model has not been fitted yet. Please fit the " +
                       "model using the `fit` method on some training data " +
-                      "before calling `predict_survival`.")
\ No newline at end of file
+                      "before calling `predict_survival`.")
+
+  def compute_nll(self, x, t, e):
+    if not self.fitted:
+      raise Exception("The model has not been fitted yet. Please fit the " +
+                      "model using the `fit` method on some training data " +
+                      "before calling `compute_nll`.")
+    processed_data = self._preprocess_training_data(x, t, e, 0, None, 0)
+    _, _, _, x_val, t_val, e_val = processed_data
+    with torch.no_grad():
+      return - get_posteriors(repair_probs(get_likelihood(self.torch_model[0], self.torch_model[1], x_val, t_val, e_val))).sum().item()
+
+  def predict_alphas(self, x):
+    x = self._preprocess_test_data(x)
+    if self.fitted:
+      alphas, _ = self.torch_model[0](x)
+      return alphas.detach().exp().cpu().numpy()
+    else:
+      raise Exception("The model has not been fitted yet. Please fit the " +
+                      "model using the `fit` method on some training data " +
+                      "before calling `predict_alphas`.")
dsm/datasets.py · 85 changes: 54 additions & 31 deletions
@@ -54,7 +54,7 @@ def increase_censoring(e, t, p):
 
   return e, t
 
-def _load_framingham_dataset(sequential):
+def _load_framingham_dataset(sequential, competing = False):
   """Helper function to load and preprocess the Framingham dataset.
 
   The Framingham Dataset is a subset of 4,434 participants of the well known,
@@ -80,33 +80,42 @@ def _load_framingham_dataset(sequential):
   data = pkgutil.get_data(__name__, 'datasets/framingham.csv')
   data = pd.read_csv(io.BytesIO(data))
 
+  if not sequential:
+    # Consider only first event
+    data = data.groupby('RANDID').first()
+
   dat_cat = data[['SEX', 'CURSMOKE', 'DIABETES', 'BPMEDS',
                   'educ', 'PREVCHD', 'PREVAP', 'PREVMI',
                   'PREVSTRK', 'PREVHYP']]
   dat_num = data[['TOTCHOL', 'AGE', 'SYSBP', 'DIABP',
                   'CIGPDAY', 'BMI', 'HEARTRTE', 'GLUCOSE']]
 
-  x1 = pd.get_dummies(dat_cat).values
-  x2 = dat_num.values
-  x = np.hstack([x1, x2])
+  x1 = pd.get_dummies(dat_cat)
+  x2 = dat_num
+  x = np.hstack([x1.values, x2.values])
 
   time = (data['TIMEDTH'] - data['TIME']).values
   event = data['DEATH'].values
 
-  x = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x)
-  x_ = StandardScaler().fit_transform(x)
+  if competing:
+    time_cvd = (data['TIMECVD'] - data['TIME']).values
+    event[data['CVD'] == 1] = 2
+    time[data['CVD'] == 1] = time_cvd[data['CVD'] == 1]
+
+  x_ = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x)
 
   if not sequential:
-    return x_, time, event
+    return x_, time + 1, event, np.concatenate([x1.columns, x2.columns])
   else:
+    x_, data, time, event = x_[time > 0], data[time > 0], time[time > 0], event[time > 0]
     x, t, e = [], [], []
     for id_ in sorted(list(set(data['RANDID']))):
       x.append(x_[data['RANDID'] == id_])
-      t.append(time[data['RANDID'] == id_])
+      t.append(time[data['RANDID'] == id_] + 1)
       e.append(event[data['RANDID'] == id_])
-    return x, t, e
+    return (x, x_), t, e, np.concatenate([x1.columns, x2.columns])
 
-def _load_pbc_dataset(sequential):
+def _load_pbc_dataset(sequential, competing = False):
   """Helper function to load and preprocess the PBC dataset
 
   The Primary biliary cirrhosis (PBC) Dataset [1] is well known
@@ -130,33 +139,38 @@ def _load_pbc_dataset(sequential):
   data = pkgutil.get_data(__name__, 'datasets/pbc2.csv')
   data = pd.read_csv(io.BytesIO(data))
 
+  if not sequential:
+    # Consider only first event
+    data = data.groupby('id').first()
+
   data['histologic'] = data['histologic'].astype(str)
   dat_cat = data[['drug', 'sex', 'ascites', 'hepatomegaly',
                   'spiders', 'edema', 'histologic']]
   dat_num = data[['serBilir', 'serChol', 'albumin', 'alkaline',
                   'SGOT', 'platelets', 'prothrombin']]
   age = data['age'] + data['years']
 
-  x1 = pd.get_dummies(dat_cat).values
-  x2 = dat_num.values
-  x3 = age.values.reshape(-1, 1)
-  x = np.hstack([x1, x2, x3])
+  x1 = pd.get_dummies(dat_cat)
+  x2 = dat_num
+  x3 = age
+  x = np.hstack([x1.values, x2.values, x3.values.reshape(-1, 1)])
 
   time = (data['years'] - data['year']).values
-  event = data['status2'].values
+  event = (data['status'] == 'dead').values.astype(int)
+  if competing:
+    event[data['status'] == 'transplanted'] = 2
 
-  x = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x)
-  x_ = StandardScaler().fit_transform(x)
+  x_ = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x)
 
   if not sequential:
-    return x_, time, event
+    return x_, time + 1, event, x1.columns.tolist() + x2.columns.tolist() + [x3.name]
   else:
     x, t, e = [], [], []
     for id_ in sorted(list(set(data['id']))):
       x.append(x_[data['id'] == id_])
-      t.append(time[data['id'] == id_])
+      t.append(time[data['id'] == id_] + 1)
       e.append(event[data['id'] == id_])
-    return x, t, e
+    return (x, x_), t, e, x1.columns.tolist() + x2.columns.tolist() + [x3.name]
 
 def _load_support_dataset():
   """Helper function to load and preprocess the SUPPORT dataset.
@@ -189,10 +203,9 @@ def _load_support_dataset():
   e = data['death'].values
 
   x = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x)
-  x = StandardScaler().fit_transform(x)
 
   remove = ~np.isnan(t)
-  return x[remove], t[remove], e[remove]
+  return x[remove], t[remove] + 1, e[remove], np.concatenate([x1.columns, x2.columns])
 
 def _load_mnist():
   """Helper function to load and preprocess the MNIST dataset.
@@ -222,9 +235,9 @@
 
   e, t = increase_censoring(np.ones(t.shape), t, p=.5)
 
-  return x, t, e
+  return x, t + 1, e, train.data.columns
 
-def load_dataset(dataset='SUPPORT', **kwargs):
+def load_dataset(dataset='SUPPORT', normalize = True, **kwargs):
   """Helper function to load datasets to test Survival Analysis models.
 
   Currently implemented datasets include:
@@ -273,14 +286,24 @@
 
   """
   sequential = kwargs.get('sequential', False)
+  competing = kwargs.get('competing', False)
 
   if dataset == 'SUPPORT':
-    return _load_support_dataset()
-  if dataset == 'PBC':
-    return _load_pbc_dataset(sequential)
-  if dataset == 'FRAMINGHAM':
-    return _load_framingham_dataset(sequential)
-  if dataset == 'MNIST':
-    return _load_mnist()
+    x, t, e, covariates = _load_support_dataset()
+  elif dataset == 'PBC':
+    x, t, e, covariates = _load_pbc_dataset(sequential, competing)
+  elif dataset == 'FRAMINGHAM':
+    x, t, e, covariates = _load_framingham_dataset(sequential, competing)
+  elif dataset == 'MNIST':
+    x, t, e, covariates = _load_mnist()
   else:
     raise NotImplementedError('Dataset '+dataset+' not implemented.')
+
+  if isinstance(x, tuple):
+    (x, x_all) = x
+    if normalize:
+      scaler = StandardScaler().fit(x_all)
+      x = [scaler.transform(x_) for x_ in x]
+  elif normalize:
+    x = StandardScaler().fit_transform(x)
+  return x, t, e, covariates
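Note the interface change: every loader now returns the covariate names as a fourth value, event times are shifted by one so they are strictly positive, and standardization moves out of the individual loaders into `load_dataset`, where it can be disabled or, for sequential data, fitted on the pooled matrix and applied per subject. A usage sketch of the new signature (flag combinations are illustrative):

# Sketch of the reworked load_dataset API: loaders return (x, t, e, covariates).
from dsm import datasets

# Competing risks: e == 2 marks the competing event
# (transplantation for PBC, CVD for FRAMINGHAM).
x, t, e, covariates = datasets.load_dataset('PBC', competing=True)

# Sequential data: x is a list of per-subject arrays; the scaler is fit on
# the pooled matrix and then applied to each sequence.
xs, ts, es, covariates = datasets.load_dataset('PBC', sequential=True)

# Skip normalization, e.g. to fit a scaler on a training split only.
xr, tr, er, covariates = datasets.load_dataset('SUPPORT', normalize=False)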
dsm/dsm_api.py · 49 changes: 42 additions & 7 deletions
@@ -52,13 +52,14 @@ class DSMBase():
   """Base Class for all DSM models"""
 
   def __init__(self, k=3, layers=None, distribution="Weibull",
-               temp=1000., discount=1.0):
+               temp=1000., discount=1.0, cuda=False):
     self.k = k
     self.layers = layers
     self.dist = distribution
     self.temp = temp
     self.discount = discount
     self.fitted = False
+    self.cuda = cuda # Two levels: 1 full GPU, 2 batch GPU (prefer 1 if the data fits in memory)
 
   def _gen_torch_model(self, inputdim, optimizer, risks):
     """Helper function to return a torch model."""
@@ -122,13 +123,17 @@ def fit(self, x, t, e, vsize=0.15, val_data=None,
 
     maxrisk = int(np.nanmax(e_train.cpu().numpy()))
     model = self._gen_torch_model(inputdim, optimizer, risks=maxrisk)
+
+    if self.cuda:
+      model = model.cuda()
+
     model, _ = train_dsm(model,
                          x_train, t_train, e_train,
                          x_val, t_val, e_val,
                          n_iter=iters,
                          lr=learning_rate,
                          elbo=elbo,
-                         bs=batch_size)
+                         bs=batch_size, cuda=self.cuda==2)
 
     self.torch_model = model.eval()
     self.fitted = True
@@ -163,15 +168,23 @@ def compute_nll(self, x, t, e):
     x_val, t_val, e_val = x_val,\
         _reshape_tensor_with_nans(t_val),\
         _reshape_tensor_with_nans(e_val)
+
+    if self.cuda == 2:
+      # Data needs to be on the GPU if the loss is computed there
+      x_val, t_val, e_val = x_val.cuda(), t_val.cuda(), e_val.cuda()
+
     loss = 0
     for r in range(self.torch_model.risks):
       loss += float(losses.conditional_loss(self.torch_model,
                     x_val, t_val, e_val, elbo=False,
-                    risk=str(r+1)).detach().numpy())
+                    risk=str(r+1)).item())
     return loss
 
   def _preprocess_test_data(self, x):
-    return torch.from_numpy(x)
+    data = torch.from_numpy(x)
+    if self.cuda:
+      data = data.cuda()
+    return data
 
   def _preprocess_training_data(self, x, t, e, vsize, val_data, random_state):
 
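The switch from `.detach().numpy()` to `.item()` matters once the loss can live on the GPU: calling `.numpy()` on a CUDA tensor raises an error, while `.item()` copies the scalar back to the host from either device. A two-line illustration (hypothetical tensor; requires a CUDA device):

# Why .item(): .numpy() raises on CUDA tensors, .item() works on both devices.
loss = torch.tensor(1.5, device='cuda')  # hypothetical GPU scalar
val = loss.item()                        # fine; loss.numpy() would raise a TypeError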
@@ -201,7 +214,12 @@ def _preprocess_training_data(self, x, t, e, vsize, val_data, random_state):
     t_val = torch.from_numpy(t_val).double()
     e_val = torch.from_numpy(e_val).double()
 
-    return (x_train, t_train, e_train, x_val, t_val, e_val)
+    if self.cuda == 1:
+      x_train, t_train, e_train = x_train.cuda(), t_train.cuda(), e_train.cuda()
+      x_val, t_val, e_val = x_val.cuda(), t_val.cuda(), e_val.cuda()
+
+    return (x_train, t_train, e_train,
+            x_val, t_val, e_val)
 
 
   def predict_mean(self, x, risk=1):
@@ -300,6 +318,15 @@ def predict_pdf(self, x, t, risk=1):
                       "model using the `fit` method on some training data " +
                       "before calling `predict_survival`.")
 
+  def predict_alphas(self, x):
+    x = self._preprocess_test_data(x)
+    if self.fitted:
+      _, _, alphas = self.torch_model(x)
+      return torch.softmax(alphas, dim = 1).detach().cpu().numpy()
+    else:
+      raise Exception("The model has not been fitted yet. Please fit the " +
+                      "model using the `fit` method on some training data " +
+                      "before calling `predict_alphas`.")
 
 class DeepSurvivalMachines(DSMBase):
   """A Deep Survival Machines model.
@@ -394,7 +421,10 @@ def _gen_torch_model(self, inputdim, optimizer, risks):
                                               risks=risks)
 
   def _preprocess_test_data(self, x):
-    return torch.from_numpy(_get_padded_features(x))
+    data = torch.from_numpy(_get_padded_features(x))
+    if self.cuda:
+      data = data.cuda()
+    return data
 
   def _preprocess_training_data(self, x, t, e, vsize, val_data, random_state):
     """RNNs require different preprocessing for variable length sequences"""
@@ -435,7 +465,12 @@ def _preprocess_training_data(self, x, t, e, vsize, val_data, random_state):
     t_val = torch.from_numpy(t_val).double()
     e_val = torch.from_numpy(e_val).double()
 
-    return (x_train, t_train, e_train, x_val, t_val, e_val)
+    if self.cuda == 1:
+      x_train, t_train, e_train = x_train.cuda(), t_train.cuda(), e_train.cuda()
+      x_val, t_val, e_val = x_val.cuda(), t_val.cuda(), e_val.cuda()
+
+    return (x_train, t_train, e_train,
+            x_val, t_val, e_val)
 
 
 class DeepConvolutionalSurvivalMachines(DSMBase):
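End to end, the feature is driven from the model constructor; a sketch with illustrative hyperparameters:

# Sketch: fitting DSM on GPU with the new constructor flag.
from dsm import DeepSurvivalMachines
from dsm import datasets

x, t, e, covariates = datasets.load_dataset('SUPPORT')

model = DeepSurvivalMachines(k=3, layers=[100], cuda=1)  # cuda=2 for batch-wise transfer
model.fit(x, t, e, iters=100, learning_rate=1e-3)

nll = model.compute_nll(x, t, e)  # loss computed on the GPU when cuda is set
alphas = model.predict_alphas(x)  # softmax mixture weights, shape (n, k)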