import numpy as np
import sys

from pandas import Series
from scipy.spatial.distance import cdist
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, roc_auc_score, f1_score


def evaluate(rnn, X_test, Y_test, threshold, first_k=0):
    """Predict with `rnn` on X_test and print AUC, precision@k, precision, recall, and F1."""
    Y_hats = np.zeros(Y_test.shape)

    for i in range(len(X_test)):
        Y_hats[i, :] = rnn.predict(X_test[i])[0]

    # Optionally restrict evaluation to the first `first_k` label columns.
    if first_k > 0:
        yh = Y_hats[:, :first_k]
        yt = Y_test[:, :first_k]
    else:
        yh = Y_hats
        yt = Y_test

    print("calculating F1")
    f1, p, r = F1(yh, yt, threshold)

    print("calculating AUC")
    ROC, AUC, _ = ROC_AUC(yh, yt)

    print("calculating precision at 10:")
    p10, best_p10 = precision_at_k(yh, yt, 10)

    print("calculating precision at 20:")
    p20, best_p20 = precision_at_k(yh, yt, 20)

    print("AUC: %s, P@10: %s, bp10: %s, P@20: %s, bp20: %s, Prec: %s, Rec: %s, F1: %s" %
          (AUC, p10, best_p10, p20, best_p20, p, r, f1))

    return

def compute_micro_evaluations(Ytrue, Ypred, threshold_score='f1', criterion='zack', k=10):
    """Micro-averaged metrics (AUROC, AUPRC, F1, precision, recall, P@k) as a pandas Series."""
    ytrue = Ytrue.flatten()
    ypred = Ypred.flatten()

    # Equivalent curves could also be obtained from sklearn's roc_curve / roc_auc_score.
    roc, auroc, troc = ROC_AUC(ypred, ytrue)
    roc = np.array(roc)        # list of (FPR, TPR) pairs -> (n, 2) array
    troc = np.asarray(troc)    # thresholds as an array so downstream shape checks work

    prc, auprc, tprc = PRC_AUC(ypred, ytrue)
    f1c, _ = f1_curve(ypred, ytrue)

    # Pick a decision threshold by optimizing the requested curve.
    if threshold_score == 'roc':
        threshold, _ = optimize_threshold_with_roc(roc, troc, criterion=criterion)
    elif threshold_score == 'prc':
        threshold, _ = optimize_threshold_with_prc(prc, tprc, criterion=criterion)
    else:
        threshold, _ = optimize_threshold_with_f1(f1c, tprc, criterion=criterion)

    f1, p, r = F1(ypred, ytrue, threshold)

    if len(Ytrue.shape) > 1 and Ytrue.shape[1] > k:
        pk, best_pk = precision_at_k(Ypred, Ytrue, k)
    else:
        pk, best_pk = (np.nan, np.nan)

    return Series([auroc, auprc, f1, p, r, threshold, pk, best_pk],
                  index=['auroc', 'auprc', 'f1', 'precision', 'recall', 'threshold',
                         'precision_at_{0}'.format(k), 'best_precision_at_{0}'.format(k)])

def F1(Y_hats, Y_test, threshold):
    """Precision, recall, and F1 of predictions thresholded at `threshold`."""
    YH = Y_hats > threshold

    tp = YH & (Y_test > 0)               # true positives
    p = tp.sum() * 1.0 / YH.sum()        # precision
    r = tp.sum() * 1.0 / Y_test.sum()    # recall
    return ((2 * p * r) / (p + r)), p, r

def precision_at_k(Y_hats, Y_test, k):
    """Mean precision@k across rows, plus the best precision@k achievable for these labels."""
    rows, cols = Y_hats.shape
    ranks = np.argsort(-1 * Y_hats, axis=1)   # column indices sorted by descending score
    numerator = 0.
    for i in range(rows):
        for j in range(k):
            numerator += Y_test[i, ranks[i, j]]

    pk = numerator * 1.0 / (rows * k)

    # Upper bound: every positive label lands inside some row's top k.
    best_pk = Y_test.sum() * 1.0 / (rows * k)

    return pk, best_pk

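# Worked example for precision_at_k (hypothetical values, illustration only): with
# Y_hats = np.array([[0.9, 0.1, 0.8]]), Y_test = np.array([[1, 0, 0]]) and k=2,
# the ranked columns are [0, 2, 1] and the top-2 columns contain one positive,
# so the function returns (0.5, 0.5): 1 hit out of 2 slots, and a best achievable
# value of 1 positive / 2 slots.
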
def ROC_AUC(Y_hats, Y_test):
    """ROC curve, an AUC estimate, and the score thresholds, computed from scratch."""
    total_positives = Y_test.sum() * 1.0
    total_negatives = len(Y_test.flatten()) * 1.0 - total_positives

    # Sort (score, label) pairs by descending score.
    sorted_pred = sorted(zip(Y_hats.flatten(), Y_test.flatten()), key=lambda x: -1 * x[0])

    tp = 0.0
    fp = 0.0

    ROC = []

    # Sweep the threshold down through the sorted scores, tracing (FPR, TPR) points.
    for yh, gt in sorted_pred:
        if gt == 1.0:
            tp += 1.0
        else:
            fp += 1.0

        ROC += [((fp / total_negatives), (tp / total_positives))]

    # Estimate the area as the mean TPR over all curve points (averaged over rank,
    # not FPR, so it can differ slightly from trapezoidal integration).
    l = len(ROC)
    AUC = 0.0
    for x, y in ROC:
        AUC += y * (1.0 / l)

    thresholds = [yh for yh, gt in sorted_pred]
    return ROC, AUC, thresholds

def PRC_AUC(Y_hats, Y_test):
    """Precision-recall curve (as recall, precision pairs), average precision, and thresholds."""
    p, r, thresholds = precision_recall_curve(Y_test.flatten(), Y_hats.flatten())
    # precision_recall_curve returns one more (p, r) point than thresholds; pad to match.
    thresholds = np.hstack([thresholds, thresholds[-1]])
    prc = np.vstack([r, p]).T
    auc = average_precision_score(Y_test.flatten(), Y_hats.flatten(), average='micro')
    return prc, auc, thresholds

def f1_curve(Y_hats, Y_test):
    """F1 score at each threshold of the precision-recall curve."""
    p, r, thresholds = precision_recall_curve(Y_test.flatten(), Y_hats.flatten())
    thresholds = np.hstack([thresholds, thresholds[-1]])
    f1 = (2 * p * r) / (p + r)
    return f1, thresholds

def optimize_threshold_with_roc(roc, thresholds, criterion='dist'):
    """Pick the threshold whose ROC point is closest to (0, 1), or maximizes TPR - FPR ('margin')."""
    if roc.shape[1] > roc.shape[0]:
        roc = roc.T
    assert(roc.shape[0] == thresholds.shape[0])
    if criterion == 'margin':
        scores = roc[:, 1] - roc[:, 0]
    else:
        scores = -cdist(np.array([[0, 1]]), roc)
    ti = np.nanargmax(scores)
    return thresholds[ti], ti

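# Worked example for optimize_threshold_with_roc (hypothetical values, illustration only):
# with roc = np.array([[0.0, 0.0], [0.1, 0.7], [0.4, 0.9], [1.0, 1.0]]) and
# thresholds = np.array([0.9, 0.6, 0.3, 0.1]), the default 'dist' criterion selects the
# point closest to the ideal corner (0, 1) -- here (0.1, 0.7) -- so the function
# returns (0.6, 1).
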
def optimize_threshold_with_prc(prc, thresholds, criterion='min'):
    """Pick the threshold that optimizes a criterion over the precision-recall curve."""
    prc[np.isnan(prc)] = 0
    if prc.shape[1] > prc.shape[0]:
        prc = prc.T
    assert(prc.shape[0] == thresholds.shape[0])
    if criterion == 'sum':
        scores = prc.sum(axis=1)
    elif criterion.startswith('dist'):
        scores = -cdist(np.array([[1, 1]]), prc)   # closest point to perfect (1, 1)
    else:
        scores = prc.min(axis=1)                   # maximize the worse of recall/precision
    ti = np.nanargmax(scores)
    return thresholds[ti], ti

mp = np.finfo(float).eps

def optimize_threshold_with_f1(f1c, thresholds, criterion='max'):
    """Pick the threshold that maximizes F1, or (otherwise) the one closest to 0.5 * F1."""
    if criterion == 'max':
        ti = np.nanargmax(f1c)
    else:
        ti = np.nanargmin(np.abs(thresholds - 0.5 * f1c))
        # Alternative: restrict to thresholds within machine epsilon of 0.5 * f1c
        # and take the best F1 among them:
        #   idx = (thresholds >= f1c * 0.5 - mp) & (thresholds <= f1c * 0.5 + mp)
        #   ti = np.where(idx)[0][f1c[idx].argmax()]
    return thresholds[ti], ti

def random_split(n, test_frac=0.1):
    """Randomly split indices 0..n-1 into train and test sets, with ~test_frac in test."""
    all_idx = np.arange(n)
    test_idx = all_idx[np.random.choice(n, int(np.ceil(test_frac * n)), replace=False)]
    train_idx = np.setdiff1d(all_idx, test_idx)
    assert(np.all(np.sort(np.hstack([train_idx, test_idx])) == all_idx))
    return train_idx, test_idx

def generate_one_split(Y, test_frac=0.1, valid_frac=0.1, minpos=10, verbose=0):
    """Draw train/valid/test row splits of Y, retrying until every label column has at
    least `minpos` positives in each partition (2 * minpos outside the test set)."""
    split = None

    if verbose > 0:
        sys.stdout.write('Generating {0} test split'.format(test_frac))
        sys.stdout.flush()
    while split is None:
        if verbose > 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        not_test_idx, test_idx = random_split(Y.shape[0], test_frac=test_frac)
        assert(np.all(np.sort(np.hstack([not_test_idx, test_idx])) == np.arange(Y.shape[0])))
        if np.all(Y[not_test_idx, :].sum(axis=0) >= 2 * minpos) and np.all(Y[test_idx, :].sum(axis=0) >= minpos):
            if verbose > 0:
                sys.stdout.write('Generating {0}/{1} train/valid splits'.format(1 - (test_frac + valid_frac), valid_frac))
                sys.stdout.flush()
            while split is None:
                if verbose > 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()
                train_idx, valid_idx = random_split(Y[not_test_idx].shape[0], test_frac=valid_frac / (1 - test_frac))
                assert(np.all(np.sort(np.hstack((train_idx, valid_idx))) == np.arange(Y[not_test_idx].shape[0])))
                if np.all(Y[not_test_idx, :][train_idx, :].sum(axis=0) >= minpos) and np.all(Y[not_test_idx, :][valid_idx, :].sum(axis=0) >= minpos):
                    split = (np.sort(not_test_idx[train_idx]), np.sort(not_test_idx[valid_idx]), np.sort(test_idx))
                    if verbose > 0:
                        sys.stdout.write('DONE!\n')
                    break

    return split

def generate_splits(Y, num_splits=10, test_frac=0.1, valid_frac=0.1, minpos=10, verbose=0):
    """Generate `num_splits` independent train/valid/test splits of Y."""
    return [generate_one_split(Y, test_frac=test_frac, valid_frac=valid_frac,
                               minpos=minpos, verbose=verbose)
            for i in range(num_splits)]
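
# Minimal usage sketch (illustrative only; the synthetic data and parameter values
# below are assumptions, not part of the original module).
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    Ytrue = (rng.rand(200, 15) < 0.3).astype(float)                   # binary label matrix
    Ypred = np.clip(Ytrue * 0.6 + rng.rand(200, 15) * 0.4, 0.0, 1.0)  # noisy scores

    # Micro-averaged metrics with an F1-maximizing threshold.
    print(compute_micro_evaluations(Ytrue, Ypred, threshold_score='f1', criterion='max', k=10))

    # One label-stratified train/valid/test split; loops until each label has
    # enough positives in every partition.
    train_idx, valid_idx, test_idx = generate_one_split(Ytrue, test_frac=0.1,
                                                        valid_frac=0.1, minpos=2)
    print(len(train_idx), len(valid_idx), len(test_idx))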