-
Notifications
You must be signed in to change notification settings - Fork 70
/
baseline_methods.py
123 lines (84 loc) · 2.91 KB
/
baseline_methods.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# coding: utf-8
import json
import fancyimpute
import numpy as np
import pandas as pd
# Load the time-series records: each line of 'json/json' is one patient.
# `values` are the observed measurements, `evals` the ground-truth values
# held out for imputation evaluation, and the masks mark observed entries.
X = []  # input matrices, unobserved entries set to NaN
Y = []  # evaluation matrices (held-out ground truth revealed)
Z = []  # per-record classification labels
for line in open('json/json'):
    rec = json.loads(line)  # parse each line once (was parsed twice)
    label = rec['label']
    fwd = rec['forward']
    x = np.asarray(fwd['values'])
    y = np.asarray(fwd['evals'])
    # builtin bool: np.bool was deprecated in numpy 1.20 and removed in 1.24
    x_mask = np.asarray(fwd['masks']).astype(bool)
    y_mask = np.asarray(fwd['eval_masks']).astype(bool)
    x[~x_mask] = np.nan
    # entries neither observed nor held out for evaluation stay NaN in Y too
    y[(~x_mask) & (~y_mask)] = np.nan
    X.append(x)
    Y.append(y)
    Z.append(int(label))
def get_loss(X, X_pred, Y):
    """Score imputation quality on entries held out from X but present in Y.

    An entry counts toward the score when exactly one of X/Y is NaN there,
    i.e. it is missing from the input but has ground truth available.
    Returns a dict with 'mae' (mean absolute error) and 'mre' (mean
    relative error); both denominators carry a 1e-5 guard against
    division by zero when no entries qualify.
    """
    eval_mask = np.isnan(X) ^ np.isnan(Y)
    predictions = np.nan_to_num(X_pred)[eval_mask]
    truth = Y[eval_mask]
    abs_err = np.abs(predictions - truth).sum()
    mae = abs_err / (1e-5 + np.sum(eval_mask))
    mre = abs_err / (1e-5 + np.sum(np.abs(truth)))
    return {'mae': mae, 'mre': mre}
# Algo1: Mean imputation — fill each missing entry with the feature mean.
X_mean = []
print(len(X))
for x in X:  # ground truth is not needed for imputation; unused `y` dropped
    X_mean.append(fancyimpute.SimpleFill().complete(x))
# Reshape flat per-record matrices into (n_records, 48 steps, 35 features).
X_c = np.concatenate(X, axis=0).reshape(-1, 48, 35)
Y_c = np.concatenate(Y, axis=0).reshape(-1, 48, 35)
Z_c = np.array(Z)
X_mean = np.concatenate(X_mean, axis=0).reshape(-1, 48, 35)
print('Mean imputation:')
print(get_loss(X_c, X_mean, Y_c))
# save mean imputation results
print(X_c.shape, Y_c.shape, Z_c.shape)
# pause for inspection; raw_input() is Python 2 only and NameErrors on Py3
input()
np.save('./result/mean_data.npy', X_mean)
np.save('./result/mean_label.npy', Z_c)
# Algo2: KNN imputation — impute each missing entry from the 10 nearest rows.
X_knn = []
for x in X:  # unused `y` loop variable dropped
    X_knn.append(fancyimpute.KNN(k=10, verbose=False).complete(x))
X_c = np.concatenate(X, axis=0)
Y_c = np.concatenate(Y, axis=0)
X_knn = np.concatenate(X_knn, axis=0)
print('KNN imputation')
print(get_loss(X_c, X_knn, Y_c))
# was raw_input(): a Python 2 builtin that raises NameError under Python 3
input()
# Algo3: Matrix Factorization imputation.
# Since MF is extremely slow, the running imputation error over all records
# processed so far is reported every 100 iterations.
X_mf = []
for i, x in enumerate(X):  # ground truth comes from Y[:i+1] below; unused `y` dropped
    X_mf.append(fancyimpute.MatrixFactorization(loss='mae', verbose=False).complete(x))
    if i % 100 == 0:
        X_c = np.concatenate(X[:i + 1], axis=0)
        Y_c = np.concatenate(Y[:i + 1], axis=0)
        X_mf_c = np.concatenate(X_mf, axis=0)
        print('MF imputation')
        print(get_loss(X_c, X_mf_c, Y_c))
# Algo4: MICE imputation.
# MICE cannot handle a singular matrix, and one patient's data matrix is
# singular, so records are merged into batches of 128 and imputed per batch.
X_mice = []
n = len(X)
batch_size = 128
nb_batch = (n + batch_size - 1) // batch_size  # ceil(n / batch_size)
for i in range(nb_batch):
    print('On batch {}'.format(i))
    x = np.concatenate(X[i * batch_size: (i + 1) * batch_size])
    # the per-batch slice of Y was concatenated here but never used — removed
    x_mice = fancyimpute.MICE(n_imputations=100, n_pmm_neighbors=20, verbose=False).complete(x)
    X_mice.append(x_mice)
X_mice = np.concatenate(X_mice, axis=0)
X_c = np.concatenate(X, axis=0)
Y_c = np.concatenate(Y, axis=0)
print('MICE imputation')
print(get_loss(X_c, X_mice, Y_c))