# makeData.py
"""
Randomly generate synthetic data to run training / validation scripts
Output:
1. dfTrainPos.json (training set with positive examples)
2. dfTrainNeg.json (training set with negative examples)
3. dfDev.json (validation set)
4. embedding.p (embedding matrix with row ordered by the word index, first row all 0 as padding)
* In the synthetic data we simply take the first 1000 words in the embedding vector matrix. Please re-order the matrix based on the word index of your input data
Each file is formatted as a list of records. Each record is a patient's data containing encounters during the
12-month historical window and labels of the three target disease during the 6-month prediction window.
Each record has the following elements, an element can be a list of values or a value:
[Note, Num, Disease, Mask, Age, gender, race, ethnic]
1. Note: a list with each element an encounter, within each encounter the element are words (as converted to word index, starting from 1) in the note.
2. Num: a list with each element an encounter, within each encounter, the first, second, third 50 values are the min / median / max of the
50 extracted, normalized labValues aggregated within this encounter for the corresponding patient, respectively. The last value is the days between
the current encounter and the previous encounter. Thus there are 151 dimensions of numerical values at each encounter.
3. Disease: a list of 3 binary values, corresponding to whether the patient had CHF, KF, and stroke during the prediction window. (1 = yes, 0 = no)
4. Mask: a list of 3 binary values, value 1 indicates the corresponding record shouldn't be considered for the corresponding disease prediction.
5. Age: normalized value of age
6. Gender: index of gender, 0 is missing
7. Race: index of race, 0 is missing
8. Ethnic: index of ethnic, 0 is missing
"""
import argparse
import json
import os
import pickle

import numpy as np

import util


def makeNotes(maxIdx=1000, maxDocLen=800):
    '''
    Generate synthetic word indexes for one encounter's note.
    :param maxIdx: maximum word index (indexes run from 1 to maxIdx; 0 is reserved for padding)
    :param maxDocLen: maximum note length
    '''
    docLen = np.random.choice(maxDocLen - 1) + 1  # at least one word
    # Shift by 1 so indexes fall in [1, maxIdx]; index 0 is the padding row of the embedding matrix
    note = np.random.choice(maxIdx, size=docLen) + 1
    note = [int(x) for x in note]  # cast from np.int64 so the record is JSON-serializable
    return note
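# e.g. makeNotes(maxIdx=1000) might return [103, 436, 861], with indexes in [1, 1000]
# and a random length in [1, 799]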


def makeNum(maxDays=50):
    '''
    Generate synthetic numerical values for one encounter.
    :param maxDays: maximum number of days between two encounters
    '''
    labs = list(np.random.normal(0, 1, size=150))  # placeholder for the 3 x 50 aggregated lab values
    days = int(np.random.choice(maxDays))  # cast from np.int64 so the record is JSON-serializable
    labs.append(days)  # 151st dimension: days since the previous encounter
    return labs
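# Note: real data would fill the first 150 slots with per-lab min / median / max
# aggregates as described in the module docstring; here they are simply i.i.d.
# N(0, 1) draws, which is enough to exercise the training / validation scripts.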


def makeBinary(ndim, lsPosPct):
    '''
    Generate a list of binary values.
    :param ndim: number of binary values to generate
    :param lsPosPct: probability that each value is 1
    '''
    out = []
    for i in range(ndim):
        # int() casts from np.int64 so the record is JSON-serializable
        out.append(int(np.random.choice(2, p=[1 - lsPosPct[i], lsPosPct[i]])))
    return out
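# e.g. makeBinary(3, [0.1, 0.1, 0.1]) might return [0, 1, 0]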


def makeRecord(maxWordIdx, maxEncounter=30):
    '''
    Generate one synthetic patient record.
    :param maxWordIdx: maximum word index, passed through to makeNotes
    :param maxEncounter: maximum number of encounters
    '''
    nEncounter = np.random.choice(maxEncounter - 1) + 1  # minimum 1 encounter
    Notes, Num = [], []
    for i in range(nEncounter):
        Notes.append(makeNotes(maxIdx=maxWordIdx))
        Num.append(makeNum())
    Disease = makeBinary(ndim=3, lsPosPct=[0.1, 0.1, 0.1])
    Mask = makeBinary(ndim=3, lsPosPct=[0.05, 0.05, 0.05])
    # Cast numpy scalars to plain Python types so json.dump can serialize the record
    Age = float(np.random.normal())
    Gender = int(np.random.choice(2))
    Race = int(np.random.choice(25))
    Ethnic = int(np.random.choice(29))
    return [Notes, Num, Disease, Mask, Age, Gender, Race, Ethnic]


def splitPos(dfTrain):
    '''
    Split training records into positive (at least one of the 3 disease labels is 1)
    and negative (all labels 0) subsets.
    '''
    dfTrainPos, dfTrainNeg = [], []
    for row in dfTrain:
        if max(row[2]) > 0:  # row[2] is the Disease label list
            dfTrainPos.append(row)
        else:
            dfTrainNeg.append(row)
    return dfTrainPos, dfTrainNeg
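# Keeping positive and negative training records in separate files lets the training
# script control how the two groups are mixed (e.g. for class re-balancing; the exact
# use depends on the downstream training code).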


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Number of records to generate; the first half goes into training, the second half into dev
    parser.add_argument("--nRec", type=int, default=1000)
    parser.add_argument("--outPath", default='sampleData/')
    parser.add_argument("--randSeed", type=int, default=42)
    # Name of the embedding vector file
    parser.add_argument("--embFile", default='sspEmbedding_dim300.tsv')
    args = parser.parse_args()

    if not os.path.isdir(args.outPath):
        os.mkdir(args.outPath)
    np.random.seed(args.randSeed)

    maxWordIdx = 1000
    df = []
    for i in range(args.nRec):
        df.append(makeRecord(maxWordIdx))
    n = int(args.nRec / 2)
    dfTrain, dfDev = df[0:n], df[n:]
    dfTrainPos, dfTrainNeg = splitPos(dfTrain)

    with open(args.outPath + 'dfTrainPos.json', 'w') as f:
        json.dump(dfTrainPos, f)
    with open(args.outPath + 'dfTrainNeg.json', 'w') as f:
        json.dump(dfTrainNeg, f)
    with open(args.outPath + 'dfDev.json', 'w') as f:
        json.dump(dfDev, f)

    # Build the embedding matrix: row 0 stays all zeros as the padding row,
    # rows 1..maxWordIdx hold the first maxWordIdx vectors from the embedding file
    w2v_vocab, vec = util.load_star_space(args.embFile, torch=False)
    embedding = np.zeros(shape=(maxWordIdx + 1, vec.shape[1]))
    embedding[1:] = vec[0:maxWordIdx]
    with open(args.outPath + 'embedding.p', 'wb') as f:
        pickle.dump(embedding, f)
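
# A quick sanity check of the generated artifacts (a sketch; assumes the default
# arguments and a 300-dimensional embedding file, per the default --embFile name):
#   records = json.load(open('sampleData/dfDev.json'))
#   emb = pickle.load(open('sampleData/embedding.p', 'rb'))
#   assert emb.shape == (1001, 300) and not emb[0].any()  # row 0 is the padding row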