-
Notifications
You must be signed in to change notification settings - Fork 0
/
EmbedHelper.py
96 lines (73 loc) · 3.28 KB
/
EmbedHelper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import numpy as np
import gensim
class EmbeddingHandler:
    """Load a pre-trained word-embedding model and vectorize tokenized text.

    The model is selected by its display name (one of the values of
    ``embedDict``) and loaded through gensim's ``KeyedVectors`` loaders.
    After construction, ``self.model`` holds the loaded vectors and is used
    by ``vectorizeSentence`` / ``vectorizeBatch``.
    """

    # Maps a numeric id to the display name that callers pass as
    # ``embedType``; ``getEmbeddingModel`` compares against these values.
    embedDict = {
        1: "Fast Text",
        2: "Google News",
        3: "HealthTap",
        4: "Pubmed",
        5: "Glove",
        6: "iCliniq Trigram",
        7: "iCliniq default",
    }

    def __init__(self, embedType, trainNewModel, vectorSize, embedPath):
        """Load the embedding model named by ``embedType``.

        Args:
            embedType: One of the values of ``embedDict``.
            trainNewModel: Forwarded to ``getEmbeddingModel``; not used by
                any loader in this class.
            vectorSize: Forwarded to ``getEmbeddingModel``; not used by any
                loader in this class (the size comes from the loaded model).
            embedPath: Directory containing the embedding files.

        Raises:
            Exception: If no model could be created (unknown ``embedType``).
        """
        self.embedType = embedType
        self.embedPath = embedPath
        self.model = self.getEmbeddingModel(embedType, trainNewModel, vectorSize)
        # Identity check: ``== None`` would dispatch to the model's own
        # ``__eq__`` if it defines one.
        if self.model is None:
            raise Exception("Failed to create embedding model")

    @staticmethod
    def loadFastTextModel():
        """Placeholder for FastText loading; always raises.

        Raises:
            NotImplementedError: FastText embeddings are not supported yet.
        """
        # NotImplementedError is still an Exception subclass, so callers that
        # catch ``Exception`` keep working.
        raise NotImplementedError("Fast Text Not Supported Yet")

    def getEmbeddingModel(self, embeddingType, trainNewModel, vectorSize):
        """Return the embedding model matching ``embeddingType``.

        Args:
            embeddingType: One of the values of ``embedDict``.
            trainNewModel: Unused; kept for interface compatibility.
            vectorSize: Unused; kept for interface compatibility.

        Returns:
            The loaded gensim vectors, or ``None`` (after printing a message)
            when ``embeddingType`` is not a known name.

        Raises:
            NotImplementedError: If ``embeddingType`` is "Fast Text".
        """
        names = EmbeddingHandler.embedDict

        # Google News
        if embeddingType == names[2]:
            print("Loading Google News")
            return gensim.models.KeyedVectors.load_word2vec_format(
                self.embedPath + "/GoogleNews-vectors-negative300.bin",
                binary=True,
            )
        # Fast Text
        if embeddingType == names[1]:
            print("Loading Fast Text")
            return EmbeddingHandler.loadFastTextModel()
        if embeddingType == names[3]:
            print("Loading HT Word2Vec")
            return gensim.models.KeyedVectors.load(
                self.embedPath + "/healthTapEmbedding.embed"
            )
        if embeddingType == names[4]:
            print("Loading Pubmed")
            return gensim.models.KeyedVectors.load_word2vec_format(
                self.embedPath + "/wikipedia-pubmed-and-PMC-w2v.bin",
                binary=True,
            )
        if embeddingType == names[5]:
            print("Loading Glove")
            return gensim.models.KeyedVectors.load_word2vec_format(
                self.embedPath + "/glove840kW2V.txt"
            )
        # NOTE(review): the two iCliniq loaders use fixed relative paths and
        # ignore ``self.embedPath`` -- confirm this is intentional.
        if embeddingType == names[6]:
            print("Loading iCliniq Trigram Embeds (W2V)")
            return gensim.models.KeyedVectors.load(
                "Embeddings//icliniq_trigram//icliniq_trigram.w2v"
            )
        if embeddingType == names[7]:
            print("Loading iCliniq Default Embeds (W2V)")
            return gensim.models.KeyedVectors.load(
                "Embeddings//icliniq_default//icliniq_default.w2v"
            )

        print("Embedding Does not Exist")
        return None

    def vectorizeSentence(self, sentence):
        """Map each token of ``sentence`` to its embedding vector.

        Args:
            sentence: Iterable of tokens.

        Returns:
            A list with one NumPy array per token. The token "[None]"
            (presumably a padding placeholder) and tokens missing from the
            model map to a zero vector of length ``self.model.vector_size``.
        """
        model = self.model
        vector_size = model.vector_size
        embedded = []
        for word in sentence:
            # "[None]" is checked first so it yields zeros even if the model
            # happens to contain that token.
            if word != "[None]" and word in model:
                embedded.append(np.array(model[word]))
            else:
                embedded.append(np.zeros(vector_size))
        return embedded

    def vectorizeBatch(self, batchData):
        """Vectorize every sentence in ``batchData``.

        Args:
            batchData: Iterable of sentences (each an iterable of tokens).

        Returns:
            ``np.array`` built from the per-sentence vector lists. With
            equal-length sentences this has shape
            (sentences, tokens, vector_size); sentences of differing length
            are not padded here.
        """
        return np.array([self.vectorizeSentence(sentence) for sentence in batchData])