# HierarchicalAttn.py (forked from arunarn2/HierarchicalAttentionNetworks)
# Implementation of Hierarchical Attentional Networks for Document Classification
# http://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf
import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from keras import backend as K
from keras.models import Model
from keras import initializers
from keras.engine.topology import Layer
from keras.layers import Dense, Input
from keras.layers import Embedding, GRU, Bidirectional, TimeDistributed
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils.np_utils import to_categorical
from nltk import tokenize
maxlen = 100            # max words kept per sentence
max_sentences = 15      # max sentences kept per review
max_words = 20000       # vocabulary size for the tokenizer
embedding_dim = 100     # GloVe vector dimensionality
validation_split = 0.2  # fraction of the data held out for validation
reviews = []
labels = []
texts = []
glove_dir = "./glove.6B"
embeddings_index = {}
# class defining the custom attention layer
class HierarchicalAttentionNetwork(Layer):
def __init__(self, attention_dim):
self.init = initializers.get('normal')
self.supports_masking = True
self.attention_dim = attention_dim
super(HierarchicalAttentionNetwork, self).__init__()
def build(self, input_shape):
assert len(input_shape) == 3
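        # W projects each timestep, b is the bias, and u is the learned
        # context vector used to score timesteps in call()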
self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
self.b = K.variable(self.init((self.attention_dim,)))
self.u = K.variable(self.init((self.attention_dim, 1)))
self.trainable_weights = [self.W, self.b, self.u]
super(HierarchicalAttentionNetwork, self).build(input_shape)
def compute_mask(self, inputs, mask=None):
return mask
def call(self, x, mask=None):
        # size of x: [batch_size, seq_len, attention_dim]
        # size of u: [batch_size, attention_dim]
# uit = tanh(xW+b)
uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
ait = K.exp(K.squeeze(K.dot(uit, self.u), -1))
if mask is not None:
# Cast the mask to floatX to avoid float64 upcasting
ait *= K.cast(mask, K.floatx())
ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
weighted_input = x * K.expand_dims(ait)
output = K.sum(weighted_input, axis=1)
return output
def compute_output_shape(self, input_shape):
return input_shape[0], input_shape[-1]
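
# The layer follows the attention mechanism of Yang et al. (2016):
#   u_t = tanh(W x_t + b)        projection of each timestep
#   a_t = softmax(u_t . u)       alignment against the context vector u
#   out = sum_t a_t * x_t        attention-weighted sum over timesteps
#
# A minimal shape check (illustrative sketch; the dimensions below are
# assumptions, not values taken from this script):
#   inp = Input(shape=(15, 200))                  # [timesteps, features]
#   out = HierarchicalAttentionNetwork(100)(inp)
#   Model(inp, out).output_shape                  # -> (None, 200)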
# html-tag stripper (kept from the original; BeautifulSoup is used below instead)
def remove_html(str_a):
    p = re.compile(r'<.*?>')
    return p.sub('', str_a)


# strip all characters outside the ASCII range \x00-\x7F (also currently unused)
def replace_non_ascii(str_a):
    return re.sub(r'[^\x00-\x7f]', r'', str_a)
# Tokenization/string cleaning for dataset
def clean_str(string):
string = re.sub(r"\\", "", string)
string = re.sub(r"\'", "", string)
string = re.sub(r"\"", "", string)
return string.strip().lower()
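
# e.g. clean_str('He said "GREAT!" ') -> 'he said great!'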
input_data = pd.read_csv('labeledTrainData.tsv', sep='\t')
for idx in range(input_data.review.shape[0]):
    soup = BeautifulSoup(input_data.review[idx], features="html5lib")
    text = clean_str(soup.get_text().encode('ascii', 'ignore').decode('ascii'))
texts.append(text)
sentences = tokenize.sent_tokenize(text)
reviews.append(sentences)
labels.append(input_data.sentiment[idx])
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
data = np.zeros((len(texts), max_sentences, maxlen), dtype='int32')
for i, sentences in enumerate(reviews):
for j, sent in enumerate(sentences):
if j < max_sentences:
wordTokens = text_to_word_sequence(sent)
k = 0
            for word in wordTokens:
                if k < maxlen and tokenizer.word_index[word] < max_words:
                    data[i, j, k] = tokenizer.word_index[word]
                    k += 1
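
# data[i, j, k] now holds the vocabulary index of the k-th word in the j-th
# sentence of review i; shorter sentences and reviews stay zero-padded.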
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
labels = to_categorical(np.asarray(labels))
print('Shape of reviews (data) tensor:', data.shape)
print('Shape of sentiment (label) tensor:', labels.shape)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(validation_split * data.shape[0])
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
print('Number of positive and negative reviews in training and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))
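# load pre-trained GloVe vectors: each line of glove.6B.100d.txt is a word
# followed by its 100 float components, whitespace-separated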
with open(os.path.join(glove_dir, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Total %s word vectors.' % len(embeddings_index))
# building the Hierarchical Attention Network
embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words without a GloVe vector keep their random initialisation
        embedding_matrix[i] = embedding_vector
embedding_layer = Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix],
input_length=maxlen, trainable=True, mask_zero=True)
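# mask_zero=True reserves index 0 for padding (matching the zero-initialised
# data tensor above); the mask is propagated through the GRU and honoured by
# the attention layer's compute_mask/call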
sentence_input = Input(shape=(maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
lstm_word = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
attn_word = HierarchicalAttentionNetwork(100)(lstm_word)
sentenceEncoder = Model(sentence_input, attn_word)
review_input = Input(shape=(max_sentences, maxlen), dtype='int32')
review_encoder = TimeDistributed(sentenceEncoder)(review_input)
lstm_sentence = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
attn_sentence = HierarchicalAttentionNetwork(100)(lstm_sentence)
preds = Dense(2, activation='softmax')(attn_sentence)
model = Model(review_input, preds)
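# Shape flow per review (batch dimension omitted), given the constants above:
#   review_input:   (15, 100) word indices
#   review_encoder: (15, 200) one BiGRU+attention sentence vector per sentence
#   lstm_sentence:  (15, 200) sentence vectors in document context
#   attn_sentence:  (200,)    attention-pooled document vector
#   preds:          (2,)      softmax over the two sentiment classes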
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
print("model fitting - Hierachical attention network")
model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=10, batch_size=100)
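
# After training, the held-out split can be scored directly, e.g. (sketch):
#   probs = model.predict(x_val, batch_size=100)  # shape: (n_val, 2)
#   classes = probs.argmax(axis=-1)               # 0/1 sentiment per review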