diff --git a/README.md b/README.md index 1dfc858..1c80c59 100644 --- a/README.md +++ b/README.md @@ -2,18 +2,37 @@ [![license](https://img.shields.io/badge/License-Apache_2.0-brightgreen.svg)](https://github.com/philipperemy/keras-attention-mechanism/blob/master/LICENSE) [![dep1](https://img.shields.io/badge/Tensorflow-2.0+-brightgreen.svg)](https://www.tensorflow.org/) [![dep2](https://img.shields.io/badge/Keras-2.0+-brightgreen.svg)](https://keras.io/) ![Simple Keras Attention CI](https://github.com/philipperemy/keras-attention-mechanism/workflows/Simple%20Keras%20Attention%20CI/badge.svg) -``` -pip install attention -``` - Many-to-one attention mechanism for Keras.
- +
+ +Install via pip: + +```bash +pip install attention +``` + +Import it in your source code: + +```python +from attention import Attention + +# [...] + +m = Sequential([ + LSTM(128, input_shape=(seq_length, 1), return_sequences=True), + Attention(name='attention_weight'), # <--------- here. + Dense(1, activation='linear') +]) +``` + ## Examples +Install the requirements before running the examples: `pip install -r requirements.txt`. + ### IMDB Dataset In this experiment, we demonstrate that using attention yields a higher accuracy on the IMDB dataset. We consider two @@ -46,6 +65,18 @@ task and the attention map converges to the ground truth. +### Finding the maximum of a sequence + +We consider a set of 1D sequences, all of the same length. The task is to find the maximum of each sequence. + +The full sequence, processed by the RNN layer, is fed to the attention layer. We expect the attention layer to focus on the maximum of each sequence. + +After a few epochs, the attention layer converges to exactly what we expected. + +[figure: attention map for the find-max example (examples/readme/example.png)] +
+ ## References - https://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf diff --git a/attention/__init__.py b/attention/__init__.py index d68a4cb..8fcf2e0 100644 --- a/attention/__init__.py +++ b/attention/__init__.py @@ -1 +1,3 @@ -from attention.attention import attention_3d_block # noqa +from attention.attention import Attention # noqa + +VERSION = '3.0' diff --git a/attention/attention.py b/attention/attention.py index a0fd894..289ddd2 100644 --- a/attention/attention.py +++ b/attention/attention.py @@ -1,26 +1,32 @@ from tensorflow.keras.layers import Dense, Lambda, dot, Activation, concatenate +from tensorflow.keras.layers import Layer -def attention_3d_block(hidden_states): - """ - Many-to-one attention mechanism for Keras. - @param hidden_states: 3D tensor with shape (batch_size, time_steps, input_dim). - @return: 2D tensor with shape (batch_size, 128) - @author: felixhao28. - """ - hidden_size = int(hidden_states.shape[2]) - # Inside dense layer - # hidden_states dot W => score_first_part - # (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size) - # W is the trainable weight matrix of attention Luong's multiplicative style score - score_first_part = Dense(hidden_size, use_bias=False, name='attention_score_vec')(hidden_states) - # score_first_part dot last_hidden_state => attention_weights - # (batch_size, time_steps, hidden_size) dot (batch_size, hidden_size) => (batch_size, time_steps) - h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), name='last_hidden_state')(hidden_states) - score = dot([score_first_part, h_t], [2, 1], name='attention_score') - attention_weights = Activation('softmax', name='attention_weight')(score) - # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size) - context_vector = dot([hidden_states, attention_weights], [1, 1], name='context_vector') - pre_activation = concatenate([context_vector, h_t], name='attention_output') - attention_vector = Dense(128, use_bias=False, activation='tanh', name='attention_vector')(pre_activation) - return attention_vector +class Attention(Layer): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def __call__(self, hidden_states): + """ + Many-to-one attention mechanism for Keras. + @param hidden_states: 3D tensor with shape (batch_size, time_steps, input_dim). + @return: 2D tensor with shape (batch_size, 128) + @author: felixhao28. 
+ """ + hidden_size = int(hidden_states.shape[2]) + # Inside dense layer + # hidden_states dot W => score_first_part + # (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size) + # W is the trainable weight matrix of attention Luong's multiplicative style score + score_first_part = Dense(hidden_size, use_bias=False, name='attention_score_vec')(hidden_states) + # score_first_part dot last_hidden_state => attention_weights + # (batch_size, time_steps, hidden_size) dot (batch_size, hidden_size) => (batch_size, time_steps) + h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), name='last_hidden_state')(hidden_states) + score = dot([score_first_part, h_t], [2, 1], name='attention_score') + attention_weights = Activation('softmax', name='attention_weight')(score) + # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size) + context_vector = dot([hidden_states, attention_weights], [1, 1], name='context_vector') + pre_activation = concatenate([context_vector, h_t], name='attention_output') + attention_vector = Dense(128, use_bias=False, activation='tanh', name='attention_vector')(pre_activation) + return attention_vector diff --git a/examples/example-attention.py b/examples/example-attention.py index 395e40e..76fe68b 100644 --- a/examples/example-attention.py +++ b/examples/example-attention.py @@ -5,14 +5,11 @@ import numpy import numpy as np from keract import get_activations -from tensorflow.keras import Input -from tensorflow.keras import Model +from tensorflow.keras import Sequential from tensorflow.keras.callbacks import Callback -from tensorflow.keras.layers import Dense -from tensorflow.keras.layers import Dropout -from tensorflow.keras.layers import LSTM +from tensorflow.keras.layers import Dense, Dropout, LSTM -from attention import attention_3d_block +from attention import Attention def task_add_two_numbers_after_delimiter(n: int, seq_length: int, delimiter: float = 0.0, @@ -59,14 +56,13 @@ def main(): x_test_mask[:, test_index_1:test_index_1 + 1] = 1 x_test_mask[:, test_index_2:test_index_2 + 1] = 1 - # model - i = Input(shape=(seq_length, 1)) - x = LSTM(100, return_sequences=True)(i) - x = attention_3d_block(x) - x = Dropout(0.2)(x) - x = Dense(1, activation='linear')(x) + model = Sequential([ + LSTM(100, input_shape=(seq_length, 1), return_sequences=True), + Attention(name='attention_weight'), + Dropout(0.2), + Dense(1, activation='linear') + ]) - model = Model(inputs=[i], outputs=[x]) model.compile(loss='mse', optimizer='adam') print(model.summary()) @@ -79,7 +75,7 @@ def main(): class VisualiseAttentionMap(Callback): def on_epoch_end(self, epoch, logs=None): - attention_map = get_activations(model, x_test, layer_name='attention_weight')['attention_weight'] + attention_map = get_activations(model, x_test, layer_names='attention_weight')['attention_weight'] # top is attention map. # bottom is ground truth. 
diff --git a/examples/find_max.py b/examples/find_max.py new file mode 100644 index 0000000..4ee7310 --- /dev/null +++ b/examples/find_max.py @@ -0,0 +1,64 @@ +import matplotlib.pyplot as plt +import numpy as np +from keract import get_activations +from tensorflow.keras import Sequential +from tensorflow.keras.callbacks import Callback +from tensorflow.keras.layers import Dense, LSTM + +from attention import Attention + + +class VisualizeAttentionMap(Callback): + + def __init__(self, model, x): + super().__init__() + self.model = model + self.x = x + + def on_epoch_begin(self, epoch, logs=None): + attention_map = get_activations(self.model, self.x, layer_names='attention_weight')['attention_weight'] + x = self.x[..., 0] + fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(5, 6)) + maps = [attention_map, create_argmax_mask(attention_map), create_argmax_mask(x)] + maps_names = ['attention layer', 'attention layer - argmax()', 'ground truth - argmax()'] + for i, ax in enumerate(axes.flat): + im = ax.imshow(maps[i], interpolation='none', cmap='jet') + ax.set_ylabel(maps_names[i] + '\n#sample axis') + ax.set_xlabel('sequence axis') + ax.xaxis.set_ticks([]) + ax.yaxis.set_ticks([]) + cbar_ax = fig.add_axes([0.75, 0.15, 0.05, 0.7]) + fig.colorbar(im, cax=cbar_ax) + fig.suptitle(f'Epoch {epoch} - training') + plt.show() + + +def create_argmax_mask(x): + mask = np.zeros_like(x) + for i, m in enumerate(x.argmax(axis=1)): + mask[i, m] = 1 + return mask + + +def main(): + seq_length = 10 + num_samples = 100000 + # https://stats.stackexchange.com/questions/485784/which-distribution-has-its-maximum-uniformly-distributed + # Choose beta(1/N,1) to have max(X_1,...,X_n) ~ U(0, 1) => minimizes amount of knowledge. + # If all the max(s) are concentrated around 1, then it makes the task easy for the model. + x_data = np.random.beta(a=1 / seq_length, b=1, size=(num_samples, seq_length, 1)) + y_data = np.max(x_data, axis=1) + model = Sequential([ + LSTM(128, input_shape=(seq_length, 1), return_sequences=True), + Attention(name='attention_weight'), + Dense(1, activation='linear') + ]) + model.compile(loss='mae') + max_epoch = 100 + # visualize the attention on the first samples. 
+ visualize = VisualizeAttentionMap(model, x_data[0:12]) + model.fit(x_data, y_data, epochs=max_epoch, validation_split=0.2, callbacks=[visualize]) + + +if __name__ == '__main__': + main() diff --git a/examples/imdb.py b/examples/imdb.py index 0392723..6c5a45e 100644 --- a/examples/imdb.py +++ b/examples/imdb.py @@ -1,42 +1,37 @@ import numpy import numpy as np -from tensorflow.keras import Input -from tensorflow.keras import Model +from tensorflow.keras import Sequential from tensorflow.keras.callbacks import Callback from tensorflow.keras.datasets import imdb -from tensorflow.keras.layers import Dense -from tensorflow.keras.layers import Dropout -from tensorflow.keras.layers import Embedding -from tensorflow.keras.layers import LSTM +from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM from tensorflow.keras.preprocessing import sequence -from attention import attention_3d_block +from attention import Attention def train_and_evaluate_model_on_imdb(add_attention=True): numpy.random.seed(7) # load the dataset but only keep the top n words, zero the rest top_words = 5000 - (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words) + (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=top_words) # truncate and pad input sequences max_review_length = 500 - X_train = sequence.pad_sequences(X_train, maxlen=max_review_length) - X_test = sequence.pad_sequences(X_test, maxlen=max_review_length) + x_train = sequence.pad_sequences(x_train, maxlen=max_review_length) + x_test = sequence.pad_sequences(x_test, maxlen=max_review_length) # create the model embedding_vector_length = 32 - i = Input(shape=(max_review_length,)) - x = Embedding(top_words, embedding_vector_length, input_length=max_review_length)(i) - x = Dropout(0.5)(x) - if add_attention: - x = LSTM(100, return_sequences=True)(x) - x = attention_3d_block(x) - else: - x = LSTM(100, return_sequences=False)(x) - x = Dense(350, activation='relu')(x) # same number of parameters so fair comparison. - x = Dropout(0.5)(x) - x = Dense(1, activation='sigmoid')(x) - model = Model(inputs=[i], outputs=[x]) + model = Sequential([ + Embedding(top_words, embedding_vector_length, input_length=max_review_length), + Dropout(0.5), + # attention vs no attention. same number of parameters so fair comparison. 
+ *([LSTM(100, return_sequences=True), Attention()] if add_attention + else [LSTM(100), Dense(350, activation='relu')]), + Dropout(0.5), + Dense(1, activation='sigmoid') + ] + ) + model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) print(model.summary()) @@ -52,7 +47,7 @@ def on_epoch_end(self, epoch, logs=None): self.val_losses.append(logs['val_loss']) rbta = RecordBestTestAccuracy() - model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64, callbacks=[rbta]) + model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size=64, callbacks=[rbta]) print(f"Max Test Accuracy: {100 * np.max(rbta.val_accuracies):.2f} %") print(f"Mean Test Accuracy: {100 * np.mean(rbta.val_accuracies):.2f} %") diff --git a/examples/readme/example.png b/examples/readme/example.png new file mode 100644 index 0000000..87b3404 Binary files /dev/null and b/examples/readme/example.png differ diff --git a/setup.py b/setup.py index a7a9332..b9e9d60 100644 --- a/setup.py +++ b/setup.py @@ -1,18 +1,19 @@ from setuptools import setup +from attention import VERSION + setup( name='attention', - version='2.2', - description='Keras Attention Many to One', + version=VERSION, + description='Keras Simple Attention', author='Philippe Remy', license='Apache 2.0', long_description_content_type='text/markdown', long_description=open('README.md').read(), packages=['attention'], - # manually install tensorflow or tensorflow-gpu install_requires=[ 'numpy>=1.18.1', 'keras>=2.3.1', - 'gast>=0.2.2' + 'tensorflow>=2.1' ] )
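A side note on `examples/find_max.py`: it draws each element from Beta(1/seq_length, 1) so that the maximum of a sequence is approximately uniform on (0, 1), following the linked Cross Validated thread; otherwise the targets would cluster near 1 and make the task too easy. A quick standalone sanity check of that claim, a sketch assuming only NumPy and not part of the package, could look like this:

```python
import numpy as np

seq_length = 10
num_samples = 100_000

# Same sampling scheme as examples/find_max.py.
x = np.random.beta(a=1 / seq_length, b=1, size=(num_samples, seq_length))
maxima = x.max(axis=1)

# If max(X_1, ..., X_n) ~ U(0, 1), then P(max <= t) = t, so the empirical
# frequencies below should sit close to the thresholds themselves.
for t in (0.1, 0.25, 0.5, 0.75, 0.9):
    print(t, np.mean(maxima <= t))
```

A Beta(1/n, 1) variable has CDF x^(1/n) on [0, 1], so the maximum of n independent draws has CDF (x^(1/n))^n = x, which is exactly the uniform distribution.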