transformer.py
#! /usr/bin/python3
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
class PositionalEncoding(layers.Layer):
    """
    PositionalEncoding layer applies an input embedding and positional encoding to the initial inputs of the Transformer
    *Extends tensorflow.keras.layers.Layer

    Attributes
    ----------
    sequence_length: int
        Length of the input sequence. Also used as the vocabulary size of the position embedding,
        since we are encoding the position data of the sequence, which is of fixed size.
    projection_dim: int
        Dimensionality of the dense projection output

    Methods
    -------
    call(inputs: tensor of shape (None, sequence_length, input feature dim)):
        Applies the layer to inputs and returns a tensor of shape (None, sequence_length, projection_dim)
    compute_output_shape(input_shape):
        Returns the output shape of the layer
    """
    def __init__(self, sequence_length, projection_dim, **kwargs):
        super(PositionalEncoding, self).__init__(**kwargs)
        # Layer to compute positional data
        self.position_embeddings = layers.Embedding(input_dim=sequence_length, output_dim=projection_dim)
        # Linear projection layer for the input
        self.projection = layers.Dense(units=projection_dim)
        # Save initialization parameters
        self.sequence_length = sequence_length
        self.projection_dim = projection_dim

    def call(self, inputs):
        # Create a tensor of positions [0, 1, ..., sequence_length - 1]
        positions = tf.range(start=0, limit=self.sequence_length, delta=1)
        # Apply the position embedding to each position
        encoded_positions = self.position_embeddings(positions)
        # Apply the linear projection to the input
        projection = self.projection(inputs)
        # Combine the projected input with the positional information
        encoding = projection + encoded_positions
        return encoding

    def compute_output_shape(self, input_shape):
        # Batch size and sequence length are preserved; features are projected to projection_dim
        return (input_shape[0], self.sequence_length, self.projection_dim)
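
# A minimal shape-check sketch for PositionalEncoding (the helper name and the values
# below are illustrative assumptions, not part of the original module): 16-dimensional
# inputs are projected to projection_dim=32 and a learned position embedding is added
# at each of the 10 timesteps.
def _demo_positional_encoding():
    demo_inputs = tf.random.uniform((2, 10, 16))  # (batch, sequence_length, features)
    demo_layer = PositionalEncoding(sequence_length=10, projection_dim=32)
    demo_encoded = demo_layer(demo_inputs)        # expected shape: (2, 10, 32)
    print("PositionalEncoding output shape:", demo_encoded.shape)
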
class Encoder(layers.Layer):
    """
    Encoder layer applies multi-head attention to the inputs (using the inputs as Queries, Keys, and Values), as well as
    layer normalization and a dense projection, following the Transformer design of Vaswani et al. (Note: mask not implemented)
    *Extends tensorflow.keras.layers.Layer

    Attributes
    ----------
    embed_dim: int
        Dimensionality of the input embedding (conserved throughout)
    dense_dim: int
        Dimensionality of the first dense projection hidden layer
    num_heads: int
        Number of attention heads (for multi-head attention)
    activation: string
        Activation function for the dense layer (default=relu)

    Methods
    -------
    call(inputs):
        Applies the layer to inputs
    compute_output_shape(input_shape):
        Returns the output shape of the layer
    """
    def __init__(self, embed_dim, dense_dim, num_heads, activation='relu', **kwargs):
        super(Encoder, self).__init__(**kwargs)
        # Attention layer with num_heads heads and embed_dim key dimension
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim, dropout=0.25)
        # Feed-forward network with two dense layers: the first has dense_dim units, the second maps back to
        # embed_dim so that the output shape matches the input shape
        self.dense_proj = keras.Sequential([layers.Dense(dense_dim, activation=activation), layers.Dense(embed_dim)])
        # Normalization layers (one for each residual connection, as in the original Transformer)
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        # Save initialization parameters
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads

    def call(self, inputs):
        # Compute the attention output using multi-head attention (inputs serve as Q, K, and V)
        # Attention(Q, K, V) = softmax([Q dot K transpose] / sqrt(dk)) * V
        attention_matrix = self.attention(inputs, inputs, attention_mask=None)
        # Add the attention output to the inputs and normalize (residual connection)
        normalized_attention_output = self.layernorm_1(inputs + attention_matrix)
        # Apply the feed-forward network to the normalized result of the attention layer
        projection = self.dense_proj(normalized_attention_output)
        # Add the feed-forward output to its input and normalize one more time (residual connection)
        normalized_projection = self.layernorm_2(normalized_attention_output + projection)
        return normalized_projection

    def compute_output_shape(self, input_shape):
        return input_shape
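
# A minimal shape-check sketch for Encoder (the helper name and the values below are
# illustrative assumptions, not part of the original module): the encoder mixes
# information across timesteps with self-attention while preserving the
# (batch, sequence_length, embed_dim) shape of its input.
def _demo_encoder():
    demo_inputs = tf.random.uniform((2, 10, 32))  # (batch, sequence_length, embed_dim)
    demo_layer = Encoder(embed_dim=32, dense_dim=64, num_heads=4)
    demo_encoded = demo_layer(demo_inputs)        # expected shape: (2, 10, 32)
    print("Encoder output shape:", demo_encoded.shape)
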
def BuildEncoder(sequence_length, input_dim, embed_dim, dense_dim, num_heads, model_name="transformer"):
    """
    BuildEncoder builds a model that applies PositionalEncoding to the inputs and then feeds the result
    to a Transformer encoder

    Parameters
    ----------
    sequence_length: int
        length of the input sequence of the transformer
    input_dim: int
        dimensionality of the input tensor
    embed_dim: int
        output dimension of the embedding layer
    dense_dim: int
        number of hidden units in the first hidden layer of the feed-forward network in the Encoder
    num_heads: int
        number of attention heads
    model_name: string
        name of the resulting model (default="transformer") *Must be changed if using multiple transformer layers

    Returns
    -------
    model: keras.Model
        a transformer model with input layer=keras.Input of shape (sequence_length, input_dim) and output layer=Encoder
    """
    # Input of the model; the tensor will be of shape (batch_size, sequence_length, input_dim), where input_dim is the
    # dimensionality of the input data
    inputs = keras.Input(shape=(sequence_length, input_dim))
    # Embedding layer; returns an embedding with embed_dim dimensionality while maintaining sequence_length
    embedded = PositionalEncoding(sequence_length, embed_dim)(inputs)
    # Encoder layer; maintains the input shape. It has num_heads attention heads, and its feed-forward network has
    # two layers, the first with dense_dim units and the second with embed_dim units to maintain the input shape
    encoded = Encoder(embed_dim, dense_dim, num_heads)(embedded)
    # Create the model as described
    model = keras.Model(inputs, encoded, name=model_name)
    return model
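

# Hypothetical smoke test (the hyperparameter values below are illustrative assumptions,
# not taken from the original repository): run the demo helpers above, then build a small
# encoder model and push a random batch through it end to end.
if __name__ == "__main__":
    _demo_positional_encoding()
    _demo_encoder()
    demo_model = BuildEncoder(sequence_length=10, input_dim=16, embed_dim=32,
                              dense_dim=64, num_heads=4, model_name="demo_transformer")
    demo_model.summary()
    demo_batch = tf.random.uniform((2, 10, 16))  # (batch, sequence_length, input_dim)
    print("Model output shape:", demo_model(demo_batch).shape)  # expected: (2, 10, 32)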