forked from seloufian/Deep-Learning-Computer-Vision
-
Notifications
You must be signed in to change notification settings - Fork 0
/
vae.py
339 lines (278 loc) · 16.3 KB
/
vae.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
from __future__ import print_function
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import torch
import torch.utils.data
from torch import nn, optim
from torch.autograd import Variable
from torch.nn import functional as F
from torchvision import datasets, transforms
from torchvision.utils import save_image
def hello_vae():
    """Print a fixed greeting that identifies this module (vae.py)."""
    greeting = "Hello from vae.py!"
    print(greeting)
class VAE(nn.Module):
    """
    Fully-connected variational autoencoder (VAE) for single-channel images.

    The encoder flattens each input image and maps it to a hidden feature vector of size
    `hidden_dim`. Two linear heads (`mu_layer`, `logvar_layer`) turn these features into the
    mean and log-variance of the posterior over a `latent_size`-dimensional latent vector.
    The decoder maps a latent vector back to an image; its last layers are a sigmoid followed
    by an unflatten to the per-sample image shape.
    """

    def __init__(self, input_size, latent_size=15, image_shape=None):
        """
        Build the encoder, the two posterior heads and the decoder.

        Inputs:
        - input_size: Number of values in one flattened image (H*W)
        - latent_size: Dimension Z of the latent space
        - image_shape: Optional per-sample shape produced by the decoder, e.g. (1, H, W).
          When omitted, a square single-channel image (1, sqrt(H*W), sqrt(H*W)) is used, which
          gives (1, 28, 28) for input_size=784.

        Raises:
        - ValueError: If image_shape is omitted and input_size is not a perfect square, or if
          the product of image_shape differs from input_size.
        """
        super(VAE, self).__init__()
        self.input_size = input_size  # H*W
        self.latent_size = latent_size  # Z

        # Hidden dimension H_d, as shown in the course's lecture example.
        # Source: https://web.eecs.umich.edu/~justincj/slides/eecs498/498_FA2019_lecture20.pdf
        # Slide: 12/120.
        self.hidden_dim = 400

        # Shape the decoder restores for every sample. It is derived from input_size instead of
        # being fixed to (1, 28, 28), so the decoder output always matches the encoder input size.
        self.image_shape = self._resolve_image_shape(input_size, image_shape)

        # Encoder: (N, 1, H, W) -> flatten -> three Linear+ReLU layers -> (N, H_d)
        self.encoder = nn.Sequential(
            nn.Flatten(),
            nn.Linear(self.input_size, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
        )

        # Posterior heads: both map hidden features (N, H_d) to (N, Z).
        self.mu_layer = nn.Linear(self.hidden_dim, self.latent_size)
        self.logvar_layer = nn.Linear(self.hidden_dim, self.latent_size)

        # Decoder: (N, Z) -> three Linear+ReLU layers -> Linear -> Sigmoid -> (N, *image_shape)
        self.decoder = nn.Sequential(
            nn.Linear(self.latent_size, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.input_size),
            nn.Sigmoid(),
            nn.Unflatten(dim=1, unflattened_size=self.image_shape),
        )

    @staticmethod
    def _resolve_image_shape(input_size, image_shape):
        """Return the per-sample decoder output shape, checked against input_size."""
        if image_shape is None:
            side = int(round(input_size ** 0.5))
            if side * side != input_size:
                raise ValueError(
                    "input_size={} is not a perfect square; pass image_shape explicitly, "
                    "e.g. image_shape=(1, H, W)".format(input_size)
                )
            return (1, side, side)

        shape = tuple(int(dim) for dim in image_shape)
        numel = 1
        for dim in shape:
            numel *= dim
        if numel != input_size:
            raise ValueError(
                "image_shape={} holds {} values but input_size={}".format(shape, numel, input_size)
            )
        return shape

    def forward(self, x):
        """
        Performs forward pass through FC-VAE model by passing image through
        encoder, reparametrize trick, and decoder models

        Inputs:
        - x: Batch of input images of shape (N, 1, H, W)

        Returns:
        - x_hat: Reconstructed input data of shape (N, 1, H, W)
        - mu: Matrix representing estimated posterior mu (N, Z), with Z latent space dimension
        - logvar: Matrix representing estimated variance in log-space (N, Z), with Z latent space dimension
        """
        # (1) Encode the images into hidden features of shape (N, H_d), then project them to
        #     the posterior mean and log-variance, both of shape (N, Z).
        hidden = self.encoder(x)
        mu = self.mu_layer(hidden)
        logvar = self.logvar_layer(hidden)

        # (2) Reparametrize to compute the latent vector z of shape (N, Z).
        z = reparametrize(mu, logvar)

        # (3) Decode z to reconstruct x.
        x_hat = self.decoder(z)
        return x_hat, mu, logvar
class CVAE(nn.Module):
    """
    Fully-connected conditional variational autoencoder (CVAE) for single-channel images.

    Works like a plain VAE, except that a one-hot class vector of length `num_classes` is
    concatenated to the flattened image before the encoder and to the latent vector before the
    decoder. The concatenations are done in `forward`, so `encoder` expects inputs of shape
    (N, H*W + C) and `decoder` expects inputs of shape (N, Z + C).
    """

    def __init__(self, input_size, num_classes=10, latent_size=15, image_shape=None):
        """
        Build the encoder, the two posterior heads and the decoder.

        Inputs:
        - input_size: Number of values in one flattened image (H*W)
        - num_classes: Length C of the one-hot class vector
        - latent_size: Dimension Z of the latent space
        - image_shape: Optional per-sample shape produced by the decoder, e.g. (1, H, W).
          When omitted, a square single-channel image (1, sqrt(H*W), sqrt(H*W)) is used, which
          gives (1, 28, 28) for input_size=784.

        Raises:
        - ValueError: If image_shape is omitted and input_size is not a perfect square, or if
          the product of image_shape differs from input_size.
        """
        super(CVAE, self).__init__()
        self.input_size = input_size  # H*W
        self.latent_size = latent_size  # Z
        self.num_classes = num_classes  # C

        # Hidden dimension H_d, as shown in the course's lecture example.
        # Source: https://web.eecs.umich.edu/~justincj/slides/eecs498/498_FA2019_lecture20.pdf
        # Slide: 12/120.
        self.hidden_dim = 400

        # Shape the decoder restores for every sample. It is derived from input_size instead of
        # being fixed to (1, 28, 28), so the decoder output always matches the image size.
        self.image_shape = self._resolve_image_shape(input_size, image_shape)

        # Encoder: (N, H*W + C) -> three Linear+ReLU layers -> (N, H_d)
        self.encoder = nn.Sequential(
            nn.Linear(self.input_size + self.num_classes, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
        )

        # Posterior heads: both map hidden features (N, H_d) to (N, Z).
        self.mu_layer = nn.Linear(self.hidden_dim, self.latent_size)
        self.logvar_layer = nn.Linear(self.hidden_dim, self.latent_size)

        # Decoder: (N, Z + C) -> three Linear+ReLU layers -> Linear -> Sigmoid -> (N, *image_shape)
        self.decoder = nn.Sequential(
            nn.Linear(self.latent_size + self.num_classes, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.input_size),
            nn.Sigmoid(),
            nn.Unflatten(dim=1, unflattened_size=self.image_shape),
        )

    @staticmethod
    def _resolve_image_shape(input_size, image_shape):
        """Return the per-sample decoder output shape, checked against input_size."""
        if image_shape is None:
            side = int(round(input_size ** 0.5))
            if side * side != input_size:
                raise ValueError(
                    "input_size={} is not a perfect square; pass image_shape explicitly, "
                    "e.g. image_shape=(1, H, W)".format(input_size)
                )
            return (1, side, side)

        shape = tuple(int(dim) for dim in image_shape)
        numel = 1
        for dim in shape:
            numel *= dim
        if numel != input_size:
            raise ValueError(
                "image_shape={} holds {} values but input_size={}".format(shape, numel, input_size)
            )
        return shape

    def forward(self, x, c):
        """
        Performs forward pass through FC-CVAE model by passing image through
        encoder, reparametrize trick, and decoder models

        Inputs:
        - x: Input data for this timestep of shape (N, 1, H, W)
        - c: One hot vector representing the input class (0-9) (N, C)

        Returns:
        - x_hat: Reconstructed input data of shape (N, 1, H, W)
        - mu: Matrix representing estimated posterior mu (N, Z), with Z latent space dimension
        - logvar: Matrix representing estimated variance in log-space (N, Z), with Z latent space dimension
        """
        # (1) Flatten every image to (N, H*W) and append its class vector: (N, H*W + C).
        flat_images = torch.flatten(x, start_dim=1, end_dim=-1)
        encoder_input = torch.cat((flat_images, c), dim=1)

        # Encode to hidden features (N, H_d), then project them to the posterior mean and
        # log-variance, both of shape (N, Z).
        hidden = self.encoder(encoder_input)
        mu = self.mu_layer(hidden)
        logvar = self.logvar_layer(hidden)

        # (2) Reparametrize to compute the latent vector z of shape (N, Z).
        z = reparametrize(mu, logvar)

        # (3) Append the class vector to z, giving (N, Z + C), and decode to reconstruct x.
        decoder_input = torch.cat((z, c), dim=1)
        x_hat = self.decoder(decoder_input)
        return x_hat, mu, logvar
def reparametrize(mu, logvar):
    """
    Differentiably sample random Gaussian data with specified mean and variance using the
    reparameterization trick.

    Suppose we want to sample a random number z from a Gaussian distribution with mean mu and
    standard deviation sigma, such that we can backpropagate from the z back to mu and sigma.
    We can achieve this by first sampling a random value epsilon from a standard Gaussian
    distribution with zero mean and unit variance, then setting z = sigma * epsilon + mu.

    For more stable training when integrating this function into a neural network, it helps to
    pass this function the log of the variance of the distribution from which to sample, rather
    than specifying the standard deviation directly.

    Inputs:
    - mu: Tensor of shape (N, Z) giving means
    - logvar: Tensor of shape (N, Z) giving log-variances

    Returns:
    - z: Estimated latent vectors, where z[i, j] is a random value sampled from a Gaussian with
         mean mu[i, j] and log-variance logvar[i, j].
    """
    # sigma = sqrt(exp(logvar)) = exp(logvar / 2). The second form is used on purpose:
    # - exp(logvar) underflows to 0 for very negative logvar, and the backward pass of sqrt at 0
    #   multiplies inf by 0, which yields NaN gradients;
    # - exp(logvar) overflows to inf for large logvar long before sigma itself would.
    sigma = torch.exp(0.5 * logvar)

    # Epsilon holds samples from a standard normal distribution (mean 0, std 1), with the same
    # shape, dtype and device as mu.
    epsilon = torch.randn_like(mu)

    return sigma * epsilon + mu
def loss_function(x_hat, x, mu, logvar):
    """
    Computes the negative variational lower bound loss term of the VAE (refer to formulation in notebook).

    Inputs:
    - x_hat: Reconstructed input data of shape (N, 1, H, W)
    - x: Input data for this timestep of shape (N, 1, H, W)
    - mu: Matrix representing estimated posterior mu (N, Z), with Z latent space dimension
    - logvar: Matrix representing estimated variance in log-space (N, Z), with Z latent space dimension

    Returns:
    - loss: Tensor containing the scalar loss for the negative variational lowerbound
    """
    # Minibatch size, taken from the leading dimension of mu.
    batch_size = mu.shape[0]

    # Reconstruction term: binary cross-entropy between the reconstruction (input) and the
    # original data (target), summed over every element instead of the default mean.
    # The minus sign of the log-likelihood is already part of the BCE loss.
    reconstruction = nn.functional.binary_cross_entropy(x_hat, x, reduction='sum')

    # KL divergence term: -0.5 * sum(1 + logvar - mu^2 - exp(logvar)), summed over all elements.
    kl_inner = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_divergence = -0.5 * torch.sum(kl_inner)

    # Add both terms and average across the samples of the minibatch.
    return (reconstruction + kl_divergence) / batch_size