-
Notifications
You must be signed in to change notification settings - Fork 0
/
a3_mod.py
92 lines (82 loc) · 4.25 KB
/
a3_mod.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import autograd.numpy as np
from autograd import value_and_grad
def forward_pass(W1, W2, W3, b1, b2, b3, x):
    """
    Forward pass for a fully connected neural network with 2 hidden layers
    of M neurons each and a log-softmax output layer.
    Inputs:
        W1 : (M, 784) weights of first (hidden) layer
        W2 : (M, M) weights of second (hidden) layer
        W3 : (10, M) weights of third (output) layer
        b1 : (M, 1) biases of first (hidden) layer
        b2 : (M, 1) biases of second (hidden) layer
        b3 : (10, 1) biases of third (output) layer
        x : (N, 784) training inputs
    Outputs:
        Fhat : (N, 10) log class probabilities at the training inputs
               (each row is a log-softmax, so np.exp(Fhat) sums to 1 per row)
    """
    H1 = np.maximum(0, np.dot(x, W1.T) + b1.T)  # layer 1 neurons with ReLU activation, shape (N, M)
    H2 = np.maximum(0, np.dot(H1, W2.T) + b2.T)  # layer 2 neurons with ReLU activation, shape (N, M)
    Fhat = np.dot(H2, W3.T) + b3.T  # layer 3 (output) logits, linear activation, shape (N, 10)
    # Numerically stable log-softmax: subtract the per-row max before
    # exponentiating so np.exp never overflows, then subtract log-sum-exp.
    # keepdims=True keeps the reduced axis so the result broadcasts across
    # each row (equivalent to, but cheaper than, tiling with np.ones).
    Fhat = Fhat - np.max(Fhat, axis=1, keepdims=True)
    Fhat = Fhat - np.log(np.sum(np.exp(Fhat), axis=1, keepdims=True))
    return Fhat
def negative_log_likelihood(W1, W2, W3, b1, b2, b3, x, y):
    """
    Computes the negative log likelihood of the model `forward_pass` under a
    categorical (generalized Bernoulli) likelihood.
    Inputs:
        W1, W2, W3, b1, b2, b3, x : same as `forward_pass`
        y : (N, 10) training responses, one-hot encoded
    Outputs:
        nll : negative log likelihood (scalar)
    """
    Fhat = forward_pass(W1, W2, W3, b1, b2, b3, x)
    # Fhat holds log class probabilities; multiplying by the one-hot y and
    # summing selects the log probability of each point's true class, so the
    # total is the log likelihood of the data — negate it for the nll.
    return -np.sum(Fhat * y)
# Autograd differentiates `negative_log_likelihood` with respect to argument
# positions 0-5 (all weights and biases); `argnum` as a list makes the
# returned gradient a tuple with one entry per listed argument.
nll_gradients = value_and_grad(negative_log_likelihood, argnum=[0,1,2,3,4,5])
"""
returns the output of `negative_log_likelihood` as well as the gradient of the
output with respect to all weights and biases
Inputs:
same as negative_log_likelihood (W1, W2, W3, b1, b2, b3, x, y)
Outputs: (nll, (W1_grad, W2_grad, W3_grad, b1_grad, b2_grad, b3_grad))
nll : output of `negative_log_likelihood`
W1_grad : (M, 784) gradient of the nll with respect to the weights of first (hidden) layer
W2_grad : (M, M) gradient of the nll with respect to the weights of second (hidden) layer
W3_grad : (10, M) gradient of the nll with respect to the weights of third (output) layer
b1_grad : (M, 1) gradient of the nll with respect to the biases of first (hidden) layer
b2_grad : (M, 1) gradient of the nll with respect to the biases of second (hidden) layer
b3_grad : (10, 1) gradient of the nll with respect to the biases of third (output) layer
"""
def run_example():
    """
    Demonstrates computing the negative log likelihood (nll) and its gradient
    with respect to every weight and bias of the network, using 50 neurons per
    hidden layer and all parameters initialized to zero.
    """
    # load the MNIST_small dataset
    from data_utils import load_dataset
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('mnist_small')

    # network sizes: 784 inputs, two hidden layers of 50, 10 outputs
    n_hidden = 50
    n_in, n_out = 784, 10

    # every weight matrix and bias vector starts at zero
    W1, b1 = np.zeros((n_hidden, n_in)), np.zeros((n_hidden, 1))
    W2, b2 = np.zeros((n_hidden, n_hidden)), np.zeros((n_hidden, 1))
    W3, b3 = np.zeros((n_out, n_hidden)), np.zeros((n_out, 1))

    # evaluate the nll and its gradients on the first 250 training points
    nll, grads = nll_gradients(W1, W2, W3, b1, b2, b3, x_train[:250], y_train[:250])
    W1_grad, W2_grad, W3_grad, b1_grad, b2_grad, b3_grad = grads
    print("negative log likelihood: %.5f" % nll)