mlp_numpy.py
#!/usr/bin/env python3
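# Character-level MLP language model in pure NumPy (no autograd): each example
# is a context of context_len previous characters and the target is the next
# character; the backward pass is written out by hand in train() below.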
import numpy as np
import matplotlib.pyplot as plt
with open('../data/names.txt', 'r') as f:
    data = f.read().splitlines()
stoi = {chr(ord('a') + i):i+1 for i in range(26)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
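# Vocabulary: 27 tokens. '.' is token 0 and doubles as the start-of-context
# padding and the end-of-name marker; 'a'..'z' map to 1..26.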
rng = np.random.default_rng()
batch_size = 32
vocab_len = len(stoi)
embed_dim = 30
context_len = 3
hidden_dim = 200
C = rng.random((vocab_len, embed_dim)) * 0.01
H = rng.random((context_len * embed_dim, hidden_dim)) * 0.01
d = np.zeros(hidden_dim)
U = rng.random((hidden_dim, vocab_len)) * 0.01
b = np.zeros(vocab_len)
params = [C, H, d, U, b]
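# Parameter shapes:
#   C: (vocab_len, embed_dim)                  character embedding table
#   H: (context_len * embed_dim, hidden_dim)   hidden-layer weights, bias d
#   U: (hidden_dim, vocab_len)                 output-layer weights, bias b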
def softmax(y):
    # subtract the row-wise max before exponentiating for numerical stability
    e_y = np.exp(y - y.max(1, keepdims=True))
    return e_y / e_y.sum(1, keepdims=True)
def cross_entropy(y_pred, y_true):
    return -np.log(y_pred)[range(batch_size), y_true].mean()
def cross_entropy_grad(y_pred, y_true):
    grad = y_pred.copy()  # copy so the caller's probabilities are not modified in place
    # NOTE this indexing operation does the same as the following loop
    #   for i, j in zip(range(batch_size), y_true): grad[i][j] -= 1
    # i.e. it subtracts 1 from the entry for the correct class in each row of the batch
    grad[range(batch_size), y_true] -= 1
    grad /= batch_size
    return grad
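# Forward pass: look up the embedding of each context character, concatenate
# the context_len embeddings, pass them through a tanh hidden layer, then a
# linear layer and softmax to get a distribution over the next character.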
def forward(X):
    embed = C[X]
    concat = embed.reshape((-1, H.shape[0]))
    hidden = np.tanh(concat @ H + d)
    logits = hidden @ U + b
    probs = softmax(logits)
    return probs
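# One training step: forward pass on a minibatch, manual backpropagation of
# the cross-entropy loss through every layer, then a plain SGD update of the
# parameters in place.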
def train(X, Y, lr):
    # forward pass
    global C, H, d, U, b
    embed = C[X]
    concat = embed.reshape((-1, H.shape[0]))
    prehidden = concat @ H + d
    hidden = np.tanh(prehidden)
    logits = hidden @ U + b
    s = softmax(logits)
    loss = cross_entropy(s, Y)
    # backward pass: each dloss_wrt_* variable is the gradient of the loss with
    # respect to the named quantity, propagated via the chain rule
    dloss_wrt_logits = cross_entropy_grad(s, Y)
    dloss_wrt_b = dloss_wrt_logits.sum(0)
    dloss_wrt_U = hidden.T @ dloss_wrt_logits
    dloss_wrt_hidden = dloss_wrt_logits @ U.T
    dloss_wrt_prehidden = (1 - hidden**2) * dloss_wrt_hidden  # tanh'(x) = 1 - tanh(x)^2
    dloss_wrt_d = dloss_wrt_prehidden.sum(0)
    dloss_wrt_H = concat.T @ dloss_wrt_prehidden
    dloss_wrt_concat = dloss_wrt_prehidden @ H.T
    dloss_wrt_embed = dloss_wrt_concat.reshape(embed.shape)
    # scatter-add the embedding gradients back into the rows of C that were used
    dloss_wrt_C = np.zeros(C.shape)
    for k in range(X.shape[0]):
        for j in range(X.shape[1]):
            ix = X[k, j]
            dloss_wrt_C[ix] += dloss_wrt_embed[k, j]
    # update (in place, so the global parameter arrays are modified)
    grads = [dloss_wrt_C, dloss_wrt_H, dloss_wrt_d, dloss_wrt_U, dloss_wrt_b]
    for p, g in zip(params, grads):
        p -= g * lr
    return (loss, grads)
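# Build (context, next character) pairs from a list of words using a sliding
# window of length n, padded at the start with token 0 ('.').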
def build_dataset(words, n):
    X, Y = [], []
    for w in words:
        context = [0] * n
        for c in w + '.':
            ix = stoi[c]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]  # crop and append
    X = np.array(X)
    Y = np.array(Y)
    return X, Y
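# Shuffle the names and split them 80/10/10 into train/validation/test sets.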
rng.shuffle(data)
n1 = int(0.8*len(data))
n2 = int(0.9*len(data))
Xtrain, Ytrain = build_dataset(data[:n1], context_len)
Xvalidate, Yvalidate = build_dataset(data[n1:n2], context_len)
Xtest, Ytest = build_dataset(data[n2:], context_len)
assert batch_size <= Xtrain.shape[0]
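# Minibatch SGD: sample a random batch each step; the learning rate drops from
# 0.1 to 0.001 halfway through training.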
MAX_STEPS = 200000
lossi = []
for i in range(MAX_STEPS):
    ix = rng.integers(0, Xtrain.shape[0], (batch_size,))
    Xbatch, Ybatch = Xtrain[ix], Ytrain[ix]  # batch X,Y
    loss, grads = train(Xbatch, Ybatch, 0.1 if i < (MAX_STEPS >> 1) else 0.001)
    if i % 10000 == 0:  # print every once in a while
        print(f'{i:7d}/{MAX_STEPS:7d}: {loss:.4f}')
    lossi.append(np.log10(loss))
plt.plot(lossi)
plt.savefig('loss_numpy.png')
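# Sample 20 names from the trained model: start from an all-'.' context, draw
# the next character from the predicted distribution, and stop at token 0.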
for _ in range(20):
    out = []
    context = [0] * context_len
    while True:
        probs = forward(context)
        ix = rng.multinomial(1, probs, size=1).argmax()
        context = context[1:] + [ix]
        out.append(ix)
        if ix == 0:
            break
    print(''.join(itos[i] for i in out))