-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathsolutions.py
925 lines (719 loc) · 31.6 KB
/
solutions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
# %%
# Need this for some of the helper functions
import torch as t
# Some nice preliminary functions for testing.
def assert_with_expect(expected, actual):
    """
    Check that `actual` equals `expected`, raising an AssertionError that shows
    both values otherwise.

    The comparison is `expected == actual`, so the result of that expression
    must be usable as a boolean.
    """
    # Raise explicitly instead of using an `assert` statement so that the check
    # still runs under `python -O`, matching the other assertion helpers here.
    if not (expected == actual):
        raise AssertionError(f"Expected: {expected} Actual: {actual}")
def assert_list_of_floats_within_epsilon(
    expected: list[float],
    actual: list[float],
    eps=0.0001,
):
    """
    Raise an AssertionError unless both lists have the same length and every
    pair of corresponding entries differs by strictly less than `eps`.
    """
    same_length = len(expected) == len(actual)
    # `all` stops at the first pair that is too far apart.
    if not same_length or not all(abs(e - a) < eps for e, a in zip(expected, actual)):
        raise AssertionError(f"Expected: {expected} Actual: {actual}")
def assert_tensors_within_epsilon(
    expected: t.Tensor,
    actual: t.Tensor,
    eps=0.001,
):
    """
    Raise an AssertionError unless `expected` and `actual` have the same shape
    and every pair of corresponding elements differs by strictly less than
    `eps`.
    """
    if expected.shape != actual.shape:
        raise AssertionError(f"Shapes of tensors do not match! Expected: {expected.shape} Actual: {actual.shape}")
    # Elementwise boolean tensor: True wherever the two tensors are close enough.
    differences_within_epsilon = abs(expected - actual) < eps
    if not differences_within_epsilon.all():
        raise AssertionError(f"Values of tensors do not match! Expected: {expected} Actual: {actual}")
# %%
# We're going to begin by defining neural networks in a way that emphasizes
# each individual neuron. This is very inefficient and impractical for any real
# neural network (at least in Python). Also by thinking at the individual neuron
# level, this obscures a lot of larger structures in a neural net that can
# actually make it more difficult to understand what's going on at a high-level.
#
# Nonetheless, it's a reasonable starting point for understanding why we call a
# neural net "neural." We'll redo our neural net using matrices later in this
# section to demonstrate how they're actually written "in the wild."
#
# Let's begin by defining one of the simplest non-linear activation functions
# out there. We'll need this as the last step of computation when defining what
# a single neuron does.
def relu(x: float) -> float:
    """
    Rectified linear unit: positive inputs pass through unchanged and negative
    inputs are clamped to zero.
    """
    return 0.0 if 0.0 > x else x
# Sanity checks: a positive input passes through, a negative input becomes 0.
assert_with_expect(expected=5.0, actual=relu(5.0))
assert_with_expect(expected=0.0, actual=relu(-1.0))
# %%
from dataclasses import dataclass
# Now let's define a single neuron. If you want a reminder of how a single
# neuron is structured, look at
# https://github.com/changlinli/intro-to-technical-ai-safety-slides/blob/master/neural_nets/slides.md#a-single-neuron-also-called-node
# (note that the diagram is a bit misleading w.r.t. bias, bias isn't always +1,
# it can be + some other constant!).
#
# For now don't worry about using tensors to carry this out, normal Python
# iteration is perfectly fine.
@dataclass
class Neuron:
    """
    A single neuron: a weighted sum of its inputs plus a bias, passed through
    ReLU.
    """
    # One weight per input value.
    weights: list[float]
    # Constant added to the weighted sum before the activation is applied.
    bias: float

    def compute_output(self, inputs: list[float]) -> float:
        """
        Return relu(sum(weight_i * input_i) + bias) for the given inputs.

        `inputs` must contain exactly one value per weight.
        """
        assert len(inputs) == len(self.weights)
        weighted_sum = sum(w * value for w, value in zip(self.weights, inputs))
        return relu(weighted_sum + self.bias)
# With weights [1, 2] and bias 0.5: 1*2 + 2*3 + 0.5 = 8.5, while
# 1*2 + 2*(-2) + 0.5 = -1.5 is clamped to 0 by ReLU.
test_neuron = Neuron(weights=[1, 2], bias=0.5)
assert_with_expect(actual=test_neuron.compute_output([2, 3]), expected=8.5)
assert_with_expect(actual=test_neuron.compute_output([2, -2]), expected=0)
# %%
# Now let's define a forward pass for a single layer of neurons. Note that every
# neuron must have the same number of inputs (as determined by the number of
# weights) and it must match the number of inputs coming into the layer.
def forward_pass_single_layer(input: list[float], layer: list[Neuron]) -> list[float]:
    """
    Feed one input vector through every neuron of a layer.

    Returns one output per neuron, in the same order as `layer`. Every neuron
    must have exactly as many weights as `input` has entries.
    """
    # Validate the whole layer before computing any output.
    assert all(len(neuron.weights) == len(input) for neuron in layer)
    outputs = []
    for neuron in layer:
        outputs.append(neuron.compute_output(input))
    return outputs
# A layer of three neurons that each take two inputs.
test_layer = [
Neuron(weights=[0.1, 0.2], bias=0.3),
Neuron(weights=[-0.15, 0.1], bias=-0.1),
Neuron(weights=[0.2, 0.1], bias=0.1),
]
# The second neuron's weighted sum is negative, so ReLU clamps it to 0.
assert_list_of_floats_within_epsilon(
actual=forward_pass_single_layer(input=[5.5, 1.2], layer=test_layer),
expected=[1.09, 0.0, 1.32],
)
# %%
# Now let's take `forward_pass_single_layer` and use it to perform a single
# forward pass over an entire network with multiple layers.
def forward_pass_network(initial_inputs: list[float], layers: list[list[Neuron]]) -> list[float]:
    """
    Run a full forward pass: feed `initial_inputs` into the first layer, then
    feed each layer's outputs into the next one.

    Returns the outputs of the last layer (or `initial_inputs` itself when
    `layers` is empty).
    """
    activations = initial_inputs
    for neurons in layers:
        activations = forward_pass_single_layer(activations, neurons)
    return activations
# The following is an example of a neural net that takes in two inputs and has
# two outputs, and has three layers: 3 neurons, 2 neurons, and 2 neurons
# Notice that:
# 1. Because we take in two inputs, the first layer of neurons all have two weights
# 2. Because there are three neurons that feed into the second layer, all the neurons of the second layer have three
# weights
# 3. Because there are two neurons in the second layer, all the neurons of the third layer have two weights
# 4. We have two inputs and two outputs because every neuron in the first layer has two weights and the last layer
# has two neurons
demo_network: list[list[Neuron]] = \
[
[
Neuron(weights=[0.1, 0.2], bias=0.3),
Neuron(weights=[-0.15, 0.1], bias=-0.1),
Neuron(weights=[0.2, 0.1], bias=0.1),
],
[
Neuron(weights=[0.1, 0.2, 0.3], bias=0.3),
Neuron(weights=[-0.15, 0.1, 0.9], bias=-0.1),
],
[
Neuron(weights=[0.1, 0.2], bias=0.3),
Neuron(weights=[-0.15, 0.1], bias=-0.1),
],
]
# Feed the two-component input [0.0, 1.0] through all three layers.
assert_list_of_floats_within_epsilon(
expected=[0.342, 0.0],
actual=forward_pass_network([0.0, 1.0], demo_network),
)
# %%
# We could do backpropagation manually by writing out the chain rule by hand to
# calculate each derivative. This is extremely tedious and no one does this. It
# makes it extremely difficult to experiment with different neural nets because
# we have to manually rederive all our derivatives each time.
#
# Instead ML practitioners always use some library that provides
# autodifferentiation (also sometimes called autograd). In our case, that will
# be PyTorch. Hence in the interests of time we'll skip writing out
# backpropagation for our network here.
#
# You'll dive more into the internals of how backpropagation works when we
# reimplement PyTorch's autodifferentiation feature.
def backpropagation(network: list[list[Neuron]]):
    """
    Placeholder for hand-written backpropagation; always raises
    NotImplementedError.
    """
    # Deliberately left unimplemented: don't worry about filling this in!
    raise NotImplementedError("This is too tedious to implement.")
# %%
import torch as t
from jaxtyping import Float
# Just to make sure our results are reproducible
t.manual_seed(10)
# %%
# Here's an example of using PyTorch to automatically calculate a derivative for
# you. When we are manually creating tensors, we have to explicitly tell PyTorch
# to remember we want to calculate the gradient for this tensor, so we should
# pass in requires_grad=True. As we use PyTorch more and more, we'll see a lot
# of library calls that will automatically take care of this for us.
#
# All of PyTorch's functions that work on tensors keep track of which operations
# have been performed on which tensors in what PyTorch calls a "computational
# graph." It is this computational graph that allows PyTorch to automatically
# calculate derivatives for us.
x = t.tensor([5.0], requires_grad=True)
# Derivative here is 2x + 1, so that should be a derivative of 11 for x = 5
y = x ** 2 + x
# PyTorch's auto-differentiation facilities are based entirely around mutability.
# Make sure that you call `.backward()` before you look at the gradients!
# Otherwise the gradients will not be set.
#
# Note that you call the `.backward()` method on your final derived value to get
# the gradient/derivative of one of your input variables. That is, in order to
# get the value of dy/dx for any y and x, you must call `.backward()` on `y` to
# first calculate all the derivatives through the computational graph and then
# `.grad` on `x`.
y.backward()
# x.grad is the numerical calculation of dy/dx at x = 5
#
# We see that the derivative is indeed 11 as we calculated by hand.
print(f"{x.grad=}")
# %%
a = t.tensor([5.0], requires_grad=True)
b = t.tensor([3.0], requires_grad=True)
c = a ** 2 + b ** 2
# %%
# Let's move on to a multivariate case. Use PyTorch to calculate what dc/da
# and dc/db are.
# TODO: Fill in the Nones!
# Remember to first populate the gradients before calling .grad!
c.backward()
# dc/da = 2a, which is 10 at a = 5
dc_da = a.grad
assert_with_expect(expected=t.tensor(10.0), actual=dc_da)
# dc/db = 2b, which is 6 at b = 3
dc_db = b.grad
assert_with_expect(expected=t.tensor(6.0), actual=dc_db)
# %%
# Like almost everything else in PyTorch, autodifferentiation works with
# multidimensional tensors as well, not just scalar values! Here's a way we
# could calculate a and b "at once" in a single tensor.
a_and_b = t.tensor([5.0, 3.0], requires_grad=True)
# Summing reduces the result to a single value that we can call .backward() on.
c = (a_and_b ** 2).sum()
# %%
# Use PyTorch to calculate again what dc/da and dc/db are
# TODO: Fill in the Nones!
# Remember to first populate the gradients before calling .grad!
c.backward()
# a_and_b.grad has one entry per component: index 0 is dc/da, index 1 is dc/db
dc_da = a_and_b.grad[0]
dc_db = a_and_b.grad[1]
# %%
# If you construct different computational graphs that involve the same set of
# input tensors and `.backward()` is called each time (so that each input tensor
# has had the result of `.backward()` flow through multiple times), gradients
# will "accumulate." For our purposes this is usually undesirable.
#
# Let's go over what that means and how to avoid this. First let's create
# tensors as usual.
some_input = t.tensor([1.0], requires_grad=True)
some_output = 10 * some_input
some_output.backward()
# Normally because this is just y = 10 * x, we would expect x's gradient to
# be 10 at this point. And indeed it is.
print(f"{some_input.grad=}")
# %%
# PyTorch is smart enough to stop us if we try to call backward a second time
# on the same output: we catch the resulting RuntimeError and show its message.
try:
    some_output.backward()
except RuntimeError as e:
    print(f"PyTorch was smart enough to blow up and prevent us from going backward again with the following message:\n{str(e)}")
# %%
# But PyTorch doesn't warn us if we create a new output that reuses `some_input`
# and instead will just keep adding more gradients to the pre-existing gradient.
# This is known as "accumulating gradients," and there are reasons you might
# want to do this, but for our purposes, this is undesirable, as it will give us
# the wrong derivatives/gradients.
another_output = 5 * some_input
another_output.backward()
# Note that we've added two derivatives together, 10 + 5, which is not the
# correct derivative for y = 10 * x or y = 5 * x!
assert some_input.grad == 15
print(f"{some_input.grad=}")
# %%
# Because PyTorch will either throw an error on a given backwards call, or it'll
# accumulate gradients when you potentially don't want that to happen, we
# generally will want to reset gradients between calls to backward(). The
# easiest way to do this is to set `.grad = None`. We'll see later how to do
# this in a less manual fashion.
some_input.grad = None
yet_another_output = 5 * some_input
yet_another_output.backward()
# This time we get the correct gradient!
assert some_input.grad == 5
# %%
# Resetting a gradient does not make an output reusable: calling backward again
# on the same output still fails. This has to do with the details of how
# PyTorch automatically calculates derivatives. Those details are irrelevant at
# the moment (although they may become more apparent when we implement
# backpropagation ourselves), but feel free to ask if you're curious.
some_input.grad = None
try:
    yet_another_output.backward()
except RuntimeError as e:
    print(f"Yep we still get the following error message:\n{str(e)}")
# %%
# Let's drive home the notion that PyTorch's autodifferentiation handles
# multidimensional tensors just fine, since this will often show up. We'll have
# PyTorch calculate 2000 derivatives at once (i.e. calculate a single gradient
# consisting of 2000 components)!
import numpy as np
# 1000 random (x, y) points held in a single tensor that tracks gradients.
one_thousand_random_points: Float[t.Tensor, "1000 2"] = t.rand(1000, 2, requires_grad=True)
assert one_thousand_random_points.requires_grad
# Using `one_thousand_random_points`, calculate the gradient of
# $f = \sum_{0 <= i < 1000} (x_i^2 + y_i^2) + 5$ (notice that the thousand
# points are all (x, y) pairs because the second dimension is 2).
def gradient_of_x_squared_plus_y_squared_plus_5_thousand_times() -> np.ndarray:
    r"""
    Return the gradient of $f = \sum_{0 <= i < 1000} (x_i^2 + y_i^2) + 5$ with
    respect to every coordinate of `one_thousand_random_points`.

    Usually a Tensor will calculate its gradient as another Tensor, but here
    we return a NumPy array of size 1000x2 (to turn a PyTorch tensor into a
    NumPy array, call the .detach().numpy() method on it). Entry [i][j] is
    2 * one_thousand_random_points[i][j]; the constant 5 contributes nothing.
    """
    points = one_thousand_random_points
    # Clear any gradient left over from an earlier backward pass. Otherwise
    # `.backward()` would accumulate into it and repeated calls would return
    # 2x, 3x, ... the true gradient.
    points.grad = None
    result = (points ** 2).sum() + 5
    result.backward()
    return points.grad.detach().numpy()
result = gradient_of_x_squared_plus_y_squared_plus_5_thousand_times()
print(f"{result=}")
# Each gradient entry should be twice the corresponding coordinate, since the
# derivative of x^2 is 2x.
assert_with_expect(expected=one_thousand_random_points[0][0] * 2, actual=result[0][0])
assert_with_expect(expected=one_thousand_random_points[1][1] * 2, actual=result[1][1])
# %%
import einops
# Recall that a higher-level way of thinking about a neural net layer is that it
# is a general linear function with an added constant, followed by a non-linear
# function. Let's begin by implementing application of a linear function.
# Remember that a function from a vector space of dimension n to a vector space
# of dimension m can be implemented as an m x n matrix.
#
# There is a major benefit in implementing a neural net layer this way: an
# entire batch of inputs can be processed simultaneously! Instead of passing in
# a single vector of size d_input, we're going to pass in a whole block of them
# at once, in the form of a batch x d_input tensor.
#
# You should be able to write in this one or two lines without any iteration.
# You can either use einops.einsum or you can use built-in PyTorch methods.
def apply_linear_function_to_input(
    linear_function: Float[t.Tensor, "d_output d_input"],
    input_to_function: Float[t.Tensor, "batch d_input"],
) -> Float[t.Tensor, "batch d_output"]:
    """
    Apply the matrix `linear_function` to every row of `input_to_function`.

    Row b of the result is `linear_function` multiplied by row b of the input,
    so a whole batch of vectors is transformed without any Python iteration.
    """
    # o = d_output, i = d_input, b = batch; the shared d_input axis is summed over.
    return t.einsum("oi,bi->bo", linear_function, input_to_function)
# 3x2 matrix, i.e. f: R^2 -> R^3
test_linear_function = t.tensor(
[
[1.0, 2.0],
[3.0, 4.0],
[5.0, 6.0],
]
)
# A batch of 4 two-dimensional vectors all combined together
test_input = t.tensor(
[
[0.5, 0.6],
[0.3, 0.4],
[-2.0, -9.0],
[-8.0, 1.0],
]
)
test_output = apply_linear_function_to_input(linear_function=test_linear_function, input_to_function=test_input)
# One row of three outputs per input vector, e.g. the first row is
# [1*0.5 + 2*0.6, 3*0.5 + 4*0.6, 5*0.5 + 6*0.6] = [1.7, 3.9, 6.1]
expected_output = t.tensor(
[
[1.7, 3.9, 6.1],
[1.1, 2.5, 3.9],
[-20, -42, -64],
[-6, -20, -34]
]
)
assert_tensors_within_epsilon(
expected=expected_output,
actual=test_output,
)
# %%
# And as we expect, gradients flow through automatically.
# Note that `example_input` is passed in the `linear_function` position here, so
# it acts as a 1x2 matrix applied to a batch of 50 random 2-dimensional inputs.
example_input = t.rand((1, 2), requires_grad=True)
example_result = apply_linear_function_to_input(example_input, t.rand((50, 2))).sum()
example_result.backward()
print(f"{example_input.grad=}")
# %%
# Putting this together, we can create a three layer neural net consisting of
# six tensors, instead of needing to create each neuron individually.
@dataclass
class ThreeLayerNeuralNet:
    """
    A three layer neural net stored as six tensors: one weight matrix and one
    bias vector per layer.
    """
    # Each weight matrix is (output dimension, input dimension); each layer's
    # input dimension is the output dimension of the layer before it.
    layer_0: Float[t.Tensor, "d_output_0 d_input"]
    layer_0_bias: Float[t.Tensor, "d_output_0"]
    layer_1: Float[t.Tensor, "d_output_1 d_output_0"]
    layer_1_bias: Float[t.Tensor, "d_output_1"]
    layer_2: Float[t.Tensor, "d_output_2 d_output_1"]
    layer_2_bias: Float[t.Tensor, "d_output_2"]
# Now that we have the class structure, let's create a way of actually
# initializing the neural net.
#
# Ultimately we're going to be creating a neural net to recognize handwritten
# digits, so it'll have an output of 10 digits.
#
# In our case we're going to create a very specific neural net. It's going to be
# a series of layers going from an input of dimension 784 to 2000 to 400 and
# finally to 10. That is:
#
# + d_input = 784
# + d_output_0 = 2000
# + d_output_1 = 400
# + d_output_2 = 10
#
# 784 is because our images are of size 28x28. The intermediate dimensions of
# 2000 and 400 in our hidden layers are chosen more or less arbitrarily. We'll
# see some rules of thumb for sizing these layers later.
#
# Let's go ahead and implement that! We've provided the first
# few lines of this, fill in the rest (making sure to call
# `.uniform_(-initial_bound, initial_bound)`).
def initialize_new_three_layer_net() -> ThreeLayerNeuralNet:
    """
    Build a new ThreeLayerNeuralNet (784 -> 2000 -> 400 -> 10) whose weights
    and biases are all filled by `.uniform_(-1/20, 1/20)`.
    """
    # Since we have ReLU that clamps values to 0, having an initial set of
    # weights that are all 0 can sometimes cause gradients to be stuck at 0 and
    # never move. So we generally want to inject a little bit of randomness when
    # initializing and move weights just a little bit away from 0.
    #
    # We also can't have these bounds be too big! Otherwise again ReLU may bite
    # us and clamp us down to 0 if there's a sign change somewhere.
    #
    # This particular bound was chosen semi-randomly (pulled out of a hat and
    # verified to work). You'll see later a slightly more principled/standard
    # way of choosing this bound.
    initial_bound = 1 / 20

    def random_parameter(*shape: int) -> t.Tensor:
        # Gradient-tracking zeros, overwritten in place with uniform noise.
        return t.zeros(shape, requires_grad=True).uniform_(-initial_bound, initial_bound)

    # The in-place fill happens under no_grad, as in the tensors' creation.
    with t.no_grad():
        # We use the usual matrix order of dimensions here. That is, an m x n
        # matrix means n-dimensional input and m-dimensional output, so
        # (2000, 784) means 784-dimensional input and 2000-dimensional output.
        layer_0 = random_parameter(2000, 784)
        layer_0_bias = random_parameter(2000)
        layer_1 = random_parameter(400, 2000)
        layer_1_bias = random_parameter(400)
        layer_2 = random_parameter(10, 400)
        layer_2_bias = random_parameter(10)
    return ThreeLayerNeuralNet(
        layer_0=layer_0,
        layer_0_bias=layer_0_bias,
        layer_1=layer_1,
        layer_1_bias=layer_1_bias,
        layer_2=layer_2,
        layer_2_bias=layer_2_bias,
    )
new_neural_net = initialize_new_three_layer_net()
# Check that the two layers left as an exercise have the intended
# (output dimension, input dimension) shapes.
assert_with_expect(
expected=(400, 2000),
actual=new_neural_net.layer_1.shape,
)
assert_with_expect(
expected=(10, 400),
actual=new_neural_net.layer_2.shape,
)
# %%
# We'll need a version of ReLU that works with tensors of arbitrary size. Let's implement that:
def tensor_relu(input_tensor: t.Tensor) -> t.Tensor:
    """
    Apply ReLU elementwise: take the maximum of each element and 0. The result
    has the same shape as `input_tensor`.
    """
    zeros = t.zeros_like(input_tensor)
    return input_tensor.maximum(zeros)
# Note: this rebinds `test_input`, which was used for the linear function test.
test_input = t.tensor([
[1.0, 2.0, -3.0],
[4.0, -5.0, 6.0],
])
# Negative entries become 0; everything else is unchanged.
assert_tensors_within_epsilon(
expected=t.tensor([
[1.0, 2.0, 0.0],
[4.0, 0.0, 6.0],
]),
actual=tensor_relu(test_input)
)
# Now let's define a version of `forward` that works with tensors. Again, our
# input tensor is a whole batch of inputs, not just a single input!
#
# Our last layer will use a softmax, which is a function that normalizes a
# vector to be between 0 and 1 and to sum to 1. This helps with stability in
# training and lets us interpret each of the 10 components in our output as a
# probability. You can read
# https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html for
# more details.
def forward(x: Float[t.Tensor, "batch d_input"], neural_net: ThreeLayerNeuralNet) -> Float[t.Tensor, "batch d_output"]:
    """
    Run a batch of inputs through all three layers of `neural_net`.

    The first two layers are a linear function plus bias followed by ReLU. The
    last layer is a linear function plus bias followed by a softmax over the
    final dimension.
    """
    pre_activation_0 = apply_linear_function_to_input(neural_net.layer_0, x) + neural_net.layer_0_bias
    hidden_0 = tensor_relu(pre_activation_0)
    pre_activation_1 = apply_linear_function_to_input(neural_net.layer_1, hidden_0) + neural_net.layer_1_bias
    hidden_1 = tensor_relu(pre_activation_1)
    # Instead of doing a ReLU at the very end, we're going to use softmax
    pre_softmax = apply_linear_function_to_input(neural_net.layer_2, hidden_1) + neural_net.layer_2_bias
    return t.nn.functional.softmax(pre_softmax, dim=-1)
# %%
# Run a batch of 10 all-ones inputs through the freshly initialized net.
example_output = forward(neural_net=new_neural_net, x=t.ones((10, 784)))
print(f"{example_output=}")
# %%
# Generally speaking we'll mainly call .backward() on scalar (i.e.
# 0-dimensional tensor) outputs. This is not much of a limitation because a
# loss function will almost always output a scalar. There are ways to deal with
# non-scalar outputs, but they're irrelevant to us at the moment, so for now
# we'll just point out that trying to do so will cause an error.
try:
    # example_output is not a scalar!
    example_output.backward()
except RuntimeError as e:
    print(f"Weren't able to calculate a gradient because of:\n{str(e)}")
# %%
# Notice that now that we've turned example_output into a scalar, our call to
# .backward() proceeds with no problem!
example_scalar = example_output.sum()
example_scalar.backward()
# %%
# And we can calculate the gradient of this scalar with respect to one of our
# neural net layers!
print(f"{new_neural_net.layer_0=}")
print(f"{new_neural_net.layer_0.grad=}")
# %%
# We'll need a few more pieces to actually be able to train our neural net.
#
# Remember how we mentioned that we need to make sure we reset gradients to
# prevent gradient accumulation? Let's do that now.
def zero_all_gradients(neural_net: ThreeLayerNeuralNet) -> None:
    """
    Reset the gradient of every weight and bias tensor of `neural_net` to None
    so the next backward pass does not accumulate onto old gradients.
    """
    parameters = (
        neural_net.layer_0,
        neural_net.layer_0_bias,
        neural_net.layer_1,
        neural_net.layer_1_bias,
        neural_net.layer_2,
        neural_net.layer_2_bias,
    )
    for parameter in parameters:
        parameter.grad = None
# Now let's implement the simplest loss function out there, the mean squared
# error: https://en.wikipedia.org/wiki/Mean_squared_error. This consists of
# squaring the difference between every component of the two tensors and taking
# their mean.
def loss_function(
    expected_outputs: Float[t.Tensor, "batch d_output"],
    actual_outputs: Float[t.Tensor, "batch d_output"],
) -> Float[t.Tensor, ""]:
    """
    Mean squared error: the mean, over every component, of the squared
    difference between the two tensors.
    """
    differences = expected_outputs - actual_outputs
    return differences.pow(2).mean()
# Now we can use derivatives/gradients to iteratively nudge an input tensor
# towards a minimum with respect to a loss!
def nudge_tensor_towards_minimum(x: t.Tensor, learning_rate: float) -> None:
    """
    Move `x` in place by one gradient-descent step: subtract its gradient
    scaled by `learning_rate`. `x.grad` must already be populated.
    """
    # The update itself must not be tracked by autograd: only the loss function
    # should affect x's gradients, not our adjustment to x.
    with t.no_grad():
        # In-place subtraction, so the caller's tensor object is what changes.
        x.sub_(x.grad * learning_rate)
# Set a gradient by hand so we can check the update without a backward pass:
# each entry should move by 0.1 * 2 = 0.2.
example_tensor = t.tensor([1.1, 2.2, 3.3])
example_tensor.grad = t.tensor([0.1, 0.1, 0.1])
nudge_tensor_towards_minimum(example_tensor, learning_rate=2)
# The tensor must have been modified in place, not merely rebound inside the function.
assert not t.allclose(example_tensor, t.tensor([1.1, 2.2, 3.3])), \
f"It doesn't appear that nudge_tensor_towards_minimum actually modifies your tensor! Make sure that you are using -= and not x = x - ..."
assert_tensors_within_epsilon(expected=t.tensor([0.9, 2.0, 3.1]), actual=example_tensor)
# %%
# A plain-Python illustration of why `-=` (in-place) and `x = x - ...`
# (rebinding) behave differently inside a function.
x = [1, 2, 3]


def concat_a_list_v0(xs):
    """Extend the caller's list in place: `+=` mutates the object `xs` refers to."""
    xs += [4]


concat_a_list_v0(x)
# Now x is [1, 2, 3, 4], because concat_a_list_v0 has mutated x through the
# reference xs
assert x == [1, 2, 3, 4]


def concat_a_list_v1(xs):
    """Rebind the local name `xs` to a brand new list; the caller's list is untouched."""
    xs = xs + [5]


concat_a_list_v1(x)
# x is still [1, 2, 3, 4], because concat_a_list_v1 has replaced the reference
# xs with a new reference to [1, 2, 3, 4, 5] in the function body, but x remains
# unchanged. The new list is also useless because it immediately becomes
# inaccessible and eligible for garbage collection once we leave concat_a_list_v1
assert x == [1, 2, 3, 4]
# %%
# Finally we put all this together in a function that performs one iteration of
# tuning the weights of neural nets in training.
#
# This function will do the following steps:
#
# 1. Zero all our gradients (using `zero_all_gradients``)
# 2. Calculate the outputs our neural net produces (using `forward`)
# 3. Compare those outputs against `expected_outputs` to calculate our loss
# using `loss_function`
# 4. Adjust our neural net weights
#
# Reminder: remember to call `.backward()` on the appropriate function!
#
# This function is hard to write good test cases for, so before you proceed,
# take a look at the solutions and make sure that your implementation is
# equivalent (as well as the implementations of `forward`, `loss_function` and
# `nudge_tensor_towards_minimum`!
def tune_weights_once(
    neural_net: ThreeLayerNeuralNet,
    inputs: Float[t.Tensor, "batch d_input"],
    expected_outputs: Float[t.Tensor, "batch d_output"],
    learning_rate: float,
) -> None:
    """
    Perform one training iteration on `neural_net`, modifying it in place.

    Clears old gradients, runs `forward` on `inputs`, measures the loss against
    `expected_outputs`, backpropagates it, and nudges every weight and bias by
    its gradient scaled by `learning_rate`.
    """
    # Start from clean gradients so this iteration's backward pass doesn't
    # accumulate onto the previous one.
    zero_all_gradients(neural_net)
    loss = loss_function(
        expected_outputs=expected_outputs,
        actual_outputs=forward(inputs, neural_net),
    )
    # Populates .grad on every tensor of the net that the loss depends on.
    loss.backward()
    trainable_tensors = (
        neural_net.layer_0,
        neural_net.layer_0_bias,
        neural_net.layer_1,
        neural_net.layer_1_bias,
        neural_net.layer_2,
        neural_net.layer_2_bias,
    )
    for tensor in trainable_tensors:
        nudge_tensor_towards_minimum(tensor, learning_rate)
# %%
# Now we can actually train our neural net!
from tqdm import tqdm
def train(
    neural_net: ThreeLayerNeuralNet,
    inputs: t.Tensor,
    expected_outputs: t.Tensor,
    learning_rate: float,
    number_of_iterations: int,
) -> None:
    """
    Train `neural_net` in place by calling `tune_weights_once` on the full set
    of `inputs` `number_of_iterations` times, printing the loss before and
    after training.
    """
    def current_loss():
        # Loss of the net, in its current state, over the whole training set.
        return loss_function(expected_outputs=expected_outputs, actual_outputs=forward(x=inputs, neural_net=neural_net))

    print(f"Initial loss was {current_loss()}")
    for _ in tqdm(range(number_of_iterations)):
        tune_weights_once(neural_net, inputs, expected_outputs, learning_rate)
    print(f"Final loss was {current_loss()}")
# %%
import matplotlib.pyplot as plt
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
# %%
# Load MNIST from the local "data" directory (downloading it if necessary),
# converting each image to a tensor.
dataset = MNIST(root="data", download=True, transform=ToTensor())
# %%
# Each dataset entry is an (image, label) pair; look at the first one.
img, label = dataset[0]
print(f"This image is meant to express this numeral: {label}")
plt.imshow(img.squeeze())
# %%
# Before we can run training, there's one last thing we have to do: our images
# come labeled with what digit they're supposed to represent, but that's a
# single number, whereas our neural net outputs 10 components.
#
# That means we have to translate a number into a 10-component vector. E.g. if 2
# is the correct answer, the ideal answer from our neural net would be [0, 0, 1,
# 0, 0, 0, 0, 0, 0, 0].
#
# Doing this translation is called a "one-hot encoding." Let's implement it!
def one_hot_encoding(i: int, num_classes: int) -> t.Tensor:
    """
    Return a 1-dimensional tensor of length `num_classes` that is 1 at index
    `i` and 0 everywhere else.
    """
    encoded = t.zeros(num_classes)
    encoded[i] = 1
    return encoded
# The digit 2 should light up index 2 of a 10-component vector.
assert_tensors_within_epsilon(
expected=t.tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
actual=one_hot_encoding(2, 10),
)
# %%
# We also need to flatten an image down to a flat vector to make it suitable for
# our neural net to ingest.
def make_img_1d(imgs: t.Tensor) -> t.Tensor:
    """
    Flatten the last two dimensions (height and width) of `imgs` into a single
    dimension, leaving any leading dimensions untouched.
    """
    flatten_height_and_width = '... h w -> ... (h w)'
    return einops.rearrange(imgs, flatten_height_and_width)
# %%
# This is an inefficient way of getting data from a dataset. We'll see later how
# to do this more efficiently, but for now this suffices to demonstrate the
# logic of how we're using our one-hot encoding and 1d flattening.
#
# The first `num_of_training_imgs` images become training data; the remaining
# images, up to `total_imgs`, are held out.
total_imgs = 2000
num_of_training_imgs = 1000
training_imgs = []
expected_outputs_in_training = []
non_training_imgs = []
expected_outputs_in_non_training = []
counter = 0
for img, label in dataset:
    if counter >= total_imgs:
        break
    if counter >= num_of_training_imgs:
        non_training_imgs.append(make_img_1d(img).squeeze())
        expected_outputs_in_non_training.append(one_hot_encoding(label, num_classes=10))
    else:
        training_imgs.append(make_img_1d(img).squeeze())
        expected_outputs_in_training.append(one_hot_encoding(label, num_classes=10))
    counter += 1
# Only the training data is stacked into single tensors; the held-out images
# and their expected outputs stay as Python lists.
training_imgs = t.stack(training_imgs)
expected_outputs_in_training = t.stack(expected_outputs_in_training)
# %%
print(f"{training_imgs.shape=}")
# %%
# Our neural net starts out with garbage predictions.
non_training_img_idx = 0
img_outside_of_training_dataset = non_training_imgs[non_training_img_idx]
# The expected outputs are one-hot vectors, so argmax recovers the digit.
label = expected_outputs_in_non_training[non_training_img_idx].argmax()
print(f"Expected label: {label}")
# The image was stored flattened, so restore its 28-row layout for display.
plt.imshow(einops.rearrange(img_outside_of_training_dataset, '(h w) -> h w', h=28))
# `forward` expects a batch, so add a leading batch dimension of size 1.
model_all_guesses = forward(neural_net=new_neural_net, x=img_outside_of_training_dataset.unsqueeze(dim=0))
# Reuse the probabilities computed above instead of running a second,
# identical forward pass.
model_guess_highest_prob = model_all_guesses.argmax()
print(f"Model guessed this was: {model_guess_highest_prob}")
# %%
# Now let's train our neural net!
train(
neural_net=new_neural_net,
inputs=training_imgs,
expected_outputs=expected_outputs_in_training,
# A learning rate of 10 is usually much too high, but we've made some sub-optimal choices in designing our
# neural net. NOTE(review): the original comment was cut off after "our" and said 2 rather than 10; confirm
# the intended wording.
learning_rate=10,
number_of_iterations=100,
)
# %%
# And let's try again, on the same held-out image, now that the net is trained
print(f"Expected label: {label}")
plt.imshow(einops.rearrange(img_outside_of_training_dataset, '(h w) -> h w', h=28))
# `forward` expects a batch, so add a leading batch dimension of size 1.
model_all_guesses = forward(neural_net=new_neural_net, x=img_outside_of_training_dataset.unsqueeze(dim=0))
# Reuse the probabilities computed above instead of running a second,
# identical forward pass.
model_guess_highest_prob = model_all_guesses.argmax()
print(f"Model guessed this was: {model_guess_highest_prob}")
# %%
# Here's a demonstration of how we could write this using PyTorch entirely. In
# two days you will have implemented every PyTorch function and class here from
# scratch!
class SimpleNeuralNet(t.nn.Module):
    """
    The same 784 -> 2000 -> 400 -> 10 network as before, written with PyTorch's
    own building blocks: ReLU after the first two linear layers and a softmax
    after the last.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        layers = [
            t.nn.Linear(in_features=784, out_features=2000),
            t.nn.ReLU(),
            t.nn.Linear(in_features=2000, out_features=400),
            t.nn.ReLU(),
            t.nn.Linear(in_features=400, out_features=10),
            t.nn.Softmax(dim=-1),
        ]
        self.implementation = t.nn.Sequential(*layers)

    # NOTE: inside this method the parameter `t` shadows the module alias `t`.
    def forward(self, t: t.Tensor):
        """Run the input through every layer in order."""
        return self.implementation(t)
def train(model: SimpleNeuralNet, epochs: int, lr: float):
    """
    Train `model` with SGD for `epochs` full-batch iterations over the
    module-level `training_imgs` / `expected_outputs_in_training`, using
    learning rate `lr`.

    Prints the loss measured in the first and in the last epoch.

    Note: this definition rebinds the module-level name `train`, replacing the
    earlier training function for ThreeLayerNeuralNet.
    """
    optimizer = t.optim.SGD(model.parameters(), lr=lr)
    for epoch in tqdm(range(epochs)):
        output = model(training_imgs)
        # For those who are confused why we use MSE loss here for a
        # classification task, see https://arxiv.org/abs/2006.07322
        loss = t.nn.functional.mse_loss(output, expected_outputs_in_training)
        # Clear old gradients before backpropagating so they don't accumulate.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if epoch == 0:
            print(f"Initial loss: {loss=}")
        # A separate `if` (not `elif`) so that a single-epoch run, where the
        # first epoch is also the last, still reports its final loss.
        if epoch == epochs - 1:
            print(f"Final loss: {loss=}")
model = SimpleNeuralNet()
# %%
# Same number of iterations and learning rate as the hand-written version above.
train(model, epochs=100, lr=10)
# %%
# Let's look at an image that wasn't part of the training data
non_training_img_idx = 0
img_outside_of_training_dataset = non_training_imgs[non_training_img_idx]
# The expected outputs are one-hot vectors, so argmax recovers the digit.
label = expected_outputs_in_non_training[non_training_img_idx].argmax()
print(f"Expected label: {label}")
plt.imshow(einops.rearrange(img_outside_of_training_dataset, '(h w) -> h w', h=28))
model_all_guesses = model(img_outside_of_training_dataset)
# Reuse the probabilities computed above instead of running the model a second
# time on the same image.
model_guess_highest_prob = model_all_guesses.argmax()
print(f"Model guessed this was: {model_guess_highest_prob}")