# gradient_descent.py

from scratch.linear_algebra import Vector, dot

def sum_of_squares(v: Vector) -> float:
    """Computes the sum of squared elements in v"""
    return dot(v, v)
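
# Illustrative check: 1**2 + 2**2 + 3**2 == 14.
assert sum_of_squares([1, 2, 3]) == 14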

from typing import Callable

def difference_quotient(f: Callable[[float], float],
                        x: float,
                        h: float) -> float:
    return (f(x + h) - f(x)) / h

def square(x: float) -> float:
    return x * x

def derivative(x: float) -> float:
    return 2 * x
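
# Illustrative check: for small h, the difference quotient of square at
# x = 3 should be close to the exact derivative 2 * 3 = 6.
assert abs(difference_quotient(square, 3, h=0.001) - derivative(3)) < 0.01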

def partial_difference_quotient(f: Callable[[Vector], float],
                                v: Vector,
                                i: int,
                                h: float) -> float:
    """Returns the i-th partial difference quotient of f at v"""
    w = [v_j + (h if j == i else 0)    # add h to just the ith element of v
         for j, v_j in enumerate(v)]
    return (f(w) - f(v)) / h

def estimate_gradient(f: Callable[[Vector], float],
                      v: Vector,
                      h: float = 0.0001) -> Vector:
    return [partial_difference_quotient(f, v, i, h)
            for i in range(len(v))]
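
# Illustrative check: the numerical estimate at [3, -1] should be close to
# the exact gradient of sum_of_squares there, which is [6, -2].
assert all(abs(est_i - exact_i) < 0.01
           for est_i, exact_i in zip(estimate_gradient(sum_of_squares, [3, -1]),
                                     [6, -2]))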

import random
from scratch.linear_algebra import distance, add, scalar_multiply

def gradient_step(v: Vector, gradient: Vector, step_size: float) -> Vector:
    """Moves `step_size` in the `gradient` direction from `v`"""
    assert len(v) == len(gradient)
    step = scalar_multiply(step_size, gradient)
    return add(v, step)
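
# Illustrative check: a step of size -0.1 along gradient [1, 2] from [1, 1]
# should land (up to floating-point error) on [0.9, 0.8].
assert distance(gradient_step([1, 1], [1, 2], -0.1), [0.9, 0.8]) < 1e-9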

def sum_of_squares_gradient(v: Vector) -> Vector:
    return [2 * v_i for v_i in v]
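
# This gradient is exact: each partial derivative of dot(v, v) is 2 * v_i.
assert sum_of_squares_gradient([3, -1]) == [6, -2]
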
# x ranges from -50 to 49, y is always 20 * x + 5
inputs = [(x, 20 * x + 5) for x in range(-50, 50)]

def linear_gradient(x: float, y: float, theta: Vector) -> Vector:
    slope, intercept = theta
    predicted = slope * x + intercept    # The prediction of the model.
    error = (predicted - y)              # error is (predicted - actual)
    squared_error = error ** 2           # We'll minimize squared error
    grad = [2 * error * x, 2 * error]    # using its gradient.
    return grad
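
# Illustrative check: with theta = [0, 0] the prediction for x = 2 is 0, so
# against y = 9 the error is -9 and the gradient is [2 * -9 * 2, 2 * -9].
assert linear_gradient(2, 9, [0, 0]) == [-36, -18]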

from typing import TypeVar, List, Iterator

T = TypeVar('T')  # this allows us to type "generic" functions

def minibatches(dataset: List[T],
                batch_size: int,
                shuffle: bool = True) -> Iterator[List[T]]:
    """Generates `batch_size`-sized minibatches from the dataset"""
    # Start indexes 0, batch_size, 2 * batch_size, ...
    batch_starts = [start for start in range(0, len(dataset), batch_size)]
    if shuffle: random.shuffle(batch_starts)  # shuffle the batches
    for start in batch_starts:
        end = start + batch_size
        yield dataset[start:end]
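
# Illustrative check: with shuffle=False the batches come back in order,
# and the final batch may be smaller than batch_size.
assert (list(minibatches([1, 2, 3, 4, 5], batch_size=2, shuffle=False))
        == [[1, 2], [3, 4], [5]])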

def main():
    xs = range(-10, 11)
    actuals = [derivative(x) for x in xs]
    estimates = [difference_quotient(square, x, h=0.001) for x in xs]

    # plot to show they're basically the same
    import matplotlib.pyplot as plt
    plt.title("Actual Derivatives vs. Estimates")
    plt.plot(xs, actuals, 'rx', label='Actual')      # red x
    plt.plot(xs, estimates, 'b+', label='Estimate')  # blue +
    plt.legend(loc=9)                                # loc=9 means "upper center"
    # plt.show()
    plt.close()
# "Using the Gradient" example
# pick a random starting point
v = [random.uniform(-10, 10) for i in range(3)]
for epoch in range(1000):
grad = sum_of_squares_gradient(v) # compute the gradient at v
v = gradient_step(v, grad, -0.01) # take a negative gradient step
print(epoch, v)
assert distance(v, [0, 0, 0]) < 0.001 # v should be close to 0
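
    # Why this converges: for sum_of_squares the update is
    # v <- v + (-0.01) * (2 * v) = 0.98 * v, so after 1000 steps each
    # coordinate has shrunk by a factor of 0.98 ** 1000 (roughly 2e-9).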

    # First "Using Gradient Descent to Fit Models" example

    from scratch.linear_algebra import vector_mean

    # Start with random values for slope and intercept.
    theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

    learning_rate = 0.001

    for epoch in range(5000):
        # Compute the mean of the gradients
        grad = vector_mean([linear_gradient(x, y, theta) for x, y in inputs])
        # Take a step in that direction
        theta = gradient_step(theta, grad, -learning_rate)
        print(epoch, theta)

    slope, intercept = theta
    assert 19.9 < slope < 20.1, "slope should be about 20"
    assert 4.9 < intercept < 5.1, "intercept should be about 5"

    # Minibatch gradient descent example

    theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

    for epoch in range(1000):
        for batch in minibatches(inputs, batch_size=20):
            grad = vector_mean([linear_gradient(x, y, theta) for x, y in batch])
            theta = gradient_step(theta, grad, -learning_rate)
        print(epoch, theta)

    slope, intercept = theta
    assert 19.9 < slope < 20.1, "slope should be about 20"
    assert 4.9 < intercept < 5.1, "intercept should be about 5"

    # Stochastic gradient descent example

    theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

    for epoch in range(100):
        for x, y in inputs:
            grad = linear_gradient(x, y, theta)
            theta = gradient_step(theta, grad, -learning_rate)
        print(epoch, theta)

    slope, intercept = theta
    assert 19.9 < slope < 20.1, "slope should be about 20"
    assert 4.9 < intercept < 5.1, "intercept should be about 5"
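
    # Taking one gradient step per data point means 100 epochs *
    # len(inputs) = 10,000 steps here, which is why many fewer epochs
    # suffice than in the full-batch version above.
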
if __name__ == "__main__": main()