diff --git a/intermediate_source/reinforcement_q_learning.py b/intermediate_source/reinforcement_q_learning.py
index 52c1e03a1c0..7bc2ccda28f 100644
--- a/intermediate_source/reinforcement_q_learning.py
+++ b/intermediate_source/reinforcement_q_learning.py
@@ -388,7 +388,7 @@ def plot_durations():
 # single step of the optimization. It first samples a batch, concatenates
 # all the tensors into a single one, computes :math:`Q(s_t, a_t)` and
 # :math:`V(s_{t+1}) = \max_a Q(s_{t+1}, a)`, and combines them into our
-# loss. By defition we set :math:`V(s) = 0` if :math:`s` is a terminal
+# loss. By definition we set :math:`V(s) = 0` if :math:`s` is a terminal
 # state. We also use a target network to compute :math:`V(s_{t+1})` for
 # added stability. The target network has its weights kept frozen most of
 # the time, but is updated with the policy network's weights every so often.
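
For context, the comment being corrected describes the single optimization step of the DQN tutorial. A minimal sketch of such a step is shown below; the names (`policy_net`, `target_net`, the pre-batched tensors, `gamma`) are assumptions for illustration, not the tutorial's actual code.

```python
import torch
import torch.nn as nn

def optimization_step(policy_net, target_net, optimizer, batch, gamma=0.99):
    # `batch` is assumed to already hold concatenated tensors:
    # states (N, obs), actions (N, 1), rewards (N,), non-terminal next
    # states, and a boolean mask marking which entries are non-terminal.
    states, actions, rewards, non_final_next_states, non_final_mask = batch

    # Q(s_t, a_t): values the policy network assigns to the actions taken.
    state_action_values = policy_net(states).gather(1, actions)

    # V(s_{t+1}) = max_a Q(s_{t+1}, a), computed with the frozen target
    # network; terminal states keep the default value of 0 by definition.
    next_state_values = torch.zeros(states.size(0), device=states.device)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0]

    # Combine into the expected Q values and the (Huber) loss.
    expected_state_action_values = rewards + gamma * next_state_values
    loss = nn.functional.smooth_l1_loss(
        state_action_values, expected_state_action_values.unsqueeze(1)
    )

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```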