diff --git a/chapter_appendix-mathematics-for-deep-learning/maximum-likelihood.md b/chapter_appendix-mathematics-for-deep-learning/maximum-likelihood.md
index 92baae30e2..7e25c86c20 100644
--- a/chapter_appendix-mathematics-for-deep-learning/maximum-likelihood.md
+++ b/chapter_appendix-mathematics-for-deep-learning/maximum-likelihood.md
@@ -130,7 +130,7 @@ This can be written into code, and freely optimized even for billions of coin fl
 n_H = 8675309
 n_T = 256245
 
-# Initialize our paramteres
+# Initialize our parameters
 theta = np.array(0.5)
 theta.attach_grad()
 
@@ -152,7 +152,7 @@ theta, n_H / (n_H + n_T)
 n_H = 8675309
 n_T = 256245
 
-# Initialize our paramteres
+# Initialize our parameters
 theta = torch.tensor(0.5, requires_grad=True)
 
 # Perform gradient descent
@@ -174,7 +174,7 @@ theta, n_H / (n_H + n_T)
 n_H = 8675309
 n_T = 256245
 
-# Initialize our paramteres
+# Initialize our parameters
 theta = tf.Variable(tf.constant(0.5))
 
 # Perform gradient descent
diff --git a/chapter_appendix-tools-for-deep-learning/utils.md b/chapter_appendix-tools-for-deep-learning/utils.md
index 1be0f4efd8..940164934c 100644
--- a/chapter_appendix-tools-for-deep-learning/utils.md
+++ b/chapter_appendix-tools-for-deep-learning/utils.md
@@ -108,13 +108,13 @@ def draw(self, x, y, label, every_n=1):
     display.clear_output(wait=True)
 ```
 
-Add FrozenLake enviroment
+Add FrozenLake environment
 
 ```{.python .input}
 %%tab pytorch
 
 def frozen_lake(seed):  #@save
     # See https://www.gymlibrary.dev/environments/toy_text/frozen_lake/ to learn more about this env
-    # How to process env.P.items is adpated from https://sites.google.com/view/deep-rl-bootcamp/labs
+    # How to process env.P.items is adapted from https://sites.google.com/view/deep-rl-bootcamp/labs
     import gym
 
     env = gym.make('FrozenLake-v1', is_slippery=False)
@@ -145,7 +145,7 @@ def frozen_lake(seed):  #@save
 
 ```
 
-Create enviroment
+Create environment
 
 ```{.python .input}
 %%tab pytorch
@@ -707,7 +707,7 @@ import hashlib
 def download(url, folder='../data', sha1_hash=None):  #@save
     """Download a file to folder and return the local filepath."""
     if not url.startswith('http'):
-        # For back compatability
+        # For back compatibility
         url, sha1_hash = DATA_HUB[url]
     os.makedirs(folder, exist_ok=True)
     fname = os.path.join(folder, url.split('/')[-1])
diff --git a/chapter_computational-performance/hybridize.md b/chapter_computational-performance/hybridize.md
index 579ace2bf5..46084ed939 100644
--- a/chapter_computational-performance/hybridize.md
+++ b/chapter_computational-performance/hybridize.md
@@ -160,8 +160,8 @@ By converting the model using `torch.jit.script` function, we are able to compil
 :end_tab:
 
 :begin_tab:`tensorflow`
-Formerly, all functions built in TensorFlow were built as a computational graph, and therefore JIT compiled by default. However, with the release of TensorFlow 2.X and EagerTensor, this is no longer the default behavor.
-We cen re-enable this functionality with tf.function. tf.function is more commonly used as a function decorator, however it is possible to call it direcly as a normal python function, shown below. The model's computation result remains unchanged.
+Formerly, all functions built in TensorFlow were built as a computational graph, and therefore JIT compiled by default. However, with the release of TensorFlow 2.X and EagerTensor, this is no longer the default behavior.
+We can re-enable this functionality with tf.function. tf.function is more commonly used as a function decorator; however, it is possible to call it directly as a normal Python function, as shown below. The model's computation result remains unchanged.
 :end_tab:
 
 ```{.python .input}
diff --git a/chapter_computer-vision/kaggle-cifar10.md b/chapter_computer-vision/kaggle-cifar10.md
index 89441cbefe..efdba408f0 100644
--- a/chapter_computer-vision/kaggle-cifar10.md
+++ b/chapter_computer-vision/kaggle-cifar10.md
@@ -582,7 +582,7 @@ is similar to that in :numref:`sec_kaggle_house`.
 * We can read datasets containing raw image files after organizing them into the required format.
 
 :begin_tab:`mxnet`
-* We can use convolutional neural networks, image augmentation, and hybrid programing in an image classification competition.
+* We can use convolutional neural networks, image augmentation, and hybrid programming in an image classification competition.
 :end_tab:
 
 :begin_tab:`pytorch`
diff --git a/chapter_convolutional-modern/cnn-design.md b/chapter_convolutional-modern/cnn-design.md
index 429f055c17..b386d3c44a 100644
--- a/chapter_convolutional-modern/cnn-design.md
+++ b/chapter_convolutional-modern/cnn-design.md
@@ -253,7 +253,7 @@ Consider the problem of identifying good parameters in the AnyNet design space.
 1. Results obtained at a smaller scale (for smaller networks) generalize to larger ones. Consequently, optimization is carried out for networks that are structurally similar, but with a smaller number of blocks, fewer channels, etc. Only in the end will we need to verify that the so-found networks also offer good performance at scale.
 1. Aspects of the design can be approximately factorized so that it is possible to infer their effect on the quality of the outcome somewhat independently. In other words, the optimization problem is moderately easy.
 
-These assumptions allow us to test many networks cheaply. In particular, we can *sample* uniformly from the space of configurations and evaluate their performance. Subsequently, we can evaluate the quality of the choice of parameters by reviewing the *distribution* of error/accuracy that can be achieved with said networks. Denote by $F(e)$ the cumulative distribution function (CDF) for errors committed by networks of a given design space, drawn using probability disribution $p$. That is,
+These assumptions allow us to test many networks cheaply. In particular, we can *sample* uniformly from the space of configurations and evaluate their performance. Subsequently, we can evaluate the quality of the choice of parameters by reviewing the *distribution* of error/accuracy that can be achieved with said networks. Denote by $F(e)$ the cumulative distribution function (CDF) for errors committed by networks of a given design space, drawn using probability distribution $p$. That is,
 
 $$F(e, p) \stackrel{\textrm{def}}{=} P_{\textrm{net} \sim p} \{e(\textrm{net}) \leq e\}.$$
 
@@ -262,7 +262,7 @@ Our goal is now to find a distribution $p$ over *networks* such that most networ
 $$\hat{F}(e, \mathcal{Z}) = \frac{1}{n}\sum_{i=1}^n \mathbf{1}(e_i \leq e).$$
 
 Whenever the CDF for one set of choices majorizes (or matches) another CDF it follows that its choice of parameters is superior (or indifferent). Accordingly
-:citet:`Radosavovic.Kosaraju.Girshick.ea.2020` experimented with a shared network bottleneck ratio $k_i = k$ for all stages $i$ of the network. This gets rid of three of the four parameters governing the bottleneck ratio. To assess whether this (negatively) affects the performance one can draw networks from the constrained and from the unconstrained distribution and compare the corresonding CDFs. It turns out that this constraint does not affect the accuracy of the distribution of networks at all, as can be seen in the first panel of :numref:`fig_regnet-fig`.
+:citet:`Radosavovic.Kosaraju.Girshick.ea.2020` experimented with a shared network bottleneck ratio $k_i = k$ for all stages $i$ of the network. This gets rid of three of the four parameters governing the bottleneck ratio. To assess whether this (negatively) affects the performance, one can draw networks from the constrained and from the unconstrained distribution and compare the corresponding CDFs. It turns out that this constraint does not affect the accuracy of the distribution of networks at all, as can be seen in the first panel of :numref:`fig_regnet-fig`.
 
 Likewise, we could choose to pick the same group width $g_i = g$ occurring at the various stages of the network. Again, this does not affect performance, as can be seen in the second panel of :numref:`fig_regnet-fig`. Both steps combined reduce the number of free parameters by six.
 
diff --git a/chapter_convolutional-modern/resnet.md b/chapter_convolutional-modern/resnet.md
index 0879be814e..bae8c21789 100644
--- a/chapter_convolutional-modern/resnet.md
+++ b/chapter_convolutional-modern/resnet.md
@@ -708,7 +708,7 @@ adopts residual connections (together with other design choices) and is pervasiv
 in areas as diverse as language, vision, speech, and reinforcement learning.
 
 
-ResNeXt is an example for how the design of convolutional neural networks has evolved over time: by being more frugal with computation and trading it off against the size of the activations (number of channels), it allows for faster and more accurate networks at lower cost. An alternative way of viewing grouped convolutions is to think of a block-diagonal matrix for the convolutional weights. Note that there are quite a few such "tricks" that lead to more efficient networks. For instance, ShiftNet :cite:`wu2018shift` mimicks the effects of a $3 \times 3$ convolution, simply by adding shifted activations to the channels, offering increased function complexity, this time without any computational cost.
+ResNeXt is an example of how the design of convolutional neural networks has evolved over time: by being more frugal with computation and trading it off against the size of the activations (number of channels), it allows for faster and more accurate networks at lower cost. An alternative way of viewing grouped convolutions is to think of a block-diagonal matrix for the convolutional weights. Note that there are quite a few such "tricks" that lead to more efficient networks. For instance, ShiftNet :cite:`wu2018shift` mimics the effects of a $3 \times 3$ convolution, simply by adding shifted activations to the channels, offering increased function complexity, this time without any computational cost.
 
 A common feature of the designs we have discussed so far is that the network design is fairly manual, primarily relying on the ingenuity of the designer to find the "right" network hyperparameters. While clearly feasible, it is also very costly in terms of human time and there is no guarantee that the outcome is optimal in any sense. In :numref:`sec_cnn-design` we will discuss a number of strategies for obtaining high quality networks in a more automated fashion. In particular, we will review the notion of *network design spaces* that led to the RegNetX/Y models :cite:`Radosavovic.Kosaraju.Girshick.ea.2020`.
diff --git a/chapter_convolutional-modern/vgg.md b/chapter_convolutional-modern/vgg.md
index 90fdf97aee..ba007fe3c0 100644
--- a/chapter_convolutional-modern/vgg.md
+++ b/chapter_convolutional-modern/vgg.md
@@ -148,7 +148,7 @@ the first consisting mostly of convolutional and pooling layers
 and the second consisting of fully connected layers that are identical to those in AlexNet.
 The key difference is
 that the convolutional layers are grouped in nonlinear transformations that
-leave the dimensonality unchanged, followed by a resolution-reduction step, as
+leave the dimensionality unchanged, followed by a resolution-reduction step, as
 depicted in :numref:`fig_vgg`.
 
 ![From AlexNet to VGG. The key difference is that VGG consists of blocks of layers, whereas AlexNet's layers are all designed individually.](../img/vgg.svg)
diff --git a/chapter_generative-adversarial-networks/dcgan.md b/chapter_generative-adversarial-networks/dcgan.md
index 96a9266191..5ef8e6082d 100644
--- a/chapter_generative-adversarial-networks/dcgan.md
+++ b/chapter_generative-adversarial-networks/dcgan.md
@@ -445,7 +445,7 @@ net_D = tf.keras.Sequential([
     D_block(n_D),  # Output: (32, 32, 64)
     D_block(out_channels=n_D*2),  # Output: (16, 16, 64 * 2)
     D_block(out_channels=n_D*4),  # Output: (8, 8, 64 * 4)
-    D_block(out_channels=n_D*8),  # Outupt: (4, 4, 64 * 64)
+    D_block(out_channels=n_D*8),  # Output: (4, 4, 64 * 8)
     # Output: (1, 1, 1)
     tf.keras.layers.Conv2D(1, kernel_size=4, use_bias=False)
 ])
diff --git a/chapter_hyperparameter-optimization/hyperopt-api.md b/chapter_hyperparameter-optimization/hyperopt-api.md
index 8135aa94e4..2e32b0a407 100644
--- a/chapter_hyperparameter-optimization/hyperopt-api.md
+++ b/chapter_hyperparameter-optimization/hyperopt-api.md
@@ -124,7 +124,7 @@ distributed HPO cases.
 class HPOTuner(d2l.HyperParameters):  #@save
     def __init__(self, scheduler: HPOScheduler, objective: callable):
         self.save_hyperparameters()
-        # Bookeeping results for plotting
+        # Bookkeeping results for plotting
         self.incumbent = None
         self.incumbent_error = None
         self.incumbent_trajectory = []
diff --git a/chapter_hyperparameter-optimization/sh-async.md b/chapter_hyperparameter-optimization/sh-async.md
index 94e1635d8a..745c756316 100644
--- a/chapter_hyperparameter-optimization/sh-async.md
+++ b/chapter_hyperparameter-optimization/sh-async.md
@@ -36,7 +36,7 @@ trials, i.e., Trial-0 and Trial-3 to the next rung level. This causes idle time
 Worker-1. Then, we continue with Rung 1. Also, here Trial-3 takes longer than
 Trial-0, which leads to an additional ideling time of Worker-0. Once, we reach
 Rung-2, only the best trial, Trial-0, remains which occupies only one worker. To avoid that
-Worker-1 idles during that time, most implementaitons of SH continue already with
+Worker-1 idles during that time, most implementations of SH already continue with
 the next round, and start evaluating new trials (e.g Trial-4) on the first rung.
 
 ![Synchronous successive halving with two workers.](../img/sync_sh.svg)
diff --git a/chapter_linear-classification/classification.md b/chapter_linear-classification/classification.md
index 8020d4627b..745b2dfbe7 100644
--- a/chapter_linear-classification/classification.md
+++ b/chapter_linear-classification/classification.md
@@ -201,7 +201,7 @@ Classification is a sufficiently common problem that it warrants its own conveni
 1. Denote by $L_\textrm{v}$ the validation loss, and let $L_\textrm{v}^\textrm{q}$ be its quick and dirty estimate computed by the loss function averaging in this section. Lastly, denote by $l_\textrm{v}^\textrm{b}$ the loss on the last minibatch. Express $L_\textrm{v}$ in terms of $L_\textrm{v}^\textrm{q}$, $l_\textrm{v}^\textrm{b}$, and the sample and minibatch sizes.
 1. Show that the quick and dirty estimate $L_\textrm{v}^\textrm{q}$ is unbiased. That is, show that $E[L_\textrm{v}] = E[L_\textrm{v}^\textrm{q}]$. Why would you still want to use $L_\textrm{v}$ instead?
-1. Given a multiclass classification loss, denoting by $l(y,y')$ the penalty of estimating $y'$ when we see $y$ and given a probabilty $p(y \mid x)$, formulate the rule for an optimal selection of $y'$. Hint: express the expected loss, using $l$ and $p(y \mid x)$.
+1. Given a multiclass classification loss, denoting by $l(y,y')$ the penalty of estimating $y'$ when we see $y$ and given a probability $p(y \mid x)$, formulate the rule for an optimal selection of $y'$. Hint: express the expected loss, using $l$ and $p(y \mid x)$.
 
 :begin_tab:`mxnet`
 [Discussions](https://discuss.d2l.ai/t/6808)
diff --git a/chapter_linear-regression/linear-regression.md b/chapter_linear-regression/linear-regression.md
index 40a15a5b2d..b9a9d45d88 100644
--- a/chapter_linear-regression/linear-regression.md
+++ b/chapter_linear-regression/linear-regression.md
@@ -703,7 +703,7 @@ are chosen to minimize squared loss on the training set.
 We also motivated this choice of objective
 both via some practical considerations
 and through an interpretation
-of linear regression as maximimum likelihood estimation
+of linear regression as maximum likelihood estimation
 under an assumption of linearity and Gaussian noise.
 
 After discussing both computational considerations
 and connections to statistics,
diff --git a/chapter_multilayer-perceptrons/dropout.md b/chapter_multilayer-perceptrons/dropout.md
index 14befcafb3..12cc7a822f 100644
--- a/chapter_multilayer-perceptrons/dropout.md
+++ b/chapter_multilayer-perceptrons/dropout.md
@@ -9,7 +9,7 @@ tab.interact_select(['mxnet', 'pytorch', 'tensorflow', 'jax'])
 
 Let's think briefly about what we expect from a good predictive model.
-We want it to peform well on unseen data.
+We want it to perform well on unseen data.
 Classical generalization theory
 suggests that to close the gap between
 train and test performance,
diff --git a/chapter_natural-language-processing-pretraining/bert-dataset.md b/chapter_natural-language-processing-pretraining/bert-dataset.md
index bcba16fd2b..e12700131f 100644
--- a/chapter_natural-language-processing-pretraining/bert-dataset.md
+++ b/chapter_natural-language-processing-pretraining/bert-dataset.md
@@ -414,7 +414,7 @@ len(vocab)
 
 ## Summary
 
-* Comparing with the PTB dataset, the WikiText-2 dateset retains the original punctuation, case and numbers, and is over twice larger.
+* Compared with the PTB dataset, the WikiText-2 dataset retains the original punctuation, case, and numbers, and is more than twice as large.
 * We can arbitrarily access the pretraining (masked language modeling and next sentence prediction) examples generated from a pair of sentences from the WikiText-2 corpus.
diff --git a/chapter_natural-language-processing-pretraining/glove.md b/chapter_natural-language-processing-pretraining/glove.md
index eee831c6b5..92b37d05c7 100644
--- a/chapter_natural-language-processing-pretraining/glove.md
+++ b/chapter_natural-language-processing-pretraining/glove.md
@@ -211,10 +211,10 @@ and their ratios based on statistics from a large corpus.
 
 We can observe the following from :numref:`tab_glove`:
 
-* For a word $w_k$ that is related to "ice" but unrelated to "steam", such as $w_k=\textrm{solid}$, we expect a larger ratio of co-occurence probabilities, such as 8.9.
-* For a word $w_k$ that is related to "steam" but unrelated to "ice", such as $w_k=\textrm{gas}$, we expect a smaller ratio of co-occurence probabilities, such as 0.085.
-* For a word $w_k$ that is related to both "ice" and "steam", such as $w_k=\textrm{water}$, we expect a ratio of co-occurence probabilities that is close to 1, such as 1.36.
-* For a word $w_k$ that is unrelated to both "ice" and "steam", such as $w_k=\textrm{fashion}$, we expect a ratio of co-occurence probabilities that is close to 1, such as 0.96.
+* For a word $w_k$ that is related to "ice" but unrelated to "steam", such as $w_k=\textrm{solid}$, we expect a larger ratio of co-occurrence probabilities, such as 8.9.
+* For a word $w_k$ that is related to "steam" but unrelated to "ice", such as $w_k=\textrm{gas}$, we expect a smaller ratio of co-occurrence probabilities, such as 0.085.
+* For a word $w_k$ that is related to both "ice" and "steam", such as $w_k=\textrm{water}$, we expect a ratio of co-occurrence probabilities that is close to 1, such as 1.36.
+* For a word $w_k$ that is unrelated to both "ice" and "steam", such as $w_k=\textrm{fashion}$, we expect a ratio of co-occurrence probabilities that is close to 1, such as 0.96.
diff --git a/chapter_preface/index.md b/chapter_preface/index.md
index 36deedbbb8..f946ccce07 100644
--- a/chapter_preface/index.md
+++ b/chapter_preface/index.md
@@ -315,7 +315,7 @@ tweaking the code in small ways and observing the results.
 Ideally, an elegant mathematical theory might tell us
 precisely how to tweak our code
 to achieve a desired result.
 However, deep learning practitioners today
 must often tread where no solid theory provides guidance.
 Despite our best attempts, formal explanations for the efficacy
 of various techniques are still lacking, for a variety of reasons:
 the mathematics to characterize these models
@@ -421,7 +421,7 @@ Below lists dependencies in our TensorFlow implementation.
 
 Most of the code in this book is based on Jax,
 an open-source framework enabling composable function
 transformations such as differentiation of arbitrary
-Python and NumPy functions, as well as JIT compliation,
+Python and NumPy functions, as well as JIT compilation,
 vectorization and much more! It is becoming popular
 in the machine learning research space and has an
 easy-to-learn NumPy-like API. Actually, JAX tries
diff --git a/chapter_recurrent-modern/lstm.md b/chapter_recurrent-modern/lstm.md
index 0bc8567a13..d8f94b21d7 100644
--- a/chapter_recurrent-modern/lstm.md
+++ b/chapter_recurrent-modern/lstm.md
@@ -520,7 +520,7 @@ they rose to great prominence with some victories in
 prediction competitions in the mid-2000s, and became
 the dominant models for sequence learning from 2011
 until the rise of Transformer models, starting in 2017.
-Even Tranformers owe some of their key ideas
+Even Transformers owe some of their key ideas
 to architecture design innovations
 introduced by the LSTM.
diff --git a/chapter_reinforcement-learning/qlearning.md b/chapter_reinforcement-learning/qlearning.md
index e9c4b0d4c1..753b383a76 100644
--- a/chapter_reinforcement-learning/qlearning.md
+++ b/chapter_reinforcement-learning/qlearning.md
@@ -46,7 +46,7 @@ There can be situations when there are multiple deterministic policies that corr
 
 ## Exploration in Q-Learning
 
-The policy used by the robot to collect data $\pi_e$ is critical to ensure that Q-Learning works well. Afterall, we have replaced the expectation over $s'$ using the transition function $P(s' \mid s, a)$ using the data collected by the robot. If the policy $\pi_e$ does not reach diverse parts of the state-action space, then it is easy to imagine our estimate $\hat{Q}$ will be a poor approximation of the optimal $Q^*$. It is also important to note that in such a situation, the estimate of $Q^*$ at *all states* $s \in \mathcal{S}$ will be bad, not just the ones visited by $\pi_e$. This is because the Q-Learning objective (or value iteration) is a constraint that ties together the value of all state-action pairs. It is therefore critical to pick the correct policy $\pi_e$ to collect data.
+The policy used by the robot to collect data $\pi_e$ is critical to ensure that Q-Learning works well. After all, we have approximated the expectation over $s'$ under the transition function $P(s' \mid s, a)$ using the data collected by the robot. If the policy $\pi_e$ does not reach diverse parts of the state-action space, then it is easy to imagine that our estimate $\hat{Q}$ will be a poor approximation of the optimal $Q^*$. It is also important to note that in such a situation, the estimate of $Q^*$ at *all states* $s \in \mathcal{S}$ will be bad, not just the ones visited by $\pi_e$. This is because the Q-Learning objective (or value iteration) is a constraint that ties together the value of all state-action pairs. It is therefore critical to pick the correct policy $\pi_e$ to collect data.
 
 We can mitigate this concern by picking a completely random policy $\pi_e$ that samples actions uniformly randomly from $\mathcal{A}$. Such a policy would visit all states, but it will take a large number of trajectories before it does so.
diff --git a/chapter_reinforcement-learning/value-iter.md b/chapter_reinforcement-learning/value-iter.md
index eaecbccdca..5bea226d5f 100644
--- a/chapter_reinforcement-learning/value-iter.md
+++ b/chapter_reinforcement-learning/value-iter.md
@@ -58,11 +58,11 @@ Let us observe that for a deterministic policy where there is only one action th
 
 $$\pi^*(s) = \underset{a \in \mathcal{A}}{\mathrm{argmax}} \Big[ r(s, a) + \gamma \sum_{s' \in \mathcal{S}} P(s' \mid s, a)\ V^*(s') \Big].$$
 
-A good mnemonic to remember this is that the optimal action at state $s$ (for a deterministic policy) is the one that maximizes the sum of reward $r(s, a)$ from the first stage and the average *return* of the trajectories starting from the next sate $s'$, averaged over all possible next states $s'$ from the second stage.
+A good mnemonic to remember this is that the optimal action at state $s$ (for a deterministic policy) is the one that maximizes the sum of reward $r(s, a)$ from the first stage and the average *return* of the trajectories starting from the next state $s'$, averaged over all possible next states $s'$ from the second stage.
 
 ## Principle of Dynamic Programming
 
-Our developement in the previous section in :eqref:`eq_dynamic_programming` or :eqref:`eq_dynamic_programming_q` can be turned into an algorithm to compute the optimal value function $V^*$ or the action-value function $Q^*$, respectively. Observe that
+Our development in the previous section in :eqref:`eq_dynamic_programming` or :eqref:`eq_dynamic_programming_q` can be turned into an algorithm to compute the optimal value function $V^*$ or the action-value function $Q^*$, respectively. Observe that
 
 $$ V^*(s) = \sum_{a \in \mathcal{A}} \pi^*(a \mid s) \Big[ r(s, a) + \gamma\ \sum_{s' \in \mathcal{S}} P(s' \mid s, a) V^*(s') \Big];\ \textrm{for all } s \in \mathcal{S}.$$
 
 For a deterministic optimal policy $\pi^*$, since there is only one action that can be taken at state $s$, we can also write
@@ -99,7 +99,7 @@ The algorithm for computing the action-value function $Q^\pi(s, a)$ of a policy
 
 ## Implementation of Value Iteration
 :label:`subsec_valueitercode`
 
-We next show how to implement Value Iteration for a navigation problem called FrozenLake from [Open AI Gym](https://gym.openai.com). We first need to setup the enviroment as shown in the following code.
+We next show how to implement Value Iteration for a navigation problem called FrozenLake from [OpenAI Gym](https://gym.openai.com). We first need to set up the environment as shown in the following code.
 
 ```{.python .input}
 %%tab all
diff --git a/d2l/jax.py b/d2l/jax.py
index b1210edc9c..aba31e70df 100644
--- a/d2l/jax.py
+++ b/d2l/jax.py
@@ -1504,7 +1504,7 @@ def download(url, folder='../data', sha1_hash=None):
 
     Defined in :numref:`sec_utils`"""
     if not url.startswith('http'):
-        # For back compatability
+        # For back compatibility
         url, sha1_hash = DATA_HUB[url]
     os.makedirs(folder, exist_ok=True)
     fname = os.path.join(folder, url.split('/')[-1])
diff --git a/d2l/mxnet.py b/d2l/mxnet.py
index 8e7d7673a3..089c46123b 100644
--- a/d2l/mxnet.py
+++ b/d2l/mxnet.py
@@ -3101,7 +3101,7 @@ def download(url, folder='../data', sha1_hash=None):
 
     Defined in :numref:`sec_utils`"""
     if not url.startswith('http'):
-        # For back compatability
+        # For back compatibility
         url, sha1_hash = DATA_HUB[url]
     os.makedirs(folder, exist_ok=True)
     fname = os.path.join(folder, url.split('/')[-1])
diff --git a/d2l/tensorflow.py b/d2l/tensorflow.py
index fd9ca23fda..aba81af507 100644
--- a/d2l/tensorflow.py
+++ b/d2l/tensorflow.py
@@ -1695,7 +1695,7 @@ def download(url, folder='../data', sha1_hash=None):
 
     Defined in :numref:`sec_utils`"""
     if not url.startswith('http'):
-        # For back compatability
+        # For back compatibility
         url, sha1_hash = DATA_HUB[url]
     os.makedirs(folder, exist_ok=True)
     fname = os.path.join(folder, url.split('/')[-1])
diff --git a/d2l/torch.py b/d2l/torch.py
index 84ce7da901..dcd631cfc8 100644
--- a/d2l/torch.py
+++ b/d2l/torch.py
@@ -2661,7 +2661,7 @@ class HPOTuner(d2l.HyperParameters):
     """Defined in :numref:`sec_api_hpo`"""
     def __init__(self, scheduler: HPOScheduler, objective: callable):
         self.save_hyperparameters()
-        # Bookeeping results for plotting
+        # Bookkeeping results for plotting
         self.incumbent = None
         self.incumbent_error = None
         self.incumbent_trajectory = []
@@ -2814,7 +2814,7 @@ def update_G(Z, net_D, net_G, loss, trainer_G):
 def frozen_lake(seed):
     """Defined in :numref:`sec_utils`"""
     # See https://www.gymlibrary.dev/environments/toy_text/frozen_lake/ to learn more about this env
-    # How to process env.P.items is adpated from https://sites.google.com/view/deep-rl-bootcamp/labs
+    # How to process env.P.items is adapted from https://sites.google.com/view/deep-rl-bootcamp/labs
     import gym
 
     env = gym.make('FrozenLake-v1', is_slippery=False)
@@ -3199,7 +3199,7 @@ def download(url, folder='../data', sha1_hash=None):
 
     Defined in :numref:`sec_utils`"""
     if not url.startswith('http'):
-        # For back compatability
+        # For back compatibility
         url, sha1_hash = DATA_HUB[url]
     os.makedirs(folder, exist_ok=True)
     fname = os.path.join(folder, url.split('/')[-1])