
Commit 7081a2b

Merge branch 'master' into brianjo-speech-fix
2 parents ecdea63 + 760455d commit 7081a2b

16 files changed: +49 -52 lines changed

.circleci/scripts/build_for_windows.sh

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@ if [[ "${CIRCLE_JOB}" == *worker_* ]]; then
 python $DIR/remove_runnable_code.py advanced_source/static_quantization_tutorial.py advanced_source/static_quantization_tutorial.py || true
 python $DIR/remove_runnable_code.py beginner_source/hyperparameter_tuning_tutorial.py beginner_source/hyperparameter_tuning_tutorial.py || true
 python $DIR/remove_runnable_code.py beginner_source/audio_preprocessing_tutorial.py beginner_source/audio_preprocessing_tutorial.py || true
+python $DIR/remove_runnable_code.py beginner_source/dcgan_faces_tutorial.py beginner_source/dcgan_faces_tutorial.py || true
 python $DIR/remove_runnable_code.py intermediate_source/tensorboard_profiler_tutorial.py intermediate_source/tensorboard_profiler_tutorial.py || true
 # Temp remove for mnist download issue. (Re-enabled for 1.8.1)
 # python $DIR/remove_runnable_code.py beginner_source/fgsm_tutorial.py beginner_source/fgsm_tutorial.py || true

README.md

Lines changed: 2 additions & 2 deletions
@@ -28,10 +28,10 @@ In case you prefer to write your tutorial in jupyter, you can use [this script](
 - Then you can build using `make docs`. This will download the data, execute the tutorials and build the documentation to `docs/` directory. This will take about 60-120 min for systems with GPUs. If you do not have a GPU installed on your system, then see next step.
 - You can skip the computationally intensive graph generation by running `make html-noplot` to build basic html documentation to `_build/html`. This way, you can quickly preview your tutorial.
 
-> If you get **ModuleNotFoundError: No module named 'pytorch_sphinx_theme' make: *** [html-noplot] Error 2**, from /tutorials/src/pytorch-sphinx-theme run `python setup.py install`.
+> If you get **ModuleNotFoundError: No module named 'pytorch_sphinx_theme' make: *** [html-noplot] Error 2** from /tutorials/src/pytorch-sphinx-theme or /venv/src/pytorch-sphinx-theme (while using virtualenv), run `python setup.py install`.
 
 
 ## About contributing to PyTorch Documentation and Tutorials
 * You can find information about contributing to PyTorch documentation in the
 PyTorch Repo [README.md](https://github.com/pytorch/pytorch/blob/master/README.md) file.
-* Additional information can be found in [PyTorch CONTRIBUTING.md](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md).
+* Additional information can be found in [PyTorch CONTRIBUTING.md](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md).

beginner_source/basics/autogradqs_tutorial.py

Lines changed: 2 additions & 2 deletions
@@ -47,7 +47,7 @@
 #
 # In this network, ``w`` and ``b`` are **parameters**, which we need to
 # optimize. Thus, we need to be able to compute the gradients of loss
-# function with respect to those variables. In orded to do that, we set
+# function with respect to those variables. In order to do that, we set
 # the ``requires_grad`` property of those tensors.
 
 #######################################################################
@@ -58,7 +58,7 @@
 # A function that we apply to tensors to construct computational graph is
 # in fact an object of class ``Function``. This object knows how to
 # compute the function in the *forward* direction, and also how to compute
-# it's derivative during the *backward propagation* step. A reference to
+# its derivative during the *backward propagation* step. A reference to
 # the backward propagation function is stored in ``grad_fn`` property of a
 # tensor. You can find more information of ``Function`` `in the
 # documentation <https://pytorch.org/docs/stable/autograd.html#function>`__.
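For context, a minimal sketch (not part of this commit) of the behavior these comments describe, i.e. setting ``requires_grad`` on parameters and reading ``grad_fn``; the tensor shapes below are illustrative only:

    import torch

    x = torch.ones(5)                          # input tensor
    w = torch.randn(5, 3, requires_grad=True)  # parameters we want gradients for
    b = torch.randn(3, requires_grad=True)
    z = torch.matmul(x, w) + b

    print(z.grad_fn)        # reference to the backward function that produced z
    loss = z.sum()
    loss.backward()         # computes d(loss)/dw and d(loss)/db
    print(w.grad.shape, b.grad.shape)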

beginner_source/basics/buildmodel_tutorial.py

Lines changed: 2 additions & 2 deletions
@@ -67,7 +67,7 @@ def forward(self, x):
 
 ##############################################
 # We create an instance of ``NeuralNetwork``, and move it to the ``device``, and print
-# it's structure.
+# its structure.
 
 model = NeuralNetwork().to(device)
 print(model)
@@ -119,7 +119,7 @@ def forward(self, x):
 # nn.Linear
 # ^^^^^^^^^^^^^^^^^^^^^^
 # The `linear layer <https://pytorch.org/docs/stable/generated/torch.nn.Linear.html>`_
-# is a module that applies a linear transformation on the input using it's stored weights and biases.
+# is a module that applies a linear transformation on the input using its stored weights and biases.
 #
 layer1 = nn.Linear(in_features=28*28, out_features=20)
 hidden1 = layer1(flat_image)
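As a quick illustration of the corrected sentence (a sketch, not part of the diff): ``nn.Linear`` computes ``x @ weight.T + bias`` with the weights and biases stored on the module.

    import torch
    from torch import nn

    flat_image = torch.rand(3, 28*28)                       # batch of 3 flattened 28x28 images
    layer1 = nn.Linear(in_features=28*28, out_features=20)
    hidden1 = layer1(flat_image)                            # x @ weight.T + bias

    print(layer1.weight.shape)   # torch.Size([20, 784]) (stored weights)
    print(layer1.bias.shape)     # torch.Size([20])      (stored biases)
    print(hidden1.shape)         # torch.Size([3, 20])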

beginner_source/basics/data_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -225,7 +225,7 @@ def __getitem__(self, idx):
 # --------------------------
 #
 # We have loaded that dataset into the ``Dataloader`` and can iterate through the dataset as needed.
-# Each iteration below returns a batch of ``train_features`` and ``train_labels``(containing ``batch_size=64`` features and labels respectively).
+# Each iteration below returns a batch of ``train_features`` and ``train_labels`` (containing ``batch_size=64`` features and labels respectively).
 # Because we specified ``shuffle=True``, after we iterate over all batches the data is shuffled (for finer-grained control over
 # the data loading order, take a look at `Samplers <https://pytorch.org/docs/stable/data.html#data-loading-order-and-sampler>`_).
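A minimal sketch of the iteration described above; the stand-in ``TensorDataset`` replaces the tutorial's FashionMNIST data, and the shapes are assumptions for illustration only:

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # Stand-in dataset: 256 fake 1x28x28 images with integer class labels.
    images = torch.rand(256, 1, 28, 28)
    labels = torch.randint(0, 10, (256,))
    train_dataloader = DataLoader(TensorDataset(images, labels), batch_size=64, shuffle=True)

    # Each iteration returns a batch of train_features and train_labels.
    train_features, train_labels = next(iter(train_dataloader))
    print(train_features.size())   # torch.Size([64, 1, 28, 28])
    print(train_labels.size())     # torch.Size([64])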

beginner_source/blitz/README.txt

Lines changed: 4 additions & 5 deletions
@@ -13,12 +13,11 @@ Deep Learning with PyTorch: A 60 Minute Blitz
 Neural Networks
 https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html#
 
-4. autograd_tutorial.py
-Automatic Differentiation
-https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html
-
-5. cifar10_tutorial.py
+4. cifar10_tutorial.py
 Training a Classifier
 https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
 
+5. data_parallel_tutorial.py
+Optional: Data Parallelism
+https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html

beginner_source/blitz/neural_networks_tutorial.py

Lines changed: 3 additions & 2 deletions
@@ -176,8 +176,9 @@ def num_flat_features(self, x):
 # -> loss
 #
 # So, when we call ``loss.backward()``, the whole graph is differentiated
-# w.r.t. the loss, and all Tensors in the graph that have ``requires_grad=True``
-# will have their ``.grad`` Tensor accumulated with the gradient.
+# w.r.t. the neural net parameters, and all Tensors in the graph that have
+# ``requires_grad=True`` will have their ``.grad`` Tensor accumulated with the
+# gradient.
 #
 # For illustration, let us follow a few steps backward:
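To make the reworded sentence concrete, a tiny sketch (not from the tutorial; the one-layer net below is a stand-in for the tutorial's ``Net``):

    import torch
    from torch import nn

    net = nn.Linear(10, 1)                      # stand-in for the tutorial's Net
    output = net(torch.randn(1, 10))
    loss = nn.MSELoss()(output, torch.randn(1, 1))

    print(net.bias.grad)   # None: no backward pass has run yet
    loss.backward()        # differentiates the graph w.r.t. the net's parameters
    print(net.bias.grad)   # gradient has been accumulated into .grad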

beginner_source/chatbot_tutorial.py

Lines changed: 5 additions & 5 deletions
@@ -471,7 +471,7 @@ def trimRareWords(voc, pairs, MIN_COUNT):
 # with mini-batches.
 #
 # Using mini-batches also means that we must be mindful of the variation
-# of sentence length in our batches. To accomodate sentences of different
+# of sentence length in our batches. To accommodate sentences of different
 # sizes in the same batch, we will make our batched input tensor of shape
 # *(max_length, batch_size)*, where sentences shorter than the
 # *max_length* are zero padded after an *EOS_token*.
@@ -615,7 +615,7 @@ def batch2TrainData(voc, pair_batch):
 # in normal sequential order, and one that is fed the input sequence in
 # reverse order. The outputs of each network are summed at each time step.
 # Using a bidirectional GRU will give us the advantage of encoding both
-# past and future context.
+# past and future contexts.
 #
 # Bidirectional RNN:
 #
@@ -700,7 +700,7 @@ def forward(self, input_seq, input_lengths, hidden=None):
 # states to generate the next word in the sequence. It continues
 # generating words until it outputs an *EOS_token*, representing the end
 # of the sentence. A common problem with a vanilla seq2seq decoder is that
-# if we rely soley on the context vector to encode the entire input
+# if we rely solely on the context vector to encode the entire input
 # sequence’s meaning, it is likely that we will have information loss.
 # This is especially the case when dealing with long input sequences,
 # greatly limiting the capability of our decoder.
@@ -950,7 +950,7 @@ def maskNLLLoss(inp, target, mask):
 # sequence (or batch of sequences). We use the ``GRU`` layer like this in
 # the ``encoder``. The reality is that under the hood, there is an
 # iterative process looping over each time step calculating hidden states.
-# Alternatively, you ran run these modules one time-step at a time. In
+# Alternatively, you can run these modules one time-step at a time. In
 # this case, we manually loop over the sequences during the training
 # process like we must do for the ``decoder`` model. As long as you
 # maintain the correct conceptual model of these modules, implementing
@@ -1115,7 +1115,7 @@ def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, deco
 # softmax value. This decoding method is optimal on a single time-step
 # level.
 #
-# To facilite the greedy decoding operation, we define a
+# To facilitate the greedy decoding operation, we define a
 # ``GreedySearchDecoder`` class. When run, an object of this class takes
 # an input sequence (``input_seq``) of shape *(input_seq length, 1)*, a
 # scalar input length (``input_length``) tensor, and a ``max_length`` to
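The first hunk above describes batching sentences into a *(max_length, batch_size)* tensor; here is a small sketch of that padding and transposition (the token ids and PAD/EOS values are illustrative, loosely mirroring the tutorial's zeroPadding helper):

    import itertools
    import torch

    PAD_token, EOS_token = 0, 2                     # illustrative token ids

    # Two indexed sentences of different lengths, each ending with EOS_token.
    batch = [[5, 8, 11, EOS_token], [7, 4, EOS_token]]

    # Zero-pad and transpose so the tensor has shape (max_length, batch_size).
    padded = list(itertools.zip_longest(*batch, fillvalue=PAD_token))
    input_tensor = torch.LongTensor(padded)
    print(input_tensor.shape)                       # torch.Size([4, 2])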

beginner_source/dcgan_faces_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@
 # :math:`D` and :math:`G` play a minimax game in which :math:`D` tries to
 # maximize the probability it correctly classifies reals and fakes
 # (:math:`logD(x)`), and :math:`G` tries to minimize the probability that
-# :math:`D` will predict its outputs are fake (:math:`log(1-D(G(x)))`).
+# :math:`D` will predict its outputs are fake (:math:`log(1-D(G(z)))`).
 # From the paper, the GAN loss function is
 #
 # .. math:: \underset{G}{\text{min}} \underset{D}{\text{max}}V(D,G) = \mathbb{E}_{x\sim p_{data}(x)}\big[logD(x)\big] + \mathbb{E}_{z\sim p_{z}(z)}\big[log(1-D(G(z)))\big]
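Relating the corrected term :math:`log(1-D(G(z)))` to code, a toy sketch of how the two expectations map onto BCE terms; the score values and the real=1 / fake=0 label convention are assumptions for illustration, not part of this diff:

    import torch
    from torch import nn

    criterion = nn.BCELoss()
    d_real = torch.full((8,), 0.9)   # toy values for D(x) on real images
    d_fake = torch.full((8,), 0.1)   # toy values for D(G(z)) on generated images

    # D maximizes E[log D(x)] + E[log(1 - D(G(z)))], i.e. minimizes this BCE sum.
    loss_d = criterion(d_real, torch.ones(8)) + criterion(d_fake, torch.zeros(8))
    # G opposes the second term; in practice it is usually trained with real labels,
    # minimizing -log D(G(z)) (the non-saturating variant).
    loss_g = criterion(d_fake, torch.ones(8))
    print(loss_d.item(), loss_g.item())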

beginner_source/nlp/pytorch_tutorial.py

Lines changed: 4 additions & 4 deletions
@@ -9,7 +9,7 @@
 All of deep learning is computations on tensors, which are
 generalizations of a matrix that can be indexed in more than 2
 dimensions. We will see exactly what this means in-depth later. First,
-lets look what we can do with tensors.
+let's look what we can do with tensors.
 """
 # Author: Robert Guthrie
 
@@ -162,7 +162,7 @@
 # other operation, etc.)
 #
 # If ``requires_grad=True``, the Tensor object keeps track of how it was
-# created. Lets see it in action.
+# created. Let's see it in action.
 #
 
 # Tensor factory methods have a ``requires_grad`` flag
@@ -187,7 +187,7 @@
 # But how does that help us compute a gradient?
 #
 
-# Lets sum up all the entries in z
+# Let's sum up all the entries in z
 s = z.sum()
 print(s)
 print(s.grad_fn)
@@ -222,7 +222,7 @@
 
 
 ######################################################################
-# Lets have Pytorch compute the gradient, and see that we were right:
+# Let's have Pytorch compute the gradient, and see that we were right:
 # (note if you run this block multiple times, the gradient will increment.
 # That is because Pytorch *accumulates* the gradient into the .grad
 # property, since for many models this is very convenient.)
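A minimal sketch (not part of the change) of the accumulation behavior the last hunk notes: running backward twice increments ``.grad`` rather than overwriting it.

    import torch

    x = torch.ones(3, requires_grad=True)
    s = (x * 2).sum()

    s.backward(retain_graph=True)
    print(x.grad)   # tensor([2., 2., 2.])
    s.backward()    # second backward pass: gradients are accumulated into .grad
    print(x.grad)   # tensor([4., 4., 4.])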
