
Commit 88bf81a

Merge branch 'master' into typofix
2 parents 8388c24 + bd1dfa3

9 files changed: 418 additions & 12 deletions

.jenkins/remove_runnable_code.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -16,9 +16,17 @@
             if line.startswith('#'):
                 ret_lines.append(line)
                 state = STATE_NORMAL
+            elif ((line.startswith('"""') or line.startswith('r"""')) and
+                    line.endswith('"""')):
+                ret_lines.append(line)
+                state = STATE_NORMAL
             elif line.startswith('"""') or line.startswith('r"""'):
                 ret_lines.append(line)
                 state = STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE
+            elif ((line.startswith("'''") or line.startswith("r'''")) and
+                    line.endswith("'''")):
+                ret_lines.append(line)
+                state = STATE_NORMAL
             elif line.startswith("'''") or line.startswith("r'''"):
                 ret_lines.append(line)
                 state = STATE_IN_MULTILINE_COMMENT_BLOCK_SINGLE_QUOTE
```
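For context, the new `elif` branches cover docstrings that open and close on the same line; previously such a line pushed the scanner into the multiline-comment state and the remainder of the file was treated as a comment block. Below is a minimal sketch of the idea, not the script itself: the `next_state` helper and the length guard are illustrative, and only the double-quote case is shown.

```python
STATE_NORMAL = 0
STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE = 1

def next_state(line, state):
    # Illustrative reduction of the scanner: decide the next state for one line.
    if state == STATE_NORMAL:
        if ((line.startswith('"""') or line.startswith('r"""'))
                and line.endswith('"""') and len(line) >= 6):
            return STATE_NORMAL  # one-line docstring: no state change
        if line.startswith('"""') or line.startswith('r"""'):
            return STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE
        return STATE_NORMAL
    # Inside a multi-line docstring: leave it once the closing quotes appear.
    return STATE_NORMAL if line.endswith('"""') else state

assert next_state('"""One-line docstring."""', STATE_NORMAL) == STATE_NORMAL
assert next_state('"""Opens a block', STATE_NORMAL) == \
    STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE
```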

advanced_source/dispatcher.rst

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,5 +1,5 @@
-Dispatcher in C++
-=================
+Registering a Dispatched Operator in C++
+========================================
 
 The dispatcher is an internal component of PyTorch which is responsible for
 figuring out what code should actually get run when you call a function like
```

advanced_source/rpc_ddp_tutorial/main.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -6,7 +6,7 @@
 import torch.distributed as dist
 import torch.distributed.autograd as dist_autograd
 import torch.distributed.rpc as rpc
-from torch.distributed.rpc import ProcessGroupRpcBackendOptions
+from torch.distributed.rpc import TensorPipeRpcBackendOptions
 import torch.multiprocessing as mp
 import torch.optim as optim
 from torch.distributed.optim import DistributedOptimizer
@@ -128,7 +128,7 @@ def run_worker(rank, world_size):
     os.environ['MASTER_PORT'] = '29500'
 
 
-    rpc_backend_options = ProcessGroupRpcBackendOptions()
+    rpc_backend_options = TensorPipeRpcBackendOptions()
     rpc_backend_options.init_method='tcp://localhost:29501'
 
     # Rank 2 is master, 3 is ps and 0 and 1 are trainers.
```
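Since the ProcessGroup RPC backend has been superseded, the tutorial code now uses the TensorPipe backend. A minimal sketch of initializing RPC with `TensorPipeRpcBackendOptions` follows; the worker name, address, port, and thread count are illustrative, not taken from the tutorial.

```python
import torch.distributed.rpc as rpc
from torch.distributed.rpc import TensorPipeRpcBackendOptions

def init_rpc_worker(name, rank, world_size):
    # The options object carries transport settings for the TensorPipe backend,
    # e.g. the rendezvous init_method and the size of its worker thread pool.
    options = TensorPipeRpcBackendOptions(
        init_method="tcp://localhost:29501",  # illustrative address/port
        num_worker_threads=16,                # illustrative thread count
    )
    rpc.init_rpc(
        name,
        rank=rank,
        world_size=world_size,
        rpc_backend_options=options,
    )
    # ... issue RPCs / create remote references here ...
    rpc.shutdown()  # blocks until all participants are done
```

The same replacement appears in the pipeline-parallelism tutorial further down, where the old `num_send_recv_threads` argument becomes `num_worker_threads`.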

beginner_source/dist_overview.rst

Lines changed: 10 additions & 0 deletions
```diff
@@ -195,3 +195,13 @@ RPC Tutorials are listed below:
    `@rpc.functions.async_execution <https://pytorch.org/docs/master/rpc.html#torch.distributed.rpc.functions.async_execution>`__
    decorator, which can help speed up inference and training. It uses similar
    RL and PS examples employed in the above tutorials 1 and 2.
+5. The `Combining Distributed DataParallel with Distributed RPC Framework <../advanced/rpc_ddp_tutorial.html>`__
+   tutorial demonstrates how to combine DDP with RPC to train a model using
+   distributed data parallelism combined with distributed model parallelism.
+
+
+PyTorch Distributed Developers
+------------------------------
+
+If you'd like to contribute to PyTorch Distributed, please refer to our
+`Developer Guide <https://github.com/pytorch/pytorch/blob/master/torch/distributed/CONTRIBUTING.md>`_.
```

intermediate_source/dist_pipeline_parallel_tutorial.rst

Lines changed: 1 addition & 1 deletion
```diff
@@ -316,7 +316,7 @@ where the ``shutdown`` by default will block until all RPC participants finish.
 def run_worker(rank, world_size, num_split):
     os.environ['MASTER_ADDR'] = 'localhost'
     os.environ['MASTER_PORT'] = '29500'
-    options = rpc.ProcessGroupRpcBackendOptions(num_send_recv_threads=128)
+    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=128)
 
     if rank == 0:
         rpc.init_rpc(
```

intermediate_source/dynamic_quantization_bert_tutorial.rst

Lines changed: 14 additions & 6 deletions
```diff
@@ -492,7 +492,7 @@ follows:
 
 | Prec | F1 score | Model Size | 1 thread | 4 threads |
 | FP32 | 0.9019   | 438 MB     | 160 sec  | 85 sec    |
-| INT8 | 0.8953   | 181 MB     | 90 sec   | 46 sec    |
+| INT8 | 0.902    | 181 MB     | 90 sec   | 46 sec    |
 
 We have 0.6% F1 score accuracy after applying the post-training dynamic
 quantization on the fine-tuned BERT model on the MRPC task. As a
@@ -520,15 +520,23 @@ processing the evaluation of MRPC dataset.
 3.3 Serialize the quantized model
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-We can serialize and save the quantized model for the future use.
+We can serialize and save the quantized model for the future use using
+`torch.jit.save` after tracing the model.
 
 .. code:: python
 
-    quantized_output_dir = configs.output_dir + "quantized/"
-    if not os.path.exists(quantized_output_dir):
-        os.makedirs(quantized_output_dir)
-    quantized_model.save_pretrained(quantized_output_dir)
+    input_ids = ids_tensor([8, 128], 2)
+    token_type_ids = ids_tensor([8, 128], 2)
+    attention_mask = ids_tensor([8, 128], vocab_size=2)
+    dummy_input = (input_ids, attention_mask, token_type_ids)
+    traced_model = torch.jit.trace(quantized_model, dummy_input)
+    torch.jit.save(traced_model, "bert_traced_eager_quant.pt")
 
+To load the quantized model, we can use `torch.jit.load`
+
+.. code:: python
+
+    loaded_quantized_model = torch.jit.load("bert_traced_eager_quant.pt")
 
 Conclusion
 ----------
```
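The trace/save/load pattern added here works for any dynamically quantized module, not just BERT. A minimal, self-contained sketch with a toy model standing in for the fine-tuned model; the layer sizes, input shapes, and file name are illustrative.

```python
import torch
import torch.nn as nn

# A toy module standing in for the fine-tuned BERT model (illustrative only).
model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 2)).eval()

# Post-training dynamic quantization of the Linear layers, as in the tutorial.
quantized = torch.quantization.quantize_dynamic(
    model, {nn.Linear}, dtype=torch.qint8
)

# Trace with a dummy input, then serialize with torch.jit.save.
dummy_input = torch.randn(8, 16)
traced = torch.jit.trace(quantized, dummy_input)
torch.jit.save(traced, "toy_quantized_traced.pt")

# Load the TorchScript module back and run it.
loaded = torch.jit.load("toy_quantized_traced.pt")
print(loaded(dummy_input).shape)  # torch.Size([8, 2])
```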

intermediate_source/model_parallel_tutorial.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -245,7 +245,7 @@ def plot(means, stds, labels, fig_name):
 # -----------------------------
 #
 # In the following experiments, we further divide each 120-image batch into
-# 20-image splits. As PyTorch launches CUDA operations asynchronizely, the
+# 20-image splits. As PyTorch launches CUDA operations asynchronously, the
 # implementation does not need to spawn multiple threads to achieve
 # concurrency.
 
```
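To illustrate the sentence being fixed: because CUDA kernels are launched asynchronously, a single Python thread can keep both GPUs busy by queuing the next split on the first stage while the previous split is still being processed by the second stage. A minimal sketch assuming two CUDA devices are available; the layer sizes and split size are illustrative, not the tutorial's ResNet pipeline.

```python
import torch
import torch.nn as nn

class TwoStagePipeline(nn.Module):
    def __init__(self, split_size=20):
        super().__init__()
        self.split_size = split_size
        self.stage1 = nn.Linear(128, 256).to('cuda:0')
        self.stage2 = nn.Linear(256, 10).to('cuda:1')

    def forward(self, x):
        splits = iter(x.split(self.split_size, dim=0))
        s_next = next(splits)
        s_prev = self.stage1(s_next).to('cuda:1')
        outputs = []
        for s_next in splits:
            # stage2 runs on cuda:1 while the next stage1 kernel is queued on
            # cuda:0; asynchronous launches overlap the two without threads.
            outputs.append(self.stage2(s_prev))
            s_prev = self.stage1(s_next).to('cuda:1')
        outputs.append(self.stage2(s_prev))
        return torch.cat(outputs)

# model = TwoStagePipeline()
# out = model(torch.randn(120, 128, device='cuda:0'))  # a 120-sample batch
```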
