
Commit 88bf81a

Merge branch 'master' into typofix
2 parents 8388c24 + bd1dfa3

9 files changed: 418 additions & 12 deletions

.jenkins/remove_runnable_code.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -16,9 +16,17 @@
             if line.startswith('#'):
                 ret_lines.append(line)
                 state = STATE_NORMAL
+            elif ((line.startswith('"""') or line.startswith('r"""')) and
+                    line.endswith('"""')):
+                ret_lines.append(line)
+                state = STATE_NORMAL
             elif line.startswith('"""') or line.startswith('r"""'):
                 ret_lines.append(line)
                 state = STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE
+            elif ((line.startswith("'''") or line.startswith("r'''")) and
+                    line.endswith("'''")):
+                ret_lines.append(line)
+                state = STATE_NORMAL
             elif line.startswith("'''") or line.startswith("r'''"):
                 ret_lines.append(line)
                 state = STATE_IN_MULTILINE_COMMENT_BLOCK_SINGLE_QUOTE
```
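For context, the new `elif` branches cover docstrings that open and close on the same line; previously such a line pushed the scanner into the multiline-comment state and the remainder of the file was treated as a comment block. Below is a minimal sketch of the idea, not the script itself: the `next_state` helper and the length guard are illustrative, and only the double-quote case is shown.

```python
STATE_NORMAL = 0
STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE = 1

def next_state(line, state):
    # Illustrative reduction of the scanner: decide the next state for one line.
    if state == STATE_NORMAL:
        if ((line.startswith('"""') or line.startswith('r"""'))
                and line.endswith('"""') and len(line) >= 6):
            return STATE_NORMAL  # one-line docstring: no state change
        if line.startswith('"""') or line.startswith('r"""'):
            return STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE
        return STATE_NORMAL
    # Inside a multi-line docstring: leave it once the closing quotes appear.
    return STATE_NORMAL if line.endswith('"""') else state

assert next_state('"""One-line docstring."""', STATE_NORMAL) == STATE_NORMAL
assert next_state('"""Opens a block', STATE_NORMAL) == \
    STATE_IN_MULTILINE_COMMENT_BLOCK_DOUBLE_QUOTE
```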

advanced_source/dispatcher.rst

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,5 +1,5 @@
-Dispatcher in C++
-=================
+Registering a Dispatched Operator in C++
+========================================
 
 The dispatcher is an internal component of PyTorch which is responsible for
 figuring out what code should actually get run when you call a function like
```

advanced_source/rpc_ddp_tutorial/main.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -6,7 +6,7 @@
 import torch.distributed as dist
 import torch.distributed.autograd as dist_autograd
 import torch.distributed.rpc as rpc
-from torch.distributed.rpc import ProcessGroupRpcBackendOptions
+from torch.distributed.rpc import TensorPipeRpcBackendOptions
 import torch.multiprocessing as mp
 import torch.optim as optim
 from torch.distributed.optim import DistributedOptimizer
@@ -128,7 +128,7 @@ def run_worker(rank, world_size):
     os.environ['MASTER_PORT'] = '29500'
 
 
-    rpc_backend_options = ProcessGroupRpcBackendOptions()
+    rpc_backend_options = TensorPipeRpcBackendOptions()
     rpc_backend_options.init_method='tcp://localhost:29501'
 
     # Rank 2 is master, 3 is ps and 0 and 1 are trainers.
```
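Since the ProcessGroup RPC backend has been superseded, the tutorial code now uses the TensorPipe backend. A minimal sketch of initializing RPC with `TensorPipeRpcBackendOptions` follows; the worker name, address, port, and thread count are illustrative, not taken from the tutorial.

```python
import torch.distributed.rpc as rpc
from torch.distributed.rpc import TensorPipeRpcBackendOptions

def init_rpc_worker(name, rank, world_size):
    # The options object carries transport settings for the TensorPipe backend,
    # e.g. the rendezvous init_method and the size of its worker thread pool.
    options = TensorPipeRpcBackendOptions(
        init_method="tcp://localhost:29501",  # illustrative address/port
        num_worker_threads=16,                # illustrative thread count
    )
    rpc.init_rpc(
        name,
        rank=rank,
        world_size=world_size,
        rpc_backend_options=options,
    )
    # ... issue RPCs / create remote references here ...
    rpc.shutdown()  # blocks until all participants are done
```

The same replacement appears in the pipeline-parallelism tutorial further down, where the old `num_send_recv_threads` argument becomes `num_worker_threads`.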

beginner_source/dist_overview.rst

Lines changed: 10 additions & 0 deletions
```diff
@@ -195,3 +195,13 @@ RPC Tutorials are listed below:
    `@rpc.functions.async_execution <https://pytorch.org/docs/master/rpc.html#torch.distributed.rpc.functions.async_execution>`__
    decorator, which can help speed up inference and training. It uses similar
    RL and PS examples employed in the above tutorials 1 and 2.
+5. The `Combining Distributed DataParallel with Distributed RPC Framework <../advanced/rpc_ddp_tutorial.html>`__
+   tutorial demonstrates how to combine DDP with RPC to train a model using
+   distributed data parallelism combined with distributed model parallelism.
+
+
+PyTorch Distributed Developers
+------------------------------
+
+If you'd like to contribute to PyTorch Distributed, please refer to our
+`Developer Guide <https://github.com/pytorch/pytorch/blob/master/torch/distributed/CONTRIBUTING.md>`_.
```

intermediate_source/dist_pipeline_parallel_tutorial.rst

Lines changed: 1 addition & 1 deletion
```diff
@@ -316,7 +316,7 @@ where the ``shutdown`` by default will block until all RPC participants finish.
 def run_worker(rank, world_size, num_split):
     os.environ['MASTER_ADDR'] = 'localhost'
     os.environ['MASTER_PORT'] = '29500'
-    options = rpc.ProcessGroupRpcBackendOptions(num_send_recv_threads=128)
+    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=128)
 
     if rank == 0:
         rpc.init_rpc(
```

intermediate_source/dynamic_quantization_bert_tutorial.rst

Lines changed: 14 additions & 6 deletions
```diff
@@ -492,7 +492,7 @@ follows:
 
 | Prec | F1 score | Model Size | 1 thread | 4 threads |
 | FP32 | 0.9019   | 438 MB     | 160 sec  | 85 sec    |
-| INT8 | 0.8953   | 181 MB     | 90 sec   | 46 sec    |
+| INT8 | 0.902    | 181 MB     | 90 sec   | 46 sec    |
 
 We have 0.6% F1 score accuracy after applying the post-training dynamic
 quantization on the fine-tuned BERT model on the MRPC task. As a
@@ -520,15 +520,23 @@ processing the evaluation of MRPC dataset.
 3.3 Serialize the quantized model
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-We can serialize and save the quantized model for the future use.
+We can serialize and save the quantized model for the future use using
+`torch.jit.save` after tracing the model.
 
 .. code:: python
 
-    quantized_output_dir = configs.output_dir + "quantized/"
-    if not os.path.exists(quantized_output_dir):
-        os.makedirs(quantized_output_dir)
-    quantized_model.save_pretrained(quantized_output_dir)
+    input_ids = ids_tensor([8, 128], 2)
+    token_type_ids = ids_tensor([8, 128], 2)
+    attention_mask = ids_tensor([8, 128], vocab_size=2)
+    dummy_input = (input_ids, attention_mask, token_type_ids)
+    traced_model = torch.jit.trace(quantized_model, dummy_input)
+    torch.jit.save(traced_model, "bert_traced_eager_quant.pt")
 
+To load the quantized model, we can use `torch.jit.load`
+
+.. code:: python
+
+    loaded_quantized_model = torch.jit.load("bert_traced_eager_quant.pt")
 
 Conclusion
 ----------
```
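The trace/save/load pattern added here works for any dynamically quantized module, not just BERT. A minimal, self-contained sketch with a toy model standing in for the fine-tuned model; the layer sizes, input shapes, and file name are illustrative.

```python
import torch
import torch.nn as nn

# A toy module standing in for the fine-tuned BERT model (illustrative only).
model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 2)).eval()

# Post-training dynamic quantization of the Linear layers, as in the tutorial.
quantized = torch.quantization.quantize_dynamic(
    model, {nn.Linear}, dtype=torch.qint8
)

# Trace with a dummy input, then serialize with torch.jit.save.
dummy_input = torch.randn(8, 16)
traced = torch.jit.trace(quantized, dummy_input)
torch.jit.save(traced, "toy_quantized_traced.pt")

# Load the TorchScript module back and run it.
loaded = torch.jit.load("toy_quantized_traced.pt")
print(loaded(dummy_input).shape)  # torch.Size([8, 2])
```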

intermediate_source/model_parallel_tutorial.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -245,7 +245,7 @@ def plot(means, stds, labels, fig_name):
 # -----------------------------
 #
 # In the following experiments, we further divide each 120-image batch into
-# 20-image splits. As PyTorch launches CUDA operations asynchronizely, the
+# 20-image splits. As PyTorch launches CUDA operations asynchronously, the
 # implementation does not need to spawn multiple threads to achieve
 # concurrency.
 
```
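To illustrate the sentence being fixed: because CUDA kernels are launched asynchronously, a single Python thread can keep both GPUs busy by queuing the next split on the first stage while the previous split is still being processed by the second stage. A minimal sketch assuming two CUDA devices are available; the layer sizes and split size are illustrative, not the tutorial's ResNet pipeline.

```python
import torch
import torch.nn as nn

class TwoStagePipeline(nn.Module):
    def __init__(self, split_size=20):
        super().__init__()
        self.split_size = split_size
        self.stage1 = nn.Linear(128, 256).to('cuda:0')
        self.stage2 = nn.Linear(256, 10).to('cuda:1')

    def forward(self, x):
        splits = iter(x.split(self.split_size, dim=0))
        s_next = next(splits)
        s_prev = self.stage1(s_next).to('cuda:1')
        outputs = []
        for s_next in splits:
            # stage2 runs on cuda:1 while the next stage1 kernel is queued on
            # cuda:0; asynchronous launches overlap the two without threads.
            outputs.append(self.stage2(s_prev))
            s_prev = self.stage1(s_next).to('cuda:1')
        outputs.append(self.stage2(s_prev))
        return torch.cat(outputs)

# model = TwoStagePipeline()
# out = model(torch.randn(120, 128, device='cuda:0'))  # a 120-sample batch
```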
