Add recommendations regarding use of datapipes for multi-processing, shuffling, DDP, etc. (#1755)

NicolasHug authored Jun 2, 2022
1 parent 1c1e823 commit 2978507
Showing 29 changed files with 250 additions and 1 deletion.
55 changes: 55 additions & 0 deletions docs/source/datasets.rst
@@ -3,6 +3,61 @@ torchtext.datasets

.. currentmodule:: torchtext.datasets


.. _datapipes_warnings:

.. warning::

The datasets supported by torchtext are datapipes from the `torchdata
project <https://pytorch.org/data/beta/index.html>`_, which is still in Beta
status. This means that the API is subject to change without deprecation
cycles. In particular, we expect a lot of the current idioms to change with
the eventual release of ``DataLoaderV2`` from ``torchdata``.

Here are a few recommendations regarding the use of datapipes:

- To shuffle the datapipe, do so in the DataLoader: ``DataLoader(dp, shuffle=True)``.
You do not need to call ``dp.shuffle()``, because ``torchtext`` has
already done that for you. Note, however, that the datapipe won't be
shuffled unless you explicitly pass ``shuffle=True`` to the DataLoader.

- When using multi-processing (``num_workers=N``), use the built-in ``worker_init_fn``::

    from torch.utils.data import DataLoader
    from torch.utils.data.backward_compatibility import worker_init_fn

    DataLoader(dp, num_workers=4, worker_init_fn=worker_init_fn, drop_last=True)

This will ensure that data isn't duplicated across workers.

- We also recommend using ``drop_last=True``. Without it, the batch sizes
at the end of an epoch may be very small in some cases (smaller than with
other map-style datasets). This can significantly hurt accuracy, especially
when batch normalization is used. ``drop_last=True`` ensures that all batch
sizes are equal.

- Distributed training with ``DistributedDataParallel`` is not yet entirely
stable / supported, and we don't recommend it at this point. It will be
better supported in DataLoaderV2. If you still wish to use DDP, make sure
that:

- All workers (DDP workers *and* DataLoader workers) see a different part
of the data. The datasets are already wrapped inside `ShardingFilter
<https://pytorch.org/data/main/generated/torchdata.datapipes.iter.ShardingFilter.html>`_
and you may need to call ``dp.apply_sharding(num_shards, shard_id)`` in order to shard the
data across ranks (DDP workers) and DataLoader workers. One way to do this
is to create a ``worker_init_fn`` that calls ``apply_sharding`` with the appropriate
number of shards (DDP workers * DataLoader workers) and shard id (inferred from the rank
and the worker ID of the corresponding DataLoader within that rank); a sketch of such a
function is given after this list. Note, however, that this assumes an equal number of
DataLoader workers for all the ranks.
- All DDP workers work on the same number of batches. One way to do this
is to limit the size of the datapipe within each worker to
``len(datapipe) // num_ddp_workers``, but this might not suit all
use cases.
- The shuffling seed is the same across all workers. You might need to
call ``torch.utils.data.graph_settings.apply_shuffle_seed(dp, rng)``.
- The shuffling seed is different across epochs.
- The rest of the RNG (typically used for transformations) is
**different** across workers, for maximal entropy and optimal accuracy.
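
A minimal sketch of the ``worker_init_fn`` mentioned in the sharding bullet
above is given below. It assumes that the process group has already been
initialized, that every rank uses the same number of DataLoader workers, and
that ``apply_sharding`` is available on the datapipe as described above;
treat it as a starting point rather than a definitive recipe::

    import functools

    import torch.distributed as dist
    from torch.utils.data import DataLoader, get_worker_info

    def ddp_worker_init_fn(rank, world_size, worker_id):
        info = get_worker_info()
        datapipe = info.dataset          # this worker's copy of the datapipe
        workers_per_rank = info.num_workers

        # Total number of shards = DDP workers * DataLoader workers per rank;
        # the shard id combines the rank with this worker's id within the rank.
        num_shards = world_size * workers_per_rank
        shard_id = rank * workers_per_rank + worker_id
        datapipe.apply_sharding(num_shards, shard_id)

    # Bind the rank and world size in the main process, then hand the result
    # to the DataLoader:
    init_fn = functools.partial(ddp_worker_init_fn, dist.get_rank(), dist.get_world_size())
    DataLoader(dp, num_workers=4, worker_init_fn=init_fn, shuffle=True, drop_last=True)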

General use cases are as follows: ::


6 changes: 5 additions & 1 deletion examples/tutorials/sst2_classification_non_distributed.py
@@ -85,7 +85,11 @@
# and transforms. Below, we demonstrate how to use text and label processing transforms to pre-process the
# SST-2 dataset.
#
#
# .. note::
# Using datapipes is still currently subject to a few caveats. If you wish
# to extend this example to include shuffling, multi-processing, or
# distributed learning, please see :ref:`this note <datapipes_warnings>`
# for further instructions.
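#
# For instance, a minimal sketch that combines this dataset with the
# shuffling and multi-processing recommendations from that note might look
# as follows (the worker count is illustrative)::
#
#     from torch.utils.data import DataLoader
#     from torch.utils.data.backward_compatibility import worker_init_fn
#     from torchtext.datasets import SST2
#
#     train_dp = SST2(split="train")
#     train_loader = DataLoader(train_dp, shuffle=True, num_workers=2,
#                               worker_init_fn=worker_init_fn, drop_last=True)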

from torchtext.datasets import SST2

7 changes: 7 additions & 0 deletions torchtext/datasets/ag_news.py
@@ -43,6 +43,13 @@ def _modify_res(t):
def AG_NEWS(root: str, split: Union[Tuple[str], str]):
"""AG_NEWS Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://paperswithcode.com/dataset/ag-news
Number of lines per split:

7 changes: 7 additions & 0 deletions torchtext/datasets/amazonreviewfull.py
@@ -57,6 +57,13 @@ def _modify_res(t):
def AmazonReviewFull(root: str, split: Union[Tuple[str], str]):
"""AmazonReviewFull Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://arxiv.org/abs/1509.01626
Number of lines per split:

7 changes: 7 additions & 0 deletions torchtext/datasets/amazonreviewpolarity.py
@@ -53,6 +53,13 @@ def _modify_res(t):
def AmazonReviewPolarity(root: str, split: Union[Tuple[str], str]):
"""AmazonReviewPolarity Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://arxiv.org/abs/1509.01626
Number of lines per split:

7 changes: 7 additions & 0 deletions torchtext/datasets/cc100.py
@@ -152,6 +152,13 @@ def _modify_res(language_code, x):
def CC100(root: str, language_code: str = "en"):
"""CC100 Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://data.statmt.org/cc-100/
Args:

7 changes: 7 additions & 0 deletions torchtext/datasets/cola.py
@@ -52,6 +52,13 @@ def _filter_res(x):
def CoLA(root: str, split: Union[Tuple[str], str]):
"""CoLA dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://nyu-mll.github.io/CoLA/
Number of lines per split:

7 changes: 7 additions & 0 deletions torchtext/datasets/conll2000chunking.py
@@ -45,6 +45,13 @@ def _extracted_filepath_fn(root, split, _=None):
def CoNLL2000Chunking(root: str, split: Union[Tuple[str], str]):
"""CoNLL2000Chunking Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://www.clips.uantwerpen.be/conll2000/chunking/
Number of lines per split:

7 changes: 7 additions & 0 deletions torchtext/datasets/dbpedia.py
@@ -52,6 +52,13 @@ def _modify_res(t):
def DBpedia(root: str, split: Union[Tuple[str], str]):
"""DBpedia Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://www.dbpedia.org/resources/latest-core/
Number of lines per split:

7 changes: 7 additions & 0 deletions torchtext/datasets/enwik9.py
@@ -31,6 +31,13 @@ def _extracted_filepath_fn(root, _=None):
def EnWik9(root: str):
"""EnWik9 dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to http://mattmahoney.net/dc/textdata.html
Number of lines in dataset: 13147026

7 changes: 7 additions & 0 deletions torchtext/datasets/imdb.py
@@ -65,6 +65,13 @@ def filter_imdb_data(key, fname):
def IMDB(root: str, split: Union[Tuple[str], str]):
"""IMDB Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to http://ai.stanford.edu/~amaas/data/sentiment/
Number of lines per split:

7 changes: 7 additions & 0 deletions torchtext/datasets/iwslt2016.py
@@ -171,6 +171,13 @@ def IWSLT2016(
):
"""IWSLT2016 dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://wit3.fbk.eu/2016-01
The available datasets include the following:

7 changes: 7 additions & 0 deletions torchtext/datasets/iwslt2017.py
@@ -140,6 +140,13 @@ def _inner_iwslt_tar_filepath_fn(inner_iwslt_tar, _=None):
def IWSLT2017(root=".data", split=("train", "valid", "test"), language_pair=("de", "en")):
"""IWSLT2017 dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://wit3.fbk.eu/2017-01
The available datasets include the following:

7 changes: 7 additions & 0 deletions torchtext/datasets/mnli.py
@@ -65,6 +65,13 @@ def _modify_res(x):
def MNLI(root, split):
"""MNLI Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://cims.nyu.edu/~sbowman/multinli/
Number of lines per split:

7 changes: 7 additions & 0 deletions torchtext/datasets/mrpc.py
@@ -45,6 +45,13 @@ def _modify_res(x):
def MRPC(root: str, split: Union[Tuple[str], str]):
"""MRPC Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://www.microsoft.com/en-us/download/details.aspx?id=52398
Number of lines per split:

7 changes: 7 additions & 0 deletions torchtext/datasets/multi30k.py
@@ -56,6 +56,13 @@ def _filter_fn(split, language_pair, i, x):
def Multi30k(root: str, split: Union[Tuple[str], str], language_pair: Tuple[str] = ("de", "en")):
"""Multi30k dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://www.statmt.org/wmt16/multimodal-task.html#task1
Number of lines per split:

7 changes: 7 additions & 0 deletions torchtext/datasets/penntreebank.py
@@ -46,6 +46,13 @@ def _modify_res(t):
def PennTreebank(root, split: Union[Tuple[str], str]):
"""PennTreebank Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://catalog.ldc.upenn.edu/docs/LDC95T7/cl93.html
Number of lines per split:

8 changes: 8 additions & 0 deletions torchtext/datasets/qqp.py
@@ -30,6 +30,14 @@ def _modify_res(x):
@_create_dataset_directory(dataset_name=DATASET_NAME)
def QQP(root: str):
"""QQP dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs
Args:

7 changes: 7 additions & 0 deletions torchtext/datasets/sogounews.py
@@ -57,6 +57,13 @@ def _modify_res(t):
def SogouNews(root: str, split: Union[Tuple[str], str]):
"""SogouNews Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://arxiv.org/abs/1509.01626
Number of lines per split:

7 changes: 7 additions & 0 deletions torchtext/datasets/squad1.py
@@ -40,6 +40,13 @@ def _filepath_fn(root, split, _=None):
def SQuAD1(root: str, split: Union[Tuple[str], str]):
"""SQuAD1 Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://rajpurkar.github.io/SQuAD-explorer/
Number of lines per split:

7 changes: 7 additions & 0 deletions torchtext/datasets/squad2.py
@@ -40,6 +40,13 @@ def _filepath_fn(root, split, _=None):
def SQuAD2(root: str, split: Union[Tuple[str], str]):
"""SQuAD2 Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://rajpurkar.github.io/SQuAD-explorer/
Number of lines per split:

7 changes: 7 additions & 0 deletions torchtext/datasets/sst2.py
@@ -62,6 +62,13 @@ def _modify_res(t):
def SST2(root, split):
"""SST2 Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://nlp.stanford.edu/sentiment/
Number of lines per split:

7 changes: 7 additions & 0 deletions torchtext/datasets/stsb.py
@@ -58,6 +58,13 @@ def _modify_res(x):
def STSB(root, split):
"""STSB Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://ixa2.si.ehu.eus/stswiki/index.php/STSbenchmark
Number of lines per split:

7 changes: 7 additions & 0 deletions torchtext/datasets/udpos.py
@@ -45,6 +45,13 @@ def _filter_fn(split, x):
def UDPOS(root: str, split: Union[Tuple[str], str]):
"""UDPOS Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
Number of lines per split:
- train: 12543
- valid: 2002

7 changes: 7 additions & 0 deletions torchtext/datasets/wikitext103.py
@@ -48,6 +48,13 @@ def _filter_fn(split, x):
def WikiText103(root: str, split: Union[Tuple[str], str]):
"""WikiText103 Dataset
.. warning::
Using datapipes is still currently subject to a few caveats. If you wish
to use this dataset with shuffling, multi-processing, or distributed
learning, please see :ref:`this note <datapipes_warnings>` for further
instructions.
For additional details refer to https://blog.salesforceairesearch.com/the-wikitext-long-term-dependency-language-modeling-dataset/
Number of lines per split: