From ddf33b5e64c9dc84eb8ff0df29468208d5e1e4ba Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Fri, 1 Apr 2022 15:48:28 +0000
Subject: [PATCH 1/3] Add DDP support test for prototype datasets

---
 test/test_prototype_builtin_datasets.py | 28 +++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/test/test_prototype_builtin_datasets.py b/test/test_prototype_builtin_datasets.py
index f8dc3a0542b..763e87dc215 100644
--- a/test/test_prototype_builtin_datasets.py
+++ b/test/test_prototype_builtin_datasets.py
@@ -8,6 +8,7 @@
 from builtin_dataset_mocks import parametrize_dataset_mocks, DATASET_MOCKS
 from torch.testing._comparison import assert_equal, TensorLikePair, ObjectPair
 from torch.utils.data.graph import traverse
+from torch.utils.data import DataLoader
 from torch.utils.data.graph_settings import get_all_graph_pipes
 from torchdata.datapipes.iter import Shuffler, ShardingFilter
 from torchvision._utils import sequence_to_str
@@ -109,7 +110,7 @@ def test_transformable(self, test_home, dataset_mock, config):
 
         next(iter(dataset.map(transforms.Identity())))
 
-    @pytest.mark.xfail(reason="See https://github.com/pytorch/data/issues/237")
+    # @pytest.mark.xfail(reason="See https://github.com/pytorch/data/issues/237")
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_serializable(self, test_home, dataset_mock, config):
         dataset_mock.prepare(test_home, config)
@@ -118,10 +119,33 @@ def test_serializable(self, test_home, dataset_mock, config):
 
         pickle.dumps(dataset)
 
+    @parametrize_dataset_mocks(DATASET_MOCKS)
+    def test_ddp(self, test_home, dataset_mock, config,):
+        dataset_mock.prepare(test_home, config)
+
+        import os
+        if not torch.distributed.is_initialized():
+            os.environ["MASTER_ADDR"] = "localhost"
+            os.environ["MASTER_PORT"] = "29501"
+            torch.distributed.init_process_group(backend="gloo", world_size=1, rank=0)
+            torch.distributed.barrier()
+
+        dataset = datasets.load(dataset_mock.name, **config)
+
+        # Ugly hack: custom collate_fn because the default one doesn't handle None values
+        from torch.utils.data import default_collate
+        def collate_fn(batch):
+            return default_collate([x["image"] for x in batch])
+
+        dl = DataLoader(dataset, collate_fn=collate_fn)
+
+        next(iter(dl))
+        # TODO: Do we need  to manually shut down DPP now??
+
     # TODO: we need to enforce not only that both a Shuffler and a ShardingFilter are part of the datapipe, but also
     #  that the Shuffler comes before the ShardingFilter. Early commits in https://github.com/pytorch/vision/pull/5680
     #  contain a custom test for that, but we opted to wait for a potential solution / test from torchdata for now.
-    @pytest.mark.xfail(reason="See https://github.com/pytorch/data/issues/237")
+    # @pytest.mark.xfail(reason="See https://github.com/pytorch/data/issues/237")
     @parametrize_dataset_mocks(DATASET_MOCKS)
     @pytest.mark.parametrize("annotation_dp_type", (Shuffler, ShardingFilter))
     def test_has_annotations(self, test_home, dataset_mock, config, annotation_dp_type):

From e1919dc061f02e03d3d7ff02a49dfbefd87d38f8 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Mon, 4 Apr 2022 16:11:24 +0100
Subject: [PATCH 2/3] Add Fixture with proper shutdown

---
 test/test_prototype_builtin_datasets.py | 31 ++++++++++++++++---------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/test/test_prototype_builtin_datasets.py b/test/test_prototype_builtin_datasets.py
index 763e87dc215..63ada3528e4 100644
--- a/test/test_prototype_builtin_datasets.py
+++ b/test/test_prototype_builtin_datasets.py
@@ -1,5 +1,6 @@
 import functools
 import io
+import os
 import pickle
 from pathlib import Path
 
@@ -7,8 +8,8 @@
 import torch
 from builtin_dataset_mocks import parametrize_dataset_mocks, DATASET_MOCKS
 from torch.testing._comparison import assert_equal, TensorLikePair, ObjectPair
+from torch.utils.data import DataLoader, default_collate
 from torch.utils.data.graph import traverse
-from torch.utils.data import DataLoader
 from torch.utils.data.graph_settings import get_all_graph_pipes
 from torchdata.datapipes.iter import Shuffler, ShardingFilter
 from torchvision._utils import sequence_to_str
@@ -31,6 +32,23 @@ def test_home(mocker, tmp_path):
     yield tmp_path
 
 
+@pytest.fixture
+def ddp_fixture():
+    # Set up a single-process (world_size=1) gloo process group for the test and destroy it afterwards.
+    # Note: world_size=1 should be enough for our purpose. If we ever need to go full DDP, we'll need to
+    # implement much more complex logic, similar to MultiProcessTestCase from torch core.
+
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "29501"
+    torch.distributed.init_process_group(backend="gloo", world_size=1, rank=0)
+    torch.distributed.barrier()
+
+    yield
+
+    torch.distributed.barrier()
+    torch.distributed.destroy_process_group()
+
+
 def test_coverage():
     untested_datasets = set(datasets.list_datasets()) - DATASET_MOCKS.keys()
     if untested_datasets:
@@ -120,27 +138,18 @@ def test_serializable(self, test_home, dataset_mock, config):
         pickle.dumps(dataset)
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
-    def test_ddp(self, test_home, dataset_mock, config,):
+    def test_ddp(self, test_home, dataset_mock, config, ddp_fixture):
         dataset_mock.prepare(test_home, config)
 
-        import os
-        if not torch.distributed.is_initialized():
-            os.environ["MASTER_ADDR"] = "localhost"
-            os.environ["MASTER_PORT"] = "29501"
-            torch.distributed.init_process_group(backend="gloo", world_size=1, rank=0)
-            torch.distributed.barrier()
-
         dataset = datasets.load(dataset_mock.name, **config)
 
         # Ugly hack: custom collate_fn because the default one doesn't handle None values
-        from torch.utils.data import default_collate
         def collate_fn(batch):
             return default_collate([x["image"] for x in batch])
 
         dl = DataLoader(dataset, collate_fn=collate_fn)
 
         next(iter(dl))
-        # TODO: Do we need  to manually shut down DPP now??
 
     # TODO: we need to enforce not only that both a Shuffler and a ShardingFilter are part of the datapipe, but also
     #  that the Shuffler comes before the ShardingFilter. Early commits in https://github.com/pytorch/vision/pull/5680

From 9aac9ee82a0e4831ee52780ca511dd7a8db76eec Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Mon, 4 Apr 2022 16:15:00 +0100
Subject: [PATCH 3/3] Simpler collate_fn

---
 test/test_prototype_builtin_datasets.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/test/test_prototype_builtin_datasets.py b/test/test_prototype_builtin_datasets.py
index 63ada3528e4..06f86c0a016 100644
--- a/test/test_prototype_builtin_datasets.py
+++ b/test/test_prototype_builtin_datasets.py
@@ -8,7 +8,7 @@
 import torch
 from builtin_dataset_mocks import parametrize_dataset_mocks, DATASET_MOCKS
 from torch.testing._comparison import assert_equal, TensorLikePair, ObjectPair
-from torch.utils.data import DataLoader, default_collate
+from torch.utils.data import DataLoader
 from torch.utils.data.graph import traverse
 from torch.utils.data.graph_settings import get_all_graph_pipes
 from torchdata.datapipes.iter import Shuffler, ShardingFilter
@@ -143,11 +143,7 @@ def test_ddp(self, test_home, dataset_mock, config, ddp_fixture):
 
         dataset = datasets.load(dataset_mock.name, **config)
 
-        # Ugly hack: custom collate_fn because the default one doesn't handle None values
-        def collate_fn(batch):
-            return default_collate([x["image"] for x in batch])
-
-        dl = DataLoader(dataset, collate_fn=collate_fn)
+        dl = DataLoader(dataset, collate_fn=lambda batch: batch)
 
         next(iter(dl))