Skip to content

Commit

Permalink
Make testing improvements (#3242)
Browse files Browse the repository at this point in the history
* fix mypy errors

* improve robustness of tqdm tests

fixes error when running tests using pytest -n >1
this fixes the width of the test-time progress bars to avoid
some failures in tests.

* use xdist for distributed tests
Tests from a loadgroup are all run sequentially on a single worker.

* remove warnings for unregistered marks

seems to miss marks in the conftest.py file itself though.

* improve execution of visdom tests

limit setup/teardown of visdom servers using a session
scoped fixture. Add visdom tests to an xdist group to run
them serially to avoid issues with server connection.

add timeout to the tests explicitly requesting the server
to further limit any future issues.

* Improve visdom tests

Do not clean up visdom_server fixture. Session scoped fixtures are not
guaranteed to be executed just once when using xdist and trying to
cleanup twice can cause hangs.
Timeout all visdom tests to avoid future issues with a hanging/dead server

* remove superfluous marks

* add comments

* avoid multiple downloads of nltk punkt

* fixup

* add timeout for all distributed tests

* download mnist once

* do not use --dist=loadgroup on ci for now

---------

Co-authored-by: leej3 <johnleenimh@gmail.com>
Co-authored-by: vfdev <vfdev.5@gmail.com>
  • Loading branch information
3 people authored Apr 29, 2024
1 parent 565e8be commit 3f5febf
Show file tree
Hide file tree
Showing 11 changed files with 129 additions and 103 deletions.
2 changes: 1 addition & 1 deletion ignite/metrics/frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def reset(self) -> None:
self._acc = 0
self._n = 0
self._elapsed = 0.0
super(Frequency, self).reset()
super(Frequency, self).reset() # type: ignore

@reinit__is_reduced
def update(self, output: int) -> None:
Expand Down
2 changes: 1 addition & 1 deletion ignite/metrics/gan/fid.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def reset(self) -> None:
self._test_total = torch.zeros(self._num_features, dtype=torch.float64, device=self._device)
self._num_examples: int = 0

super(FID, self).reset()
super(FID, self).reset() # type: ignore

@reinit__is_reduced
def update(self, output: Sequence[torch.Tensor]) -> None:
Expand Down
2 changes: 1 addition & 1 deletion ignite/metrics/gan/inception_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def reset(self) -> None:
self._prob_total = torch.zeros(self._num_features, dtype=torch.float64, device=self._device)
self._total_kl_d = torch.zeros(self._num_features, dtype=torch.float64, device=self._device)

super(InceptionScore, self).reset()
super(InceptionScore, self).reset() # type: ignore

@reinit__is_reduced
def update(self, output: torch.Tensor) -> None:
Expand Down
17 changes: 17 additions & 0 deletions tests/ignite/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@
import ignite.distributed as idist


def pytest_configure(config):
    """Register this suite's custom marks so pytest does not warn about them."""
    custom_marks = (
        "distributed: run distributed",
        "multinode_distributed: distributed",
        "tpu: run on tpu",
    )
    for mark_line in custom_marks:
        config.addinivalue_line("markers", mark_line)


@pytest.fixture(
params=[
"cpu",
Expand Down Expand Up @@ -492,3 +498,14 @@ def xla_worker(index, fn):
assert ex_.code == 0, "Didn't successfully exit in XLA test"

pyfuncitem.obj = functools.partial(testfunc_wrapper, pyfuncitem.obj)


def pytest_collection_modifyitems(items):
    """Tag collected tests that use the distributed fixtures.

    Items using the ``distributed`` fixture are placed in a single xdist
    group so they run sequentially on one worker (this only takes effect
    when pytest is invoked with ``--dist=loadgroup``), and are given a
    45-second timeout. Items using ``multinode_distributed`` get the same
    timeout without the grouping.
    """
    for test_item in items:
        fixture_names = test_item.fixturenames
        if "distributed" in fixture_names:
            # Serialize on one worker to avoid race conditions between
            # distributed tests; requires the --dist=loadgroup option.
            test_item.add_marker(pytest.mark.xdist_group("distributed"))
            test_item.add_marker(pytest.mark.timeout(45))
        if "multinode_distributed" in fixture_names:
            test_item.add_marker(pytest.mark.timeout(45))
2 changes: 0 additions & 2 deletions tests/ignite/distributed/comp_models/test_native.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
else:
from ignite.distributed.comp_models.native import _expand_hostlist, _NativeDistModel, _setup_ddp_vars_from_slurm_env

pytestmark = pytest.mark.timeout(60)


# tests from https://github.com/LLNL/py-hostlist/blob/master/hostlist/unittest_hostlist.py
@pytest.mark.parametrize(
Expand Down
2 changes: 0 additions & 2 deletions tests/ignite/distributed/test_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
import ignite.distributed as idist
from ignite.distributed.utils import has_hvd_support, has_native_dist_support, has_xla_support

pytestmark = pytest.mark.timeout(60)


def test_parallel_wrong_inputs():
with pytest.raises(ValueError, match=r"Unknown backend 'abc'. Available backends:"):
Expand Down
59 changes: 21 additions & 38 deletions tests/ignite/handlers/conftest.py
Original file line number Diff line number Diff line change
@@ -1,58 +1,41 @@
import random
import subprocess
import time
from pathlib import Path
from unittest.mock import Mock

import pytest
import torch
from visdom import Visdom
from visdom.server.build import download_scripts

vd_hostname = None
vd_port = None
vd_server_process = None


@pytest.fixture()
@pytest.fixture(scope="session")
def visdom_server():
# Start Visdom server once and stop it with visdom_server_stop
global vd_hostname, vd_port, vd_server_process

if vd_server_process is None:
import subprocess
import time

from visdom import Visdom
from visdom.server.build import download_scripts

vd_hostname = "localhost"
if not (Path.home() / ".visdom").exists():
(Path.home() / ".visdom").mkdir(exist_ok=True)
download_scripts()
vis = None

vd_hostname = "localhost"
vd_port = random.randint(8089, 8887)

vd_port = 29777
vd_server_process = subprocess.Popen(
["python", "-m", "visdom.server", "--hostname", vd_hostname, "-port", str(vd_port)]
)
time.sleep(2)
for ii in range(5):
try:
time.sleep(1)
vis = Visdom(server=vd_hostname, port=vd_port, raise_exceptions=True)
break
except ConnectionError:
pass

vd_server_process = subprocess.Popen(
["python", "-m", "visdom.server", "--hostname", vd_hostname, "-port", str(vd_port)]
)
time.sleep(5)

vis = Visdom(server=vd_hostname, port=vd_port)
assert vis.check_connection()
vis.close()
continue

assert vis and vis.check_connection()
yield (vd_hostname, vd_port)


@pytest.fixture()
def visdom_server_stop():
yield None

import time

vd_server_process.kill()
time.sleep(2)
# Trying to clean up slows things down and sometimes causes hangs.
# vis.close()
# vd_server_process.kill()


@pytest.fixture
Expand Down
21 changes: 17 additions & 4 deletions tests/ignite/handlers/test_lr_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from pathlib import Path
from unittest.mock import MagicMock

import filelock

import matplotlib
import pytest
import torch
Expand Down Expand Up @@ -144,16 +146,27 @@ def dataloader_plot():


@pytest.fixture
def mnist_dataloader():
def mnist_dataloader(tmp_path_factory):
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision.transforms import Compose, Normalize, ToTensor

data_transform = Compose([ToTensor(), Normalize((0.1307,), (0.3081,))])

train_loader = DataLoader(
MNIST(download=True, root="/tmp", transform=data_transform, train=True), batch_size=256, shuffle=True
)
root_tmp_dir = tmp_path_factory.getbasetemp().parent
while True:
try:
with filelock.FileLock(root_tmp_dir / "mnist_download.lock", timeout=0.2) as fn:
fn.acquire()
train_loader = DataLoader(
MNIST(download=True, root="/tmp", transform=data_transform, train=True),
batch_size=256,
shuffle=True,
)
fn.release()
break
except filelock._error.Timeout:
pass

yield train_loader

Expand Down
Loading

0 comments on commit 3f5febf

Please sign in to comment.