Skip to content

Commit

Permalink
Make testing improvements (#3242)
Browse files Browse the repository at this point in the history
* fix mypy errors

* improve robustness of tqdm tests

fixes error when running tests using pytest -n >1
this fixes the width of the test-time progress bars to avoid
some failures in tests.

* use xdist for distributed tests
Tests from a loadgroup are all run sequentially on a single worker.

* remove warnings for unregistered marks

seems to miss marks in the conftest.py file itself though.

* improve execution of visdom tests

limit setup/teardown of visdom servers using a session
scoped fixture. Add visdom tests to an xdist group to run
them serially to avoid issues with server connection.

add timeout to the tests explicitly requesting the server
to further limit any future issues.

* Improve visdom tests

Do not clean up visdom_server fixture. Session scoped fixtures are not
guaranteed to be executed just once when using xdist and trying to
cleanup twice can cause hangs.
Timeout all visdom tests to avoid future issues with a hanging/dead server

* remove superfluous marks

* add comments

* avoid multiple downloads of nltk punkt

* fixup

* add timeout for all distributed tests

* download mnist once

* do not use --dist=loadgroup on ci for now

---------

Co-authored-by: leej3 <johnleenimh@gmail.com>
Co-authored-by: vfdev <vfdev.5@gmail.com>
  • Loading branch information
3 people authored Apr 29, 2024
1 parent 565e8be commit 3f5febf
Show file tree
Hide file tree
Showing 11 changed files with 129 additions and 103 deletions.
2 changes: 1 addition & 1 deletion ignite/metrics/frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def reset(self) -> None:
self._acc = 0
self._n = 0
self._elapsed = 0.0
super(Frequency, self).reset()
super(Frequency, self).reset() # type: ignore

@reinit__is_reduced
def update(self, output: int) -> None:
Expand Down
2 changes: 1 addition & 1 deletion ignite/metrics/gan/fid.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def reset(self) -> None:
self._test_total = torch.zeros(self._num_features, dtype=torch.float64, device=self._device)
self._num_examples: int = 0

super(FID, self).reset()
super(FID, self).reset() # type: ignore

@reinit__is_reduced
def update(self, output: Sequence[torch.Tensor]) -> None:
Expand Down
2 changes: 1 addition & 1 deletion ignite/metrics/gan/inception_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def reset(self) -> None:
self._prob_total = torch.zeros(self._num_features, dtype=torch.float64, device=self._device)
self._total_kl_d = torch.zeros(self._num_features, dtype=torch.float64, device=self._device)

super(InceptionScore, self).reset()
super(InceptionScore, self).reset() # type: ignore

@reinit__is_reduced
def update(self, output: torch.Tensor) -> None:
Expand Down
17 changes: 17 additions & 0 deletions tests/ignite/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@
import ignite.distributed as idist


def pytest_configure(config):
    """Register this suite's custom marks so pytest does not warn about them."""
    custom_marks = (
        "distributed: run distributed",
        "multinode_distributed: distributed",
        "tpu: run on tpu",
    )
    for mark_line in custom_marks:
        config.addinivalue_line("markers", mark_line)


@pytest.fixture(
params=[
"cpu",
Expand Down Expand Up @@ -492,3 +498,14 @@ def xla_worker(index, fn):
assert ex_.code == 0, "Didn't successfully exit in XLA test"

pyfuncitem.obj = functools.partial(testfunc_wrapper, pyfuncitem.obj)


def pytest_collection_modifyitems(items):
    """Tag collected tests that use the distributed fixtures.

    Items using the ``distributed`` fixture are placed in a single xdist
    group so they run sequentially on one worker (this only takes effect
    when pytest is invoked with ``--dist=loadgroup``), and are given a
    45-second timeout. Items using ``multinode_distributed`` get the same
    timeout without the grouping.
    """
    for test_item in items:
        fixture_names = test_item.fixturenames
        if "distributed" in fixture_names:
            # Serialize on one worker to avoid race conditions between
            # distributed tests; requires the --dist=loadgroup option.
            test_item.add_marker(pytest.mark.xdist_group("distributed"))
            test_item.add_marker(pytest.mark.timeout(45))
        if "multinode_distributed" in fixture_names:
            test_item.add_marker(pytest.mark.timeout(45))
2 changes: 0 additions & 2 deletions tests/ignite/distributed/comp_models/test_native.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
else:
from ignite.distributed.comp_models.native import _expand_hostlist, _NativeDistModel, _setup_ddp_vars_from_slurm_env

pytestmark = pytest.mark.timeout(60)


# tests from https://github.com/LLNL/py-hostlist/blob/master/hostlist/unittest_hostlist.py
@pytest.mark.parametrize(
Expand Down
2 changes: 0 additions & 2 deletions tests/ignite/distributed/test_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
import ignite.distributed as idist
from ignite.distributed.utils import has_hvd_support, has_native_dist_support, has_xla_support

pytestmark = pytest.mark.timeout(60)


def test_parallel_wrong_inputs():
with pytest.raises(ValueError, match=r"Unknown backend 'abc'. Available backends:"):
Expand Down
59 changes: 21 additions & 38 deletions tests/ignite/handlers/conftest.py
Original file line number Diff line number Diff line change
@@ -1,58 +1,41 @@
import random
import subprocess
import time
from pathlib import Path
from unittest.mock import Mock

import pytest
import torch
from visdom import Visdom
from visdom.server.build import download_scripts

vd_hostname = None
vd_port = None
vd_server_process = None


@pytest.fixture()
@pytest.fixture(scope="session")
def visdom_server():
# Start Visdom server once and stop it with visdom_server_stop
global vd_hostname, vd_port, vd_server_process

if vd_server_process is None:
import subprocess
import time

from visdom import Visdom
from visdom.server.build import download_scripts

vd_hostname = "localhost"
if not (Path.home() / ".visdom").exists():
(Path.home() / ".visdom").mkdir(exist_ok=True)
download_scripts()
vis = None

vd_hostname = "localhost"
vd_port = random.randint(8089, 8887)

vd_port = 29777
vd_server_process = subprocess.Popen(
["python", "-m", "visdom.server", "--hostname", vd_hostname, "-port", str(vd_port)]
)
time.sleep(2)
for ii in range(5):
try:
time.sleep(1)
vis = Visdom(server=vd_hostname, port=vd_port, raise_exceptions=True)
break
except ConnectionError:
pass

vd_server_process = subprocess.Popen(
["python", "-m", "visdom.server", "--hostname", vd_hostname, "-port", str(vd_port)]
)
time.sleep(5)

vis = Visdom(server=vd_hostname, port=vd_port)
assert vis.check_connection()
vis.close()
continue

assert vis and vis.check_connection()
yield (vd_hostname, vd_port)


@pytest.fixture()
def visdom_server_stop():
yield None

import time

vd_server_process.kill()
time.sleep(2)
# Trying to clean up slows things down and sometimes causes hangs.
# vis.close()
# vd_server_process.kill()


@pytest.fixture
Expand Down
21 changes: 17 additions & 4 deletions tests/ignite/handlers/test_lr_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from pathlib import Path
from unittest.mock import MagicMock

import filelock

import matplotlib
import pytest
import torch
Expand Down Expand Up @@ -144,16 +146,27 @@ def dataloader_plot():


@pytest.fixture
def mnist_dataloader():
def mnist_dataloader(tmp_path_factory):
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision.transforms import Compose, Normalize, ToTensor

data_transform = Compose([ToTensor(), Normalize((0.1307,), (0.3081,))])

train_loader = DataLoader(
MNIST(download=True, root="/tmp", transform=data_transform, train=True), batch_size=256, shuffle=True
)
root_tmp_dir = tmp_path_factory.getbasetemp().parent
while True:
try:
with filelock.FileLock(root_tmp_dir / "mnist_download.lock", timeout=0.2) as fn:
fn.acquire()
train_loader = DataLoader(
MNIST(download=True, root="/tmp", transform=data_transform, train=True),
batch_size=256,
shuffle=True,
)
fn.release()
break
except filelock._error.Timeout:
pass

yield train_loader

Expand Down
Loading

0 comments on commit 3f5febf

Please sign in to comment.