From 27d0f9402de75b78631e3a08c8591e85205a6784 Mon Sep 17 00:00:00 2001
From: Heyang Qin
Date: Tue, 7 Feb 2023 20:24:25 +0000
Subject: [PATCH 1/5] Check device count before running dist tests

---
 tests/unit/common.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/unit/common.py b/tests/unit/common.py
index bb0e5ca9e656..6caffc9cc053 100644
--- a/tests/unit/common.py
+++ b/tests/unit/common.py
@@ -317,6 +317,9 @@ def __call__(self, request):
         else:
             world_size = self.world_size

+        if torch.cuda.device_count() < world_size:
+            pytest.skip(f"Skipping test because not enough GPUs are available: {world_size} required, {torch.cuda.device_count()} available")
+
         if isinstance(world_size, int):
             world_size = [world_size]
         for procs in world_size:

From d44c28c0453561a8aeca26c2a4940300b3f07d4e Mon Sep 17 00:00:00 2001
From: Heyang Qin
Date: Tue, 7 Feb 2023 20:36:47 +0000
Subject: [PATCH 2/5] fixing format for "Check device count before running dist tests"

---
 tests/unit/common.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/unit/common.py b/tests/unit/common.py
index 6caffc9cc053..e6591124908f 100644
--- a/tests/unit/common.py
+++ b/tests/unit/common.py
@@ -318,7 +318,9 @@ def __call__(self, request):
             world_size = self.world_size

         if torch.cuda.device_count() < world_size:
-            pytest.skip(f"Skipping test because not enough GPUs are available: {world_size} required, {torch.cuda.device_count()} available")
+            pytest.skip(
+                f"Skipping test because not enough GPUs are available: {world_size} required, {torch.cuda.device_count()} available"
+            )

         if isinstance(world_size, int):
             world_size = [world_size]

From a1fc431a698484c46e2904818eb395e0fbfd6591 Mon Sep 17 00:00:00 2001
From: Heyang Qin
Date: Tue, 7 Feb 2023 22:10:58 +0000
Subject: [PATCH 3/5] Check device count against max world size

---
 tests/unit/common.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/unit/common.py b/tests/unit/common.py
index e6591124908f..a088ab567a03 100644
--- a/tests/unit/common.py
+++ b/tests/unit/common.py
@@ -317,9 +317,13 @@ def __call__(self, request):
         else:
             world_size = self.world_size

-        if torch.cuda.device_count() < world_size:
+        if isinstance(world_size, int):
+            max_world_size = world_size
+        else:
+            max_world_size = max(world_size)
+        if torch.cuda.device_count() < max_world_size:
             pytest.skip(
-                f"Skipping test because not enough GPUs are available: {world_size} required, {torch.cuda.device_count()} available"
+                f"Skipping test because not enough GPUs are available: {max_world_size} required, {torch.cuda.device_count()} available"
             )

         if isinstance(world_size, int):

From 9411f5cb762bc0b83ccc46df9fca42b7e2587fa5 Mon Sep 17 00:00:00 2001
From: Heyang Qin
Date: Thu, 9 Feb 2023 18:11:31 +0000
Subject: [PATCH 4/5] Check GPU count before launching dist tests

---
 tests/unit/common.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/tests/unit/common.py b/tests/unit/common.py
index a088ab567a03..67bd309d6078 100644
--- a/tests/unit/common.py
+++ b/tests/unit/common.py
@@ -105,6 +105,10 @@ def _get_fixture_kwargs(self, request, func):
         return fixture_kwargs

     def _launch_procs(self, num_procs):
+        if torch.cuda.device_count() < num_procs:
+            pytest.skip(
+                f"Skipping test because not enough GPUs are available: {num_procs} required, {torch.cuda.device_count()} available"
+            )
         mp.set_start_method('forkserver', force=True)
         skip_msg = mp.Queue()  # Allows forked processes to share pytest.skip reason
         processes = []
@@ -317,15 +321,6 @@ def __call__(self, request):
         else:
             world_size = self.world_size

-        if isinstance(world_size, int):
-            max_world_size = world_size
-        else:
-            max_world_size = max(world_size)
-        if torch.cuda.device_count() < max_world_size:
-            pytest.skip(
-                f"Skipping test because not enough GPUs are available: {max_world_size} required, {torch.cuda.device_count()} available"
-            )
-
         if isinstance(world_size, int):
             world_size = [world_size]
         for procs in world_size:

From ce93efd969747c0a95fbf47d8f636420154674d5 Mon Sep 17 00:00:00 2001
From: Heyang Qin
Date: Thu, 23 Feb 2023 18:21:30 +0000
Subject: [PATCH 5/5] double-check GPU actually exists

---
 tests/unit/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/common.py b/tests/unit/common.py
index 2221f24fa39d..f2b2dc387d38 100644
--- a/tests/unit/common.py
+++ b/tests/unit/common.py
@@ -108,7 +108,7 @@ def _get_fixture_kwargs(self, request, func):
         return fixture_kwargs

     def _launch_procs(self, num_procs):
-        if torch.cuda.device_count() < num_procs:
+        if torch.cuda.is_available() and torch.cuda.device_count() < num_procs:
             pytest.skip(
                 f"Skipping test because not enough GPUs are available: {num_procs} required, {torch.cuda.device_count()} available"
             )
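
Net effect of the series: the guard moves out of __call__ and sits at the top of _launch_procs, skipping instead of hanging when too few GPUs are present. A minimal standalone sketch of the same pattern for reuse outside tests/unit/common.py follows; the helper name require_gpus is illustrative and not part of the patches, only pytest and torch are assumed.

    import pytest
    import torch


    def require_gpus(num_procs):
        # Skip, rather than fail, when CUDA is usable but exposes fewer devices
        # than the distributed test needs; CPU-only environments fall through.
        if torch.cuda.is_available() and torch.cuda.device_count() < num_procs:
            pytest.skip(f"Skipping test because not enough GPUs are available: "
                        f"{num_procs} required, {torch.cuda.device_count()} available")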