Skip to content

Commit

Permalink
Fix internal CI failure
Browse files: browse the repository at this point in the history
  • Loading branch information
ejguan committed Sep 22, 2022
1 parent 8900311 commit 149ce37
Showing 1 changed file with 8 additions and 5 deletions.
13 changes: 8 additions & 5 deletions test/test_distributed.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,7 +6,6 @@


import os
import subprocess
import unittest

from functools import partial
Expand Down Expand Up @@ -162,9 +161,11 @@ def test_distributed_dl2(self, backend) -> None:
def test_elastic_training_dl2(self, backend) -> None:
world_size = DEFAULT_WORLD_SIZE if backend != "nccl" else torch.cuda.device_count()
nnodes = 1
subprocess.run(
from torch.distributed import run

run.main(
[
"torchrun",
"--run_path",
f"--nnodes={nnodes}",
f"--nproc_per_node={world_size}",
abs_path("bin/elastic_training.py"),
Expand All @@ -187,9 +188,11 @@ def test_distributed_dl1(self, backend) -> None:
def test_elastic_training_dl1(self, backend) -> None:
world_size = DEFAULT_WORLD_SIZE if backend != "nccl" else torch.cuda.device_count()
nnodes = 1
subprocess.run(
from torch.distributed import run

run.main(
[
"torchrun",
"--run_path",
f"--nnodes={nnodes}",
f"--nproc_per_node={world_size}",
abs_path("bin/elastic_training.py"),
Expand Down

0 comments on commit 149ce37

Please sign in to comment.