Skip to content

Commit

Permalink
Fix cuda defined in train_params bug (#6370)
Browse files Browse the repository at this point in the history
Fixes # .
If the user defines CUDA_VISIBLE_DEVICES in train_params, BundleAlgo puts
it into the generated cmd, which causes an error.
This change pops it out of train_params before the cmd is created and emits a warning.
### Description

A few sentences describing the changes proposed in this pull request.

### Types of changes
<!--- Put an `x` in all the boxes that apply, and remove the items
that do not apply -->
- [x] Non-breaking change (fix or new feature that would not break
existing functionality).
- [ ] Breaking change (fix or new feature that would cause existing
functionality to change).
- [ ] New tests added to cover the changes.
- [ ] Integration tests passed locally by running `./runtests.sh -f -u
--net --coverage`.
- [ ] Quick tests passed locally by running `./runtests.sh --quick
--unittests --disttests`.
- [ ] In-line docstrings updated.
- [ ] Documentation updated, tested `make html` command in the `docs/`
folder.

---------

Signed-off-by: heyufan1995 <heyufan1995@gmail.com>
Signed-off-by: Wenqi Li <wenqil@nvidia.com>
Co-authored-by: Wenqi Li <831580+wyli@users.noreply.github.com>
Co-authored-by: Wenqi Li <wenqil@nvidia.com>
  • Loading branch information
3 people authored Apr 15, 2023
1 parent 888ad2f commit b356fec
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 30 deletions.
27 changes: 11 additions & 16 deletions monai/apps/auto3dseg/bundle_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,27 +216,18 @@ def _run_cmd(self, cmd: str, devices_info: str = "") -> subprocess.CompletedProc
ps_environ["CUDA_VISIBLE_DEVICES"] = str(self.device_setting["CUDA_VISIBLE_DEVICES"])
if int(self.device_setting["NUM_NODES"]) > 1:
if self.device_setting["MN_START_METHOD"] == "bcprun":
normal_out = subprocess.run(
[
"bcprun",
"-n",
str(self.device_setting["NUM_NODES"]),
"-p",
str(self.device_setting["n_devices"]),
"-c",
cmd,
],
env=ps_environ,
check=True,
)
cmd = f"bcprun -n {self.device_setting['NUM_NODES']} -p {self.device_setting['n_devices']} -c {cmd}"
else:
raise NotImplementedError(
f"{self.device_setting['MN_START_METHOD']} is not supported yet. "
"Try modify BundleAlgo._run_cmd for your cluster."
)
else:
normal_out = subprocess.run(cmd.split(), env=ps_environ, check=True)
return normal_out
cmd_list = cmd.split()
_idx = 0
for _idx, c in enumerate(cmd_list):
if "=" not in c: # remove variable assignments before the command such as "OMP_NUM_THREADS=1"
break
return subprocess.run(cmd_list[_idx:], env=ps_environ, check=True)

def train(
self, train_params: None | dict = None, device_setting: None | dict = None
Expand All @@ -254,6 +245,10 @@ def train(
self.device_setting.update(device_setting)
self.device_setting["n_devices"] = len(str(self.device_setting["CUDA_VISIBLE_DEVICES"]).split(","))

if train_params is not None and "CUDA_VISIBLE_DEVICES" in train_params:
warnings.warn("CUDA_VISIBLE_DEVICES is deprecated from train_params!")
train_params.pop("CUDA_VISIBLE_DEVICES")

cmd, _unused_return = self._create_cmd(train_params)
return self._run_cmd(cmd)

Expand Down
16 changes: 2 additions & 14 deletions monai/apps/auto3dseg/ensemble_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,23 +563,11 @@ def _create_cmd(self) -> None:
logger.info(f"Ensembling on {self.device_setting['NUM_NODES']} nodes!")
cmd = "python " if cmd is None else cmd
cmd = f"{cmd} -m {base_cmd}"
_ = subprocess.run(
[
"bcprun",
"-n",
str(self.device_setting["NUM_NODES"]),
"-p",
str(self.device_setting["n_devices"]),
"-c",
cmd,
],
env=ps_environ,
check=True,
)
cmd = f"bcprun -n {self.device_setting['NUM_NODES']} -p {self.device_setting['n_devices']} -c {cmd}"
else:
logger.info(f"Ensembling using {self.device_setting['n_devices']} GPU!")
if cmd is None:
cmd = f"torchrun --nnodes={1:d} --nproc_per_node={self.device_setting['n_devices']:d} "
cmd = f"{cmd} -m {base_cmd}"
_ = subprocess.run(cmd.split(), env=ps_environ, check=True)
_ = subprocess.run(cmd.split(), env=ps_environ, check=True)
return

0 comments on commit b356fec

Please sign in to comment.