Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial version for multinode auto_runner and ensembler #6272

Merged
merged 25 commits into from
Apr 14, 2023
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
2af78e5
Initial version for multinode auto_runner and ensembler
heyufan1995 Apr 3, 2023
cf7dbd6
Fix multiple bugs and able to run end-to-end ngc multinode
heyufan1995 Apr 3, 2023
fd89cdd
Add cmd_prefix in env
heyufan1995 Apr 11, 2023
4f671be
Update minor logging issue
heyufan1995 Apr 11, 2023
a067d28
Fix merge issues
heyufan1995 Apr 11, 2023
3735054
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 11, 2023
e8fcd37
Merge branch 'dev' into multinode
wyli Apr 11, 2023
20a5d77
Change init function position in ensemblerunner
heyufan1995 Apr 13, 2023
44ff651
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 13, 2023
1f7284a
Merge branch 'dev' into multinode
mingxin-zheng Apr 13, 2023
e9fa2df
Move image save to AlgoEnsemble
heyufan1995 Apr 13, 2023
5dc40ac
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 13, 2023
c6c2d55
Add set_ensemble_method back to autorunner
heyufan1995 Apr 13, 2023
45e9fd0
Merge branch 'multinode' of github.com:heyufan1995/MONAI into multinode
heyufan1995 Apr 13, 2023
ad1e681
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 13, 2023
3ebed6a
Add test case and addressing several comments
heyufan1995 Apr 14, 2023
e32c8ac
Merge branch 'multinode' of github.com:heyufan1995/MONAI into multinode
heyufan1995 Apr 14, 2023
0834bb0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 14, 2023
f6acd3e
Merge remote-tracking branch 'upstream/dev' into multinode
wyli Apr 14, 2023
c053912
typing fixes and unit tests
wyli Apr 14, 2023
4ddc909
typing fixes and unit tests
wyli Apr 14, 2023
3389672
backward compatible _create_cmd
wyli Apr 14, 2023
6317377
autofix
wyli Apr 14, 2023
b4a5d1d
compatible algo.train
wyli Apr 14, 2023
00f0fde
backward compatibility
wyli Apr 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion monai/apps/auto3dseg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@
from .auto_runner import AutoRunner
from .bundle_gen import BundleAlgo, BundleGen
from .data_analyzer import DataAnalyzer
from .ensemble_builder import AlgoEnsemble, AlgoEnsembleBestByFold, AlgoEnsembleBestN, AlgoEnsembleBuilder
from .ensemble_builder import AlgoEnsemble, AlgoEnsembleBestByFold, AlgoEnsembleBestN, AlgoEnsembleBuilder, EnsembleRunner
from .hpo_gen import NNIGen, OptunaGen
from .utils import export_bundle_algo_history, import_bundle_algo_history
3 changes: 2 additions & 1 deletion monai/apps/auto3dseg/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from monai.apps.auto3dseg.auto_runner import AutoRunner
from monai.apps.auto3dseg.bundle_gen import BundleAlgo, BundleGen
from monai.apps.auto3dseg.data_analyzer import DataAnalyzer
from monai.apps.auto3dseg.ensemble_builder import AlgoEnsembleBuilder
from monai.apps.auto3dseg.ensemble_builder import AlgoEnsembleBuilder, EnsembleRunner
from monai.apps.auto3dseg.hpo_gen import NNIGen, OptunaGen

if __name__ == "__main__":
Expand All @@ -27,6 +27,7 @@
"BundleGen": BundleGen,
"BundleAlgo": BundleAlgo,
"AlgoEnsembleBuilder": AlgoEnsembleBuilder,
"EnsembleRunner": EnsembleRunner,
"AutoRunner": AutoRunner,
"NNIGen": NNIGen,
"OptunaGen": OptunaGen,
Expand Down
178 changes: 72 additions & 106 deletions monai/apps/auto3dseg/auto_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,19 @@
from time import sleep
from typing import Any, cast

import numpy as np
import torch

from monai.apps.auto3dseg.bundle_gen import BundleGen
from monai.apps.auto3dseg.data_analyzer import DataAnalyzer
from monai.apps.auto3dseg.ensemble_builder import (
AlgoEnsemble,
AlgoEnsembleBestByFold,
AlgoEnsembleBestN,
AlgoEnsembleBuilder,
)
from monai.apps.auto3dseg.ensemble_builder import EnsembleRunner
from monai.apps.auto3dseg.hpo_gen import NNIGen
from monai.apps.auto3dseg.utils import export_bundle_algo_history, import_bundle_algo_history
from monai.apps.utils import get_logger
from monai.auto3dseg.utils import algo_to_pickle
from monai.bundle import ConfigParser
from monai.transforms import SaveImage
from monai.utils.enums import AlgoKeys
from monai.utils.module import look_up_option, optional_import
from monai.utils.module import optional_import
from monai.utils.module import look_up_option
wyli marked this conversation as resolved.
Show resolved Hide resolved

logger = get_logger(module_name=__name__)

Expand Down Expand Up @@ -232,6 +226,7 @@ def __init__(
self.data_src_cfg_name = os.path.join(self.work_dir, "input.yaml")
self.algos = algos
self.templates_path_or_url = templates_path_or_url
self.kwargs = kwargs

if input is None and os.path.isfile(self.data_src_cfg_name):
input = self.data_src_cfg_name
Expand Down Expand Up @@ -285,16 +280,11 @@ def __init__(
self.ensemble = ensemble # last step, no need to check

self.set_training_params()
self.set_device_info()
self.set_prediction_params()
self.set_analyze_params()

self.save_image = self.set_image_save_transform(kwargs)

self.ensemble_method: AlgoEnsemble
self.ensemble_method_name: str | None = None

self.set_ensemble_method()
self.set_num_fold(num_fold=num_fold)
self.set_ensemble_method("AlgoEnsembleBestByFold")

self.gpu_customization = False
self.gpu_customization_specs: dict[str, Any] = {}
Expand Down Expand Up @@ -461,18 +451,11 @@ def set_num_fold(self, num_fold: int = 5) -> None:

Args:
num_fold: a positive integer to define the number of folds.

Notes:
If the ensemble method is ``AlgoEnsembleBestByFold``, this function automatically updates the ``n_fold``
parameter in the ``ensemble_method`` to avoid inconsistency between the training and the ensemble.
"""

if num_fold <= 0:
raise ValueError(f"num_fold is expected to be an integer greater than zero. Now it gets {num_fold}")

self.num_fold = num_fold
if self.ensemble_method_name == "AlgoEnsembleBestByFold":
self.ensemble_method.n_fold = self.num_fold # type: ignore

def set_training_params(self, params: dict[str, Any] | None = None) -> None:
"""
Expand All @@ -489,6 +472,64 @@ def set_training_params(self, params: dict[str, Any] | None = None) -> None:
"""
self.train_params = deepcopy(params) if params is not None else {}

def set_device_info(
    self,
    cuda_visible_devices: list[int] | str | None = None,
    num_nodes: int | None = None,
    mn_start_method: str | None = None,
    cmd_prefix: str | None = None,
) -> None:
    """
    Set the device-related info used by training, data analysis, and ensembling.

    Args:
        cuda_visible_devices: define GPU ids for data analyzer, training, and ensembling.
            List of GPU ids ``[0,1,2,3]`` or a string ``"0,1,2,3"``.
            Defaults to env ``CUDA_VISIBLE_DEVICES``, or else all devices visible to torch.
        num_nodes: number of nodes for training and ensembling.
            Defaults to env ``NUM_NODES`` or 1.
        mn_start_method: multi-node start method. AutoRunner will use the method to start
            multi-node processes. Defaults to env ``MN_START_METHOD`` or ``'bcprun'``.
        cmd_prefix: command line prefix for subprocess running in BundleAlgo and EnsembleRunner.
            Defaults to env ``CMD_PREFIX`` or None.
            Examples are:

                1) single GPU/CPU or multinode bcprun: ``"python "`` or ``"/opt/conda/bin/python3.8 "``,
                2) single node multi-GPU running: ``"torchrun --nnodes=1 --nproc_per_node=2 "``

            If a user defines this prefix, please make sure ``--nproc_per_node`` matches
            ``cuda_visible_devices`` or ``os.environ['CUDA_VISIBLE_DEVICES']``. Also always set
            ``--nnodes=1``. Set ``num_nodes`` for multi-node.
    """
    # Read environment fallbacks at call time, not at function-definition time:
    # default arguments are evaluated once at import, so env changes made after
    # importing this module would otherwise be silently ignored (and a malformed
    # NUM_NODES would crash the import itself).
    if cuda_visible_devices is None:
        cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
    if num_nodes is None:
        num_nodes = int(os.environ.get("NUM_NODES", 1))
    if mn_start_method is None:
        mn_start_method = os.environ.get("MN_START_METHOD", "bcprun")
    if cmd_prefix is None:
        cmd_prefix = os.environ.get("CMD_PREFIX")

    self.device_setting: dict[str, Any] = {}
    if cuda_visible_devices is None:
        # Nothing specified anywhere: expose every device torch can see.
        self.device_setting["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(torch.cuda.device_count()))
        self.device_setting["n_devices"] = torch.cuda.device_count()
    elif isinstance(cuda_visible_devices, str):
        self.device_setting["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
        self.device_setting["n_devices"] = len(cuda_visible_devices.split(","))
    elif isinstance(cuda_visible_devices, (list, tuple)):
        self.device_setting["CUDA_VISIBLE_DEVICES"] = ",".join(str(x) for x in cuda_visible_devices)
        self.device_setting["n_devices"] = len(cuda_visible_devices)
    else:
        # logger.warn is deprecated in the stdlib; warning() is the supported spelling.
        logger.warning("Wrong format of cuda_visible_devices, devices not set")

    self.device_setting["NUM_NODES"] = num_nodes
    self.device_setting["MN_START_METHOD"] = mn_start_method
    self.device_setting["CMD_PREFIX"] = cmd_prefix
    if cmd_prefix is not None:
        logger.info(f"Using user defined command running prefix {cmd_prefix}, will override other settings")

def set_ensemble_method(self, ensemble_method_name: str = "AlgoEnsembleBestByFold", **kwargs: Any) -> None:
    """
    Set the bundle ensemble method name and parameters for save image transform parameters.

    Args:
        ensemble_method_name: the name of the ensemble method. Only two methods are supported
            "AlgoEnsembleBestN" and "AlgoEnsembleBestByFold".
        kwargs: the keyword arguments used to define the ensemble method. Currently only ``n_best`` for
            ``AlgoEnsembleBestN`` is supported.
    """
    # look_up_option validates the name against the supported set before storing it.
    self.ensemble_method_name = look_up_option(
        ensemble_method_name, supported=["AlgoEnsembleBestN", "AlgoEnsembleBestByFold"]
    )
    # Merge the method parameters into the runner-wide kwargs that are later
    # forwarded to the EnsembleRunner.
    self.kwargs.update(kwargs)


def set_prediction_params(self, params: dict[str, Any] | None = None) -> None:
"""
Set the prediction params for all algos.
Expand Down Expand Up @@ -569,58 +610,6 @@ def set_nni_search_space(self, search_space):
self.search_space = search_space
self.hpo_tasks = value_combinations

def set_image_save_transform(self, kwargs):
    """
    Set the ensemble output transform.

    Args:
        kwargs: image writing parameters for the ensemble inference. The kwargs format follows SaveImage
            transform. For more information, check https://docs.monai.io/en/stable/transforms.html#saveimage .

    Returns:
        a configured ``SaveImage`` transform that writes predictions to ``self.output_dir``.
    """
    if "output_dir" in kwargs:
        output_dir = kwargs.pop("output_dir")
    else:
        output_dir = os.path.join(self.work_dir, "ensemble_output")
        logger.info(f"The output_dir is not specified. {output_dir} will be used to save ensemble predictions")

    if not os.path.isdir(output_dir):
        # exist_ok=True closes the race between the isdir() check and the creation,
        # e.g. when several processes start the runner concurrently.
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Directory {output_dir} is created to save ensemble predictions")

    self.output_dir = output_dir
    output_postfix = kwargs.pop("output_postfix", "ensemble")
    output_dtype = kwargs.pop("output_dtype", np.uint8)
    resample = kwargs.pop("resample", False)

    return SaveImage(
        output_dir=output_dir, output_postfix=output_postfix, output_dtype=output_dtype, resample=resample, **kwargs
    )

def set_ensemble_method(self, ensemble_method_name: str = "AlgoEnsembleBestByFold", **kwargs: Any) -> None:
    """
    Set the bundle ensemble method.

    Args:
        ensemble_method_name: the name of the ensemble method. Only two methods are supported
            "AlgoEnsembleBestN" and "AlgoEnsembleBestByFold".
        kwargs: the keyword arguments used to define the ensemble method. Currently only ``n_best`` for
            ``AlgoEnsembleBestN`` is supported.
    """
    # Validate against the supported set first; the validated name is stored on the instance.
    self.ensemble_method_name = look_up_option(
        ensemble_method_name, supported=["AlgoEnsembleBestN", "AlgoEnsembleBestByFold"]
    )
    if self.ensemble_method_name == "AlgoEnsembleBestN":
        # A falsy n_best (absent, False, or 0) falls back to keeping the best 2 models.
        n_best = kwargs.pop("n_best", False) or 2
        self.ensemble_method = AlgoEnsembleBestN(n_best=n_best)
    elif self.ensemble_method_name == "AlgoEnsembleBestByFold":
        self.ensemble_method = AlgoEnsembleBestByFold(n_fold=self.num_fold)
    else:
        raise NotImplementedError(f"Ensemble method {self.ensemble_method_name} is not implemented.")

def _train_algo_in_sequence(self, history: list[dict[str, Any]]) -> None:
"""
Train the Algos in a sequential scheme. The order of training is randomized.
Expand All @@ -637,7 +626,7 @@ def _train_algo_in_sequence(self, history: list[dict[str, Any]]) -> None:
"""
for algo_dict in history:
algo = algo_dict[AlgoKeys.ALGO]
algo.train(self.train_params)
algo.train(self.train_params, self.device_setting)
acc = algo.get_score()

algo_meta_data = {str(AlgoKeys.SCORE): acc}
Expand Down Expand Up @@ -792,34 +781,11 @@ def run(self):

# step 4: model ensemble and write the prediction to disks.
if self.ensemble:
history = import_bundle_algo_history(self.work_dir, only_trained=False)

history_untrained = [h for h in history if not h[AlgoKeys.IS_TRAINED]]
if len(history_untrained) > 0:
warnings.warn(
f"Ensembling step will skip {[h['name'] for h in history_untrained]} untrained algos."
"Generally it means these algos did not complete training."
)
history = [h for h in history if h[AlgoKeys.IS_TRAINED]]

if len(history) == 0:
raise ValueError(
f"Could not find any trained algos in {self.work_dir}. "
"Possibly the required training step was not completed."
)

builder = AlgoEnsembleBuilder(history, self.data_src_cfg_name)
builder.set_ensemble_method(self.ensemble_method)

ensembler = builder.get_ensemble()
preds = ensembler(pred_param=self.pred_params)
if len(preds) > 0:
logger.info("Auto3Dseg picked the following networks to ensemble:")
for algo in ensembler.get_algo_ensemble():
logger.info(algo[AlgoKeys.ID])

for pred in preds:
self.save_image(pred)
logger.info(f"Auto3Dseg ensemble prediction outputs are saved in {self.output_dir}.")

ensemble_runner = EnsembleRunner(data_src_cfg_name=self.data_src_cfg_name,
work_dir=self.work_dir, num_fold=self.num_fold,
ensemble_method_name=self.ensemble_method_name,
mgpu=self.device_setting['n_devices']>1,
**self.kwargs, # for set_image_save_transform
**self.pred_params) # for inference
ensemble_runner.run(self.device_setting)
logger.info("Auto3Dseg pipeline is completed successfully.")
Loading