diff --git a/multimodal/src/autogluon/multimodal/constants.py b/multimodal/src/autogluon/multimodal/constants.py index d6067ddc899..59c64987d75 100644 --- a/multimodal/src/autogluon/multimodal/constants.py +++ b/multimodal/src/autogluon/multimodal/constants.py @@ -305,6 +305,9 @@ # strategies DDP = "ddp" +DDP_FIND_UNUSED_PARAMETERS_FALSE = "ddp_find_unused_parameters_false" +DDP_FIND_UNUSED_PARAMETERS_TRUE = "ddp_find_unused_parameters_true" +DDP_STRATEGIES = [DDP, DDP_FIND_UNUSED_PARAMETERS_FALSE, DDP_FIND_UNUSED_PARAMETERS_TRUE] # torch constants TORCH_COMPILE_MIN_VERSION = "2.2.0.dev20230908" diff --git a/multimodal/src/autogluon/multimodal/matcher.py b/multimodal/src/autogluon/multimodal/matcher.py index eb35156fd6b..f35e6d814d2 100644 --- a/multimodal/src/autogluon/multimodal/matcher.py +++ b/multimodal/src/autogluon/multimodal/matcher.py @@ -92,6 +92,7 @@ init_df_preprocessor, load_text_tokenizers, predict, + run_ddp_only_once, save_pretrained_model_configs, save_text_tokenizers, select_model, @@ -944,6 +945,8 @@ def _fit( # save artifacts for the current running, except for model checkpoint, which will be saved in trainer self.save(save_path) + num_gpus = run_ddp_only_once(num_gpus, strategy) + blacklist_msgs = ["already configured with model summary"] log_filter = LogFilter(blacklist_msgs) with apply_log_filter(log_filter): @@ -1278,6 +1281,8 @@ def _default_predict( match_label=match_label, ) + num_gpus = run_ddp_only_once(num_gpus, strategy) + blacklist_msgs = [] if self._verbosity <= 3: # turn off logging in prediction blacklist_msgs.append("Automatic Mixed Precision") diff --git a/multimodal/src/autogluon/multimodal/predictor.py b/multimodal/src/autogluon/multimodal/predictor.py index ecc83d9f64a..c5810b2c2d8 100644 --- a/multimodal/src/autogluon/multimodal/predictor.py +++ b/multimodal/src/autogluon/multimodal/predictor.py @@ -160,6 +160,7 @@ modify_duplicate_model_names, object_detection_data_to_df, predict, + run_ddp_only_once, save_ovd_result_df, 
def run_ddp_only_once(num_gpus, strategy):
    """Limit multi-GPU DDP to a single launch per Python process.

    The first call that asks for a DDP strategy with more than one GPU is
    allowed through unchanged and is recorded by creating the module-level
    global ``FIRST_TIME_DDP``. Every later call that asks for a DDP strategy
    is reduced to a single device.

    Parameters
    ----------
    num_gpus
        Number of GPUs the caller intends to use.
    strategy
        Name of the distributed strategy. Only values contained in
        ``DDP_STRATEGIES`` are affected.

    Returns
    -------
    The number of GPUs that should actually be used: ``num_gpus`` itself for
    non-DDP strategies and for the first DDP run, ``1`` for any later DDP run
    (or ``0`` when the caller explicitly asked for zero GPUs).
    """
    # The marker is signalled by its *existence* in the module namespace,
    # not by its value, so the name has to be declared global up front.
    global FIRST_TIME_DDP

    if strategy not in DDP_STRATEGIES:
        # Non-DDP strategies are never restricted.
        return num_gpus

    if "FIRST_TIME_DDP" in globals():
        # Not the first time running DDP: fall back to a single GPU.
        # NOTE(review): presumably DDP cannot be re-launched in the same
        # process -- confirm the underlying reason with the trainer code.
        # A request for zero GPUs is kept at zero so that a CPU-only call is
        # not silently upgraded to use a GPU.
        return 0 if num_gpus == 0 else 1

    if num_gpus > 1:
        # Now we run DDP for the first time; remember that it happened.
        FIRST_TIME_DDP = False
    return num_gpus