From ce3f03fe56d4784b5b503bdb1fbf3d717a16f3d2 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Wed, 11 Jan 2023 19:13:38 +0200 Subject: [PATCH 1/9] black and lint --- .../factories/pre_launch_callbacks_factory.py | 7 ++ .../common/registry/registry.py | 2 + .../recipes/cifar10_resnet.yaml | 3 + src/super_gradients/training/params.py | 3 + .../training/pre_launch_callbacks/__init__.py | 5 + .../pre_launch_callbacks.py | 95 +++++++++++++++++++ .../training/sg_trainer/sg_trainer.py | 45 ++++++--- 7 files changed, 148 insertions(+), 12 deletions(-) create mode 100644 src/super_gradients/common/factories/pre_launch_callbacks_factory.py create mode 100644 src/super_gradients/training/pre_launch_callbacks/__init__.py create mode 100644 src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py diff --git a/src/super_gradients/common/factories/pre_launch_callbacks_factory.py b/src/super_gradients/common/factories/pre_launch_callbacks_factory.py new file mode 100644 index 0000000000..b4cec61259 --- /dev/null +++ b/src/super_gradients/common/factories/pre_launch_callbacks_factory.py @@ -0,0 +1,7 @@ +from super_gradients.common.factories.base_factory import BaseFactory +from super_gradients.training import pre_launch_callbacks + + +class PreLaunchCallbacksFactory(BaseFactory): + def __init__(self): + super().__init__(pre_launch_callbacks.ALL_PRE_LAUNCH_CALLBACKS) diff --git a/src/super_gradients/common/registry/registry.py b/src/super_gradients/common/registry/registry.py index 80c8a0cc80..af76cb4788 100644 --- a/src/super_gradients/common/registry/registry.py +++ b/src/super_gradients/common/registry/registry.py @@ -9,6 +9,7 @@ from super_gradients.training.utils.callbacks.all_callbacks import CALLBACKS from super_gradients.training.transforms.all_transforms import TRANSFORMS from super_gradients.training.datasets.all_datasets import ALL_DATASETS +from super_gradients.training.pre_launch_callbacks import ALL_PRE_LAUNCH_CALLBACKS def create_register_decorator(registry: Dict[str, Callable]) -> Callable: @@ -51,3 +52,4 @@ def decorator(cls: Callable) -> Callable: register_callback = create_register_decorator(registry=CALLBACKS) register_transform = create_register_decorator(registry=TRANSFORMS) register_dataset = create_register_decorator(registry=ALL_DATASETS) +register_pre_launch_callback = create_register_decorator(registry=ALL_PRE_LAUNCH_CALLBACKS) diff --git a/src/super_gradients/recipes/cifar10_resnet.yaml b/src/super_gradients/recipes/cifar10_resnet.yaml index 9fb1e1f90f..25d1d0775f 100644 --- a/src/super_gradients/recipes/cifar10_resnet.yaml +++ b/src/super_gradients/recipes/cifar10_resnet.yaml @@ -26,6 +26,9 @@ training_hyperparams: ckpt_root_dir: +pre_launch_callbacks_list: + - AutoTrainBatchSizeSelectionCallback + architecture: resnet18_cifar experiment_name: resnet18_cifar diff --git a/src/super_gradients/training/params.py b/src/super_gradients/training/params.py index e8bd1fa30c..9329f45e35 100755 --- a/src/super_gradients/training/params.py +++ b/src/super_gradients/training/params.py @@ -68,6 +68,9 @@ "ckpt_name": "ckpt_latest.pth", "resume_strict_load": False, "sync_bn": False, + "max_forward_passes_train": None, # When not None- will break out of inner train loop + # (i.e iterating over train_loader) when reaching this number of batches. + "kil_ddp_pgroup_on_end": True, # Whether to kill the DDP process group in the end of training. 
} DEFAULT_OPTIMIZER_PARAMS_SGD = {"weight_decay": 1e-4, "momentum": 0.9} diff --git a/src/super_gradients/training/pre_launch_callbacks/__init__.py b/src/super_gradients/training/pre_launch_callbacks/__init__.py new file mode 100644 index 0000000000..76c1c8f021 --- /dev/null +++ b/src/super_gradients/training/pre_launch_callbacks/__init__.py @@ -0,0 +1,5 @@ +from super_gradients.training.pre_launch_callbacks.pre_launch_callbacks import PreLaunchCallback, AutoTrainBatchSizeSelectionCallback + +ALL_PRE_LAUNCH_CALLBACKS = {"AutoTrainBatchSizeSelectionCallback": AutoTrainBatchSizeSelectionCallback} + +__all__ = ["PreLaunchCallback", "AutoTrainBatchSizeSelectionCallback", "ALL_PRE_LAUNCH_CALLBACKS"] diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py new file mode 100644 index 0000000000..671d954cd5 --- /dev/null +++ b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -0,0 +1,95 @@ +from copy import deepcopy +from typing import Union + +from omegaconf import DictConfig +import torch + +from super_gradients import is_distributed +from super_gradients.common.abstractions.abstract_logger import get_logger +from super_gradients.training import models +from torch.distributed import barrier + +logger = get_logger(__name__) + + +class PreLaunchCallback: + """ + PreLaunchCallback + + Base class for callbacks to be triggered, manipulating the config (cfg) prior to launching training, + when calling Trainer.train_from_config(cfg). + + """ + + def __call__(self, cfg: Union[dict, DictConfig]) -> Union[dict, DictConfig]: + raise NotImplementedError + + +class AutoTrainBatchSizeSelectionCallback(PreLaunchCallback): + def __init__(self, batch_size_start: int = 4096, size_step: int = 1024, num_forward_passes: int = 3): + self.batch_size_start = batch_size_start + self.size_step = size_step + self.num_forward_passes = num_forward_passes + + def __call__(self, cfg: DictConfig) -> DictConfig: + from super_gradients.training.sg_trainer import Trainer + + curr_batch_size = self.batch_size_start + # BUILD NETWORK + model = models.get( + model_name=cfg.architecture, + num_classes=cfg.arch_params.num_classes, + arch_params=cfg.arch_params, + strict_load=cfg.checkpoint_params.strict_load, + pretrained_weights=cfg.checkpoint_params.pretrained_weights, + checkpoint_path=cfg.checkpoint_params.checkpoint_path, + load_backbone=cfg.checkpoint_params.load_backbone, + ) + tmp_cfg = deepcopy(cfg) + tmp_cfg.training_hyperparams.batch_accumulate = 1 + tmp_cfg.training_hyperparams.max_forward_passes_train = self.num_forward_passes + tmp_cfg.training_hyperparams.run_validation_freq = 2 + tmp_cfg.training_hyperparams.silent_mode = True + tmp_cfg.training_hyperparams.save_model = False + tmp_cfg.training_hyperparams.max_epochs = 1 + tmp_cfg.training_hyperparams.average_best_models = False + tmp_cfg.training_hyperparams.kil_ddp_pgroup_on_end = False + tmp_cfg.pre_launch_callbacks_list = [] + + while True: + tmp_cfg.dataset_params.train_dataloader_params.batch_size = curr_batch_size + + try: + Trainer.train_from_config(tmp_cfg) + + except RuntimeError as e: + if "out of memory" in str(e): + if curr_batch_size == self.batch_size_start: + logger.error("Ran out of memory for the smallest batch, try setting smaller batch_size_start.") + raise e + else: + logger.info(f"Ran out of memory for {curr_batch_size}, setting batch size to {curr_batch_size - self.size_step}.") + 
cfg.dataset_params.train_dataloader_params.batch_size = curr_batch_size - self.size_step + for p in model.parameters(): + if p.grad is not None: + del p.grad # free some memory + torch.cuda.empty_cache() + + # WAIT FOR ALL PROCESSES TO CLEAR THEIR MEMORY BEFORE MOVING ON + if is_distributed(): + barrier() + return cfg + else: + raise e + + else: + logger.info(f"Did not run out of memory for {curr_batch_size}, retrying batch {curr_batch_size + self.size_step}.") + curr_batch_size += self.size_step + for p in model.parameters(): + if p.grad is not None: + del p.grad # free some memory + torch.cuda.empty_cache() + + # WAIT FOR ALL PROCESSES TO CLEAR THEIR MEMORY BEFORE MOVING ON + if is_distributed(): + barrier() diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py index 232ba3b710..c841fc84a1 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -86,6 +86,7 @@ from super_gradients.training.utils import HpmStruct from super_gradients.training.utils.hydra_utils import load_experiment_cfg, add_params_to_cfg from omegaconf import OmegaConf +from super_gradients.common.factories.pre_launch_callbacks_factory import PreLaunchCallbacksFactory logger = get_logger(__name__) @@ -193,6 +194,7 @@ def __init__(self, experiment_name: str, device: str = None, multi_gpu: Union[Mu self.train_monitored_values = {} self.valid_monitored_values = {} + self.max_forward_passes_train = None @property def device(self) -> str: @@ -216,8 +218,22 @@ def train_from_config(cls, cfg: Union[DictConfig, dict]) -> Tuple[nn.Module, Tup # INSTANTIATE ALL OBJECTS IN CFG cfg = hydra.utils.instantiate(cfg) + # TRIGGER CFG MODIFYING CALLBACKS + cfg = cls._trigger_cfg_modifying_callbacks(cfg) + trainer = Trainer(experiment_name=cfg.experiment_name, ckpt_root_dir=cfg.ckpt_root_dir) + # BUILD NETWORK + model = models.get( + model_name=cfg.architecture, + num_classes=cfg.arch_params.num_classes, + arch_params=cfg.arch_params, + strict_load=cfg.checkpoint_params.strict_load, + pretrained_weights=cfg.checkpoint_params.pretrained_weights, + checkpoint_path=cfg.checkpoint_params.checkpoint_path, + load_backbone=cfg.checkpoint_params.load_backbone, + ) + # INSTANTIATE DATA LOADERS train_dataloader = dataloaders.get( @@ -232,16 +248,6 @@ def train_from_config(cls, cfg: Union[DictConfig, dict]) -> Tuple[nn.Module, Tup dataloader_params=cfg.dataset_params.val_dataloader_params, ) - # BUILD NETWORK - model = models.get( - model_name=cfg.architecture, - num_classes=cfg.arch_params.num_classes, - arch_params=cfg.arch_params, - strict_load=cfg.checkpoint_params.strict_load, - pretrained_weights=cfg.checkpoint_params.pretrained_weights, - checkpoint_path=cfg.checkpoint_params.checkpoint_path, - load_backbone=cfg.checkpoint_params.load_backbone, - ) recipe_logged_cfg = {"recipe_config": OmegaConf.to_container(cfg, resolve=True)} # TRAIN res = trainer.train( @@ -254,6 +260,14 @@ def train_from_config(cls, cfg: Union[DictConfig, dict]) -> Tuple[nn.Module, Tup return model, res + @classmethod + def _trigger_cfg_modifying_callbacks(cls, cfg): + pre_launch_cbs = get_param(cfg, "pre_launch_callbacks_list", list()) + pre_launch_cbs = ListFactory(PreLaunchCallbacksFactory()).get(pre_launch_cbs) + for plcb in pre_launch_cbs: + cfg = plcb(cfg) + return cfg + @classmethod def resume_experiment(cls, experiment_name: str, ckpt_root_dir: str = None) -> Tuple[nn.Module, Tuple]: """ @@ -445,7 +459,7 @@ def 
_train_epoch(self, epoch: int, silent_mode: bool = False) -> tuple: # TODO: ITERATE BY MAX ITERS # FOR INFINITE SAMPLERS WE MUST BREAK WHEN REACHING LEN ITERATIONS. - if self._infinite_train_loader and batch_idx == len(self.train_loader) - 1: + if self._infinite_train_loader and batch_idx == len(self.train_loader) - 1 or self.max_forward_passes_train == batch_idx: break if not self.ddp_silent_mode: @@ -965,6 +979,12 @@ def forward(self, inputs, targets): percentile: float, percentile value to use when Trainer,quant_modules_calib_method='percentile'. Discarded when other methods are used (Default=99.99). + - `max_forward_passes_train`: int, when not None- will break out of inner train loop (i.e iterating over + train_loader) when reaching this number of batches. Usefull for debugging (default=None). + + - `kil_ddp_pgroup_on_end`: bool, whether to kill the DDP process group in the end of training. + Useful when launching consecutive DDP trainings with the same Trainer object (default=True). + :return: """ @@ -1142,6 +1162,7 @@ def forward(self, inputs, targets): ) self.ckpt_best_name = self.training_params.ckpt_best_name + self.max_forward_passes_train = self.training_params.max_forward_passes_train # STATE ATTRIBUTE SET HERE FOR SUBSEQUENT TRAIN() CALLS self._first_backward = True @@ -1265,7 +1286,7 @@ def forward(self, inputs, targets): finally: if device_config.multi_gpu == MultiGPUMode.DISTRIBUTED_DATA_PARALLEL: # CLEAN UP THE MULTI-GPU PROCESS GROUP WHEN DONE - if torch.distributed.is_initialized(): + if torch.distributed.is_initialized() and self.training_params.kil_ddp_pgroup_on_end: torch.distributed.destroy_process_group() # PHASE.TRAIN_END From f46ff3d2e3c67baa15a505b2e39c62bec97d7b9d Mon Sep 17 00:00:00 2001 From: shayaharon Date: Thu, 12 Jan 2023 10:43:02 +0200 Subject: [PATCH 2/9] circular import comment added --- .../training/pre_launch_callbacks/pre_launch_callbacks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py index 671d954cd5..9b49cb71ae 100644 --- a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py +++ b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -32,6 +32,8 @@ def __init__(self, batch_size_start: int = 4096, size_step: int = 1024, num_forw self.num_forward_passes = num_forward_passes def __call__(self, cfg: DictConfig) -> DictConfig: + + # IMPORT IS HERE DUE TO CIRCULAR IMPORT PROBLEM from super_gradients.training.sg_trainer import Trainer curr_batch_size = self.batch_size_start From f2d99b6f95f2926cbc942586afdadb1c3a2cde97 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Thu, 12 Jan 2023 12:23:08 +0200 Subject: [PATCH 3/9] max batch size arg added --- .../pre_launch_callbacks.py | 39 +++++++++---------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py index 9b49cb71ae..83b1840f62 100644 --- a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py +++ b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -26,9 +26,10 @@ def __call__(self, cfg: Union[dict, DictConfig]) -> Union[dict, DictConfig]: class AutoTrainBatchSizeSelectionCallback(PreLaunchCallback): - def __init__(self, batch_size_start: int = 4096, size_step: int = 1024, num_forward_passes: int = 
3): - self.batch_size_start = batch_size_start + def __init__(self, min_batch_size: int, size_step: int, num_forward_passes: int = 3, max_batch_size=None): + self.min_batch_size = min_batch_size self.size_step = size_step + self.max_batch_size = max_batch_size self.num_forward_passes = num_forward_passes def __call__(self, cfg: DictConfig) -> DictConfig: @@ -36,7 +37,7 @@ def __call__(self, cfg: DictConfig) -> DictConfig: # IMPORT IS HERE DUE TO CIRCULAR IMPORT PROBLEM from super_gradients.training.sg_trainer import Trainer - curr_batch_size = self.batch_size_start + curr_batch_size = self.min_batch_size # BUILD NETWORK model = models.get( model_name=cfg.architecture, @@ -66,20 +67,13 @@ def __call__(self, cfg: DictConfig) -> DictConfig: except RuntimeError as e: if "out of memory" in str(e): - if curr_batch_size == self.batch_size_start: - logger.error("Ran out of memory for the smallest batch, try setting smaller batch_size_start.") + if curr_batch_size == self.min_batch_size: + logger.error("Ran out of memory for the smallest batch, try setting smaller min_batch_size.") raise e else: logger.info(f"Ran out of memory for {curr_batch_size}, setting batch size to {curr_batch_size - self.size_step}.") cfg.dataset_params.train_dataloader_params.batch_size = curr_batch_size - self.size_step - for p in model.parameters(): - if p.grad is not None: - del p.grad # free some memory - torch.cuda.empty_cache() - - # WAIT FOR ALL PROCESSES TO CLEAR THEIR MEMORY BEFORE MOVING ON - if is_distributed(): - barrier() + self._clear_model_gpu_mem(model) return cfg else: raise e @@ -87,11 +81,14 @@ def __call__(self, cfg: DictConfig) -> DictConfig: else: logger.info(f"Did not run out of memory for {curr_batch_size}, retrying batch {curr_batch_size + self.size_step}.") curr_batch_size += self.size_step - for p in model.parameters(): - if p.grad is not None: - del p.grad # free some memory - torch.cuda.empty_cache() - - # WAIT FOR ALL PROCESSES TO CLEAR THEIR MEMORY BEFORE MOVING ON - if is_distributed(): - barrier() + self._clear_model_gpu_mem(model) + + @classmethod + def _clear_model_gpu_mem(cls, model): + for p in model.parameters(): + if p.grad is not None: + del p.grad # free some memory + torch.cuda.empty_cache() + # WAIT FOR ALL PROCESSES TO CLEAR THEIR MEMORY BEFORE MOVING ON + if is_distributed(): + barrier() From c0cacdce768718b59515ca65c6b7f59905068fa8 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 16 Jan 2023 10:35:45 +0200 Subject: [PATCH 4/9] lint --- .../recipes/cifar10_resnet.yaml | 3 -- .../pre_launch_callbacks.py | 40 +++++++++++++++++++ ...tomatic_batch_selection_single_gpu_test.py | 31 ++++++++++++++ 3 files changed, 71 insertions(+), 3 deletions(-) create mode 100644 tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py diff --git a/src/super_gradients/recipes/cifar10_resnet.yaml b/src/super_gradients/recipes/cifar10_resnet.yaml index 25d1d0775f..9fb1e1f90f 100644 --- a/src/super_gradients/recipes/cifar10_resnet.yaml +++ b/src/super_gradients/recipes/cifar10_resnet.yaml @@ -26,9 +26,6 @@ training_hyperparams: ckpt_root_dir: -pre_launch_callbacks_list: - - AutoTrainBatchSizeSelectionCallback - architecture: resnet18_cifar experiment_name: resnet18_cifar diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py index 83b1840f62..66a35b9d0f 100644 --- a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py +++ 
b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -26,6 +26,46 @@ def __call__(self, cfg: Union[dict, DictConfig]) -> Union[dict, DictConfig]: class AutoTrainBatchSizeSelectionCallback(PreLaunchCallback): + """ + AutoTrainBatchSizeSelectionCallback + + Modifies cfg.dataset_params.train_dataloader_params.batch_size by searching for the maximal batch size that fits + gpu memory. Works out of the box for DDP. + + The search is done by running a few forward passes for increasing batch sizes, until CUDA OUT OF MEMORY is raised: + + For batch_size in range(min_batch_size:max_batch_size:size_step): + if batch_size raises CUDA OUT OF MEMORY ERROR: + return batch_size-size_step + return batch_size + + Example usage: Inside the main recipe .YAML file (for example super_gradients/recipes/cifar10_resnet.yaml), + add the following: + + pre_launch_callbacks_list: + - AutoTrainBatchSizeSelectionCallback: + min_batch_size: 128 + size_step: 64 + num_forward_passes: 10 + + Then, when running super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=... + this pre_launch_callback will modify cfg.dataset_params.train_dataloader_params.batch_size then pass cfg to + Trainer.train_from_config(cfg) and training will continue with the selected batch size. + + + :param min_batch_size: int, the first batch size to try running forward passes. Should fit memory. + + :param size_step: int, the difference between 2 consecutive batch_ssize trials. + + :param num_forward_passes: int, number of forward passes (i.e train_loader data iterations inside an epoch). + Note that the more forward passes being done, the less the selected batch size is prawn to fail. This is because + other then gradients, model computations, data and other fixed gpu memory that is being used- some more gpu memory + might be used by the metric objects and PhaseCallbacks. + + :param max_batch_size: int, optional, upper limit of the batch sizes to try. When None, the search will continue until + the maximal batch size that does not raise CUDA OUT OF MEMORY is found (deafult=None). 
+ """ + def __init__(self, min_batch_size: int, size_step: int, num_forward_passes: int = 3, max_batch_size=None): self.min_batch_size = min_batch_size self.size_step = size_step diff --git a/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py b/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py new file mode 100644 index 0000000000..c37fa1bd3e --- /dev/null +++ b/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py @@ -0,0 +1,31 @@ +import unittest + +import pkg_resources +from hydra import initialize_config_dir +from hydra.core.global_hydra import GlobalHydra +from hydra import compose +from omegaconf import OmegaConf, open_dict + +from super_gradients import Trainer +from super_gradients.training.utils.hydra_utils import normalize_path + + +class MyTestCase(unittest.TestCase): + def test_something(self): + GlobalHydra.instance().clear() + sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") + + with initialize_config_dir(config_dir=normalize_path(sg_recipes_dir), version_base="1.2"): + cfg = compose(config_name="cifar10_resnet") + cfg.experiment_name = "batch_size_selection_test_no_max" + cfg.training_hyperparams.max_epochs = 1 + OmegaConf.set_struct(cfg, True) + with open_dict(cfg): + cfg.pre_launch_callbacks_list = [ + OmegaConf.create({"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 64, "size_step": 10000, "num_forward_passes": 3}}) + ] + Trainer.train_from_config(cfg) + + +if __name__ == "__main__": + unittest.main() From ef3d5f5c4eeaf045f54cbd4691db760ba9160734 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 16 Jan 2023 13:48:10 +0200 Subject: [PATCH 5/9] tests lint --- .../pre_launch_callbacks.py | 7 ++++++ tests/deci_core_recipe_test_suite_runner.py | 2 ++ ...tomatic_batch_selection_single_gpu_test.py | 22 ++++++++++++++++--- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py index 66a35b9d0f..dfdb87984e 100644 --- a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py +++ b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -119,6 +119,13 @@ def __call__(self, cfg: DictConfig) -> DictConfig: raise e else: + if self.max_batch_size is not None and curr_batch_size >= self.max_batch_size: + logger.info( + f"Did not run out of memory for {curr_batch_size} >= max_batch_size={self.max_batch_size}, " f"setting batch to {self.max_batch_size}." 
+ ) + cfg.dataset_params.train_dataloader_params.batch_size = self.max_batch_size + self._clear_model_gpu_mem(model) + return cfg logger.info(f"Did not run out of memory for {curr_batch_size}, retrying batch {curr_batch_size + self.size_step}.") curr_batch_size += self.size_step self._clear_model_gpu_mem(model) diff --git a/tests/deci_core_recipe_test_suite_runner.py b/tests/deci_core_recipe_test_suite_runner.py index 5d682b4625..02696c1498 100644 --- a/tests/deci_core_recipe_test_suite_runner.py +++ b/tests/deci_core_recipe_test_suite_runner.py @@ -1,6 +1,7 @@ import sys import unittest +from tests.recipe_training_tests.automatic_batch_selection_single_gpu_test import TestAutoBatchSelectionSingleGPU from tests.recipe_training_tests.shortened_recipes_accuracy_test import ShortenedRecipesAccuracyTests @@ -17,6 +18,7 @@ def _add_modules_to_unit_tests_suite(self): :return: """ self.recipe_tests_suite.addTest(self.test_loader.loadTestsFromModule(ShortenedRecipesAccuracyTests)) + self.recipe_tests_suite.addTest(self.test_loader.loadTestsFromModule(TestAutoBatchSelectionSingleGPU)) if __name__ == "__main__": diff --git a/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py b/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py index c37fa1bd3e..f135b88c61 100644 --- a/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py +++ b/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py @@ -5,13 +5,12 @@ from hydra.core.global_hydra import GlobalHydra from hydra import compose from omegaconf import OmegaConf, open_dict - from super_gradients import Trainer from super_gradients.training.utils.hydra_utils import normalize_path -class MyTestCase(unittest.TestCase): - def test_something(self): +class TestAutoBatchSelectionSingleGPU(unittest.TestCase): + def test_auto_batch_size_no_max(self): GlobalHydra.instance().clear() sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") @@ -26,6 +25,23 @@ def test_something(self): ] Trainer.train_from_config(cfg) + def test_auto_batch_size_with_upper_limit(self): + GlobalHydra.instance().clear() + sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") + + with initialize_config_dir(config_dir=normalize_path(sg_recipes_dir), version_base="1.2"): + cfg = compose(config_name="cifar10_resnet") + cfg.experiment_name = "batch_size_selection_test_with_upper_limit" + cfg.training_hyperparams.max_epochs = 1 + OmegaConf.set_struct(cfg, True) + with open_dict(cfg): + cfg.pre_launch_callbacks_list = [ + OmegaConf.create( + {"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 32, "size_step": 32, "max_batch_size": 64, "num_forward_passes": 3}} + ) + ] + Trainer.train_from_config(cfg) + if __name__ == "__main__": unittest.main() From a3db1b6f74da4ce74afcf9708e49a15c47287e16 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 16 Jan 2023 14:42:59 +0200 Subject: [PATCH 6/9] redundant arg in helper cb removed lint --- ...tomatic_batch_selection_single_gpu_test.py | 29 +++++++++++++++---- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py b/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py index f135b88c61..80c81da631 100644 --- a/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py +++ b/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py @@ -1,19 +1,33 @@ import unittest +from typing import Union import 
pkg_resources from hydra import initialize_config_dir from hydra.core.global_hydra import GlobalHydra from hydra import compose -from omegaconf import OmegaConf, open_dict -from super_gradients import Trainer +from omegaconf import OmegaConf, open_dict, DictConfig +from super_gradients import Trainer, init_trainer +from super_gradients.common.registry.registry import register_pre_launch_callback +from super_gradients.training.pre_launch_callbacks import PreLaunchCallback from super_gradients.training.utils.hydra_utils import normalize_path +@register_pre_launch_callback() +class PreLaunchTrainBatchSizeVerificationCallback(PreLaunchCallback): + def __init__(self, batch_size, experiment_name): + self.batch_size = batch_size + + def __call__(self, cfg: Union[dict, DictConfig]) -> Union[dict, DictConfig]: + if cfg.dataset_params.train_dataloader_params.batch_size != self.batch_size: + raise RuntimeError(f"Final selected batch size is {cfg.dataset_params.train_dataloader_params.batch_size}, expected: {self.batch_size}") + return cfg + + class TestAutoBatchSelectionSingleGPU(unittest.TestCase): def test_auto_batch_size_no_max(self): GlobalHydra.instance().clear() sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") - + init_trainer() with initialize_config_dir(config_dir=normalize_path(sg_recipes_dir), version_base="1.2"): cfg = compose(config_name="cifar10_resnet") cfg.experiment_name = "batch_size_selection_test_no_max" @@ -21,14 +35,15 @@ def test_auto_batch_size_no_max(self): OmegaConf.set_struct(cfg, True) with open_dict(cfg): cfg.pre_launch_callbacks_list = [ - OmegaConf.create({"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 64, "size_step": 10000, "num_forward_passes": 3}}) + OmegaConf.create({"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 64, "size_step": 10000, "num_forward_passes": 3}}), + OmegaConf.create({"PreLaunchTrainBatchSizeVerificationCallback": {"batch_size": 64}}), ] Trainer.train_from_config(cfg) def test_auto_batch_size_with_upper_limit(self): GlobalHydra.instance().clear() sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") - + init_trainer() with initialize_config_dir(config_dir=normalize_path(sg_recipes_dir), version_base="1.2"): cfg = compose(config_name="cifar10_resnet") cfg.experiment_name = "batch_size_selection_test_with_upper_limit" @@ -38,9 +53,11 @@ def test_auto_batch_size_with_upper_limit(self): cfg.pre_launch_callbacks_list = [ OmegaConf.create( {"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 32, "size_step": 32, "max_batch_size": 64, "num_forward_passes": 3}} - ) + ), + OmegaConf.create({"PreLaunchTrainBatchSizeVerificationCallback": {"batch_size": 64}}), ] Trainer.train_from_config(cfg) + print(cfg) if __name__ == "__main__": From 8ce63d7fab26ee5fc6fa250a5da44f8369d1227a Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 16 Jan 2023 15:10:17 +0200 Subject: [PATCH 7/9] scaling lr lint --- .../pre_launch_callbacks.py | 16 +++- ...tomatic_batch_selection_single_gpu_test.py | 79 ++++++++++++++++++- 2 files changed, 89 insertions(+), 6 deletions(-) diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py index dfdb87984e..d37089cf0c 100644 --- a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py +++ b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -55,7 +55,7 @@ class 
AutoTrainBatchSizeSelectionCallback(PreLaunchCallback): :param min_batch_size: int, the first batch size to try running forward passes. Should fit memory. - :param size_step: int, the difference between 2 consecutive batch_ssize trials. + :param size_step: int, the difference between 2 consecutive batch_size trials. :param num_forward_passes: int, number of forward passes (i.e train_loader data iterations inside an epoch). Note that the more forward passes being done, the less the selected batch size is prawn to fail. This is because @@ -64,9 +64,13 @@ class AutoTrainBatchSizeSelectionCallback(PreLaunchCallback): :param max_batch_size: int, optional, upper limit of the batch sizes to try. When None, the search will continue until the maximal batch size that does not raise CUDA OUT OF MEMORY is found (deafult=None). + + :param scale_lr: bool, whether to linearly scale cfg.training_hyperparams.initial_lr, i.e multiply by + FOUND_BATCH_SIZE/cfg.dataset_params.train_datalaoder_params.batch_size (default=True) """ - def __init__(self, min_batch_size: int, size_step: int, num_forward_passes: int = 3, max_batch_size=None): + def __init__(self, min_batch_size: int, size_step: int, num_forward_passes: int = 3, max_batch_size=None, scale_lr: bool = True): + self.scale_lr = scale_lr self.min_batch_size = min_batch_size self.size_step = size_step self.max_batch_size = max_batch_size @@ -112,6 +116,7 @@ def __call__(self, cfg: DictConfig) -> DictConfig: raise e else: logger.info(f"Ran out of memory for {curr_batch_size}, setting batch size to {curr_batch_size - self.size_step}.") + self._adapt_lr_if_needed(cfg, found_batch_size=curr_batch_size - self.size_step) cfg.dataset_params.train_dataloader_params.batch_size = curr_batch_size - self.size_step self._clear_model_gpu_mem(model) return cfg @@ -123,6 +128,7 @@ def __call__(self, cfg: DictConfig) -> DictConfig: logger.info( f"Did not run out of memory for {curr_batch_size} >= max_batch_size={self.max_batch_size}, " f"setting batch to {self.max_batch_size}." 
) + self._adapt_lr_if_needed(cfg, found_batch_size=self.max_batch_size) cfg.dataset_params.train_dataloader_params.batch_size = self.max_batch_size self._clear_model_gpu_mem(model) return cfg @@ -130,6 +136,12 @@ def __call__(self, cfg: DictConfig) -> DictConfig: curr_batch_size += self.size_step self._clear_model_gpu_mem(model) + def _adapt_lr_if_needed(self, cfg: DictConfig, found_batch_size: int) -> DictConfig: + if self.scale_lr: + scale_factor = found_batch_size / cfg.dataset_params.train_dataloader_params.batch_size + cfg.training_hyperparams.initial_lr = cfg.training_hyperparams.initial_lr * scale_factor + return cfg + @classmethod def _clear_model_gpu_mem(cls, model): for p in model.parameters(): diff --git a/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py b/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py index 80c81da631..d962899e03 100644 --- a/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py +++ b/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py @@ -14,7 +14,7 @@ @register_pre_launch_callback() class PreLaunchTrainBatchSizeVerificationCallback(PreLaunchCallback): - def __init__(self, batch_size, experiment_name): + def __init__(self, batch_size): self.batch_size = batch_size def __call__(self, cfg: Union[dict, DictConfig]) -> Union[dict, DictConfig]: @@ -23,8 +23,66 @@ def __call__(self, cfg: Union[dict, DictConfig]) -> Union[dict, DictConfig]: return cfg +@register_pre_launch_callback() +class PreLaunchLRVerificationCallback(PreLaunchCallback): + def __init__(self, lr): + self.lr = lr + + def __call__(self, cfg: Union[dict, DictConfig]) -> Union[dict, DictConfig]: + if cfg.training_hyperparams.initial_lr != self.lr: + raise RuntimeError(f"Final selected lr is {cfg.training_hyperparams.initial_lr }, expected: {self.lr}") + return cfg + + class TestAutoBatchSelectionSingleGPU(unittest.TestCase): - def test_auto_batch_size_no_max(self): + def test_auto_batch_size_no_max_no_lr_adaptation(self): + GlobalHydra.instance().clear() + sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") + init_trainer() + with initialize_config_dir(config_dir=normalize_path(sg_recipes_dir), version_base="1.2"): + cfg = compose(config_name="cifar10_resnet") + cfg.experiment_name = "batch_size_selection_test_no_max" + cfg.training_hyperparams.max_epochs = 1 + OmegaConf.set_struct(cfg, True) + with open_dict(cfg): + cfg.pre_launch_callbacks_list = [ + OmegaConf.create( + {"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 64, "size_step": 10000, "num_forward_passes": 3, "scale_lr": False}} + ), + OmegaConf.create({"PreLaunchTrainBatchSizeVerificationCallback": {"batch_size": 64}}), + OmegaConf.create({"PreLaunchLRVerificationCallback": {"lr": cfg.training_hyperparams.initial_lr}}), + ] + Trainer.train_from_config(cfg) + + def test_auto_batch_size_with_upper_limit_no_lr_adaptation(self): + GlobalHydra.instance().clear() + sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") + init_trainer() + with initialize_config_dir(config_dir=normalize_path(sg_recipes_dir), version_base="1.2"): + cfg = compose(config_name="cifar10_resnet") + cfg.experiment_name = "batch_size_selection_test_with_upper_limit" + cfg.training_hyperparams.max_epochs = 1 + OmegaConf.set_struct(cfg, True) + with open_dict(cfg): + cfg.pre_launch_callbacks_list = [ + OmegaConf.create( + { + "AutoTrainBatchSizeSelectionCallback": { + "min_batch_size": 32, + "size_step": 32, + 
"max_batch_size": 64, + "num_forward_passes": 3, + "scale_lr": False, + } + } + ), + OmegaConf.create({"PreLaunchTrainBatchSizeVerificationCallback": {"batch_size": 64}}), + OmegaConf.create({"PreLaunchLRVerificationCallback": {"lr": cfg.training_hyperparams.initial_lr}}), + OmegaConf.create({"PreLaunchLRVerificationCallback": {"lr": cfg.training_hyperparams.initial_lr}}), + ] + Trainer.train_from_config(cfg) + + def test_auto_batch_size_no_max_with_lr_adaptation(self): GlobalHydra.instance().clear() sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") init_trainer() @@ -37,10 +95,17 @@ def test_auto_batch_size_no_max(self): cfg.pre_launch_callbacks_list = [ OmegaConf.create({"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 64, "size_step": 10000, "num_forward_passes": 3}}), OmegaConf.create({"PreLaunchTrainBatchSizeVerificationCallback": {"batch_size": 64}}), + OmegaConf.create( + { + "PreLaunchLRVerificationCallback": { + "lr": cfg.training_hyperparams.initial_lr * 64 / cfg.dataset_params.train_dataloader_params.batch_size + } + } + ), ] Trainer.train_from_config(cfg) - def test_auto_batch_size_with_upper_limit(self): + def test_auto_batch_size_with_upper_limit_with_lr_adaptation(self): GlobalHydra.instance().clear() sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") init_trainer() @@ -55,9 +120,15 @@ def test_auto_batch_size_with_upper_limit(self): {"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 32, "size_step": 32, "max_batch_size": 64, "num_forward_passes": 3}} ), OmegaConf.create({"PreLaunchTrainBatchSizeVerificationCallback": {"batch_size": 64}}), + OmegaConf.create( + { + "PreLaunchLRVerificationCallback": { + "lr": cfg.training_hyperparams.initial_lr * 64 / cfg.dataset_params.train_dataloader_params.batch_size + } + } + ), ] Trainer.train_from_config(cfg) - print(cfg) if __name__ == "__main__": From f03b787d31a4454c18665566cf2694ea15ff0f0c Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 16 Jan 2023 15:16:24 +0200 Subject: [PATCH 8/9] conflicts resolved --- .../training/pre_launch_callbacks/pre_launch_callbacks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py index d37089cf0c..e0a234dec2 100644 --- a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py +++ b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -94,7 +94,7 @@ def __call__(self, cfg: DictConfig) -> DictConfig: ) tmp_cfg = deepcopy(cfg) tmp_cfg.training_hyperparams.batch_accumulate = 1 - tmp_cfg.training_hyperparams.max_forward_passes_train = self.num_forward_passes + tmp_cfg.training_hyperparams.max_train_batches = self.num_forward_passes tmp_cfg.training_hyperparams.run_validation_freq = 2 tmp_cfg.training_hyperparams.silent_mode = True tmp_cfg.training_hyperparams.save_model = False From 91015297d46cd9c07dbbda64cf39553d19fdbb28 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 22 Jan 2023 10:39:40 +0200 Subject: [PATCH 9/9] kil_ddp_pgroup_on_end typo fix --- src/super_gradients/training/params.py | 2 +- .../training/pre_launch_callbacks/pre_launch_callbacks.py | 2 +- src/super_gradients/training/sg_trainer/sg_trainer.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/super_gradients/training/params.py b/src/super_gradients/training/params.py index 016f3a8c54..de60f9364e 100755 --- 
a/src/super_gradients/training/params.py +++ b/src/super_gradients/training/params.py @@ -68,7 +68,7 @@ "ckpt_name": "ckpt_latest.pth", "resume_strict_load": False, "sync_bn": False, - "kil_ddp_pgroup_on_end": True, # Whether to kill the DDP process group in the end of training. + "kill_ddp_pgroup_on_end": True, # Whether to kill the DDP process group in the end of training. "max_train_batches": None, # For debug- when not None- will break out of inner train loop # (i.e iterating over train_loader) when reaching this number of batches. "max_valid_batches": None, # For debug- when not None- will break out of inner valid loop diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py index e0a234dec2..278e525236 100644 --- a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py +++ b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -100,7 +100,7 @@ def __call__(self, cfg: DictConfig) -> DictConfig: tmp_cfg.training_hyperparams.save_model = False tmp_cfg.training_hyperparams.max_epochs = 1 tmp_cfg.training_hyperparams.average_best_models = False - tmp_cfg.training_hyperparams.kil_ddp_pgroup_on_end = False + tmp_cfg.training_hyperparams.kill_ddp_pgroup_on_end = False tmp_cfg.pre_launch_callbacks_list = [] while True: diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py index bca45ecdd9..403616fa5c 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -1313,7 +1313,7 @@ def forward(self, inputs, targets): finally: if device_config.multi_gpu == MultiGPUMode.DISTRIBUTED_DATA_PARALLEL: # CLEAN UP THE MULTI-GPU PROCESS GROUP WHEN DONE - if torch.distributed.is_initialized() and self.training_params.kil_ddp_pgroup_on_end: + if torch.distributed.is_initialized() and self.training_params.kill_ddp_pgroup_on_end: torch.distributed.destroy_process_group() # PHASE.TRAIN_END
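The end-to-end pattern exercised by the new test file in this series can be condensed into a short standalone sketch. It is a minimal illustration only, assuming the stock cifar10_resnet recipe and the callback name registered by these patches; the min_batch_size/size_step/max_batch_size values below are hypothetical and the Hydra version_base may differ by release.

import pkg_resources
from hydra import compose, initialize_config_dir
from hydra.core.global_hydra import GlobalHydra
from omegaconf import OmegaConf, open_dict

from super_gradients import Trainer, init_trainer
from super_gradients.training.utils.hydra_utils import normalize_path

# Compose the stock recipe and enable the new pre-launch callback on top of it.
GlobalHydra.instance().clear()
init_trainer()
recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "")
with initialize_config_dir(config_dir=normalize_path(recipes_dir), version_base="1.2"):
    cfg = compose(config_name="cifar10_resnet")
    OmegaConf.set_struct(cfg, True)
    with open_dict(cfg):
        # Hypothetical search range: start at batch size 32, grow in steps of 32, cap at 256.
        cfg.pre_launch_callbacks_list = [
            OmegaConf.create(
                {"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 32, "size_step": 32, "max_batch_size": 256, "num_forward_passes": 3}}
            )
        ]
    # train_from_config triggers the pre-launch callbacks first: the callback overwrites
    # cfg.dataset_params.train_dataloader_params.batch_size (and scales initial_lr when
    # scale_lr=True), then training continues with the selected batch size.
    Trainer.train_from_config(cfg)

The same configuration can instead live directly in the recipe YAML under pre_launch_callbacks_list, as shown in the AutoTrainBatchSizeSelectionCallback docstring added in patch 4.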