From ce3f03fe56d4784b5b503bdb1fbf3d717a16f3d2 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Wed, 11 Jan 2023 19:13:38 +0200 Subject: [PATCH 1/9] black and lint --- .../factories/pre_launch_callbacks_factory.py | 7 ++ .../common/registry/registry.py | 2 + .../recipes/cifar10_resnet.yaml | 3 + src/super_gradients/training/params.py | 3 + .../training/pre_launch_callbacks/__init__.py | 5 + .../pre_launch_callbacks.py | 95 +++++++++++++++++++ .../training/sg_trainer/sg_trainer.py | 45 ++++++--- 7 files changed, 148 insertions(+), 12 deletions(-) create mode 100644 src/super_gradients/common/factories/pre_launch_callbacks_factory.py create mode 100644 src/super_gradients/training/pre_launch_callbacks/__init__.py create mode 100644 src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py diff --git a/src/super_gradients/common/factories/pre_launch_callbacks_factory.py b/src/super_gradients/common/factories/pre_launch_callbacks_factory.py new file mode 100644 index 0000000000..b4cec61259 --- /dev/null +++ b/src/super_gradients/common/factories/pre_launch_callbacks_factory.py @@ -0,0 +1,7 @@ +from super_gradients.common.factories.base_factory import BaseFactory +from super_gradients.training import pre_launch_callbacks + + +class PreLaunchCallbacksFactory(BaseFactory): + def __init__(self): + super().__init__(pre_launch_callbacks.ALL_PRE_LAUNCH_CALLBACKS) diff --git a/src/super_gradients/common/registry/registry.py b/src/super_gradients/common/registry/registry.py index 80c8a0cc80..af76cb4788 100644 --- a/src/super_gradients/common/registry/registry.py +++ b/src/super_gradients/common/registry/registry.py @@ -9,6 +9,7 @@ from super_gradients.training.utils.callbacks.all_callbacks import CALLBACKS from super_gradients.training.transforms.all_transforms import TRANSFORMS from super_gradients.training.datasets.all_datasets import ALL_DATASETS +from super_gradients.training.pre_launch_callbacks import ALL_PRE_LAUNCH_CALLBACKS def create_register_decorator(registry: Dict[str, Callable]) -> Callable: @@ -51,3 +52,4 @@ def decorator(cls: Callable) -> Callable: register_callback = create_register_decorator(registry=CALLBACKS) register_transform = create_register_decorator(registry=TRANSFORMS) register_dataset = create_register_decorator(registry=ALL_DATASETS) +register_pre_launch_callback = create_register_decorator(registry=ALL_PRE_LAUNCH_CALLBACKS) diff --git a/src/super_gradients/recipes/cifar10_resnet.yaml b/src/super_gradients/recipes/cifar10_resnet.yaml index 9fb1e1f90f..25d1d0775f 100644 --- a/src/super_gradients/recipes/cifar10_resnet.yaml +++ b/src/super_gradients/recipes/cifar10_resnet.yaml @@ -26,6 +26,9 @@ training_hyperparams: ckpt_root_dir: +pre_launch_callbacks_list: + - AutoTrainBatchSizeSelectionCallback + architecture: resnet18_cifar experiment_name: resnet18_cifar diff --git a/src/super_gradients/training/params.py b/src/super_gradients/training/params.py index e8bd1fa30c..9329f45e35 100755 --- a/src/super_gradients/training/params.py +++ b/src/super_gradients/training/params.py @@ -68,6 +68,9 @@ "ckpt_name": "ckpt_latest.pth", "resume_strict_load": False, "sync_bn": False, + "max_forward_passes_train": None, # When not None- will break out of inner train loop + # (i.e iterating over train_loader) when reaching this number of batches. + "kil_ddp_pgroup_on_end": True, # Whether to kill the DDP process group in the end of training. 
} DEFAULT_OPTIMIZER_PARAMS_SGD = {"weight_decay": 1e-4, "momentum": 0.9} diff --git a/src/super_gradients/training/pre_launch_callbacks/__init__.py b/src/super_gradients/training/pre_launch_callbacks/__init__.py new file mode 100644 index 0000000000..76c1c8f021 --- /dev/null +++ b/src/super_gradients/training/pre_launch_callbacks/__init__.py @@ -0,0 +1,5 @@ +from super_gradients.training.pre_launch_callbacks.pre_launch_callbacks import PreLaunchCallback, AutoTrainBatchSizeSelectionCallback + +ALL_PRE_LAUNCH_CALLBACKS = {"AutoTrainBatchSizeSelectionCallback": AutoTrainBatchSizeSelectionCallback} + +__all__ = ["PreLaunchCallback", "AutoTrainBatchSizeSelectionCallback", "ALL_PRE_LAUNCH_CALLBACKS"] diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py new file mode 100644 index 0000000000..671d954cd5 --- /dev/null +++ b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -0,0 +1,95 @@ +from copy import deepcopy +from typing import Union + +from omegaconf import DictConfig +import torch + +from super_gradients import is_distributed +from super_gradients.common.abstractions.abstract_logger import get_logger +from super_gradients.training import models +from torch.distributed import barrier + +logger = get_logger(__name__) + + +class PreLaunchCallback: + """ + PreLaunchCallback + + Base class for callbacks to be triggered, manipulating the config (cfg) prior to launching training, + when calling Trainer.train_from_config(cfg). + + """ + + def __call__(self, cfg: Union[dict, DictConfig]) -> Union[dict, DictConfig]: + raise NotImplementedError + + +class AutoTrainBatchSizeSelectionCallback(PreLaunchCallback): + def __init__(self, batch_size_start: int = 4096, size_step: int = 1024, num_forward_passes: int = 3): + self.batch_size_start = batch_size_start + self.size_step = size_step + self.num_forward_passes = num_forward_passes + + def __call__(self, cfg: DictConfig) -> DictConfig: + from super_gradients.training.sg_trainer import Trainer + + curr_batch_size = self.batch_size_start + # BUILD NETWORK + model = models.get( + model_name=cfg.architecture, + num_classes=cfg.arch_params.num_classes, + arch_params=cfg.arch_params, + strict_load=cfg.checkpoint_params.strict_load, + pretrained_weights=cfg.checkpoint_params.pretrained_weights, + checkpoint_path=cfg.checkpoint_params.checkpoint_path, + load_backbone=cfg.checkpoint_params.load_backbone, + ) + tmp_cfg = deepcopy(cfg) + tmp_cfg.training_hyperparams.batch_accumulate = 1 + tmp_cfg.training_hyperparams.max_forward_passes_train = self.num_forward_passes + tmp_cfg.training_hyperparams.run_validation_freq = 2 + tmp_cfg.training_hyperparams.silent_mode = True + tmp_cfg.training_hyperparams.save_model = False + tmp_cfg.training_hyperparams.max_epochs = 1 + tmp_cfg.training_hyperparams.average_best_models = False + tmp_cfg.training_hyperparams.kil_ddp_pgroup_on_end = False + tmp_cfg.pre_launch_callbacks_list = [] + + while True: + tmp_cfg.dataset_params.train_dataloader_params.batch_size = curr_batch_size + + try: + Trainer.train_from_config(tmp_cfg) + + except RuntimeError as e: + if "out of memory" in str(e): + if curr_batch_size == self.batch_size_start: + logger.error("Ran out of memory for the smallest batch, try setting smaller batch_size_start.") + raise e + else: + logger.info(f"Ran out of memory for {curr_batch_size}, setting batch size to {curr_batch_size - self.size_step}.") + 
cfg.dataset_params.train_dataloader_params.batch_size = curr_batch_size - self.size_step + for p in model.parameters(): + if p.grad is not None: + del p.grad # free some memory + torch.cuda.empty_cache() + + # WAIT FOR ALL PROCESSES TO CLEAR THEIR MEMORY BEFORE MOVING ON + if is_distributed(): + barrier() + return cfg + else: + raise e + + else: + logger.info(f"Did not run out of memory for {curr_batch_size}, retrying batch {curr_batch_size + self.size_step}.") + curr_batch_size += self.size_step + for p in model.parameters(): + if p.grad is not None: + del p.grad # free some memory + torch.cuda.empty_cache() + + # WAIT FOR ALL PROCESSES TO CLEAR THEIR MEMORY BEFORE MOVING ON + if is_distributed(): + barrier() diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py index 232ba3b710..c841fc84a1 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -86,6 +86,7 @@ from super_gradients.training.utils import HpmStruct from super_gradients.training.utils.hydra_utils import load_experiment_cfg, add_params_to_cfg from omegaconf import OmegaConf +from super_gradients.common.factories.pre_launch_callbacks_factory import PreLaunchCallbacksFactory logger = get_logger(__name__) @@ -193,6 +194,7 @@ def __init__(self, experiment_name: str, device: str = None, multi_gpu: Union[Mu self.train_monitored_values = {} self.valid_monitored_values = {} + self.max_forward_passes_train = None @property def device(self) -> str: @@ -216,8 +218,22 @@ def train_from_config(cls, cfg: Union[DictConfig, dict]) -> Tuple[nn.Module, Tup # INSTANTIATE ALL OBJECTS IN CFG cfg = hydra.utils.instantiate(cfg) + # TRIGGER CFG MODIFYING CALLBACKS + cfg = cls._trigger_cfg_modifying_callbacks(cfg) + trainer = Trainer(experiment_name=cfg.experiment_name, ckpt_root_dir=cfg.ckpt_root_dir) + # BUILD NETWORK + model = models.get( + model_name=cfg.architecture, + num_classes=cfg.arch_params.num_classes, + arch_params=cfg.arch_params, + strict_load=cfg.checkpoint_params.strict_load, + pretrained_weights=cfg.checkpoint_params.pretrained_weights, + checkpoint_path=cfg.checkpoint_params.checkpoint_path, + load_backbone=cfg.checkpoint_params.load_backbone, + ) + # INSTANTIATE DATA LOADERS train_dataloader = dataloaders.get( @@ -232,16 +248,6 @@ def train_from_config(cls, cfg: Union[DictConfig, dict]) -> Tuple[nn.Module, Tup dataloader_params=cfg.dataset_params.val_dataloader_params, ) - # BUILD NETWORK - model = models.get( - model_name=cfg.architecture, - num_classes=cfg.arch_params.num_classes, - arch_params=cfg.arch_params, - strict_load=cfg.checkpoint_params.strict_load, - pretrained_weights=cfg.checkpoint_params.pretrained_weights, - checkpoint_path=cfg.checkpoint_params.checkpoint_path, - load_backbone=cfg.checkpoint_params.load_backbone, - ) recipe_logged_cfg = {"recipe_config": OmegaConf.to_container(cfg, resolve=True)} # TRAIN res = trainer.train( @@ -254,6 +260,14 @@ def train_from_config(cls, cfg: Union[DictConfig, dict]) -> Tuple[nn.Module, Tup return model, res + @classmethod + def _trigger_cfg_modifying_callbacks(cls, cfg): + pre_launch_cbs = get_param(cfg, "pre_launch_callbacks_list", list()) + pre_launch_cbs = ListFactory(PreLaunchCallbacksFactory()).get(pre_launch_cbs) + for plcb in pre_launch_cbs: + cfg = plcb(cfg) + return cfg + @classmethod def resume_experiment(cls, experiment_name: str, ckpt_root_dir: str = None) -> Tuple[nn.Module, Tuple]: """ @@ -445,7 +459,7 @@ def 
_train_epoch(self, epoch: int, silent_mode: bool = False) -> tuple: # TODO: ITERATE BY MAX ITERS # FOR INFINITE SAMPLERS WE MUST BREAK WHEN REACHING LEN ITERATIONS. - if self._infinite_train_loader and batch_idx == len(self.train_loader) - 1: + if self._infinite_train_loader and batch_idx == len(self.train_loader) - 1 or self.max_forward_passes_train == batch_idx: break if not self.ddp_silent_mode: @@ -965,6 +979,12 @@ def forward(self, inputs, targets): percentile: float, percentile value to use when Trainer,quant_modules_calib_method='percentile'. Discarded when other methods are used (Default=99.99). + - `max_forward_passes_train`: int, when not None- will break out of inner train loop (i.e iterating over + train_loader) when reaching this number of batches. Usefull for debugging (default=None). + + - `kil_ddp_pgroup_on_end`: bool, whether to kill the DDP process group in the end of training. + Useful when launching consecutive DDP trainings with the same Trainer object (default=True). + :return: """ @@ -1142,6 +1162,7 @@ def forward(self, inputs, targets): ) self.ckpt_best_name = self.training_params.ckpt_best_name + self.max_forward_passes_train = self.training_params.max_forward_passes_train # STATE ATTRIBUTE SET HERE FOR SUBSEQUENT TRAIN() CALLS self._first_backward = True @@ -1265,7 +1286,7 @@ def forward(self, inputs, targets): finally: if device_config.multi_gpu == MultiGPUMode.DISTRIBUTED_DATA_PARALLEL: # CLEAN UP THE MULTI-GPU PROCESS GROUP WHEN DONE - if torch.distributed.is_initialized(): + if torch.distributed.is_initialized() and self.training_params.kil_ddp_pgroup_on_end: torch.distributed.destroy_process_group() # PHASE.TRAIN_END From f46ff3d2e3c67baa15a505b2e39c62bec97d7b9d Mon Sep 17 00:00:00 2001 From: shayaharon Date: Thu, 12 Jan 2023 10:43:02 +0200 Subject: [PATCH 2/9] circular import comment added --- .../training/pre_launch_callbacks/pre_launch_callbacks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py index 671d954cd5..9b49cb71ae 100644 --- a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py +++ b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -32,6 +32,8 @@ def __init__(self, batch_size_start: int = 4096, size_step: int = 1024, num_forw self.num_forward_passes = num_forward_passes def __call__(self, cfg: DictConfig) -> DictConfig: + + # IMPORT IS HERE DUE TO CIRCULAR IMPORT PROBLEM from super_gradients.training.sg_trainer import Trainer curr_batch_size = self.batch_size_start From f2d99b6f95f2926cbc942586afdadb1c3a2cde97 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Thu, 12 Jan 2023 12:23:08 +0200 Subject: [PATCH 3/9] max batch size arg added --- .../pre_launch_callbacks.py | 39 +++++++++---------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py index 9b49cb71ae..83b1840f62 100644 --- a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py +++ b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -26,9 +26,10 @@ def __call__(self, cfg: Union[dict, DictConfig]) -> Union[dict, DictConfig]: class AutoTrainBatchSizeSelectionCallback(PreLaunchCallback): - def __init__(self, batch_size_start: int = 4096, size_step: int = 1024, num_forward_passes: int = 
3): - self.batch_size_start = batch_size_start + def __init__(self, min_batch_size: int, size_step: int, num_forward_passes: int = 3, max_batch_size=None): + self.min_batch_size = min_batch_size self.size_step = size_step + self.max_batch_size = max_batch_size self.num_forward_passes = num_forward_passes def __call__(self, cfg: DictConfig) -> DictConfig: @@ -36,7 +37,7 @@ def __call__(self, cfg: DictConfig) -> DictConfig: # IMPORT IS HERE DUE TO CIRCULAR IMPORT PROBLEM from super_gradients.training.sg_trainer import Trainer - curr_batch_size = self.batch_size_start + curr_batch_size = self.min_batch_size # BUILD NETWORK model = models.get( model_name=cfg.architecture, @@ -66,20 +67,13 @@ def __call__(self, cfg: DictConfig) -> DictConfig: except RuntimeError as e: if "out of memory" in str(e): - if curr_batch_size == self.batch_size_start: - logger.error("Ran out of memory for the smallest batch, try setting smaller batch_size_start.") + if curr_batch_size == self.min_batch_size: + logger.error("Ran out of memory for the smallest batch, try setting smaller min_batch_size.") raise e else: logger.info(f"Ran out of memory for {curr_batch_size}, setting batch size to {curr_batch_size - self.size_step}.") cfg.dataset_params.train_dataloader_params.batch_size = curr_batch_size - self.size_step - for p in model.parameters(): - if p.grad is not None: - del p.grad # free some memory - torch.cuda.empty_cache() - - # WAIT FOR ALL PROCESSES TO CLEAR THEIR MEMORY BEFORE MOVING ON - if is_distributed(): - barrier() + self._clear_model_gpu_mem(model) return cfg else: raise e @@ -87,11 +81,14 @@ def __call__(self, cfg: DictConfig) -> DictConfig: else: logger.info(f"Did not run out of memory for {curr_batch_size}, retrying batch {curr_batch_size + self.size_step}.") curr_batch_size += self.size_step - for p in model.parameters(): - if p.grad is not None: - del p.grad # free some memory - torch.cuda.empty_cache() - - # WAIT FOR ALL PROCESSES TO CLEAR THEIR MEMORY BEFORE MOVING ON - if is_distributed(): - barrier() + self._clear_model_gpu_mem(model) + + @classmethod + def _clear_model_gpu_mem(cls, model): + for p in model.parameters(): + if p.grad is not None: + del p.grad # free some memory + torch.cuda.empty_cache() + # WAIT FOR ALL PROCESSES TO CLEAR THEIR MEMORY BEFORE MOVING ON + if is_distributed(): + barrier() From c0cacdce768718b59515ca65c6b7f59905068fa8 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 16 Jan 2023 10:35:45 +0200 Subject: [PATCH 4/9] lint --- .../recipes/cifar10_resnet.yaml | 3 -- .../pre_launch_callbacks.py | 40 +++++++++++++++++++ ...tomatic_batch_selection_single_gpu_test.py | 31 ++++++++++++++ 3 files changed, 71 insertions(+), 3 deletions(-) create mode 100644 tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py diff --git a/src/super_gradients/recipes/cifar10_resnet.yaml b/src/super_gradients/recipes/cifar10_resnet.yaml index 25d1d0775f..9fb1e1f90f 100644 --- a/src/super_gradients/recipes/cifar10_resnet.yaml +++ b/src/super_gradients/recipes/cifar10_resnet.yaml @@ -26,9 +26,6 @@ training_hyperparams: ckpt_root_dir: -pre_launch_callbacks_list: - - AutoTrainBatchSizeSelectionCallback - architecture: resnet18_cifar experiment_name: resnet18_cifar diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py index 83b1840f62..66a35b9d0f 100644 --- a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py +++ 
b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -26,6 +26,46 @@ def __call__(self, cfg: Union[dict, DictConfig]) -> Union[dict, DictConfig]: class AutoTrainBatchSizeSelectionCallback(PreLaunchCallback): + """ + AutoTrainBatchSizeSelectionCallback + + Modifies cfg.dataset_params.train_dataloader_params.batch_size by searching for the maximal batch size that fits + gpu memory. Works out of the box for DDP. + + The search is done by running a few forward passes for increasing batch sizes, until CUDA OUT OF MEMORY is raised: + + For batch_size in range(min_batch_size:max_batch_size:size_step): + if batch_size raises CUDA OUT OF MEMORY ERROR: + return batch_size-size_step + return batch_size + + Example usage: Inside the main recipe .YAML file (for example super_gradients/recipes/cifar10_resnet.yaml), + add the following: + + pre_launch_callbacks_list: + - AutoTrainBatchSizeSelectionCallback: + min_batch_size: 128 + size_step: 64 + num_forward_passes: 10 + + Then, when running super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=... + this pre_launch_callback will modify cfg.dataset_params.train_dataloader_params.batch_size then pass cfg to + Trainer.train_from_config(cfg) and training will continue with the selected batch size. + + + :param min_batch_size: int, the first batch size to try running forward passes. Should fit memory. + + :param size_step: int, the difference between 2 consecutive batch_ssize trials. + + :param num_forward_passes: int, number of forward passes (i.e train_loader data iterations inside an epoch). + Note that the more forward passes being done, the less the selected batch size is prawn to fail. This is because + other then gradients, model computations, data and other fixed gpu memory that is being used- some more gpu memory + might be used by the metric objects and PhaseCallbacks. + + :param max_batch_size: int, optional, upper limit of the batch sizes to try. When None, the search will continue until + the maximal batch size that does not raise CUDA OUT OF MEMORY is found (deafult=None). 
+ """ + def __init__(self, min_batch_size: int, size_step: int, num_forward_passes: int = 3, max_batch_size=None): self.min_batch_size = min_batch_size self.size_step = size_step diff --git a/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py b/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py new file mode 100644 index 0000000000..c37fa1bd3e --- /dev/null +++ b/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py @@ -0,0 +1,31 @@ +import unittest + +import pkg_resources +from hydra import initialize_config_dir +from hydra.core.global_hydra import GlobalHydra +from hydra import compose +from omegaconf import OmegaConf, open_dict + +from super_gradients import Trainer +from super_gradients.training.utils.hydra_utils import normalize_path + + +class MyTestCase(unittest.TestCase): + def test_something(self): + GlobalHydra.instance().clear() + sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") + + with initialize_config_dir(config_dir=normalize_path(sg_recipes_dir), version_base="1.2"): + cfg = compose(config_name="cifar10_resnet") + cfg.experiment_name = "batch_size_selection_test_no_max" + cfg.training_hyperparams.max_epochs = 1 + OmegaConf.set_struct(cfg, True) + with open_dict(cfg): + cfg.pre_launch_callbacks_list = [ + OmegaConf.create({"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 64, "size_step": 10000, "num_forward_passes": 3}}) + ] + Trainer.train_from_config(cfg) + + +if __name__ == "__main__": + unittest.main() From ef3d5f5c4eeaf045f54cbd4691db760ba9160734 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 16 Jan 2023 13:48:10 +0200 Subject: [PATCH 5/9] tests lint --- .../pre_launch_callbacks.py | 7 ++++++ tests/deci_core_recipe_test_suite_runner.py | 2 ++ ...tomatic_batch_selection_single_gpu_test.py | 22 ++++++++++++++++--- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py index 66a35b9d0f..dfdb87984e 100644 --- a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py +++ b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -119,6 +119,13 @@ def __call__(self, cfg: DictConfig) -> DictConfig: raise e else: + if self.max_batch_size is not None and curr_batch_size >= self.max_batch_size: + logger.info( + f"Did not run out of memory for {curr_batch_size} >= max_batch_size={self.max_batch_size}, " f"setting batch to {self.max_batch_size}." 
+ ) + cfg.dataset_params.train_dataloader_params.batch_size = self.max_batch_size + self._clear_model_gpu_mem(model) + return cfg logger.info(f"Did not run out of memory for {curr_batch_size}, retrying batch {curr_batch_size + self.size_step}.") curr_batch_size += self.size_step self._clear_model_gpu_mem(model) diff --git a/tests/deci_core_recipe_test_suite_runner.py b/tests/deci_core_recipe_test_suite_runner.py index 5d682b4625..02696c1498 100644 --- a/tests/deci_core_recipe_test_suite_runner.py +++ b/tests/deci_core_recipe_test_suite_runner.py @@ -1,6 +1,7 @@ import sys import unittest +from tests.recipe_training_tests.automatic_batch_selection_single_gpu_test import TestAutoBatchSelectionSingleGPU from tests.recipe_training_tests.shortened_recipes_accuracy_test import ShortenedRecipesAccuracyTests @@ -17,6 +18,7 @@ def _add_modules_to_unit_tests_suite(self): :return: """ self.recipe_tests_suite.addTest(self.test_loader.loadTestsFromModule(ShortenedRecipesAccuracyTests)) + self.recipe_tests_suite.addTest(self.test_loader.loadTestsFromModule(TestAutoBatchSelectionSingleGPU)) if __name__ == "__main__": diff --git a/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py b/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py index c37fa1bd3e..f135b88c61 100644 --- a/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py +++ b/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py @@ -5,13 +5,12 @@ from hydra.core.global_hydra import GlobalHydra from hydra import compose from omegaconf import OmegaConf, open_dict - from super_gradients import Trainer from super_gradients.training.utils.hydra_utils import normalize_path -class MyTestCase(unittest.TestCase): - def test_something(self): +class TestAutoBatchSelectionSingleGPU(unittest.TestCase): + def test_auto_batch_size_no_max(self): GlobalHydra.instance().clear() sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") @@ -26,6 +25,23 @@ def test_something(self): ] Trainer.train_from_config(cfg) + def test_auto_batch_size_with_upper_limit(self): + GlobalHydra.instance().clear() + sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") + + with initialize_config_dir(config_dir=normalize_path(sg_recipes_dir), version_base="1.2"): + cfg = compose(config_name="cifar10_resnet") + cfg.experiment_name = "batch_size_selection_test_with_upper_limit" + cfg.training_hyperparams.max_epochs = 1 + OmegaConf.set_struct(cfg, True) + with open_dict(cfg): + cfg.pre_launch_callbacks_list = [ + OmegaConf.create( + {"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 32, "size_step": 32, "max_batch_size": 64, "num_forward_passes": 3}} + ) + ] + Trainer.train_from_config(cfg) + if __name__ == "__main__": unittest.main() From a3db1b6f74da4ce74afcf9708e49a15c47287e16 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 16 Jan 2023 14:42:59 +0200 Subject: [PATCH 6/9] redundant arg in helper cb removed lint --- ...tomatic_batch_selection_single_gpu_test.py | 29 +++++++++++++++---- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py b/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py index f135b88c61..80c81da631 100644 --- a/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py +++ b/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py @@ -1,19 +1,33 @@ import unittest +from typing import Union import 
pkg_resources from hydra import initialize_config_dir from hydra.core.global_hydra import GlobalHydra from hydra import compose -from omegaconf import OmegaConf, open_dict -from super_gradients import Trainer +from omegaconf import OmegaConf, open_dict, DictConfig +from super_gradients import Trainer, init_trainer +from super_gradients.common.registry.registry import register_pre_launch_callback +from super_gradients.training.pre_launch_callbacks import PreLaunchCallback from super_gradients.training.utils.hydra_utils import normalize_path +@register_pre_launch_callback() +class PreLaunchTrainBatchSizeVerificationCallback(PreLaunchCallback): + def __init__(self, batch_size, experiment_name): + self.batch_size = batch_size + + def __call__(self, cfg: Union[dict, DictConfig]) -> Union[dict, DictConfig]: + if cfg.dataset_params.train_dataloader_params.batch_size != self.batch_size: + raise RuntimeError(f"Final selected batch size is {cfg.dataset_params.train_dataloader_params.batch_size}, expected: {self.batch_size}") + return cfg + + class TestAutoBatchSelectionSingleGPU(unittest.TestCase): def test_auto_batch_size_no_max(self): GlobalHydra.instance().clear() sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") - + init_trainer() with initialize_config_dir(config_dir=normalize_path(sg_recipes_dir), version_base="1.2"): cfg = compose(config_name="cifar10_resnet") cfg.experiment_name = "batch_size_selection_test_no_max" @@ -21,14 +35,15 @@ def test_auto_batch_size_no_max(self): OmegaConf.set_struct(cfg, True) with open_dict(cfg): cfg.pre_launch_callbacks_list = [ - OmegaConf.create({"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 64, "size_step": 10000, "num_forward_passes": 3}}) + OmegaConf.create({"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 64, "size_step": 10000, "num_forward_passes": 3}}), + OmegaConf.create({"PreLaunchTrainBatchSizeVerificationCallback": {"batch_size": 64}}), ] Trainer.train_from_config(cfg) def test_auto_batch_size_with_upper_limit(self): GlobalHydra.instance().clear() sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") - + init_trainer() with initialize_config_dir(config_dir=normalize_path(sg_recipes_dir), version_base="1.2"): cfg = compose(config_name="cifar10_resnet") cfg.experiment_name = "batch_size_selection_test_with_upper_limit" @@ -38,9 +53,11 @@ def test_auto_batch_size_with_upper_limit(self): cfg.pre_launch_callbacks_list = [ OmegaConf.create( {"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 32, "size_step": 32, "max_batch_size": 64, "num_forward_passes": 3}} - ) + ), + OmegaConf.create({"PreLaunchTrainBatchSizeVerificationCallback": {"batch_size": 64}}), ] Trainer.train_from_config(cfg) + print(cfg) if __name__ == "__main__": From 8ce63d7fab26ee5fc6fa250a5da44f8369d1227a Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 16 Jan 2023 15:10:17 +0200 Subject: [PATCH 7/9] scaling lr lint --- .../pre_launch_callbacks.py | 16 +++- ...tomatic_batch_selection_single_gpu_test.py | 79 ++++++++++++++++++- 2 files changed, 89 insertions(+), 6 deletions(-) diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py index dfdb87984e..d37089cf0c 100644 --- a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py +++ b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -55,7 +55,7 @@ class 
AutoTrainBatchSizeSelectionCallback(PreLaunchCallback): :param min_batch_size: int, the first batch size to try running forward passes. Should fit memory. - :param size_step: int, the difference between 2 consecutive batch_ssize trials. + :param size_step: int, the difference between 2 consecutive batch_size trials. :param num_forward_passes: int, number of forward passes (i.e train_loader data iterations inside an epoch). Note that the more forward passes being done, the less the selected batch size is prawn to fail. This is because @@ -64,9 +64,13 @@ class AutoTrainBatchSizeSelectionCallback(PreLaunchCallback): :param max_batch_size: int, optional, upper limit of the batch sizes to try. When None, the search will continue until the maximal batch size that does not raise CUDA OUT OF MEMORY is found (deafult=None). + + :param scale_lr: bool, whether to linearly scale cfg.training_hyperparams.initial_lr, i.e multiply by + FOUND_BATCH_SIZE/cfg.dataset_params.train_datalaoder_params.batch_size (default=True) """ - def __init__(self, min_batch_size: int, size_step: int, num_forward_passes: int = 3, max_batch_size=None): + def __init__(self, min_batch_size: int, size_step: int, num_forward_passes: int = 3, max_batch_size=None, scale_lr: bool = True): + self.scale_lr = scale_lr self.min_batch_size = min_batch_size self.size_step = size_step self.max_batch_size = max_batch_size @@ -112,6 +116,7 @@ def __call__(self, cfg: DictConfig) -> DictConfig: raise e else: logger.info(f"Ran out of memory for {curr_batch_size}, setting batch size to {curr_batch_size - self.size_step}.") + self._adapt_lr_if_needed(cfg, found_batch_size=curr_batch_size - self.size_step) cfg.dataset_params.train_dataloader_params.batch_size = curr_batch_size - self.size_step self._clear_model_gpu_mem(model) return cfg @@ -123,6 +128,7 @@ def __call__(self, cfg: DictConfig) -> DictConfig: logger.info( f"Did not run out of memory for {curr_batch_size} >= max_batch_size={self.max_batch_size}, " f"setting batch to {self.max_batch_size}." 
) + self._adapt_lr_if_needed(cfg, found_batch_size=self.max_batch_size) cfg.dataset_params.train_dataloader_params.batch_size = self.max_batch_size self._clear_model_gpu_mem(model) return cfg @@ -130,6 +136,12 @@ def __call__(self, cfg: DictConfig) -> DictConfig: curr_batch_size += self.size_step self._clear_model_gpu_mem(model) + def _adapt_lr_if_needed(self, cfg: DictConfig, found_batch_size: int) -> DictConfig: + if self.scale_lr: + scale_factor = found_batch_size / cfg.dataset_params.train_dataloader_params.batch_size + cfg.training_hyperparams.initial_lr = cfg.training_hyperparams.initial_lr * scale_factor + return cfg + @classmethod def _clear_model_gpu_mem(cls, model): for p in model.parameters(): diff --git a/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py b/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py index 80c81da631..d962899e03 100644 --- a/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py +++ b/tests/recipe_training_tests/automatic_batch_selection_single_gpu_test.py @@ -14,7 +14,7 @@ @register_pre_launch_callback() class PreLaunchTrainBatchSizeVerificationCallback(PreLaunchCallback): - def __init__(self, batch_size, experiment_name): + def __init__(self, batch_size): self.batch_size = batch_size def __call__(self, cfg: Union[dict, DictConfig]) -> Union[dict, DictConfig]: @@ -23,8 +23,66 @@ def __call__(self, cfg: Union[dict, DictConfig]) -> Union[dict, DictConfig]: return cfg +@register_pre_launch_callback() +class PreLaunchLRVerificationCallback(PreLaunchCallback): + def __init__(self, lr): + self.lr = lr + + def __call__(self, cfg: Union[dict, DictConfig]) -> Union[dict, DictConfig]: + if cfg.training_hyperparams.initial_lr != self.lr: + raise RuntimeError(f"Final selected lr is {cfg.training_hyperparams.initial_lr }, expected: {self.lr}") + return cfg + + class TestAutoBatchSelectionSingleGPU(unittest.TestCase): - def test_auto_batch_size_no_max(self): + def test_auto_batch_size_no_max_no_lr_adaptation(self): + GlobalHydra.instance().clear() + sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") + init_trainer() + with initialize_config_dir(config_dir=normalize_path(sg_recipes_dir), version_base="1.2"): + cfg = compose(config_name="cifar10_resnet") + cfg.experiment_name = "batch_size_selection_test_no_max" + cfg.training_hyperparams.max_epochs = 1 + OmegaConf.set_struct(cfg, True) + with open_dict(cfg): + cfg.pre_launch_callbacks_list = [ + OmegaConf.create( + {"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 64, "size_step": 10000, "num_forward_passes": 3, "scale_lr": False}} + ), + OmegaConf.create({"PreLaunchTrainBatchSizeVerificationCallback": {"batch_size": 64}}), + OmegaConf.create({"PreLaunchLRVerificationCallback": {"lr": cfg.training_hyperparams.initial_lr}}), + ] + Trainer.train_from_config(cfg) + + def test_auto_batch_size_with_upper_limit_no_lr_adaptation(self): + GlobalHydra.instance().clear() + sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") + init_trainer() + with initialize_config_dir(config_dir=normalize_path(sg_recipes_dir), version_base="1.2"): + cfg = compose(config_name="cifar10_resnet") + cfg.experiment_name = "batch_size_selection_test_with_upper_limit" + cfg.training_hyperparams.max_epochs = 1 + OmegaConf.set_struct(cfg, True) + with open_dict(cfg): + cfg.pre_launch_callbacks_list = [ + OmegaConf.create( + { + "AutoTrainBatchSizeSelectionCallback": { + "min_batch_size": 32, + "size_step": 32, + 
"max_batch_size": 64, + "num_forward_passes": 3, + "scale_lr": False, + } + } + ), + OmegaConf.create({"PreLaunchTrainBatchSizeVerificationCallback": {"batch_size": 64}}), + OmegaConf.create({"PreLaunchLRVerificationCallback": {"lr": cfg.training_hyperparams.initial_lr}}), + OmegaConf.create({"PreLaunchLRVerificationCallback": {"lr": cfg.training_hyperparams.initial_lr}}), + ] + Trainer.train_from_config(cfg) + + def test_auto_batch_size_no_max_with_lr_adaptation(self): GlobalHydra.instance().clear() sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") init_trainer() @@ -37,10 +95,17 @@ def test_auto_batch_size_no_max(self): cfg.pre_launch_callbacks_list = [ OmegaConf.create({"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 64, "size_step": 10000, "num_forward_passes": 3}}), OmegaConf.create({"PreLaunchTrainBatchSizeVerificationCallback": {"batch_size": 64}}), + OmegaConf.create( + { + "PreLaunchLRVerificationCallback": { + "lr": cfg.training_hyperparams.initial_lr * 64 / cfg.dataset_params.train_dataloader_params.batch_size + } + } + ), ] Trainer.train_from_config(cfg) - def test_auto_batch_size_with_upper_limit(self): + def test_auto_batch_size_with_upper_limit_with_lr_adaptation(self): GlobalHydra.instance().clear() sg_recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "") init_trainer() @@ -55,9 +120,15 @@ def test_auto_batch_size_with_upper_limit(self): {"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 32, "size_step": 32, "max_batch_size": 64, "num_forward_passes": 3}} ), OmegaConf.create({"PreLaunchTrainBatchSizeVerificationCallback": {"batch_size": 64}}), + OmegaConf.create( + { + "PreLaunchLRVerificationCallback": { + "lr": cfg.training_hyperparams.initial_lr * 64 / cfg.dataset_params.train_dataloader_params.batch_size + } + } + ), ] Trainer.train_from_config(cfg) - print(cfg) if __name__ == "__main__": From f03b787d31a4454c18665566cf2694ea15ff0f0c Mon Sep 17 00:00:00 2001 From: shayaharon Date: Mon, 16 Jan 2023 15:16:24 +0200 Subject: [PATCH 8/9] conflicts resolved --- .../training/pre_launch_callbacks/pre_launch_callbacks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py index d37089cf0c..e0a234dec2 100644 --- a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py +++ b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -94,7 +94,7 @@ def __call__(self, cfg: DictConfig) -> DictConfig: ) tmp_cfg = deepcopy(cfg) tmp_cfg.training_hyperparams.batch_accumulate = 1 - tmp_cfg.training_hyperparams.max_forward_passes_train = self.num_forward_passes + tmp_cfg.training_hyperparams.max_train_batches = self.num_forward_passes tmp_cfg.training_hyperparams.run_validation_freq = 2 tmp_cfg.training_hyperparams.silent_mode = True tmp_cfg.training_hyperparams.save_model = False From 91015297d46cd9c07dbbda64cf39553d19fdbb28 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Sun, 22 Jan 2023 10:39:40 +0200 Subject: [PATCH 9/9] kil_ddp_pgroup_on_end typo fix --- src/super_gradients/training/params.py | 2 +- .../training/pre_launch_callbacks/pre_launch_callbacks.py | 2 +- src/super_gradients/training/sg_trainer/sg_trainer.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/super_gradients/training/params.py b/src/super_gradients/training/params.py index 016f3a8c54..de60f9364e 100755 --- 
a/src/super_gradients/training/params.py +++ b/src/super_gradients/training/params.py @@ -68,7 +68,7 @@ "ckpt_name": "ckpt_latest.pth", "resume_strict_load": False, "sync_bn": False, - "kil_ddp_pgroup_on_end": True, # Whether to kill the DDP process group in the end of training. + "kill_ddp_pgroup_on_end": True, # Whether to kill the DDP process group in the end of training. "max_train_batches": None, # For debug- when not None- will break out of inner train loop # (i.e iterating over train_loader) when reaching this number of batches. "max_valid_batches": None, # For debug- when not None- will break out of inner valid loop diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py index e0a234dec2..278e525236 100644 --- a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py +++ b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -100,7 +100,7 @@ def __call__(self, cfg: DictConfig) -> DictConfig: tmp_cfg.training_hyperparams.save_model = False tmp_cfg.training_hyperparams.max_epochs = 1 tmp_cfg.training_hyperparams.average_best_models = False - tmp_cfg.training_hyperparams.kil_ddp_pgroup_on_end = False + tmp_cfg.training_hyperparams.kill_ddp_pgroup_on_end = False tmp_cfg.pre_launch_callbacks_list = [] while True: diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py index bca45ecdd9..403616fa5c 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -1313,7 +1313,7 @@ def forward(self, inputs, targets): finally: if device_config.multi_gpu == MultiGPUMode.DISTRIBUTED_DATA_PARALLEL: # CLEAN UP THE MULTI-GPU PROCESS GROUP WHEN DONE - if torch.distributed.is_initialized() and self.training_params.kil_ddp_pgroup_on_end: + if torch.distributed.is_initialized() and self.training_params.kill_ddp_pgroup_on_end: torch.distributed.destroy_process_group() # PHASE.TRAIN_END
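The end-to-end pattern exercised by the new test file in this series can be condensed into a short standalone sketch. It is a minimal illustration only, assuming the stock cifar10_resnet recipe and the callback name registered by these patches; the min_batch_size/size_step/max_batch_size values below are hypothetical and the Hydra version_base may differ by release.

import pkg_resources
from hydra import compose, initialize_config_dir
from hydra.core.global_hydra import GlobalHydra
from omegaconf import OmegaConf, open_dict

from super_gradients import Trainer, init_trainer
from super_gradients.training.utils.hydra_utils import normalize_path

# Compose the stock recipe and enable the new pre-launch callback on top of it.
GlobalHydra.instance().clear()
init_trainer()
recipes_dir = pkg_resources.resource_filename("super_gradients.recipes", "")
with initialize_config_dir(config_dir=normalize_path(recipes_dir), version_base="1.2"):
    cfg = compose(config_name="cifar10_resnet")
    OmegaConf.set_struct(cfg, True)
    with open_dict(cfg):
        # Hypothetical search range: start at batch size 32, grow in steps of 32, cap at 256.
        cfg.pre_launch_callbacks_list = [
            OmegaConf.create(
                {"AutoTrainBatchSizeSelectionCallback": {"min_batch_size": 32, "size_step": 32, "max_batch_size": 256, "num_forward_passes": 3}}
            )
        ]
    # train_from_config triggers the pre-launch callbacks first: the callback overwrites
    # cfg.dataset_params.train_dataloader_params.batch_size (and scales initial_lr when
    # scale_lr=True), then training continues with the selected batch size.
    Trainer.train_from_config(cfg)

The same configuration can instead live directly in the recipe YAML under pre_launch_callbacks_list, as shown in the AutoTrainBatchSizeSelectionCallback docstring added in patch 4.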