From 67c1e726322657cdad7588a313d11b5f5e22baef Mon Sep 17 00:00:00 2001
From: Louis Dupont
Date: Wed, 8 Mar 2023 12:58:39 +0200
Subject: [PATCH 1/5] fix

---
 .../training/sg_trainer/sg_trainer.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py
index 89373a9cc6..bd16369b9d 100755
--- a/src/super_gradients/training/sg_trainer/sg_trainer.py
+++ b/src/super_gradients/training/sg_trainer/sg_trainer.py
@@ -194,6 +194,8 @@ def __init__(self, experiment_name: str, device: str = None, multi_gpu: Union[Mu
         self.max_train_batches = None
         self.max_valid_batches = None
 
+        self.epoch_start_lr_dict = {}
+
     @property
     def device(self) -> str:
         return device_config.device
@@ -443,7 +445,7 @@ def _train_epoch(self, epoch: int, silent_mode: bool = False) -> tuple:
 
             # LOG LR THAT WILL BE USED IN CURRENT EPOCH AND AFTER FIRST WARMUP/LR_SCHEDULER UPDATE BEFORE WEIGHT UPDATE
             if not self.ddp_silent_mode and batch_idx == 0:
-                self._write_lrs(epoch)
+                self.epoch_start_lr_dict = self._get_lr_dict()
 
             self._backward_step(loss, epoch, batch_idx, context)
 
@@ -1280,7 +1282,14 @@ def forward(self, inputs, targets):
 
             if not self.ddp_silent_mode:
                 # SAVING AND LOGGING OCCURS ONLY IN THE MAIN PROCESS (IN CASES THERE ARE SEVERAL PROCESSES - DDP)
-                self._write_to_disk_operations(train_metrics_tuple, validation_results_tuple, inf_time, epoch, context)
+                self._write_to_disk_operations(
+                    train_metrics=train_metrics_tuple,
+                    validation_results=validation_results_tuple,
+                    lr_dict=self.epoch_start_lr_dict,
+                    inf_time=inf_time,
+                    epoch=epoch,
+                    context=context,
+                )
                 self.sg_logger.upload()
 
         # Evaluating the average model and removing snapshot averaging file if training is completed
@@ -1609,7 +1618,7 @@ def _get_hyper_param_config(self):
         }
         return hyper_param_config
 
-    def _write_to_disk_operations(self, train_metrics: tuple, validation_results: tuple, inf_time: float, epoch: int, context: PhaseContext):
+    def _write_to_disk_operations(self, train_metrics: tuple, validation_results: tuple, lr_dict: dict, inf_time: float, epoch: int, context: PhaseContext):
         """Run the various logging operations, e.g.: log file, Tensorboard, save checkpoint etc."""
         # STORE VALUES IN A TENSORBOARD FILE
         train_results = list(train_metrics) + list(validation_results) + [inf_time]
@@ -1617,16 +1626,17 @@ def _write_to_disk_operations(self, train_metrics: tuple, validation_results: tu
         result_dict = {all_titles[i]: train_results[i] for i in range(len(train_results))}
 
         self.sg_logger.add_scalars(tag_scalar_dict=result_dict, global_step=epoch)
+        self.sg_logger.add_scalars(tag_scalar_dict=lr_dict, global_step=epoch)
 
         # SAVE THE CHECKPOINT
         if self.training_params.save_model:
             self._save_checkpoint(self.optimizer, epoch + 1, validation_results, context)
 
-    def _write_lrs(self, epoch):
+    def _get_lr_dict(self) -> dict:
         lrs = [self.optimizer.param_groups[i]["lr"] for i in range(len(self.optimizer.param_groups))]
         lr_titles = ["LR/Param_group_" + str(i) for i in range(len(self.optimizer.param_groups))] if len(self.optimizer.param_groups) > 1 else ["LR"]
         lr_dict = {lr_titles[i]: lrs[i] for i in range(len(lrs))}
-        self.sg_logger.add_scalars(tag_scalar_dict=lr_dict, global_step=epoch)
+        return lr_dict
 
     def test(
         self,

From 5dfc06f16c44ce80225ad2b75f355d2872ce8b7e Mon Sep 17 00:00:00 2001
From: Louis Dupont
Date: Wed, 8 Mar 2023 15:19:44 +0200
Subject: [PATCH 2/5] deepcopy

---
 src/super_gradients/training/sg_trainer/sg_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py
index 378e8f4c14..4876b2763e 100755
--- a/src/super_gradients/training/sg_trainer/sg_trainer.py
+++ b/src/super_gradients/training/sg_trainer/sg_trainer.py
@@ -1637,7 +1637,7 @@ def _get_lr_dict(self) -> dict:
         lrs = [self.optimizer.param_groups[i]["lr"] for i in range(len(self.optimizer.param_groups))]
         lr_titles = ["LR/Param_group_" + str(i) for i in range(len(self.optimizer.param_groups))] if len(self.optimizer.param_groups) > 1 else ["LR"]
         lr_dict = {lr_titles[i]: lrs[i] for i in range(len(lrs))}
-        return lr_dict
+        return deepcopy(lr_dict)
 
     def test(
         self,

From 05d11aec36c8b4c4d9c448c2d4dbbf5c8b382508 Mon Sep 17 00:00:00 2001
From: Louis Dupont
Date: Sun, 12 Mar 2023 11:54:16 +0200
Subject: [PATCH 3/5] fix according to comments

---
 src/super_gradients/training/sg_trainer/sg_trainer.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py
index 4876b2763e..221dc78afe 100755
--- a/src/super_gradients/training/sg_trainer/sg_trainer.py
+++ b/src/super_gradients/training/sg_trainer/sg_trainer.py
@@ -194,8 +194,6 @@ def __init__(self, experiment_name: str, device: str = None, multi_gpu: Union[Mu
         self.max_train_batches = None
         self.max_valid_batches = None
 
-        self.epoch_start_lr_dict = {}
-
     @property
     def device(self) -> str:
         return device_config.device
@@ -444,9 +442,8 @@ def _train_epoch(self, epoch: int, silent_mode: bool = False) -> tuple:
             context.update_context(preds=outputs, loss_log_items=loss_log_items)
             self.phase_callback_handler.on_train_batch_loss_end(context)
 
-            # LOG LR THAT WILL BE USED IN CURRENT EPOCH AND AFTER FIRST WARMUP/LR_SCHEDULER UPDATE BEFORE WEIGHT UPDATE
             if not self.ddp_silent_mode and batch_idx == 0:
-                self.epoch_start_lr_dict = self._get_lr_dict()
+                self.epoch_start_logging_values = self._get_epoch_start_logging_values()
 
             self._backward_step(loss, epoch, batch_idx, context)
 
@@ -1286,7 +1283,7 @@ def forward(self, inputs, targets):
                 self._write_to_disk_operations(
                     train_metrics=train_metrics_tuple,
                     validation_results=validation_results_tuple,
-                    lr_dict=self.epoch_start_lr_dict,
+                    lr_dict=self.epoch_start_logging_values,
                     inf_time=inf_time,
                     epoch=epoch,
                     context=context,
@@ -1633,7 +1630,9 @@ def _write_to_disk_operations(self, train_metrics: tuple, validation_results: tu
         if self.training_params.save_model:
             self._save_checkpoint(self.optimizer, epoch + 1, validation_results, context)
 
-    def _get_lr_dict(self) -> dict:
+    def _get_epoch_start_logging_values(self) -> dict:
+        """Get all the values that should be logged at the start of each epoch.
+        This is useful for values like Learning Rate that can change over an epoch."""
         lrs = [self.optimizer.param_groups[i]["lr"] for i in range(len(self.optimizer.param_groups))]
         lr_titles = ["LR/Param_group_" + str(i) for i in range(len(self.optimizer.param_groups))] if len(self.optimizer.param_groups) > 1 else ["LR"]
         lr_dict = {lr_titles[i]: lrs[i] for i in range(len(lrs))}

From b175cb56639df38a5bdc90a7d4baf5fb0bed8965 Mon Sep 17 00:00:00 2001
From: Louis Dupont
Date: Sun, 12 Mar 2023 11:57:22 +0200
Subject: [PATCH 4/5] make private

---
 src/super_gradients/training/sg_trainer/sg_trainer.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py
index 221dc78afe..72c7a5ac9f 100755
--- a/src/super_gradients/training/sg_trainer/sg_trainer.py
+++ b/src/super_gradients/training/sg_trainer/sg_trainer.py
@@ -194,6 +194,8 @@ def __init__(self, experiment_name: str, device: str = None, multi_gpu: Union[Mu
         self.max_train_batches = None
         self.max_valid_batches = None
 
+        self._epoch_start_logging_values = {}
+
     @property
     def device(self) -> str:
         return device_config.device
@@ -443,7 +445,7 @@ def _train_epoch(self, epoch: int, silent_mode: bool = False) -> tuple:
             self.phase_callback_handler.on_train_batch_loss_end(context)
 
             if not self.ddp_silent_mode and batch_idx == 0:
-                self.epoch_start_logging_values = self._get_epoch_start_logging_values()
+                self._epoch_start_logging_values = self._get_epoch_start_logging_values()
 
             self._backward_step(loss, epoch, batch_idx, context)
 
@@ -1283,7 +1285,7 @@ def forward(self, inputs, targets):
                 self._write_to_disk_operations(
                     train_metrics=train_metrics_tuple,
                     validation_results=validation_results_tuple,
-                    lr_dict=self.epoch_start_logging_values,
+                    lr_dict=self._epoch_start_logging_values,
                     inf_time=inf_time,
                     epoch=epoch,
                     context=context,

From 592b3ea961e896269450adc9065252250dd51673 Mon Sep 17 00:00:00 2001
From: Louis Dupont
Date: Sun, 12 Mar 2023 11:58:15 +0200
Subject: [PATCH 5/5] remove deepcopy

---
 src/super_gradients/training/sg_trainer/sg_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py
index 72c7a5ac9f..bee74ca032 100755
--- a/src/super_gradients/training/sg_trainer/sg_trainer.py
+++ b/src/super_gradients/training/sg_trainer/sg_trainer.py
@@ -1638,7 +1638,7 @@ def _get_epoch_start_logging_values(self) -> dict:
         lrs = [self.optimizer.param_groups[i]["lr"] for i in range(len(self.optimizer.param_groups))]
         lr_titles = ["LR/Param_group_" + str(i) for i in range(len(self.optimizer.param_groups))] if len(self.optimizer.param_groups) > 1 else ["LR"]
         lr_dict = {lr_titles[i]: lrs[i] for i in range(len(lrs))}
-        return deepcopy(lr_dict)
+        return lr_dict
 
     def test(
        self,
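For reference, the naming convention produced by the final _get_epoch_start_logging_values helper can be sketched in isolation. The snippet below is only an illustration of the logic carried by this patch series, not the Trainer code itself: the FakeOptimizer stand-in and the module-level get_epoch_start_logging_values function are hypothetical names that merely mimic optimizer.param_groups and the method added to Trainer.

# Standalone sketch of the epoch-start LR snapshot introduced by this patch series.
# FakeOptimizer only mimics the `param_groups` attribute of a torch optimizer;
# it is not part of super-gradients.

from typing import Dict, List


class FakeOptimizer:
    """Minimal stand-in exposing `param_groups` like torch.optim.Optimizer."""

    def __init__(self, lrs: List[float]):
        self.param_groups = [{"lr": lr} for lr in lrs]


def get_epoch_start_logging_values(optimizer: FakeOptimizer) -> Dict[str, float]:
    """Snapshot the values to log at the start of each epoch.

    Mirrors the patched Trainer._get_epoch_start_logging_values: one
    "LR/Param_group_i" entry per param group, or a single "LR" entry
    when the optimizer has only one group.
    """
    lrs = [group["lr"] for group in optimizer.param_groups]
    if len(optimizer.param_groups) > 1:
        lr_titles = ["LR/Param_group_" + str(i) for i in range(len(lrs))]
    else:
        lr_titles = ["LR"]
    return dict(zip(lr_titles, lrs))


if __name__ == "__main__":
    # Single param group -> {"LR": 0.01}
    print(get_epoch_start_logging_values(FakeOptimizer([0.01])))
    # Two param groups -> {"LR/Param_group_0": 0.01, "LR/Param_group_1": 0.001}
    print(get_epoch_start_logging_values(FakeOptimizer([0.01, 0.001])))

In the patched Trainer, this dict is captured once at batch_idx == 0 of each epoch and later passed to _write_to_disk_operations, so the logged learning rates reflect the values in effect at the start of the epoch, written through sg_logger.add_scalars together with the other per-epoch scalars.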