From f8686cd3000af5c293cedb8ca6412aa9e21053dc Mon Sep 17 00:00:00 2001 From: Shay Aharon <80472096+shaydeci@users.noreply.github.com> Date: Mon, 13 Nov 2023 10:14:05 +0200 Subject: [PATCH] Allow setting per-layer learning rates" (#1612) * updated schedulers, warmup and logging * updated recipes * updated tests * updated some typing and docs * updated some typing and docs * removed update_param_groups test * handled training with instantiated optimizer * updated instantiated optimizer tests * fixed erased initial_lr in unit test * updated message and tests * added train test --------- Co-authored-by: Eugene Khvedchenya --- .../recipes/cityscapes_ddrnet.yaml | 12 +- .../recipes/cityscapes_kd_base.yaml | 14 +- .../recipes/cityscapes_pplite_seg50.yaml | 3 + .../recipes/cityscapes_pplite_seg75.yaml | 4 + .../recipes/cityscapes_stdc_seg50.yaml | 3 + .../recipes/cityscapes_stdc_seg75.yaml | 5 +- .../coco2017_yolo_nas_train_params.yaml | 4 +- .../yolo_nas_pose_collate_fn.py | 2 +- .../models/segmentation_models/segformer.py | 2 +- src/super_gradients/training/params.py | 8 +- .../training/sg_trainer/sg_trainer.py | 44 +++-- .../training/utils/callbacks/callbacks.py | 110 +++++++---- .../training/utils/optimizer_utils.py | 114 ++++++++++- .../training/utils/sg_trainer_utils.py | 51 ++++- tests/deci_core_unit_test_suite_runner.py | 2 - tests/unit_tests/lr_warmup_test.py | 13 +- tests/unit_tests/test_lr_assignment.py | 183 ++++++++++++++++++ .../train_with_intialized_param_args_test.py | 5 +- .../update_param_groups_unit_test.py | 56 ------ 19 files changed, 495 insertions(+), 140 deletions(-) create mode 100644 tests/unit_tests/test_lr_assignment.py delete mode 100644 tests/unit_tests/update_param_groups_unit_test.py diff --git a/src/super_gradients/recipes/cityscapes_ddrnet.yaml b/src/super_gradients/recipes/cityscapes_ddrnet.yaml index d6763593c8..4e8329921c 100644 --- a/src/super_gradients/recipes/cityscapes_ddrnet.yaml +++ b/src/super_gradients/recipes/cityscapes_ddrnet.yaml @@ -55,7 +55,17 @@ architecture: ddrnet_23 training_hyperparams: max_epochs: 500 - initial_lr: 0.0075 # batch size 24 + initial_lr: # batch size 24 + default: 0.075 + # backbone layers + _backbone: 0.0075 + compression3: 0.0075 + compression4: 0.0075 + down3: 0.0075 + down4: 0.0075 + layer3_skip: 0.0075 + layer4_skip: 0.0075 + layer5_skip: 0.0075 loss: DiceCEEdgeLoss: num_classes: 19 diff --git a/src/super_gradients/recipes/cityscapes_kd_base.yaml b/src/super_gradients/recipes/cityscapes_kd_base.yaml index b6592f5948..62f9c87e8b 100644 --- a/src/super_gradients/recipes/cityscapes_kd_base.yaml +++ b/src/super_gradients/recipes/cityscapes_kd_base.yaml @@ -50,7 +50,19 @@ resume: False training_hyperparams: sync_bn: True max_epochs: 500 - initial_lr: 0.0075 # batch size 24 + + initial_lr: # batch size 24 + default: 0.075 + # backbone layers + _backbone: 0.0075 + compression3: 0.0075 + compression4: 0.0075 + down3: 0.0075 + down4: 0.0075 + layer3_skip: 0.0075 + layer4_skip: 0.0075 + layer5_skip: 0.0075 + resume: ${resume} loss: _target_: super_gradients.training.losses.seg_kd_loss.SegKDLoss diff --git a/src/super_gradients/recipes/cityscapes_pplite_seg50.yaml b/src/super_gradients/recipes/cityscapes_pplite_seg50.yaml index ad1b90d03a..b364e9b463 100644 --- a/src/super_gradients/recipes/cityscapes_pplite_seg50.yaml +++ b/src/super_gradients/recipes/cityscapes_pplite_seg50.yaml @@ -67,6 +67,9 @@ checkpoint_params: training_hyperparams: sync_bn: True + initial_lr: + "encoder.backbone": 0.01 + default: 0.1 loss: DiceCEEdgeLoss: 
num_classes: 19 diff --git a/src/super_gradients/recipes/cityscapes_pplite_seg75.yaml b/src/super_gradients/recipes/cityscapes_pplite_seg75.yaml index cbc19e4660..6cb9ffb874 100644 --- a/src/super_gradients/recipes/cityscapes_pplite_seg75.yaml +++ b/src/super_gradients/recipes/cityscapes_pplite_seg75.yaml @@ -62,6 +62,10 @@ checkpoint_params: training_hyperparams: sync_bn: True + initial_lr: + "encoder.backbone": 0.01 + default: 0.1 + loss: DiceCEEdgeLoss: num_classes: 19 diff --git a/src/super_gradients/recipes/cityscapes_stdc_seg50.yaml b/src/super_gradients/recipes/cityscapes_stdc_seg50.yaml index 05f565256d..a64f32d82c 100644 --- a/src/super_gradients/recipes/cityscapes_stdc_seg50.yaml +++ b/src/super_gradients/recipes/cityscapes_stdc_seg50.yaml @@ -60,6 +60,9 @@ checkpoint_params: strict_load: no_key_matching training_hyperparams: + initial_lr: + cp: 0.01 + default: 0.1 sync_bn: True loss: DiceCEEdgeLoss: diff --git a/src/super_gradients/recipes/cityscapes_stdc_seg75.yaml b/src/super_gradients/recipes/cityscapes_stdc_seg75.yaml index c5b6ff7b5a..4799d4c3a4 100644 --- a/src/super_gradients/recipes/cityscapes_stdc_seg75.yaml +++ b/src/super_gradients/recipes/cityscapes_stdc_seg75.yaml @@ -64,7 +64,10 @@ checkpoint_params: strict_load: no_key_matching training_hyperparams: - initial_lr: 0.005 + initial_lr: + cp: 0.005 + default: 0.05 + sync_bn: True loss: diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml index ec6364ca51..60ae2dc419 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml @@ -8,7 +8,9 @@ warmup_initial_lr: 1e-6 lr_warmup_steps: 1000 lr_warmup_epochs: 0 -initial_lr: 2e-4 +initial_lr: 2e-4 + + lr_mode: CosineLRScheduler cosine_final_lr_ratio: 0.1 diff --git a/src/super_gradients/training/datasets/pose_estimation_datasets/yolo_nas_pose_collate_fn.py b/src/super_gradients/training/datasets/pose_estimation_datasets/yolo_nas_pose_collate_fn.py index 18f3b3080e..58d306929f 100644 --- a/src/super_gradients/training/datasets/pose_estimation_datasets/yolo_nas_pose_collate_fn.py +++ b/src/super_gradients/training/datasets/pose_estimation_datasets/yolo_nas_pose_collate_fn.py @@ -3,7 +3,7 @@ import numpy as np import torch from torch import Tensor -from torch.utils.data import default_collate +from torch.utils.data.dataloader import default_collate from super_gradients.common.registry.registry import register_collate_function from super_gradients.training.samples import PoseEstimationSample diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index 56a31abbff..d5c2544cb4 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -458,7 +458,7 @@ def _separate_lr_multiply_params(self): backbone_names = [n for n, p in self.backbone.named_parameters()] multiply_lr_params, no_multiply_params = {}, {} for name, param in self.named_parameters(): - if name in backbone_names: + if any([backbone_name in name for backbone_name in backbone_names]): no_multiply_params[name] = param else: multiply_lr_params[name] = param diff --git a/src/super_gradients/training/params.py b/src/super_gradients/training/params.py index 
1388457841..9c4ca2d275 100755
--- a/src/super_gradients/training/params.py
+++ b/src/super_gradients/training/params.py
@@ -51,6 +51,7 @@
     "warmup_mode": "LinearEpochLRWarmup",
     "step_lr_update_freq": None,
     "lr_updates": [],
+    "initial_lr": None,
     "clip_grad_norm": None,
     "pre_prediction_callback": None,
     "ckpt_best_name": "ckpt_best.pth",
@@ -98,7 +99,12 @@
         # "lr_updates": {"type": "array", "minItems": 1},
         "lr_decay_factor": {"type": "number", "minimum": 0, "maximum": 1},
         "lr_warmup_epochs": {"type": "number", "minimum": 0, "maximum": 10},
-        "initial_lr": {"type": "number", "exclusiveMinimum": 0, "maximum": 10},
+        "initial_lr": {
+            "anyOf": [
+                {"type": ["number", "string", "boolean", "null"]},
+                {"type": "object", "patternProperties": {"^[a-zA-Z0-9_.]+$": {"type": "number"}}, "additionalProperties": False},
+            ]
+        },
     },
     "if": {"properties": {"lr_mode": {"const": "StepLRScheduler"}}},
     "then": {"required": ["lr_updates", "lr_decay_factor"]},
diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py
index c9efd304c8..8037a80cff 100755
--- a/src/super_gradients/training/sg_trainer/sg_trainer.py
+++ b/src/super_gradients/training/sg_trainer/sg_trainer.py
@@ -76,7 +76,7 @@
     broadcast_from_master,
 )
 from super_gradients.training.utils.ema import ModelEMA
-from super_gradients.training.utils.optimizer_utils import build_optimizer
+from super_gradients.training.utils.optimizer_utils import build_optimizer, get_initial_lr_from_optimizer
 from super_gradients.training.utils.sg_trainer_utils import MonitoredValue, log_main_training_params
 from super_gradients.training.utils.utils import fuzzy_idx_in_list, unwrap_model
 from super_gradients.training.utils.weight_averaging_utils import ModelWeightAveraging
@@ -906,9 +906,16 @@ def train(
                 Final learning rate ratio (only relevant when `lr_mode`='CosineLRScheduler'). The cosine starts from initial_lr and reaches
                  initial_lr * cosine_final_lr_ratio in last epoch

-            - `inital_lr` : float
+            - `initial_lr` : Union[float, Dict[str, float]]

-                Initial learning rate.
+                Initial learning rate, either as:
+                    a float - a single learning rate value applied to all parameters, or
+                    a dictionary whose keys are group names and whose values are learning rates,
+                    for example {"default": 0.01, "head": 0.1}.
+
+                - Keys in such a mapping are prefixes of the model's named parameters.
+                - The "default" key is mandatory, and its lr value is applied to every group not covered by the other keys.
+                - It is also possible to freeze parts of the model by assigning them an lr value of 0.
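For reference, here is a minimal sketch of how the mapping form documented above is used in practice. It mirrors the new test_train_with_lr_assignment unit test added later in this patch; LeNet, the classification test dataloader, and the experiment name are illustrative choices rather than requirements.

from super_gradients import Trainer
from super_gradients.training.dataloaders.dataloaders import classification_test_dataloader
from super_gradients.training.metrics import Accuracy
from super_gradients.training.models import LeNet

# "default": 0 freezes every parameter whose name does not start with "fc3",
# so only the last fully-connected layer of LeNet is trained.
trainer = Trainer("per_layer_lr_example")
train_params = {
    "max_epochs": 3,
    "lr_mode": "StepLRScheduler",
    "lr_updates": [],
    "lr_decay_factor": 0.1,
    "initial_lr": {"default": 0, "fc3": 0.1},
    "loss": "CrossEntropyLoss",
    "optimizer": "SGD",
    "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9},
    "train_metrics_list": [Accuracy()],
    "valid_metrics_list": [Accuracy()],
    "metric_to_watch": "Accuracy",
    "greater_metric_to_watch_is_better": True,
}
trainer.train(
    model=LeNet(),
    training_params=train_params,
    train_loader=classification_test_dataloader(batch_size=4),
    valid_loader=classification_test_dataloader(batch_size=4),
)

The unit test asserts exactly this behaviour by checking that only the fc3 weights change during training.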
            - `loss` : Union[nn.module, str]
@@ -1310,6 +1317,20 @@ def forward(self, inputs, targets):
             else:
                 raise RuntimeError("warmup_mode has to be either a name of a mode (str) or a subclass of PhaseCallback")
 
+            if isinstance(self.training_params.optimizer, str) or (
+                inspect.isclass(self.training_params.optimizer) and issubclass(self.training_params.optimizer, torch.optim.Optimizer)
+            ):
+                self.optimizer = build_optimizer(net=unwrap_model(self.net), lr=self.training_params.initial_lr, training_params=self.training_params)
+            elif isinstance(self.training_params.optimizer, torch.optim.Optimizer):
+                if self.training_params.initial_lr is not None:
+                    raise RuntimeError("An instantiated optimizer cannot be passed along with initial_lr != None")
+                self.optimizer = self.training_params.optimizer
+
+                # NEED TO EXTRACT INITIAL_LR FROM THE OPTIMIZER PARAM GROUPS
+                self.training_params.initial_lr = get_initial_lr_from_optimizer(self.optimizer)
+            else:
+                raise UnsupportedOptimizerFormat()
+
             if warmup_callback_cls is not None:
                 self.phase_callbacks.append(
                     warmup_callback_cls(
@@ -1343,15 +1364,6 @@ def forward(self, inputs, targets):
             self._reset_best_metric()
             load_opt_params = False
 
-        if isinstance(self.training_params.optimizer, str) or (
-            inspect.isclass(self.training_params.optimizer) and issubclass(self.training_params.optimizer, torch.optim.Optimizer)
-        ):
-            self.optimizer = build_optimizer(net=unwrap_model(self.net), lr=self.training_params.initial_lr, training_params=self.training_params)
-        elif isinstance(self.training_params.optimizer, torch.optim.Optimizer):
-            self.optimizer = self.training_params.optimizer
-        else:
-            raise UnsupportedOptimizerFormat()
-
         if self.lr_mode is not None:
             lr_scheduler_callback = create_lr_scheduler_callback(
                 lr_mode=self.lr_mode,
@@ -1448,6 +1460,8 @@ def forward(self, inputs, targets):
             train_dataset_length=len(self.train_loader.dataset),
             train_dataloader_len=len(self.train_loader),
             max_train_batches=self.max_train_batches,
+            model=unwrap_model(self.net),
+            param_groups=self.optimizer.param_groups,
         )
 
         self._maybe_set_preprocessing_params_for_model_from_dataset()
@@ -1992,7 +2006,11 @@ def _get_epoch_start_logging_values(self) -> dict:
         """Get all the values that should be logged at the start of each epoch.
This is useful for values like Learning Rate that can change over an epoch.""" lrs = [self.optimizer.param_groups[i]["lr"] for i in range(len(self.optimizer.param_groups))] - lr_titles = ["LR/Param_group_" + str(i) for i in range(len(self.optimizer.param_groups))] if len(self.optimizer.param_groups) > 1 else ["LR"] + lr_titles = ( + ["LR/" + self.optimizer.param_groups[i].get("name", str(i)) for i in range(len(self.optimizer.param_groups))] + if len(self.optimizer.param_groups) > 1 + else ["LR"] + ) lr_dict = {lr_titles[i]: lrs[i] for i in range(len(lrs))} return lr_dict diff --git a/src/super_gradients/training/utils/callbacks/callbacks.py b/src/super_gradients/training/utils/callbacks/callbacks.py index a77765c3c1..4d91b75cda 100644 --- a/src/super_gradients/training/utils/callbacks/callbacks.py +++ b/src/super_gradients/training/utils/callbacks/callbacks.py @@ -1,6 +1,7 @@ import copy import csv import math +import numbers import os import signal import time @@ -236,8 +237,10 @@ class LRCallbackBase(PhaseCallback): def __init__(self, phase, initial_lr, update_param_groups, train_loader_len, net, training_params, **kwargs): super(LRCallbackBase, self).__init__(phase) + if not isinstance(initial_lr, dict): + initial_lr = {"default": float(initial_lr)} self.initial_lr = initial_lr - self.lr = initial_lr + self.lr = initial_lr.copy() self.update_param_groups = update_param_groups self.train_loader_len = train_loader_len self.net = net @@ -265,15 +268,8 @@ def perform_scheduling(self, context: PhaseContext): raise NotImplementedError def update_lr(self, optimizer, epoch, batch_idx=None): - if self.update_param_groups: - param_groups = unwrap_model(self.net).update_param_groups( - optimizer.param_groups, self.lr, epoch, batch_idx, self.training_params, self.train_loader_len - ) - optimizer.param_groups = param_groups - else: - # UPDATE THE OPTIMIZERS PARAMETER - for param_group in optimizer.param_groups: - param_group["lr"] = self.lr + for param_group in optimizer.param_groups: + param_group["lr"] = self.lr[param_group["name"]] @register_lr_warmup(LRWarmups.LINEAR_EPOCH_STEP, deprecated_name="linear_epoch_step") @@ -287,13 +283,32 @@ class LinearEpochLRWarmup(LRCallbackBase): def __init__(self, **kwargs): super().__init__(Phase.TRAIN_EPOCH_START, **kwargs) - self.warmup_initial_lr = self.training_params.warmup_initial_lr or self.initial_lr / (self.training_params.lr_warmup_epochs + 1) - self.warmup_step_size = ( - (self.initial_lr - self.warmup_initial_lr) / self.training_params.lr_warmup_epochs if self.training_params.lr_warmup_epochs > 0 else 0 - ) + warmup_initial_lr = {} + if self.training_params.warmup_initial_lr is not None: + if isinstance(self.training_params.warmup_initial_lr, float): + for group_name in self.initial_lr.keys(): + warmup_initial_lr[group_name] = self.training_params.warmup_initial_lr + elif isinstance(self.training_params.warmup_initial_lr, Mapping): + warmup_initial_lr = self.training_params.warmup_initial_lr + else: + raise TypeError("Warmup initial lr expected to be of type float or Mapping.") + else: + for group_name in self.initial_lr.keys(): + warmup_initial_lr[group_name] = self.initial_lr[group_name] / (self.training_params.lr_warmup_epochs + 1) + self.warmup_initial_lr = warmup_initial_lr + + warmup_step_size = {} + for group_name in self.initial_lr.keys(): + warmup_step_size[group_name] = ( + (self.initial_lr[group_name] - self.warmup_initial_lr[group_name]) / self.training_params.lr_warmup_epochs + if self.training_params.lr_warmup_epochs > 0 + else 0 + ) + 
self.warmup_step_size = warmup_step_size def perform_scheduling(self, context): - self.lr = self.warmup_initial_lr + context.epoch * self.warmup_step_size + for group_name in self.initial_lr.keys(): + self.lr[group_name] = self.warmup_initial_lr[group_name] + context.epoch * self.warmup_step_size[group_name] self.update_lr(context.optimizer, context.epoch, None) def is_lr_scheduling_enabled(self, context): @@ -327,7 +342,6 @@ def __init__( warmup_initial_lr: float, initial_lr: float, train_loader_len: int, - update_param_groups: bool, lr_warmup_steps: int, training_params, net, @@ -350,12 +364,23 @@ def __init__( f"Warmup steps will be capped to number of steps in epoch to avoid interfering with any pre-epoch LR schedulers." ) - lr_warmup_steps = min(lr_warmup_steps, train_loader_len) - learning_rates = np.linspace(start=warmup_initial_lr, stop=initial_lr, num=lr_warmup_steps, endpoint=True) - - self.lr = initial_lr + if isinstance(initial_lr, numbers.Number): + initial_lr = {"default": initial_lr} self.initial_lr = initial_lr - self.update_param_groups = update_param_groups + self.lr = initial_lr.copy() + + if isinstance(warmup_initial_lr, numbers.Number): + warmup_initial_lr = {group_name: warmup_initial_lr for group_name in self.lr.keys()} + elif isinstance(warmup_initial_lr, Mapping): + warmup_initial_lr = warmup_initial_lr + else: + raise TypeError("Warmup initial lr expected to be of type float or Mapping.") + + lr_warmup_steps = min(lr_warmup_steps, train_loader_len) + learning_rates = { + group_name: np.linspace(start=warmup_initial_lr[group_name], stop=initial_lr[group_name], num=lr_warmup_steps, endpoint=True) + for group_name in self.initial_lr.keys() + } self.training_params = training_params self.net = net self.learning_rates = learning_rates @@ -365,7 +390,8 @@ def __init__( def on_train_batch_start(self, context: PhaseContext) -> None: global_training_step = context.batch_idx + context.epoch * self.train_loader_len if global_training_step < self.lr_warmup_steps: - self.lr = float(self.learning_rates[global_training_step]) + for group_name in self.initial_lr.keys(): + self.lr[group_name] = float(self.learning_rates[group_name][global_training_step]) self.update_lr(context.optimizer, context.epoch, context.batch_idx) def update_lr(self, optimizer, epoch, batch_idx=None): @@ -376,15 +402,9 @@ def update_lr(self, optimizer, epoch, batch_idx=None): :param batch_idx: :return: """ - if self.update_param_groups: - param_groups = unwrap_model(self.net).update_param_groups( - optimizer.param_groups, self.lr, epoch, batch_idx, self.training_params, self.train_loader_len - ) - optimizer.param_groups = param_groups - else: - # UPDATE THE OPTIMIZERS PARAMETER - for param_group in optimizer.param_groups: - param_group["lr"] = self.lr + # UPDATE THE OPTIMIZERS PARAMETER + for param_group in optimizer.param_groups: + param_group["lr"] = self.lr[param_group["name"]] @deprecated(deprecated_since="3.2.1", removed_from="3.5.0", target=LinearBatchLRWarmup) @@ -416,7 +436,8 @@ def __init__(self, lr_updates, lr_decay_factor, step_lr_update_freq=None, **kwar def perform_scheduling(self, context): num_updates_passed = [x for x in self.lr_updates if x <= context.epoch] - self.lr = self.initial_lr * self.lr_decay_factor ** len(num_updates_passed) + for group_name in self.lr.keys(): + self.lr[group_name] = self.initial_lr[group_name] * self.lr_decay_factor ** len(num_updates_passed) self.update_lr(context.optimizer, context.epoch, None) def is_lr_scheduling_enabled(self, context): @@ -441,7 +462,8 @@ 
def __init__(self, lr_decay_factor: float, **kwargs): def perform_scheduling(self, context): effective_epoch = context.epoch - self.training_params.lr_warmup_epochs current_iter = self.train_loader_len * effective_epoch + context.batch_idx - self.lr = self.initial_lr * self.lr_decay_factor ** (current_iter / self.train_loader_len) + for group_name in self.lr.keys(): + self.lr[group_name] = self.initial_lr[group_name] * self.lr_decay_factor ** (current_iter / self.train_loader_len) self.update_lr(context.optimizer, context.epoch, context.batch_idx) def is_lr_scheduling_enabled(self, context): @@ -469,7 +491,8 @@ def perform_scheduling(self, context): effective_max_epochs = self.max_epochs - self.training_params.lr_warmup_epochs - self.training_params.lr_cooldown_epochs current_iter = (self.train_loader_len * effective_epoch + context.batch_idx) / self.training_params.batch_accumulate max_iter = self.train_loader_len * effective_max_epochs / self.training_params.batch_accumulate - self.lr = self.initial_lr * pow((1.0 - (current_iter / max_iter)), 0.9) + for group_name in self.lr.keys(): + self.lr[group_name] = self.initial_lr[group_name] * pow((1.0 - (current_iter / max_iter)), 0.9) self.update_lr(context.optimizer, context.epoch, context.batch_idx) def is_lr_scheduling_enabled(self, context): @@ -498,9 +521,9 @@ def perform_scheduling(self, context): effective_max_epochs = self.max_epochs - self.training_params.lr_warmup_epochs - self.training_params.lr_cooldown_epochs current_iter = max(0, self.train_loader_len * effective_epoch + context.batch_idx - self.training_params.lr_warmup_steps) max_iter = self.train_loader_len * effective_max_epochs - self.training_params.lr_warmup_steps + for group_name in self.lr.keys(): + self.lr[group_name] = float(self.compute_learning_rate(current_iter, max_iter, self.initial_lr[group_name], self.cosine_final_lr_ratio)) - lr = self.compute_learning_rate(current_iter, max_iter, self.initial_lr, self.cosine_final_lr_ratio) - self.lr = float(lr) self.update_lr(context.optimizer, context.epoch, context.batch_idx) def is_lr_scheduling_enabled(self, context): @@ -545,13 +568,14 @@ def is_lr_scheduling_enabled(self, context): def perform_scheduling(self, context): effective_epoch = context.epoch - self.training_params.lr_warmup_epochs effective_max_epochs = self.max_epochs - self.training_params.lr_warmup_epochs - self.training_params.lr_cooldown_epochs - self.lr = self.lr_schedule_function( - initial_lr=self.initial_lr, - epoch=effective_epoch, - iter=context.batch_idx, - max_epoch=effective_max_epochs, - iters_per_epoch=self.train_loader_len, - ) + for group_name in self.lr.keys(): + self.lr[group_name] = self.lr_schedule_function( + initial_lr=self.initial_lr[group_name], + epoch=effective_epoch, + iter=context.batch_idx, + max_epoch=effective_max_epochs, + iters_per_epoch=self.train_loader_len, + ) self.update_lr(context.optimizer, context.epoch, context.batch_idx) diff --git a/src/super_gradients/training/utils/optimizer_utils.py b/src/super_gradients/training/utils/optimizer_utils.py index 0d52ee5acf..394cac8ee2 100755 --- a/src/super_gradients/training/utils/optimizer_utils.py +++ b/src/super_gradients/training/utils/optimizer_utils.py @@ -1,3 +1,5 @@ +import warnings + import torch.nn as nn import torch.optim as optim from super_gradients.common.abstractions.abstract_logger import get_logger @@ -13,6 +15,8 @@ from super_gradients.training.utils.utils import is_model_wrapped from torch.nn.modules.batchnorm import _BatchNorm from torch.nn.modules.conv import 
_ConvNd +from typing import List, Dict, Union +import torch logger = get_logger(__name__) @@ -99,11 +103,16 @@ def build_optimizer(net: nn.Module, lr: float, training_params) -> optim.Optimiz weight_decay = get_param(training_params.optimizer_params, "weight_decay", 0.0) # OPTIMIZER PARAM GROUPS ARE SET USING DEFAULT OR MODEL SPECIFIC INIT - if hasattr(net, "initialize_param_groups"): - # INITIALIZE_PARAM_GROUPS MUST RETURN A LIST OF DICTS WITH 'named_params' AND OPTIMIZER's ATTRIBUTES PER GROUP - net_named_params = net.initialize_param_groups(lr, training_params) - else: - net_named_params = [{"named_params": net.named_parameters()}] + if hasattr(net, "initialize_param_groups") or hasattr(net, "update_param_groups"): + warnings.warn( + "initialize_param_groups and update_param_groups usages are deprecated since 3.4.0, will be removed in " + "3.5.0 and have no effect. \n " + "Assign different learning rates by passing a mapping of layer name prefixes to lr values through " + "initial_lr training hyperparameter (i.e initial_lr={'backbone': 0.01, 'default':0.1})", + DeprecationWarning, + ) + + net_named_params = initialize_param_groups(net, lr) if training_params.zero_weight_decay_on_bias_and_bn: optimizer_training_params = separate_zero_wd_params_groups_for_optimizer(net, net_named_params, weight_decay) @@ -117,6 +126,99 @@ def build_optimizer(net: nn.Module, lr: float, training_params) -> optim.Optimiz optimizer_training_params = net_named_params # CREATE AN OPTIMIZER OBJECT AND INITIALIZE IT - optimizer = optimizer_cls(optimizer_training_params, lr=lr, **training_params.optimizer_params) + optimizer = optimizer_cls(optimizer_training_params, **training_params.optimizer_params) + + return optimizer + + +def separate_lr_groups(model: nn.Module, lr_dict: Dict[str, float]) -> List[Dict]: + """ + Separate parameters based on specified learning rates for each group in the model. + :param model: nn.Module model. + :param lr_dict: Dictionary where keys are group names and values are the learning rates. + :return: List of param groups with named_parameters and corresponding learning rates. + """ + param_groups = [] + default_lr = lr_dict.get("default", None) + if default_lr is None: + raise RuntimeError("When passing initial_lr as dictionary, must pass 'default'.") + group_names = set(lr_dict.keys()) - {"default"} + + for group_name in group_names: + lr = lr_dict[group_name] + named_params = [(name, param) for name, param in model.named_parameters() if name.startswith(group_name)] + + if lr == 0: + for name, param in named_params: + param.requires_grad = False # Freeze the layer + else: + param_groups.append({"named_params": named_params, "lr": lr, "name": group_name}) + + if default_lr != 0: + default_named_params = [ + (name, param) for name, param in model.named_parameters() if all(name.startswith(group) is False for group in group_names) and param.requires_grad + ] + if default_named_params: + param_groups.append({"named_params": default_named_params, "lr": default_lr, "name": "default"}) + + return param_groups + + +def initialize_param_groups(model: nn.Module, lr: Union[float, Dict[str, float]]) -> List[Dict]: + """ + Custom param groups for training with specified learning rates for each group in the model. + :param model: nn.Module model. + :param lr: Dictionary where keys are group names and values are the learning rates, + or a learning rate value when passed as a scalar. + :return: List of param groups. 
+    """
+    if isinstance(lr, float) or isinstance(lr, int):
+        model_named_params = [{"named_params": model.named_parameters(), "lr": lr, "name": "default"}]
+    else:
+        model_named_params = separate_lr_groups(model, lr)
+    return model_named_params
+
+
+def name_optimizer_param_groups_inplace(optimizer: torch.optim.Optimizer) -> torch.optim.Optimizer:
+    """
+    Convert an optimizer's param_groups to use named parameters, modifying it in place.
+
+    :param optimizer: torch.optim.Optimizer, The optimizer to be converted.
+
+    Returns:
+        torch.optim.Optimizer: The same optimizer with modified param_groups.
+    """
+
+    named_parameters = list(optimizer.param_groups[0]["params"])
+    num_param_groups = len(optimizer.param_groups)
+    group_name = [f"group_{i}" for i in range(num_param_groups)] if num_param_groups > 1 else "default"
+
+    for i, param_group in enumerate(optimizer.param_groups):
+        param_group["params"] = named_parameters
+        param_group["name"] = group_name if num_param_groups == 1 else group_name[i]
 
     return optimizer
+
+
+def get_initial_lr_from_optimizer(optimizer: torch.optim.Optimizer) -> Union[Dict[str, float], float]:
+    """
+    Returns the initial learning rate as:
+
+    float - learning rate value when passed as a scalar
+    Dictionary where keys are group names and values are the learning rates.
+    For example {"default": 0.01, "head": 0.1}
+
+    Does so by iterating over the optimizer.param_groups and extracting the "lr" values.
+    If the optimizer was initialized with .parameters() and not named_parameters(), names will be assigned to the
+    optimizer parameter groups by index.
+
+    :param optimizer: torch.optim.Optimizer, The optimizer to extract the lrs from.
+    :return: initial_lr as described above.
+    """
+    if "name" not in optimizer.param_groups[0].keys():
+        optimizer = name_optimizer_param_groups_inplace(optimizer)
+    if len(optimizer.param_groups) == 1:
+        initial_lr = optimizer.param_groups[0]["lr"]
+    else:
+        initial_lr = {group["name"]: group["lr"] for group in optimizer.param_groups}
+    return initial_lr
diff --git a/src/super_gradients/training/utils/sg_trainer_utils.py b/src/super_gradients/training/utils/sg_trainer_utils.py
index 1d269f60d9..8eb0e20cc6 100644
--- a/src/super_gradients/training/utils/sg_trainer_utils.py
+++ b/src/super_gradients/training/utils/sg_trainer_utils.py
@@ -5,11 +5,11 @@
 from dataclasses import dataclass
 from multiprocessing import Process
 from pathlib import Path
-from typing import Tuple, Union, Dict, Sequence, Callable, Optional
+from typing import Tuple, Union, Dict, Sequence, Callable, Optional, List
 import random
 import inspect
-
+from torch import nn
 from super_gradients.common.abstractions.abstract_logger import get_logger
 from treelib import Tree
 from termcolor import colored
@@ -20,8 +20,6 @@
 from super_gradients.common.environment.device_utils import device_config
 from super_gradients.common.exceptions.dataset_exceptions import UnsupportedBatchItemsFormat
 from super_gradients.common.data_types.enum import MultiGPUMode
-
-
 from enum import Enum
@@ -446,6 +444,46 @@ def get_callable_param_names(obj: callable) -> Tuple[str]:
     return tuple(inspect.signature(obj).parameters)
 
 
+def get_lr_info(model: nn.Module, param_groups: List[Dict[str, Union[str, float, List[tuple]]]]) -> str:
+    """
+    Generate a string with information about the model and learning rates for each parameter group.
+
+    Parameters:
+        model (nn.Module): The PyTorch model.
+ param_groups (List[Dict[str, Union[str, float, List[tuple]]]]): List of dictionaries containing information about + each parameter group, including the group name, learning rate, and named parameters. + + Returns: + str: A formatted string with information about the model and learning rates. + """ + total_params = sum(p.numel() for p in model.parameters()) + optimized_params = sum(p.numel() for group in param_groups for p in group["params"]) + + info_str = f" - Model: {type(model).__name__} ({total_params / 1e6:.1f}M parameters" + + if optimized_params >= 1e6: + precision_optimized = max(0, 4 - int(optimized_params / 1e6).bit_length()) + info_str += f", {optimized_params / 1e6:.{precision_optimized}f}M optimized)\n" + else: + precision_optimized = max(0, 4 - int(optimized_params).bit_length()) + info_str += f", {optimized_params:.{precision_optimized}f}M optimized)\n" + + info_str += " - Learning rates:\n" + for group in param_groups: + group_name = group["name"] + group_lr = group["lr"] + group_params = sum(p.numel() for p in group["params"]) + + if group_params >= 1e6: + precision_group = max(0, 4 - int(group_params / 1e6).bit_length()) + info_str += f" - {group_name}: {group_lr} ({group_params / 1e6:.{precision_group}f}M parameters)\n" + else: + precision_group = max(0, 4 - int(group_params).bit_length()) + info_str += f" - {group_name}: {group_lr} ({group_params:.{precision_group}f}M parameters)\n" + + return info_str + + def log_main_training_params( multi_gpu: MultiGPUMode, num_gpus: int, @@ -453,6 +491,8 @@ def log_main_training_params( batch_accumulate: int, train_dataset_length: int, train_dataloader_len: int, + model: nn.Module, + param_groups: List[Dict[str, Union[str, float, List[tuple]]]], max_train_batches: Optional[int] = None, ): """Log training parameters""" @@ -464,7 +504,7 @@ def log_main_training_params( msg = ( "TRAINING PARAMETERS:\n" f" - Mode: {multi_gpu.name if multi_gpu else 'Single GPU'}\n" - f" - Number of GPUs: {num_gpus if 'cuda' in device_config.device else 0:<10} ({torch.cuda.device_count()} available on the machine)\n" + f" - Number of GPUs: {num_gpus if 'cuda' in device_config.device else 0:<10} ({torch.cuda.device_count()} available on the machine)\n" f" - Full dataset size: {train_dataset_length:<10} (len(train_set))\n" f" - Batch size per GPU: {batch_size:<10} (batch_size)\n" f" - Batch Accumulate: {batch_accumulate:<10} (batch_accumulate)\n" @@ -473,6 +513,7 @@ def log_main_training_params( f" - Iterations per epoch: {iterations_per_epoch:<10} ({what_used_str})\n" f" - Gradient updates per epoch: {gradients_updates_per_epoch:<10} ({what_used_str} / batch_accumulate)\n" ) + msg += get_lr_info(model, param_groups) logger.info(msg) diff --git a/tests/deci_core_unit_test_suite_runner.py b/tests/deci_core_unit_test_suite_runner.py index da3bf52c6c..20bf1f6fc2 100644 --- a/tests/deci_core_unit_test_suite_runner.py +++ b/tests/deci_core_unit_test_suite_runner.py @@ -60,7 +60,6 @@ from tests.unit_tests.kd_trainer_test import KDTrainerTest from tests.unit_tests.dice_loss_test import DiceLossTest from tests.unit_tests.iou_loss_test import IoULossTest -from tests.unit_tests.update_param_groups_unit_test import UpdateParamGroupsTest from tests.unit_tests.vit_unit_test import TestViT from tests.unit_tests.yolo_nas_tests import TestYOLONAS from tests.unit_tests.yolox_unit_test import TestYOLOX @@ -123,7 +122,6 @@ def _add_modules_to_unit_tests_suite(self): self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(LRCooldownTest)) 
self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(DetectionTargetsTransformTest)) self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(ForwardpassPrepFNTest)) - self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(UpdateParamGroupsTest)) self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(MaskAttentionLossTest)) self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(IoULossTest)) self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(TestDetectionDatasetSubsampling)) diff --git a/tests/unit_tests/lr_warmup_test.py b/tests/unit_tests/lr_warmup_test.py index 2521090499..49b87fac63 100644 --- a/tests/unit_tests/lr_warmup_test.py +++ b/tests/unit_tests/lr_warmup_test.py @@ -30,15 +30,20 @@ class ExponentialWarmupLRCallback(LRCallbackBase): def __init__(self, **kwargs): super().__init__(Phase.TRAIN_EPOCH_START, **kwargs) - self.warmup_initial_lr = self.training_params.warmup_initial_lr or 0.001 + warmup_initial_lr = self.training_params.warmup_initial_lr or 0.001 + if isinstance(warmup_initial_lr, float): + warmup_initial_lr = {"default": warmup_initial_lr} + self.warmup_initial_lr = warmup_initial_lr warmup_epochs = self.training_params.lr_warmup_epochs lr_start = self.warmup_initial_lr lr_end = self.initial_lr - self.c1 = (lr_end - lr_start) / (np.exp(warmup_epochs) - 1.0) - self.c2 = (lr_start * np.exp(warmup_epochs) - lr_end) / (np.exp(warmup_epochs) - 1.0) + self.c1 = {group_name: (lr_end[group_name] - lr_start[group_name]) / (np.exp(warmup_epochs) - 1.0) for group_name in self.lr.keys()} + self.c2 = { + group_name: (lr_start[group_name] * np.exp(warmup_epochs) - lr_end[group_name]) / (np.exp(warmup_epochs) - 1.0) for group_name in self.lr.keys() + } def perform_scheduling(self, context): - self.lr = self.c1 * np.exp(context.epoch) + self.c2 + self.lr = {group_name: self.c1[group_name] * np.exp(context.epoch) + self.c2[group_name] for group_name in self.lr.keys()} self.update_lr(context.optimizer, context.epoch, None) def is_lr_scheduling_enabled(self, context): diff --git a/tests/unit_tests/test_lr_assignment.py b/tests/unit_tests/test_lr_assignment.py new file mode 100644 index 0000000000..ba8e27bc41 --- /dev/null +++ b/tests/unit_tests/test_lr_assignment.py @@ -0,0 +1,183 @@ +import unittest +from copy import deepcopy + +from super_gradients import Trainer +from super_gradients.common.object_names import Models +from super_gradients.training.dataloaders.dataloaders import classification_test_dataloader +from super_gradients.training.metrics import Accuracy +from super_gradients.training.models import LeNet +from super_gradients.training.utils import HpmStruct +from super_gradients.training.utils.optimizer_utils import separate_lr_groups +from super_gradients.training import models +from super_gradients.training.utils.utils import check_models_have_same_weights + + +class TestSeparateLRGroups(unittest.TestCase): + def test_all_parameters_covered(self): + model = LeNet() # Create your model + lr_dict = {"fc3": 0.01, "fc2": 0.001, "fc1": 0.005, "default": 0.1} + + param_groups = separate_lr_groups(model, lr_dict) + + all_params = set() + for group in param_groups: + all_params.update(param[0] for param in group["named_params"]) + + all_named_params = set(param[0] for param in model.named_parameters()) + + self.assertEqual(all_params, all_named_params) + + def test_no_parameter_intersection(self): + model = LeNet() # Create your model + lr_dict = {"fc3": 0.01, "fc2": 0.001, "fc1": 0.005, "default": 0.1} + + 
param_groups = separate_lr_groups(model, lr_dict) + + for group1 in param_groups: + for group2 in param_groups: + if group1 != group2: + intersection = set(param[0] for param in group1["named_params"]).intersection(set(param[0] for param in group2["named_params"])) + self.assertEqual(len(intersection), 0) + + def test_ddrnet_param_groups_consistency(self): + model = models.get(Models.DDRNET_23, pretrained_weights="cityscapes") + lr_dict = { + "default": 0.075, + # backbone layers + "_backbone": 0.0075, + "compression3": 0.0075, + "compression4": 0.0075, + "down3": 0.0075, + "down4": 0.0075, + "layer3_skip": 0.0075, + "layer4_skip": 0.0075, + "layer5_skip": 0.0075, + } + + param_groups = separate_lr_groups(model, lr_dict) + param_groups_old = model.initialize_param_groups(0.0075, training_params=HpmStruct(multiply_head_lr=10)) + + self._check_param_groups_assign_same_lrs(param_groups, param_groups_old) + + def test_ppliteseg_param_groups_consistency(self): + model = models.get(Models.PP_LITE_T_SEG50, pretrained_weights="cityscapes") + lr_dict = {"encoder.backbone": 0.01, "default": 0.1} + + param_groups = separate_lr_groups(model, lr_dict) + param_groups_old = model.initialize_param_groups(0.01, training_params=HpmStruct(multiply_head_lr=10)) + + self._check_param_groups_assign_same_lrs(param_groups, param_groups_old) + + def test_stdc_param_groups_consistency(self): + model = models.get(Models.STDC1_SEG50, pretrained_weights="cityscapes") + lr_dict = {"cp": 0.005, "default": 0.05} + + param_groups = separate_lr_groups(model, lr_dict) + param_groups_old = model.initialize_param_groups(0.005, training_params=HpmStruct(multiply_head_lr=10, loss=None)) + + self._check_param_groups_assign_same_lrs(param_groups, param_groups_old) + + def test_regseg_param_groups_consistency(self): + model = models.get(Models.REGSEG48, pretrained_weights="cityscapes") + lr_dict = {"head.": 0.05, "default": 0.005} + + param_groups = separate_lr_groups(model, lr_dict) + param_groups_old = model.initialize_param_groups(0.005, training_params=HpmStruct(multiply_head_lr=10, loss=None)) + + self._check_param_groups_assign_same_lrs(param_groups, param_groups_old) + + def test_segformer_param_groups_consistency(self): + model = models.get(Models.SEGFORMER_B0, pretrained_weights="cityscapes") + lr_dict = {"default": 0.05, "_backbone": 0.005} + + param_groups = separate_lr_groups(model, lr_dict) + param_groups_old = model.initialize_param_groups(0.005, training_params=HpmStruct(multiply_head_lr=10, loss=None)) + + self._check_param_groups_assign_same_lrs(param_groups, param_groups_old) + + def test_requires_grad_false(self): + # Test when some layers have requires_grad==False + model = LeNet() + lr_dict = {"fc2": 0.001, "fc1": 0.005, "default": 0.1} + for param in model.fc3.parameters(): + param.requires_grad = False + + param_groups = separate_lr_groups(model, lr_dict) + total_optimizable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + # Extract tensors from the "named_params" entry in each dictionary + tensors_in_param_groups = [entry[1] for group in param_groups for entry in group["named_params"]] + total_params_in_param_groups = sum(t.numel() for t in tensors_in_param_groups) + + self.assertEqual(total_params_in_param_groups, total_optimizable_params) + + def test_initial_lr_zero(self): + # Test case when initial_lr = {"default": 1, "some_layer": 0} + model = LeNet() + lr_dict = { + "default": 1, + "fc1": 0, + } + + param_groups = separate_lr_groups(model, lr_dict) + 
total_non_optimizable_params = sum(p.numel() for p in model.parameters() if not p.requires_grad) + total_model_params = sum(p.numel() for p in model.parameters()) + + # Extract tensors from the "named_params" entry in each dictionary + tensors_in_param_groups = [entry[1] for group in param_groups for entry in group["named_params"]] + total_params_in_param_groups = sum(t.numel() for t in tensors_in_param_groups) + + self.assertEqual(total_params_in_param_groups, total_model_params - total_non_optimizable_params) + + def test_train_with_lr_assignment(self): + # Define Model + net = LeNet() + net_before_train = deepcopy(net) + + trainer = Trainer("test_train_with_lr_assignment") + + train_params = { + "max_epochs": 3, + "lr_updates": [], + "lr_decay_factor": 0.1, + "lr_mode": "StepLRScheduler", + "initial_lr": { + "default": 0, + "fc3": 0.1, + }, + "loss": "CrossEntropyLoss", + "optimizer": "SGD", + "criterion_params": {}, + "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, + "train_metrics_list": [Accuracy()], + "valid_metrics_list": [Accuracy()], + "metric_to_watch": "Accuracy", + "greater_metric_to_watch_is_better": True, + "ema": False, + "phase_callbacks": [], + } + + trainer.train( + model=net, + training_params=train_params, + train_loader=classification_test_dataloader(batch_size=4), + valid_loader=classification_test_dataloader(batch_size=4), + ) + + self.assertTrue(check_models_have_same_weights(net_before_train.conv1, net.conv1)) + self.assertTrue(check_models_have_same_weights(net_before_train.conv2, net.conv2)) + self.assertTrue(check_models_have_same_weights(net_before_train.fc1, net.fc1)) + self.assertTrue(check_models_have_same_weights(net_before_train.fc2, net.fc2)) + self.assertFalse(check_models_have_same_weights(net_before_train.fc3, net.fc3)) + + def _check_param_groups_assign_same_lrs(self, param_groups, param_groups_old): + names_lr_pairs = set([(sub_group[0], group["lr"]) for group in param_groups for sub_group in group["named_params"]]) + names_lr_pairs_old = set([(sub_group[0], group["lr"]) for group in param_groups_old for sub_group in group["named_params"]]) + self.assertEqual(set(names_lr_pairs_old), set(names_lr_pairs)) + + if __name__ == "__main__": + unittest.main() + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit_tests/train_with_intialized_param_args_test.py b/tests/unit_tests/train_with_intialized_param_args_test.py index d1dcefbd22..d2b99d993e 100644 --- a/tests/unit_tests/train_with_intialized_param_args_test.py +++ b/tests/unit_tests/train_with_intialized_param_args_test.py @@ -54,7 +54,6 @@ def test_train_with_external_optimizer(self): "lr_decay_factor": 0.1, "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, - "initial_lr": 0.1, "loss": "CrossEntropyLoss", "optimizer": optimizer, "criterion_params": {}, @@ -80,7 +79,6 @@ def test_train_with_external_scheduler(self): "max_epochs": 2, "phase_callbacks": phase_callbacks, "lr_warmup_epochs": 0, - "initial_lr": lr, "loss": "CrossEntropyLoss", "optimizer": optimizer, "criterion_params": {}, @@ -102,7 +100,7 @@ def test_train_with_external_scheduler_class(self): train_params = { "max_epochs": 2, "lr_warmup_epochs": 0, - "initial_lr": 0.3, + "initial_lr": 0.1, "loss": "CrossEntropyLoss", "optimizer": optimizer, "criterion_params": {}, @@ -127,7 +125,6 @@ def test_train_with_reduce_on_plateau(self): "max_epochs": 2, "phase_callbacks": phase_callbacks, "lr_warmup_epochs": 0, - "initial_lr": lr, "loss": "CrossEntropyLoss", "optimizer": optimizer, "criterion_params": {}, diff 
--git a/tests/unit_tests/update_param_groups_unit_test.py b/tests/unit_tests/update_param_groups_unit_test.py deleted file mode 100644 index e4edd4ca02..0000000000 --- a/tests/unit_tests/update_param_groups_unit_test.py +++ /dev/null @@ -1,56 +0,0 @@ -import unittest -from super_gradients.training import Trainer -from super_gradients.training.dataloaders.dataloaders import classification_test_dataloader -from super_gradients.training.metrics import Accuracy -from super_gradients.training.models import LeNet -from super_gradients.training.utils import HpmStruct, get_param -from super_gradients.training.utils.callbacks import TestLRCallback -import numpy as np - - -class TestNet(LeNet): - """ - Toy test net with update_param_groups method that hard codes some lr. - """ - - def __init__(self): - super(TestNet, self).__init__() - - def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list: - initial_lr = get_param(training_params, "initial_lr") - for param_group in param_groups: - param_group["lr"] = initial_lr * (epoch + 1) - return param_groups - - -class UpdateParamGroupsTest(unittest.TestCase): - def test_lr_scheduling_with_update_param_groups(self): - # Define Model - net = TestNet() - trainer = Trainer("lr_warmup_test") - - lrs = [] - phase_callbacks = [TestLRCallback(lr_placeholder=lrs)] - - train_params = { - "max_epochs": 3, - "lr_mode": "StepLRScheduler", - "lr_updates": [0, 1, 2], - "initial_lr": 0.1, - "lr_decay_factor": 1, - "loss": "CrossEntropyLoss", - "optimizer": "SGD", - "criterion_params": {}, - "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, - "train_metrics_list": [Accuracy()], - "valid_metrics_list": [Accuracy()], - "metric_to_watch": "Accuracy", - "greater_metric_to_watch_is_better": True, - "ema": False, - "phase_callbacks": phase_callbacks, - } - - expected_lrs = np.array([0.1, 0.2, 0.3]) - trainer.train(model=net, training_params=train_params, train_loader=classification_test_dataloader(), valid_loader=classification_test_dataloader()) - - self.assertTrue(np.allclose(np.array(lrs), expected_lrs, rtol=0.0000001))
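As a closing illustration, the following sketch inspects what the new optimizer_utils helpers produce for a per-layer lr mapping. It assumes a super-gradients build with this patch applied; LeNet and the fc1/fc3 prefixes follow the unit tests above.

from super_gradients.training.models import LeNet
from super_gradients.training.utils.optimizer_utils import initialize_param_groups, separate_lr_groups

model = LeNet()

# A scalar lr yields a single "default" group covering all parameters.
scalar_groups = initialize_param_groups(model, 0.1)
print([group["name"] for group in scalar_groups])  # ['default']

# A mapping yields one group per prefix plus "default"; an lr of 0 (fc1 here)
# freezes the matching parameters and leaves them out of the returned groups.
lr_dict = {"default": 0.1, "fc3": 0.01, "fc1": 0}
for group in separate_lr_groups(model, lr_dict):
    num_params = sum(param.numel() for _, param in group["named_params"])
    print(group["name"], group["lr"], num_params)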