
Commit

Allow setting per-layer learning rates (#1612)
* updated schedulers, warmup and logging

* updated recipes

* updated tests

* updated some typing and docs

* updated some typing and docs

* removed update_param_groups test

* handled training with instantiated optimizer

* updated instantiated optimizer tests

* fixed erased initial_lr in unit test

* updated message and tests

* added train test

---------

Co-authored-by: Eugene Khvedchenya <ekhvedchenya@gmail.com>
shaydeci and BloodAxe authored Nov 13, 2023
1 parent aa58407 commit f8686cd
Showing 19 changed files with 495 additions and 140 deletions.
12 changes: 11 additions & 1 deletion src/super_gradients/recipes/cityscapes_ddrnet.yaml
@@ -55,7 +55,17 @@ architecture: ddrnet_23
 
 training_hyperparams:
   max_epochs: 500
-  initial_lr: 0.0075 # batch size 24
+  initial_lr: # batch size 24
+    default: 0.075
+    # backbone layers
+    _backbone: 0.0075
+    compression3: 0.0075
+    compression4: 0.0075
+    down3: 0.0075
+    down4: 0.0075
+    layer3_skip: 0.0075
+    layer4_skip: 0.0075
+    layer5_skip: 0.0075
   loss:
     DiceCEEdgeLoss:
       num_classes: 19
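The per-layer form used above maps directly onto a plain Python dict. A minimal sketch, mirroring the recipe values and nothing beyond the keys shown in the diff; keys are prefixes of the model's named parameters, and "default" covers every parameter not matched by another key (see the updated train() docstring further down):

initial_lr = {
    "default": 0.075,  # applied to every parameter not covered by another key
    # backbone layers get a 10x smaller learning rate
    "_backbone": 0.0075,
    "compression3": 0.0075,
    "compression4": 0.0075,
    "down3": 0.0075,
    "down4": 0.0075,
    "layer3_skip": 0.0075,
    "layer4_skip": 0.0075,
    "layer5_skip": 0.0075,
}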
14 changes: 13 additions & 1 deletion src/super_gradients/recipes/cityscapes_kd_base.yaml
@@ -50,7 +50,19 @@ resume: False
 training_hyperparams:
   sync_bn: True
   max_epochs: 500
-  initial_lr: 0.0075 # batch size 24
+
+  initial_lr: # batch size 24
+    default: 0.075
+    # backbone layers
+    _backbone: 0.0075
+    compression3: 0.0075
+    compression4: 0.0075
+    down3: 0.0075
+    down4: 0.0075
+    layer3_skip: 0.0075
+    layer4_skip: 0.0075
+    layer5_skip: 0.0075
+
   resume: ${resume}
   loss:
     _target_: super_gradients.training.losses.seg_kd_loss.SegKDLoss
3 changes: 3 additions & 0 deletions src/super_gradients/recipes/cityscapes_pplite_seg50.yaml
@@ -67,6 +67,9 @@ checkpoint_params:
 
 training_hyperparams:
   sync_bn: True
+  initial_lr:
+    "encoder.backbone": 0.01
+    default: 0.1
   loss:
     DiceCEEdgeLoss:
       num_classes: 19
4 changes: 4 additions & 0 deletions src/super_gradients/recipes/cityscapes_pplite_seg75.yaml
@@ -62,6 +62,10 @@ checkpoint_params:
 
 training_hyperparams:
   sync_bn: True
+  initial_lr:
+    "encoder.backbone": 0.01
+    default: 0.1
+
   loss:
     DiceCEEdgeLoss:
       num_classes: 19
3 changes: 3 additions & 0 deletions src/super_gradients/recipes/cityscapes_stdc_seg50.yaml
@@ -60,6 +60,9 @@ checkpoint_params:
   strict_load: no_key_matching
 
 training_hyperparams:
+  initial_lr:
+    cp: 0.01
+    default: 0.1
   sync_bn: True
   loss:
     DiceCEEdgeLoss:
5 changes: 4 additions & 1 deletion src/super_gradients/recipes/cityscapes_stdc_seg75.yaml
@@ -64,7 +64,10 @@ checkpoint_params:
   strict_load: no_key_matching
 
 training_hyperparams:
-  initial_lr: 0.005
+  initial_lr:
+    cp: 0.005
+    default: 0.05
+
   sync_bn: True
 
   loss:
@@ -8,7 +8,9 @@ warmup_initial_lr: 1e-6
 lr_warmup_steps: 1000
 lr_warmup_epochs: 0
 
-initial_lr: 2e-4
+initial_lr: 2e-4
+
+
 lr_mode: CosineLRScheduler
 cosine_final_lr_ratio: 0.1
 
@@ -3,7 +3,7 @@
 import numpy as np
 import torch
 from torch import Tensor
-from torch.utils.data import default_collate
+from torch.utils.data.dataloader import default_collate
 
 from super_gradients.common.registry.registry import register_collate_function
 from super_gradients.training.samples import PoseEstimationSample
@@ -458,7 +458,7 @@ def _separate_lr_multiply_params(self):
         backbone_names = [n for n, p in self.backbone.named_parameters()]
         multiply_lr_params, no_multiply_params = {}, {}
         for name, param in self.named_parameters():
-            if name in backbone_names:
+            if any([backbone_name in name for backbone_name in backbone_names]):
                 no_multiply_params[name] = param
             else:
                 multiply_lr_params[name] = param
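The one-line change above replaces an exact-membership test with a substring test: backbone_names holds names relative to self.backbone (e.g. '0.weight'), while self.named_parameters() yields fully qualified names (e.g. 'backbone.0.weight'), so the old check never matched. A small self-contained reproduction with a made-up toy module:

from torch import nn

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 4))
        self.head = nn.Linear(4, 2)

model = Toy()
backbone_names = [n for n, _ in model.backbone.named_parameters()]  # e.g. '0.weight', '2.bias'
all_names = [n for n, _ in model.named_parameters()]                # e.g. 'backbone.0.weight', 'head.bias'

# Old test: exact membership never matches, because full names carry the 'backbone.' prefix.
assert not any(name in backbone_names for name in all_names)

# New test: substring matching recovers exactly the backbone parameters.
backbone_params = [n for n in all_names if any(b in n for b in backbone_names)]
assert backbone_params == [n for n in all_names if n.startswith("backbone.")]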
8 changes: 7 additions & 1 deletion src/super_gradients/training/params.py
@@ -51,6 +51,7 @@
     "warmup_mode": "LinearEpochLRWarmup",
     "step_lr_update_freq": None,
     "lr_updates": [],
+    "initial_lr": None,
     "clip_grad_norm": None,
     "pre_prediction_callback": None,
     "ckpt_best_name": "ckpt_best.pth",
@@ -98,7 +99,12 @@
         # "lr_updates": {"type": "array", "minItems": 1},
         "lr_decay_factor": {"type": "number", "minimum": 0, "maximum": 1},
         "lr_warmup_epochs": {"type": "number", "minimum": 0, "maximum": 10},
-        "initial_lr": {"type": "number", "exclusiveMinimum": 0, "maximum": 10},
+        "initial_lr": {
+            "anyOf": [
+                {"type": ["number", "string", "boolean", "null"]},
+                {"type": "object", "patternProperties": {"^[a-zA-Z0-9_.]+$": {"type": "number"}}, "additionalProperties": False},
+            ]
+        },
     },
     "if": {"properties": {"lr_mode": {"const": "StepLRScheduler"}}},
     "then": {"required": ["lr_updates", "lr_decay_factor"]},
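Under the widened schema, initial_lr may be a scalar (or a string/None as it can arrive from a recipe before interpolation) or a mapping from name prefixes to numbers. A quick sanity check of that fragment using the third-party jsonschema package, as a hedged illustration rather than part of this repository's tests:

from jsonschema import validate, ValidationError

# The "initial_lr" fragment added above, reproduced verbatim.
initial_lr_schema = {
    "anyOf": [
        {"type": ["number", "string", "boolean", "null"]},
        {"type": "object", "patternProperties": {"^[a-zA-Z0-9_.]+$": {"type": "number"}}, "additionalProperties": False},
    ]
}

validate(0.01, initial_lr_schema)                                         # scalar: accepted
validate({"default": 0.1, "encoder.backbone": 0.01}, initial_lr_schema)   # per-group dict: accepted

try:
    validate({"head": "0.1"}, initial_lr_schema)                          # string value inside the dict: rejected
except ValidationError:
    pass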
44 changes: 31 additions & 13 deletions src/super_gradients/training/sg_trainer/sg_trainer.py
@@ -76,7 +76,7 @@
     broadcast_from_master,
 )
 from super_gradients.training.utils.ema import ModelEMA
-from super_gradients.training.utils.optimizer_utils import build_optimizer
+from super_gradients.training.utils.optimizer_utils import build_optimizer, get_initial_lr_from_optimizer
 from super_gradients.training.utils.sg_trainer_utils import MonitoredValue, log_main_training_params
 from super_gradients.training.utils.utils import fuzzy_idx_in_list, unwrap_model
 from super_gradients.training.utils.weight_averaging_utils import ModelWeightAveraging
@@ -906,9 +906,16 @@ def train(
                 Final learning rate ratio (only relevant when `lr_mode`='CosineLRScheduler'). The cosine starts from initial_lr and reaches
                 initial_lr * cosine_final_lr_ratio in last epoch
 
-            - `inital_lr` : float
+            - `inital_lr` : Union[float, Dict[str, float]]
 
-                Initial learning rate.
+                Initial learning rate as:
+
+                    float - learning rate value when passed as a scalar
+                    Dictionary where keys are group names and values are the learning rates.
+                    For example {"default": 0.01, "head": 0.1}
+                    - Keys in such mapping are prefixes of named parameters of the model.
+                    - The "default" key is mandatory, and its lr value is set for any group not specified in the other keys.
+                    - It is also possible to freeze some parts of the model by assigning 0 as the lr value.
 
             - `loss` : Union[nn.module, str]
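To make the grouping rule from the docstring concrete, here is a hedged, stand-alone sketch of turning such a mapping into optimizer param groups. It is not the repository's build_optimizer implementation; the helper name and the toy model are placeholders:

from typing import Dict, List

import torch
from torch import nn


def group_params_by_prefix(model: nn.Module, initial_lr: Dict[str, float]) -> List[dict]:
    """Illustrative only: send each named parameter to the first non-"default"
    prefix it matches, and everything else to the "default" group."""
    prefixes = [key for key in initial_lr if key != "default"]
    groups = {name: {"name": name, "params": [], "lr": lr} for name, lr in initial_lr.items()}
    for param_name, param in model.named_parameters():
        target = next((p for p in prefixes if param_name.startswith(p)), "default")
        groups[target]["params"].append(param)
    # An lr of 0 effectively freezes a group (plain SGD applies no update to it).
    return [group for group in groups.values() if group["params"]]


model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 2))  # placeholder model
param_groups = group_params_by_prefix(model, {"default": 0.01, "1": 0.001})
optimizer = torch.optim.SGD(param_groups)  # each group keeps its own lr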
@@ -1310,6 +1317,20 @@ def forward(self, inputs, targets):
         else:
             raise RuntimeError("warmup_mode has to be either a name of a mode (str) or a subclass of PhaseCallback")
 
+        if isinstance(self.training_params.optimizer, str) or (
+            inspect.isclass(self.training_params.optimizer) and issubclass(self.training_params.optimizer, torch.optim.Optimizer)
+        ):
+            self.optimizer = build_optimizer(net=unwrap_model(self.net), lr=self.training_params.initial_lr, training_params=self.training_params)
+        elif isinstance(self.training_params.optimizer, torch.optim.Optimizer):
+            if self.training_params.initial_lr is not None:
+                raise RuntimeError("An instantiated optimizer cannot be passed along initial_lr != None")
+            self.optimizer = self.training_params.optimizer
+
+            # NEED TO EXTRACT INITAL_LR FROM THE OPTIMIZER PARAM GROUPS
+            self.training_params.initial_lr = get_initial_lr_from_optimizer(self.optimizer)
+        else:
+            raise UnsupportedOptimizerFormat()
+
         if warmup_callback_cls is not None:
             self.phase_callbacks.append(
                 warmup_callback_cls(
@@ -1343,15 +1364,6 @@ def forward(self, inputs, targets):
             self._reset_best_metric()
             load_opt_params = False
 
-        if isinstance(self.training_params.optimizer, str) or (
-            inspect.isclass(self.training_params.optimizer) and issubclass(self.training_params.optimizer, torch.optim.Optimizer)
-        ):
-            self.optimizer = build_optimizer(net=unwrap_model(self.net), lr=self.training_params.initial_lr, training_params=self.training_params)
-        elif isinstance(self.training_params.optimizer, torch.optim.Optimizer):
-            self.optimizer = self.training_params.optimizer
-        else:
-            raise UnsupportedOptimizerFormat()
-
         if self.lr_mode is not None:
             lr_scheduler_callback = create_lr_scheduler_callback(
                 lr_mode=self.lr_mode,
@@ -1448,6 +1460,8 @@ def forward(self, inputs, targets):
             train_dataset_length=len(self.train_loader.dataset),
             train_dataloader_len=len(self.train_loader),
             max_train_batches=self.max_train_batches,
+            model=unwrap_model(self.net),
+            param_groups=self.optimizer.param_groups,
         )
 
         self._maybe_set_preprocessing_params_for_model_from_dataset()
@@ -1992,7 +2006,11 @@ def _get_epoch_start_logging_values(self) -> dict:
         """Get all the values that should be logged at the start of each epoch.
         This is useful for values like Learning Rate that can change over an epoch."""
         lrs = [self.optimizer.param_groups[i]["lr"] for i in range(len(self.optimizer.param_groups))]
-        lr_titles = ["LR/Param_group_" + str(i) for i in range(len(self.optimizer.param_groups))] if len(self.optimizer.param_groups) > 1 else ["LR"]
+        lr_titles = (
+            ["LR/" + self.optimizer.param_groups[i].get("name", str(i)) for i in range(len(self.optimizer.param_groups))]
+            if len(self.optimizer.param_groups) > 1
+            else ["LR"]
+        )
         lr_dict = {lr_titles[i]: lrs[i] for i in range(len(lrs))}
         return lr_dict
 
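The remaining hunks handle a pre-instantiated optimizer: passing one together with a non-None initial_lr now raises, and the initial learning rates are instead read back from the optimizer via get_initial_lr_from_optimizer. That helper's body is not shown in this diff, so the sketch below is only a guess at its contract; it also shows how the new per-group log titles are derived from a param group's optional "name" key:

import torch
from torch import nn


def get_initial_lr_sketch(optimizer: torch.optim.Optimizer) -> dict:
    """Guess at the helper's contract: map each param group, by its "name" if present
    and its index otherwise, to the lr it was constructed with."""
    return {str(group.get("name", i)): group["lr"] for i, group in enumerate(optimizer.param_groups)}


model = nn.Linear(4, 2)  # placeholder model
optimizer = torch.optim.SGD(
    [
        {"name": "default", "params": [model.weight], "lr": 0.1},
        {"name": "bias", "params": [model.bias], "lr": 0.01},
    ]
)

print(get_initial_lr_sketch(optimizer))  # {'default': 0.1, 'bias': 0.01}

# The logging change above then yields one curve per group:
lr_titles = ["LR/" + str(group.get("name", i)) for i, group in enumerate(optimizer.param_groups)]
print(lr_titles)  # ['LR/default', 'LR/bias']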