From 911c39a4e4842418e7259d70b0d081d48eafe244 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 3 Jun 2024 10:41:01 -0700 Subject: [PATCH 01/21] add reset_lr functionality Signed-off-by: dimapihtar --- .../conf/megatron_gpt_config.yaml | 19 +++++++-------- .../language_modeling/megatron_gpt_model.py | 7 ++++++ nemo/core/optim/lr_scheduler.py | 23 +++++++++++++------ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index ca0c3f74e4c8..be46747ea27c 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -9,14 +9,14 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. - max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: 100 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 - val_check_interval: 100 + val_check_interval: 40 limit_val_batches: 50 limit_test_batches: 500 accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models @@ -121,7 +121,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 # Fusion @@ -240,9 +240,9 @@ model: # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} # Or see example below: # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - data_prefix: ??? + data_prefix: [] index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: mmap + data_impl: mock mmap_bin_files: True splits_string: 900,50,50 seq_length: ${model.encoder_seq_length} @@ -267,7 +267,7 @@ model: gen_shape: False # Generate model and kernel details including input shapes optim: - name: fused_adam + name: distributed_fused_adam lr: 2e-4 weight_decay: 0.01 betas: @@ -275,9 +275,10 @@ model: - 0.98 sched: name: CosineAnnealing - warmup_steps: 500 - constant_steps: 0 + warmup_steps: 15 + constant_steps: 15 min_lr: 2e-5 + reset_lr: True gc_interval: 0 # Interval of the host memory garbage collection. When it is zero, collectiion relies on the automatic garbage collector. 
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index a5b4450c7b44..3ec440c95af4 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -305,6 +305,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.if_first_step = 0 self.prev_global_batch_size = None + self.if_init_step = True + if cfg.get('data', None) is not None: self.reset_position_ids = cfg.data.get('reset_position_ids', False) self.reset_attention_mask = cfg.data.get('reset_attention_mask', False) @@ -757,6 +759,10 @@ def training_step(self, dataloader_iter): The input batch to each micro-batch is fetched using the dataloader function in the micro-batch fwd function. """ + if self.if_init_step: + self._optimizer.param_groups[0]['step'] = self.trainer.global_step + self.if_init_step = False + # Initialize userbuffer communicators. if self.initialize_ub: self.initialize_ub_func() @@ -914,6 +920,7 @@ def training_step(self, dataloader_iter): self.log('loss_scale', loss_scale, batch_size=1) lr = self._optimizer.param_groups[0]['lr'] + self.log('lr', lr, rank_zero_only=True, batch_size=1) self.log( 'global_step', diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index 473ca0f5c416..e51ed03f28e3 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -42,7 +42,7 @@ class WarmupPolicy(_LRScheduler): infinite training """ - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1): + def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1, reset_lr=False): assert not ( warmup_steps is not None and warmup_ratio is not None ), "Either use particular number of step or ratio" @@ -59,6 +59,7 @@ def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, max_steps self.warmup_steps = 0 self.min_lr = min_lr + self.reset_lr = reset_lr super().__init__(optimizer, last_epoch) def get_lr(self): @@ -97,7 +98,7 @@ class SquareRootConstantPolicy(_LRScheduler): """ def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 + self, optimizer, *, constant_steps=None, constant_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1, reset_lr=False ): assert not ( constant_steps is not None and constant_ratio is not None @@ -116,6 +117,7 @@ def __init__( self.constant_lr = 1 / (constant_steps ** 0.5) self.min_lr = min_lr + self.reset_lr = reset_lr super().__init__(optimizer, last_epoch) def get_lr(self): @@ -240,6 +242,7 @@ def __init__( max_steps=None, min_lr=0.0, last_epoch=-1, + reset_lr=False, ): assert not ( warmup_steps is not None and warmup_ratio is not None @@ -270,6 +273,8 @@ def __init__( self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) self.min_lr = min_lr + self.reset_lr = reset_lr + print(self.reset_lr) super().__init__(optimizer, last_epoch) def get_lr(self): @@ -277,8 +282,11 @@ def get_lr(self): warnings.warn( "To get the last learning rate computed by the scheduler, please use `get_last_lr()`.", UserWarning ) - - step = self.last_epoch + + if self.reset_lr and 'step' in self.optimizer.param_groups[0].keys(): + step = self.last_epoch - self.optimizer.param_groups[0]['step'] + else: + step = self.last_epoch # Warmup steps if self.warmup_steps > 0 and step <= 
self.warmup_steps: @@ -401,8 +409,8 @@ def _get_lr(self, step): class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, **kwargs) + def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, reset_lr=False, **kwargs): + super().__init__(optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, reset_lr=reset_lr, **kwargs) def _get_lr(self, step): for initial_lr in self.base_lrs: @@ -453,7 +461,7 @@ def _get_linear_warmup_with_cosine_annealing_lr(self, step): class NoamAnnealing(_LRScheduler): def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 + self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1, reset_lr=False ): self._normalize = d_model ** (-0.5) assert not ( @@ -472,6 +480,7 @@ def __init__( self.warmup_steps = 0 self.min_lr = min_lr + self.reset_lr = reset_lr super().__init__(optimizer, last_epoch) def get_lr(self): From 7802851d81352a3bdb24d27be09e2f9f3e89bbe1 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 3 Jun 2024 16:17:29 -0700 Subject: [PATCH 02/21] fix reset_lr logic Signed-off-by: dimapihtar --- .../conf/megatron_gpt_config.yaml | 13 +++++----- .../language_modeling/megatron_base_model.py | 2 +- .../language_modeling/megatron_gpt_model.py | 4 ++- nemo/core/optim/lr_scheduler.py | 26 +++++++++++-------- 4 files changed, 25 insertions(+), 20 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index be46747ea27c..21f456ca61d8 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -14,9 +14,9 @@ trainer: enable_checkpointing: False use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. - max_steps: 100 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 40 + max_steps: 150 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 1 + val_check_interval: 50 limit_val_batches: 50 limit_test_batches: 500 accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models @@ -28,10 +28,10 @@ exp_manager: explicit_log_dir: null exp_dir: null name: megatron_gpt - create_wandb_logger: False + create_wandb_logger: True wandb_logger_kwargs: - project: null - name: null + project: reset_lr_test + name: test_run create_neptune_logger: false neptune_logger_kwargs: project: null @@ -279,7 +279,6 @@ model: constant_steps: 15 min_lr: 2e-5 reset_lr: True - gc_interval: 0 # Interval of the host memory garbage collection. When it is zero, collectiion relies on the automatic garbage collector. # If an interger value larger than zero is set, collection is done manually by the batch step interval of `gc_interval`. 
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index a27f9fd5e5e4..f2ab132256c3 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -841,7 +841,7 @@ def configure_optimizers(self): if hasattr(self._cfg.optim, 'sched'): sched_config = self._cfg.optim.sched self._scheduler = prepare_lr_scheduler( - optimizer=self._optimizer, scheduler_config=sched_config, train_dataloader=self._train_dl + optimizer=self._optimizer, scheduler_config=sched_config, train_dataloader=self._train_dl, ) if getattr(self._cfg.optim, 'sched', None) is not None and self._scheduler is None: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 3ec440c95af4..4df5166187e8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -759,8 +759,10 @@ def training_step(self, dataloader_iter): The input batch to each micro-batch is fetched using the dataloader function in the micro-batch fwd function. """ - if self.if_init_step: + + if self.if_init_step and self.cfg.optim.sched.get('reset_lr', False): self._optimizer.param_groups[0]['step'] = self.trainer.global_step + self._optimizer.param_groups[0]['reset_lr'] = True self.if_init_step = False # Initialize userbuffer communicators. diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index e51ed03f28e3..e915bc9ac315 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -42,7 +42,7 @@ class WarmupPolicy(_LRScheduler): infinite training """ - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1, reset_lr=False): + def __init__(self, optimizer, *, reset_lr, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1): assert not ( warmup_steps is not None and warmup_ratio is not None ), "Either use particular number of step or ratio" @@ -98,7 +98,7 @@ class SquareRootConstantPolicy(_LRScheduler): """ def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1, reset_lr=False + self, optimizer, *, reset_lr, constant_steps=None, constant_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 ): assert not ( constant_steps is not None and constant_ratio is not None @@ -235,6 +235,7 @@ def __init__( self, optimizer, *, + reset_lr, warmup_steps=None, warmup_ratio=None, constant_steps=None, @@ -242,7 +243,6 @@ def __init__( max_steps=None, min_lr=0.0, last_epoch=-1, - reset_lr=False, ): assert not ( warmup_steps is not None and warmup_ratio is not None @@ -273,8 +273,7 @@ def __init__( self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) self.min_lr = min_lr - self.reset_lr = reset_lr - print(self.reset_lr) + self.first_step = True super().__init__(optimizer, last_epoch) def get_lr(self): @@ -283,10 +282,14 @@ def get_lr(self): "To get the last learning rate computed by the scheduler, please use `get_last_lr()`.", UserWarning ) - if self.reset_lr and 'step' in self.optimizer.param_groups[0].keys(): - step = self.last_epoch - self.optimizer.param_groups[0]['step'] - else: - step = self.last_epoch + step = self.last_epoch + if 'reset_lr' in 
self.optimizer.param_groups[0].keys(): + init_steps = self.optimizer.param_groups[0]['step'] + step -= init_steps + if self.first_step: + self.decay_steps -= init_steps + self.max_steps -= init_steps + self.first_step = False # Warmup steps if self.warmup_steps > 0 and step <= self.warmup_steps: @@ -409,7 +412,7 @@ def _get_lr(self, step): class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, reset_lr=False, **kwargs): + def __init__(self, optimizer, *, max_steps, reset_lr, min_lr=0, last_epoch=-1, **kwargs): super().__init__(optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, reset_lr=reset_lr, **kwargs) def _get_lr(self, step): @@ -461,7 +464,7 @@ def _get_linear_warmup_with_cosine_annealing_lr(self, step): class NoamAnnealing(_LRScheduler): def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1, reset_lr=False + self, optimizer, *, d_model, reset_lr, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 ): self._normalize = d_model ** (-0.5) assert not ( @@ -697,6 +700,7 @@ def prepare_lr_scheduler( optimizer: optim.Optimizer, scheduler_config: Union[Dict[str, Any], DictConfig], train_dataloader: Optional[dataloader.DataLoader] = None, + reset_lr: bool = False, ) -> Optional[Dict[str, Any]]: """ Constructs an LR Scheduler (optionally) for a given optimizer, based on a config with the following schema From b2f5eed22444b94f7cf42198b53596b8e4fb87a3 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 4 Jun 2024 11:11:58 +0000 Subject: [PATCH 03/21] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../language_modeling/megatron_base_model.py | 4 +- .../language_modeling/megatron_gpt_model.py | 2 +- nemo/core/optim/lr_scheduler.py | 37 +++++++++++++++---- 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index f2ab132256c3..f8ecb7aa4bf8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -841,7 +841,9 @@ def configure_optimizers(self): if hasattr(self._cfg.optim, 'sched'): sched_config = self._cfg.optim.sched self._scheduler = prepare_lr_scheduler( - optimizer=self._optimizer, scheduler_config=sched_config, train_dataloader=self._train_dl, + optimizer=self._optimizer, + scheduler_config=sched_config, + train_dataloader=self._train_dl, ) if getattr(self._cfg.optim, 'sched', None) is not None and self._scheduler is None: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 4df5166187e8..f6ef2a00601d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -764,7 +764,7 @@ def training_step(self, dataloader_iter): self._optimizer.param_groups[0]['step'] = self.trainer.global_step self._optimizer.param_groups[0]['reset_lr'] = True self.if_init_step = False - + # Initialize userbuffer communicators. 
if self.initialize_ub: self.initialize_ub_func() diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index e915bc9ac315..fd204bc3a42d 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -42,7 +42,9 @@ class WarmupPolicy(_LRScheduler): infinite training """ - def __init__(self, optimizer, *, reset_lr, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1): + def __init__( + self, optimizer, *, reset_lr, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 + ): assert not ( warmup_steps is not None and warmup_ratio is not None ), "Either use particular number of step or ratio" @@ -98,7 +100,15 @@ class SquareRootConstantPolicy(_LRScheduler): """ def __init__( - self, optimizer, *, reset_lr, constant_steps=None, constant_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 + self, + optimizer, + *, + reset_lr, + constant_steps=None, + constant_ratio=None, + max_steps=None, + min_lr=0.0, + last_epoch=-1, ): assert not ( constant_steps is not None and constant_ratio is not None @@ -115,7 +125,7 @@ def __init__( else: self.constant_steps = 0 - self.constant_lr = 1 / (constant_steps ** 0.5) + self.constant_lr = 1 / (constant_steps**0.5) self.min_lr = min_lr self.reset_lr = reset_lr super().__init__(optimizer, last_epoch) @@ -281,7 +291,7 @@ def get_lr(self): warnings.warn( "To get the last learning rate computed by the scheduler, please use `get_last_lr()`.", UserWarning ) - + step = self.last_epoch if 'reset_lr' in self.optimizer.param_groups[0].keys(): init_steps = self.optimizer.param_groups[0]['step'] @@ -375,7 +385,7 @@ def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): def _noam_hold_annealing(initial_lr, step, warmup_steps, hold_steps, decay_rate, min_lr): # hold_steps = total number of steps to hold the LR, not the warmup + hold steps. 
- T_warmup_decay = max(1, warmup_steps ** decay_rate) + T_warmup_decay = max(1, warmup_steps**decay_rate) T_hold_decay = max(1, (step - hold_steps) ** decay_rate) lr = (initial_lr * T_warmup_decay) / T_hold_decay lr = max(lr, min_lr) @@ -413,7 +423,9 @@ def _get_lr(self, step): class CosineAnnealing(WarmupAnnealHoldPolicy): def __init__(self, optimizer, *, max_steps, reset_lr, min_lr=0, last_epoch=-1, **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, reset_lr=reset_lr, **kwargs) + super().__init__( + optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, reset_lr=reset_lr, **kwargs + ) def _get_lr(self, step): for initial_lr in self.base_lrs: @@ -464,7 +476,16 @@ def _get_linear_warmup_with_cosine_annealing_lr(self, step): class NoamAnnealing(_LRScheduler): def __init__( - self, optimizer, *, d_model, reset_lr, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 + self, + optimizer, + *, + d_model, + reset_lr, + warmup_steps=None, + warmup_ratio=None, + max_steps=None, + min_lr=0.0, + last_epoch=-1, ): self._normalize = d_model ** (-0.5) assert not ( @@ -605,7 +626,7 @@ def __init__(self, optimizer, *, max_steps, last_epoch=-1, min_lr=0.0, **kwargs) super().__init__(optimizer=optimizer, max_steps=max_steps, **kwargs, last_epoch=last_epoch, min_lr=min_lr) def _get_lr(self, step): - return [1 / (step ** 0.5) for _ in self.base_lrs] + return [1 / (step**0.5) for _ in self.base_lrs] class PolynomialDecayAnnealing(WarmupPolicy): From e6e95974a5e88e9e451f1d8c9d369786e232b6bb Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 4 Jun 2024 05:09:11 -0700 Subject: [PATCH 04/21] move reset_lr from optim section Signed-off-by: dimapihtar --- .../conf/megatron_gpt_config.yaml | 14 +++++------ .../language_modeling/megatron_gpt_model.py | 15 ++++++------ nemo/core/optim/lr_scheduler.py | 23 ++++++++----------- 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 21f456ca61d8..2c1f82deaa05 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -14,9 +14,9 @@ trainer: enable_checkpointing: False use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. - max_steps: 150 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: 500 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 1 - val_check_interval: 50 + val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models @@ -103,6 +103,7 @@ model: batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + reset_lr: True # Set to True to reset learning rate. 
tokenizer: library: 'megatron' @@ -269,16 +270,15 @@ model: optim: name: distributed_fused_adam lr: 2e-4 - weight_decay: 0.01 + weight_decay: 0.02 betas: - 0.9 - - 0.98 + - 0.95 sched: name: CosineAnnealing - warmup_steps: 15 - constant_steps: 15 + warmup_steps: 25 + constant_steps: 25 min_lr: 2e-5 - reset_lr: True gc_interval: 0 # Interval of the host memory garbage collection. When it is zero, collectiion relies on the automatic garbage collector. # If an interger value larger than zero is set, collection is done manually by the batch step interval of `gc_interval`. diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index f6ef2a00601d..0b0b32d624d7 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -305,8 +305,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.if_first_step = 0 self.prev_global_batch_size = None - self.if_init_step = True - if cfg.get('data', None) is not None: self.reset_position_ids = cfg.data.get('reset_position_ids', False) self.reset_attention_mask = cfg.data.get('reset_attention_mask', False) @@ -396,6 +394,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.validation_param_sync_overlap = self.cfg.get('validation_param_sync_overlap', False) self.inference_params = None + self.if_init_step = True # default to false since this doesn't work with sequence parallelism currently self.use_loss_mask = self.cfg.get('use_loss_mask', False) @@ -759,16 +758,16 @@ def training_step(self, dataloader_iter): The input batch to each micro-batch is fetched using the dataloader function in the micro-batch fwd function. """ - - if self.if_init_step and self.cfg.optim.sched.get('reset_lr', False): - self._optimizer.param_groups[0]['step'] = self.trainer.global_step - self._optimizer.param_groups[0]['reset_lr'] = True - self.if_init_step = False - # Initialize userbuffer communicators. 
if self.initialize_ub: self.initialize_ub_func() + # Reset learning rate + if self.if_init_step and self.cfg.get('reset_lr', False): + self._optimizer.param_groups[0]['num_steps'] = self.trainer.global_step + self._optimizer.param_groups[0]['reset_lr'] = True + self.if_init_step = False + if self.rampup_batch_size: num_microbatch_calculator = apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR current_global_batch_size = num_microbatch_calculator.current_global_batch_size diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index fd204bc3a42d..8aec03e152e6 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -43,7 +43,7 @@ class WarmupPolicy(_LRScheduler): """ def __init__( - self, optimizer, *, reset_lr, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 + self, optimizer, *, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 ): assert not ( warmup_steps is not None and warmup_ratio is not None @@ -61,7 +61,6 @@ def __init__( self.warmup_steps = 0 self.min_lr = min_lr - self.reset_lr = reset_lr super().__init__(optimizer, last_epoch) def get_lr(self): @@ -103,7 +102,6 @@ def __init__( self, optimizer, *, - reset_lr, constant_steps=None, constant_ratio=None, max_steps=None, @@ -127,7 +125,6 @@ def __init__( self.constant_lr = 1 / (constant_steps**0.5) self.min_lr = min_lr - self.reset_lr = reset_lr super().__init__(optimizer, last_epoch) def get_lr(self): @@ -245,7 +242,6 @@ def __init__( self, optimizer, *, - reset_lr, warmup_steps=None, warmup_ratio=None, constant_steps=None, @@ -293,12 +289,14 @@ def get_lr(self): ) step = self.last_epoch + + # Reset learning rate if 'reset_lr' in self.optimizer.param_groups[0].keys(): - init_steps = self.optimizer.param_groups[0]['step'] - step -= init_steps + num_steps = self.optimizer.param_groups[0]['num_steps'] + step -= num_steps if self.first_step: - self.decay_steps -= init_steps - self.max_steps -= init_steps + self.decay_steps -= num_steps + self.max_steps -= num_steps self.first_step = False # Warmup steps @@ -422,9 +420,9 @@ def _get_lr(self, step): class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, reset_lr, min_lr=0, last_epoch=-1, **kwargs): + def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, **kwargs): super().__init__( - optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, reset_lr=reset_lr, **kwargs + optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, **kwargs ) def _get_lr(self, step): @@ -480,7 +478,6 @@ def __init__( optimizer, *, d_model, - reset_lr, warmup_steps=None, warmup_ratio=None, max_steps=None, @@ -504,7 +501,6 @@ def __init__( self.warmup_steps = 0 self.min_lr = min_lr - self.reset_lr = reset_lr super().__init__(optimizer, last_epoch) def get_lr(self): @@ -721,7 +717,6 @@ def prepare_lr_scheduler( optimizer: optim.Optimizer, scheduler_config: Union[Dict[str, Any], DictConfig], train_dataloader: Optional[dataloader.DataLoader] = None, - reset_lr: bool = False, ) -> Optional[Dict[str, Any]]: """ Constructs an LR Scheduler (optionally) for a given optimizer, based on a config with the following schema From 5c4dd1473d92f8b0d4147bd613e9576d76ebd573 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 4 Jun 2024 12:10:12 +0000 Subject: [PATCH 05/21] Apply isort and black reformatting Signed-off-by: dimapihtar --- nemo/core/optim/lr_scheduler.py | 8 ++------ 1 file changed, 
2 insertions(+), 6 deletions(-) diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index 8aec03e152e6..1e99103ef63d 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -42,9 +42,7 @@ class WarmupPolicy(_LRScheduler): infinite training """ - def __init__( - self, optimizer, *, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 - ): + def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1): assert not ( warmup_steps is not None and warmup_ratio is not None ), "Either use particular number of step or ratio" @@ -421,9 +419,7 @@ def _get_lr(self, step): class CosineAnnealing(WarmupAnnealHoldPolicy): def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, **kwargs): - super().__init__( - optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, **kwargs - ) + super().__init__(optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, **kwargs) def _get_lr(self, step): for initial_lr in self.base_lrs: From 46687033d2e8ce1191d59a88274db4e00164a9fc Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 4 Jun 2024 05:13:49 -0700 Subject: [PATCH 06/21] add reset_lr value to config Signed-off-by: dimapihtar --- .../conf/megatron_gpt_config.yaml | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 2c1f82deaa05..5d5401497200 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -9,13 +9,13 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: bf16 + precision: 16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. - max_steps: 500 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 1 + max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 @@ -28,10 +28,10 @@ exp_manager: explicit_log_dir: null exp_dir: null name: megatron_gpt - create_wandb_logger: True + create_wandb_logger: False wandb_logger_kwargs: - project: reset_lr_test - name: test_run + project: null + name: null create_neptune_logger: false neptune_logger_kwargs: project: null @@ -122,7 +122,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 # Fusion @@ -241,9 +241,9 @@ model: # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} # Or see example below: # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - data_prefix: [] + data_prefix: ??? 
index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: mock + data_impl: mmap mmap_bin_files: True splits_string: 900,50,50 seq_length: ${model.encoder_seq_length} @@ -268,17 +268,18 @@ model: gen_shape: False # Generate model and kernel details including input shapes optim: - name: distributed_fused_adam + name: fused_adam lr: 2e-4 - weight_decay: 0.02 + weight_decay: 0.01 betas: - 0.9 - - 0.95 + - 0.98 sched: name: CosineAnnealing - warmup_steps: 25 - constant_steps: 25 + warmup_steps: 500 + constant_steps: 0 min_lr: 2e-5 + gc_interval: 0 # Interval of the host memory garbage collection. When it is zero, collectiion relies on the automatic garbage collector. # If an interger value larger than zero is set, collection is done manually by the batch step interval of `gc_interval`. From de6750a4d4d4a39582afc35dcff6044bed5700de Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 4 Jun 2024 05:14:48 -0700 Subject: [PATCH 07/21] set reset_lr False by default Signed-off-by: dimapihtar --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 5d5401497200..34081b8d68e9 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -103,7 +103,7 @@ model: batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. - reset_lr: True # Set to True to reset learning rate. + reset_lr: False # Set to True to reset learning rate. 
tokenizer: library: 'megatron' From b0b3e17886356c9db40615d632a92ff7239cd8cd Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 4 Jun 2024 05:15:50 -0700 Subject: [PATCH 08/21] remove extra line Signed-off-by: dimapihtar --- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 0b0b32d624d7..f009bdd1ff71 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -921,7 +921,6 @@ def training_step(self, dataloader_iter): self.log('loss_scale', loss_scale, batch_size=1) lr = self._optimizer.param_groups[0]['lr'] - self.log('lr', lr, rank_zero_only=True, batch_size=1) self.log( 'global_step', From 7fac9d36828d5e723193c12246eeb0b1ff8df770 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 4 Jun 2024 06:10:02 -0700 Subject: [PATCH 09/21] add reset_lr test Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 83 +++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 29e84b933f14..510283bf07f8 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4417,6 +4417,89 @@ jobs: # } # } + L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=3 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=bf16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.megatron_amp_O2=True \ + model.optim.name=distributed_fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=3 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=bf16 \ + trainer.gradient_clip_val=1.0 \ + 
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.reset_lr=True \ + model.tensor_model_parallel_size=2 \ + model.megatron_amp_O2=True \ + model.optim.name=distributed_fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] runs-on: self-hosted-azure From 0604dc487541b38f76d3d38847af28e07b66b89a Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 4 Jun 2024 06:11:00 -0700 Subject: [PATCH 10/21] add reset_lr test Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 510283bf07f8..443eeafbc4ac 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -6608,6 +6608,7 @@ jobs: - L2_BioMegatron_Bert_NER_Task - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2 + - L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2 From 5a2d4c624179551c758b389118ea6107f676e8e7 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 5 Jun 2024 03:31:55 -0700 Subject: [PATCH 11/21] remove extra quote Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 443eeafbc4ac..59478a307258 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4493,7 +4493,7 @@ jobs: model.hidden_size=256 \ model.num_attention_heads=8 \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings rm -rf examples/nlp/language_modeling/gpt_pretrain_results rm -rf examples/nlp/language_modeling/gpt_index_mappings From 47956e1068f2c14671f028e9edf2aba49a09f9af Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 10 Jun 2024 07:32:41 -0700 Subject: [PATCH 12/21] add ability to reset schedule's max_steps and decay_steps Signed-off-by: dimapihtar --- .../language_modeling/conf/megatron_gpt_config.yaml | 11 
+++++++---- .../models/language_modeling/megatron_gpt_model.py | 12 +++++++++--- nemo/core/optim/lr_scheduler.py | 4 ++-- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 34081b8d68e9..281bcb8022eb 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -15,7 +15,7 @@ trainer: use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 + log_every_n_steps: 1 val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 @@ -28,10 +28,10 @@ exp_manager: explicit_log_dir: null exp_dir: null name: megatron_gpt - create_wandb_logger: False + create_wandb_logger: True wandb_logger_kwargs: - project: null - name: null + project: reset_lr_test + name: test_run create_neptune_logger: false neptune_logger_kwargs: project: null @@ -103,7 +103,10 @@ model: batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + + # Reset learning rate schedule. reset_lr: False # Set to True to reset learning rate. + reset_lr_steps: False # Set to True to reset learning rate max_steps and decay_steps. 
tokenizer: library: 'megatron' diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index f009bdd1ff71..a260de20c005 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -394,7 +394,11 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.validation_param_sync_overlap = self.cfg.get('validation_param_sync_overlap', False) self.inference_params = None + + # Reset learning rate params self.if_init_step = True + self.reset_lr = self.cfg.get('reset_lr', False) + self.reset_lr_steps = self.cfg.get('reset_lr_steps', False) # default to false since this doesn't work with sequence parallelism currently self.use_loss_mask = self.cfg.get('use_loss_mask', False) @@ -763,9 +767,11 @@ def training_step(self, dataloader_iter): self.initialize_ub_func() # Reset learning rate - if self.if_init_step and self.cfg.get('reset_lr', False): - self._optimizer.param_groups[0]['num_steps'] = self.trainer.global_step - self._optimizer.param_groups[0]['reset_lr'] = True + if self.if_init_step and self.reset_lr: + self._optimizer.param_groups[0]['reset_lr'] = { + 'num_steps': self.trainer.global_step, + 'reset_lr_steps': True if self.reset_lr_steps else False, + } self.if_init_step = False if self.rampup_batch_size: diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index 1e99103ef63d..bbbcb46c6c98 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -290,9 +290,9 @@ def get_lr(self): # Reset learning rate if 'reset_lr' in self.optimizer.param_groups[0].keys(): - num_steps = self.optimizer.param_groups[0]['num_steps'] + num_steps = self.optimizer.param_groups[0]['reset_lr']['num_steps'] step -= num_steps - if self.first_step: + if self.first_step and self.optimizer.param_groups[0]['reset_lr']['reset_lr_steps']: self.decay_steps -= num_steps self.max_steps -= num_steps self.first_step = False From 61639095b830a06d1c01400af529b9b032a91a79 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 10 Jun 2024 14:33:49 +0000 Subject: [PATCH 13/21] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index a260de20c005..47b6dd90b0a7 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -769,7 +769,7 @@ def training_step(self, dataloader_iter): # Reset learning rate if self.if_init_step and self.reset_lr: self._optimizer.param_groups[0]['reset_lr'] = { - 'num_steps': self.trainer.global_step, + 'num_steps': self.trainer.global_step, 'reset_lr_steps': True if self.reset_lr_steps else False, } self.if_init_step = False From 4119a1d19b65020c633c9be87b1a37f467b3844a Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 10 Jun 2024 08:40:44 -0700 Subject: [PATCH 14/21] change scheduler's first step logic when using reset_lr Signed-off-by: dimapihtar --- .../nlp/language_modeling/conf/megatron_gpt_config.yaml | 8 ++++---- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git 
a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 281bcb8022eb..8687074c15f9 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -28,10 +28,10 @@ exp_manager: explicit_log_dir: null exp_dir: null name: megatron_gpt - create_wandb_logger: True + create_wandb_logger: False wandb_logger_kwargs: - project: reset_lr_test - name: test_run + project: null + name: null create_neptune_logger: false neptune_logger_kwargs: project: null @@ -125,7 +125,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 # Fusion diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 8240ac4a64ec..5877f9b2e273 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -769,6 +769,7 @@ def training_step(self, dataloader_iter): # Reset learning rate if self.if_init_step and self.reset_lr: + self._optimizer.param_groups[0]['lr'] = 0.0 if self.cfg.optim.sched.warmup_steps > 0 else self.cfg.optim.lr self._optimizer.param_groups[0]['reset_lr'] = { 'num_steps': self.trainer.global_step, 'reset_lr_steps': True if self.reset_lr_steps else False, From 92e7cf8a9056cabc8f7956924b9e5c9d81da387e Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 10 Jun 2024 08:42:11 -0700 Subject: [PATCH 15/21] revert config Signed-off-by: dimapihtar --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 8687074c15f9..c71b6e908b50 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -15,7 +15,7 @@ trainer: use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 1 + log_every_n_steps: 10 val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 @@ -125,7 +125,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 # Fusion From 5da92cd918d6b6f184473d8d0632effa5b1263ec Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 11 Jun 2024 06:22:48 -0700 Subject: [PATCH 16/21] fix reset_lr logic Signed-off-by: dimapihtar --- .../conf/megatron_gpt_config.yaml | 24 +++++++++---------- .../language_modeling/megatron_gpt_model.py | 7 +++++- nemo/core/optim/lr_scheduler.py | 8 +++---- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index c71b6e908b50..f7c86af57c76 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -9,13 +9,13 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 + log_every_n_steps: 1 val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 @@ -28,10 +28,10 @@ exp_manager: explicit_log_dir: null exp_dir: null name: megatron_gpt - create_wandb_logger: False + create_wandb_logger: True wandb_logger_kwargs: - project: null - name: null + project: reset_lr_test + name: test_run create_neptune_logger: false neptune_logger_kwargs: project: null @@ -56,7 +56,7 @@ exp_manager: model: # use GPTModel from megatron.core - mcore_gpt: False + mcore_gpt: True # specify micro_batch_size, global_batch_size, and model parallelism # gradient accumulation will be done automatically based on data_parallel_size @@ -105,8 +105,8 @@ model: num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. # Reset learning rate schedule. - reset_lr: False # Set to True to reset learning rate. - reset_lr_steps: False # Set to True to reset learning rate max_steps and decay_steps. + reset_lr: False # Set to True to reset learning rate. Only supported with distributed optmizer and megatron_amp_O2. + reset_lr_steps: False # Set to True to decrease learning rate's max_steps and decay_steps by number of previously used steps. 
tokenizer: library: 'megatron' @@ -125,7 +125,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 # Fusion @@ -271,7 +271,7 @@ model: gen_shape: False # Generate model and kernel details including input shapes optim: - name: fused_adam + name: distributed_fused_adam lr: 2e-4 weight_decay: 0.01 betas: @@ -279,8 +279,8 @@ model: - 0.98 sched: name: CosineAnnealing - warmup_steps: 500 - constant_steps: 0 + warmup_steps: 10 + constant_steps: 10 min_lr: 2e-5 gc_interval: 0 diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 5877f9b2e273..176cade9b6f0 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -399,6 +399,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.if_init_step = True self.reset_lr = self.cfg.get('reset_lr', False) self.reset_lr_steps = self.cfg.get('reset_lr_steps', False) + if self.reset_lr and (not self.with_distributed_adam or not self.megatron_amp_O2): + raise ValueError('Learning rate reset feature is only supported with the distributed optmizer and megatron_amp_O2 for now.') # default to false since this doesn't work with sequence parallelism currently self.use_loss_mask = self.cfg.get('use_loss_mask', False) @@ -769,10 +771,13 @@ def training_step(self, dataloader_iter): # Reset learning rate if self.if_init_step and self.reset_lr: - self._optimizer.param_groups[0]['lr'] = 0.0 if self.cfg.optim.sched.warmup_steps > 0 else self.cfg.optim.lr + num_groups = len(self._optimizer.param_groups) + for group in range(num_groups): + self._optimizer.param_groups[group]['lr'] = 0.0 if self.cfg.optim.sched.warmup_steps > 0 else self.cfg.optim.lr self._optimizer.param_groups[0]['reset_lr'] = { 'num_steps': self.trainer.global_step, 'reset_lr_steps': True if self.reset_lr_steps else False, + 'if_init_step': self.if_init_step, } self.if_init_step = False diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index bbbcb46c6c98..cfb3068b1cc8 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -277,7 +277,6 @@ def __init__( self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) self.min_lr = min_lr - self.first_step = True super().__init__(optimizer, last_epoch) def get_lr(self): @@ -290,12 +289,13 @@ def get_lr(self): # Reset learning rate if 'reset_lr' in self.optimizer.param_groups[0].keys(): - num_steps = self.optimizer.param_groups[0]['reset_lr']['num_steps'] + reset_lr = self.optimizer.param_groups[0]['reset_lr'] + num_steps = reset_lr['num_steps'] step -= num_steps - if self.first_step and self.optimizer.param_groups[0]['reset_lr']['reset_lr_steps']: + if reset_lr['if_init_step'] and reset_lr['reset_lr_steps']: self.decay_steps -= num_steps self.max_steps -= num_steps - self.first_step = False + self.optimizer.param_groups[0]['reset_lr']['if_init_step'] = False # Warmup steps if self.warmup_steps > 0 and step <= self.warmup_steps: From 7cfd47ad449ab597bf773e817719617360534534 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 11 Jun 2024 13:24:04 +0000 Subject: [PATCH 17/21] Apply 
isort and black reformatting Signed-off-by: dimapihtar --- .../nlp/models/language_modeling/megatron_gpt_model.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 176cade9b6f0..75f7ffbaab1b 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -400,7 +400,9 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.reset_lr = self.cfg.get('reset_lr', False) self.reset_lr_steps = self.cfg.get('reset_lr_steps', False) if self.reset_lr and (not self.with_distributed_adam or not self.megatron_amp_O2): - raise ValueError('Learning rate reset feature is only supported with the distributed optmizer and megatron_amp_O2 for now.') + raise ValueError( + 'Learning rate reset feature is only supported with the distributed optmizer and megatron_amp_O2 for now.' + ) # default to false since this doesn't work with sequence parallelism currently self.use_loss_mask = self.cfg.get('use_loss_mask', False) @@ -773,7 +775,9 @@ def training_step(self, dataloader_iter): if self.if_init_step and self.reset_lr: num_groups = len(self._optimizer.param_groups) for group in range(num_groups): - self._optimizer.param_groups[group]['lr'] = 0.0 if self.cfg.optim.sched.warmup_steps > 0 else self.cfg.optim.lr + self._optimizer.param_groups[group]['lr'] = ( + 0.0 if self.cfg.optim.sched.warmup_steps > 0 else self.cfg.optim.lr + ) self._optimizer.param_groups[0]['reset_lr'] = { 'num_steps': self.trainer.global_step, 'reset_lr_steps': True if self.reset_lr_steps else False, From 067c2645dd80c00b4d6c3f1fc5719007c0702e3b Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 11 Jun 2024 06:25:56 -0700 Subject: [PATCH 18/21] revert config Signed-off-by: dimapihtar --- .../language_modeling/conf/megatron_gpt_config.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index f7c86af57c76..0ba150d8e7b6 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -15,7 +15,7 @@ trainer: use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 1 + log_every_n_steps: 10 val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 @@ -28,10 +28,10 @@ exp_manager: explicit_log_dir: null exp_dir: null name: megatron_gpt - create_wandb_logger: True + create_wandb_logger: False wandb_logger_kwargs: - project: reset_lr_test - name: test_run + project: null + name: null create_neptune_logger: false neptune_logger_kwargs: project: null @@ -271,7 +271,7 @@ model: gen_shape: False # Generate model and kernel details including input shapes optim: - name: distributed_fused_adam + name: fused_adam lr: 2e-4 weight_decay: 0.01 betas: From 43ccac7377efc4c3f441205c3b90d2ba344d47c4 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 11 Jun 2024 06:27:30 -0700 Subject: [PATCH 19/21] revert config Signed-off-by: dimapihtar --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 0ba150d8e7b6..2cf512477a92 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -279,8 +279,8 @@ model: - 0.98 sched: name: CosineAnnealing - warmup_steps: 10 - constant_steps: 10 + warmup_steps: 500 + constant_steps: 0 min_lr: 2e-5 gc_interval: 0 From 0d91dcd446ebf2d0b7209acc23d0baa49c201e2d Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 25 Jun 2024 04:04:52 -0700 Subject: [PATCH 20/21] update reset_lr comments Signed-off-by: dimapihtar --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 2cf512477a92..2ce1d478a642 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -105,8 +105,8 @@ model: num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. # Reset learning rate schedule. - reset_lr: False # Set to True to reset learning rate. Only supported with distributed optmizer and megatron_amp_O2. - reset_lr_steps: False # Set to True to decrease learning rate's max_steps and decay_steps by number of previously used steps. + reset_lr: False # Set to True to reset learning rate to initial learning rate. Only supported with distributed optmizer and megatron_amp_O2. + reset_lr_steps: False # Set to True to adjust learning rate's max_steps and decay_steps by subtracting number of steps already completed at the checkpoint. 
tokenizer: library: 'megatron' From ce4200a0ee3fc840a92a3b6243a832f3dbb8a540 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 25 Jun 2024 11:10:25 -0700 Subject: [PATCH 21/21] add use cases for reset_lr feature Signed-off-by: dimapihtar --- .../nlp/language_modeling/conf/megatron_gpt_config.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 8e313827efaa..8c6d97821222 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -115,7 +115,11 @@ model: seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. - # Reset learning rate schedule. + ## Reset learning rate schedule. + # 1. reset_lr=True, reset_lr_steps=False. When pre-training an existing checkpoint "from scratch" on a different dataset. + # 2. reset_lr=True, reset_lr_steps=True. When continuing training from an existing checkpoint with the same configuration. + # Learning rate's max_steps and decay_steps will be recalculated as follows: max_steps -= completed_steps, decay_steps -= completed_steps where completed_steps is the number of steps already completed at the checkpoint. + # This will help to reach the min_lr value by the end of training without changing trainer.max_steps. reset_lr: False # Set to True to reset learning rate to initial learning rate. Only supported with distributed optmizer and megatron_amp_O2. reset_lr_steps: False # Set to True to adjust learning rate's max_steps and decay_steps by subtracting number of steps already completed at the checkpoint.
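For reference, below is a minimal standalone sketch of the step arithmetic this patch series introduces, matching the `reset_lr` handling added to `MegatronGPTModel.training_step` and the offset logic in `WarmupAnnealHoldPolicy.get_lr` above. It is an illustration only: the function and variable names here (e.g. `effective_schedule_step`, `completed_steps`) are not part of the PR, and the real code reads `num_steps` and the `reset_lr_steps` flag out of `optimizer.param_groups[0]['reset_lr']`.

# Sketch of the reset_lr step arithmetic (illustrative; not the PR's actual API).
def effective_schedule_step(global_step, completed_steps, max_steps, decay_steps,
                            reset_lr=False, reset_lr_steps=False):
    """Return (step, max_steps, decay_steps) as CosineAnnealing would see them.

    completed_steps is trainer.global_step recorded when resuming from the checkpoint.
    """
    if not reset_lr:
        # Default behaviour: the schedule continues from the checkpointed step.
        return global_step, max_steps, decay_steps

    # reset_lr=True: warmup and annealing restart from step 0 at the resume point.
    step = global_step - completed_steps

    if reset_lr_steps:
        # reset_lr_steps=True: shrink the schedule horizon so min_lr is still
        # reached by trainer.max_steps without changing the trainer config.
        max_steps -= completed_steps
        decay_steps -= completed_steps

    return step, max_steps, decay_steps


# Example: resuming at global step 100 of a 100k-step run with both flags enabled.
print(effective_schedule_step(global_step=100, completed_steps=100,
                              max_steps=100_000, decay_steps=99_500,
                              reset_lr=True, reset_lr_steps=True))
# -> (0, 99900, 99400): warmup restarts, and the cosine decay still ends at the
#    original trainer.max_steps, matching the use cases described in the config comments.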