From 911c39a4e4842418e7259d70b0d081d48eafe244 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 3 Jun 2024 10:41:01 -0700 Subject: [PATCH 01/21] add reset_lr functionality Signed-off-by: dimapihtar --- .../conf/megatron_gpt_config.yaml | 19 +++++++-------- .../language_modeling/megatron_gpt_model.py | 7 ++++++ nemo/core/optim/lr_scheduler.py | 23 +++++++++++++------ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index ca0c3f74e4c8..be46747ea27c 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -9,14 +9,14 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. - max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: 100 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 - val_check_interval: 100 + val_check_interval: 40 limit_val_batches: 50 limit_test_batches: 500 accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models @@ -121,7 +121,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 # Fusion @@ -240,9 +240,9 @@ model: # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} # Or see example below: # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - data_prefix: ??? + data_prefix: [] index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: mmap + data_impl: mock mmap_bin_files: True splits_string: 900,50,50 seq_length: ${model.encoder_seq_length} @@ -267,7 +267,7 @@ model: gen_shape: False # Generate model and kernel details including input shapes optim: - name: fused_adam + name: distributed_fused_adam lr: 2e-4 weight_decay: 0.01 betas: @@ -275,9 +275,10 @@ model: - 0.98 sched: name: CosineAnnealing - warmup_steps: 500 - constant_steps: 0 + warmup_steps: 15 + constant_steps: 15 min_lr: 2e-5 + reset_lr: True gc_interval: 0 # Interval of the host memory garbage collection. When it is zero, collectiion relies on the automatic garbage collector. 
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index a5b4450c7b44..3ec440c95af4 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -305,6 +305,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.if_first_step = 0 self.prev_global_batch_size = None + self.if_init_step = True + if cfg.get('data', None) is not None: self.reset_position_ids = cfg.data.get('reset_position_ids', False) self.reset_attention_mask = cfg.data.get('reset_attention_mask', False) @@ -757,6 +759,10 @@ def training_step(self, dataloader_iter): The input batch to each micro-batch is fetched using the dataloader function in the micro-batch fwd function. """ + if self.if_init_step: + self._optimizer.param_groups[0]['step'] = self.trainer.global_step + self.if_init_step = False + # Initialize userbuffer communicators. if self.initialize_ub: self.initialize_ub_func() @@ -914,6 +920,7 @@ def training_step(self, dataloader_iter): self.log('loss_scale', loss_scale, batch_size=1) lr = self._optimizer.param_groups[0]['lr'] + self.log('lr', lr, rank_zero_only=True, batch_size=1) self.log( 'global_step', diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index 473ca0f5c416..e51ed03f28e3 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -42,7 +42,7 @@ class WarmupPolicy(_LRScheduler): infinite training """ - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1): + def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1, reset_lr=False): assert not ( warmup_steps is not None and warmup_ratio is not None ), "Either use particular number of step or ratio" @@ -59,6 +59,7 @@ def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, max_steps self.warmup_steps = 0 self.min_lr = min_lr + self.reset_lr = reset_lr super().__init__(optimizer, last_epoch) def get_lr(self): @@ -97,7 +98,7 @@ class SquareRootConstantPolicy(_LRScheduler): """ def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 + self, optimizer, *, constant_steps=None, constant_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1, reset_lr=False ): assert not ( constant_steps is not None and constant_ratio is not None @@ -116,6 +117,7 @@ def __init__( self.constant_lr = 1 / (constant_steps ** 0.5) self.min_lr = min_lr + self.reset_lr = reset_lr super().__init__(optimizer, last_epoch) def get_lr(self): @@ -240,6 +242,7 @@ def __init__( max_steps=None, min_lr=0.0, last_epoch=-1, + reset_lr=False, ): assert not ( warmup_steps is not None and warmup_ratio is not None @@ -270,6 +273,8 @@ def __init__( self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) self.min_lr = min_lr + self.reset_lr = reset_lr + print(self.reset_lr) super().__init__(optimizer, last_epoch) def get_lr(self): @@ -277,8 +282,11 @@ def get_lr(self): warnings.warn( "To get the last learning rate computed by the scheduler, please use `get_last_lr()`.", UserWarning ) - - step = self.last_epoch + + if self.reset_lr and 'step' in self.optimizer.param_groups[0].keys(): + step = self.last_epoch - self.optimizer.param_groups[0]['step'] + else: + step = self.last_epoch # Warmup steps if self.warmup_steps > 0 and step <= 
self.warmup_steps: @@ -401,8 +409,8 @@ def _get_lr(self, step): class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, **kwargs) + def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, reset_lr=False, **kwargs): + super().__init__(optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, reset_lr=reset_lr, **kwargs) def _get_lr(self, step): for initial_lr in self.base_lrs: @@ -453,7 +461,7 @@ def _get_linear_warmup_with_cosine_annealing_lr(self, step): class NoamAnnealing(_LRScheduler): def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 + self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1, reset_lr=False ): self._normalize = d_model ** (-0.5) assert not ( @@ -472,6 +480,7 @@ def __init__( self.warmup_steps = 0 self.min_lr = min_lr + self.reset_lr = reset_lr super().__init__(optimizer, last_epoch) def get_lr(self): From 7802851d81352a3bdb24d27be09e2f9f3e89bbe1 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 3 Jun 2024 16:17:29 -0700 Subject: [PATCH 02/21] fix reset_lr logic Signed-off-by: dimapihtar --- .../conf/megatron_gpt_config.yaml | 13 +++++----- .../language_modeling/megatron_base_model.py | 2 +- .../language_modeling/megatron_gpt_model.py | 4 ++- nemo/core/optim/lr_scheduler.py | 26 +++++++++++-------- 4 files changed, 25 insertions(+), 20 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index be46747ea27c..21f456ca61d8 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -14,9 +14,9 @@ trainer: enable_checkpointing: False use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. - max_steps: 100 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 - val_check_interval: 40 + max_steps: 150 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 1 + val_check_interval: 50 limit_val_batches: 50 limit_test_batches: 500 accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models @@ -28,10 +28,10 @@ exp_manager: explicit_log_dir: null exp_dir: null name: megatron_gpt - create_wandb_logger: False + create_wandb_logger: True wandb_logger_kwargs: - project: null - name: null + project: reset_lr_test + name: test_run create_neptune_logger: false neptune_logger_kwargs: project: null @@ -279,7 +279,6 @@ model: constant_steps: 15 min_lr: 2e-5 reset_lr: True - gc_interval: 0 # Interval of the host memory garbage collection. When it is zero, collectiion relies on the automatic garbage collector. # If an interger value larger than zero is set, collection is done manually by the batch step interval of `gc_interval`. 
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index a27f9fd5e5e4..f2ab132256c3 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -841,7 +841,7 @@ def configure_optimizers(self): if hasattr(self._cfg.optim, 'sched'): sched_config = self._cfg.optim.sched self._scheduler = prepare_lr_scheduler( - optimizer=self._optimizer, scheduler_config=sched_config, train_dataloader=self._train_dl + optimizer=self._optimizer, scheduler_config=sched_config, train_dataloader=self._train_dl, ) if getattr(self._cfg.optim, 'sched', None) is not None and self._scheduler is None: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 3ec440c95af4..4df5166187e8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -759,8 +759,10 @@ def training_step(self, dataloader_iter): The input batch to each micro-batch is fetched using the dataloader function in the micro-batch fwd function. """ - if self.if_init_step: + + if self.if_init_step and self.cfg.optim.sched.get('reset_lr', False): self._optimizer.param_groups[0]['step'] = self.trainer.global_step + self._optimizer.param_groups[0]['reset_lr'] = True self.if_init_step = False # Initialize userbuffer communicators. diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index e51ed03f28e3..e915bc9ac315 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -42,7 +42,7 @@ class WarmupPolicy(_LRScheduler): infinite training """ - def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1, reset_lr=False): + def __init__(self, optimizer, *, reset_lr, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1): assert not ( warmup_steps is not None and warmup_ratio is not None ), "Either use particular number of step or ratio" @@ -98,7 +98,7 @@ class SquareRootConstantPolicy(_LRScheduler): """ def __init__( - self, optimizer, *, constant_steps=None, constant_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1, reset_lr=False + self, optimizer, *, reset_lr, constant_steps=None, constant_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 ): assert not ( constant_steps is not None and constant_ratio is not None @@ -235,6 +235,7 @@ def __init__( self, optimizer, *, + reset_lr, warmup_steps=None, warmup_ratio=None, constant_steps=None, @@ -242,7 +243,6 @@ def __init__( max_steps=None, min_lr=0.0, last_epoch=-1, - reset_lr=False, ): assert not ( warmup_steps is not None and warmup_ratio is not None @@ -273,8 +273,7 @@ def __init__( self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) self.min_lr = min_lr - self.reset_lr = reset_lr - print(self.reset_lr) + self.first_step = True super().__init__(optimizer, last_epoch) def get_lr(self): @@ -283,10 +282,14 @@ def get_lr(self): "To get the last learning rate computed by the scheduler, please use `get_last_lr()`.", UserWarning ) - if self.reset_lr and 'step' in self.optimizer.param_groups[0].keys(): - step = self.last_epoch - self.optimizer.param_groups[0]['step'] - else: - step = self.last_epoch + step = self.last_epoch + if 'reset_lr' in 
self.optimizer.param_groups[0].keys(): + init_steps = self.optimizer.param_groups[0]['step'] + step -= init_steps + if self.first_step: + self.decay_steps -= init_steps + self.max_steps -= init_steps + self.first_step = False # Warmup steps if self.warmup_steps > 0 and step <= self.warmup_steps: @@ -409,7 +412,7 @@ def _get_lr(self, step): class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, reset_lr=False, **kwargs): + def __init__(self, optimizer, *, max_steps, reset_lr, min_lr=0, last_epoch=-1, **kwargs): super().__init__(optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, reset_lr=reset_lr, **kwargs) def _get_lr(self, step): @@ -461,7 +464,7 @@ def _get_linear_warmup_with_cosine_annealing_lr(self, step): class NoamAnnealing(_LRScheduler): def __init__( - self, optimizer, *, d_model, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1, reset_lr=False + self, optimizer, *, d_model, reset_lr, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 ): self._normalize = d_model ** (-0.5) assert not ( @@ -697,6 +700,7 @@ def prepare_lr_scheduler( optimizer: optim.Optimizer, scheduler_config: Union[Dict[str, Any], DictConfig], train_dataloader: Optional[dataloader.DataLoader] = None, + reset_lr: bool = False, ) -> Optional[Dict[str, Any]]: """ Constructs an LR Scheduler (optionally) for a given optimizer, based on a config with the following schema From b2f5eed22444b94f7cf42198b53596b8e4fb87a3 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 4 Jun 2024 11:11:58 +0000 Subject: [PATCH 03/21] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../language_modeling/megatron_base_model.py | 4 +- .../language_modeling/megatron_gpt_model.py | 2 +- nemo/core/optim/lr_scheduler.py | 37 +++++++++++++++---- 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index f2ab132256c3..f8ecb7aa4bf8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -841,7 +841,9 @@ def configure_optimizers(self): if hasattr(self._cfg.optim, 'sched'): sched_config = self._cfg.optim.sched self._scheduler = prepare_lr_scheduler( - optimizer=self._optimizer, scheduler_config=sched_config, train_dataloader=self._train_dl, + optimizer=self._optimizer, + scheduler_config=sched_config, + train_dataloader=self._train_dl, ) if getattr(self._cfg.optim, 'sched', None) is not None and self._scheduler is None: diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 4df5166187e8..f6ef2a00601d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -764,7 +764,7 @@ def training_step(self, dataloader_iter): self._optimizer.param_groups[0]['step'] = self.trainer.global_step self._optimizer.param_groups[0]['reset_lr'] = True self.if_init_step = False - + # Initialize userbuffer communicators. 
if self.initialize_ub: self.initialize_ub_func() diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index e915bc9ac315..fd204bc3a42d 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -42,7 +42,9 @@ class WarmupPolicy(_LRScheduler): infinite training """ - def __init__(self, optimizer, *, reset_lr, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1): + def __init__( + self, optimizer, *, reset_lr, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 + ): assert not ( warmup_steps is not None and warmup_ratio is not None ), "Either use particular number of step or ratio" @@ -98,7 +100,15 @@ class SquareRootConstantPolicy(_LRScheduler): """ def __init__( - self, optimizer, *, reset_lr, constant_steps=None, constant_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 + self, + optimizer, + *, + reset_lr, + constant_steps=None, + constant_ratio=None, + max_steps=None, + min_lr=0.0, + last_epoch=-1, ): assert not ( constant_steps is not None and constant_ratio is not None @@ -115,7 +125,7 @@ def __init__( else: self.constant_steps = 0 - self.constant_lr = 1 / (constant_steps ** 0.5) + self.constant_lr = 1 / (constant_steps**0.5) self.min_lr = min_lr self.reset_lr = reset_lr super().__init__(optimizer, last_epoch) @@ -281,7 +291,7 @@ def get_lr(self): warnings.warn( "To get the last learning rate computed by the scheduler, please use `get_last_lr()`.", UserWarning ) - + step = self.last_epoch if 'reset_lr' in self.optimizer.param_groups[0].keys(): init_steps = self.optimizer.param_groups[0]['step'] @@ -375,7 +385,7 @@ def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): def _noam_hold_annealing(initial_lr, step, warmup_steps, hold_steps, decay_rate, min_lr): # hold_steps = total number of steps to hold the LR, not the warmup + hold steps. 
- T_warmup_decay = max(1, warmup_steps ** decay_rate) + T_warmup_decay = max(1, warmup_steps**decay_rate) T_hold_decay = max(1, (step - hold_steps) ** decay_rate) lr = (initial_lr * T_warmup_decay) / T_hold_decay lr = max(lr, min_lr) @@ -413,7 +423,9 @@ def _get_lr(self, step): class CosineAnnealing(WarmupAnnealHoldPolicy): def __init__(self, optimizer, *, max_steps, reset_lr, min_lr=0, last_epoch=-1, **kwargs): - super().__init__(optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, reset_lr=reset_lr, **kwargs) + super().__init__( + optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, reset_lr=reset_lr, **kwargs + ) def _get_lr(self, step): for initial_lr in self.base_lrs: @@ -464,7 +476,16 @@ def _get_linear_warmup_with_cosine_annealing_lr(self, step): class NoamAnnealing(_LRScheduler): def __init__( - self, optimizer, *, d_model, reset_lr, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 + self, + optimizer, + *, + d_model, + reset_lr, + warmup_steps=None, + warmup_ratio=None, + max_steps=None, + min_lr=0.0, + last_epoch=-1, ): self._normalize = d_model ** (-0.5) assert not ( @@ -605,7 +626,7 @@ def __init__(self, optimizer, *, max_steps, last_epoch=-1, min_lr=0.0, **kwargs) super().__init__(optimizer=optimizer, max_steps=max_steps, **kwargs, last_epoch=last_epoch, min_lr=min_lr) def _get_lr(self, step): - return [1 / (step ** 0.5) for _ in self.base_lrs] + return [1 / (step**0.5) for _ in self.base_lrs] class PolynomialDecayAnnealing(WarmupPolicy): From e6e95974a5e88e9e451f1d8c9d369786e232b6bb Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 4 Jun 2024 05:09:11 -0700 Subject: [PATCH 04/21] move reset_lr from optim section Signed-off-by: dimapihtar --- .../conf/megatron_gpt_config.yaml | 14 +++++------ .../language_modeling/megatron_gpt_model.py | 15 ++++++------ nemo/core/optim/lr_scheduler.py | 23 ++++++++----------- 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 21f456ca61d8..2c1f82deaa05 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -14,9 +14,9 @@ trainer: enable_checkpointing: False use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. - max_steps: 150 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: 500 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 1 - val_check_interval: 50 + val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models @@ -103,6 +103,7 @@ model: batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + reset_lr: True # Set to True to reset learning rate. 
tokenizer: library: 'megatron' @@ -269,16 +270,15 @@ model: optim: name: distributed_fused_adam lr: 2e-4 - weight_decay: 0.01 + weight_decay: 0.02 betas: - 0.9 - - 0.98 + - 0.95 sched: name: CosineAnnealing - warmup_steps: 15 - constant_steps: 15 + warmup_steps: 25 + constant_steps: 25 min_lr: 2e-5 - reset_lr: True gc_interval: 0 # Interval of the host memory garbage collection. When it is zero, collectiion relies on the automatic garbage collector. # If an interger value larger than zero is set, collection is done manually by the batch step interval of `gc_interval`. diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index f6ef2a00601d..0b0b32d624d7 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -305,8 +305,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.if_first_step = 0 self.prev_global_batch_size = None - self.if_init_step = True - if cfg.get('data', None) is not None: self.reset_position_ids = cfg.data.get('reset_position_ids', False) self.reset_attention_mask = cfg.data.get('reset_attention_mask', False) @@ -396,6 +394,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.validation_param_sync_overlap = self.cfg.get('validation_param_sync_overlap', False) self.inference_params = None + self.if_init_step = True # default to false since this doesn't work with sequence parallelism currently self.use_loss_mask = self.cfg.get('use_loss_mask', False) @@ -759,16 +758,16 @@ def training_step(self, dataloader_iter): The input batch to each micro-batch is fetched using the dataloader function in the micro-batch fwd function. """ - - if self.if_init_step and self.cfg.optim.sched.get('reset_lr', False): - self._optimizer.param_groups[0]['step'] = self.trainer.global_step - self._optimizer.param_groups[0]['reset_lr'] = True - self.if_init_step = False - # Initialize userbuffer communicators. 
if self.initialize_ub: self.initialize_ub_func() + # Reset learning rate + if self.if_init_step and self.cfg.get('reset_lr', False): + self._optimizer.param_groups[0]['num_steps'] = self.trainer.global_step + self._optimizer.param_groups[0]['reset_lr'] = True + self.if_init_step = False + if self.rampup_batch_size: num_microbatch_calculator = apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR current_global_batch_size = num_microbatch_calculator.current_global_batch_size diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index fd204bc3a42d..8aec03e152e6 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -43,7 +43,7 @@ class WarmupPolicy(_LRScheduler): """ def __init__( - self, optimizer, *, reset_lr, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 + self, optimizer, *, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 ): assert not ( warmup_steps is not None and warmup_ratio is not None @@ -61,7 +61,6 @@ def __init__( self.warmup_steps = 0 self.min_lr = min_lr - self.reset_lr = reset_lr super().__init__(optimizer, last_epoch) def get_lr(self): @@ -103,7 +102,6 @@ def __init__( self, optimizer, *, - reset_lr, constant_steps=None, constant_ratio=None, max_steps=None, @@ -127,7 +125,6 @@ def __init__( self.constant_lr = 1 / (constant_steps**0.5) self.min_lr = min_lr - self.reset_lr = reset_lr super().__init__(optimizer, last_epoch) def get_lr(self): @@ -245,7 +242,6 @@ def __init__( self, optimizer, *, - reset_lr, warmup_steps=None, warmup_ratio=None, constant_steps=None, @@ -293,12 +289,14 @@ def get_lr(self): ) step = self.last_epoch + + # Reset learning rate if 'reset_lr' in self.optimizer.param_groups[0].keys(): - init_steps = self.optimizer.param_groups[0]['step'] - step -= init_steps + num_steps = self.optimizer.param_groups[0]['num_steps'] + step -= num_steps if self.first_step: - self.decay_steps -= init_steps - self.max_steps -= init_steps + self.decay_steps -= num_steps + self.max_steps -= num_steps self.first_step = False # Warmup steps @@ -422,9 +420,9 @@ def _get_lr(self, step): class CosineAnnealing(WarmupAnnealHoldPolicy): - def __init__(self, optimizer, *, max_steps, reset_lr, min_lr=0, last_epoch=-1, **kwargs): + def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, **kwargs): super().__init__( - optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, reset_lr=reset_lr, **kwargs + optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, **kwargs ) def _get_lr(self, step): @@ -480,7 +478,6 @@ def __init__( optimizer, *, d_model, - reset_lr, warmup_steps=None, warmup_ratio=None, max_steps=None, @@ -504,7 +501,6 @@ def __init__( self.warmup_steps = 0 self.min_lr = min_lr - self.reset_lr = reset_lr super().__init__(optimizer, last_epoch) def get_lr(self): @@ -721,7 +717,6 @@ def prepare_lr_scheduler( optimizer: optim.Optimizer, scheduler_config: Union[Dict[str, Any], DictConfig], train_dataloader: Optional[dataloader.DataLoader] = None, - reset_lr: bool = False, ) -> Optional[Dict[str, Any]]: """ Constructs an LR Scheduler (optionally) for a given optimizer, based on a config with the following schema From 5c4dd1473d92f8b0d4147bd613e9576d76ebd573 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 4 Jun 2024 12:10:12 +0000 Subject: [PATCH 05/21] Apply isort and black reformatting Signed-off-by: dimapihtar --- nemo/core/optim/lr_scheduler.py | 8 ++------ 1 file changed, 
2 insertions(+), 6 deletions(-) diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index 8aec03e152e6..1e99103ef63d 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -42,9 +42,7 @@ class WarmupPolicy(_LRScheduler): infinite training """ - def __init__( - self, optimizer, *, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1 - ): + def __init__(self, optimizer, *, warmup_steps=None, warmup_ratio=None, max_steps=None, min_lr=0.0, last_epoch=-1): assert not ( warmup_steps is not None and warmup_ratio is not None ), "Either use particular number of step or ratio" @@ -421,9 +419,7 @@ def _get_lr(self, step): class CosineAnnealing(WarmupAnnealHoldPolicy): def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, **kwargs): - super().__init__( - optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, **kwargs - ) + super().__init__(optimizer=optimizer, max_steps=max_steps, last_epoch=last_epoch, min_lr=min_lr, **kwargs) def _get_lr(self, step): for initial_lr in self.base_lrs: From 46687033d2e8ce1191d59a88274db4e00164a9fc Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 4 Jun 2024 05:13:49 -0700 Subject: [PATCH 06/21] add reset_lr value to config Signed-off-by: dimapihtar --- .../conf/megatron_gpt_config.yaml | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 2c1f82deaa05..5d5401497200 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -9,13 +9,13 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: bf16 + precision: 16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. - max_steps: 500 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 1 + max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 @@ -28,10 +28,10 @@ exp_manager: explicit_log_dir: null exp_dir: null name: megatron_gpt - create_wandb_logger: True + create_wandb_logger: False wandb_logger_kwargs: - project: reset_lr_test - name: test_run + project: null + name: null create_neptune_logger: false neptune_logger_kwargs: project: null @@ -122,7 +122,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 # Fusion @@ -241,9 +241,9 @@ model: # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} # Or see example below: # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" - data_prefix: [] + data_prefix: ??? 
index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: mock + data_impl: mmap mmap_bin_files: True splits_string: 900,50,50 seq_length: ${model.encoder_seq_length} @@ -268,17 +268,18 @@ model: gen_shape: False # Generate model and kernel details including input shapes optim: - name: distributed_fused_adam + name: fused_adam lr: 2e-4 - weight_decay: 0.02 + weight_decay: 0.01 betas: - 0.9 - - 0.95 + - 0.98 sched: name: CosineAnnealing - warmup_steps: 25 - constant_steps: 25 + warmup_steps: 500 + constant_steps: 0 min_lr: 2e-5 + gc_interval: 0 # Interval of the host memory garbage collection. When it is zero, collectiion relies on the automatic garbage collector. # If an interger value larger than zero is set, collection is done manually by the batch step interval of `gc_interval`. From de6750a4d4d4a39582afc35dcff6044bed5700de Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 4 Jun 2024 05:14:48 -0700 Subject: [PATCH 07/21] set reset_lr False by default Signed-off-by: dimapihtar --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 5d5401497200..34081b8d68e9 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -103,7 +103,7 @@ model: batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. - reset_lr: True # Set to True to reset learning rate. + reset_lr: False # Set to True to reset learning rate. 
tokenizer: library: 'megatron' From b0b3e17886356c9db40615d632a92ff7239cd8cd Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 4 Jun 2024 05:15:50 -0700 Subject: [PATCH 08/21] remove extra line Signed-off-by: dimapihtar --- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 0b0b32d624d7..f009bdd1ff71 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -921,7 +921,6 @@ def training_step(self, dataloader_iter): self.log('loss_scale', loss_scale, batch_size=1) lr = self._optimizer.param_groups[0]['lr'] - self.log('lr', lr, rank_zero_only=True, batch_size=1) self.log( 'global_step', From 7fac9d36828d5e723193c12246eeb0b1ff8df770 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 4 Jun 2024 06:10:02 -0700 Subject: [PATCH 09/21] add reset_lr test Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 83 +++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 29e84b933f14..510283bf07f8 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4417,6 +4417,89 @@ jobs: # } # } + L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=3 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.precision=bf16 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.megatron_amp_O2=True \ + model.optim.name=distributed_fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=3 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.precision=bf16 \ + trainer.gradient_clip_val=1.0 \ + 
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.reset_lr=True \ + model.tensor_model_parallel_size=2 \ + model.megatron_amp_O2=True \ + model.optim.name=distributed_fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] runs-on: self-hosted-azure From 0604dc487541b38f76d3d38847af28e07b66b89a Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 4 Jun 2024 06:11:00 -0700 Subject: [PATCH 10/21] add reset_lr test Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 510283bf07f8..443eeafbc4ac 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -6608,6 +6608,7 @@ jobs: - L2_BioMegatron_Bert_NER_Task - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2 + - L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2 - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2 From 5a2d4c624179551c758b389118ea6107f676e8e7 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 5 Jun 2024 03:31:55 -0700 Subject: [PATCH 11/21] remove extra quote Signed-off-by: dimapihtar --- .github/workflows/cicd-main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 443eeafbc4ac..59478a307258 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4493,7 +4493,7 @@ jobs: model.hidden_size=256 \ model.num_attention_heads=8 \ model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings rm -rf examples/nlp/language_modeling/gpt_pretrain_results rm -rf examples/nlp/language_modeling/gpt_index_mappings From 47956e1068f2c14671f028e9edf2aba49a09f9af Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 10 Jun 2024 07:32:41 -0700 Subject: [PATCH 12/21] add ability to reset schedule's max_steps and decay_steps Signed-off-by: dimapihtar --- .../language_modeling/conf/megatron_gpt_config.yaml | 11 
+++++++---- .../models/language_modeling/megatron_gpt_model.py | 12 +++++++++--- nemo/core/optim/lr_scheduler.py | 4 ++-- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 34081b8d68e9..281bcb8022eb 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -15,7 +15,7 @@ trainer: use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 + log_every_n_steps: 1 val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 @@ -28,10 +28,10 @@ exp_manager: explicit_log_dir: null exp_dir: null name: megatron_gpt - create_wandb_logger: False + create_wandb_logger: True wandb_logger_kwargs: - project: null - name: null + project: reset_lr_test + name: test_run create_neptune_logger: false neptune_logger_kwargs: project: null @@ -103,7 +103,10 @@ model: batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + + # Reset learning rate schedule. reset_lr: False # Set to True to reset learning rate. + reset_lr_steps: False # Set to True to reset learning rate max_steps and decay_steps. 
tokenizer: library: 'megatron' diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index f009bdd1ff71..a260de20c005 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -394,7 +394,11 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.validation_param_sync_overlap = self.cfg.get('validation_param_sync_overlap', False) self.inference_params = None + + # Reset learning rate params self.if_init_step = True + self.reset_lr = self.cfg.get('reset_lr', False) + self.reset_lr_steps = self.cfg.get('reset_lr_steps', False) # default to false since this doesn't work with sequence parallelism currently self.use_loss_mask = self.cfg.get('use_loss_mask', False) @@ -763,9 +767,11 @@ def training_step(self, dataloader_iter): self.initialize_ub_func() # Reset learning rate - if self.if_init_step and self.cfg.get('reset_lr', False): - self._optimizer.param_groups[0]['num_steps'] = self.trainer.global_step - self._optimizer.param_groups[0]['reset_lr'] = True + if self.if_init_step and self.reset_lr: + self._optimizer.param_groups[0]['reset_lr'] = { + 'num_steps': self.trainer.global_step, + 'reset_lr_steps': True if self.reset_lr_steps else False, + } self.if_init_step = False if self.rampup_batch_size: diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index 1e99103ef63d..bbbcb46c6c98 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -290,9 +290,9 @@ def get_lr(self): # Reset learning rate if 'reset_lr' in self.optimizer.param_groups[0].keys(): - num_steps = self.optimizer.param_groups[0]['num_steps'] + num_steps = self.optimizer.param_groups[0]['reset_lr']['num_steps'] step -= num_steps - if self.first_step: + if self.first_step and self.optimizer.param_groups[0]['reset_lr']['reset_lr_steps']: self.decay_steps -= num_steps self.max_steps -= num_steps self.first_step = False From 61639095b830a06d1c01400af529b9b032a91a79 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 10 Jun 2024 14:33:49 +0000 Subject: [PATCH 13/21] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index a260de20c005..47b6dd90b0a7 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -769,7 +769,7 @@ def training_step(self, dataloader_iter): # Reset learning rate if self.if_init_step and self.reset_lr: self._optimizer.param_groups[0]['reset_lr'] = { - 'num_steps': self.trainer.global_step, + 'num_steps': self.trainer.global_step, 'reset_lr_steps': True if self.reset_lr_steps else False, } self.if_init_step = False From 4119a1d19b65020c633c9be87b1a37f467b3844a Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 10 Jun 2024 08:40:44 -0700 Subject: [PATCH 14/21] change scheduler's first step logic when using reset_lr Signed-off-by: dimapihtar --- .../nlp/language_modeling/conf/megatron_gpt_config.yaml | 8 ++++---- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git 
a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 281bcb8022eb..8687074c15f9 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -28,10 +28,10 @@ exp_manager: explicit_log_dir: null exp_dir: null name: megatron_gpt - create_wandb_logger: True + create_wandb_logger: False wandb_logger_kwargs: - project: reset_lr_test - name: test_run + project: null + name: null create_neptune_logger: false neptune_logger_kwargs: project: null @@ -125,7 +125,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 # Fusion diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 8240ac4a64ec..5877f9b2e273 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -769,6 +769,7 @@ def training_step(self, dataloader_iter): # Reset learning rate if self.if_init_step and self.reset_lr: + self._optimizer.param_groups[0]['lr'] = 0.0 if self.cfg.optim.sched.warmup_steps > 0 else self.cfg.optim.lr self._optimizer.param_groups[0]['reset_lr'] = { 'num_steps': self.trainer.global_step, 'reset_lr_steps': True if self.reset_lr_steps else False, From 92e7cf8a9056cabc8f7956924b9e5c9d81da387e Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 10 Jun 2024 08:42:11 -0700 Subject: [PATCH 15/21] revert config Signed-off-by: dimapihtar --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 8687074c15f9..c71b6e908b50 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -15,7 +15,7 @@ trainer: use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 1 + log_every_n_steps: 10 val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 @@ -125,7 +125,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 # Fusion From 5da92cd918d6b6f184473d8d0632effa5b1263ec Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 11 Jun 2024 06:22:48 -0700 Subject: [PATCH 16/21] fix reset_lr logic Signed-off-by: dimapihtar --- .../conf/megatron_gpt_config.yaml | 24 +++++++++---------- .../language_modeling/megatron_gpt_model.py | 7 +++++- nemo/core/optim/lr_scheduler.py | 8 +++---- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index c71b6e908b50..f7c86af57c76 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -9,13 +9,13 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 10 + log_every_n_steps: 1 val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 @@ -28,10 +28,10 @@ exp_manager: explicit_log_dir: null exp_dir: null name: megatron_gpt - create_wandb_logger: False + create_wandb_logger: True wandb_logger_kwargs: - project: null - name: null + project: reset_lr_test + name: test_run create_neptune_logger: false neptune_logger_kwargs: project: null @@ -56,7 +56,7 @@ exp_manager: model: # use GPTModel from megatron.core - mcore_gpt: False + mcore_gpt: True # specify micro_batch_size, global_batch_size, and model parallelism # gradient accumulation will be done automatically based on data_parallel_size @@ -105,8 +105,8 @@ model: num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. # Reset learning rate schedule. - reset_lr: False # Set to True to reset learning rate. - reset_lr_steps: False # Set to True to reset learning rate max_steps and decay_steps. + reset_lr: False # Set to True to reset learning rate. Only supported with distributed optmizer and megatron_amp_O2. + reset_lr_steps: False # Set to True to decrease learning rate's max_steps and decay_steps by number of previously used steps. 
tokenizer: library: 'megatron' @@ -125,7 +125,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 # Fusion @@ -271,7 +271,7 @@ model: gen_shape: False # Generate model and kernel details including input shapes optim: - name: fused_adam + name: distributed_fused_adam lr: 2e-4 weight_decay: 0.01 betas: @@ -279,8 +279,8 @@ model: - 0.98 sched: name: CosineAnnealing - warmup_steps: 500 - constant_steps: 0 + warmup_steps: 10 + constant_steps: 10 min_lr: 2e-5 gc_interval: 0 diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 5877f9b2e273..176cade9b6f0 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -399,6 +399,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.if_init_step = True self.reset_lr = self.cfg.get('reset_lr', False) self.reset_lr_steps = self.cfg.get('reset_lr_steps', False) + if self.reset_lr and (not self.with_distributed_adam or not self.megatron_amp_O2): + raise ValueError('Learning rate reset feature is only supported with the distributed optmizer and megatron_amp_O2 for now.') # default to false since this doesn't work with sequence parallelism currently self.use_loss_mask = self.cfg.get('use_loss_mask', False) @@ -769,10 +771,13 @@ def training_step(self, dataloader_iter): # Reset learning rate if self.if_init_step and self.reset_lr: - self._optimizer.param_groups[0]['lr'] = 0.0 if self.cfg.optim.sched.warmup_steps > 0 else self.cfg.optim.lr + num_groups = len(self._optimizer.param_groups) + for group in range(num_groups): + self._optimizer.param_groups[group]['lr'] = 0.0 if self.cfg.optim.sched.warmup_steps > 0 else self.cfg.optim.lr self._optimizer.param_groups[0]['reset_lr'] = { 'num_steps': self.trainer.global_step, 'reset_lr_steps': True if self.reset_lr_steps else False, + 'if_init_step': self.if_init_step, } self.if_init_step = False diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index bbbcb46c6c98..cfb3068b1cc8 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -277,7 +277,6 @@ def __init__( self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) self.min_lr = min_lr - self.first_step = True super().__init__(optimizer, last_epoch) def get_lr(self): @@ -290,12 +289,13 @@ def get_lr(self): # Reset learning rate if 'reset_lr' in self.optimizer.param_groups[0].keys(): - num_steps = self.optimizer.param_groups[0]['reset_lr']['num_steps'] + reset_lr = self.optimizer.param_groups[0]['reset_lr'] + num_steps = reset_lr['num_steps'] step -= num_steps - if self.first_step and self.optimizer.param_groups[0]['reset_lr']['reset_lr_steps']: + if reset_lr['if_init_step'] and reset_lr['reset_lr_steps']: self.decay_steps -= num_steps self.max_steps -= num_steps - self.first_step = False + self.optimizer.param_groups[0]['reset_lr']['if_init_step'] = False # Warmup steps if self.warmup_steps > 0 and step <= self.warmup_steps: From 7cfd47ad449ab597bf773e817719617360534534 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 11 Jun 2024 13:24:04 +0000 Subject: [PATCH 17/21] Apply 
isort and black reformatting Signed-off-by: dimapihtar --- .../nlp/models/language_modeling/megatron_gpt_model.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 176cade9b6f0..75f7ffbaab1b 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -400,7 +400,9 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.reset_lr = self.cfg.get('reset_lr', False) self.reset_lr_steps = self.cfg.get('reset_lr_steps', False) if self.reset_lr and (not self.with_distributed_adam or not self.megatron_amp_O2): - raise ValueError('Learning rate reset feature is only supported with the distributed optmizer and megatron_amp_O2 for now.') + raise ValueError( + 'Learning rate reset feature is only supported with the distributed optmizer and megatron_amp_O2 for now.' + ) # default to false since this doesn't work with sequence parallelism currently self.use_loss_mask = self.cfg.get('use_loss_mask', False) @@ -773,7 +775,9 @@ def training_step(self, dataloader_iter): if self.if_init_step and self.reset_lr: num_groups = len(self._optimizer.param_groups) for group in range(num_groups): - self._optimizer.param_groups[group]['lr'] = 0.0 if self.cfg.optim.sched.warmup_steps > 0 else self.cfg.optim.lr + self._optimizer.param_groups[group]['lr'] = ( + 0.0 if self.cfg.optim.sched.warmup_steps > 0 else self.cfg.optim.lr + ) self._optimizer.param_groups[0]['reset_lr'] = { 'num_steps': self.trainer.global_step, 'reset_lr_steps': True if self.reset_lr_steps else False, From 067c2645dd80c00b4d6c3f1fc5719007c0702e3b Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 11 Jun 2024 06:25:56 -0700 Subject: [PATCH 18/21] revert config Signed-off-by: dimapihtar --- .../language_modeling/conf/megatron_gpt_config.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index f7c86af57c76..0ba150d8e7b6 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -15,7 +15,7 @@ trainer: use_distributed_sampler: False max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches - log_every_n_steps: 1 + log_every_n_steps: 10 val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 @@ -28,10 +28,10 @@ exp_manager: explicit_log_dir: null exp_dir: null name: megatron_gpt - create_wandb_logger: True + create_wandb_logger: False wandb_logger_kwargs: - project: reset_lr_test - name: test_run + project: null + name: null create_neptune_logger: false neptune_logger_kwargs: project: null @@ -271,7 +271,7 @@ model: gen_shape: False # Generate model and kernel details including input shapes optim: - name: distributed_fused_adam + name: fused_adam lr: 2e-4 weight_decay: 0.01 betas: From 43ccac7377efc4c3f441205c3b90d2ba344d47c4 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 11 Jun 2024 06:27:30 -0700 Subject: [PATCH 19/21] revert config Signed-off-by: dimapihtar --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 0ba150d8e7b6..2cf512477a92 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -279,8 +279,8 @@ model: - 0.98 sched: name: CosineAnnealing - warmup_steps: 10 - constant_steps: 10 + warmup_steps: 500 + constant_steps: 0 min_lr: 2e-5 gc_interval: 0 From 0d91dcd446ebf2d0b7209acc23d0baa49c201e2d Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 25 Jun 2024 04:04:52 -0700 Subject: [PATCH 20/21] update reset_lr comments Signed-off-by: dimapihtar --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 2cf512477a92..2ce1d478a642 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -105,8 +105,8 @@ model: num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. # Reset learning rate schedule. - reset_lr: False # Set to True to reset learning rate. Only supported with distributed optmizer and megatron_amp_O2. - reset_lr_steps: False # Set to True to decrease learning rate's max_steps and decay_steps by number of previously used steps. + reset_lr: False # Set to True to reset learning rate to initial learning rate. Only supported with distributed optmizer and megatron_amp_O2. + reset_lr_steps: False # Set to True to adjust learning rate's max_steps and decay_steps by subtracting number of steps already completed at the checkpoint. 
tokenizer: library: 'megatron' From ce4200a0ee3fc840a92a3b6243a832f3dbb8a540 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 25 Jun 2024 11:10:25 -0700 Subject: [PATCH 21/21] add use cases for reset_lr feature Signed-off-by: dimapihtar --- .../nlp/language_modeling/conf/megatron_gpt_config.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 8e313827efaa..8c6d97821222 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -115,7 +115,11 @@ model: seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. - # Reset learning rate schedule. + ## Reset learning rate schedule. + # 1. reset_lr=True, reset_lr_steps=False. When pre-training an existing checkpoint "from scratch" on a different dataset. + # 2. reset_lr=True, reset_lr_steps=True. When continuing training from an existing checkpoint with the same configuration. + # Learning rate's max_steps and decay_steps will be recalculated as follows: max_steps -= completed_steps, decay_steps -= completed_steps where completed_steps is the number of steps already completed at the checkpoint. + # This will help to reach the min_lr value by the end of training without changing trainer.max_steps. reset_lr: False # Set to True to reset learning rate to initial learning rate. Only supported with distributed optmizer and megatron_amp_O2. reset_lr_steps: False # Set to True to adjust learning rate's max_steps and decay_steps by subtracting number of steps already completed at the checkpoint.
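For reference, below is a minimal standalone sketch of the step arithmetic this patch series introduces, matching the `reset_lr` handling added to `MegatronGPTModel.training_step` and the offset logic in `WarmupAnnealHoldPolicy.get_lr` above. It is an illustration only: the function and variable names here (e.g. `effective_schedule_step`, `completed_steps`) are not part of the PR, and the real code reads `num_steps` and the `reset_lr_steps` flag out of `optimizer.param_groups[0]['reset_lr']`.

# Sketch of the reset_lr step arithmetic (illustrative; not the PR's actual API).
def effective_schedule_step(global_step, completed_steps, max_steps, decay_steps,
                            reset_lr=False, reset_lr_steps=False):
    """Return (step, max_steps, decay_steps) as CosineAnnealing would see them.

    completed_steps is trainer.global_step recorded when resuming from the checkpoint.
    """
    if not reset_lr:
        # Default behaviour: the schedule continues from the checkpointed step.
        return global_step, max_steps, decay_steps

    # reset_lr=True: warmup and annealing restart from step 0 at the resume point.
    step = global_step - completed_steps

    if reset_lr_steps:
        # reset_lr_steps=True: shrink the schedule horizon so min_lr is still
        # reached by trainer.max_steps without changing the trainer config.
        max_steps -= completed_steps
        decay_steps -= completed_steps

    return step, max_steps, decay_steps


# Example: resuming at global step 100 of a 100k-step run with both flags enabled.
print(effective_schedule_step(global_step=100, completed_steps=100,
                              max_steps=100_000, decay_steps=99_500,
                              reset_lr=True, reset_lr_steps=True))
# -> (0, 99900, 99400): warmup restarts, and the cosine decay still ends at the
#    original trainer.max_steps, matching the use cases described in the config comments.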