NVIDIA · ericharper · Jan 31, 2023 · Dec 21, 2022 · Dec 23, 2022 · Jan 5, 2023
diff --git a/nemo/collections/nlp/modules/common/megatron/clip_grads.py b/nemo/collections/nlp/modules/common/megatron/clip_grads.py
@@ -190,6 +190,8 @@ def clip_grad_norm_distributed_optimizer(optimizer, max_norm, norm_type=2):
     # Compute grad norm
     # Note: Compute norm of local grads and sum over all procs
     grad_norm_sq = optimizer._local_grad_norm(parameters=params_for_norm, norm_type=norm_type)
+    if optimizer.redundant_size > 1:
+        grad_norm_sq /= optimizer.redundant_size
     torch.distributed.all_reduce(
         grad_norm_sq, op=torch.distributed.ReduceOp.SUM,
     )

diff --git a/nemo/core/optim/distributed_adam.py b/nemo/core/optim/distributed_adam.py
@@ -12,16 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import torch
 from apex.contrib.optimizers.distributed_fused_adam import DistributedFusedAdam
 from apex.transformer import parallel_state
 
 
 # Wrapper class that supports main_grad buffer
 # Note: main_grad buffer is used for O2-style optimizations
 class MegatronDistributedFusedAdam(DistributedFusedAdam):
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, disable_distributed_parameters=False, **kwargs):
         if 'process_group' not in kwargs and not parallel_state.is_unitialized():
             kwargs['process_group'] = parallel_state.get_data_parallel_group()
+        if disable_distributed_parameters:  # opt-in: give each rank its own 1-rank group
+            world_size = torch.distributed.get_world_size()  # no group arg: default group
+            rank = torch.distributed.get_rank()  # no group arg: default group
+            self_groups = [torch.distributed.new_group(ranks=[i]) for i in range(world_size)]
+            kwargs['distributed_process_group'] = self_groups[rank]  # this rank's own group
+            kwargs['redundant_process_group'] = kwargs['process_group']  # NOTE(review): KeyError if unset
         super().__init__(*args, **kwargs)
 
     def _make_post_backward_hook(self, param, param_group_id, param_id):