Commit

linting fix
Signed-off-by: Sangkug Lym <slym@nvidia.com>
erhoo82 committed Feb 12, 2025
1 parent 8e9e655 commit 783f67d
Showing 2 changed files with 72 additions and 52 deletions.
@@ -101,15 +101,17 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True):

if not HAVE_MEGATRON_CORE:
raise ImportError(
"megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
"megatron-core was not found. Please see the NeMo README for installation instructions: "
"https://github.com/NVIDIA/NeMo#megatron-gpt."
)

if trainer is None:
raise ValueError(f"Trainer cannot be None for Megatron-based models. Please provide a PTL trainer object.")
raise ValueError("Trainer cannot be None for Megatron-based models. Please provide a PTL trainer object.")

if cfg.get('use_flash_attention', False) and not HAVE_FLASH_ATTENTION:
raise ImportError(
"flash_attn was not found. Please see the installation instructions: https://github.com/HazyResearch/flash-attention."
"flash_attn was not found. Please see the installation instructions: "
"https://github.com/HazyResearch/flash-attention."
"If you use flash_attn with triton. Please install triton==2.0.0.dev20221202."
)

@@ -256,7 +258,7 @@ def setup_transformer_engine_tp_groups(self):
"""
for module in self.get_model_module_list():
"""Set TP group
- Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398
+ Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398 # pylint: disable=line-too-long
"""
# Deep iterate but skip self to avoid infinite recursion.
for index, child in enumerate(module.modules()):
@@ -274,7 +276,7 @@ def setup_transformer_engine_cp_groups(self):

for module in self.get_model_module_list():
"""Set context parallel running
- Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py
+ Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py # pylint: disable=line-too-long
"""
# Deep iterate but skip self to avoid infinite recursion.
for index, child in enumerate(module.modules()):
@@ -349,7 +351,8 @@ def _reconfigure_limit_batches(self, limit_batches, dataloader, mode):
"""
Reconfigure trainer.limit_val_batches for pretraining
"""
- # Override limit_batches in terms of num microbatches and so there are limit_batches//num_micro_batches num of global batches
+ # Override limit_batches in terms of the number of microbatches,
+ # so there are limit_batches // num_micro_batches global batches
if isinstance(limit_batches, int):
limit_batches *= get_num_microbatches()
else:
@@ -618,7 +621,8 @@ def _vocab_size_with_padding(self, orig_vocab_size, make_vocab_size_divisible_by
multiple = make_vocab_size_divisible_by * tensor_model_parallel_size
after = ((after + multiple - 1) // multiple) * multiple
logging.info(
- f'Padded vocab_size: {after}, original vocab_size: {orig_vocab_size}, dummy tokens: {after - orig_vocab_size}.'
+ f"Padded vocab_size: {after}, original vocab_size: {orig_vocab_size}, "
+ f"dummy tokens: {after - orig_vocab_size}."
)
return after

@@ -673,7 +677,7 @@ def configure_gradient_clipping(self, *args, **kwargs):

def allreduce_gradients(self):
"""Reduce gradients across data parallel ranks.
- Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/model/distributed.py#L188
+ Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/model/distributed.py#L188 # pylint: disable=line-too-long
"""
# Bucketize and all-reduce
buckets = {}
@@ -845,7 +849,8 @@ def configure_optimizers(self):
# TODO: contiguous grad bucket for fp16 is also planned to be supported
contiguous_grad_bucket = False
raise ValueError(
"fp16 training is not yet supported with O2. Please set megatron_amp_O2 to False in the model config."
"fp16 training is not yet supported with O2."
"Please set megatron_amp_O2 to False in the model config."
)

# if using tensor parallel only, we automatically use async grad all-reduce
@@ -983,7 +988,8 @@ def _validate_and_override_config(self):

if self.cfg.get('sequence_parallel', False) and self.cfg.get('tensor_model_parallel_size', 1) == 1:
logging.info(
"Sequence parallel should only be used with tensor parallel size > 1. Setting sequence parallel to False"
"Sequence parallel should only be used with tensor parallel size > 1. "
"Setting sequence parallel to False"
)
with open_dict(self.cfg):
self.cfg.sequence_parallel = False
@@ -1002,7 +1008,8 @@ def _validate_and_override_config(self):
if self.cfg.get('gradient_accumulation_fusion', False):
if data_parallel_size > 1 and pipeline_model_parallel_size == 1 and not distributed_fused_adam:
logging.info(
"When not using pipeline model parallel, gradient accumulation fusion can only be used with distributed_fused_adam."
"When not using pipeline model parallel, "
"gradient accumulation fusion can only be used with distributed_fused_adam."
)
with open_dict(self.cfg):
self.cfg.gradient_accumulation_fusion = False
@@ -1123,7 +1130,8 @@ def _get_total_params_across_model_parallel_groups_enc_dec(self, model):
parallel_state.get_pipeline_model_parallel_rank() == self.cfg.get('pipeline_model_parallel_split_rank', 0)
or parallel_state.is_pipeline_last_stage()
):
- # If the current rank is the in the decoder first stage (decoder emb) or last rank (output layer), subtract those weights since it is already accounted for in the encoder first stage.
+ # If the current rank is in the decoder first stage (decoder emb) or the last rank (output layer),
+ # subtract those weights since they are already accounted for in the encoder first stage.
# TODO: If we support embedding untying with PP > 1, we will need to update this.
num_word_embedding_parameters = sum([p.nelement() for p in model.word_embeddings_weight()])
num_parameters_on_device -= num_word_embedding_parameters
@@ -1180,7 +1188,7 @@ def build_model_parallel_config(self) -> ModelParallelConfig:
config_mapping = {
"perform_initialization": True, # initailize weights when constructing the module
"fp16": self.torch_dtype == torch.float16
- and megatron_amp_O2, # NeMo does not currently support fp16 training with megatron amp O2, eval and inference is supported
+ and megatron_amp_O2, # fp16 training with megatron amp O2 is not supported; eval and inference are supported
"bf16": self.torch_dtype == torch.bfloat16 and megatron_amp_O2,
"params_dtype": self.params_dtype,
"timers": self.megatron_timers,
@@ -1229,7 +1237,8 @@ def build_model_parallel_config(self) -> ModelParallelConfig:
setattr(model_parallel_config, 'hidden_size', self.cfg.hidden_size)
except AttributeError:
logging.warning(
- f'hidden_size not found in {self.cfg}. Set this in model_parallel_config if using pipeline parallelism.'
+ f'hidden_size not found in {self.cfg}. '
+ 'Set this in model_parallel_config if using pipeline parallelism.'
)

return model_parallel_config
@@ -1312,7 +1321,8 @@ def find_frozen_submodules(model):
logging.debug(f"Ignoring state {submodule} in FSDP.")
self.trainer.strategy.kwargs['ignored_states'] = frozen_submodules
# FSDP requires uniform status of require_grads
- # Diffusion models like SD has frozen parts and needs to be added to 'ignored_states' from sharding for FSDP to work
+ # Diffusion models like SD have frozen parts that need to be added to 'ignored_states',
+ # excluding them from sharding, for FSDP to work
self.model = self.trainer.strategy._setup_model(self.model)
# Move the CPU-initialized model (with `use_cpu_initialization=True`) to GPU, which is to avoid
out-of-memory crash before sharding. For a GPU-initialized model, this is a no-op.
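
The fix applied throughout this diff relies on Python's implicit concatenation of adjacent string literals: a long message is split across source lines so each line stays within the linter's length limit, without changing the resulting string. Lines that cannot be split cleanly (long URLs inside docstrings) instead get a trailing # pylint: disable=line-too-long pragma. A minimal sketch of the string-splitting pattern, using hypothetical names rather than NeMo's own:

# Minimal sketch (hypothetical names, not from NeMo): adjacent string literals
# are concatenated at compile time, so the raised message is identical to the
# single long string it replaces.
HAVE_EXAMPLE_DEP = False  # hypothetical flag, standing in for HAVE_MEGATRON_CORE


def check_dependency():
    if not HAVE_EXAMPLE_DEP:
        raise ImportError(
            "example-dep was not found. Please see the installation instructions: "
            "https://example.com/install."
        )


try:
    check_dependency()
except ImportError as err:
    print(err)  # one continuous message; the split literals join seamlessly

This is also why the split strings above keep a trailing space before the closing quote of the first literal: without it, the joined message would run two words together.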