Commit

linting fix
Signed-off-by: Sangkug Lym <slym@nvidia.com>
erhoo82 committed Feb 12, 2025
1 parent 8e9e655 commit 783f67d
Showing 2 changed files with 72 additions and 52 deletions.
@@ -101,15 +101,17 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True):

if not HAVE_MEGATRON_CORE:
raise ImportError(
"megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
"megatron-core was not found. Please see the NeMo README for installation instructions: "
"https://github.com/NVIDIA/NeMo#megatron-gpt."
)

if trainer is None:
raise ValueError(f"Trainer cannot be None for Megatron-based models. Please provide a PTL trainer object.")
raise ValueError("Trainer cannot be None for Megatron-based models. Please provide a PTL trainer object.")

if cfg.get('use_flash_attention', False) and not HAVE_FLASH_ATTENTION:
raise ImportError(
"flash_attn was not found. Please see the installation instructions: https://github.com/HazyResearch/flash-attention."
"flash_attn was not found. Please see the installation instructions: "
"https://github.com/HazyResearch/flash-attention."
"If you use flash_attn with triton. Please install triton==2.0.0.dev20221202."
)

@@ -256,7 +258,7 @@ def setup_transformer_engine_tp_groups(self):
"""
for module in self.get_model_module_list():
"""Set TP group
- Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398
+ Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398 # pylint: disable=line-too-long
"""
# Deep iterate but skip self to avoid infinite recursion.
for index, child in enumerate(module.modules()):
@@ -274,7 +276,7 @@ def setup_transformer_engine_cp_groups(self):

for module in self.get_model_module_list():
"""Set context parallel running
- Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py
+ Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py # pylint: disable=line-too-long
"""
# Deep iterate but skip self to avoid infinite recursion.
for index, child in enumerate(module.modules()):
@@ -349,7 +351,8 @@ def _reconfigure_limit_batches(self, limit_batches, dataloader, mode):
"""
Reconfigure trainer.limit_val_batches for pretraining
"""
- # Override limit_batches in terms of num microbatches and so there are limit_batches//num_micro_batches num of global batches
+ # Override limit_batches in terms of the number of microbatches,
+ # so there are limit_batches // num_micro_batches global batches
if isinstance(limit_batches, int):
limit_batches *= get_num_microbatches()
else:
@@ -618,7 +621,8 @@ def _vocab_size_with_padding(self, orig_vocab_size, make_vocab_size_divisible_by
multiple = make_vocab_size_divisible_by * tensor_model_parallel_size
after = ((after + multiple - 1) // multiple) * multiple
logging.info(
- f'Padded vocab_size: {after}, original vocab_size: {orig_vocab_size}, dummy tokens: {after - orig_vocab_size}.'
+ f"Padded vocab_size: {after}, original vocab_size: {orig_vocab_size}, "
+ f"dummy tokens: {after - orig_vocab_size}."
)
return after

@@ -673,7 +677,7 @@ def configure_gradient_clipping(self, *args, **kwargs):

def allreduce_gradients(self):
"""Reduce gradients across data parallel ranks.
- Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/model/distributed.py#L188
+ Modified from megatron-lm: https://github.com/NVIDIA/Megatron-LM/blob/d41696840ed0a7edb7e0499eb82a48ae112d9bb3/megatron/model/distributed.py#L188 # pylint: disable=line-too-long
"""
# Bucketize and all-reduce
buckets = {}
@@ -845,7 +849,8 @@ def configure_optimizers(self):
# TODO: contiguous grad bucket for fp16 is also planned to be supported
contiguous_grad_bucket = False
raise ValueError(
"fp16 training is not yet supported with O2. Please set megatron_amp_O2 to False in the model config."
"fp16 training is not yet supported with O2."
"Please set megatron_amp_O2 to False in the model config."
)

# if using tensor parallel only, we automatically use async grad all-reduce
@@ -983,7 +988,8 @@ def _validate_and_override_config(self):

if self.cfg.get('sequence_parallel', False) and self.cfg.get('tensor_model_parallel_size', 1) == 1:
logging.info(
"Sequence parallel should only be used with tensor parallel size > 1. Setting sequence parallel to False"
"Sequence parallel should only be used with tensor parallel size > 1. "
"Setting sequence parallel to False"
)
with open_dict(self.cfg):
self.cfg.sequence_parallel = False
@@ -1002,7 +1008,8 @@ def _validate_and_override_config(self):
if self.cfg.get('gradient_accumulation_fusion', False):
if data_parallel_size > 1 and pipeline_model_parallel_size == 1 and not distributed_fused_adam:
logging.info(
"When not using pipeline model parallel, gradient accumulation fusion can only be used with distributed_fused_adam."
"When not using pipeline model parallel, "
"gradient accumulation fusion can only be used with distributed_fused_adam."
)
with open_dict(self.cfg):
self.cfg.gradient_accumulation_fusion = False
@@ -1123,7 +1130,8 @@ def _get_total_params_across_model_parallel_groups_enc_dec(self, model):
parallel_state.get_pipeline_model_parallel_rank() == self.cfg.get('pipeline_model_parallel_split_rank', 0)
or parallel_state.is_pipeline_last_stage()
):
- # If the current rank is the in the decoder first stage (decoder emb) or last rank (output layer), subtract those weights since it is already accounted for in the encoder first stage.
+ # If the current rank is in the decoder first stage (decoder emb) or the last rank (output layer),
+ # subtract those weights since they are already accounted for in the encoder first stage.
# TODO: If we support embedding untying with PP > 1, we will need to update this.
num_word_embedding_parameters = sum([p.nelement() for p in model.word_embeddings_weight()])
num_parameters_on_device -= num_word_embedding_parameters
@@ -1180,7 +1188,7 @@ def build_model_parallel_config(self) -> ModelParallelConfig:
config_mapping = {
"perform_initialization": True, # initailize weights when constructing the module
"fp16": self.torch_dtype == torch.float16
- and megatron_amp_O2, # NeMo does not currently support fp16 training with megatron amp O2, eval and inference is supported
+ and megatron_amp_O2, # fp16 training with megatron amp O2 is not supported; eval and inference are supported
"bf16": self.torch_dtype == torch.bfloat16 and megatron_amp_O2,
"params_dtype": self.params_dtype,
"timers": self.megatron_timers,
@@ -1229,7 +1237,8 @@ def build_model_parallel_config(self) -> ModelParallelConfig:
setattr(model_parallel_config, 'hidden_size', self.cfg.hidden_size)
except AttributeError:
logging.warning(
- f'hidden_size not found in {self.cfg}. Set this in model_parallel_config if using pipeline parallelism.'
+ f'hidden_size not found in {self.cfg}. '
+ 'Set this in model_parallel_config if using pipeline parallelism.'
)

return model_parallel_config
@@ -1312,7 +1321,8 @@ def find_frozen_submodules(model):
logging.debug(f"Ignoring state {submodule} in FSDP.")
self.trainer.strategy.kwargs['ignored_states'] = frozen_submodules
# FSDP requires uniform status of require_grads
- # Diffusion models like SD has frozen parts and needs to be added to 'ignored_states' from sharding for FSDP to work
+ # Diffusion models like SD have frozen parts that need to be added to 'ignored_states',
+ # excluding them from sharding, for FSDP to work
self.model = self.trainer.strategy._setup_model(self.model)
# Move the CPU-initialized model (with `use_cpu_initialization=True`) to GPU, which is to avoid
out-of-memory crash before sharding. For a GPU-initialized model, this is a no-op.
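
The fix applied throughout this diff relies on Python's implicit concatenation of adjacent string literals: a long message is split across source lines so each line stays within the linter's length limit, without changing the resulting string. Lines that cannot be split cleanly (long URLs inside docstrings) instead get a trailing # pylint: disable=line-too-long pragma. A minimal sketch of the string-splitting pattern, using hypothetical names rather than NeMo's own:

# Minimal sketch (hypothetical names, not from NeMo): adjacent string literals
# are concatenated at compile time, so the raised message is identical to the
# single long string it replaces.
HAVE_EXAMPLE_DEP = False  # hypothetical flag, standing in for HAVE_MEGATRON_CORE


def check_dependency():
    if not HAVE_EXAMPLE_DEP:
        raise ImportError(
            "example-dep was not found. Please see the installation instructions: "
            "https://example.com/install."
        )


try:
    check_dependency()
except ImportError as err:
    print(err)  # one continuous message; the split literals join seamlessly

This is also why the split strings above keep a trailing space before the closing quote of the first literal: without it, the joined message would run two words together.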