Merge branch 'main' into fix_missing_pip_package
RobinDong authored Sep 12, 2023
2 parents 563d40c + 29c03dd commit bc60bcf
Showing 64 changed files with 3,493 additions and 578 deletions.
8 changes: 4 additions & 4 deletions Dockerfile
@@ -46,16 +46,16 @@ WORKDIR /workspace/

WORKDIR /tmp/

# DP independent checkpoint format for distributed adam
# Distributed Adam support for multiple dtypes
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
git checkout 7995de18677295c5edeeab082179edbfdb6ee16a && \
pip3 install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./
git checkout 52e18c894223800cb611682dce27d88050edf1de && \
pip3 install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./

# install megatron core, this can be removed once 0.3 pip package is released
RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git checkout 01c8704453af7e26134441224c8a351746ca0349 && \
git checkout ab0336a5c8eab77aa74ae604ba1e73decbf6d560 && \
pip install -e .

# uninstall stuff from base container
13 changes: 6 additions & 7 deletions Jenkinsfile
@@ -59,15 +59,15 @@ pipeline {

stage('Megatron Core installation') {
steps {
// pinned MCore https://github.com/NVIDIA/Megatron-LM/commit/01c8704453af7e26134441224c8a351746ca0349
// pinned MCore https://github.com/NVIDIA/Megatron-LM/commit/ab0336a5c8eab77aa74ae604ba1e73decbf6d560
// ToT for 23.08 branch
sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git checkout 01c8704453af7e26134441224c8a351746ca0349 && \
git checkout ab0336a5c8eab77aa74ae604ba1e73decbf6d560 && \
pip install -e .'
}
}


stage('PyTorch Lightning version') {
steps {
@@ -120,7 +120,6 @@ pipeline {
sh 'CUDA_VISIBLE_DEVICES=0 python scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py \
--in-file=/home/TestData/nlp/megatron_llama/llama-ci-hf \
--out-file=/home/TestData/nlp/megatron_llama/ci.nemo \
--fast-swiglu \
--precision=16'
sh 'rm -f /home/TestData/nlp/megatron_llama/ci.nemo'
}
@@ -839,7 +838,7 @@ pipeline {
}
}
}

stage('L2: Speech Transcription') {
when {
anyOf {
@@ -3374,7 +3373,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings"
}
}

// This test requires Ampere but some of the test GPUs are Volta
// Need to add a check for compute capability before uncommenting this test
// stage('L2: Megatron GPT with Rope Pretraining using Flash Attention and Resume Training TP=2') {
@@ -3937,7 +3936,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
rm -rf examples/nlp/language_modeling/out.jsonl"
}
}

// TODO: Add this test back. Test was failing on CI machines due to HW error
// stage('L2: Megatron GPT Convert from Megatron-LM checkpoint and Eval') {
// when {
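The Jenkinsfile above leaves the 'Megatron GPT with Rope Pretraining using Flash Attention' stage commented out until a compute-capability check is added. A minimal sketch of such a guard, assuming PyTorch is available on the CI node (none of this is part of the current pipeline):

# Hypothetical pre-check: only run Ampere-only stages when the GPU reports
# compute capability >= 8.0 (Volta reports 7.0).
CAP=$(python -c "import torch; major, minor = torch.cuda.get_device_capability(); print(major * 10 + minor)")
if [ "$CAP" -ge 80 ]; then
    echo "Compute capability ${CAP}: Flash Attention stages can run"
else
    echo "Compute capability ${CAP}: skipping Ampere-only stages"
fi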
22 changes: 11 additions & 11 deletions README.rst
@@ -41,14 +41,14 @@
Introduction
------------

NVIDIA NeMo is a conversational AI toolkit built for researchers working on automatic speech recognition (ASR),
text-to-speech synthesis (TTS), large language models (LLMs), and
NVIDIA NeMo is a conversational AI toolkit built for researchers working on automatic speech recognition (ASR),
text-to-speech synthesis (TTS), large language models (LLMs), and
natural language processing (NLP).
The primary objective of NeMo is to help researchers from industry and academia to reuse prior work (code and pretrained models)
The primary objective of NeMo is to help researchers from industry and academia to reuse prior work (code and pretrained models)
and make it easier to create new `conversational AI models <https://developer.nvidia.com/conversational-ai#started>`_.

All NeMo models are trained with `Lightning <https://github.com/Lightning-AI/lightning>`_ and
training is automatically scalable to 1000s of GPUs.
All NeMo models are trained with `Lightning <https://github.com/Lightning-AI/lightning>`_ and
training is automatically scalable to 1000s of GPUs.
Additionally, NeMo Megatron LLM models can be trained up to 1 trillion parameters using tensor and pipeline model parallelism.
NeMo models can be optimized for inference and deployed for production use-cases with `NVIDIA Riva <https://developer.nvidia.com/riva>`_.

@@ -57,14 +57,14 @@ State of the Art pretrained NeMo models are freely available on `HuggingFace Hub
`NVIDIA NGC <https://catalog.ngc.nvidia.com/models?query=nemo&orderBy=weightPopularDESC>`_.
These models can be used to transcribe audio, synthesize speech, or translate text in just a few lines of code.

We have extensive `tutorials <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/starthere/tutorials.html>`_ that
We have extensive `tutorials <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/starthere/tutorials.html>`_ that
can all be run on `Google Colab <https://colab.research.google.com>`_.

For advanced users that want to train NeMo models from scratch or finetune existing NeMo models
For advanced users that want to train NeMo models from scratch or finetune existing NeMo models
we have a full suite of `example scripts <https://github.com/NVIDIA/NeMo/tree/main/examples>`_ that support multi-GPU/multi-node training.

For scaling NeMo LLM training on Slurm clusters or public clouds, please see the `NVIDIA NeMo Megatron Launcher <https://github.com/NVIDIA/NeMo-Megatron-Launcher>`_.
The NM launcher has extensive recipes, scripts, utilities, and documentation for training NeMo LLMs and also has an `Autoconfigurator <https://github.com/NVIDIA/NeMo-Megatron-Launcher#53-using-autoconfigurator-to-find-the-optimal-configuration>`_
The NM launcher has extensive recipes, scripts, utilities, and documentation for training NeMo LLMs and also has an `Autoconfigurator <https://github.com/NVIDIA/NeMo-Megatron-Launcher#53-using-autoconfigurator-to-find-the-optimal-configuration>`_
which can be used to find the optimal model parallel configuration for training on a specific cluster.

Also see our `introductory video <https://www.youtube.com/embed/wBgpMf_KQVw>`_ for a high level overview of NeMo.
@@ -247,7 +247,7 @@ To install Apex, run
git clone https://github.com/NVIDIA/apex.git
cd apex
git checkout 7995de18677295c5edeeab082179edbfdb6ee16a
git checkout 52e18c894223800cb611682dce27d88050edf1de
pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./
It is highly recommended to use the NVIDIA PyTorch or NeMo container if having issues installing Apex or any other dependencies.
@@ -287,7 +287,7 @@ Transformer Engine requires PyTorch to be built with CUDA 11.8.

Flash Attention
~~~~~~~~~~~~~~~~~~~~
Transformer Engine already supports Flash Attention for GPT models. If you want to use Flash Attention for non-causal models or use with attention bias (introduced from position encoding, e.g. Alibi), please install `flash-attn <https://github.com/HazyResearch/flash-attention>`_.
Transformer Engine already supports Flash Attention for GPT models. If you want to use Flash Attention for non-causal models or use with attention bias (introduced from position encoding, e.g. Alibi), please install `flash-attn <https://github.com/HazyResearch/flash-attention>`_.

.. code-block:: bash
@@ -296,7 +296,7 @@ Transformer Engine already supports Flash Attention for GPT models. If you want
NLP inference UI
~~~~~~~~~~~~~~~~~~~~
To launch the inference web UI server, please install the gradio `gradio <https://gradio.app/>`_.
To launch the inference web UI server, please install the gradio `gradio <https://gradio.app/>`_.

.. code-block:: bash
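The Flash Attention and NLP inference UI notes in the README above each point at an install command; a minimal sketch, assuming the standard flash-attn and gradio PyPI package names:

.. code-block:: bash

    pip install flash-attn
    pip install gradio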
3 changes: 3 additions & 0 deletions docs/source/starthere/tutorials.rst
@@ -190,6 +190,9 @@ To run a tutorial:
* - Tools
- NeMo Forced Aligner
- `NeMo Forced Aligner <https://colab.research.google.com/github/NVIDIA/NeMo/blob/main/tutorials/tools/NeMo_Forced_Aligner_Tutorial.ipynb>`_
* - Tools
- Speech Data Explorer
- `Speech Data Explorer <https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/tools/SDE_HowTo_v2.ipynb>`_
* - Tools
- CTC Segmentation
- `CTC Segmentation <https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/tools/CTC_Segmentation_Tutorial.ipynb>`_
@@ -17,6 +17,8 @@ trainer:
accelerator: gpu
logger: False # logger provided by exp_manager
precision: 16 # 16, 32, or bf16
use_distributed_sampler: False


tensor_model_parallel_size: -1
pipeline_model_parallel_size: -1
@@ -17,6 +17,7 @@ trainer:
accelerator: gpu
logger: False # logger provided by exp_manager
precision: 32 # 16, 32, or bf16
use_distributed_sampler: False

tensor_model_parallel_size: -1
pipeline_model_parallel_size: -1
@@ -0,0 +1 @@
# dummy file to build hydra configs
@@ -0,0 +1 @@
# dummy file to build hydra configs
@@ -0,0 +1 @@
# dummy file to build hydra configs
@@ -0,0 +1 @@
# dummy file to build hydra configs
@@ -0,0 +1 @@
# dummy file to build hydra configs
@@ -0,0 +1 @@
# dummy file to build hydra configs
9 changes: 4 additions & 5 deletions examples/nlp/language_modeling/megatron_bart_pretraining.py
@@ -21,6 +21,7 @@

from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel
from nemo.collections.nlp.parts.nlp_overrides import (
CustomProgressBar,
GradScaler,
MegatronHalfPrecisionPlugin,
NLPDDPStrategy,
@@ -63,7 +64,9 @@ def main(cfg) -> None:
if cfg.get('cluster_type', None) == 'BCP':
plugins.append(TorchElasticEnvironment())

trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)])
trainer = Trainer(
plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[ModelSummary(max_depth=3), CustomProgressBar()]
)

exp_manager(trainer, cfg.exp_manager)

@@ -73,10 +76,6 @@ def main(cfg) -> None:

logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')

# hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
with open_dict(cfg):
cfg.model.precision = cfg.trainer.precision

model = MegatronBARTModel(cfg.model, trainer)
trainer.fit(model)

4 changes: 0 additions & 4 deletions examples/nlp/language_modeling/megatron_bert_pretraining.py
@@ -33,10 +33,6 @@ def main(cfg) -> None:
trainer = MegatronBertTrainerBuilder(cfg).create_trainer()
exp_manager(trainer, cfg.exp_manager)

# hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
with open_dict(cfg):
cfg.model.precision = cfg.trainer.precision

model = MegatronBertModel(cfg.model, trainer)

trainer.fit(model)
43 changes: 41 additions & 2 deletions examples/nlp/language_modeling/megatron_change_num_partitions.py
@@ -13,6 +13,8 @@
# limitations under the License.

import os
import shutil
import tarfile
import tempfile
from argparse import ArgumentParser
from typing import Dict, List
@@ -233,7 +235,7 @@ def compute_tp_splits(
for i in range(tp_size):
tp_qkv = torch.cat([tp_qkv_splits[item] for item in range(i, tp_size * 2, tp_size)])
split.append(tp_qkv)
elif ('dense_h_to_4h.weight' in param_name or 'linear_fc1.weight' in param_name) and fast_glu_activation:
elif ('dense_h_to_4h' in param_name or 'linear_fc1' in param_name) and fast_glu_activation:
# For Megatron GPT model with Fast Glu activation
# Handle gated linear units
# concat all the first halves ('W's) and all the second halves ('V's)
@@ -275,7 +277,7 @@ def compute_tp_merge(idx, name, param, partitions_pp, model_cfg):
concated = torch.cat([partitions_pp[i][idx].data for i in range(len(partitions_pp))], dim=0)

# Logic for Fast Glu activation
if 'dense_h_to_4h.weight' in name and fast_glu_activation:
if 'dense_h_to_4h' in name and fast_glu_activation:
# concat all the first halves ('W's) and all the second halves ('V's)
wk_splits = []
for tpr in range(len(partitions_pp)):
@@ -977,6 +979,31 @@ def main():
if vp_size > 1:
set_virtual_parallel_rank_safely(0)

# Extract tokenizer artifact from the model to temp directory
logging.info("Extracting tokenizer artifact from NeMo file...")
temp_dir = tempfile.mkdtemp()
tokenizer_model_path = None
with tarfile.open(args.model_file, "r") as tar:
for member in tar.getmembers():
if '.model' in member.name:
extracted_file = tar.extractfile(member)
extracted_file_path = os.path.join(temp_dir, member.name)

if tokenizer_model_path is None:
logging.info(f"Found tokenizer. Extracting {member.name} to {extracted_file_path}")

tokenizer_model_path = extracted_file_path
with open(extracted_file_path, "wb") as f:
f.write(extracted_file.read())
else:
if args.tokenizer_model_path is None:
logging.warning(
f"\n\nFound multiple tokenizer artifacts in the model file.\n"
f"Using only {tokenizer_model_path}.\n"
f"If this is incorrect, manually pass the correct tokenizer using "
f"`--tokenizer_model_path`.\n\n"
)

# If input model has TP > 1 or PP > 1
# Reconstruct the model to have TP = 1 and PP = 1
# Note that this is a forward loop that will process PP [0..N] TP [0..M] in sequential order.
@@ -1384,6 +1411,15 @@ def main():
with open_dict(model.cfg):
model.cfg.tokenizer.model = args.tokenizer_model_path

else:
if tokenizer_model_path is None:
logging.warning("Could not extract tokenizer model file from checkpoint.")

else:
# Extract tokenizer info
with open_dict(model.cfg):
model.cfg.tokenizer.model = tokenizer_model_path

model.cfg, restore_dict = force_cpu_model(model.cfg)

model = cls(model.cfg, trainer)
@@ -1458,6 +1494,9 @@ def main():

logging.info("Successfully finished changing partitions!")

if temp_dir is not None:
shutil.rmtree(temp_dir, ignore_errors=True)


if __name__ == '__main__':
main()
@@ -23,6 +23,7 @@
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
from nemo.collections.nlp.parts.nlp_overrides import (
CustomProgressBar,
GradScaler,
MegatronHalfPrecisionPlugin,
NLPDDPStrategy,
@@ -156,7 +157,7 @@ def main(cfg) -> None:
if cfg.get('cluster_type', None) == 'BCP':
plugins.append(TorchElasticEnvironment())

trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer)
trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[CustomProgressBar()])

exp_manager(trainer, cfg.exp_manager)

@@ -182,9 +183,6 @@ def main(cfg) -> None:
model = load_from_checkpoint_dir(MegatronGPTModel, cfg, trainer, gpt_cfg, modify_confg_fn=_modify_config)
else:
print(' > WARNING: No checkpoint provided. Starting from scratch.')
# hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
with open_dict(cfg):
cfg.model.precision = cfg.trainer.precision
model = MegatronGPTModel(cfg.model, trainer)
trainer.fit(model)
