
Commit

Merge remote-tracking branch 'upstream/main' into undecorate-typecheck
borisfom committed Jun 26, 2024
2 parents 21a5882 + 3371ad5 commit 7a8209a
Showing 64 changed files with 3,821 additions and 336 deletions.
50 changes: 50 additions & 0 deletions .github/workflows/cicd-main.yml
@@ -4185,6 +4185,55 @@ jobs:
AFTER_SCRIPT: |
rm -f examples/asr/evaluation_transcripts.json
L2_Stable_Diffusion_Training:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |
rm -rf examples/multimodal/text_to_image/sd_train_results
python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \
trainer.devices=1 \
trainer.max_steps=3 \
+trainer.val_check_interval=10 \
trainer.limit_val_batches=2 \
trainer.gradient_clip_val=0 \
exp_manager.exp_dir=examples/multimodal/text_to_image/sd_train_results \
exp_manager.create_checkpoint_callback=False \
exp_manager.resume_if_exists=False \
model.resume_from_checkpoint=null \
model.precision=16 \
model.micro_batch_size=1 \
model.global_batch_size=1 \
model.first_stage_key=moments \
model.cond_stage_key=encoded \
+model.load_vae=False \
+model.load_unet=False \
+model.load_encoder=False \
model.parameterization=v \
model.load_only_unet=False \
model.text_embedding_dropout_rate=0.0 \
model.inductor=True \
model.inductor_cudagraphs=False \
model.capture_cudagraph_iters=15 \
+model.unet_config.num_head_channels=64 \
+model.unet_config.use_linear_in_transformer=True \
model.unet_config.context_dim=1024 \
model.unet_config.use_flash_attention=null \
model.unet_config.resblock_gn_groups=16 \
model.unet_config.unet_precision=fp16 \
+model.unet_config.timesteps=1000 \
model.optim.name=megatron_fused_adam \
+model.optim.capturable=True \
+model.optim.master_weights=True \
model.optim.weight_decay=0.01 \
model.first_stage_config.from_pretrained=null \
model.data.num_workers=16 \
model.data.synthetic_data=True
AFTER_SCRIPT: |
rm -rf examples/multimodal/text_to_image/sd_train_results
Nemo_CICD_Test:
needs:
#- OPTIONAL_L0_Unit_Tests_GPU
@@ -4279,6 +4328,7 @@ jobs:
- L2_TTS_Fast_dev_runs_1_Mixer-TTS
- L2_TTS_Fast_dev_runs_1_Hifigan
- Speech_Checkpoints_tests
- L2_Stable_Diffusion_Training
if: always()
runs-on: ubuntu-latest
steps:
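As an aside on the L2_Stable_Diffusion_Training job above: overrides prefixed with "+" (for example +model.load_vae=False) add keys that do not exist in the base YAML, while unprefixed overrides must match keys that are already present. A minimal OmegaConf sketch of that distinction, using made-up keys rather than the real Stable Diffusion config:

from omegaconf import OmegaConf, open_dict

# Struct mode (which Hydra enables on composed configs) rejects unknown keys on assignment.
cfg = OmegaConf.create({"model": {"micro_batch_size": 4}})
OmegaConf.set_struct(cfg, True)

cfg.model.micro_batch_size = 1      # existing key: a plain "key=value" override is enough
with open_dict(cfg):                # new key: has to be added explicitly,
    cfg.model.load_vae = False      # which "+model.load_vae=False" signals on the command line

print(OmegaConf.to_yaml(cfg))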
3 changes: 1 addition & 2 deletions Dockerfile
@@ -66,8 +66,7 @@ WORKDIR /workspace/
# We leave it here in case we need to work off of a specific commit in main
RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git checkout 36e9b6bf3d8034b10c9bbd9fc357c2df2bd1515c && \
git cherry-pick -n e69187bc3679ea5841030a165d587bb48b56ee77 && \
git checkout 02871b4df8c69fac687ab6676c4246e936ce92d0 && \
pip install .

# Performance optimizations for distributed optimizer: https://github.com/NVIDIA/apex/pull/1771
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -34,7 +34,7 @@ WORKDIR /workspace
# Install NeMo requirements
ARG TE_TAG=bfe21c3d68b0a9951e5716fb520045db53419c5e
ARG MODELOPT_VERSION=0.11.0
ARG MCORE_TAG=c90aa1671fc0b97f80fa6c3bb892ce6f8e88e7c9
ARG MCORE_TAG=02871b4df8c69fac687ab6676c4246e936ce92d0
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
--mount=type=bind,source=requirements,target=requirements \
16 changes: 15 additions & 1 deletion README.rst
@@ -45,6 +45,20 @@ Latest News

<details open>
<summary><b>Large Language Models and Multimodal</b></summary>
<details>
<summary>
<a href="https://huggingface.co/models?sort=trending&search=nvidia%2Fnemotron-4-340B">
NVIDIA releases 340B base, instruct, and reward models pretrained on a total of 9T tokens.
</a> (2024-06-18)
</summary>
See documentation and tutorials for SFT, PEFT, and PTQ with
<a href="https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/nemotron/index.html">
Nemotron 340B
</a>
in the NeMo Framework User Guide.
<br><br>
</details>

<details>
<summary>
<a href="https://developer.nvidia.com/blog/nvidia-sets-new-generative-ai-performance-and-scale-records-in-mlperf-training-v4-0/">
@@ -417,7 +431,7 @@ The most recent working versions of these dependencies are here:
export apex_commit=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
export te_commit=bfe21c3d68b0a9951e5716fb520045db53419c5e
export mcore_commit=fbb375d4b5e88ce52f5f7125053068caff47f93f
export mcore_commit=02871b4df8c69fac687ab6676c4246e936ce92d0
export nv_pytorch_tag=24.02-py3
When using a released version of NeMo, please refer to the `Software Component Versions <https://docs.nvidia.com/nemo-framework/user-guide/latest/softwarecomponentversions.html>`_ for the correct versions.
2 changes: 1 addition & 1 deletion docs/source/nlp/quantization.rst
@@ -103,7 +103,7 @@ The TensorRT-LLM engine can be conveniently built and run using ``TensorRTLLM``

.. code-block:: python
from nemo.export import TensorRTLLM
from nemo.export.tensorrt_llm import TensorRTLLM
trt_llm_exporter = TensorRTLLM(model_dir="/path/to/trt_llm_engine_folder")
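For context, a minimal sketch of how the corrected import path is typically exercised to build and run an engine; the export() and forward() calls and their arguments follow the surrounding NeMo export workflow and should be read as an illustration rather than part of this change:

from nemo.export.tensorrt_llm import TensorRTLLM

# Build (or load) a TensorRT-LLM engine under model_dir, then run a quick smoke test.
trt_llm_exporter = TensorRTLLM(model_dir="/path/to/trt_llm_engine_folder")
trt_llm_exporter.export(
    nemo_checkpoint_path="/path/to/quantized_model.qnemo",  # hypothetical checkpoint path
    model_type="llama",                                     # assumed model family
)
print(trt_llm_exporter.forward(["Tell me a short fact about GPUs."]))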
@@ -86,7 +86,7 @@ model:

# LLM configs
# use GPTModel from megatron.core
mcore_gpt: False
mcore_gpt: True

# model architecture
encoder_seq_length: 4096
@@ -149,7 +149,7 @@ model:
bias_activation_fusion: False
megatron_legacy: False

transformer_engine: False
transformer_engine: True
fp8: False # enables fp8 in TransformerLayer forward
fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
12 changes: 9 additions & 3 deletions examples/nlp/language_modeling/megatron_gpt_continue_training.py
@@ -115,7 +115,11 @@ def load_from_checkpoint_dir(cls, cfg, trainer, modify_confg_fn):
gpt_cfg = modify_confg_fn(hparams_file.cfg, cfg, add_cfg_to_tree=True)
with tempfile.NamedTemporaryFile(suffix='.yaml') as f:
OmegaConf.save(config=gpt_cfg, f=f.name)
model = cls.load_from_checkpoint(checkpoint_path=checkpoint_path, trainer=trainer, hparams_file=f.name,)
model = cls.load_from_checkpoint(
checkpoint_path=checkpoint_path,
trainer=trainer,
hparams_file=f.name,
)
return model


@@ -141,11 +145,12 @@ def main(cfg) -> None:
gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
find_unused_parameters=False,
)
precision = cfg.trainer.precision
if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']:
scaler = None
if cfg.trainer.precision in [16, '16', '16-mixed']:
scaler = GradScaler(
init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
init_scale=cfg.model.get('native_amp_init_scale', 2**32),
growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
hysteresis=cfg.model.get('hysteresis', 2),
)
@@ -156,7 +161,7 @@ def main(cfg) -> None:
plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler))
else:
plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler))

cfg.trainer.precision = None
if cfg.get('cluster_type', None) == 'BCP':
plugins.append(TorchElasticEnvironment())

@@ -165,6 +170,7 @@ def main(cfg) -> None:
if 'enable_progress_bar' not in cfg.trainer or cfg.trainer.enable_progress_bar:
callbacks.append(CustomProgressBar())
trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=callbacks)
cfg.trainer.precision = precision

exp_manager(trainer, cfg.exp_manager)

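The precision handling added above follows a save/clear/restore pattern: remember the configured precision, blank it out in cfg.trainer so the Lightning Trainer does not set up mixed precision on top of the Megatron precision plugins, and put it back afterwards for exp_manager and config dumps. A stripped-down, self-contained sketch of the same idea (the trainer factory below is a placeholder, not pytorch_lightning.Trainer):

from omegaconf import OmegaConf

cfg = OmegaConf.create({"trainer": {"devices": 1, "max_steps": 1, "precision": 16}})

def build_trainer(**trainer_kwargs):
    # Placeholder standing in for Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, ...)
    return dict(trainer_kwargs)

precision = cfg.trainer.precision        # 1) remember the requested precision
cfg.trainer.precision = None             # 2) precision is now owned by the (Megatron) plugin, not the Trainer
trainer = build_trainer(**cfg.trainer)
cfg.trainer.precision = precision        # 3) restore it so later consumers still see the original value

assert trainer["precision"] is None and cfg.trainer.precision == 16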
38 changes: 38 additions & 0 deletions nemo/collections/llm/__init__.py
@@ -13,11 +13,30 @@
SquadDataModule,
)
from nemo.collections.llm.gpt.model import (
CodeGemmaConfig2B,
CodeGemmaConfig7B,
CodeLlamaConfig7B,
CodeLlamaConfig13B,
CodeLlamaConfig34B,
CodeLlamaConfig70B,
GemmaConfig,
GemmaConfig2B,
GemmaConfig7B,
GemmaModel,
GPTConfig,
GPTModel,
Llama2Config7B,
Llama2Config13B,
Llama2Config70B,
Llama3Config8B,
Llama3Config70B,
LlamaConfig,
LlamaModel,
MaskedTokenLossReduction,
Mistral7BConfig,
Mistral7BModel,
MixtralConfig,
MixtralModel,
gpt_data_step,
gpt_forward_step,
)
@@ -31,6 +50,25 @@
"MaskedTokenLossReduction",
"Mistral7BConfig",
"Mistral7BModel",
"MixtralConfig",
"MixtralModel",
"LlamaConfig",
"Llama2Config7B",
"Llama2Config13B",
"Llama2Config70B",
"Llama3Config8B",
"Llama3Config70B",
"CodeLlamaConfig7B",
"CodeLlamaConfig13B",
"CodeLlamaConfig34B",
"CodeLlamaConfig70B",
"LlamaModel",
"GemmaConfig",
"GemmaConfig2B",
"GemmaConfig7B",
"CodeGemmaConfig2B",
"CodeGemmaConfig7B",
"GemmaModel",
"PreTrainingDataModule",
"FineTuningDataModule",
"SquadDataModule",
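A quick, hypothetical smoke test of the widened public API above (class names taken from this diff; it only checks that the new re-exports resolve from the package root):

from nemo.collections.llm import (
    GemmaConfig7B,
    Llama2Config7B,
    Llama3Config8B,
    MixtralConfig,
)

# Nothing model-sized is instantiated here; this just confirms the imports resolve.
for cls in (GemmaConfig7B, Llama2Config7B, Llama3Config8B, MixtralConfig):
    print(cls.__name__)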
23 changes: 7 additions & 16 deletions nemo/collections/llm/api.py
@@ -15,7 +15,7 @@ def train(
trainer: Trainer,
log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None,
resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None,
opt: Optional[OptimizerModule] = None,
optim: Optional[OptimizerModule] = None,
tokenizer: Optional[str] = None,
# TODO: Fix export export: Optional[str] = None,
) -> Path:
@@ -28,7 +28,7 @@ def train(
trainer (Trainer): The trainer instance configured with a MegatronStrategy.
log (NeMoLogger): A nemologger instance.
resume (Optional[Union[AutoResume, Resume]]): Resume training from a checkpoint.
opt (Optional[OptimizerModule]): The optimizer module to be used. If not provided, the default optimizer
optim (Optional[OptimizerModule]): The optimizer module to be used. If not provided, the default optimizer
from the model will be used.
tokenizer (Optional[str]): Tokenizer setting to be applied. Can be 'data' or 'model'.
export (Optional[str]): Filename to save the exported checkpoint after training.
@@ -49,27 +49,18 @@ def train(
>>> train(model, data, trainer, tokenizer='data', source='path/to/ckpt.ckpt', export='final.ckpt')
PosixPath('/path/to/log_dir')
"""
if not isinstance(trainer.strategy, MegatronStrategy):
raise ValueError("Only MegatronStrategy is supported")

_log = log or NeMoLogger()

if tokenizer: # TODO: Improve this
_use_tokenizer(model, data, tokenizer)

app_state = _log.setup(
trainer,
resume_if_exists=getattr(resume, "resume_if_exists", False),
task_config=getattr(train, "__io__", None),
)
if resume is not None:
resume.setup(model, trainer)
if opt:
opt.connect(model)

trainer.fit(model, data)

if hasattr(train, "__io__"):
_save_config_img(app_state.exp_dir, train.__io__)
if optim:
optim.connect(model)
if tokenizer: # TODO: Improve this
_use_tokenizer(model, data, tokenizer)

trainer.fit(model, data)

11 changes: 11 additions & 0 deletions nemo/collections/llm/fn/activation.py
@@ -0,0 +1,11 @@
import torch


@torch.jit.script
def gelu_impl(x):
"""OpenAI's gelu implementation."""
return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x)))


def openai_gelu(x):
return gelu_impl(x)
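
A small sanity check for the new activation; the comparison assumes a PyTorch version that provides the tanh-approximate GELU (1.12 or newer), and the tolerance is illustrative:

import torch
from nemo.collections.llm.fn.activation import openai_gelu

x = torch.randn(4, 8)
# 0.7978845608028654 is sqrt(2/pi), so this should match torch's tanh approximation of GELU.
assert torch.allclose(openai_gelu(x), torch.nn.functional.gelu(x, approximate="tanh"), atol=1e-5)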
3 changes: 2 additions & 1 deletion nemo/collections/llm/gpt/data/pre_training.py
@@ -3,6 +3,7 @@

import pytorch_lightning as pl
from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
from torch.utils import data
from torch.utils.data import DataLoader

from nemo.lightning.pytorch.plugins import MegatronDataSampler
@@ -121,7 +122,7 @@ def _create_dataloader(self, dataset, **kwargs) -> DataLoader:
num_workers=self.num_workers,
pin_memory=self.pin_memory,
persistent_workers=self.persistent_workers,
collate_fn=dataset.collate_fn,
collate_fn=getattr(dataset, 'collate_fn', data.dataloader.default_collate),
**kwargs,
)

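The getattr fallback above lets datasets that do not define their own collate_fn fall back to PyTorch's default collation. A minimal standalone sketch of the same pattern with a toy dataset (not the NeMo data module):

import torch
from torch.utils import data
from torch.utils.data import DataLoader, Dataset

class ToyDataset(Dataset):
    # Deliberately defines no collate_fn, so the fallback is taken.
    def __len__(self):
        return 8

    def __getitem__(self, idx):
        return torch.tensor([idx, idx + 1])

dataset = ToyDataset()
loader = DataLoader(
    dataset,
    batch_size=4,
    collate_fn=getattr(dataset, "collate_fn", data.dataloader.default_collate),
)
print(next(iter(loader)).shape)  # torch.Size([4, 2])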
41 changes: 41 additions & 0 deletions nemo/collections/llm/gpt/model/__init__.py
@@ -5,13 +5,54 @@
gpt_data_step,
gpt_forward_step,
)
from nemo.collections.llm.gpt.model.gemma import (
CodeGemmaConfig2B,
CodeGemmaConfig7B,
GemmaConfig,
GemmaConfig2B,
GemmaConfig7B,
GemmaModel,
)
from nemo.collections.llm.gpt.model.llama import (
CodeLlamaConfig7B,
CodeLlamaConfig13B,
CodeLlamaConfig34B,
CodeLlamaConfig70B,
Llama2Config7B,
Llama2Config13B,
Llama2Config70B,
Llama3Config8B,
Llama3Config70B,
LlamaConfig,
LlamaModel,
)
from nemo.collections.llm.gpt.model.mistral_7b import Mistral7BConfig, Mistral7BModel
from nemo.collections.llm.gpt.model.mixtral import MixtralConfig, MixtralModel

__all__ = [
"GPTConfig",
"GPTModel",
"Mistral7BConfig",
"Mistral7BModel",
"MixtralConfig",
"MixtralModel",
"LlamaConfig",
"Llama2Config7B",
"Llama2Config13B",
"Llama2Config70B",
"Llama3Config8B",
"Llama3Config70B",
"CodeLlamaConfig7B",
"CodeLlamaConfig13B",
"CodeLlamaConfig34B",
"CodeLlamaConfig70B",
"GemmaConfig",
"GemmaConfig2B",
"GemmaConfig7B",
"CodeGemmaConfig2B",
"CodeGemmaConfig7B",
"GemmaModel",
"LlamaModel",
"MaskedTokenLossReduction",
"gpt_data_step",
"gpt_forward_step",