Merge remote-tracking branch 'upstream/main' into undecorate-typecheck
borisfom committed Jun 18, 2024
2 parents 1a28fe1 + 501f0df commit a88f1c2
Showing 30 changed files with 1,112 additions and 308 deletions.
43 changes: 31 additions & 12 deletions .github/workflows/cicd-main.yml
@@ -3060,13 +3060,13 @@ jobs:
AFTER_SCRIPT: |
rm -rf /home/TestData/nlp/megatron_ir/working_dir
L2_Megatron_GPT_PEFT_Lora_PP2:
L2_Megatron_GPT_PEFT_Lora_PP2_O2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
SCRIPT: |
rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2
rm -rf /home/TestData/nlp/lora_tuning_pp2
python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.devices=2 \
@@ -3075,11 +3075,12 @@ jobs:
trainer.max_steps=3 \
trainer.val_check_interval=3 \
++trainer.limit_val_batches=2 \
trainer.precision=16 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_peft_lora_results_pp2 \
trainer.precision=bf16 \
exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_pp2 \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
model.megatron_amp_O2=True \
model.peft.peft_scheme=lora \
model.answer_only_loss=True \
model.micro_batch_size=1 \
@@ -3090,10 +3091,28 @@ jobs:
model.data.validation_ds.num_workers=0 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.names=[quarel]
python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_pp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
trainer.devices=2 \
model.megatron_amp_O2=True \
model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \
model.data.test_ds.names=['quarel4'] \
model.global_batch_size=2 \
model.micro_batch_size=1 \
model.data.test_ds.tokens_to_generate=10 \
model.data.test_ds.write_predictions_to_file=True \
model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/lora_tuning_pp2/out' \
inference.greedy=True \
inference.repetition_penalty=1.0 \
inference.outfile_path='/home/TestData/nlp/lora_tuning_pp2/out.jsonl'
AFTER_SCRIPT: |
rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2
rm -rf /home/TestData/nlp/lora_tuning_pp2
L2_Megatron_GPT_PEFT_Lora_TP2:
L2_Megatron_GPT_PEFT_Lora_TP2_O1:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
@@ -3108,11 +3127,11 @@ jobs:
trainer.max_steps=3 \
trainer.val_check_interval=3 \
++trainer.limit_val_batches=2 \
trainer.precision=16 \
trainer.precision=bf16 \
exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2 \
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
model.peft.peft_scheme='lora' \
model.answer_only_loss=True \
model.micro_batch_size=1 \
@@ -3125,7 +3144,7 @@ jobs:
model.data.validation_ds.names=[quarel]
python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
model.peft.restore_from_path=/home/TestData/nlp/lora_tuning_tp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \
model.tensor_model_parallel_size=2 \
trainer.devices=2 \
@@ -4234,8 +4253,8 @@ jobs:
- L2_Megatron_GPT_Finetuning_PP2
- L2_Megatron_GPT_Finetuning_StarCoder_PP1
- L2_Megatron_GPT_Embedding
- L2_Megatron_GPT_PEFT_Lora_PP2
- L2_Megatron_GPT_PEFT_Lora_TP2
- L2_Megatron_GPT_PEFT_Lora_PP2_O2
- L2_Megatron_GPT_PEFT_Lora_TP2_O1
- L2_Megatron_GPT_Eval
- L2_Megatron_GPT_Eval_PP2
- L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -12,7 +12,7 @@ NVIDIA NeMo Framework is an end-to-end, cloud-native framework designed to build
- Flash Attention
- Activation Recomputation
- Positional Embeddings and Positional Interpolation
- Post-Training Quantization (PTQ) with Ammo
- Post-Training Quantization (PTQ) with ModelOpt
- Sequence Packing

`NVIDIA NeMo Framework <https://github.com/NVIDIA/NeMo>`_ has separate collections for:
40 changes: 20 additions & 20 deletions docs/source/multimodal/text2img/sdxl_quantization.rst
@@ -1,11 +1,11 @@
Stable Diffusion XL Int8 Quantization
=======================================

This example shows how to use Ammo to calibrate and quantize the UNet part of the SDXL. The UNet part typically consumes
This example shows how to use ModelOpt to calibrate and quantize the UNet part of the SDXL. The UNet part typically consumes
>95% of the e2e Stable Diffusion latency.

We also provide instructions on deploying and running E2E SDXL pipeline
with Ammo quantized int8 UNet to generate images and measure latency on target GPUs.
with ModelOpt quantized int8 UNet to generate images and measure latency on target GPUs.

To get started, you need a pretrained SDXL checkpoint in ``nemo`` format. Example training configs are provided in NeMo
under ``NeMo/examples/multimodal/text2img/stable_diffusion``.
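
As a quick orientation for the renames below, here is a minimal sketch of the ModelOpt calibrate-and-quantize flow this guide walks through, using the same ``modelopt.torch`` entry points that appear in the scripts changed later in this commit. The ``unet`` and ``pipeline`` handles and the ``calib_prompts`` list are placeholders rather than the real Hydra-configured objects::

    import modelopt.torch.opt as mto
    import modelopt.torch.quantization as mtq

    quant_config = mtq.INT8_SMOOTHQUANT_CFG  # int8 SmoothQuant recipe

    def forward_loop():
        # Run a few calibration prompts so ModelOpt can record activation
        # ranges before inserting quantizers.
        for prompt in calib_prompts:   # placeholder: list of text prompts
            pipeline(prompt)           # placeholder: SDXL inference call

    mtq.quantize(unet, quant_config, forward_loop)  # quantize the UNet in place
    mto.save(unet, "unet_int8.ckpt")                # checkpoint later exported to ONNX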
@@ -104,31 +104,31 @@ GPU: H100
TRT int8 vs Framework fp16
^^^^^^^^^^^^^^^^^^^^^^^^^^^

+---------------------+------------+-------------+----------------+------------+---------+------------+
| Pipeline | Batch Size | Latency (ms)| Pipeline | Batch Size | Latency | Speedup |
+=====================+============+=============+================+============+=========+============+
| Framework fp16 base | 1 | 3056.01 | Ammo TRT Int8 | 1 | 1406.68 | 2.172498365|
+---------------------+------------+-------------+----------------+------------+---------+------------+
| Framework fp16 base | 2 | 4832.24 | Ammo TRT Int8 | 2 | 2403.29 | 2.01067703 |
+---------------------+------------+-------------+----------------+------------+---------+------------+
| Framework fp16 base | 4 | 8433.71 | Ammo TRT Int8 | 4 | 4252.6 | 1.983189108|
+---------------------+------------+-------------+----------------+------------+---------+------------+
+---------------------+------------+-------------+--------------------+------------+---------+------------+
| Pipeline | Batch Size | Latency (ms)| Pipeline | Batch Size | Latency | Speedup |
+=====================+============+=============+====================+============+=========+============+
| Framework fp16 base | 1 | 3056.01 | ModelOpt TRT Int8 | 1 | 1406.68 | 2.172498365|
+---------------------+------------+-------------+--------------------+------------+---------+------------+
| Framework fp16 base | 2 | 4832.24 | ModelOpt TRT Int8 | 2 | 2403.29 | 2.01067703 |
+---------------------+------------+-------------+--------------------+------------+---------+------------+
| Framework fp16 base | 4 | 8433.71 | ModelOpt TRT Int8 | 4 | 4252.6 | 1.983189108|
+---------------------+------------+-------------+--------------------+------------+---------+------------+
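
Speedup is the framework fp16 latency divided by the ModelOpt TRT int8 latency at the same batch size; for batch size 1, 3056.01 ms / 1406.68 ms ≈ 2.17.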



TRT int8 vs TRT fp16
^^^^^^^^^^^^^^^^^^^^^^^


+-------------+------------+--------------+-----------+------------+------------+-------------+
| Pipeline | Batch Size | Latency (ms) | Precision | Batch Size | Latency | Speedup |
+=============+============+==============+===========+============+============+=============+
| fp16 base | 1 | 1723.97 | Ammo Int8 | 1 | 1406.68 | 1.225559473 |
+-------------+------------+--------------+-----------+------------+------------+-------------+
| fp16 base | 2 | 3004.47 | Ammo Int8 | 2 | 2403.29 | 1.250148754 |
+-------------+------------+--------------+-----------+------------+------------+-------------+
| fp16 base | 4 | 5657.19 | Ammo Int8 | 4 | 4252.6 | 1.330289705 |
+-------------+------------+--------------+-----------+------------+------------+-------------+
+-------------+------------+--------------+---------------+------------+------------+-------------+
| Pipeline | Batch Size | Latency (ms) | Precision | Batch Size | Latency | Speedup |
+=============+============+==============+===============+============+============+=============+
| fp16 base | 1 | 1723.97 | ModelOpt Int8 | 1 | 1406.68 | 1.225559473 |
+-------------+------------+--------------+---------------+------------+------------+-------------+
| fp16 base | 2 | 3004.47 | ModelOpt Int8 | 2 | 2403.29 | 1.250148754 |
+-------------+------------+--------------+---------------+------------+------------+-------------+
| fp16 base | 4 | 5657.19 | ModelOpt Int8 | 4 | 4252.6 | 1.330289705 |
+-------------+------------+--------------+---------------+------------+------------+-------------+


FP16 inference vs Int8 inference
20 changes: 11 additions & 9 deletions examples/multimodal/multimodal_llm/neva/neva_evaluation.py
@@ -24,13 +24,13 @@


try:
import ammo.torch.quantization as atq
import modelopt.torch.quantization as mtq

HAVE_AMMO = True
HAVE_MODELOPT = True

except (ImportError, ModuleNotFoundError):

HAVE_AMMO = False
HAVE_MODELOPT = False

if not torch.cuda.is_available():
raise EnvironmentError("GPU is needed for the inference")
@@ -41,7 +41,9 @@ def __init__(self, sentences):
super().__init__()
self.sentences = sentences

def __len__(self,):
def __len__(
self,
):
return len(self.sentences)

def __getitem__(self, idx):
@@ -99,14 +101,14 @@ def main(cfg) -> None:
)

# =================== Start Quantization ====================
if HAVE_AMMO and cfg.quantization.enable == True:
if HAVE_MODELOPT and cfg.quantization.enable == True:
print(f"Using quantization algorithm: {cfg.quantization.algorithm}")
if cfg.quantization.algorithm == "int8_sq":
atq_config = atq.INT8_SMOOTHQUANT_CFG
mtq_config = mtq.INT8_SMOOTHQUANT_CFG
elif cfg.quantization.algorithm == "fp8":
atq_config = atq.FP8_DEFAULT_CFG
mtq_config = mtq.FP8_DEFAULT_CFG
elif cfg.quantization.algorithm == "awq":
atq_config = atq.INT4_AWQ_CFG
mtq_config = mtq.INT4_AWQ_CFG
else:
raise ValueError(f"Unsupported quantization algorithm: {cfg.quantization.algorithm}")

@@ -118,7 +120,7 @@ def forward_loop():
inference_config=cfg,
)

atq.quantize(model, atq_config, forward_loop)
mtq.quantize(model, mtq_config, forward_loop)

responses = model.generate(
input_prompts=final_prompts,
@@ -15,10 +15,10 @@
import os
from pathlib import Path

import ammo.torch.opt as ato
import ammo.torch.quantization as atq
import modelopt.torch.opt as mto
import modelopt.torch.quantization as mtq
import torch
from ammo.torch.quantization.nn import QuantModuleRegistry
from modelopt.torch.quantization.nn import QuantModuleRegistry
from torch.onnx import export as onnx_export

from nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine import MegatronDiffusionEngine
@@ -92,7 +92,7 @@ def model_cfg_modifier(model_cfg):
QuantModuleRegistry.register({LinearWrapper: "nemo_linear_wrapper"})(_QuantNeMoLinearWrapper)

if cfg.run_quantization:
# Start quantization with ammo
# Start quantization with ModelOpt

cali_prompts = load_calib_prompts(
cfg.quantize.batch_size,
@@ -124,15 +124,15 @@ def forward_loop():
num_samples=cfg.infer.num_samples,
)

atq.quantize(base.model.model.diffusion_model, quant_config, forward_loop)
ato.save(base.model.model.diffusion_model, cfg.quantize.quantized_ckpt)
mtq.quantize(base.model.model.diffusion_model, quant_config, forward_loop)
mto.save(base.model.model.diffusion_model, cfg.quantize.quantized_ckpt)

if cfg.run_onnx_export:
os.makedirs(cfg.onnx_export.onnx_dir, exist_ok=True)
output = Path(f"{cfg.onnx_export.onnx_dir}/unet.onnx")
# Export quantized model to ONNX
if not cfg.run_quantization:
ato.restore(base.model.model.diffusion_model, cfg.onnx_export.quantized_ckpt)
mto.restore(base.model.model.diffusion_model, cfg.onnx_export.quantized_ckpt)
quantize_lvl(base.model.model.diffusion_model, cfg.quantize.quant_level)

# QDQ needs to be in FP32
18 changes: 10 additions & 8 deletions nemo/collections/llm/api.py
@@ -1,20 +1,20 @@
from pathlib import Path
from typing import Callable, Optional, Union
from typing import Callable, Optional

import pytorch_lightning as pl
from typing_extensions import Annotated

from nemo.collections.llm.utils import task
from nemo.collections.llm.utils import Config, task
from nemo.lightning import AutoResume, MegatronStrategy, NeMoLogger, OptimizerModule, Trainer, io, teardown
from nemo.lightning.resume import Resume


@task(namespace="llm")
def train(
model: pl.LightningModule,
data: pl.LightningDataModule,
trainer: Trainer,
log: NeMoLogger = NeMoLogger(),
resume: Optional[Union[AutoResume, Resume]] = AutoResume(),
log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None,
resume: Annotated[Optional[AutoResume], Config[AutoResume]] = None,
opt: Optional[OptimizerModule] = None,
tokenizer: Optional[str] = None,
# TODO: Fix export export: Optional[str] = None,
@@ -52,10 +52,12 @@ def train(
if not isinstance(trainer.strategy, MegatronStrategy):
raise ValueError("Only MegatronStrategy is supported")

_log = log or NeMoLogger()

if tokenizer: # TODO: Improve this
_use_tokenizer(model, data, tokenizer)

app_state = log.setup(
app_state = _log.setup(
trainer,
resume_if_exists=getattr(resume, "resume_if_exists", False),
)
@@ -64,14 +66,14 @@
if opt:
opt.connect(model)

trainer.fit(model, data, **fit_kwargs)
trainer.fit(model, data)

if hasattr(train, "__io__"):
_save_config_img(app_state.exp_dir, train.__io__)

trainer.fit(model, data)

log.teardown()
_log.teardown()

return app_state.exp_dir

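To show how the reworked signature is used, a minimal sketch of calling the ``train`` task after this change. Only the signature and return value come from the diff above; the ``Trainer``/``MegatronStrategy`` arguments are assumptions, and ``my_model``/``my_datamodule`` are placeholders for a LightningModule/LightningDataModule pair from the llm collection:

    from nemo.collections.llm.api import train
    from nemo.lightning import MegatronStrategy, Trainer

    # MegatronStrategy is required; train() raises ValueError otherwise.
    trainer = Trainer(devices=2, max_steps=100, strategy=MegatronStrategy())  # assumed kwargs

    # log and resume now default to None; train() builds a NeMoLogger itself
    # via the `_log = log or NeMoLogger()` fallback added above.
    exp_dir = train(model=my_model, data=my_datamodule, trainer=trainer)
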
19 changes: 15 additions & 4 deletions nemo/collections/llm/gpt/model/mistral_7b.py
@@ -2,16 +2,21 @@
from pathlib import Path
from typing import TYPE_CHECKING, Callable, List, Optional

import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from typing_extensions import Annotated

from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel
from nemo.collections.llm.utils import Config
from nemo.lightning import io, teardown
from nemo.lightning.pytorch.opt import OptimizerModule

if TYPE_CHECKING:
from transformers import MistralConfig, MistralForCausalLM

from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec


@dataclass
@@ -36,10 +41,13 @@ class Mistral7BConfig(GPTConfig):


class Mistral7BModel(GPTModel):
def __init__(self, config: Optional[Mistral7BConfig] = None, tokenizer=None):
_tokenizer = tokenizer or HFMistral7BImporter("mistralai/Mistral-7B-v0.1").tokenizer

super().__init__(config or Mistral7BConfig(), _tokenizer)
def __init__(
self,
config: Annotated[Optional[Mistral7BConfig], Config[Mistral7BConfig]] = None,
optim: Optional[OptimizerModule] = None,
tokenizer: Optional["TokenizerSpec"] = None,
):
super().__init__(config or Mistral7BConfig(), optim=optim, tokenizer=tokenizer)


@io.model_importer(Mistral7BModel, "hf")
@@ -63,6 +71,9 @@ def apply(self, output_path: Path) -> Path:

return output_path

def on_import_ckpt(self, model: pl.LightningModule):
model.tokenizer = self.tokenizer

def convert_state(self, source, target):
mapping = {
"model.embed_tokens.weight": "embedding.word_embeddings.weight",
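A short sketch of constructing the model under the new signature; ``my_tokenizer`` is a placeholder ``TokenizerSpec``. Per the diff above, the config falls back to ``Mistral7BConfig()`` and no Hugging Face tokenizer is downloaded implicitly anymore; on checkpoint import the tokenizer is attached by ``on_import_ckpt`` instead:

    from nemo.collections.llm.gpt.model.mistral_7b import Mistral7BModel

    # config and optim are optional; passing tokenizer explicitly replaces the
    # old implicit HFMistral7BImporter("mistralai/Mistral-7B-v0.1").tokenizer lookup.
    model = Mistral7BModel(tokenizer=my_tokenizer)
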
16 changes: 15 additions & 1 deletion nemo/collections/llm/utils.py
@@ -1,7 +1,21 @@
from typing import Any, Callable, TypeVar
from typing import Any, Callable, Generic, TypeVar

T = TypeVar('T', bound=Callable[..., Any])

try:
import nemo_sdk as sdk

Config = sdk.Config
Partial = sdk.Partial
except ImportError:
_T = TypeVar('_T')

class Config(Generic[_T]):
pass

class Partial(Generic[_T]):
pass


def task(*args: Any, **kwargs: Any) -> Callable[[T], T]:
try:
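
These stubs keep annotations importable when ``nemo_sdk`` is absent, matching the pattern now used in ``nemo/collections/llm/api.py``. A minimal sketch (``my_entry_point`` is a hypothetical function, not part of this commit):

    from typing import Optional

    from typing_extensions import Annotated

    from nemo.collections.llm.utils import Config
    from nemo.lightning import NeMoLogger

    # With nemo_sdk installed, Config is sdk.Config and the annotation carries real
    # configuration metadata; otherwise the Generic stub makes it a harmless no-op.
    def my_entry_point(log: Annotated[Optional[NeMoLogger], Config[NeMoLogger]] = None) -> None:
        ...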