
Merging main and resolving conflicts
Signed-off-by: taejinp <tango4j@gmail.com>
tango4j committed Nov 29, 2024
2 parents 2c6eed7 + 2a6d144 commit f469e72
Showing 23 changed files with 1,188 additions and 106 deletions.
9 changes: 8 additions & 1 deletion .github/workflows/release.yml
@@ -20,10 +20,15 @@ on:
description: Ref (SHA or branch name) to release
required: true
type: string
dry-run:
description: Do not publish a wheel and GitHub release.
required: true
default: true
type: boolean

jobs:
release:
uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.12.3
uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.15.0
with:
release-ref: ${{ inputs.release-ref }}
image-name: nemo_container
@@ -35,8 +40,10 @@ jobs:
python-package: nemo
container-workdir: /workspace
library-name: Neural Modules
dry-run: ${{ inputs.dry-run }}
secrets:
TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
PAT: ${{ secrets.PAT }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
1 change: 0 additions & 1 deletion nemo/collections/asr/data/audio_to_text_dataset.py
@@ -871,7 +871,6 @@ def write_on_batch_end(
item["audio_filepath"] = sample.recording.sources[0].source
else:
item["audio_filepath"] = sample.id
item["audio_filepath"] = sample.recording.sources[0].source
item["offset"] = sample.start
item["duration"] = sample.duration
item["text"] = sample.supervisions[0].text or ''
6 changes: 4 additions & 2 deletions nemo/collections/llm/api.py
@@ -265,12 +265,11 @@ def validate(
@run.cli.entrypoint(name="ptq", namespace="llm")
def ptq(
nemo_checkpoint: str,
export_config: ExportConfig,
calib_tp: int = 1,
calib_pp: int = 1,
quantization_config: Annotated[Optional[QuantizationConfig], run.Config[QuantizationConfig]] = None,
export_config: Optional[Union[ExportConfig, run.Config[ExportConfig]]] = None,
) -> Path:
# TODO: Fix "nemo_run.cli.cli_parser.CLIException: An unexpected error occurred (Argument: , Context: {})"
"""
Applies Post-Training Quantization (PTQ) to a model using the specified quantization and export configs. It runs
calibration on a small dataset to collect scaling factors for the low-precision GEMMs used by the desired quantization method.
@@ -297,6 +296,9 @@ def ptq(
Returns:
Path: The path where the quantized checkpoint has been saved after calibration.
"""
if not quantization_config:
quantization_config = QuantizationConfig()

if export_config.path is None:
raise ValueError("The export_config.path needs to be specified, got None.")

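With this change quantization_config is optional and falls back to QuantizationConfig(), while export_config must still carry a valid path. A minimal call sketch against the new signature; the checkpoint paths and the import location of ExportConfig are assumptions, not taken from this diff:

# illustrative sketch, not part of this commit
from nemo.collections.llm.api import ptq
from nemo.collections.llm.quantization import ExportConfig  # assumed import path

quantized_path = ptq(
    nemo_checkpoint="/checkpoints/llama3-8b",                        # hypothetical path
    export_config=ExportConfig(path="/checkpoints/llama3-8b-fp8"),   # .path must be set
    calib_tp=1,
    calib_pp=1,
    # quantization_config omitted: the function now defaults it to QuantizationConfig()
)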
@@ -41,6 +41,7 @@ def __init__(
model_transform=None,
model_accelerator=None,
trust_remote_code=False,
default_dtype=torch.bfloat16,
):
super().__init__()
self.save_hyperparameters()
@@ -53,6 +54,7 @@ def __init__(
self.model_transform = model_transform
self.model_accelerator = model_accelerator
self.trust_remote_code = trust_remote_code
self.default_dtype = default_dtype

@property
def tokenizer(self):
@@ -79,7 +81,10 @@ def configure_model(self):
from transformers import AutoConfig

config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=self.trust_remote_code)
self.model = AutoModelForCausalLM.from_config(config, trust_remote_code=self.trust_remote_code)
dtype = getattr(config, 'torch_dtype', self.default_dtype)
self.model = AutoModelForCausalLM.from_config(
config, torch_dtype=dtype, trust_remote_code=self.trust_remote_code
)

if self.model_accelerator is not None:
self.model_accelerator(self.model)
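The configure_model change above reads torch_dtype from the checkpoint's AutoConfig and only uses the new default_dtype argument as a fallback. A minimal sketch of that pattern; the model name is hypothetical:

# illustrative sketch, not part of this commit
import torch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("gpt2")              # hypothetical checkpoint
dtype = getattr(config, "torch_dtype", torch.bfloat16)   # fallback used only if the attribute is missing
model = AutoModelForCausalLM.from_config(config, torch_dtype=dtype)

Note that getattr only falls back when the attribute is absent; depending on the transformers version, PretrainedConfig may define torch_dtype as None for checkpoints that do not specify it, in which case from_config receives None and keeps its own default.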
18 changes: 11 additions & 7 deletions nemo/collections/llm/recipes/t5_11b.py
@@ -175,15 +175,17 @@ def pretrain_recipe(
guide in the `examples/llm/pretrain/` directory.
"""

opt_config = OptimizerConfig(
opt_config = run.Config(
OptimizerConfig,
optimizer='adam',
lr=0.0001,
use_distributed_optimizer=True,
bf16=True,
weight_decay=0.01,
)

lr_scheduler = WarmupAnnealingScheduler(
lr_scheduler = run.Config(
WarmupAnnealingScheduler,
warmup_steps=None,
warmup_ratio=0.01,
max_steps=1000000,
Expand All @@ -202,7 +204,7 @@ def pretrain_recipe(
MockDataModule, seq_length=512, seq_length_dec=128, global_batch_size=1920, micro_batch_size=24
),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
optim=MegatronOptimizerModule(config=opt_config, lr_scheduler=lr_scheduler),
optim=run.Config(MegatronOptimizerModule, config=opt_config, lr_scheduler=lr_scheduler),
resume=default_resume(),
)

@@ -248,15 +250,17 @@ def finetune_recipe(
on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
`examples/llm/finetune/` directory.
"""
opt_config = OptimizerConfig(
opt_config = run.Config(
OptimizerConfig,
optimizer='adam',
lr=1e-4,
lr=0.0001,
use_distributed_optimizer=True,
bf16=True,
weight_decay=0.01,
)

lr_scheduler = WarmupAnnealingScheduler(
lr_scheduler = run.Config(
WarmupAnnealingScheduler,
warmup_steps=50,
max_steps=2000,
min_lr=0.00001,
Expand All @@ -273,7 +277,7 @@ def finetune_recipe(
SquadDataModule, seq_length=512, seq_length_dec=128, global_batch_size=128, micro_batch_size=1
),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
optim=MegatronOptimizerModule(config=opt_config, lr_scheduler=lr_scheduler),
optim=run.Config(MegatronOptimizerModule, config=opt_config, lr_scheduler=lr_scheduler),
resume=nemo_resume(checkpoint_path),
)

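The recipe hunks in this file (and the identical ones in t5_3b.py below) replace eager object construction with nemo_run config wrappers, so a recipe stays a serializable, CLI-overridable description instead of a set of live objects. A minimal sketch of the difference, assuming nemo_run is importable as run:

# illustrative sketch, not part of this commit
import nemo_run as run
from megatron.core.optimizer import OptimizerConfig

# eager: the OptimizerConfig instance is built immediately, its fields fixed at definition time
eager = OptimizerConfig(optimizer='adam', lr=1e-4, use_distributed_optimizer=True, bf16=True)

# lazy: run.Config only records the target class and kwargs; the instance is built when the
# recipe is executed, and the kwargs can still be overridden before that
lazy = run.Config(OptimizerConfig, optimizer='adam', lr=1e-4, use_distributed_optimizer=True, bf16=True)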
18 changes: 11 additions & 7 deletions nemo/collections/llm/recipes/t5_3b.py
@@ -175,15 +175,17 @@ def pretrain_recipe(
guide in the `examples/llm/pretrain/` directory.
"""

opt_config = OptimizerConfig(
opt_config = run.Config(
OptimizerConfig,
optimizer='adam',
lr=0.0001,
use_distributed_optimizer=True,
bf16=True,
weight_decay=0.01,
)

lr_scheduler = WarmupAnnealingScheduler(
lr_scheduler = run.Config(
WarmupAnnealingScheduler,
warmup_steps=None,
warmup_ratio=0.01,
max_steps=1000000,
Expand All @@ -202,7 +204,7 @@ def pretrain_recipe(
MockDataModule, seq_length=512, seq_length_dec=128, global_batch_size=1920, micro_batch_size=24
),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
optim=MegatronOptimizerModule(config=opt_config, lr_scheduler=lr_scheduler),
optim=run.Config(MegatronOptimizerModule, config=opt_config, lr_scheduler=lr_scheduler),
resume=default_resume(),
)

@@ -248,15 +250,17 @@ def finetune_recipe(
on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
`examples/llm/finetune/` directory.
"""
opt_config = OptimizerConfig(
opt_config = run.Config(
OptimizerConfig,
optimizer='adam',
lr=1e-4,
lr=0.0001,
use_distributed_optimizer=True,
bf16=True,
weight_decay=0.01,
)

lr_scheduler = WarmupAnnealingScheduler(
lr_scheduler = run.Config(
WarmupAnnealingScheduler,
warmup_steps=50,
max_steps=2000,
min_lr=0.00001,
Expand All @@ -273,7 +277,7 @@ def finetune_recipe(
SquadDataModule, seq_length=512, seq_length_dec=128, global_batch_size=128, micro_batch_size=1
),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
optim=MegatronOptimizerModule(config=opt_config, lr_scheduler=lr_scheduler),
optim=run.Config(MegatronOptimizerModule, config=opt_config, lr_scheduler=lr_scheduler),
resume=nemo_resume(checkpoint_path),
)

10 changes: 6 additions & 4 deletions nemo/collections/multimodal/data/energon/task_encoder.py
@@ -48,7 +48,8 @@ class MultiModalTaskEncoder(
and similarity interleaved samples.
This class extends the DefaultTaskEncoder and provides a flexible mechanism to handle and encode
different types of multimodal data. Support for VQA, captioning and interleaved samples is provided by default. It supports registering custom encoders for each sample type
different types of multimodal data. Support for VQA, captioning and interleaved samples is provided by default.
It supports registering custom encoders for each sample type
and provides methods for encoding individual samples, batching them, and further processing the batch
for model input.
"""
@@ -59,8 +60,8 @@ def __init__(self, tokenizer, image_processor, multimodal_sample_config):
Parameters:
tokenizer (Tokenizer): The tokenizer used for processing text across different sample types.
image_processor (ImageProcessor): The image processor used for preprocessing images across different sample types.
multimodal_sample_config (MultiModalSampleConfig): Configuration object for multimodal samples, including tokens and placeholders.
image_processor (ImageProcessor): The image processor used for preprocessing images.
multimodal_sample_config (MultiModalSampleConfig): MultiModalSampleConfig object.
"""
self.tokenizer = tokenizer
self.encoders: Dict[str, SampleEncoder] = {
@@ -173,5 +174,6 @@ def encode_batch(self, batch_data: ImageTextRawBatch) -> dict:
position_ids = torch.arange(seq_length, dtype=torch.long)
position_ids = position_ids.unsqueeze(0).repeat(micro_batch_size, 1)
batch_dict['position_ids'] = position_ids
batch_dict['attention_mask'] = None
if 'attention_mask' not in batch_dict:
batch_dict['attention_mask'] = None
return batch_dict
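The encode_batch hunk above broadcasts a single arange of position ids across the micro batch and now only fills in attention_mask when no encoder has already provided one. A runnable sketch of that construction with hypothetical shapes:

# illustrative sketch, not part of this commit
import torch

micro_batch_size, seq_length = 2, 8
position_ids = torch.arange(seq_length, dtype=torch.long)
position_ids = position_ids.unsqueeze(0).repeat(micro_batch_size, 1)
# position_ids.shape == (2, 8); every row is [0, 1, ..., 7]

batch_dict = {'position_ids': position_ids}
if 'attention_mask' not in batch_dict:   # the new guard keeps any mask set upstream
    batch_dict['attention_mask'] = None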
@@ -573,15 +573,23 @@ def _build_samples_mapping(self):
self.samples_mapping = None

def _build_loss_mask(self, processed_example):
seq_boundaries = processed_example['seq_boundaries']
if self.answer_only_loss:
seq_boundaries = processed_example['seq_boundaries']
return np.concatenate(
[
processed_example['loss_mask'][seq_boundaries[i] + 1 : seq_boundaries[i + 1]]
for i in range(len(seq_boundaries) - 1)
]
)
return [1.0] * (len(processed_example['input_ids']) - len(processed_example['seq_boundaries']) + 1)
return np.concatenate(
[
[
0 if x == self.tokenizer.eos_id else 1.0
for x in processed_example['input_ids'][seq_boundaries[i] : seq_boundaries[i + 1] - 1]
]
for i in range(len(seq_boundaries) - 1)
]
)
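The new branch above (taken when answer_only_loss is False) builds the loss mask directly from input_ids, zeroing every eos_id position so that padding tokens inside a packed sample do not contribute to the loss. A small numeric sketch with hypothetical token ids:

# illustrative sketch, not part of this commit
import numpy as np

eos_id = 2                              # hypothetical tokenizer.eos_id
input_ids = [5, 6, 2, 7, 8, 2, 2, 2]    # two packed sequences; the second is padded with eos
seq_boundaries = [0, 3, 8]

loss_mask = np.concatenate(
    [
        [
            0 if x == eos_id else 1.0
            for x in input_ids[seq_boundaries[i] : seq_boundaries[i + 1] - 1]
        ]
        for i in range(len(seq_boundaries) - 1)
    ]
)
# loss_mask -> [1., 1., 1., 1., 0., 0.]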

def _maybe_cast_to_list(self, x):
return [item.tolist() if isinstance(item, np.ndarray) else item for item in x]
@@ -622,16 +630,40 @@ def collate_fn(self, batch):

position_ids: List[List[int]] = []
cu_seqlens: List[List[int]] = []
cu_seqlens_unpadded: List[List[int]] = []
for item in batch:
position_ids.append([])
cu_seqlens.append([0])
cu_seqlens_unpadded.append([0])
seqlens = np.array(item['seq_boundaries'][1:]) - np.array(item['seq_boundaries'][:-1])
for l in seqlens:
# length minus 1 because input_ids is truncated by 1 for labels
position_ids[-1].extend(list(range(l - 1)))
cu_seqlens[-1].append(cu_seqlens[-1][-1] + l - 1)
# set last seq to the max seq len because rope and attn kernels expect no padding
cu_seqlens[-1][-1] = max_length

# the last seq needs to be the max seq len because rope and attn kernels expect no padding
assert cu_seqlens[-1][-1] <= max_length

# since data is prepadded when cp_size > 1, there may be some extra padding at the end
# of the packed sequence. In this case, we need to add the max seq len to the end.
if cu_seqlens[-1][-1] != max_length:
cu_seqlens[-1].append(max_length)

for i in range(len(item['seq_boundaries']) - 1):
current_seq = item['input_ids'][item['seq_boundaries'][i] : item['seq_boundaries'][i + 1] - 1]

# since the data could be prepadded with the tokenizer's eos_id, find the indices of all eos_id tokens
eos_idx = np.where(np.array(current_seq) == self.tokenizer.eos_id)

# The first eos_id index marks the length of the original unpadded sequence if the sequence is
# prepadded for cp_size > 1. Otherwise, there is no extra padding.
seqlen_unpadded = eos_idx[0][0] + 1 if eos_idx[0].any() else len(current_seq)
cu_seqlens_unpadded[-1].append(cu_seqlens_unpadded[-1][-1] + seqlen_unpadded)

# if extra paddings are added in the packed sequence, they can't be counted as
# actual tokens for training
if len(cu_seqlens[-1]) > len(cu_seqlens_unpadded[-1]):
cu_seqlens_unpadded[-1].append(cu_seqlens_unpadded[-1][-1])

assert len(input_ids[0]) == len(
position_ids[0]
Expand All @@ -652,12 +684,16 @@ def collate_fn(self, batch):

if self.return_cu_seqlen:
cu_seqlens = self._collate_item(cu_seqlens, max_length=max(len(l) for l in cu_seqlens) + 1, pad_id=-1)

cu_seqlens_unpadded = self._collate_item(
cu_seqlens_unpadded, max_length=max(len(l) for l in cu_seqlens_unpadded) + 1, pad_id=-1
)
# Pre-generate `cu_seqlens_argmin` and `max_seqlen` as CPU tensor to avoid device-to-host copies.
cu_seqlens = torch.IntTensor(cu_seqlens)
cu_seqlens_argmin = torch.argmin(cu_seqlens, dim=1, keepdim=True)
seqlens = cu_seqlens[:, 1:] - cu_seqlens[:, :-1]
max_seqlen, _ = seqlens.max(dim=1, keepdim=True)
cu_seqlens_unpadded = torch.IntTensor(cu_seqlens_unpadded)
cu_seqlens_unpadded_argmin = torch.argmin(cu_seqlens_unpadded, dim=1, keepdim=True)

processed_batch.update(
{
Expand All @@ -667,6 +703,8 @@ def collate_fn(self, batch):
'cu_seqlens': torch.IntTensor(cu_seqlens), # cu_seqlens_q must be in dtype torch.int32
'cu_seqlens_argmin': cu_seqlens_argmin, # only required for perf
'max_seqlen': max_seqlen, # only required for perf
'cu_seqlens_unpadded': torch.IntTensor(cu_seqlens_unpadded),
'cu_seqlens_unpadded_argmin': cu_seqlens_unpadded_argmin,
}
)
else:
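The collate_fn changes above stop overwriting the last cu_seqlens entry and instead append max_length when the packed item carries trailing padding, while the new cu_seqlens_unpadded counts only real tokens so that padding is never treated as trainable. A numeric sketch for one hypothetical packed item:

# illustrative sketch, not part of this commit
max_length = 10
seqlens = [3, 4]                          # two packed sequences (hypothetical)

cu_seqlens = [0]
for l in seqlens:
    cu_seqlens.append(cu_seqlens[-1] + l - 1)        # input_ids truncated by 1 for labels -> [0, 2, 5]
if cu_seqlens[-1] != max_length:
    cu_seqlens.append(max_length)                    # trailing padding represented -> [0, 2, 5, 10]

cu_seqlens_unpadded = [0, 2, 5]                      # cumulative lengths of real tokens only
if len(cu_seqlens) > len(cu_seqlens_unpadded):
    cu_seqlens_unpadded.append(cu_seqlens_unpadded[-1])   # -> [0, 2, 5, 5]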

