Merge branch 'main' into neva_mistral
paul-gibbons authored Jul 2, 2024
2 parents 93cd6a7 + 043a080 commit eb345bf
Showing 188 changed files with 12,233 additions and 7,375 deletions.
7 changes: 7 additions & 0 deletions .github/labeler.yml
@@ -34,6 +34,13 @@ TTS:
- tests/collections/tts/**
- tests/collections/common/tokenizers/text_to_speech/**

Audio:
- nemo/collections/audio/**/*
- examples/audio/**/*
- tutorials/audio/**/*
- docs/source/audio/**/*
- tests/collections/audio/**

core:
- nemo/core/**/*
- tests/core/**
51 changes: 46 additions & 5 deletions .github/workflows/cicd-main.yml
@@ -213,7 +213,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_quantization.py \
python examples/nlp/language_modeling/megatron_gpt_ptq.py \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.algorithm=null \
export.save_path=/home/TestData/nlp/megatron_llama/ci_baseline
@@ -226,7 +226,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_quantization.py \
python examples/nlp/language_modeling/megatron_gpt_ptq.py \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
model.tensor_model_parallel_size=2 \
trainer.devices=2 \
@@ -245,7 +245,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_quantization.py \
python examples/nlp/language_modeling/megatron_gpt_ptq.py \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
quantization.algorithm=int8_sq \
@@ -274,7 +274,7 @@ jobs:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/nlp/language_modeling/megatron_gpt_quantization.py \
# python examples/nlp/language_modeling/megatron_gpt_ptq.py \
# model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
# model.tensor_model_parallel_size=1 \
# trainer.devices=1 \
@@ -288,6 +288,45 @@ jobs:
#- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"

L2_QAT_Llama2_INT4:
  needs: [cicd-test-container-setup]
  runs-on: self-hosted-azure
  timeout-minutes: 10
  container:
    image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
    options:
      # --user 0:128
      --device=/dev/nvidia0
      --gpus all
      --shm-size=8g
      --env TRANSFORMERS_OFFLINE=0
      --env HYDRA_FULL_ERROR=1
      --volume /mnt/datadrive/TestData:/home/TestData
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - run: |
        python examples/nlp/language_modeling/tuning/megatron_gpt_qat.py \
          quantization.algorithm=int4 \
          quantization.num_calib_size=8 \
          trainer.devices=1 \
          trainer.num_nodes=1 \
          trainer.max_steps=4 \
          trainer.val_check_interval=4 \
          +trainer.limit_val_batches=2 \
          exp_manager.explicit_log_dir=llama2_qat_results \
          model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
          model.tensor_model_parallel_size=1 \
          model.pipeline_model_parallel_size=1 \
          model.global_batch_size=2 \
          model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
          model.data.train_ds.concat_sampling_probabilities=[1.0] \
          model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl]
        rm -rf llama2_qat_results
    - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
      if: "failure()"

# L2: ASR dev run
ASR_dev_run_Speech_to_Text:
needs: [cicd-test-container-setup]
@@ -4435,7 +4474,9 @@ jobs:
name: Checkout repository
uses: actions/checkout@v4

- if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' }}
- if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' && env.SLACK_WEBHOOK != '' }}
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
run: |
set -x
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -33,7 +33,7 @@ WORKDIR /workspace

# Install NeMo requirements
ARG TE_TAG=bfe21c3d68b0a9951e5716fb520045db53419c5e
ARG MODELOPT_VERSION=0.11.0
ARG MODELOPT_VERSION=0.13.0
ARG MCORE_TAG=02871b4df8c69fac687ab6676c4246e936ce92d0
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
113 changes: 112 additions & 1 deletion docs/source/core/exp_manager.rst
@@ -203,10 +203,121 @@ file followed by a graceful exit from the run. The checkpoint saved upon preempt
This feature is useful to increase utilization on clusters.
The ``PreemptionCallback`` is enabled by default. To disable it simply add ``create_preemption_callback: False`` under exp_manager in the config YAML file.

Straggler Detection
--------------------

.. _exp_manager_straggler_det_support-label:

.. note::
    The straggler detection feature is included in the optional NeMo resiliency package.

Distributed training can be affected by stragglers: slow workers that hold back the progress of the entire run.
NeMo provides a straggler detection feature that can identify these slower GPUs.

This feature is implemented in the ``StragglerDetectionCallback``, which is disabled by default.

The callback computes normalized GPU performance scores, which are scalar values ranging from 0.0 (worst) to 1.0 (best).
A performance score can be interpreted as the ratio of current performance to reference performance.

The callback provides two types of performance scores:

- Relative GPU performance score: the best-performing GPU in the workload is used as the reference.
- Individual GPU performance score: the best historical performance of the same GPU is used as the reference.

Examples:

- A relative performance score of 0.5 means that a GPU is running at half the speed of the fastest GPU.
- An individual performance score of 0.5 means that a GPU is running at half of its own best observed performance.
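
As an illustrative reading of the definitions above (the exact metric is internal to the callback, so treat this as a
sketch rather than the implemented formula), the scores for GPU :math:`i` with measured performance :math:`p_i(t)` at
step :math:`t` can be thought of as

.. math::

    S^{\mathrm{relative}}_i = \frac{p_i(t)}{\max_j p_j(t)}, \qquad
    S^{\mathrm{individual}}_i = \frac{p_i(t)}{\max_{t' \le t} p_i(t')}

so both scores fall between 0.0 and 1.0, and 1.0 means the GPU matches its reference.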

If a GPU performance score drops below the specified threshold, it is identified as a straggler.

To enable straggler detection, add ``create_straggler_detection_callback: True`` under exp_manager in the config YAML file.
You might also want to adjust the callback parameters:

.. code-block:: yaml

    exp_manager:
        ...
        create_straggler_detection_callback: True
        straggler_detection_callback_params:
            report_time_interval: 300           # Interval [seconds] of the straggler check
            calc_relative_gpu_perf: True        # Calculate relative GPU performance
            calc_individual_gpu_perf: True      # Calculate individual GPU performance
            num_gpu_perf_scores_to_log: 5       # Log 5 best and 5 worst GPU performance scores, even if no stragglers are detected
            gpu_relative_perf_threshold: 0.7    # Threshold for relative GPU performance scores
            gpu_individual_perf_threshold: 0.7  # Threshold for individual GPU performance scores
            stop_if_detected: True              # Terminate the workload if stragglers are detected

Straggler detection might involve inter-rank synchronization, and should be invoked with reasonable frequency (e.g. every few minutes).

Fault Tolerance
---------------

.. _exp_manager_fault_tolerance_support-label:

.. note::
    The fault tolerance feature is included in the optional NeMo resiliency package.

When training DNN models, faults may occur, hindering the progress of the entire training process.
This is particularly common in large-scale, multi-node training scenarios involving many GPUs.

NeMo incorporates a fault tolerance mechanism to detect training halts.
In response, it can terminate a hung workload and, if requested, restart it from the last checkpoint.

Fault tolerance ("FT") relies on a special launcher (``ft_launcher``), which is a modified ``torchrun``.
The FT launcher runs background processes called rank monitors. **You need to use ft_launcher to start
your workload if you are using FT**. For example, `NeMo-Framework-Launcher <https://github.com/NVIDIA/NeMo-Framework-Launcher>`_
can be used to generate SLURM batch scripts with FT support.

Each training process (rank) sends `heartbeats` to its monitor during training and validation steps.
If a rank monitor stops receiving `heartbeats`, a training failure is detected.

Fault detection is implemented in the ``FaultToleranceCallback`` and is disabled by default.
To enable it, add a ``create_fault_tolerance_callback: True`` option under ``exp_manager`` in the
config YAML file. Additionally, you can customize FT parameters by adding a ``fault_tolerance`` section:

.. code-block:: yaml

    exp_manager:
        ...
        create_fault_tolerance_callback: True
        fault_tolerance:
            initial_rank_heartbeat_timeout: 600  # wait for 10 minutes for the initial heartbeat
            rank_heartbeat_timeout: 300          # wait for 5 minutes for subsequent heartbeats
            calculate_timeouts: True             # estimate more accurate timeouts based on observed intervals

Timeouts for fault detection need to be adjusted for a given workload:

* ``initial_rank_heartbeat_timeout`` should be long enough to allow for workload initialization.
* ``rank_heartbeat_timeout`` should be at least as long as the longest possible interval between steps.

**Importantly, `heartbeats` are not sent during checkpoint loading and saving**, so the time needed for
checkpoint-related operations should be taken into account.

If ``calculate_timeouts: True`` is set, timeouts are estimated automatically based on the observed heartbeat intervals.
Estimated timeouts take precedence over the timeouts defined in the config file. **Timeouts are estimated only after
checkpoint loading and saving have been observed**. For example, in a multi-part training run started from scratch,
estimated timeouts won't be available during the first run. Estimated timeouts are stored in the checkpoint.

``max_subsequent_job_failures`` allows for the automatic continuation of training on a SLURM cluster.
This feature requires the SLURM job to be scheduled with ``NeMo-Framework-Launcher``. If the ``max_subsequent_job_failures``
value is ``>0``, a continuation job is prescheduled. It continues the work until ``max_subsequent_job_failures``
subsequent jobs have failed (SLURM job exit code ``!= 0``) or the training completes successfully
(the "end of training" marker file is produced by the ``FaultToleranceCallback``, e.g. because the iteration or time limit was reached).
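
For example, a configuration enabling autoresume for up to three subsequent failed jobs could look like the following
(a sketch only; ``max_subsequent_job_failures`` is consumed by the FT launcher, so verify where your launcher setup
expects it to be defined):

.. code-block:: yaml

    exp_manager:
        ...
        create_fault_tolerance_callback: True
        fault_tolerance:
            calculate_timeouts: True        # estimate timeouts from observed intervals (stored in the checkpoint)
            max_subsequent_job_failures: 3  # illustrative value; 0 (the default) disables autoresume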

Summary of all FT configuration items:

* ``workload_check_interval`` (float, default=5.0) Periodic workload check interval [seconds] in the workload monitor.
* ``initial_rank_heartbeat_timeout`` (Optional[float], default=60.0 * 60.0) Timeout for the first heartbeat from a rank.
* ``rank_heartbeat_timeout`` (Optional[float], default=45.0 * 60.0) Timeout for subsequent heartbeats from a rank.
* ``calculate_timeouts`` (bool, default=True) Try to calculate ``rank_heartbeat_timeout`` and ``initial_rank_heartbeat_timeout``
  based on the observed heartbeat intervals.
* ``rank_termination_signal`` (signal.Signals, default=signal.SIGKILL) Signal used to terminate the rank when a failure is detected.
* ``log_level`` (str, default='INFO') Log level for the FT client and server (rank monitor).
* ``max_rank_restarts`` (int, default=0) Used by the FT launcher. Max number of restarts for a rank.
  If ``>0``, ranks are restarted on existing nodes in case of a failure.
* ``max_subsequent_job_failures`` (int, default=0) Used by the FT launcher. How many subsequent job failures are allowed before autoresume stops.
  ``0`` means do not autoresume.
* ``additional_ft_launcher_args`` (str, default='') Additional FT launcher parameters (for advanced use).


.. _nemo_multirun-label:

Hydra Multi-Run with NeMo
-------------------------

2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -12,7 +12,7 @@ NVIDIA NeMo Framework is an end-to-end, cloud-native framework designed to build
- Flash Attention
- Activation Recomputation
- Positional Embeddings and Positional Interpolation
- Post-Training Quantization (PTQ) with ModelOpt
- Post-Training Quantization (PTQ) and Quantization Aware Training (QAT) with `TensorRT Model Optimizer <https://github.com/NVIDIA/TensorRT-Model-Optimizer>`_
- Sequence Packing

`NVIDIA NeMo Framework <https://github.com/NVIDIA/NeMo>`_ has separate collections for: