From 4830a0786213b0dc15053bb2f55c37fba1a953ce Mon Sep 17 00:00:00 2001
From: Anna Shors
Date: Tue, 10 Dec 2024 13:39:05 -0800
Subject: [PATCH 1/2] docs: add eval documentation (#428)

Signed-off-by: ashors1
---
 docs/user-guide/aligner-algo-header.rst       |  4 +++-
 docs/user-guide/evaluation.rst                | 39 +++++++++++++++++++
 .../nlp/data/sft/remove_long_dialogues.py     |  2 +-
 3 files changed, 43 insertions(+), 2 deletions(-)
 create mode 100644 docs/user-guide/evaluation.rst

diff --git a/docs/user-guide/aligner-algo-header.rst b/docs/user-guide/aligner-algo-header.rst
index 15114dc02..a9e029784 100644
--- a/docs/user-guide/aligner-algo-header.rst
+++ b/docs/user-guide/aligner-algo-header.rst
@@ -1,4 +1,6 @@
 .. important::
    Before starting this tutorial, be sure to review the :ref:`introduction ` for tips on setting up your NeMo-Aligner environment.
 
-   If you run into any problems, refer to NeMo's `Known Issues page `__. The page enumerates known issues and provides suggested workarounds where appropriate.
\ No newline at end of file
+   If you run into any problems, refer to NeMo's `Known Issues page `__. The page enumerates known issues and provides suggested workarounds where appropriate.
+
+   After completing this tutorial, refer to the :ref:`evaluation documentation <nemo-aligner-eval>` for tips on evaluating a trained model.
\ No newline at end of file
diff --git a/docs/user-guide/evaluation.rst b/docs/user-guide/evaluation.rst
new file mode 100644
index 000000000..0922905a8
--- /dev/null
+++ b/docs/user-guide/evaluation.rst
@@ -0,0 +1,39 @@
+.. include:: /content/nemo.rsts
+
+.. _nemo-aligner-eval:
+
+Evaluate a Trained Model
+@@@@@@@@@@@@@@@@@@@@@@@@
+
+After training a model, you may want to run evaluation to understand how the model performs on unseen tasks. You can use Eleuther AI's `Language Model Evaluation Harness <https://github.com/EleutherAI/lm-evaluation-harness>`_
+to quickly run a variety of popular benchmarks, including MMLU, SuperGLUE, HellaSwag, and WinoGrande.
+A full list of supported tasks can be found `here `_.
+
+Install the LM Evaluation Harness
+#################################
+
+Run the following commands inside a NeMo container to install the LM Evaluation Harness:
+
+.. code-block:: bash
+
+   git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness
+   cd lm-evaluation-harness
+   pip install -e .
+
+
+Run Evaluations
+###############
+
+A detailed description of running evaluation with ``.nemo`` models can be found in Eleuther AI's `documentation `_.
+Single- and multi-GPU evaluation is supported. The following example runs evaluation on 8 GPUs with a ``.nemo`` file from NeMo-Aligner on the ``lambada_openai``, ``super-glue-lm-eval-v1``, and ``winogrande`` tasks.
+Note that unzipping your ``.nemo`` file before running evaluations, as shown below, is recommended but not required.
+
+.. code-block:: bash
+
+   mkdir unzipped_checkpoint
+   tar -xvf /path/to/model.nemo -C unzipped_checkpoint
+
+   torchrun --nproc-per-node=8 --no-python lm_eval --model nemo_lm \
+       --model_args path='unzipped_checkpoint',devices=8,tensor_model_parallel_size=8 \
+       --tasks lambada_openai,super-glue-lm-eval-v1,winogrande \
+       --batch_size 8
diff --git a/examples/nlp/data/sft/remove_long_dialogues.py b/examples/nlp/data/sft/remove_long_dialogues.py
index 680f91606..95208f440 100644
--- a/examples/nlp/data/sft/remove_long_dialogues.py
+++ b/examples/nlp/data/sft/remove_long_dialogues.py
@@ -25,7 +25,7 @@
 Usage:
   python3 remove_long_dialogues.py \
     --tokenizer_path \
-    --tokenizer_type sentencepiece
+    --tokenizer_type sentencepiece \
     --dataset_file \
     --output_file \
     --seq_len
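For a quick smoke test before committing 8 GPUs, the harness can also be run on a single GPU. The following is a minimal sketch, not part of the patch above: it assumes the same unzipped checkpoint layout produced by the ``tar`` step, keeps the documented ``lm_eval`` flags (``--model``, ``--model_args``, ``--tasks``, ``--batch_size``), and simply drops ``torchrun`` and the parallelism settings:

.. code-block:: bash

   # Hypothetical single-GPU run; assumes ./unzipped_checkpoint was created
   # with the tar command shown in the patch above.
   lm_eval --model nemo_lm \
       --model_args path='unzipped_checkpoint' \
       --tasks winogrande \
       --batch_size 8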
From 2ead6bf14d37f776f82c3b3204b3542cef2b226b Mon Sep 17 00:00:00 2001
From: Anna Shors
Date: Wed, 11 Dec 2024 10:28:37 -0800
Subject: [PATCH 2/2] fix: bug fix for KD + PP (#443)

Signed-off-by: ashors1
---
 .../models/nlp/gpt/megatron_gpt_knowledge_distillation.py | 6 ++++--
 tests/functional/kd.sh                                    | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/nemo_aligner/models/nlp/gpt/megatron_gpt_knowledge_distillation.py b/nemo_aligner/models/nlp/gpt/megatron_gpt_knowledge_distillation.py
index b67d858ed..db93d29ec 100644
--- a/nemo_aligner/models/nlp/gpt/megatron_gpt_knowledge_distillation.py
+++ b/nemo_aligner/models/nlp/gpt/megatron_gpt_knowledge_distillation.py
@@ -72,7 +72,7 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all
                 required_keys.update(("tokens", "position_ids"))
 
             if parallel_state.is_pipeline_last_stage():
-                required_keys.update(("labels", "loss_mask"))
+                required_keys.update(("labels", "loss_mask", "topk_logits", "topk_token_ids"))
 
         batch = {key: val.cuda(non_blocking=True) if key in required_keys else None for key, val in batch.items()}
 
@@ -83,7 +83,9 @@
         tokens = batch["tokens"]
         labels = batch["labels"]
-        loss_mask = batch["loss_mask"].clamp(min=0, max=1)
+        loss_mask = batch["loss_mask"]
+        if loss_mask is not None:
+            loss_mask = loss_mask.clamp(min=0, max=1)
         target_topk_logits = batch["topk_logits"]
         target_topk_token_ids = batch["topk_token_ids"]
 
         # Model forward pass
diff --git a/tests/functional/kd.sh b/tests/functional/kd.sh
index 83e472f52..fa7c2a4b9 100644
--- a/tests/functional/kd.sh
+++ b/tests/functional/kd.sh
@@ -83,7 +83,7 @@ torchrun --nproc-per-node 2 ${GPFS}/examples/nlp/gpt/train_gpt_knowledge_distill
     exp_manager.create_checkpoint_callback=False \
     model.data.num_workers=2 \
     ++model.tensor_model_parallel_size=1 \
-    ++model.pipeline_model_parallel_size=1 \
+    ++model.pipeline_model_parallel_size=2 \
     exp_manager.explicit_log_dir=${RESULTS_DIR} \
     ++model.activations_checkpoint_granularity=full \
     ++model.activations_checkpoint_method=uniform \
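The ``loss_mask`` guard in the second hunk is the core of the KD + PP fix: with pipeline parallelism enabled, the dict comprehension in ``fwd_output_and_loss_func`` maps every batch key outside ``required_keys`` to ``None``, so non-last pipeline stages receive ``batch["loss_mask"] is None`` and the old chained ``.clamp(min=0, max=1)`` raised an ``AttributeError`` (the first hunk similarly ensures the last stage actually keeps the ``topk_logits``/``topk_token_ids`` tensors the KD loss needs). A minimal, self-contained sketch of the failure mode and the guard, with illustrative values rather than NeMo-Aligner code:

.. code-block:: python

   import torch

   # A first pipeline stage only needs the input-side keys.
   required_keys = {"tokens", "position_ids"}
   raw_batch = {
       "tokens": torch.zeros(2, 4, dtype=torch.long),
       "position_ids": torch.zeros(2, 4, dtype=torch.long),
       "loss_mask": torch.ones(2, 4),
   }

   # Mirrors the dict comprehension in the patch: non-required keys become None.
   batch = {k: v if k in required_keys else None for k, v in raw_batch.items()}

   loss_mask = batch["loss_mask"]  # None on this stage
   # The old code called batch["loss_mask"].clamp(min=0, max=1) unconditionally,
   # which raises AttributeError whenever the value is None.
   if loss_mask is not None:
       loss_mask = loss_mask.clamp(min=0, max=1)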