From fe03ee9003b22dd9b113655ef3749095319940e0 Mon Sep 17 00:00:00 2001
From: eharper
Date: Tue, 12 Sep 2023 17:09:08 -0600
Subject: [PATCH 01/10] update branch

Signed-off-by: eharper
---
 Jenkinsfile | 344 +++++++++---------
 nemo/package_info.py | 2 +-
 tutorials/00_NeMo_Primer.ipynb | 2 +-
 tutorials/01_NeMo_Models.ipynb | 2 +-
 tutorials/02_NeMo_Adapters.ipynb | 2 +-
 tutorials/AudioTranslationSample.ipynb | 2 +-
 ...blish_NeMo_Model_On_Hugging_Face_Hub.ipynb | 2 +-
 tutorials/VoiceSwapSample.ipynb | 2 +-
 .../asr/ASR_CTC_Language_Finetuning.ipynb | 2 +-
 tutorials/asr/ASR_Confidence_Estimation.ipynb | 2 +-
 tutorials/asr/ASR_TTS_Tutorial.ipynb | 2 +-
 tutorials/asr/ASR_for_telephony_speech.ipynb | 2 +-
 tutorials/asr/ASR_with_NeMo.ipynb | 4 +-
 .../asr/ASR_with_Subword_Tokenization.ipynb | 2 +-
 tutorials/asr/ASR_with_Transducers.ipynb | 2 +-
 .../asr/Buffered_Transducer_Inference.ipynb | 2 +-
 ..._Transducer_Inference_with_LCS_Merge.ipynb | 2 +-
 tutorials/asr/Intro_to_Transducers.ipynb | 2 +-
 tutorials/asr/Multilang_ASR.ipynb | 2 +-
 tutorials/asr/Offline_ASR.ipynb | 2 +-
 .../Offline_ASR_with_VAD_for_CTC_models.ipynb | 2 +-
 .../asr/Online_ASR_Microphone_Demo.ipynb | 2 +-
 tutorials/asr/Online_Noise_Augmentation.ipynb | 2 +-
 .../Online_Offline_Microphone_VAD_Demo.ipynb | 2 +-
 .../Online_Offline_Speech_Commands_Demo.ipynb | 2 +-
 .../asr/Self_Supervised_Pre_Training.ipynb | 2 +-
 tutorials/asr/Speech_Commands.ipynb | 2 +-
 tutorials/asr/Streaming_ASR.ipynb | 2 +-
 tutorials/asr/Voice_Activity_Detection.ipynb | 2 +-
 .../asr/asr_adapters/ASR_with_Adapters.ipynb | 2 +-
 .../Speech_Enhancement_with_NeMo.ipynb | 4 +-
 ...netuning_at_Scale_with_AWS_SageMaker.ipynb | 2 +-
 .../cloud/aws/SageMaker_ASR_Training.ipynb | 2 +-
 ...Language_Models_for_Downstream_Tasks.ipynb | 2 +-
 tutorials/nlp/02_NLP_Tokenizers.ipynb | 4 +-
 ...a_Preprocessing_and_Cleaning_for_NMT.ipynb | 2 +-
 tutorials/nlp/Dialogue.ipynb | 2 +-
 tutorials/nlp/Entity_Linking_Medical.ipynb | 2 +-
 tutorials/nlp/GLUE_Benchmark.ipynb | 2 +-
 tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb | 2 +-
 ...Joint_Intent_and_Slot_Classification.ipynb | 2 +-
 tutorials/nlp/MegatronBert_export.ipynb | 2 +-
 ...on_Synthetic_Tabular_Data_Generation.ipynb | 2 +-
 .../nlp/Multitask_Prompt_and_PTuning.ipynb | 2 +-
 .../nlp/Punctuation_and_Capitalization.ipynb | 2 +-
 ...ion_and_Capitalization_Lexical_Audio.ipynb | 2 +-
 tutorials/nlp/Question_Answering.ipynb | 2 +-
 .../nlp/Relation_Extraction-BioMegatron.ipynb | 2 +-
 ...pellMapper_English_ASR_Customization.ipynb | 2 +-
 ...xt_Classification_Sentiment_Analysis.ipynb | 2 +-
 .../Token_Classification-BioMegatron.ipynb | 2 +-
 ...ssification_Named_Entity_Recognition.ipynb | 4 +-
 .../nlp/Zero_Shot_Intent_Recognition.ipynb | 2 +-
 tutorials/nlp/lora.ipynb | 2 +-
 .../ASR_with_SpeakerDiarization.ipynb | 2 +-
 .../Speaker_Diarization_Inference.ipynb | 2 +-
 .../Speaker_Diarization_Training.ipynb | 4 +-
 .../Speaker_Identification_Verification.ipynb | 2 +-
 .../tools/CTC_Segmentation_Tutorial.ipynb | 2 +-
 tutorials/tools/Multispeaker_Simulator.ipynb | 2 +-
 .../tools/NeMo_Forced_Aligner_Tutorial.ipynb | 2 +-
 .../tts/Aligner_Inference_Examples.ipynb | 2 +-
 .../Evaluation_MelCepstralDistortion.ipynb | 2 +-
 .../tts/FastPitch_Adapter_Finetuning.ipynb | 2 +-
 .../tts/FastPitch_ChineseTTS_Training.ipynb | 2 +-
 .../tts/FastPitch_Data_Preparation.ipynb | 2 +-
 tutorials/tts/FastPitch_Finetuning.ipynb | 2 +-
 .../tts/FastPitch_GermanTTS_Training.ipynb | 2 +-
 .../tts/FastPitch_MixerTTS_Training.ipynb | 2 +-
 .../FastPitch_MultiSpeaker_Pretraining.ipynb | 2 +-
.../tts/FastPitch_Speaker_Interpolation.ipynb | 2 +- .../tts/Inference_DurationPitchControl.ipynb | 2 +- tutorials/tts/Inference_ModelSelect.ipynb | 2 +- tutorials/tts/NeMo_TTS_Primer.ipynb | 2 +- .../tts/Pronunciation_customization.ipynb | 2 +- tutorials/tts/Tacotron2_Training.ipynb | 2 +- tutorials/tts/Vits_Training.ipynb | 2 +- 77 files changed, 253 insertions(+), 253 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 04bc96ff1596..318edcd86227 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -97,8 +97,8 @@ pipeline { stage('L0: Unit Tests CPU') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } steps { @@ -111,8 +111,8 @@ pipeline { stage('L2: Community LLM Checkpoints tests') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -128,8 +128,8 @@ pipeline { stage('L2: ASR dev run') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -232,8 +232,8 @@ pipeline { stage('L2: ASR dev run - part two') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -262,8 +262,8 @@ pipeline { stage('L2: Speech to Text EMA') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } steps { @@ -283,8 +283,8 @@ pipeline { stage('L2: Speaker dev run') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -406,8 +406,8 @@ pipeline { // stage('L2: ASR DALI dev run') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -474,8 +474,8 @@ pipeline { // stage('L2: ASR RNNT dev run') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -536,8 +536,8 @@ pipeline { // stage('L2: Hybrid ASR RNNT-CTC dev run') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -566,8 +566,8 @@ pipeline { stage('L2: ASR Multi-dataloader dev run') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -614,8 +614,8 @@ pipeline { stage('L2: ASR Adapters') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -660,8 +660,8 @@ pipeline { //stage('L2: Megatron T5 Adapter PP=2') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -706,8 +706,8 @@ pipeline { //stage('L2: Megatron T5 Adapter TP=2') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -750,8 +750,8 @@ pipeline { stage('L2: Megatron T5 IA3 PP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -797,8 +797,8 @@ pipeline { stage('L2: Megatron T5 IA3 TP=2') { when { anyOf { - branch 'main' - changeRequest 
target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -842,8 +842,8 @@ pipeline { stage('L2: Speech Transcription') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -863,8 +863,8 @@ pipeline { stage('L2: Transducer alignment') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -880,8 +880,8 @@ pipeline { stage('L2: Segmentation Tool') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } stages { @@ -936,8 +936,8 @@ pipeline { stage('L2: G2P Models') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -1018,8 +1018,8 @@ pipeline { // stage('L2: Multi-GPU Megatron finetuning') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -1045,8 +1045,8 @@ pipeline { stage('L2: STS-b') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -1105,8 +1105,8 @@ pipeline { stage('L2: Dialogue Classification') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -1276,8 +1276,8 @@ pipeline { stage('L2: Dialogue Generation') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -1342,8 +1342,8 @@ pipeline { // stage('L2: Dialogue Generation Part 2') { // when { // anyOf { -// branch 'main' -// changeRequest target: 'main' +// branch 'r1.21.0' +// changeRequest target: 'r1.21.0' // } // } // failFast true @@ -1372,8 +1372,8 @@ pipeline { stage('L2: COPY') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -1402,8 +1402,8 @@ pipeline { stage('L2: Duplex Text Normalization') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -1440,8 +1440,8 @@ pipeline { // stage('L2: MegaBERT Token Classification') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -1466,8 +1466,8 @@ pipeline { stage('L2: BERT Text Classification') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -1495,8 +1495,8 @@ pipeline { stage('L2: Parallel BERT Question-Answering SQUAD v1.1 & v2.0') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -1554,8 +1554,8 @@ pipeline { stage('L2: Parallel BART Question-Answering SQUAD v1.1 & v2.0') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -1615,8 +1615,8 @@ pipeline { stage('L2: Parallel GPT2 Question-Answering SQUAD v1.1 & v2.0') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -1676,8 +1676,8 @@ pipeline { stage('L2: Intent and Slot Classification Tasks') 
{ when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -1716,8 +1716,8 @@ pipeline { // stage('L2: Model Parallel Size 2 Megatron Text Classification') { // when { // anyOf{ - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -1745,8 +1745,8 @@ pipeline { // stage('L2: Model Parallel Size 2 Megatron Autoresume') { // when { // anyOf{ - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -1776,8 +1776,8 @@ pipeline { // stage('L2: Model Parallel Size 2 Megatron Evaluation from .nemo') { // when { // anyOf{ - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -1797,8 +1797,8 @@ pipeline { // stage('L2: Model Parallel Size 2 Megatron Train from .nemo') { // when { // anyOf{ - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -1820,8 +1820,8 @@ pipeline { stage('L2: Parallel NLP Examples 2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -1945,8 +1945,8 @@ pipeline { stage('Punctuation & Capitalization tarred dataset') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -2004,8 +2004,8 @@ pipeline { stage('Punctuation & Capitalization, Different ways of passing labels to model') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -2112,8 +2112,8 @@ pipeline { stage('Punctuation & Capitalization inference') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -2138,8 +2138,8 @@ pipeline { stage('L2: Parallel Pretraining BERT pretraining from Text/Preprocessed') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -2203,8 +2203,8 @@ pipeline { stage('L2: Entity Linking') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -2231,8 +2231,8 @@ pipeline { stage('L2: NMT Attention is All You Need Training') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -2354,8 +2354,8 @@ pipeline { stage('L2: NMT Attention is All You Need Inference') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -2390,8 +2390,8 @@ pipeline { stage('L2: NMT Attention is All You Need Finetuning') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -2425,8 +2425,8 @@ pipeline { stage('L2: NMT Tarred Dataset Creation') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -2479,8 +2479,8 @@ pipeline { stage('L2: Megatron NMT Training TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 
'r1.21.0' } } failFast true @@ -2580,8 +2580,8 @@ pipeline { // Testing Megatron hidden transformations when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -2680,8 +2680,8 @@ pipeline { // stage('L2: NMT Bottleneck Fallback') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -2727,8 +2727,8 @@ pipeline { // stage('L2: NMT Bottleneck Architecture') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -2810,8 +2810,8 @@ pipeline { // stage('L2: NMT Bottleneck LVM') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -2893,8 +2893,8 @@ pipeline { stage('L2: Megatron Bert Pretraining and Resume Training with Pipeline Paralleism') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -2963,8 +2963,8 @@ pipeline { stage('L2: Megatron Bert Pretraining and Resume Training') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -3034,8 +3034,8 @@ pipeline { stage('L2: Megatron RETRO Pretraining and Resume Training') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -3106,8 +3106,8 @@ pipeline { stage('L2: Megatron RETRO muTransfer Pretraining Performance') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -3189,8 +3189,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: BioMegatron Bert NER Task') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -3207,8 +3207,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT Pretraining and Resume Training TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -3289,8 +3289,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT with Rope Pretraining and Resume Training TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -3379,8 +3379,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' // stage('L2: Megatron GPT with Rope Pretraining using Flash Attention and Resume Training TP=2') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -3468,8 +3468,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT with ALiBi Pretraining and Resume Training TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -3553,8 +3553,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT with KERPLE Pretraining and Resume Training 
TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -3638,8 +3638,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT Pretraining and Resume Training PP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -3723,8 +3723,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT Finetuning PP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -3791,8 +3791,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT PEFT Lora PP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -3826,8 +3826,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT PEFT Lora TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -3877,8 +3877,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT Eval') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -3894,8 +3894,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT Eval PP2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -3913,8 +3913,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron GPT SFT Eval (inference seq len > training seq len)') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -3941,8 +3941,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' // stage('L2: Megatron GPT Convert from Megatron-LM checkpoing and Eval') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -3968,8 +3968,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron Change Partitions') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -4015,8 +4015,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 Pretraining and Resume Training TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -4111,8 +4111,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 with ALiBi Pretraining and Resume Training TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -4207,8 +4207,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 with KERPLE Pretraining and Resume Training TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true 
@@ -4303,8 +4303,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 Pretraining and Resume Training PP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -4373,8 +4373,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 w/ Mixture of Expert Pretraining') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -4419,8 +4419,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' //stage('L2: Megatron T5 Prompt Learning TP1 PP1') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -4460,8 +4460,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 Prompt Learning TP2 PP1') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -4505,8 +4505,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' // stage('L2: Megatron T5 Prompt Learning TP1 PP2') { // when { // anyOf { - // branch 'main' - // changeRequest target: 'main' + // branch 'r1.21.0' + // changeRequest target: 'r1.21.0' // } // } // failFast true @@ -4548,8 +4548,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron UL2 Pretraining and Resume Training TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -4628,8 +4628,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 Eval') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -4645,8 +4645,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron BART Pretraining and Resume Training, TP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -4714,8 +4714,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron BART Pretraining and Resume Training, PP=2') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -4787,8 +4787,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron T5 GLUE/XNLI Finetuning') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -4860,8 +4860,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: Megatron Mock Data Generation') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true @@ -4897,8 +4897,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L2: TTS Fast dev runs 1') { when { anyOf { - branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } parallel { @@ -5043,8 +5043,8 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' stage('L??: Speech Checkpoints tests') { when { anyOf { - 
branch 'main' - changeRequest target: 'main' + branch 'r1.21.0' + changeRequest target: 'r1.21.0' } } failFast true diff --git a/nemo/package_info.py b/nemo/package_info.py index 981bb1f6b090..aca57e0f9dcb 100644 --- a/nemo/package_info.py +++ b/nemo/package_info.py @@ -16,7 +16,7 @@ MAJOR = 1 MINOR = 21 PATCH = 0 -PRE_RELEASE = 'rc0' +PRE_RELEASE = '' # Use the following formatting: (major, minor, patch, pre-release) VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) diff --git a/tutorials/00_NeMo_Primer.ipynb b/tutorials/00_NeMo_Primer.ipynb index 50aa60260b35..2e9d8c75a442 100644 --- a/tutorials/00_NeMo_Primer.ipynb +++ b/tutorials/00_NeMo_Primer.ipynb @@ -42,7 +42,7 @@ "!pip install text-unidecode\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Install TorchAudio\n", diff --git a/tutorials/01_NeMo_Models.ipynb b/tutorials/01_NeMo_Models.ipynb index 4255a6656b8a..bb054e3ea724 100644 --- a/tutorials/01_NeMo_Models.ipynb +++ b/tutorials/01_NeMo_Models.ipynb @@ -37,7 +37,7 @@ "!pip install text-unidecode\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Install TorchAudio\n", diff --git a/tutorials/02_NeMo_Adapters.ipynb b/tutorials/02_NeMo_Adapters.ipynb index 51a91a3c7053..273f7a7c551b 100644 --- a/tutorials/02_NeMo_Adapters.ipynb +++ b/tutorials/02_NeMo_Adapters.ipynb @@ -25,7 +25,7 @@ "!pip install text-unidecode\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Grab the config we'll use in this example\n", diff --git a/tutorials/AudioTranslationSample.ipynb b/tutorials/AudioTranslationSample.ipynb index 0c34baacc953..93f539aa7082 100644 --- a/tutorials/AudioTranslationSample.ipynb +++ b/tutorials/AudioTranslationSample.ipynb @@ -38,7 +38,7 @@ }, "outputs": [], "source": [ - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n" ] }, diff --git a/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb b/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb index ae4f43867c8d..fd828ce56ba1 100644 --- a/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb +++ b/tutorials/Publish_NeMo_Model_On_Hugging_Face_Hub.ipynb @@ -41,7 +41,7 @@ "!pip install text-unidecode\n", "\n", "### Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" ] }, diff --git a/tutorials/VoiceSwapSample.ipynb b/tutorials/VoiceSwapSample.ipynb index addf19f3b236..638f48e31a52 100644 --- a/tutorials/VoiceSwapSample.ipynb +++ b/tutorials/VoiceSwapSample.ipynb @@ -39,7 +39,7 @@ }, "outputs": [], "source": [ - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n" ] }, diff --git a/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb b/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb index b9c0db866f9c..fe2c4e4413c2 100644 --- a/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb +++ b/tutorials/asr/ASR_CTC_Language_Finetuning.ipynb @@ -40,7 +40,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 
'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "\"\"\"\n", diff --git a/tutorials/asr/ASR_Confidence_Estimation.ipynb b/tutorials/asr/ASR_Confidence_Estimation.ipynb index 2a1ad024a889..6881254583e3 100644 --- a/tutorials/asr/ASR_Confidence_Estimation.ipynb +++ b/tutorials/asr/ASR_Confidence_Estimation.ipynb @@ -9,7 +9,7 @@ }, "outputs": [], "source": [ - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "\n", "\"\"\"\n", "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", diff --git a/tutorials/asr/ASR_TTS_Tutorial.ipynb b/tutorials/asr/ASR_TTS_Tutorial.ipynb index 007713ee3cc2..fd3ca2445a7b 100644 --- a/tutorials/asr/ASR_TTS_Tutorial.ipynb +++ b/tutorials/asr/ASR_TTS_Tutorial.ipynb @@ -129,7 +129,7 @@ "metadata": {}, "outputs": [], "source": [ - "BRANCH = 'main'" + "BRANCH = 'r1.21.0'" ] }, { diff --git a/tutorials/asr/ASR_for_telephony_speech.ipynb b/tutorials/asr/ASR_for_telephony_speech.ipynb index 6133fdc9a8b9..69b3225724e1 100644 --- a/tutorials/asr/ASR_for_telephony_speech.ipynb +++ b/tutorials/asr/ASR_for_telephony_speech.ipynb @@ -28,7 +28,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Grab the config we'll use in this example\n", diff --git a/tutorials/asr/ASR_with_NeMo.ipynb b/tutorials/asr/ASR_with_NeMo.ipynb index 74cd0f739e84..ca47266c2d6a 100644 --- a/tutorials/asr/ASR_with_NeMo.ipynb +++ b/tutorials/asr/ASR_with_NeMo.ipynb @@ -54,7 +54,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "\"\"\"\n", @@ -588,7 +588,7 @@ "\n", "if not os.path.exists(config_path):\n", " # Grab the config we'll use in this example\n", - " BRANCH = 'main'\n", + " BRANCH = 'r1.21.0'\n", " !mkdir configs\n", " !wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/config.yaml\n", "\n", diff --git a/tutorials/asr/ASR_with_Subword_Tokenization.ipynb b/tutorials/asr/ASR_with_Subword_Tokenization.ipynb index cdb36251fb70..b8f72a47292b 100644 --- a/tutorials/asr/ASR_with_Subword_Tokenization.ipynb +++ b/tutorials/asr/ASR_with_Subword_Tokenization.ipynb @@ -41,7 +41,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Grab the config we'll use in this example\n", diff --git a/tutorials/asr/ASR_with_Transducers.ipynb b/tutorials/asr/ASR_with_Transducers.ipynb index e1eb494f777e..30f42064683e 100644 --- a/tutorials/asr/ASR_with_Transducers.ipynb +++ b/tutorials/asr/ASR_with_Transducers.ipynb @@ -29,7 +29,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Grab the config we'll use in this example\n", diff --git a/tutorials/asr/Buffered_Transducer_Inference.ipynb b/tutorials/asr/Buffered_Transducer_Inference.ipynb index c23398dca46a..b38f3818c370 100644 --- a/tutorials/asr/Buffered_Transducer_Inference.ipynb +++ b/tutorials/asr/Buffered_Transducer_Inference.ipynb @@ -28,7 
+28,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "# Update numba and restart (this is required to update internal numba version of Colab)\n", diff --git a/tutorials/asr/Buffered_Transducer_Inference_with_LCS_Merge.ipynb b/tutorials/asr/Buffered_Transducer_Inference_with_LCS_Merge.ipynb index 2f179eaa9a5a..ca49c0f33238 100644 --- a/tutorials/asr/Buffered_Transducer_Inference_with_LCS_Merge.ipynb +++ b/tutorials/asr/Buffered_Transducer_Inference_with_LCS_Merge.ipynb @@ -46,7 +46,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "# Update numba and restart (this is required to update internal numba version of Colab)\n", diff --git a/tutorials/asr/Intro_to_Transducers.ipynb b/tutorials/asr/Intro_to_Transducers.ipynb index d3928bed987f..ba67906fcb1e 100644 --- a/tutorials/asr/Intro_to_Transducers.ipynb +++ b/tutorials/asr/Intro_to_Transducers.ipynb @@ -44,7 +44,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" ], "execution_count": null, diff --git a/tutorials/asr/Multilang_ASR.ipynb b/tutorials/asr/Multilang_ASR.ipynb index a1edeea815d0..3d279c283bb7 100644 --- a/tutorials/asr/Multilang_ASR.ipynb +++ b/tutorials/asr/Multilang_ASR.ipynb @@ -104,7 +104,7 @@ "\n", "## Install NeMo\n", "## We are using the main branch but you might want to adjust that too\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "\"\"\"\n", diff --git a/tutorials/asr/Offline_ASR.ipynb b/tutorials/asr/Offline_ASR.ipynb index 8544d230878c..9e34fe442341 100644 --- a/tutorials/asr/Offline_ASR.ipynb +++ b/tutorials/asr/Offline_ASR.ipynb @@ -52,7 +52,7 @@ "id": "I9eIxAyKHREB" }, "source": [ - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "try:\n", " # Import NeMo Speech Recognition collection\n", " import nemo.collections.asr as nemo_asr\n", diff --git a/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb b/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb index 8a8335ac1542..74a1ae35c0ea 100644 --- a/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb +++ b/tutorials/asr/Offline_ASR_with_VAD_for_CTC_models.ipynb @@ -25,7 +25,7 @@ "!pip install wget\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "\"\"\"\n", diff --git a/tutorials/asr/Online_ASR_Microphone_Demo.ipynb b/tutorials/asr/Online_ASR_Microphone_Demo.ipynb index 31d2c0dec943..cc7f94a23504 100644 --- a/tutorials/asr/Online_ASR_Microphone_Demo.ipynb +++ b/tutorials/asr/Online_ASR_Microphone_Demo.ipynb @@ -27,7 +27,7 @@ "!pip install pyaudio\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "## Grab the config we'll use in this example\n", diff --git a/tutorials/asr/Online_Noise_Augmentation.ipynb b/tutorials/asr/Online_Noise_Augmentation.ipynb index 8883cce55a80..bbb85686a34d 100644 --- 
a/tutorials/asr/Online_Noise_Augmentation.ipynb +++ b/tutorials/asr/Online_Noise_Augmentation.ipynb @@ -32,7 +32,7 @@ "!pip install text-unidecode\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "## Install TorchAudio\n", diff --git a/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb b/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb index 7a8dacd82b6a..2b8886c41a1f 100644 --- a/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb +++ b/tutorials/asr/Online_Offline_Microphone_VAD_Demo.ipynb @@ -27,7 +27,7 @@ "!pip install pyaudio\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "## Install TorchAudio\n", diff --git a/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb b/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb index b1d70d8446c2..7430fe3f9ae2 100644 --- a/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb +++ b/tutorials/asr/Online_Offline_Speech_Commands_Demo.ipynb @@ -29,7 +29,7 @@ "!pip install pyaudio\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "## Install TorchAudio\n", diff --git a/tutorials/asr/Self_Supervised_Pre_Training.ipynb b/tutorials/asr/Self_Supervised_Pre_Training.ipynb index 04998f68f23e..bcc37d68ffd8 100644 --- a/tutorials/asr/Self_Supervised_Pre_Training.ipynb +++ b/tutorials/asr/Self_Supervised_Pre_Training.ipynb @@ -28,7 +28,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "\"\"\"\n", diff --git a/tutorials/asr/Speech_Commands.ipynb b/tutorials/asr/Speech_Commands.ipynb index 208752347d64..720ce997ed0b 100644 --- a/tutorials/asr/Speech_Commands.ipynb +++ b/tutorials/asr/Speech_Commands.ipynb @@ -61,7 +61,7 @@ "!pip install text-unidecode\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "## Install TorchAudio\n", diff --git a/tutorials/asr/Streaming_ASR.ipynb b/tutorials/asr/Streaming_ASR.ipynb index a4701dc025d8..901e6d412c99 100644 --- a/tutorials/asr/Streaming_ASR.ipynb +++ b/tutorials/asr/Streaming_ASR.ipynb @@ -28,7 +28,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Grab the config we'll use in this example\n", diff --git a/tutorials/asr/Voice_Activity_Detection.ipynb b/tutorials/asr/Voice_Activity_Detection.ipynb index b1bdd434511b..29718af8ef2b 100644 --- a/tutorials/asr/Voice_Activity_Detection.ipynb +++ b/tutorials/asr/Voice_Activity_Detection.ipynb @@ -30,7 +30,7 @@ "!pip install text-unidecode\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "## Install TorchAudio\n", diff --git a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb index 
c9c547a8383e..8cbcf2d7cc8e 100644 --- a/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb +++ b/tutorials/asr/asr_adapters/ASR_with_Adapters.ipynb @@ -50,7 +50,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "## Grab the config we'll use in this example\n", diff --git a/tutorials/audio_tasks/speech_enhancement/Speech_Enhancement_with_NeMo.ipynb b/tutorials/audio_tasks/speech_enhancement/Speech_Enhancement_with_NeMo.ipynb index 41a49688d35e..6ef41fc77473 100644 --- a/tutorials/audio_tasks/speech_enhancement/Speech_Enhancement_with_NeMo.ipynb +++ b/tutorials/audio_tasks/speech_enhancement/Speech_Enhancement_with_NeMo.ipynb @@ -45,7 +45,7 @@ "5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect\n", "\"\"\"\n", "\n", - "GIT_USER, GIT_BRANCH = 'NVIDIA', 'main'\n", + "GIT_USER, GIT_BRANCH = 'NVIDIA', 'r1.21.0'\n", "\n", "if 'google.colab' in str(get_ipython()):\n", "\n", @@ -104,7 +104,7 @@ "\n", "# Used to download data processing scripts\n", "USER = 'anteju' # TODO: change to 'NVIDIA'\n", - "BRANCH = 'dev/se-tutorial' # TODO: change to 'main'\n", + "BRANCH = 'dev/se-tutorial' # TODO: change to 'r1.21.0'\n", "\n", "\n", "# Utility functions for displaying signals and metrics\n", diff --git a/tutorials/cloud/aws/ASR_Finetuning_at_Scale_with_AWS_SageMaker.ipynb b/tutorials/cloud/aws/ASR_Finetuning_at_Scale_with_AWS_SageMaker.ipynb index c4406a4f04ee..25a8ad1a6e26 100644 --- a/tutorials/cloud/aws/ASR_Finetuning_at_Scale_with_AWS_SageMaker.ipynb +++ b/tutorials/cloud/aws/ASR_Finetuning_at_Scale_with_AWS_SageMaker.ipynb @@ -70,7 +70,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "\"\"\"\n", diff --git a/tutorials/cloud/aws/SageMaker_ASR_Training.ipynb b/tutorials/cloud/aws/SageMaker_ASR_Training.ipynb index 8cf540b27114..fd7502f99e1f 100644 --- a/tutorials/cloud/aws/SageMaker_ASR_Training.ipynb +++ b/tutorials/cloud/aws/SageMaker_ASR_Training.ipynb @@ -55,7 +55,7 @@ "!pip install matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "\"\"\"\n", diff --git a/tutorials/nlp/01_Pretrained_Language_Models_for_Downstream_Tasks.ipynb b/tutorials/nlp/01_Pretrained_Language_Models_for_Downstream_Tasks.ipynb index faa93de12514..ac99573cd511 100644 --- a/tutorials/nlp/01_Pretrained_Language_Models_for_Downstream_Tasks.ipynb +++ b/tutorials/nlp/01_Pretrained_Language_Models_for_Downstream_Tasks.ipynb @@ -26,7 +26,7 @@ "# If you're using Google Colab and not running locally, run this cell\n", "\n", "# install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" ] }, diff --git a/tutorials/nlp/02_NLP_Tokenizers.ipynb b/tutorials/nlp/02_NLP_Tokenizers.ipynb index c63d2a8b1689..b13b1c4526d0 100644 --- a/tutorials/nlp/02_NLP_Tokenizers.ipynb +++ b/tutorials/nlp/02_NLP_Tokenizers.ipynb @@ -10,7 +10,7 @@ }, "outputs": [], "source": [ - "BRANCH = 'main'" + "BRANCH = 'r1.21.0'" ] }, { @@ -35,7 +35,7 @@ "# If you're using Google Colab and not running locally, run this cell\n", "\n", "# install 
NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" ] }, diff --git a/tutorials/nlp/Data_Preprocessing_and_Cleaning_for_NMT.ipynb b/tutorials/nlp/Data_Preprocessing_and_Cleaning_for_NMT.ipynb index 323bfa1c49b8..050028bdb269 100644 --- a/tutorials/nlp/Data_Preprocessing_and_Cleaning_for_NMT.ipynb +++ b/tutorials/nlp/Data_Preprocessing_and_Cleaning_for_NMT.ipynb @@ -300,7 +300,7 @@ "\n", "## Install NeMo\n", "\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", "\n", "!pip uninstall -y sacrebleu\n", diff --git a/tutorials/nlp/Dialogue.ipynb b/tutorials/nlp/Dialogue.ipynb index ddd3bdd4f929..26a9346f0e78 100644 --- a/tutorials/nlp/Dialogue.ipynb +++ b/tutorials/nlp/Dialogue.ipynb @@ -27,7 +27,7 @@ "outputs": [], "source": [ "import os \n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!apt-get update && apt-get install -y libsndfile1 ffmpeg\n", "!git clone https://github.com/NVIDIA/NeMo --branch $BRANCH\n", "os.chdir('NeMo')\n", diff --git a/tutorials/nlp/Entity_Linking_Medical.ipynb b/tutorials/nlp/Entity_Linking_Medical.ipynb index ff8eda123b7f..825d7879d21d 100644 --- a/tutorials/nlp/Entity_Linking_Medical.ipynb +++ b/tutorials/nlp/Entity_Linking_Medical.ipynb @@ -17,7 +17,7 @@ "\"\"\"\n", "\n", "## Install NeMo if using google collab or if its not installed locally\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" ] }, diff --git a/tutorials/nlp/GLUE_Benchmark.ipynb b/tutorials/nlp/GLUE_Benchmark.ipynb index d8fe75940b09..ce65bb927f7c 100644 --- a/tutorials/nlp/GLUE_Benchmark.ipynb +++ b/tutorials/nlp/GLUE_Benchmark.ipynb @@ -44,7 +44,7 @@ "# If you're using Google Colab and not running locally, run this cell\n", "\n", "# install NeMo\n", - "BRANCH = 'main'\n!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]\n" + "BRANCH = 'r1.21.0'\n!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]\n" ], "execution_count": null, "outputs": [] diff --git a/tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb b/tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb index 6204bf2516bb..05ee26f9e8c8 100644 --- a/tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb +++ b/tutorials/nlp/ITN_with_Thutmose_Tagger.ipynb @@ -21,7 +21,7 @@ "import os\n", "\n", "# install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "\n", "GITHUB_ACCOUNT = 'NVIDIA' # change this if using a fork\n", "\n", diff --git a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb index 104d69df18e2..b8366a68c8d1 100644 --- a/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb +++ b/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb @@ -22,7 +22,7 @@ "# If you're using Google Colab and not running locally, run this cell\n", "\n", "# install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" ] }, diff --git a/tutorials/nlp/MegatronBert_export.ipynb b/tutorials/nlp/MegatronBert_export.ipynb index f925d2bc59b0..1d402d3b35f5 100644 --- a/tutorials/nlp/MegatronBert_export.ipynb +++ b/tutorials/nlp/MegatronBert_export.ipynb @@ -7,7 +7,7 @@ "metadata": {}, "outputs": [], "source": [ - "BRANCH='main'" + 
"BRANCH='r1.21.0'" ] }, { diff --git a/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb b/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb index bfd3c7094198..0aab88d7ec99 100644 --- a/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb +++ b/tutorials/nlp/Megatron_Synthetic_Tabular_Data_Generation.ipynb @@ -65,7 +65,7 @@ "metadata": {}, "outputs": [], "source": [ - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "DATA_PATH='.'\n", "TRANSACTIONS=DATA_PATH+'/card_transaction.v1.csv'\n", "#CHECKPOINTS='/chk_points'\n", diff --git a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb index 004014ebdeeb..d52b8e9b301b 100644 --- a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb +++ b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb @@ -7,7 +7,7 @@ "metadata": {}, "outputs": [], "source": [ - "BRANCH='main'" + "BRANCH='r1.21.0'" ] }, { diff --git a/tutorials/nlp/Punctuation_and_Capitalization.ipynb b/tutorials/nlp/Punctuation_and_Capitalization.ipynb index 1519c234372b..3db42c6a6e1a 100644 --- a/tutorials/nlp/Punctuation_and_Capitalization.ipynb +++ b/tutorials/nlp/Punctuation_and_Capitalization.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "outputs": [], "source": [ - "BRANCH = 'main'" + "BRANCH = 'r1.21.0'" ] }, { diff --git a/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb b/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb index 5580bc4cf946..ceaedc1d57b7 100644 --- a/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb +++ b/tutorials/nlp/Punctuation_and_Capitalization_Lexical_Audio.ipynb @@ -10,7 +10,7 @@ }, "outputs": [], "source": [ - "BRANCH = 'main'" + "BRANCH = 'r1.21.0'" ] }, { diff --git a/tutorials/nlp/Question_Answering.ipynb b/tutorials/nlp/Question_Answering.ipynb index 7217b0fb6756..5d353dd76074 100644 --- a/tutorials/nlp/Question_Answering.ipynb +++ b/tutorials/nlp/Question_Answering.ipynb @@ -74,7 +74,7 @@ }, "outputs": [], "source": [ - "BRANCH = 'main'" + "BRANCH = 'r1.21.0'" ] }, { diff --git a/tutorials/nlp/Relation_Extraction-BioMegatron.ipynb b/tutorials/nlp/Relation_Extraction-BioMegatron.ipynb index b7c25cb416ef..9e552b9fe301 100644 --- a/tutorials/nlp/Relation_Extraction-BioMegatron.ipynb +++ b/tutorials/nlp/Relation_Extraction-BioMegatron.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "outputs": [], "source": [ - "BRANCH = 'main'" + "BRANCH = 'r1.21.0'" ] }, { diff --git a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb b/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb index 8506c1368b23..4dfd7579a94c 100644 --- a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb +++ b/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb @@ -85,7 +85,7 @@ "# Install NeMo library. If you are running locally (rather than on Google Colab), comment out the below lines\n", "# and instead follow the instructions at https://github.com/NVIDIA/NeMo#Installation\n", "GITHUB_ACCOUNT = \"NVIDIA\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/{GITHUB_ACCOUNT}/NeMo.git@{BRANCH}#egg=nemo_toolkit[all]\n", "\n", "# Download local version of NeMo scripts. 
If you are running locally and want to use your own local NeMo code,\n", diff --git a/tutorials/nlp/Text_Classification_Sentiment_Analysis.ipynb b/tutorials/nlp/Text_Classification_Sentiment_Analysis.ipynb index 5b5b74e7bf11..064ff7f0c046 100644 --- a/tutorials/nlp/Text_Classification_Sentiment_Analysis.ipynb +++ b/tutorials/nlp/Text_Classification_Sentiment_Analysis.ipynb @@ -20,7 +20,7 @@ "# If you're using Google Colab and not running locally, run this cell\n", "\n", "# install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]\n", "\n" ] diff --git a/tutorials/nlp/Token_Classification-BioMegatron.ipynb b/tutorials/nlp/Token_Classification-BioMegatron.ipynb index 517f2e557743..81de2380ff57 100644 --- a/tutorials/nlp/Token_Classification-BioMegatron.ipynb +++ b/tutorials/nlp/Token_Classification-BioMegatron.ipynb @@ -7,7 +7,7 @@ "metadata": {}, "outputs": [], "source": [ - "BRANCH='main'" + "BRANCH='r1.21.0'" ] }, { diff --git a/tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb b/tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb index c3f7e28b6b1f..3d16650d151b 100644 --- a/tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb +++ b/tutorials/nlp/Token_Classification_Named_Entity_Recognition.ipynb @@ -30,7 +30,7 @@ "metadata": {}, "outputs": [], "source": [ - "BRANCH = 'main'" + "BRANCH = 'r1.21.0'" ] }, { @@ -53,7 +53,7 @@ "# If you're using Google Colab and not running locally, run this cell\n", "\n", "# install NeMo\n", - "BRANCH = 'main'\n!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]\n" + "BRANCH = 'r1.21.0'\n!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]\n" ], "execution_count": null, "outputs": [] diff --git a/tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb b/tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb index 69df7b27b02d..611ab77b58c8 100644 --- a/tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb +++ b/tutorials/nlp/Zero_Shot_Intent_Recognition.ipynb @@ -22,7 +22,7 @@ "# If you're using Google Colab and not running locally, run this cell\n", "\n", "# install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" ] }, diff --git a/tutorials/nlp/lora.ipynb b/tutorials/nlp/lora.ipynb index fc79f74a6e2a..3e3090afe305 100644 --- a/tutorials/nlp/lora.ipynb +++ b/tutorials/nlp/lora.ipynb @@ -7,7 +7,7 @@ "metadata": {}, "outputs": [], "source": [ - "BRANCH='main'\n", + "BRANCH='r1.21.0'\n", "import os\n", "import wget" ] diff --git a/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb b/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb index ea943b35e0d0..c4a184c8b414 100644 --- a/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb +++ b/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb @@ -30,7 +30,7 @@ "!pip install text-unidecode\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "## Install TorchAudio\n", diff --git a/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb b/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb index 1fd0f1b140d5..bcbd1db3f0c2 100644 --- a/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb +++ 
b/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb @@ -23,7 +23,7 @@ "!pip install text-unidecode\n", "\n", "# ## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "## Install TorchAudio\n", diff --git a/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb b/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb index 3c56df2bbba0..d585e4add6dd 100644 --- a/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb +++ b/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb @@ -18,7 +18,7 @@ "\"\"\"\n", "\n", "NEMO_DIR_PATH = \"NeMo\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "\n", "! git clone https://github.com/NVIDIA/NeMo\n", "%cd NeMo\n", @@ -232,7 +232,7 @@ "source": [ "import os\n", "NEMO_DIR_PATH = \"NeMo\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "\n", "# download scripts if not already there \n", "if not os.path.exists('NeMo/scripts'):\n", diff --git a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb index dce8c46df1b0..5663698eb70b 100644 --- a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb +++ b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb @@ -27,7 +27,7 @@ "!pip install text-unidecode\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]\n", "\n", "# Install TorchAudio\n", diff --git a/tutorials/tools/CTC_Segmentation_Tutorial.ipynb b/tutorials/tools/CTC_Segmentation_Tutorial.ipynb index 98f0cce4e9ec..6ae0b848f50c 100644 --- a/tutorials/tools/CTC_Segmentation_Tutorial.ipynb +++ b/tutorials/tools/CTC_Segmentation_Tutorial.ipynb @@ -35,7 +35,7 @@ "id": "d4KCUoxSpdoZ" }, "source": [ - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "\n", "\"\"\"\n", "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", diff --git a/tutorials/tools/Multispeaker_Simulator.ipynb b/tutorials/tools/Multispeaker_Simulator.ipynb index c2a9caf1ea72..c82a2c37b209 100644 --- a/tutorials/tools/Multispeaker_Simulator.ipynb +++ b/tutorials/tools/Multispeaker_Simulator.ipynb @@ -18,7 +18,7 @@ "\"\"\"\n", "\n", "NEMO_DIR_PATH = \"NeMo\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "\n", "! git clone https://github.com/NVIDIA/NeMo\n", "%cd NeMo\n", diff --git a/tutorials/tools/NeMo_Forced_Aligner_Tutorial.ipynb b/tutorials/tools/NeMo_Forced_Aligner_Tutorial.ipynb index a6ab57854bad..6f533d7c3923 100644 --- a/tutorials/tools/NeMo_Forced_Aligner_Tutorial.ipynb +++ b/tutorials/tools/NeMo_Forced_Aligner_Tutorial.ipynb @@ -13,7 +13,7 @@ }, "outputs": [], "source": [ - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "\n", "\"\"\"\n", "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", diff --git a/tutorials/tts/Aligner_Inference_Examples.ipynb b/tutorials/tts/Aligner_Inference_Examples.ipynb index 611e1e3b6e66..e264ee7f5096 100644 --- a/tutorials/tts/Aligner_Inference_Examples.ipynb +++ b/tutorials/tts/Aligner_Inference_Examples.ipynb @@ -39,7 +39,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. 
Run this cell to set up dependencies.\n", "\"\"\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "# # If you're using Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget text-unidecode\n", diff --git a/tutorials/tts/Evaluation_MelCepstralDistortion.ipynb b/tutorials/tts/Evaluation_MelCepstralDistortion.ipynb index 699f1b131408..907cb4766de5 100644 --- a/tutorials/tts/Evaluation_MelCepstralDistortion.ipynb +++ b/tutorials/tts/Evaluation_MelCepstralDistortion.ipynb @@ -57,7 +57,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies.\n", "\"\"\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "# If you're using Google Colab and not running locally, uncomment and run this cell.\n", "# !pip install librosa numpy matplotlib" ] diff --git a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb index 263d22b60599..05c002b170b1 100644 --- a/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Adapter_Finetuning.ipynb @@ -59,7 +59,7 @@ "4. Run this cell to set up dependencies# .\n", "\"\"\"\n", "# # If you're using Colab and not running locally, uncomment and run this cell.\n", - "# BRANCH = 'main'\n", + "# BRANCH = 'r1.21.0'\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget unidecode pynini==2.1.4 scipy==1.7.3\n", "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", diff --git a/tutorials/tts/FastPitch_ChineseTTS_Training.ipynb b/tutorials/tts/FastPitch_ChineseTTS_Training.ipynb index 2a12b417a271..f3d1a3a57a31 100644 --- a/tutorials/tts/FastPitch_ChineseTTS_Training.ipynb +++ b/tutorials/tts/FastPitch_ChineseTTS_Training.ipynb @@ -61,7 +61,7 @@ "# !pip install wget text-unidecode matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "# !python -m pip install \"git+https://github.com/NVIDIA/NeMo.git@${BRANCH}#egg=nemo_toolkit[all]\"\n", "\n", "## Install pynini\n", diff --git a/tutorials/tts/FastPitch_Data_Preparation.ipynb b/tutorials/tts/FastPitch_Data_Preparation.ipynb index 46778759d5cb..150b0cbfa60e 100644 --- a/tutorials/tts/FastPitch_Data_Preparation.ipynb +++ b/tutorials/tts/FastPitch_Data_Preparation.ipynb @@ -74,7 +74,7 @@ { "cell_type": "code", "source": [ - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "NEMO_ROOT_DIR = '/content/nemo'" ], "metadata": { diff --git a/tutorials/tts/FastPitch_Finetuning.ipynb b/tutorials/tts/FastPitch_Finetuning.ipynb index e0c34b3c0de5..f37293d8b39f 100755 --- a/tutorials/tts/FastPitch_Finetuning.ipynb +++ b/tutorials/tts/FastPitch_Finetuning.ipynb @@ -57,7 +57,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. 
Run this cell to set up dependencies.\n", "\"\"\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "# # If you're using Google Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget text-unidecode \n", diff --git a/tutorials/tts/FastPitch_GermanTTS_Training.ipynb b/tutorials/tts/FastPitch_GermanTTS_Training.ipynb index e7cb0e896650..891b767d56f9 100644 --- a/tutorials/tts/FastPitch_GermanTTS_Training.ipynb +++ b/tutorials/tts/FastPitch_GermanTTS_Training.ipynb @@ -61,7 +61,7 @@ "# !pip install wget text-unidecode matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "# !python -m pip install \"git+https://github.com/NVIDIA/NeMo.git@${BRANCH}#egg=nemo_toolkit[all]\"\n", "\n", "## Install pynini\n", diff --git a/tutorials/tts/FastPitch_MixerTTS_Training.ipynb b/tutorials/tts/FastPitch_MixerTTS_Training.ipynb index 558c0d95d30b..87feb9c2f015 100644 --- a/tutorials/tts/FastPitch_MixerTTS_Training.ipynb +++ b/tutorials/tts/FastPitch_MixerTTS_Training.ipynb @@ -50,7 +50,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies# .\n", "\"\"\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "# # If you're using Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget text-unidecode scipy==1.7.3\n", diff --git a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb index cb5cb651d76e..c7e31d72828b 100644 --- a/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb +++ b/tutorials/tts/FastPitch_MultiSpeaker_Pretraining.ipynb @@ -56,7 +56,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies# .\n", "\"\"\"\n", - "# BRANCH = 'main'\n", + "# BRANCH = 'r1.21.0'\n", "# # If you're using Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget unidecode pynini==2.1.4 scipy==1.7.3\n", diff --git a/tutorials/tts/FastPitch_Speaker_Interpolation.ipynb b/tutorials/tts/FastPitch_Speaker_Interpolation.ipynb index eda5bba0aa1e..e45d03a5275a 100644 --- a/tutorials/tts/FastPitch_Speaker_Interpolation.ipynb +++ b/tutorials/tts/FastPitch_Speaker_Interpolation.ipynb @@ -94,7 +94,7 @@ "source": [ "# Install NeMo library. If you are running locally (rather than on Google Colab), comment out the below lines\n", "# and instead follow the instructions at https://github.com/NVIDIA/NeMo#Installation\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" ] }, diff --git a/tutorials/tts/Inference_DurationPitchControl.ipynb b/tutorials/tts/Inference_DurationPitchControl.ipynb index 73c12bc79900..e73594f2fc94 100644 --- a/tutorials/tts/Inference_DurationPitchControl.ipynb +++ b/tutorials/tts/Inference_DurationPitchControl.ipynb @@ -46,7 +46,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. 
Run this cell to set up dependencies.\n", "\"\"\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "# # If you're using Google Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget text-unidecode\n", diff --git a/tutorials/tts/Inference_ModelSelect.ipynb b/tutorials/tts/Inference_ModelSelect.ipynb index 195b773fb5ee..72615251a43d 100644 --- a/tutorials/tts/Inference_ModelSelect.ipynb +++ b/tutorials/tts/Inference_ModelSelect.ipynb @@ -46,7 +46,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. Run this cell to set up dependencies.\n", "\"\"\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "# # If you're using Google Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget text-unidecode\n", diff --git a/tutorials/tts/NeMo_TTS_Primer.ipynb b/tutorials/tts/NeMo_TTS_Primer.ipynb index 99306744dd05..1f1bd0c62a4a 100644 --- a/tutorials/tts/NeMo_TTS_Primer.ipynb +++ b/tutorials/tts/NeMo_TTS_Primer.ipynb @@ -25,7 +25,7 @@ "source": [ "# Install NeMo library. If you are running locally (rather than on Google Colab), comment out the below lines\n", "# and instead follow the instructions at https://github.com/NVIDIA/NeMo#Installation\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" ] }, diff --git a/tutorials/tts/Pronunciation_customization.ipynb b/tutorials/tts/Pronunciation_customization.ipynb index 6fe269e76904..bfdab646cef7 100644 --- a/tutorials/tts/Pronunciation_customization.ipynb +++ b/tutorials/tts/Pronunciation_customization.ipynb @@ -26,7 +26,7 @@ "4. Run this cell to set up dependencies.\n", "\"\"\"\n", "\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "# # If you're using Google Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget text-unidecode \n", diff --git a/tutorials/tts/Tacotron2_Training.ipynb b/tutorials/tts/Tacotron2_Training.ipynb index e2ae5082e608..62be415aa439 100644 --- a/tutorials/tts/Tacotron2_Training.ipynb +++ b/tutorials/tts/Tacotron2_Training.ipynb @@ -54,7 +54,7 @@ "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", "4. 
Run this cell to set up dependencies# .\n", "\"\"\"\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "# # If you're using Colab and not running locally, uncomment and run this cell.\n", "# !apt-get install sox libsndfile1 ffmpeg\n", "# !pip install wget text-unidecode\n", diff --git a/tutorials/tts/Vits_Training.ipynb b/tutorials/tts/Vits_Training.ipynb index 84dad62bba6f..10196cc864d6 100644 --- a/tutorials/tts/Vits_Training.ipynb +++ b/tutorials/tts/Vits_Training.ipynb @@ -63,7 +63,7 @@ "# !pip install wget text-unidecode matplotlib>=3.3.2\n", "\n", "## Install NeMo\n", - "BRANCH = 'main'\n", + "BRANCH = 'r1.21.0'\n", "# !python -m pip install \"git+https://github.com/NVIDIA/NeMo.git@${BRANCH}#egg=nemo_toolkit[all]\"\n", "\n", "## Install pynini\n", From b850d14d4639d63f622a1823ae7f6b8f8f331613 Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Tue, 12 Sep 2023 18:24:31 -0700 Subject: [PATCH 02/10] Fix resume from checkpoint in exp_manager (#7424) Signed-off-by: Abhishree Co-authored-by: Eric Harper --- nemo/utils/exp_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 3deb814ae2df..1629aa5cbb50 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -578,8 +578,8 @@ def check_resume( end_dist_checkpoints = [d for d in dist_checkpoints if d.match("*end")] last_dist_checkpoints = [d for d in dist_checkpoints if d.match("*last")] - end_checkpoints = end_dist_checkpoints if end_dist_checkpoints else list(checkpoint_dir.glob("*end.ckpt")) - last_checkpoints = last_dist_checkpoints if last_dist_checkpoints else list(checkpoint_dir.glob("*last.ckpt")) + end_checkpoints = end_dist_checkpoints if end_dist_checkpoints else list(checkpoint_dir.rglob("*end.ckpt")) + last_checkpoints = last_dist_checkpoints if last_dist_checkpoints else list(checkpoint_dir.rglob("*last.ckpt")) if not checkpoint_dir.exists() or (not len(end_checkpoints) > 0 and not len(last_checkpoints) > 0): if resume_ignore_no_checkpoint: From e9ca147935b5dbf72e81514b3096fb3bc9d3fe1f Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Tue, 19 Sep 2023 13:57:19 -0700 Subject: [PATCH 03/10] Add strategy as ddp_find_unused_parameters_true for glue_benchmark.py (#7454) Signed-off-by: Abhishree --- examples/nlp/glue_benchmark/glue_benchmark.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/nlp/glue_benchmark/glue_benchmark.py b/examples/nlp/glue_benchmark/glue_benchmark.py index 87486dbc47b0..3cb5f8e4af3e 100644 --- a/examples/nlp/glue_benchmark/glue_benchmark.py +++ b/examples/nlp/glue_benchmark/glue_benchmark.py @@ -46,6 +46,10 @@ @hydra_runner(config_name="glue_benchmark_config") def main(cfg: DictConfig) -> None: + # PTL 2.0 has find_unused_parameters as False by default, so its required to set it to True + # when there are unused parameters like here + if cfg.trainer.strategy == 'ddp': + cfg.trainer.strategy = "ddp_find_unused_parameters_true" logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') trainer = pl.Trainer(**cfg.trainer) exp_manager_cfg = cfg.get("exp_manager", None) From f219e2f67d78cce15867ddea8c81bfbeaa4659b4 Mon Sep 17 00:00:00 2001 From: Aleksandr Laptev Date: Wed, 20 Sep 2023 08:55:59 +0700 Subject: [PATCH 04/10] RNN-T confidence and alignment bugfix (#7381) (#7459) * new frame_confidence and alignments lists are now always created after the while loop Signed-off-by: Aleksandr Laptev * 
tests added Signed-off-by: Aleksandr Laptev --------- Signed-off-by: Aleksandr Laptev (cherry picked from commit 6012ca2f4034cadb2f3186899ffc9ef8490472cb) --- .../parts/submodules/rnnt_greedy_decoding.py | 278 ++++++++---------- .../asr/test_asr_rnnt_encdec_model.py | 238 ++++++++++++++- 2 files changed, 363 insertions(+), 153 deletions(-) diff --git a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py index dfa3ac27854b..2a0186f788e4 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py @@ -441,13 +441,6 @@ def _greedy_decode( # If blank token is predicted, exit inner loop, move onto next timestep t if k == self._blank_index: not_blank = False - - if self.preserve_alignments: - # convert Ti-th logits into a torch array - hypothesis.alignments.append([]) # blank buffer for next timestep - - if self.preserve_frame_confidence: - hypothesis.frame_confidence.append([]) # blank buffer for next timestep else: # Append token to label set, update RNN state. hypothesis.y_sequence.append(k) @@ -459,6 +452,13 @@ def _greedy_decode( # Increment token counter. symbols_added += 1 + if self.preserve_alignments: + # convert Ti-th logits into a torch array + hypothesis.alignments.append([]) # blank buffer for next timestep + + if self.preserve_frame_confidence: + hypothesis.frame_confidence.append([]) # blank buffer for next timestep + # Remove trailing empty list of Alignments if self.preserve_alignments: if len(hypothesis.alignments[-1]) == 0: @@ -642,9 +642,6 @@ def _greedy_decode_blank_as_pad( # frame_confidence is a 3-dimensional dangling list representing B x T x U for hyp in hypotheses: hyp.frame_confidence = [[]] - hyp.y_3best = [[]] - hyp.frame_confidence_3best = [[[]]] - hyp.logp = [[]] # Last Label buffer + Last Label without blank buffer # batch level equivalent of the last_label @@ -731,32 +728,6 @@ def _greedy_decode_blank_as_pad( # This is equivalent to if single sample predicted k if all_blanks: not_blank = False - - # If preserving alignments, convert the current Uj alignments into a torch.Tensor - # Then preserve U at current timestep Ti - # Finally, forward the timestep history to Ti+1 for that sample - # All of this should only be done iff the current time index <= sample-level AM length. - # Otherwise ignore and move to next sample / next timestep. - if self.preserve_alignments: - - # convert Ti-th logits into a torch array - for batch_idx in range(batchsize): - - # this checks if current timestep <= sample-level AM length - # If current timestep > sample-level AM length, no alignments will be added - # Therefore the list of Uj alignments is empty here. 
- if len(hypotheses[batch_idx].alignments[-1]) > 0: - hypotheses[batch_idx].alignments.append([]) # blank buffer for next timestep - - # Do the same if preserving per-frame confidence - if self.preserve_frame_confidence: - - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].frame_confidence[-1]) > 0: - hypotheses[batch_idx].frame_confidence.append([]) # blank buffer for next timestep - hypotheses[batch_idx].y_3best.append([]) - hypotheses[batch_idx].frame_confidence_3best.append([]) - hypotheses[batch_idx].logp.append([]) else: # Collect batch indices where blanks occurred now/past blank_indices = (blank_mask == 1).nonzero(as_tuple=False) @@ -791,6 +762,29 @@ def _greedy_decode_blank_as_pad( hypotheses[kidx].score += float(v[kidx]) symbols_added += 1 + # If preserving alignments, convert the current Uj alignments into a torch.Tensor + # Then preserve U at current timestep Ti + # Finally, forward the timestep history to Ti+1 for that sample + # All of this should only be done iff the current time index <= sample-level AM length. + # Otherwise ignore and move to next sample / next timestep. + if self.preserve_alignments: + + # convert Ti-th logits into a torch array + for batch_idx in range(batchsize): + + # this checks if current timestep <= sample-level AM length + # If current timestep > sample-level AM length, no alignments will be added + # Therefore the list of Uj alignments is empty here. + if len(hypotheses[batch_idx].alignments[-1]) > 0: + hypotheses[batch_idx].alignments.append([]) # blank buffer for next timestep + + # Do the same if preserving per-frame confidence + if self.preserve_frame_confidence: + + for batch_idx in range(batchsize): + if len(hypotheses[batch_idx].frame_confidence[-1]) > 0: + hypotheses[batch_idx].frame_confidence.append([]) # blank buffer for next timestep + # Remove trailing empty list of alignments at T_{am-len} x Uj if self.preserve_alignments: for batch_idx in range(batchsize): @@ -802,9 +796,6 @@ def _greedy_decode_blank_as_pad( for batch_idx in range(batchsize): if len(hypotheses[batch_idx].frame_confidence[-1]) == 0: del hypotheses[batch_idx].frame_confidence[-1] - del hypotheses[batch_idx].y_3best[-1] - del hypotheses[batch_idx].frame_confidence_3best[-1] - del hypotheses[batch_idx].logp[-1] # Preserve states for batch_idx in range(batchsize): @@ -946,29 +937,6 @@ def _greedy_decode_masked( # This is equivalent to if single sample predicted k if blank_mask.all(): not_blank = False - - # If preserving alignments, convert the current Uj alignments into a torch.Tensor - # Then preserve U at current timestep Ti - # Finally, forward the timestep history to Ti+1 for that sample - # All of this should only be done iff the current time index <= sample-level AM length. - # Otherwise ignore and move to next sample / next timestep. - if self.preserve_alignments: - - # convert Ti-th logits into a torch array - for batch_idx in range(batchsize): - - # this checks if current timestep <= sample-level AM length - # If current timestep > sample-level AM length, no alignments will be added - # Therefore the list of Uj alignments is empty here. 
- if len(hypotheses[batch_idx].alignments[-1]) > 0: - hypotheses[batch_idx].alignments.append([]) # blank buffer for next timestep - - # Do the same if preserving per-frame confidence - if self.preserve_frame_confidence: - - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].frame_confidence[-1]) > 0: - hypotheses[batch_idx].frame_confidence.append([]) # blank buffer for next timestep else: # Collect batch indices where blanks occurred now/past blank_indices = (blank_mask == 1).nonzero(as_tuple=False) @@ -1004,6 +972,29 @@ def _greedy_decode_masked( symbols_added += 1 + # If preserving alignments, convert the current Uj alignments into a torch.Tensor + # Then preserve U at current timestep Ti + # Finally, forward the timestep history to Ti+1 for that sample + # All of this should only be done iff the current time index <= sample-level AM length. + # Otherwise ignore and move to next sample / next timestep. + if self.preserve_alignments: + + # convert Ti-th logits into a torch array + for batch_idx in range(batchsize): + + # this checks if current timestep <= sample-level AM length + # If current timestep > sample-level AM length, no alignments will be added + # Therefore the list of Uj alignments is empty here. + if len(hypotheses[batch_idx].alignments[-1]) > 0: + hypotheses[batch_idx].alignments.append([]) # blank buffer for next timestep + + # Do the same if preserving per-frame confidence + if self.preserve_frame_confidence: + + for batch_idx in range(batchsize): + if len(hypotheses[batch_idx].frame_confidence[-1]) > 0: + hypotheses[batch_idx].frame_confidence.append([]) # blank buffer for next timestep + # Remove trailing empty list of alignments at T_{am-len} x Uj if self.preserve_alignments: for batch_idx in range(batchsize): @@ -1624,13 +1615,6 @@ def _greedy_decode( # If any type of blank token is predicted, exit inner loop, move onto next timestep t if k >= self._blank_index - len(self.big_blank_durations): not_blank = False - - if self.preserve_alignments: - # convert Ti-th logits into a torch array - hypothesis.alignments.append([]) # blank buffer for next timestep - - if self.preserve_frame_confidence: - hypothesis.frame_confidence.append([]) # blank buffer for next timestep else: # Append token to label set, update RNN state. hypothesis.y_sequence.append(k) @@ -1642,6 +1626,13 @@ def _greedy_decode( # Increment token counter. symbols_added += 1 + if self.preserve_alignments: + # convert Ti-th logits into a torch array + hypothesis.alignments.append([]) # blank buffer for next timestep + + if self.preserve_frame_confidence: + hypothesis.frame_confidence.append([]) # blank buffer for next timestep + # Remove trailing empty list of Alignments if self.preserve_alignments: if len(hypothesis.alignments[-1]) == 0: @@ -1781,9 +1772,6 @@ def _greedy_decode_blank_as_pad( # frame_confidence is a 3-dimensional dangling list representing B x T x U for hyp in hypotheses: hyp.frame_confidence = [[]] - hyp.y_3best = [[]] - hyp.frame_confidence_3best = [[[]]] - hyp.logp = [[]] # Last Label buffer + Last Label without blank buffer # batch level equivalent of the last_label @@ -1897,40 +1885,6 @@ def _greedy_decode_blank_as_pad( # This is equivalent to if single sample predicted k if blank_mask.all(): not_blank = False - - for i in range(len(big_blank_masks) + 1): - # The task here is find the shortest blank duration of all batches. - # so we start from the shortest blank duration and go up, - # and stop once we found the duration whose corresponding mask isn't all True. 
- if i == len(big_blank_masks) or not big_blank_masks[i].all(): - big_blank_duration = self.big_blank_durations[i - 1] if i > 0 else 1 - break - - # If preserving alignments, convert the current Uj alignments into a torch.Tensor - # Then preserve U at current timestep Ti - # Finally, forward the timestep history to Ti+1 for that sample - # All of this should only be done iff the current time index <= sample-level AM length. - # Otherwise ignore and move to next sample / next timestep. - if self.preserve_alignments: - - # convert Ti-th logits into a torch array - for batch_idx in range(batchsize): - - # this checks if current timestep <= sample-level AM length - # If current timestep > sample-level AM length, no alignments will be added - # Therefore the list of Uj alignments is empty here. - if len(hypotheses[batch_idx].alignments[-1]) > 0: - hypotheses[batch_idx].alignments.append([]) # blank buffer for next timestep - - # Do the same if preserving per-frame confidence - if self.preserve_frame_confidence: - - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].frame_confidence[-1]) > 0: - hypotheses[batch_idx].frame_confidence.append([]) # blank buffer for next timestep - hypotheses[batch_idx].y_3best.append([]) - hypotheses[batch_idx].frame_confidence_3best.append([]) - hypotheses[batch_idx].logp.append([]) else: # Collect batch indices where blanks occurred now/past blank_indices = (blank_mask == 1).nonzero(as_tuple=False) @@ -1966,6 +1920,37 @@ def _greedy_decode_blank_as_pad( symbols_added += 1 + for i in range(len(big_blank_masks) + 1): + # The task here is find the shortest blank duration of all batches. + # so we start from the shortest blank duration and go up, + # and stop once we found the duration whose corresponding mask isn't all True. + if i == len(big_blank_masks) or not big_blank_masks[i].all(): + big_blank_duration = self.big_blank_durations[i - 1] if i > 0 else 1 + break + + # If preserving alignments, convert the current Uj alignments into a torch.Tensor + # Then preserve U at current timestep Ti + # Finally, forward the timestep history to Ti+1 for that sample + # All of this should only be done iff the current time index <= sample-level AM length. + # Otherwise ignore and move to next sample / next timestep. + if self.preserve_alignments: + + # convert Ti-th logits into a torch array + for batch_idx in range(batchsize): + + # this checks if current timestep <= sample-level AM length + # If current timestep > sample-level AM length, no alignments will be added + # Therefore the list of Uj alignments is empty here. 
+ if len(hypotheses[batch_idx].alignments[-1]) > 0: + hypotheses[batch_idx].alignments.append([]) # blank buffer for next timestep + + # Do the same if preserving per-frame confidence + if self.preserve_frame_confidence: + + for batch_idx in range(batchsize): + if len(hypotheses[batch_idx].frame_confidence[-1]) > 0: + hypotheses[batch_idx].frame_confidence.append([]) # blank buffer for next timestep + # Remove trailing empty list of alignments at T_{am-len} x Uj if self.preserve_alignments: for batch_idx in range(batchsize): @@ -1977,9 +1962,6 @@ def _greedy_decode_blank_as_pad( for batch_idx in range(batchsize): if len(hypotheses[batch_idx].frame_confidence[-1]) == 0: del hypotheses[batch_idx].frame_confidence[-1] - del hypotheses[batch_idx].y_3best[-1] - del hypotheses[batch_idx].frame_confidence_3best[-1] - del hypotheses[batch_idx].logp[-1] # Preserve states for batch_idx in range(batchsize): @@ -2121,29 +2103,6 @@ def _greedy_decode_masked( # This is equivalent to if single sample predicted k if blank_mask.all(): not_blank = False - - # If preserving alignments, convert the current Uj alignments into a torch.Tensor - # Then preserve U at current timestep Ti - # Finally, forward the timestep history to Ti+1 for that sample - # All of this should only be done iff the current time index <= sample-level AM length. - # Otherwise ignore and move to next sample / next timestep. - if self.preserve_alignments: - - # convert Ti-th logits into a torch array - for batch_idx in range(batchsize): - - # this checks if current timestep <= sample-level AM length - # If current timestep > sample-level AM length, no alignments will be added - # Therefore the list of Uj alignments is empty here. - if len(hypotheses[batch_idx].alignments[-1]) > 0: - hypotheses[batch_idx].alignments.append([]) # blank buffer for next timestep - - # Do the same if preserving per-frame confidence - if self.preserve_frame_confidence: - - for batch_idx in range(batchsize): - if len(hypotheses[batch_idx].frame_confidence[-1]) > 0: - hypotheses[batch_idx].frame_confidence.append([]) # blank buffer for next timestep else: # Collect batch indices where blanks occurred now/past blank_indices = (blank_mask == 1).nonzero(as_tuple=False) @@ -2179,6 +2138,29 @@ def _greedy_decode_masked( symbols_added += 1 + # If preserving alignments, convert the current Uj alignments into a torch.Tensor + # Then preserve U at current timestep Ti + # Finally, forward the timestep history to Ti+1 for that sample + # All of this should only be done iff the current time index <= sample-level AM length. + # Otherwise ignore and move to next sample / next timestep. + if self.preserve_alignments: + + # convert Ti-th logits into a torch array + for batch_idx in range(batchsize): + + # this checks if current timestep <= sample-level AM length + # If current timestep > sample-level AM length, no alignments will be added + # Therefore the list of Uj alignments is empty here. 
+ if len(hypotheses[batch_idx].alignments[-1]) > 0: + hypotheses[batch_idx].alignments.append([]) # blank buffer for next timestep + + # Do the same if preserving per-frame confidence + if self.preserve_frame_confidence: + + for batch_idx in range(batchsize): + if len(hypotheses[batch_idx].frame_confidence[-1]) > 0: + hypotheses[batch_idx].frame_confidence.append([]) # blank buffer for next timestep + # Remove trailing empty list of alignments at T_{am-len} x Uj if self.preserve_alignments: for batch_idx in range(batchsize): @@ -2475,19 +2457,6 @@ def _greedy_decode( # If blank token is predicted, exit inner loop, move onto next timestep t if k == self._blank_index: not_blank = False - - # this rarely happens, but we manually increment the `skip` number - # if blank is emitted and duration=0 is predicted. This prevents possible - # infinite loops. - if skip == 0: - skip = 1 - - if self.preserve_alignments: - # convert Ti-th logits into a torch array - hypothesis.alignments.append([]) # blank buffer for next timestep - - if self.preserve_frame_confidence: - hypothesis.frame_confidence.append([]) # blank buffer for next timestep else: # Append token to label set, update RNN state. hypothesis.y_sequence.append(k) @@ -2501,6 +2470,19 @@ def _greedy_decode( time_idx += skip need_loop = skip == 0 + # this rarely happens, but we manually increment the `skip` number + # if blank is emitted and duration=0 is predicted. This prevents possible + # infinite loops. + if skip == 0: + skip = 1 + + if self.preserve_alignments: + # convert Ti-th logits into a torch array + hypothesis.alignments.append([]) # blank buffer for next timestep + + if self.preserve_frame_confidence: + hypothesis.frame_confidence.append([]) # blank buffer for next timestep + if symbols_added == self.max_symbols: time_idx += 1 @@ -2684,9 +2666,6 @@ def _greedy_decode_blank_as_pad( # frame_confidence is a 3-dimensional dangling list representing B x T x U for hyp in hypotheses: hyp.frame_confidence = [[]] - hyp.y_3best = [[]] - hyp.frame_confidence_3best = [[[]]] - hyp.logp = [[]] # Last Label buffer + Last Label without blank buffer # batch level equivalent of the last_label @@ -2813,9 +2792,6 @@ def _greedy_decode_blank_as_pad( for batch_idx in range(batchsize): if len(hypotheses[batch_idx].frame_confidence[-1]) == 0: del hypotheses[batch_idx].frame_confidence[-1] - del hypotheses[batch_idx].y_3best[-1] - del hypotheses[batch_idx].frame_confidence_3best[-1] - del hypotheses[batch_idx].logp[-1] # Preserve states for batch_idx in range(batchsize): diff --git a/tests/collections/asr/test_asr_rnnt_encdec_model.py b/tests/collections/asr/test_asr_rnnt_encdec_model.py index 775a146c74c4..aee6956133db 100644 --- a/tests/collections/asr/test_asr_rnnt_encdec_model.py +++ b/tests/collections/asr/test_asr_rnnt_encdec_model.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import copy +from typing import Any, Dict, List, Optional, Tuple import pytest import torch @@ -31,6 +32,71 @@ ) or numba_utils.numba_cuda_is_supported(__NUMBA_MINIMUM_VERSION__) +@pytest.fixture() +def max_symbols_setup(): + from nemo.collections.asr.modules.rnnt_abstract import AbstractRNNTDecoder, AbstractRNNTJoint + from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis + + class DummyRNNTDecoder(AbstractRNNTDecoder): + def predict( + self, + y: Optional[torch.Tensor] = None, + state: Optional[torch.Tensor] = None, + add_sos: bool = False, + batch_size: Optional[int] = None, + ) -> Tuple[torch.Tensor, List[torch.Tensor]]: + if y is not None and state is not None: + return (y + state) / 2, y * state + elif state is not None: + return torch.tensor([0] * self.vocab_size + [1], dtype=torch.float32).repeat(state.size()), state + elif y is not None: + return y, torch.tensor([0] * self.vocab_size + [1], dtype=torch.float32).repeat(y.size()) + return ( + torch.tensor([0] * self.vocab_size + [1], dtype=torch.float32).repeat([1, 1, 1]), + torch.tensor([0] * self.vocab_size + [1], dtype=torch.float32).repeat([1, 1, 1]), + ) + + def initialize_state(self, y: torch.Tensor) -> List[torch.Tensor]: + return [torch.tensor()] + + def score_hypothesis( + self, hypothesis: Hypothesis, cache: Dict[Tuple[int], Any] + ) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: + return torch.tensor(), [torch.tensor()], torch.tensor() + + def batch_select_state(self, batch_states: List[torch.Tensor], idx: int) -> List[List[torch.Tensor]]: + if batch_states is not None: + states = batch_states[0][idx] + states = states.long() + return [states] + else: + return None + + def batch_copy_states( + self, + old_states: List[torch.Tensor], + new_states: List[torch.Tensor], + ids: List[int], + value: Optional[float] = None, + ) -> List[torch.Tensor]: + if value is None: + old_states[0][ids, :] = new_states[0][ids, :] + + return old_states + + class DummyRNNTJoint(AbstractRNNTJoint): + def joint(self, f: torch.Tensor, g: torch.Tensor) -> torch.Tensor: + return f.unsqueeze(dim=2) + g.unsqueeze(dim=1) + + setup = {} + setup["decoder"] = DummyRNNTDecoder(vocab_size=2, blank_idx=2, blank_as_pad=True) + setup["decoder_masked"] = DummyRNNTDecoder(vocab_size=2, blank_idx=2, blank_as_pad=False) + setup["joint"] = DummyRNNTJoint() + setup["encoder_output"] = torch.tensor([[[1, 0, 0], [0, 1, 0], [0, 0, 1]]], dtype=torch.float32).transpose(1, 2) + setup["encoded_lengths"] = torch.tensor([3]) + return setup + + @pytest.fixture() def asr_model(): preprocessor = {'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', 'params': dict({})} @@ -591,11 +657,16 @@ def test_greedy_decoding_preserve_alignment(self, greedy_class): decoder = RNNTDecoder(prednet_cfg, vocab_size) + max_symbols_per_step = 5 for joint_type in [RNNTJoint, HATJoint]: joint_net = joint_type(jointnet_cfg, vocab_size, vocabulary=token_list) greedy = greedy_class( - decoder, joint_net, blank_index=len(token_list) - 1, preserve_alignments=True, max_symbols_per_step=5 + decoder, + joint_net, + blank_index=len(token_list), + preserve_alignments=True, + max_symbols_per_step=max_symbols_per_step, ) # (B, D, T) @@ -606,12 +677,175 @@ def test_greedy_decoding_preserve_alignment(self, greedy_class): hyp = greedy(encoder_output=enc_out, encoded_lengths=enc_len)[0][0] # type: rnnt_utils.Hypothesis assert hyp.alignments is not None + timestep_count = { + u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestep), return_counts=True)) 
+ } for t in range(len(hyp.alignments)): - for u in range(len(hyp.alignments[t])): + + # check that the number of alignment elements is consistent with hyp.timestep + alignment_len = len(hyp.alignments[t]) + assert alignment_len <= max_symbols_per_step + if t in timestep_count: # non-blank + assert alignment_len == timestep_count[t] + (1 if alignment_len < max_symbols_per_step else 0) + else: # blank + assert alignment_len == 1 + + for u in range(alignment_len): logp, label = hyp.alignments[t][u] assert torch.is_tensor(logp) assert torch.is_tensor(label) + @pytest.mark.skipif( + not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + ) + @pytest.mark.unit + @pytest.mark.parametrize( + "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ) + def test_greedy_decoding_preserve_frame_confidence(self, greedy_class): + token_list = [" ", "a", "b", "c"] + vocab_size = len(token_list) + + encoder_output_size = 4 + decoder_output_size = 4 + joint_output_shape = 4 + + prednet_cfg = {'pred_hidden': decoder_output_size, 'pred_rnn_layers': 1} + jointnet_cfg = { + 'encoder_hidden': encoder_output_size, + 'pred_hidden': decoder_output_size, + 'joint_hidden': joint_output_shape, + 'activation': 'relu', + } + + decoder = RNNTDecoder(prednet_cfg, vocab_size) + + max_symbols_per_step = 5 + for joint_type in [RNNTJoint, HATJoint]: + joint_net = joint_type(jointnet_cfg, vocab_size, vocabulary=token_list) + + greedy = greedy_class( + decoder, + joint_net, + blank_index=len(token_list), + preserve_alignments=True, + preserve_frame_confidence=True, + max_symbols_per_step=max_symbols_per_step, + ) + + # (B, D, T) + enc_out = torch.randn(1, encoder_output_size, 30) + enc_len = torch.tensor([30], dtype=torch.int32) + + with torch.no_grad(): + hyp = greedy(encoder_output=enc_out, encoded_lengths=enc_len)[0][0] # type: rnnt_utils.Hypothesis + assert hyp.frame_confidence is not None + + timestep_count = { + u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestep), return_counts=True)) + } + for t in range(len(hyp.frame_confidence)): + + # check that the number of confidence elements is consistent with hyp.timestep + confidence_len = len(hyp.frame_confidence[t]) + assert confidence_len <= max_symbols_per_step + if t in timestep_count: # non-blank + assert confidence_len == timestep_count[t] + ( + 1 if confidence_len < max_symbols_per_step else 0 + ) + else: # blank + assert confidence_len == 1 + + for u in range(confidence_len): + score = hyp.frame_confidence[t][u] + assert 0 <= score <= 1 + + @pytest.mark.skipif( + not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + ) + @pytest.mark.unit + @pytest.mark.parametrize( + "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ) + @pytest.mark.parametrize("max_symbols_per_step", [0, 1, 5]) + def test_greedy_decoding_max_symbols_alignment(self, max_symbols_setup, greedy_class, max_symbols_per_step): + decoders = [max_symbols_setup["decoder"]] + if greedy_class is greedy_decode.GreedyBatchedRNNTInfer: + decoders.append(max_symbols_setup["decoder_masked"]) + joint = max_symbols_setup["joint"] + encoder_output = max_symbols_setup["encoder_output"] + encoded_lengths = max_symbols_setup["encoded_lengths"] + + for decoder in decoders: + greedy = greedy_class( + decoder_model=decoder, + joint_model=joint, + blank_index=decoder.blank_idx, + max_symbols_per_step=max_symbols_per_step, + 
preserve_alignments=True, + ) + + with torch.no_grad(): + hyp = greedy(encoder_output=encoder_output, encoded_lengths=encoded_lengths)[0][0] + assert hyp.alignments is not None + + timestep_count = { + u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestep), return_counts=True)) + } + for t in range(len(hyp.alignments)): + + # check that the number of confidence elements is consistent with hyp.timestep + alignment_len = len(hyp.alignments[t]) + assert alignment_len <= max_symbols_per_step + if t in timestep_count: # non-blank + assert alignment_len == timestep_count[t] + (1 if alignment_len < max_symbols_per_step else 0) + else: # blank or max_symbols_per_step == 0 + assert alignment_len <= 1 + + @pytest.mark.skipif( + not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', + ) + @pytest.mark.unit + @pytest.mark.parametrize( + "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ) + @pytest.mark.parametrize("max_symbols_per_step", [0, 1, 5]) + def test_greedy_decoding_max_symbols_confidence(self, max_symbols_setup, greedy_class, max_symbols_per_step): + decoders = [max_symbols_setup["decoder"]] + if greedy_class is greedy_decode.GreedyBatchedRNNTInfer: + decoders.append(max_symbols_setup["decoder_masked"]) + joint = max_symbols_setup["joint"] + encoder_output = max_symbols_setup["encoder_output"] + encoded_lengths = max_symbols_setup["encoded_lengths"] + + for decoder in decoders: + greedy = greedy_class( + decoder_model=decoder, + joint_model=joint, + blank_index=decoder.blank_idx, + max_symbols_per_step=max_symbols_per_step, + preserve_frame_confidence=True, + ) + + with torch.no_grad(): + hyp = greedy(encoder_output=encoder_output, encoded_lengths=encoded_lengths)[0][0] + assert hyp.frame_confidence is not None + + timestep_count = { + u.item(): c.item() for u, c in zip(*torch.unique(torch.tensor(hyp.timestep), return_counts=True)) + } + for t in range(len(hyp.frame_confidence)): + + # check that the number of confidence elements is consistent with hyp.timestep + confidence_len = len(hyp.frame_confidence[t]) + assert confidence_len <= max_symbols_per_step + if t in timestep_count: # non-blank + assert confidence_len == timestep_count[t] + ( + 1 if confidence_len < max_symbols_per_step else 0 + ) + else: # blank or max_symbols_per_step == 0 + assert confidence_len <= 1 + @pytest.mark.skipif( not NUMBA_RNNT_LOSS_AVAILABLE, reason='RNNTLoss has not been compiled with appropriate numba version.', ) From f208a73f0be5f123d77c87cee0a2fac461990da6 Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Fri, 22 Sep 2023 13:15:59 -0700 Subject: [PATCH 05/10] update branch (#7488) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri --- tutorials/asr/Confidence_Ensembles.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/asr/Confidence_Ensembles.ipynb b/tutorials/asr/Confidence_Ensembles.ipynb index 4516d2b70d6d..6711e8a6ec9c 100644 --- a/tutorials/asr/Confidence_Ensembles.ipynb +++ b/tutorials/asr/Confidence_Ensembles.ipynb @@ -39,7 +39,7 @@ "\n", "# option #2: download NeMo repo\n", "if 'google.colab' in str(get_ipython()) or not os.path.exists(os.path.join(NEMO_DIR, \"nemo\")):\n", - " BRANCH = \"main\"\n", + " BRANCH = \"r1.21.0\"\n", " !git clone -b $BRANCH https://github.com/NVIDIA/NeMo $WORKSPACE_DIR/NeMo\n", " NEMO_DIR = os.path.join(WORKSPACE_DIR, 'NeMo')\n", "\n", From 23e6f2184342d5ac40c4a83a39e5ca05a28ee7c0 Mon Sep 17 00:00:00 2001 From: 
Kelvin Liu Date: Sat, 23 Sep 2023 04:54:04 +0800 Subject: [PATCH 06/10] fix bug when loading dist ckpt in peft (#7479) Signed-off-by: Hongbin Liu Co-authored-by: Hongbin Liu --- nemo/collections/nlp/parts/nlp_overrides.py | 65 ++++++++++++++++++--- 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index d7eaaa90e536..a116c8e60299 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -740,7 +740,8 @@ def _load_state_dict_from_disk(self, model_weights, map_location=None): peft_state_dict = torch.load(model_weights_path, map_location)['state_dict'] else: peft_state_dict = {} - base_model_state_dict.update(peft_state_dict) # add the PEFT state_dict into the base model's state_dict + if base_model_state_dict: + base_model_state_dict.update(peft_state_dict) # add the PEFT state_dict into the base model's state_dict return base_model_state_dict def restore_from( @@ -765,13 +766,61 @@ def restore_from( return loaded_params conf, instance, state_dict = loaded_params - if ( - self.peft_model_nemo_path is None and self.peft_model_ckpt_dir is None - ): # we have this check only for training PEFT from scratch - peft_state_dict = instance.get_peft_state_dict() - state_dict.update(peft_state_dict) - state_dict = self.modify_state_dict(conf, state_dict) - self.load_instance_with_state_dict(instance, state_dict, strict) + # if we're using dist checkpointing then state_dict will be None + if state_dict is None: + # dist checkpointing needs torch.distributed to load the checkpoint + if parallel_state.is_unitialized(): + + def dummy(): + return + + if trainer.strategy.launcher is not None: + trainer.strategy.launcher.launch(dummy, trainer=trainer) + trainer.strategy.setup_environment() + + with tempfile.TemporaryDirectory() as tmpdir: + # Check if self.model_extracted_dir is set, and is a valid path + if self.model_extracted_dir is not None and os.path.isdir(self.model_extracted_dir): + # Log that NeMo will use the provided `model_extracted_dir` + logging.info( + f"Restoration will occur within pre-extracted directory : " f"`{self.model_extracted_dir}`." + ) + + # Override `tmpdir` above with the pre-extracted `model_extracted_dir` + tmpdir = self.model_extracted_dir + + else: + # Extract the nemo file into the temporary directory + self._unpack_nemo_file( + path2file=restore_path, out_folder=tmpdir, extract_config_only=return_config is True + ) + checkpoint = {} + sharded_state_dict = instance.sharded_state_dict() + peft_state_dict = instance.get_peft_state_dict() + for k in peft_state_dict.keys(): + sharded_state_dict.pop(k) + checkpoint['state_dict'] = sharded_state_dict + # remove model weights extension + tmp_model_weights_ckpt = os.path.join(tmpdir, self.model_weights_ckpt) + tmp_model_weights_dir = os.path.splitext(tmp_model_weights_ckpt)[0] + assert os.path.isdir(tmp_model_weights_dir), f'Expected {tmp_model_weights_dir} to be a directory.' 
+ checkpoint = dist_checkpointing.load( + sharded_state_dict=checkpoint, checkpoint_dir=tmp_model_weights_dir + ) + checkpoint['state_dict'].update(peft_state_dict) + instance.on_load_checkpoint(checkpoint) + if hasattr(instance, 'setup_transformer_engine_tp_groups'): + instance.setup_transformer_engine_tp_groups() + + else: + if ( + self.peft_model_nemo_path is None and self.peft_model_ckpt_dir is None + ): # we have this check only for training PEFT from scratch + peft_state_dict = instance.get_peft_state_dict() + state_dict.update(peft_state_dict) + state_dict = self.modify_state_dict(conf, state_dict) + self.load_instance_with_state_dict(instance, state_dict, strict) + logging.info(f'Model {instance.__class__.__name__} was successfully restored from {restore_path}.') return instance From f857a270d88d368d40af2d52b73861f8aecf3e52 Mon Sep 17 00:00:00 2001 From: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Date: Fri, 22 Sep 2023 17:11:29 -0700 Subject: [PATCH 07/10] bugfix: trainer.accelerator=auto from None. (#7492) Signed-off-by: Xuesong Yang <16880-xueyang@users.noreply.gitlab-master.nvidia.com> Co-authored-by: Xuesong Yang <16880-xueyang@users.noreply.gitlab-master.nvidia.com> --- tutorials/tts/Vits_Training.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/tts/Vits_Training.ipynb b/tutorials/tts/Vits_Training.ipynb index 10196cc864d6..9f2207403d75 100644 --- a/tutorials/tts/Vits_Training.ipynb +++ b/tutorials/tts/Vits_Training.ipynb @@ -308,7 +308,7 @@ " phoneme_dict_path=tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt \\\n", " heteronyms_path=tts_dataset_files/heteronyms-052722 \\\n", " trainer.max_epochs=3 \\\n", - " trainer.accelerator=null \\\n", + " trainer.accelerator=auto \\\n", " trainer.check_val_every_n_epoch=1 \\\n", " trainer.devices=1)" ] From b8e631f0c8693512ccc1df75f1a4263311343940 Mon Sep 17 00:00:00 2001 From: Gerald Shen <119401249+gshennvm@users.noreply.github.com> Date: Sat, 23 Sep 2023 21:17:07 -0700 Subject: [PATCH 08/10] add sleep (#7498) * add sleep Signed-off-by: Gerald Shen * add sleep onto config instead Signed-off-by: Gerald Shen * add comment Signed-off-by: Gerald Shen --------- Signed-off-by: Gerald Shen --- nemo/utils/exp_manager.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 1629aa5cbb50..addbf4eda617 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -170,6 +170,8 @@ class ExpManagerConfig: ema: Optional[EMAParams] = EMAParams() # Wall clock time limit max_time_per_run: Optional[str] = None + # time to sleep non 0 ranks during initialization + seconds_to_sleep: float = 5 class TimingCallback(Callback): @@ -301,6 +303,7 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo Set this to True if you are using DDP with many GPUs and do not want many log files in your exp dir. - max_time (str): The maximum wall clock time *per run*. This is intended to be used on clusters where you want a checkpoint to be saved after this specified time and be able to resume from that checkpoint. Defaults to None. + - seconds_to_sleep (float): seconds to sleep non rank 0 processes for. Used to give enough time for rank 0 to initialize returns: log_dir (Path): The final logging directory where logging files are saved. 
Usually the concatenation of @@ -501,6 +504,11 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo # Add lightning file logging to global_rank zero add_filehandlers_to_pl_logger(log_dir / 'lightning_logs.txt', log_dir / 'nemo_error_log.txt') + elif trainer.num_devices * trainer.num_devices > 1: + # sleep other ranks so rank 0 can finish + # doing the initialization such as moving files + time.sleep(cfg.seconds_to_sleep) + return log_dir From 24947bd82b03443acdcb17f45e29ab0edcb268eb Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Mon, 25 Sep 2023 09:49:02 -0700 Subject: [PATCH 09/10] Fix exp manager check for sleep (#7503) Signed-off-by: smajumdar --- nemo/utils/exp_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index addbf4eda617..1125ae06cabb 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -504,7 +504,7 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo # Add lightning file logging to global_rank zero add_filehandlers_to_pl_logger(log_dir / 'lightning_logs.txt', log_dir / 'nemo_error_log.txt') - elif trainer.num_devices * trainer.num_devices > 1: + elif trainer.num_nodes * trainer.num_devices > 1: # sleep other ranks so rank 0 can finish # doing the initialization such as moving files time.sleep(cfg.seconds_to_sleep) From 3206aeb91e4d95dddc3f99c46b7b174766c556f2 Mon Sep 17 00:00:00 2001 From: Jocelyn Huang Date: Mon, 25 Sep 2023 14:23:25 -0700 Subject: [PATCH 10/10] Fix get_dist() tensor dimension Signed-off-by: Jocelyn Huang --- nemo/collections/tts/modules/aligner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/tts/modules/aligner.py b/nemo/collections/tts/modules/aligner.py index 2910602474fd..f044a86a52eb 100644 --- a/nemo/collections/tts/modules/aligner.py +++ b/nemo/collections/tts/modules/aligner.py @@ -98,7 +98,7 @@ def get_dist(self, keys, queries, mask=None): self._apply_mask(dist, mask, float("inf")) - return dist + return dist.squeeze(1) @staticmethod def get_euclidean_dist(queries_enc, keys_enc):
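For illustration of the `get_dist()` change in the last patch: a minimal, standalone sketch of the shape fix, assuming the distance tensor carries a singleton axis at dim 1 left over from a channel-wise sum with `keepdim=True` (the tensor sizes below are hypothetical, not taken from the aligner module):

    import torch

    # Hypothetical encodings: queries_enc is (B, C, T_q), keys_enc is (B, C, T_k).
    queries_enc = torch.randn(2, 4, 5)
    keys_enc = torch.randn(2, 4, 7)

    # Pairwise squared L2 distance. Summing over the channel axis with
    # keepdim=True leaves a singleton dim 1, giving shape (B, 1, T_q, T_k).
    diff = queries_enc.unsqueeze(-1) - keys_enc.unsqueeze(2)
    dist = (diff ** 2).sum(dim=1, keepdim=True)
    assert dist.shape == (2, 1, 5, 7)

    # The one-line fix squeezes that axis so callers receive (B, T_q, T_k).
    assert dist.squeeze(1).shape == (2, 5, 7)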