NVIDIA · titu1994 · Jan 31, 2024 · Jan 31, 2024
diff --git a/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml b/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml
@@ -74,8 +74,19 @@ model:
   # recommend small vocab size of 128 or 256 when using 4x sub-sampling
   # you may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py
   tokenizer:
-    dir: ???  # path to directory which contains either tokenizer.model (bpe) or vocab.txt (wpe)
-    type: bpe  # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)
+    dir: null  # Left null for aggregate tokenizers; each entry under `langs` sets its own dir
+    type: agg  # One of: bpe (SentencePiece tokenizer), wpe (WordPiece tokenizer), or agg (aggregate tokenizer)
+    langs:  # Tokenizers that make up the aggregate tokenizer, one entry per key
+      spl_tokens:  # Tokenizer model for the special tokens
+        dir: ???  # Must be set: path to the directory containing this tokenizer's model
+        type: bpe
+      en:  # English tokenizer (example; replace with the language of your choice)
+        dir: ???  # Must be set: path to the directory containing this tokenizer's model
+        type: bpe
+
+    custom_tokenizer:
+      _target_: nemo.collections.common.tokenizers.canary_tokenizer.CanaryTokenizer  # Can be replaced with another tokenizer class to support a different prompt format
+      tokenizers: null  # Filled in at runtime with all the tokenizers inside the aggregate tokenizer
 
   # Audio Preprocessor
   preprocessor: